From f3f0d7b026ae34d6ed5ae67cd4dd5909f9cd70a5 Mon Sep 17 00:00:00 2001 From: Tim Shimmin Date: Thu, 30 Oct 2008 18:30:09 +1100 Subject: [XFS] remove restricted chown parameter from xfs linux On Linux all filesystems are supposed to be operating under Posix' restricted chown. Restricted chown means it restricts chown to the owner unless you have CAP_FOWNER. NOTE: that 2 files outside of fs/xfs have been modified too for this change. Reviewed-by: Dave Chinner SGI-PV: 988919 SGI-Modid: 2.6.x-xfs-melb:linux:32413b Signed-off-by: Tim Shimmin Signed-off-by: Christoph Hellwig Signed-off-by: David Chinner Signed-off-by: Lachlan McIlroy --- kernel/sysctl_check.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c index c35da23ab8f..fafeb48f27c 100644 --- a/kernel/sysctl_check.c +++ b/kernel/sysctl_check.c @@ -730,7 +730,6 @@ static const struct trans_ctl_table trans_fs_quota_table[] = { }; static const struct trans_ctl_table trans_fs_xfs_table[] = { - { XFS_RESTRICT_CHOWN, "restrict_chown" }, { XFS_SGID_INHERIT, "irix_sgid_inherit" }, { XFS_SYMLINK_MODE, "irix_symlink_mode" }, { XFS_PANIC_MASK, "panic_mask" }, -- cgit v1.2.3-70-g09d2 From 1f29fae29709b4668979e244c09b2fa78ff1ad59 Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Wed, 5 Nov 2008 16:08:52 -0600 Subject: file capabilities: add no_file_caps switch (v4) Add a no_file_caps boot option when file capabilities are compiled into the kernel (CONFIG_SECURITY_FILE_CAPABILITIES=y). This allows distributions to ship a kernel with file capabilities compiled in, without forcing users to use (and understand and trust) them. When no_file_caps is specified at boot, then when a process executes a file, any file capabilities stored with that file will not be used in the calculation of the process' new capability sets. This means that booting with the no_file_caps boot option will not be the same as booting a kernel with file capabilities compiled out - in particular a task with CAP_SETPCAP will not have any chance of passing capabilities to another task (which isn't "really" possible anyway, and which may soon by killed altogether by David Howells in any case), and it will instead be able to put new capabilities in its pI. However since fI will always be empty and pI is masked with fI, it gains the task nothing. We also support the extra prctl options, setting securebits and dropping capabilities from the per-process bounding set. The other remaining difference is that killpriv, task_setscheduler, setioprio, and setnice will continue to be hooked. That will be noticable in the case where a root task changed its uid while keeping some caps, and another task owned by the new uid tries to change settings for the more privileged task. Changelog: Nov 05 2008: (v4) trivial port on top of always-start-\ with-clear-caps patch Sep 23 2008: nixed file_caps_enabled when file caps are not compiled in as it isn't used. Document no_file_caps in kernel-parameters.txt. Signed-off-by: Serge Hallyn Acked-by: Andrew G. Morgan Signed-off-by: James Morris --- Documentation/kernel-parameters.txt | 4 ++++ include/linux/capability.h | 3 +++ kernel/capability.c | 11 +++++++++++ security/commoncap.c | 3 +++ 4 files changed, 21 insertions(+) (limited to 'kernel') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 1bbcaa8982b..784443acca9 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1459,6 +1459,10 @@ and is between 256 and 4096 characters. It is defined in the file instruction doesn't work correctly and not to use it. + no_file_caps Tells the kernel not to honor file capabilities. The + only way then for a file to be executed with privilege + is to be setuid root or executed by root. + nohalt [IA-64] Tells the kernel not to use the power saving function PAL_HALT_LIGHT when idle. This increases power-consumption. On the positive side, it reduces diff --git a/include/linux/capability.h b/include/linux/capability.h index 9d1fe30b6f6..5bc145bd759 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -68,6 +68,9 @@ typedef struct __user_cap_data_struct { #define VFS_CAP_U32 VFS_CAP_U32_2 #define VFS_CAP_REVISION VFS_CAP_REVISION_2 +#ifdef CONFIG_SECURITY_FILE_CAPABILITIES +extern int file_caps_enabled; +#endif struct vfs_cap_data { __le32 magic_etc; /* Little endian */ diff --git a/kernel/capability.c b/kernel/capability.c index 33e51e78c2d..e13a68535ad 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -33,6 +33,17 @@ EXPORT_SYMBOL(__cap_empty_set); EXPORT_SYMBOL(__cap_full_set); EXPORT_SYMBOL(__cap_init_eff_set); +#ifdef CONFIG_SECURITY_FILE_CAPABILITIES +int file_caps_enabled = 1; + +static int __init file_caps_disable(char *str) +{ + file_caps_enabled = 0; + return 1; +} +__setup("no_file_caps", file_caps_disable); +#endif + /* * More recent versions of libcap are available from: * diff --git a/security/commoncap.c b/security/commoncap.c index 3976613db82..f88119cb2bc 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -281,6 +281,9 @@ static int get_file_caps(struct linux_binprm *bprm) bprm_clear_caps(bprm); + if (!file_caps_enabled) + return 0; + if (bprm->file->f_vfsmnt->mnt_flags & MNT_NOSUID) return 0; -- cgit v1.2.3-70-g09d2 From 851f7ff56d9c21272f289dd85fb3f1b6cf7a6e10 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 11 Nov 2008 21:48:14 +1100 Subject: This patch will print cap_permitted and cap_inheritable data in the PATH records of any file that has file capabilities set. Files which do not have fcaps set will not have different PATH records. An example audit record if you run: setcap "cap_net_admin+pie" /bin/bash /bin/bash type=SYSCALL msg=audit(1225741937.363:230): arch=c000003e syscall=59 success=yes exit=0 a0=2119230 a1=210da30 a2=20ee290 a3=8 items=2 ppid=2149 pid=2923 auid=0 uid=0 gid=0 euid=0 suid=0 fsuid=0 egid=0 sgid=0 fsgid=0 tty=pts0 ses=3 comm="ping" exe="/bin/ping" subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 key=(null) type=EXECVE msg=audit(1225741937.363:230): argc=2 a0="ping" a1="www.google.com" type=CWD msg=audit(1225741937.363:230): cwd="/root" type=PATH msg=audit(1225741937.363:230): item=0 name="/bin/ping" inode=49256 dev=fd:00 mode=0104755 ouid=0 ogid=0 rdev=00:00 obj=system_u:object_r:ping_exec_t:s0 cap_fp=0000000000002000 cap_fi=0000000000002000 cap_fe=1 cap_fver=2 type=PATH msg=audit(1225741937.363:230): item=1 name=(null) inode=507915 dev=fd:00 mode=0100755 ouid=0 ogid=0 rdev=00:00 obj=system_u:object_r:ld_so_t:s0 Signed-off-by: Eric Paris Acked-by: Serge Hallyn Signed-off-by: James Morris --- include/linux/capability.h | 5 +++ kernel/auditsc.c | 82 +++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 82 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/capability.h b/include/linux/capability.h index d567af247ed..0f195018110 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -53,6 +53,7 @@ typedef struct __user_cap_data_struct { #define XATTR_NAME_CAPS XATTR_SECURITY_PREFIX XATTR_CAPS_SUFFIX #define VFS_CAP_REVISION_MASK 0xFF000000 +#define VFS_CAP_REVISION_SHIFT 24 #define VFS_CAP_FLAGS_MASK ~VFS_CAP_REVISION_MASK #define VFS_CAP_FLAGS_EFFECTIVE 0x000001 @@ -534,6 +535,10 @@ kernel_cap_t cap_set_effective(const kernel_cap_t pE_new); extern int capable(int cap); +/* audit system wants to get cap info from files as well */ +struct dentry; +extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps); + #endif /* __KERNEL__ */ #endif /* !_LINUX_CAPABILITY_H */ diff --git a/kernel/auditsc.c b/kernel/auditsc.c index cf5bc2f5f9c..de7e9bcba9a 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -65,6 +65,7 @@ #include #include #include +#include #include "audit.h" @@ -84,6 +85,15 @@ int audit_n_rules; /* determines whether we collect data for signals sent */ int audit_signals; +struct audit_cap_data { + kernel_cap_t permitted; + kernel_cap_t inheritable; + union { + unsigned int fE; /* effective bit of a file capability */ + kernel_cap_t effective; /* effective set of a process */ + }; +}; + /* When fs/namei.c:getname() is called, we store the pointer in name and * we don't let putname() free it (instead we free all of the saved * pointers at syscall exit time). @@ -100,6 +110,8 @@ struct audit_names { gid_t gid; dev_t rdev; u32 osid; + struct audit_cap_data fcap; + unsigned int fcap_ver; }; struct audit_aux_data { @@ -1171,6 +1183,35 @@ static void audit_log_execve_info(struct audit_context *context, kfree(buf); } +static void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap) +{ + int i; + + audit_log_format(ab, " %s=", prefix); + CAP_FOR_EACH_U32(i) { + audit_log_format(ab, "%08x", cap->cap[(_KERNEL_CAPABILITY_U32S-1) - i]); + } +} + +static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name) +{ + kernel_cap_t *perm = &name->fcap.permitted; + kernel_cap_t *inh = &name->fcap.inheritable; + int log = 0; + + if (!cap_isclear(*perm)) { + audit_log_cap(ab, "cap_fp", perm); + log = 1; + } + if (!cap_isclear(*inh)) { + audit_log_cap(ab, "cap_fi", inh); + log = 1; + } + + if (log) + audit_log_format(ab, " cap_fe=%d cap_fver=%x", name->fcap.fE, name->fcap_ver); +} + static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) { int i, call_panic = 0; @@ -1421,6 +1462,8 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts } } + audit_log_fcaps(ab, n); + audit_log_end(ab); } @@ -1787,8 +1830,36 @@ static int audit_inc_name_count(struct audit_context *context, return 0; } + +static inline int audit_copy_fcaps(struct audit_names *name, const struct dentry *dentry) +{ + struct cpu_vfs_cap_data caps; + int rc; + + memset(&name->fcap.permitted, 0, sizeof(kernel_cap_t)); + memset(&name->fcap.inheritable, 0, sizeof(kernel_cap_t)); + name->fcap.fE = 0; + name->fcap_ver = 0; + + if (!dentry) + return 0; + + rc = get_vfs_caps_from_disk(dentry, &caps); + if (rc) + return rc; + + name->fcap.permitted = caps.permitted; + name->fcap.inheritable = caps.inheritable; + name->fcap.fE = !!(caps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE); + name->fcap_ver = (caps.magic_etc & VFS_CAP_REVISION_MASK) >> VFS_CAP_REVISION_SHIFT; + + return 0; +} + + /* Copy inode data into an audit_names. */ -static void audit_copy_inode(struct audit_names *name, const struct inode *inode) +static void audit_copy_inode(struct audit_names *name, const struct dentry *dentry, + const struct inode *inode) { name->ino = inode->i_ino; name->dev = inode->i_sb->s_dev; @@ -1797,6 +1868,7 @@ static void audit_copy_inode(struct audit_names *name, const struct inode *inode name->gid = inode->i_gid; name->rdev = inode->i_rdev; security_inode_getsecid(inode, &name->osid); + audit_copy_fcaps(name, dentry); } /** @@ -1831,7 +1903,7 @@ void __audit_inode(const char *name, const struct dentry *dentry) context->names[idx].name = NULL; } handle_path(dentry); - audit_copy_inode(&context->names[idx], inode); + audit_copy_inode(&context->names[idx], dentry, inode); } /** @@ -1892,7 +1964,7 @@ void __audit_inode_child(const char *dname, const struct dentry *dentry, if (!strcmp(dname, n->name) || !audit_compare_dname_path(dname, n->name, &dirlen)) { if (inode) - audit_copy_inode(n, inode); + audit_copy_inode(n, NULL, inode); else n->ino = (unsigned long)-1; found_child = n->name; @@ -1906,7 +1978,7 @@ add_names: return; idx = context->name_count - 1; context->names[idx].name = NULL; - audit_copy_inode(&context->names[idx], parent); + audit_copy_inode(&context->names[idx], NULL, parent); } if (!found_child) { @@ -1927,7 +1999,7 @@ add_names: } if (inode) - audit_copy_inode(&context->names[idx], inode); + audit_copy_inode(&context->names[idx], NULL, inode); else context->names[idx].ino = (unsigned long)-1; } -- cgit v1.2.3-70-g09d2 From 3fc689e96c0c90b6fede5946d6c31075e9464f69 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 11 Nov 2008 21:48:18 +1100 Subject: Any time fcaps or a setuid app under SECURE_NOROOT is used to result in a non-zero pE we will crate a new audit record which contains the entire set of known information about the executable in question, fP, fI, fE, fversion and includes the process's pE, pI, pP. Before and after the bprm capability are applied. This record type will only be emitted from execve syscalls. an example of making ping use fcaps instead of setuid: setcap "cat_net_raw+pe" /bin/ping type=SYSCALL msg=audit(1225742021.015:236): arch=c000003e syscall=59 success=yes exit=0 a0=1457f30 a1=14606b0 a2=1463940 a3=321b770a70 items=2 ppid=2929 pid=2963 auid=0 uid=500 gid=500 euid=500 suid=500 fsuid=500 egid=500 sgid=500 fsgid=500 tty=pts0 ses=3 comm="ping" exe="/bin/ping" subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 key=(null) type=UNKNOWN[1321] msg=audit(1225742021.015:236): fver=2 fp=0000000000002000 fi=0000000000000000 fe=1 old_pp=0000000000000000 old_pi=0000000000000000 old_pe=0000000000000000 new_pp=0000000000002000 new_pi=0000000000000000 new_pe=0000000000002000 type=EXECVE msg=audit(1225742021.015:236): argc=2 a0="ping" a1="127.0.0.1" type=CWD msg=audit(1225742021.015:236): cwd="/home/test" type=PATH msg=audit(1225742021.015:236): item=0 name="/bin/ping" inode=49256 dev=fd:00 mode=0100755 ouid=0 ogid=0 rdev=00:00 obj=system_u:object_r:ping_exec_t:s0 cap_fp=0000000000002000 cap_fe=1 cap_fver=2 type=PATH msg=audit(1225742021.015:236): item=1 name=(null) inode=507915 dev=fd:00 mode=0100755 ouid=0 ogid=0 rdev=00:00 obj=system_u:object_r:ld_so_t:s0 Signed-off-by: Eric Paris Acked-by: Serge Hallyn Signed-off-by: James Morris --- include/linux/audit.h | 26 ++++++++++++++++++++ kernel/auditsc.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++++++ security/commoncap.c | 23 ++++++++++++++++- 3 files changed, 116 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/audit.h b/include/linux/audit.h index 6272a395d43..8cfb9feb2a0 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -99,6 +99,7 @@ #define AUDIT_OBJ_PID 1318 /* ptrace target */ #define AUDIT_TTY 1319 /* Input on an administrative TTY */ #define AUDIT_EOE 1320 /* End of multi-record event */ +#define AUDIT_BPRM_FCAPS 1321 /* Information about fcaps increasing perms */ #define AUDIT_AVC 1400 /* SE Linux avc denial or grant */ #define AUDIT_SELINUX_ERR 1401 /* Internal SE Linux Errors */ @@ -452,6 +453,7 @@ extern int __audit_mq_timedsend(mqd_t mqdes, size_t msg_len, unsigned int msg_pr extern int __audit_mq_timedreceive(mqd_t mqdes, size_t msg_len, unsigned int __user *u_msg_prio, const struct timespec __user *u_abs_timeout); extern int __audit_mq_notify(mqd_t mqdes, const struct sigevent __user *u_notification); extern int __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat); +extern void __audit_log_bprm_fcaps(struct linux_binprm *bprm, kernel_cap_t *pP, kernel_cap_t *pE); static inline int audit_ipc_obj(struct kern_ipc_perm *ipcp) { @@ -501,6 +503,29 @@ static inline int audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat) return __audit_mq_getsetattr(mqdes, mqstat); return 0; } + +/* + * ieieeeeee, an audit function without a return code! + * + * This function might fail! I decided that it didn't matter. We are too late + * to fail the syscall and the information isn't REQUIRED for any purpose. It's + * just nice to have. We should be able to look at past audit logs to figure + * out this process's current cap set along with the fcaps from the PATH record + * and use that to come up with the final set. Yeah, its ugly, but all the info + * is still in the audit log. So I'm not going to bother mentioning we failed + * if we couldn't allocate memory. + * + * If someone changes their mind they could create the aux record earlier and + * then search here and use that earlier allocation. But I don't wanna. + * + * -Eric + */ +static inline void audit_log_bprm_fcaps(struct linux_binprm *bprm, kernel_cap_t *pP, kernel_cap_t *pE) +{ + if (unlikely(!audit_dummy_context())) + __audit_log_bprm_fcaps(bprm, pP, pE); +} + extern int audit_n_rules; extern int audit_signals; #else @@ -532,6 +557,7 @@ extern int audit_signals; #define audit_mq_timedreceive(d,l,p,t) ({ 0; }) #define audit_mq_notify(d,n) ({ 0; }) #define audit_mq_getsetattr(d,s) ({ 0; }) +#define audit_log_bprm_fcaps(b, p, e) do { ; } while (0) #define audit_ptrace(t) ((void)0) #define audit_n_rules 0 #define audit_signals 0 diff --git a/kernel/auditsc.c b/kernel/auditsc.c index de7e9bcba9a..3229cd4206f 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -196,6 +196,14 @@ struct audit_aux_data_pids { int pid_count; }; +struct audit_aux_data_bprm_fcaps { + struct audit_aux_data d; + struct audit_cap_data fcap; + unsigned int fcap_ver; + struct audit_cap_data old_pcap; + struct audit_cap_data new_pcap; +}; + struct audit_tree_refs { struct audit_tree_refs *next; struct audit_chunk *c[31]; @@ -1375,6 +1383,20 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts audit_log_format(ab, "fd0=%d fd1=%d", axs->fd[0], axs->fd[1]); break; } + case AUDIT_BPRM_FCAPS: { + struct audit_aux_data_bprm_fcaps *axs = (void *)aux; + audit_log_format(ab, "fver=%x", axs->fcap_ver); + audit_log_cap(ab, "fp", &axs->fcap.permitted); + audit_log_cap(ab, "fi", &axs->fcap.inheritable); + audit_log_format(ab, " fe=%d", axs->fcap.fE); + audit_log_cap(ab, "old_pp", &axs->old_pcap.permitted); + audit_log_cap(ab, "old_pi", &axs->old_pcap.inheritable); + audit_log_cap(ab, "old_pe", &axs->old_pcap.effective); + audit_log_cap(ab, "new_pp", &axs->new_pcap.permitted); + audit_log_cap(ab, "new_pi", &axs->new_pcap.inheritable); + audit_log_cap(ab, "new_pe", &axs->new_pcap.effective); + break; } + } audit_log_end(ab); } @@ -2501,6 +2523,52 @@ int __audit_signal_info(int sig, struct task_struct *t) return 0; } +/** + * __audit_log_bprm_fcaps - store information about a loading bprm and relevant fcaps + * @bprm pointer to the bprm being processed + * @caps the caps read from the disk + * + * Simply check if the proc already has the caps given by the file and if not + * store the priv escalation info for later auditing at the end of the syscall + * + * this can fail and we don't care. See the note in audit.h for + * audit_log_bprm_fcaps() for my explaination.... + * + * -Eric + */ +void __audit_log_bprm_fcaps(struct linux_binprm *bprm, kernel_cap_t *pP, kernel_cap_t *pE) +{ + struct audit_aux_data_bprm_fcaps *ax; + struct audit_context *context = current->audit_context; + struct cpu_vfs_cap_data vcaps; + struct dentry *dentry; + + ax = kmalloc(sizeof(*ax), GFP_KERNEL); + if (!ax) + return; + + ax->d.type = AUDIT_BPRM_FCAPS; + ax->d.next = context->aux; + context->aux = (void *)ax; + + dentry = dget(bprm->file->f_dentry); + get_vfs_caps_from_disk(dentry, &vcaps); + dput(dentry); + + ax->fcap.permitted = vcaps.permitted; + ax->fcap.inheritable = vcaps.inheritable; + ax->fcap.fE = !!(vcaps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE); + ax->fcap_ver = (vcaps.magic_etc & VFS_CAP_REVISION_MASK) >> VFS_CAP_REVISION_SHIFT; + + ax->old_pcap.permitted = *pP; + ax->old_pcap.inheritable = current->cap_inheritable; + ax->old_pcap.effective = *pE; + + ax->new_pcap.permitted = current->cap_permitted; + ax->new_pcap.inheritable = current->cap_inheritable; + ax->new_pcap.effective = current->cap_effective; +} + /** * audit_core_dumps - record information about processes that end abnormally * @signr: signal value diff --git a/security/commoncap.c b/security/commoncap.c index d7eff5797b9..d4539338099 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -8,6 +8,7 @@ */ #include +#include #include #include #include @@ -376,6 +377,9 @@ int cap_bprm_set_security (struct linux_binprm *bprm) void cap_bprm_apply_creds (struct linux_binprm *bprm, int unsafe) { + kernel_cap_t pP = current->cap_permitted; + kernel_cap_t pE = current->cap_effective; + if (bprm->e_uid != current->uid || bprm->e_gid != current->gid || !cap_issubset(bprm->cap_post_exec_permitted, current->cap_permitted)) { @@ -409,7 +413,24 @@ void cap_bprm_apply_creds (struct linux_binprm *bprm, int unsafe) cap_clear(current->cap_effective); } - /* AUD: Audit candidate if current->cap_effective is set */ + /* + * Audit candidate if current->cap_effective is set + * + * We do not bother to audit if 3 things are true: + * 1) cap_effective has all caps + * 2) we are root + * 3) root is supposed to have all caps (SECURE_NOROOT) + * Since this is just a normal root execing a process. + * + * Number 1 above might fail if you don't have a full bset, but I think + * that is interesting information to audit. + */ + if (!cap_isclear(current->cap_effective)) { + if (!cap_issubset(CAP_FULL_SET, current->cap_effective) || + (bprm->e_uid != 0) || (current->uid != 0) || + issecure(SECURE_NOROOT)) + audit_log_bprm_fcaps(bprm, &pP, &pE); + } current->securebits &= ~issecure_mask(SECURE_KEEP_CAPS); } -- cgit v1.2.3-70-g09d2 From e68b75a027bb94066576139ee33676264f867b87 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 11 Nov 2008 21:48:22 +1100 Subject: When the capset syscall is used it is not possible for audit to record the actual capbilities being added/removed. This patch adds a new record type which emits the target pid and the eff, inh, and perm cap sets. example output if you audit capset syscalls would be: type=SYSCALL msg=audit(1225743140.465:76): arch=c000003e syscall=126 success=yes exit=0 a0=17f2014 a1=17f201c a2=80000000 a3=7fff2ab7f060 items=0 ppid=2160 pid=2223 auid=0 uid=0 gid=0 euid=0 suid=0 fsuid=0 egid=0 sgid=0 fsgid=0 tty=pts0 ses=1 comm="setcap" exe="/usr/sbin/setcap" subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 key=(null) type=UNKNOWN[1322] msg=audit(1225743140.465:76): pid=0 cap_pi=ffffffffffffffff cap_pp=ffffffffffffffff cap_pe=ffffffffffffffff Signed-off-by: Eric Paris Acked-by: Serge Hallyn Signed-off-by: James Morris --- include/linux/audit.h | 10 ++++++++++ kernel/auditsc.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ kernel/capability.c | 5 +++++ 3 files changed, 63 insertions(+) (limited to 'kernel') diff --git a/include/linux/audit.h b/include/linux/audit.h index 8cfb9feb2a0..6fbebac7b1b 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -100,6 +100,7 @@ #define AUDIT_TTY 1319 /* Input on an administrative TTY */ #define AUDIT_EOE 1320 /* End of multi-record event */ #define AUDIT_BPRM_FCAPS 1321 /* Information about fcaps increasing perms */ +#define AUDIT_CAPSET 1322 /* Record showing argument to sys_capset */ #define AUDIT_AVC 1400 /* SE Linux avc denial or grant */ #define AUDIT_SELINUX_ERR 1401 /* Internal SE Linux Errors */ @@ -454,6 +455,7 @@ extern int __audit_mq_timedreceive(mqd_t mqdes, size_t msg_len, unsigned int __u extern int __audit_mq_notify(mqd_t mqdes, const struct sigevent __user *u_notification); extern int __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat); extern void __audit_log_bprm_fcaps(struct linux_binprm *bprm, kernel_cap_t *pP, kernel_cap_t *pE); +extern int __audit_log_capset(pid_t pid, kernel_cap_t *eff, kernel_cap_t *inh, kernel_cap_t *perm); static inline int audit_ipc_obj(struct kern_ipc_perm *ipcp) { @@ -526,6 +528,13 @@ static inline void audit_log_bprm_fcaps(struct linux_binprm *bprm, kernel_cap_t __audit_log_bprm_fcaps(bprm, pP, pE); } +static inline int audit_log_capset(pid_t pid, kernel_cap_t *eff, kernel_cap_t *inh, kernel_cap_t *perm) +{ + if (unlikely(!audit_dummy_context())) + return __audit_log_capset(pid, eff, inh, perm); + return 0; +} + extern int audit_n_rules; extern int audit_signals; #else @@ -558,6 +567,7 @@ extern int audit_signals; #define audit_mq_notify(d,n) ({ 0; }) #define audit_mq_getsetattr(d,s) ({ 0; }) #define audit_log_bprm_fcaps(b, p, e) do { ; } while (0) +#define audit_log_capset(pid, e, i, p) ({ 0; }) #define audit_ptrace(t) ((void)0) #define audit_n_rules 0 #define audit_signals 0 diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 3229cd4206f..cef34235b36 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -204,6 +204,12 @@ struct audit_aux_data_bprm_fcaps { struct audit_cap_data new_pcap; }; +struct audit_aux_data_capset { + struct audit_aux_data d; + pid_t pid; + struct audit_cap_data cap; +}; + struct audit_tree_refs { struct audit_tree_refs *next; struct audit_chunk *c[31]; @@ -1397,6 +1403,14 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts audit_log_cap(ab, "new_pe", &axs->new_pcap.effective); break; } + case AUDIT_CAPSET: { + struct audit_aux_data_capset *axs = (void *)aux; + audit_log_format(ab, "pid=%d", axs->pid); + audit_log_cap(ab, "cap_pi", &axs->cap.inheritable); + audit_log_cap(ab, "cap_pp", &axs->cap.permitted); + audit_log_cap(ab, "cap_pe", &axs->cap.effective); + break; } + } audit_log_end(ab); } @@ -2569,6 +2583,40 @@ void __audit_log_bprm_fcaps(struct linux_binprm *bprm, kernel_cap_t *pP, kernel_ ax->new_pcap.effective = current->cap_effective; } +/** + * __audit_log_capset - store information about the arguments to the capset syscall + * @pid target pid of the capset call + * @eff effective cap set + * @inh inheritible cap set + * @perm permited cap set + * + * Record the aguments userspace sent to sys_capset for later printing by the + * audit system if applicable + */ +int __audit_log_capset(pid_t pid, kernel_cap_t *eff, kernel_cap_t *inh, kernel_cap_t *perm) +{ + struct audit_aux_data_capset *ax; + struct audit_context *context = current->audit_context; + + if (likely(!audit_enabled || !context || context->dummy)) + return 0; + + ax = kmalloc(sizeof(*ax), GFP_KERNEL); + if (!ax) + return -ENOMEM; + + ax->d.type = AUDIT_CAPSET; + ax->d.next = context->aux; + context->aux = (void *)ax; + + ax->pid = pid; + ax->cap.effective = *eff; + ax->cap.inheritable = *eff; + ax->cap.permitted = *perm; + + return 0; +} + /** * audit_core_dumps - record information about processes that end abnormally * @signr: signal value diff --git a/kernel/capability.c b/kernel/capability.c index e13a68535ad..19f9eda8997 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -7,6 +7,7 @@ * 30 May 2002: Cleanup, Robert M. Love */ +#include #include #include #include @@ -468,6 +469,10 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data) i++; } + ret = audit_log_capset(pid, &effective, &inheritable, &permitted); + if (ret) + return ret; + if (pid && (pid != task_pid_vnr(current))) ret = do_sys_capset_other_tasks(pid, &effective, &inheritable, &permitted); -- cgit v1.2.3-70-g09d2 From 637d32dc720897616e8a1a4f9e9609e29d431800 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Wed, 29 Oct 2008 15:42:12 +1100 Subject: Capabilities: BUG when an invalid capability is requested If an invalid (large) capability is requested the capabilities system may panic as it is dereferencing an array of fixed (short) length. Its possible (and actually often happens) that the capability system accidentally stumbled into a valid memory region but it also regularly happens that it hits invalid memory and BUGs. If such an operation does get past cap_capable then the selinux system is sure to have problems as it already does a (simple) validity check and BUG. This is known to happen by the broken and buggy firegl driver. This patch cleanly checks all capable calls and BUG if a call is for an invalid capability. This will likely break the firegl driver for some situations, but it is the right thing to do. Garbage into a security system gets you killed/bugged Signed-off-by: Eric Paris Acked-by: Arjan van de Ven Acked-by: Serge Hallyn Acked-by: Andrew G. Morgan Signed-off-by: James Morris --- kernel/capability.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/capability.c b/kernel/capability.c index 19f9eda8997..adb262f83de 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -514,6 +514,11 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data) */ int capable(int cap) { + if (unlikely(!cap_valid(cap))) { + printk(KERN_CRIT "capable() called with invalid cap=%u\n", cap); + BUG(); + } + if (has_capability(current, cap)) { current->flags |= PF_SUPERPRIV; return 1; -- cgit v1.2.3-70-g09d2 From 76aac0e9a17742e60d408be1a706e9aaad370891 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Nov 2008 10:39:12 +1100 Subject: CRED: Wrap task credential accesses in the core kernel Wrap access to task credentials so that they can be separated more easily from the task_struct during the introduction of COW creds. Change most current->(|e|s|fs)[ug]id to current_(|e|s|fs)[ug]id(). Change some task->e?[ug]id to task_e?[ug]id(). In some places it makes more sense to use RCU directly rather than a convenient wrapper; these will be addressed by later patches. Signed-off-by: David Howells Reviewed-by: James Morris Acked-by: Serge Hallyn Cc: Al Viro Cc: linux-audit@redhat.com Cc: containers@lists.linux-foundation.org Cc: linux-mm@kvack.org Signed-off-by: James Morris --- kernel/acct.c | 7 +++---- kernel/auditsc.c | 6 ++++-- kernel/cgroup.c | 9 +++++---- kernel/futex.c | 8 +++++--- kernel/futex_compat.c | 3 ++- kernel/ptrace.c | 15 +++++++++------ kernel/sched.c | 11 +++++++---- kernel/signal.c | 15 +++++++++------ kernel/sys.c | 16 ++++++++-------- kernel/sysctl.c | 2 +- kernel/timer.c | 8 ++++---- kernel/user_namespace.c | 2 +- mm/mempolicy.c | 7 +++++-- mm/migrate.c | 7 +++++-- mm/shmem.c | 8 ++++---- 15 files changed, 72 insertions(+), 52 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index f6006a60df5..d57b7cbb98b 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -530,15 +530,14 @@ static void do_acct_process(struct bsd_acct_struct *acct, do_div(elapsed, AHZ); ac.ac_btime = get_seconds() - elapsed; /* we really need to bite the bullet and change layout */ - ac.ac_uid = current->uid; - ac.ac_gid = current->gid; + current_uid_gid(&ac.ac_uid, &ac.ac_gid); #if ACCT_VERSION==2 ac.ac_ahz = AHZ; #endif #if ACCT_VERSION==1 || ACCT_VERSION==2 /* backward-compatible 16 bit fields */ - ac.ac_uid16 = current->uid; - ac.ac_gid16 = current->gid; + ac.ac_uid16 = ac.ac_uid; + ac.ac_gid16 = ac.ac_gid; #endif #if ACCT_VERSION==3 ac.ac_pid = task_tgid_nr_ns(current, ns); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index cef34235b36..9c7e47ae457 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2628,7 +2628,8 @@ void audit_core_dumps(long signr) { struct audit_buffer *ab; u32 sid; - uid_t auid = audit_get_loginuid(current); + uid_t auid = audit_get_loginuid(current), uid; + gid_t gid; unsigned int sessionid = audit_get_sessionid(current); if (!audit_enabled) @@ -2638,8 +2639,9 @@ void audit_core_dumps(long signr) return; ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); + current_uid_gid(&uid, &gid); audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u", - auid, current->uid, current->gid, sessionid); + auid, uid, gid, sessionid); security_task_getsecid(current, &sid); if (sid) { char *ctx = NULL; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 35eebd5510c..78f9b310c4f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -571,8 +571,8 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb) if (inode) { inode->i_mode = mode; - inode->i_uid = current->fsuid; - inode->i_gid = current->fsgid; + inode->i_uid = current_fsuid(); + inode->i_gid = current_fsgid(); inode->i_blocks = 0; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info; @@ -1279,6 +1279,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) static int attach_task_by_pid(struct cgroup *cgrp, u64 pid) { struct task_struct *tsk; + uid_t euid; int ret; if (pid) { @@ -1291,8 +1292,8 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid) get_task_struct(tsk); rcu_read_unlock(); - if ((current->euid) && (current->euid != tsk->uid) - && (current->euid != tsk->suid)) { + euid = current_euid(); + if (euid && euid != tsk->uid && euid != tsk->suid) { put_task_struct(tsk); return -EACCES; } diff --git a/kernel/futex.c b/kernel/futex.c index 8af10027514..e06962132aa 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -439,10 +439,11 @@ static void free_pi_state(struct futex_pi_state *pi_state) static struct task_struct * futex_find_get_task(pid_t pid) { struct task_struct *p; + uid_t euid = current_euid(); rcu_read_lock(); p = find_task_by_vpid(pid); - if (!p || ((current->euid != p->euid) && (current->euid != p->uid))) + if (!p || (euid != p->euid && euid != p->uid)) p = ERR_PTR(-ESRCH); else get_task_struct(p); @@ -1829,6 +1830,7 @@ sys_get_robust_list(int pid, struct robust_list_head __user * __user *head_ptr, { struct robust_list_head __user *head; unsigned long ret; + uid_t euid = current_euid(); if (!futex_cmpxchg_enabled) return -ENOSYS; @@ -1844,8 +1846,8 @@ sys_get_robust_list(int pid, struct robust_list_head __user * __user *head_ptr, if (!p) goto err_unlock; ret = -EPERM; - if ((current->euid != p->euid) && (current->euid != p->uid) && - !capable(CAP_SYS_PTRACE)) + if (euid != p->euid && euid != p->uid && + !capable(CAP_SYS_PTRACE)) goto err_unlock; head = p->robust_list; rcu_read_unlock(); diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 04ac3a9e42c..3254d4e41e8 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -135,6 +135,7 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, { struct compat_robust_list_head __user *head; unsigned long ret; + uid_t euid = current_euid(); if (!futex_cmpxchg_enabled) return -ENOSYS; @@ -150,7 +151,7 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, if (!p) goto err_unlock; ret = -EPERM; - if ((current->euid != p->euid) && (current->euid != p->uid) && + if (euid != p->euid && euid != p->uid && !capable(CAP_SYS_PTRACE)) goto err_unlock; head = p->compat_robust_list; diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 1e68e4c39e2..937f6b5b200 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -123,16 +123,19 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode) * because setting up the necessary parent/child relationship * or halting the specified task is impossible. */ + uid_t uid; + gid_t gid; int dumpable = 0; /* Don't let security modules deny introspection */ if (task == current) return 0; - if (((current->uid != task->euid) || - (current->uid != task->suid) || - (current->uid != task->uid) || - (current->gid != task->egid) || - (current->gid != task->sgid) || - (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE)) + current_uid_gid(&uid, &gid); + if ((uid != task->euid || + uid != task->suid || + uid != task->uid || + gid != task->egid || + gid != task->sgid || + gid != task->gid) && !capable(CAP_SYS_PTRACE)) return -EPERM; smp_rmb(); if (task->mm) diff --git a/kernel/sched.c b/kernel/sched.c index e8819bc6f46..c3b8b1fcde0 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5128,6 +5128,7 @@ static int __sched_setscheduler(struct task_struct *p, int policy, unsigned long flags; const struct sched_class *prev_class = p->sched_class; struct rq *rq; + uid_t euid; /* may grab non-irq protected spin_locks */ BUG_ON(in_interrupt()); @@ -5180,8 +5181,9 @@ recheck: return -EPERM; /* can't change other user's priorities */ - if ((current->euid != p->euid) && - (current->euid != p->uid)) + euid = current_euid(); + if (euid != p->euid && + euid != p->uid) return -EPERM; } @@ -5392,6 +5394,7 @@ long sched_setaffinity(pid_t pid, const cpumask_t *in_mask) cpumask_t cpus_allowed; cpumask_t new_mask = *in_mask; struct task_struct *p; + uid_t euid; int retval; get_online_cpus(); @@ -5412,9 +5415,9 @@ long sched_setaffinity(pid_t pid, const cpumask_t *in_mask) get_task_struct(p); read_unlock(&tasklist_lock); + euid = current_euid(); retval = -EPERM; - if ((current->euid != p->euid) && (current->euid != p->uid) && - !capable(CAP_SYS_NICE)) + if (euid != p->euid && euid != p->uid && !capable(CAP_SYS_NICE)) goto out_unlock; retval = security_task_setscheduler(p, 0, NULL); diff --git a/kernel/signal.c b/kernel/signal.c index 4530fc65445..167b535fe1a 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -567,6 +567,7 @@ static int check_kill_permission(int sig, struct siginfo *info, struct task_struct *t) { struct pid *sid; + uid_t uid, euid; int error; if (!valid_signal(sig)) @@ -579,8 +580,10 @@ static int check_kill_permission(int sig, struct siginfo *info, if (error) return error; - if ((current->euid ^ t->suid) && (current->euid ^ t->uid) && - (current->uid ^ t->suid) && (current->uid ^ t->uid) && + uid = current_uid(); + euid = current_euid(); + if ((euid ^ t->suid) && (euid ^ t->uid) && + (uid ^ t->suid) && (uid ^ t->uid) && !capable(CAP_KILL)) { switch (sig) { case SIGCONT: @@ -844,7 +847,7 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, q->info.si_errno = 0; q->info.si_code = SI_USER; q->info.si_pid = task_pid_vnr(current); - q->info.si_uid = current->uid; + q->info.si_uid = current_uid(); break; case (unsigned long) SEND_SIG_PRIV: q->info.si_signo = sig; @@ -1598,7 +1601,7 @@ void ptrace_notify(int exit_code) info.si_signo = SIGTRAP; info.si_code = exit_code; info.si_pid = task_pid_vnr(current); - info.si_uid = current->uid; + info.si_uid = current_uid(); /* Let the debugger run. */ spin_lock_irq(¤t->sighand->siglock); @@ -2211,7 +2214,7 @@ sys_kill(pid_t pid, int sig) info.si_errno = 0; info.si_code = SI_USER; info.si_pid = task_tgid_vnr(current); - info.si_uid = current->uid; + info.si_uid = current_uid(); return kill_something_info(sig, &info, pid); } @@ -2228,7 +2231,7 @@ static int do_tkill(pid_t tgid, pid_t pid, int sig) info.si_errno = 0; info.si_code = SI_TKILL; info.si_pid = task_tgid_vnr(current); - info.si_uid = current->uid; + info.si_uid = current_uid(); rcu_read_lock(); p = find_task_by_vpid(pid); diff --git a/kernel/sys.c b/kernel/sys.c index 31deba8f7d1..ed5c29c748a 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -114,10 +114,10 @@ void (*pm_power_off_prepare)(void); static int set_one_prio(struct task_struct *p, int niceval, int error) { + uid_t euid = current_euid(); int no_nice; - if (p->uid != current->euid && - p->euid != current->euid && !capable(CAP_SYS_NICE)) { + if (p->uid != euid && p->euid != euid && !capable(CAP_SYS_NICE)) { error = -EPERM; goto out; } @@ -176,16 +176,16 @@ asmlinkage long sys_setpriority(int which, int who, int niceval) case PRIO_USER: user = current->user; if (!who) - who = current->uid; + who = current_uid(); else - if ((who != current->uid) && !(user = find_user(who))) + if (who != current_uid() && !(user = find_user(who))) goto out_unlock; /* No processes for this user */ do_each_thread(g, p) if (p->uid == who) error = set_one_prio(p, niceval, error); while_each_thread(g, p); - if (who != current->uid) + if (who != current_uid()) free_uid(user); /* For find_user() */ break; } @@ -238,9 +238,9 @@ asmlinkage long sys_getpriority(int which, int who) case PRIO_USER: user = current->user; if (!who) - who = current->uid; + who = current_uid(); else - if ((who != current->uid) && !(user = find_user(who))) + if (who != current_uid() && !(user = find_user(who))) goto out_unlock; /* No processes for this user */ do_each_thread(g, p) @@ -250,7 +250,7 @@ asmlinkage long sys_getpriority(int which, int who) retval = niceval; } while_each_thread(g, p); - if (who != current->uid) + if (who != current_uid()) free_uid(user); /* for find_user() */ break; } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 9d048fa2d90..511031381c3 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1641,7 +1641,7 @@ out: static int test_perm(int mode, int op) { - if (!current->euid) + if (!current_euid()) mode >>= 6; else if (in_egroup_p(0)) mode >>= 3; diff --git a/kernel/timer.c b/kernel/timer.c index 56becf373c5..b54e4646cee 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1123,25 +1123,25 @@ asmlinkage long sys_getppid(void) asmlinkage long sys_getuid(void) { /* Only we change this so SMP safe */ - return current->uid; + return current_uid(); } asmlinkage long sys_geteuid(void) { /* Only we change this so SMP safe */ - return current->euid; + return current_euid(); } asmlinkage long sys_getgid(void) { /* Only we change this so SMP safe */ - return current->gid; + return current_gid(); } asmlinkage long sys_getegid(void) { /* Only we change this so SMP safe */ - return current->egid; + return current_egid(); } #endif diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 532858fa5b8..f82730adea0 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -38,7 +38,7 @@ static struct user_namespace *clone_user_ns(struct user_namespace *old_ns) } /* Reset current->user with a new one */ - new_user = alloc_uid(ns, current->uid); + new_user = alloc_uid(ns, current_uid()); if (!new_user) { free_uid(ns->root_user); kfree(ns); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 36f42573a33..07a96474077 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1115,6 +1115,7 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, nodemask_t old; nodemask_t new; nodemask_t task_nodes; + uid_t uid, euid; int err; err = get_nodes(&old, old_nodes, maxnode); @@ -1144,8 +1145,10 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, * capabilities, superuser privileges or the same * userid as the target process. */ - if ((current->euid != task->suid) && (current->euid != task->uid) && - (current->uid != task->suid) && (current->uid != task->uid) && + uid = current_uid(); + euid = current_euid(); + if (euid != task->suid && euid != task->uid && + uid != task->suid && uid != task->uid && !capable(CAP_SYS_NICE)) { err = -EPERM; goto out; diff --git a/mm/migrate.c b/mm/migrate.c index 6602941bfab..6263c24c4af 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1048,6 +1048,7 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, struct task_struct *task; struct mm_struct *mm; int err; + uid_t uid, euid; /* Check flags */ if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL)) @@ -1075,8 +1076,10 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, * capabilities, superuser privileges or the same * userid as the target process. */ - if ((current->euid != task->suid) && (current->euid != task->uid) && - (current->uid != task->suid) && (current->uid != task->uid) && + uid = current_uid(); + euid = current_euid(); + if (euid != task->suid && euid != task->uid && + uid != task->suid && uid != task->uid && !capable(CAP_SYS_NICE)) { err = -EPERM; goto out; diff --git a/mm/shmem.c b/mm/shmem.c index 0ed075215e5..f1b0d4871f3 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1513,8 +1513,8 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev) inode = new_inode(sb); if (inode) { inode->i_mode = mode; - inode->i_uid = current->fsuid; - inode->i_gid = current->fsgid; + inode->i_uid = current_fsuid(); + inode->i_gid = current_fsgid(); inode->i_blocks = 0; inode->i_mapping->backing_dev_info = &shmem_backing_dev_info; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; @@ -2278,8 +2278,8 @@ static int shmem_fill_super(struct super_block *sb, sbinfo->max_blocks = 0; sbinfo->max_inodes = 0; sbinfo->mode = S_IRWXUGO | S_ISVTX; - sbinfo->uid = current->fsuid; - sbinfo->gid = current->fsgid; + sbinfo->uid = current_fsuid(); + sbinfo->gid = current_fsgid(); sbinfo->mpol = NULL; sb->s_fs_info = sbinfo; -- cgit v1.2.3-70-g09d2 From 8bbf4976b59fc9fc2861e79cab7beb3f6d647640 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Nov 2008 10:39:14 +1100 Subject: KEYS: Alter use of key instantiation link-to-keyring argument Alter the use of the key instantiation and negation functions' link-to-keyring arguments. Currently this specifies a keyring in the target process to link the key into, creating the keyring if it doesn't exist. This, however, can be a problem for copy-on-write credentials as it means that the instantiating process can alter the credentials of the requesting process. This patch alters the behaviour such that: (1) If keyctl_instantiate_key() or keyctl_negate_key() are given a specific keyring by ID (ringid >= 0), then that keyring will be used. (2) If keyctl_instantiate_key() or keyctl_negate_key() are given one of the special constants that refer to the requesting process's keyrings (KEY_SPEC_*_KEYRING, all <= 0), then: (a) If sys_request_key() was given a keyring to use (destringid) then the key will be attached to that keyring. (b) If sys_request_key() was given a NULL keyring, then the key being instantiated will be attached to the default keyring as set by keyctl_set_reqkey_keyring(). (3) No extra link will be made. Decision point (1) follows current behaviour, and allows those instantiators who've searched for a specifically named keyring in the requestor's keyring so as to partition the keys by type to still have their named keyrings. Decision point (2) allows the requestor to make sure that the key or keys that get produced by request_key() go where they want, whilst allowing the instantiator to request that the key is retained. This is mainly useful for situations where the instantiator makes a secondary request, the key for which should be retained by the initial requestor: +-----------+ +--------------+ +--------------+ | | | | | | | Requestor |------->| Instantiator |------->| Instantiator | | | | | | | +-----------+ +--------------+ +--------------+ request_key() request_key() This might be useful, for example, in Kerberos, where the requestor requests a ticket, and then the ticket instantiator requests the TGT, which someone else then has to go and fetch. The TGT, however, should be retained in the keyrings of the requestor, not the first instantiator. To make this explict an extra special keyring constant is also added. Signed-off-by: David Howells Reviewed-by: James Morris Signed-off-by: James Morris --- include/linux/key.h | 16 +++--- include/linux/keyctl.h | 4 +- kernel/kmod.c | 2 +- security/keys/internal.h | 12 ++-- security/keys/keyctl.c | 118 +++++++++++++++++++++++---------------- security/keys/process_keys.c | 88 ++++++++++++++++++----------- security/keys/request_key.c | 75 +++++++++++++++++-------- security/keys/request_key_auth.c | 5 +- 8 files changed, 199 insertions(+), 121 deletions(-) (limited to 'kernel') diff --git a/include/linux/key.h b/include/linux/key.h index 1b70e35a71e..df709e1af3c 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -287,11 +287,11 @@ extern void key_fsuid_changed(struct task_struct *tsk); extern void key_fsgid_changed(struct task_struct *tsk); extern void key_init(void); -#define __install_session_keyring(tsk, keyring) \ -({ \ - struct key *old_session = tsk->signal->session_keyring; \ - tsk->signal->session_keyring = keyring; \ - old_session; \ +#define __install_session_keyring(keyring) \ +({ \ + struct key *old_session = current->signal->session_keyring; \ + current->signal->session_keyring = keyring; \ + old_session; \ }) #else /* CONFIG_KEYS */ @@ -302,11 +302,11 @@ extern void key_init(void); #define key_revoke(k) do { } while(0) #define key_put(k) do { } while(0) #define key_ref_put(k) do { } while(0) -#define make_key_ref(k, p) ({ NULL; }) -#define key_ref_to_ptr(k) ({ NULL; }) +#define make_key_ref(k, p) NULL +#define key_ref_to_ptr(k) NULL #define is_key_possessed(k) 0 #define switch_uid_keyring(u) do { } while(0) -#define __install_session_keyring(t, k) ({ NULL; }) +#define __install_session_keyring(k) ({ NULL; }) #define copy_keys(f,t) 0 #define copy_thread_group_keys(t) 0 #define exit_keys(t) do { } while(0) diff --git a/include/linux/keyctl.h b/include/linux/keyctl.h index 656ee6b77a4..c0688eb7209 100644 --- a/include/linux/keyctl.h +++ b/include/linux/keyctl.h @@ -1,6 +1,6 @@ /* keyctl.h: keyctl command IDs * - * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2004, 2008 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * This program is free software; you can redistribute it and/or @@ -20,6 +20,7 @@ #define KEY_SPEC_USER_SESSION_KEYRING -5 /* - key ID for UID-session keyring */ #define KEY_SPEC_GROUP_KEYRING -6 /* - key ID for GID-specific keyring */ #define KEY_SPEC_REQKEY_AUTH_KEY -7 /* - key ID for assumed request_key auth key */ +#define KEY_SPEC_REQUESTOR_KEYRING -8 /* - key ID for request_key() dest keyring */ /* request-key default keyrings */ #define KEY_REQKEY_DEFL_NO_CHANGE -1 @@ -30,6 +31,7 @@ #define KEY_REQKEY_DEFL_USER_KEYRING 4 #define KEY_REQKEY_DEFL_USER_SESSION_KEYRING 5 #define KEY_REQKEY_DEFL_GROUP_KEYRING 6 +#define KEY_REQKEY_DEFL_REQUESTOR_KEYRING 7 /* keyctl commands */ #define KEYCTL_GET_KEYRING_ID 0 /* ask for a keyring's ID */ diff --git a/kernel/kmod.c b/kernel/kmod.c index 3d3c3ea3a02..f044f8f5770 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -140,7 +140,7 @@ static int ____call_usermodehelper(void *data) /* Unblock all signals and set the session keyring. */ new_session = key_get(sub_info->ring); spin_lock_irq(¤t->sighand->siglock); - old_session = __install_session_keyring(current, new_session); + old_session = __install_session_keyring(new_session); flush_signal_handlers(current, 1); sigemptyset(¤t->blocked); recalc_sigpending(); diff --git a/security/keys/internal.h b/security/keys/internal.h index a60c68138b4..d1586c62978 100644 --- a/security/keys/internal.h +++ b/security/keys/internal.h @@ -109,8 +109,9 @@ extern key_ref_t search_process_keyrings(struct key_type *type, extern struct key *find_keyring_by_name(const char *name, bool skip_perm_check); -extern int install_thread_keyring(struct task_struct *tsk); -extern int install_process_keyring(struct task_struct *tsk); +extern int install_user_keyrings(void); +extern int install_thread_keyring(void); +extern int install_process_keyring(void); extern struct key *request_key_and_link(struct key_type *type, const char *description, @@ -120,8 +121,7 @@ extern struct key *request_key_and_link(struct key_type *type, struct key *dest_keyring, unsigned long flags); -extern key_ref_t lookup_user_key(struct task_struct *context, - key_serial_t id, int create, int partial, +extern key_ref_t lookup_user_key(key_serial_t id, int create, int partial, key_perm_t perm); extern long join_session_keyring(const char *name); @@ -152,6 +152,7 @@ static inline int key_permission(const key_ref_t key_ref, key_perm_t perm) */ struct request_key_auth { struct key *target_key; + struct key *dest_keyring; struct task_struct *context; void *callout_info; size_t callout_len; @@ -161,7 +162,8 @@ struct request_key_auth { extern struct key_type key_type_request_key_auth; extern struct key *request_key_auth_new(struct key *target, const void *callout_info, - size_t callout_len); + size_t callout_len, + struct key *dest_keyring); extern struct key *key_get_instantiation_authkey(key_serial_t target_id); diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c index 3f09e5b2a78..fcce331eca7 100644 --- a/security/keys/keyctl.c +++ b/security/keys/keyctl.c @@ -103,7 +103,7 @@ asmlinkage long sys_add_key(const char __user *_type, } /* find the target keyring (which must be writable) */ - keyring_ref = lookup_user_key(NULL, ringid, 1, 0, KEY_WRITE); + keyring_ref = lookup_user_key(ringid, 1, 0, KEY_WRITE); if (IS_ERR(keyring_ref)) { ret = PTR_ERR(keyring_ref); goto error3; @@ -185,7 +185,7 @@ asmlinkage long sys_request_key(const char __user *_type, /* get the destination keyring if specified */ dest_ref = NULL; if (destringid) { - dest_ref = lookup_user_key(NULL, destringid, 1, 0, KEY_WRITE); + dest_ref = lookup_user_key(destringid, 1, 0, KEY_WRITE); if (IS_ERR(dest_ref)) { ret = PTR_ERR(dest_ref); goto error3; @@ -235,7 +235,7 @@ long keyctl_get_keyring_ID(key_serial_t id, int create) key_ref_t key_ref; long ret; - key_ref = lookup_user_key(NULL, id, create, 0, KEY_SEARCH); + key_ref = lookup_user_key(id, create, 0, KEY_SEARCH); if (IS_ERR(key_ref)) { ret = PTR_ERR(key_ref); goto error; @@ -308,7 +308,7 @@ long keyctl_update_key(key_serial_t id, } /* find the target key (which must be writable) */ - key_ref = lookup_user_key(NULL, id, 0, 0, KEY_WRITE); + key_ref = lookup_user_key(id, 0, 0, KEY_WRITE); if (IS_ERR(key_ref)) { ret = PTR_ERR(key_ref); goto error2; @@ -336,7 +336,7 @@ long keyctl_revoke_key(key_serial_t id) key_ref_t key_ref; long ret; - key_ref = lookup_user_key(NULL, id, 0, 0, KEY_WRITE); + key_ref = lookup_user_key(id, 0, 0, KEY_WRITE); if (IS_ERR(key_ref)) { ret = PTR_ERR(key_ref); goto error; @@ -362,7 +362,7 @@ long keyctl_keyring_clear(key_serial_t ringid) key_ref_t keyring_ref; long ret; - keyring_ref = lookup_user_key(NULL, ringid, 1, 0, KEY_WRITE); + keyring_ref = lookup_user_key(ringid, 1, 0, KEY_WRITE); if (IS_ERR(keyring_ref)) { ret = PTR_ERR(keyring_ref); goto error; @@ -388,13 +388,13 @@ long keyctl_keyring_link(key_serial_t id, key_serial_t ringid) key_ref_t keyring_ref, key_ref; long ret; - keyring_ref = lookup_user_key(NULL, ringid, 1, 0, KEY_WRITE); + keyring_ref = lookup_user_key(ringid, 1, 0, KEY_WRITE); if (IS_ERR(keyring_ref)) { ret = PTR_ERR(keyring_ref); goto error; } - key_ref = lookup_user_key(NULL, id, 1, 0, KEY_LINK); + key_ref = lookup_user_key(id, 1, 0, KEY_LINK); if (IS_ERR(key_ref)) { ret = PTR_ERR(key_ref); goto error2; @@ -422,13 +422,13 @@ long keyctl_keyring_unlink(key_serial_t id, key_serial_t ringid) key_ref_t keyring_ref, key_ref; long ret; - keyring_ref = lookup_user_key(NULL, ringid, 0, 0, KEY_WRITE); + keyring_ref = lookup_user_key(ringid, 0, 0, KEY_WRITE); if (IS_ERR(keyring_ref)) { ret = PTR_ERR(keyring_ref); goto error; } - key_ref = lookup_user_key(NULL, id, 0, 0, 0); + key_ref = lookup_user_key(id, 0, 0, 0); if (IS_ERR(key_ref)) { ret = PTR_ERR(key_ref); goto error2; @@ -464,7 +464,7 @@ long keyctl_describe_key(key_serial_t keyid, char *tmpbuf; long ret; - key_ref = lookup_user_key(NULL, keyid, 0, 1, KEY_VIEW); + key_ref = lookup_user_key(keyid, 0, 1, KEY_VIEW); if (IS_ERR(key_ref)) { /* viewing a key under construction is permitted if we have the * authorisation token handy */ @@ -472,7 +472,7 @@ long keyctl_describe_key(key_serial_t keyid, instkey = key_get_instantiation_authkey(keyid); if (!IS_ERR(instkey)) { key_put(instkey); - key_ref = lookup_user_key(NULL, keyid, + key_ref = lookup_user_key(keyid, 0, 1, 0); if (!IS_ERR(key_ref)) goto okay; @@ -557,7 +557,7 @@ long keyctl_keyring_search(key_serial_t ringid, } /* get the keyring at which to begin the search */ - keyring_ref = lookup_user_key(NULL, ringid, 0, 0, KEY_SEARCH); + keyring_ref = lookup_user_key(ringid, 0, 0, KEY_SEARCH); if (IS_ERR(keyring_ref)) { ret = PTR_ERR(keyring_ref); goto error2; @@ -566,7 +566,7 @@ long keyctl_keyring_search(key_serial_t ringid, /* get the destination keyring if specified */ dest_ref = NULL; if (destringid) { - dest_ref = lookup_user_key(NULL, destringid, 1, 0, KEY_WRITE); + dest_ref = lookup_user_key(destringid, 1, 0, KEY_WRITE); if (IS_ERR(dest_ref)) { ret = PTR_ERR(dest_ref); goto error3; @@ -636,7 +636,7 @@ long keyctl_read_key(key_serial_t keyid, char __user *buffer, size_t buflen) long ret; /* find the key first */ - key_ref = lookup_user_key(NULL, keyid, 0, 0, 0); + key_ref = lookup_user_key(keyid, 0, 0, 0); if (IS_ERR(key_ref)) { ret = -ENOKEY; goto error; @@ -699,7 +699,7 @@ long keyctl_chown_key(key_serial_t id, uid_t uid, gid_t gid) if (uid == (uid_t) -1 && gid == (gid_t) -1) goto error; - key_ref = lookup_user_key(NULL, id, 1, 1, KEY_SETATTR); + key_ref = lookup_user_key(id, 1, 1, KEY_SETATTR); if (IS_ERR(key_ref)) { ret = PTR_ERR(key_ref); goto error; @@ -804,7 +804,7 @@ long keyctl_setperm_key(key_serial_t id, key_perm_t perm) if (perm & ~(KEY_POS_ALL | KEY_USR_ALL | KEY_GRP_ALL | KEY_OTH_ALL)) goto error; - key_ref = lookup_user_key(NULL, id, 1, 1, KEY_SETATTR); + key_ref = lookup_user_key(id, 1, 1, KEY_SETATTR); if (IS_ERR(key_ref)) { ret = PTR_ERR(key_ref); goto error; @@ -829,6 +829,43 @@ error: } /* end keyctl_setperm_key() */ +/* + * get the destination keyring for instantiation + */ +static long get_instantiation_keyring(key_serial_t ringid, + struct request_key_auth *rka, + struct key **_dest_keyring) +{ + key_ref_t dkref; + + /* just return a NULL pointer if we weren't asked to make a link */ + if (ringid == 0) { + *_dest_keyring = NULL; + return 0; + } + + /* if a specific keyring is nominated by ID, then use that */ + if (ringid > 0) { + dkref = lookup_user_key(ringid, 1, 0, KEY_WRITE); + if (IS_ERR(dkref)) + return PTR_ERR(dkref); + *_dest_keyring = key_ref_to_ptr(dkref); + return 0; + } + + if (ringid == KEY_SPEC_REQKEY_AUTH_KEY) + return -EINVAL; + + /* otherwise specify the destination keyring recorded in the + * authorisation key (any KEY_SPEC_*_KEYRING) */ + if (ringid >= KEY_SPEC_REQUESTOR_KEYRING) { + *_dest_keyring = rka->dest_keyring; + return 0; + } + + return -ENOKEY; +} + /*****************************************************************************/ /* * instantiate the key with the specified payload, and, if one is given, link @@ -840,8 +877,7 @@ long keyctl_instantiate_key(key_serial_t id, key_serial_t ringid) { struct request_key_auth *rka; - struct key *instkey; - key_ref_t keyring_ref; + struct key *instkey, *dest_keyring; void *payload; long ret; bool vm = false; @@ -883,21 +919,15 @@ long keyctl_instantiate_key(key_serial_t id, /* find the destination keyring amongst those belonging to the * requesting task */ - keyring_ref = NULL; - if (ringid) { - keyring_ref = lookup_user_key(rka->context, ringid, 1, 0, - KEY_WRITE); - if (IS_ERR(keyring_ref)) { - ret = PTR_ERR(keyring_ref); - goto error2; - } - } + ret = get_instantiation_keyring(ringid, rka, &dest_keyring); + if (ret < 0) + goto error2; /* instantiate the key and link it into a keyring */ ret = key_instantiate_and_link(rka->target_key, payload, plen, - key_ref_to_ptr(keyring_ref), instkey); + dest_keyring, instkey); - key_ref_put(keyring_ref); + key_put(dest_keyring); /* discard the assumed authority if it's just been disabled by * instantiation of the key */ @@ -924,8 +954,7 @@ error: long keyctl_negate_key(key_serial_t id, unsigned timeout, key_serial_t ringid) { struct request_key_auth *rka; - struct key *instkey; - key_ref_t keyring_ref; + struct key *instkey, *dest_keyring; long ret; /* the appropriate instantiation authorisation key must have been @@ -941,20 +970,15 @@ long keyctl_negate_key(key_serial_t id, unsigned timeout, key_serial_t ringid) /* find the destination keyring if present (which must also be * writable) */ - keyring_ref = NULL; - if (ringid) { - keyring_ref = lookup_user_key(NULL, ringid, 1, 0, KEY_WRITE); - if (IS_ERR(keyring_ref)) { - ret = PTR_ERR(keyring_ref); - goto error; - } - } + ret = get_instantiation_keyring(ringid, rka, &dest_keyring); + if (ret < 0) + goto error; /* instantiate the key and link it into a keyring */ ret = key_negate_and_link(rka->target_key, timeout, - key_ref_to_ptr(keyring_ref), instkey); + dest_keyring, instkey); - key_ref_put(keyring_ref); + key_put(dest_keyring); /* discard the assumed authority if it's just been disabled by * instantiation of the key */ @@ -979,13 +1003,13 @@ long keyctl_set_reqkey_keyring(int reqkey_defl) switch (reqkey_defl) { case KEY_REQKEY_DEFL_THREAD_KEYRING: - ret = install_thread_keyring(current); + ret = install_thread_keyring(); if (ret < 0) return ret; goto set; case KEY_REQKEY_DEFL_PROCESS_KEYRING: - ret = install_process_keyring(current); + ret = install_process_keyring(); if (ret < 0) return ret; @@ -1018,7 +1042,7 @@ long keyctl_set_timeout(key_serial_t id, unsigned timeout) time_t expiry; long ret; - key_ref = lookup_user_key(NULL, id, 1, 1, KEY_SETATTR); + key_ref = lookup_user_key(id, 1, 1, KEY_SETATTR); if (IS_ERR(key_ref)) { ret = PTR_ERR(key_ref); goto error; @@ -1105,7 +1129,7 @@ long keyctl_get_security(key_serial_t keyid, char *context; long ret; - key_ref = lookup_user_key(NULL, keyid, 0, 1, KEY_VIEW); + key_ref = lookup_user_key(keyid, 0, 1, KEY_VIEW); if (IS_ERR(key_ref)) { if (PTR_ERR(key_ref) != -EACCES) return PTR_ERR(key_ref); @@ -1117,7 +1141,7 @@ long keyctl_get_security(key_serial_t keyid, return PTR_ERR(key_ref); key_put(instkey); - key_ref = lookup_user_key(NULL, keyid, 0, 1, 0); + key_ref = lookup_user_key(keyid, 0, 1, 0); if (IS_ERR(key_ref)) return PTR_ERR(key_ref); } diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c index 5be6d018759..1c793b7090a 100644 --- a/security/keys/process_keys.c +++ b/security/keys/process_keys.c @@ -40,9 +40,9 @@ struct key_user root_key_user = { /* * install user and user session keyrings for a particular UID */ -static int install_user_keyrings(struct task_struct *tsk) +int install_user_keyrings(void) { - struct user_struct *user = tsk->user; + struct user_struct *user = current->user; struct key *uid_keyring, *session_keyring; char buf[20]; int ret; @@ -67,7 +67,7 @@ static int install_user_keyrings(struct task_struct *tsk) uid_keyring = find_keyring_by_name(buf, true); if (IS_ERR(uid_keyring)) { uid_keyring = keyring_alloc(buf, user->uid, (gid_t) -1, - tsk, KEY_ALLOC_IN_QUOTA, + current, KEY_ALLOC_IN_QUOTA, NULL); if (IS_ERR(uid_keyring)) { ret = PTR_ERR(uid_keyring); @@ -83,7 +83,8 @@ static int install_user_keyrings(struct task_struct *tsk) if (IS_ERR(session_keyring)) { session_keyring = keyring_alloc(buf, user->uid, (gid_t) -1, - tsk, KEY_ALLOC_IN_QUOTA, NULL); + current, KEY_ALLOC_IN_QUOTA, + NULL); if (IS_ERR(session_keyring)) { ret = PTR_ERR(session_keyring); goto error_release; @@ -146,8 +147,9 @@ void switch_uid_keyring(struct user_struct *new_user) /* * install a fresh thread keyring, discarding the old one */ -int install_thread_keyring(struct task_struct *tsk) +int install_thread_keyring(void) { + struct task_struct *tsk = current; struct key *keyring, *old; char buf[20]; int ret; @@ -178,8 +180,9 @@ error: /* * make sure a process keyring is installed */ -int install_process_keyring(struct task_struct *tsk) +int install_process_keyring(void) { + struct task_struct *tsk = current; struct key *keyring; char buf[20]; int ret; @@ -218,9 +221,9 @@ error: * install a session keyring, discarding the old one * - if a keyring is not supplied, an empty one is invented */ -static int install_session_keyring(struct task_struct *tsk, - struct key *keyring) +static int install_session_keyring(struct key *keyring) { + struct task_struct *tsk = current; unsigned long flags; struct key *old; char buf[20]; @@ -572,93 +575,91 @@ static int lookup_user_key_possessed(const struct key *key, const void *target) * - don't create special keyrings unless so requested * - partially constructed keys aren't found unless requested */ -key_ref_t lookup_user_key(struct task_struct *context, key_serial_t id, - int create, int partial, key_perm_t perm) +key_ref_t lookup_user_key(key_serial_t id, int create, int partial, + key_perm_t perm) { + struct request_key_auth *rka; + struct task_struct *t = current; key_ref_t key_ref, skey_ref; struct key *key; int ret; - if (!context) - context = current; - key_ref = ERR_PTR(-ENOKEY); switch (id) { case KEY_SPEC_THREAD_KEYRING: - if (!context->thread_keyring) { + if (!t->thread_keyring) { if (!create) goto error; - ret = install_thread_keyring(context); + ret = install_thread_keyring(); if (ret < 0) { key = ERR_PTR(ret); goto error; } } - key = context->thread_keyring; + key = t->thread_keyring; atomic_inc(&key->usage); key_ref = make_key_ref(key, 1); break; case KEY_SPEC_PROCESS_KEYRING: - if (!context->signal->process_keyring) { + if (!t->signal->process_keyring) { if (!create) goto error; - ret = install_process_keyring(context); + ret = install_process_keyring(); if (ret < 0) { key = ERR_PTR(ret); goto error; } } - key = context->signal->process_keyring; + key = t->signal->process_keyring; atomic_inc(&key->usage); key_ref = make_key_ref(key, 1); break; case KEY_SPEC_SESSION_KEYRING: - if (!context->signal->session_keyring) { + if (!t->signal->session_keyring) { /* always install a session keyring upon access if one * doesn't exist yet */ - ret = install_user_keyrings(context); + ret = install_user_keyrings(); if (ret < 0) goto error; - ret = install_session_keyring( - context, context->user->session_keyring); + ret = install_session_keyring(t->user->session_keyring); if (ret < 0) goto error; } rcu_read_lock(); - key = rcu_dereference(context->signal->session_keyring); + key = rcu_dereference(t->signal->session_keyring); atomic_inc(&key->usage); rcu_read_unlock(); key_ref = make_key_ref(key, 1); break; case KEY_SPEC_USER_KEYRING: - if (!context->user->uid_keyring) { - ret = install_user_keyrings(context); + if (!t->user->uid_keyring) { + ret = install_user_keyrings(); if (ret < 0) goto error; } - key = context->user->uid_keyring; + key = t->user->uid_keyring; atomic_inc(&key->usage); key_ref = make_key_ref(key, 1); break; case KEY_SPEC_USER_SESSION_KEYRING: - if (!context->user->session_keyring) { - ret = install_user_keyrings(context); + if (!t->user->session_keyring) { + ret = install_user_keyrings(); if (ret < 0) goto error; } - key = context->user->session_keyring; + key = t->user->session_keyring; atomic_inc(&key->usage); key_ref = make_key_ref(key, 1); break; @@ -669,7 +670,7 @@ key_ref_t lookup_user_key(struct task_struct *context, key_serial_t id, goto error; case KEY_SPEC_REQKEY_AUTH_KEY: - key = context->request_key_auth; + key = t->request_key_auth; if (!key) goto error; @@ -677,6 +678,25 @@ key_ref_t lookup_user_key(struct task_struct *context, key_serial_t id, key_ref = make_key_ref(key, 1); break; + case KEY_SPEC_REQUESTOR_KEYRING: + if (!t->request_key_auth) + goto error; + + down_read(&t->request_key_auth->sem); + if (t->request_key_auth->flags & KEY_FLAG_REVOKED) { + key_ref = ERR_PTR(-EKEYREVOKED); + key = NULL; + } else { + rka = t->request_key_auth->payload.data; + key = rka->dest_keyring; + atomic_inc(&key->usage); + } + up_read(&t->request_key_auth->sem); + if (!key) + goto error; + key_ref = make_key_ref(key, 1); + break; + default: key_ref = ERR_PTR(-EINVAL); if (id < 1) @@ -725,7 +745,7 @@ key_ref_t lookup_user_key(struct task_struct *context, key_serial_t id, goto invalid_key; /* check the permissions */ - ret = key_task_permission(key_ref, context, perm); + ret = key_task_permission(key_ref, t, perm); if (ret < 0) goto invalid_key; @@ -754,7 +774,7 @@ long join_session_keyring(const char *name) /* if no name is provided, install an anonymous keyring */ if (!name) { - ret = install_session_keyring(tsk, NULL); + ret = install_session_keyring(NULL); if (ret < 0) goto error; @@ -784,7 +804,7 @@ long join_session_keyring(const char *name) } /* we've got a keyring - now to install it */ - ret = install_session_keyring(tsk, keyring); + ret = install_session_keyring(keyring); if (ret < 0) goto error2; diff --git a/security/keys/request_key.c b/security/keys/request_key.c index 91953c81449..8e9d93b4a40 100644 --- a/security/keys/request_key.c +++ b/security/keys/request_key.c @@ -76,6 +76,10 @@ static int call_sbin_request_key(struct key_construction *cons, kenter("{%d},{%d},%s", key->serial, authkey->serial, op); + ret = install_user_keyrings(); + if (ret < 0) + goto error_alloc; + /* allocate a new session keyring */ sprintf(desc, "_req.%u", key->serial); @@ -165,7 +169,8 @@ error_alloc: * - we ignore program failure and go on key status instead */ static int construct_key(struct key *key, const void *callout_info, - size_t callout_len, void *aux) + size_t callout_len, void *aux, + struct key *dest_keyring) { struct key_construction *cons; request_key_actor_t actor; @@ -179,7 +184,8 @@ static int construct_key(struct key *key, const void *callout_info, return -ENOMEM; /* allocate an authorisation key */ - authkey = request_key_auth_new(key, callout_info, callout_len); + authkey = request_key_auth_new(key, callout_info, callout_len, + dest_keyring); if (IS_ERR(authkey)) { kfree(cons); ret = PTR_ERR(authkey); @@ -207,27 +213,48 @@ static int construct_key(struct key *key, const void *callout_info, } /* - * link a key to the appropriate destination keyring - * - the caller must hold a write lock on the destination keyring + * get the appropriate destination keyring for the request + * - we return whatever keyring we select with an extra reference upon it which + * the caller must release */ -static void construct_key_make_link(struct key *key, struct key *dest_keyring) +static void construct_get_dest_keyring(struct key **_dest_keyring) { + struct request_key_auth *rka; struct task_struct *tsk = current; - struct key *drop = NULL; + struct key *dest_keyring = *_dest_keyring, *authkey; - kenter("{%d},%p", key->serial, dest_keyring); + kenter("%p", dest_keyring); /* find the appropriate keyring */ - if (!dest_keyring) { + if (dest_keyring) { + /* the caller supplied one */ + key_get(dest_keyring); + } else { + /* use a default keyring; falling through the cases until we + * find one that we actually have */ switch (tsk->jit_keyring) { case KEY_REQKEY_DEFL_DEFAULT: + case KEY_REQKEY_DEFL_REQUESTOR_KEYRING: + if (tsk->request_key_auth) { + authkey = tsk->request_key_auth; + down_read(&authkey->sem); + rka = authkey->payload.data; + if (!test_bit(KEY_FLAG_REVOKED, + &authkey->flags)) + dest_keyring = + key_get(rka->dest_keyring); + up_read(&authkey->sem); + if (dest_keyring) + break; + } + case KEY_REQKEY_DEFL_THREAD_KEYRING: - dest_keyring = tsk->thread_keyring; + dest_keyring = key_get(tsk->thread_keyring); if (dest_keyring) break; case KEY_REQKEY_DEFL_PROCESS_KEYRING: - dest_keyring = tsk->signal->process_keyring; + dest_keyring = key_get(tsk->signal->process_keyring); if (dest_keyring) break; @@ -236,17 +263,16 @@ static void construct_key_make_link(struct key *key, struct key *dest_keyring) dest_keyring = key_get( rcu_dereference(tsk->signal->session_keyring)); rcu_read_unlock(); - drop = dest_keyring; if (dest_keyring) break; case KEY_REQKEY_DEFL_USER_SESSION_KEYRING: - dest_keyring = tsk->user->session_keyring; + dest_keyring = key_get(tsk->user->session_keyring); break; case KEY_REQKEY_DEFL_USER_KEYRING: - dest_keyring = tsk->user->uid_keyring; + dest_keyring = key_get(tsk->user->uid_keyring); break; case KEY_REQKEY_DEFL_GROUP_KEYRING: @@ -255,10 +281,9 @@ static void construct_key_make_link(struct key *key, struct key *dest_keyring) } } - /* and attach the key to it */ - __key_link(dest_keyring, key); - key_put(drop); - kleave(""); + *_dest_keyring = dest_keyring; + kleave(" [dk %d]", key_serial(dest_keyring)); + return; } /* @@ -288,8 +313,7 @@ static int construct_alloc_key(struct key_type *type, set_bit(KEY_FLAG_USER_CONSTRUCT, &key->flags); - if (dest_keyring) - down_write(&dest_keyring->sem); + down_write(&dest_keyring->sem); /* attach the key to the destination keyring under lock, but we do need * to do another check just in case someone beat us to it whilst we @@ -301,12 +325,10 @@ static int construct_alloc_key(struct key_type *type, if (!IS_ERR(key_ref)) goto key_already_present; - if (dest_keyring) - construct_key_make_link(key, dest_keyring); + __key_link(dest_keyring, key); mutex_unlock(&key_construction_mutex); - if (dest_keyring) - up_write(&dest_keyring->sem); + up_write(&dest_keyring->sem); mutex_unlock(&user->cons_lock); *_key = key; kleave(" = 0 [%d]", key_serial(key)); @@ -348,21 +370,26 @@ static struct key *construct_key_and_link(struct key_type *type, if (!user) return ERR_PTR(-ENOMEM); + construct_get_dest_keyring(&dest_keyring); + ret = construct_alloc_key(type, description, dest_keyring, flags, user, &key); key_user_put(user); if (ret == 0) { - ret = construct_key(key, callout_info, callout_len, aux); + ret = construct_key(key, callout_info, callout_len, aux, + dest_keyring); if (ret < 0) goto construction_failed; } + key_put(dest_keyring); return key; construction_failed: key_negate_and_link(key, key_negative_timeout, NULL, NULL); key_put(key); + key_put(dest_keyring); return ERR_PTR(ret); } diff --git a/security/keys/request_key_auth.c b/security/keys/request_key_auth.c index 729156b3485..1762d44711d 100644 --- a/security/keys/request_key_auth.c +++ b/security/keys/request_key_auth.c @@ -128,6 +128,7 @@ static void request_key_auth_destroy(struct key *key) } key_put(rka->target_key); + key_put(rka->dest_keyring); kfree(rka->callout_info); kfree(rka); @@ -139,7 +140,7 @@ static void request_key_auth_destroy(struct key *key) * access to the caller's security data */ struct key *request_key_auth_new(struct key *target, const void *callout_info, - size_t callout_len) + size_t callout_len, struct key *dest_keyring) { struct request_key_auth *rka, *irka; struct key *authkey = NULL; @@ -188,6 +189,7 @@ struct key *request_key_auth_new(struct key *target, const void *callout_info, } rka->target_key = key_get(target); + rka->dest_keyring = key_get(dest_keyring); memcpy(rka->callout_info, callout_info, callout_len); rka->callout_len = callout_len; @@ -223,6 +225,7 @@ error_inst: key_put(authkey); error_alloc: key_put(rka->target_key); + key_put(rka->dest_keyring); kfree(rka->callout_info); kfree(rka); kleave("= %d", ret); -- cgit v1.2.3-70-g09d2 From 1cdcbec1a3372c0c49c59d292e708fd07b509f18 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Nov 2008 10:39:14 +1100 Subject: CRED: Neuter sys_capset() Take away the ability for sys_capset() to affect processes other than current. This means that current will not need to lock its own credentials when reading them against interference by other processes. This has effectively been the case for a while anyway, since: (1) Without LSM enabled, sys_capset() is disallowed. (2) With file-based capabilities, sys_capset() is neutered. Signed-off-by: David Howells Acked-by: Serge Hallyn Acked-by: Andrew G. Morgan Acked-by: James Morris Signed-off-by: James Morris --- fs/open.c | 12 +-- include/linux/security.h | 48 ++++------ kernel/capability.c | 227 +++++------------------------------------------ security/commoncap.c | 29 ++---- security/security.c | 18 ++-- security/selinux/hooks.c | 10 +-- 6 files changed, 61 insertions(+), 283 deletions(-) (limited to 'kernel') diff --git a/fs/open.c b/fs/open.c index 83cdb9dee0c..500cc0c5476 100644 --- a/fs/open.c +++ b/fs/open.c @@ -441,17 +441,7 @@ asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode) current->fsgid = current->gid; if (!issecure(SECURE_NO_SETUID_FIXUP)) { - /* - * Clear the capabilities if we switch to a non-root user - */ -#ifndef CONFIG_SECURITY_FILE_CAPABILITIES - /* - * FIXME: There is a race here against sys_capset. The - * capabilities can change yet we will restore the old - * value below. We should hold task_capabilities_lock, - * but we cannot because user_path_at can sleep. - */ -#endif /* ndef CONFIG_SECURITY_FILE_CAPABILITIES */ + /* Clear the capabilities if we switch to a non-root user */ if (current->uid) old_cap = cap_set_effective(__cap_empty_set); else diff --git a/include/linux/security.h b/include/linux/security.h index 5fe28a671cd..d1ce8beddbd 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -53,8 +53,8 @@ extern int cap_settime(struct timespec *ts, struct timezone *tz); extern int cap_ptrace_may_access(struct task_struct *child, unsigned int mode); extern int cap_ptrace_traceme(struct task_struct *parent); extern int cap_capget(struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted); -extern int cap_capset_check(struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted); -extern void cap_capset_set(struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted); +extern int cap_capset_check(kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted); +extern void cap_capset_set(kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted); extern int cap_bprm_set_security(struct linux_binprm *bprm); extern void cap_bprm_apply_creds(struct linux_binprm *bprm, int unsafe); extern int cap_bprm_secureexec(struct linux_binprm *bprm); @@ -1191,24 +1191,14 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * Return 0 if the capability sets were successfully obtained. * @capset_check: * Check permission before setting the @effective, @inheritable, and - * @permitted capability sets for the @target process. - * Caveat: @target is also set to current if a set of processes is - * specified (i.e. all processes other than current and init or a - * particular process group). Hence, the capset_set hook may need to - * revalidate permission to the actual target process. - * @target contains the task_struct structure for target process. + * @permitted capability sets for the current process. * @effective contains the effective capability set. * @inheritable contains the inheritable capability set. * @permitted contains the permitted capability set. * Return 0 if permission is granted. * @capset_set: * Set the @effective, @inheritable, and @permitted capability sets for - * the @target process. Since capset_check cannot always check permission - * to the real @target process, this hook may also perform permission - * checking to determine if the current process is allowed to set the - * capability sets of the @target process. However, this hook has no way - * of returning an error due to the structure of the sys_capset code. - * @target contains the task_struct structure for target process. + * the current process. * @effective contains the effective capability set. * @inheritable contains the inheritable capability set. * @permitted contains the permitted capability set. @@ -1303,12 +1293,10 @@ struct security_operations { int (*capget) (struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted); - int (*capset_check) (struct task_struct *target, - kernel_cap_t *effective, + int (*capset_check) (kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted); - void (*capset_set) (struct task_struct *target, - kernel_cap_t *effective, + void (*capset_set) (kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted); int (*capable) (struct task_struct *tsk, int cap, int audit); @@ -1572,12 +1560,10 @@ int security_capget(struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted); -int security_capset_check(struct task_struct *target, - kernel_cap_t *effective, +int security_capset_check(kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted); -void security_capset_set(struct task_struct *target, - kernel_cap_t *effective, +void security_capset_set(kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted); int security_capable(struct task_struct *tsk, int cap); @@ -1769,20 +1755,18 @@ static inline int security_capget(struct task_struct *target, return cap_capget(target, effective, inheritable, permitted); } -static inline int security_capset_check(struct task_struct *target, - kernel_cap_t *effective, - kernel_cap_t *inheritable, - kernel_cap_t *permitted) +static inline int security_capset_check(kernel_cap_t *effective, + kernel_cap_t *inheritable, + kernel_cap_t *permitted) { - return cap_capset_check(target, effective, inheritable, permitted); + return cap_capset_check(effective, inheritable, permitted); } -static inline void security_capset_set(struct task_struct *target, - kernel_cap_t *effective, - kernel_cap_t *inheritable, - kernel_cap_t *permitted) +static inline void security_capset_set(kernel_cap_t *effective, + kernel_cap_t *inheritable, + kernel_cap_t *permitted) { - cap_capset_set(target, effective, inheritable, permitted); + cap_capset_set(effective, inheritable, permitted); } static inline int security_capable(struct task_struct *tsk, int cap) diff --git a/kernel/capability.c b/kernel/capability.c index adb262f83de..58b00519624 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -127,160 +127,6 @@ static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy) return 0; } -#ifndef CONFIG_SECURITY_FILE_CAPABILITIES - -/* - * Without filesystem capability support, we nominally support one process - * setting the capabilities of another - */ -static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp, - kernel_cap_t *pIp, kernel_cap_t *pPp) -{ - struct task_struct *target; - int ret; - - spin_lock(&task_capability_lock); - read_lock(&tasklist_lock); - - if (pid && pid != task_pid_vnr(current)) { - target = find_task_by_vpid(pid); - if (!target) { - ret = -ESRCH; - goto out; - } - } else - target = current; - - ret = security_capget(target, pEp, pIp, pPp); - -out: - read_unlock(&tasklist_lock); - spin_unlock(&task_capability_lock); - - return ret; -} - -/* - * cap_set_pg - set capabilities for all processes in a given process - * group. We call this holding task_capability_lock and tasklist_lock. - */ -static inline int cap_set_pg(int pgrp_nr, kernel_cap_t *effective, - kernel_cap_t *inheritable, - kernel_cap_t *permitted) -{ - struct task_struct *g, *target; - int ret = -EPERM; - int found = 0; - struct pid *pgrp; - - spin_lock(&task_capability_lock); - read_lock(&tasklist_lock); - - pgrp = find_vpid(pgrp_nr); - do_each_pid_task(pgrp, PIDTYPE_PGID, g) { - target = g; - while_each_thread(g, target) { - if (!security_capset_check(target, effective, - inheritable, permitted)) { - security_capset_set(target, effective, - inheritable, permitted); - ret = 0; - } - found = 1; - } - } while_each_pid_task(pgrp, PIDTYPE_PGID, g); - - read_unlock(&tasklist_lock); - spin_unlock(&task_capability_lock); - - if (!found) - ret = 0; - return ret; -} - -/* - * cap_set_all - set capabilities for all processes other than init - * and self. We call this holding task_capability_lock and tasklist_lock. - */ -static inline int cap_set_all(kernel_cap_t *effective, - kernel_cap_t *inheritable, - kernel_cap_t *permitted) -{ - struct task_struct *g, *target; - int ret = -EPERM; - int found = 0; - - spin_lock(&task_capability_lock); - read_lock(&tasklist_lock); - - do_each_thread(g, target) { - if (target == current - || is_container_init(target->group_leader)) - continue; - found = 1; - if (security_capset_check(target, effective, inheritable, - permitted)) - continue; - ret = 0; - security_capset_set(target, effective, inheritable, permitted); - } while_each_thread(g, target); - - read_unlock(&tasklist_lock); - spin_unlock(&task_capability_lock); - - if (!found) - ret = 0; - - return ret; -} - -/* - * Given the target pid does not refer to the current process we - * need more elaborate support... (This support is not present when - * filesystem capabilities are configured.) - */ -static inline int do_sys_capset_other_tasks(pid_t pid, kernel_cap_t *effective, - kernel_cap_t *inheritable, - kernel_cap_t *permitted) -{ - struct task_struct *target; - int ret; - - if (!capable(CAP_SETPCAP)) - return -EPERM; - - if (pid == -1) /* all procs other than current and init */ - return cap_set_all(effective, inheritable, permitted); - - else if (pid < 0) /* all procs in process group */ - return cap_set_pg(-pid, effective, inheritable, permitted); - - /* target != current */ - spin_lock(&task_capability_lock); - read_lock(&tasklist_lock); - - target = find_task_by_vpid(pid); - if (!target) - ret = -ESRCH; - else { - ret = security_capset_check(target, effective, inheritable, - permitted); - - /* having verified that the proposed changes are legal, - we now put them into effect. */ - if (!ret) - security_capset_set(target, effective, inheritable, - permitted); - } - - read_unlock(&tasklist_lock); - spin_unlock(&task_capability_lock); - - return ret; -} - -#else /* ie., def CONFIG_SECURITY_FILE_CAPABILITIES */ - /* * If we have configured with filesystem capability support, then the * only thing that can change the capabilities of the current process @@ -314,22 +160,6 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp, return ret; } -/* - * With filesystem capability support configured, the kernel does not - * permit the changing of capabilities in one process by another - * process. (CAP_SETPCAP has much less broad semantics when configured - * this way.) - */ -static inline int do_sys_capset_other_tasks(pid_t pid, - kernel_cap_t *effective, - kernel_cap_t *inheritable, - kernel_cap_t *permitted) -{ - return -EPERM; -} - -#endif /* ie., ndef CONFIG_SECURITY_FILE_CAPABILITIES */ - /* * Atomically modify the effective capabilities returning the original * value. No permission check is performed here - it is assumed that the @@ -424,16 +254,14 @@ asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr) * @data: pointer to struct that contains the effective, permitted, * and inheritable capabilities * - * Set capabilities for a given process, all processes, or all - * processes in a given process group. + * Set capabilities for the current process only. The ability to any other + * process(es) has been deprecated and removed. * * The restrictions on setting capabilities are specified as: * - * [pid is for the 'target' task. 'current' is the calling task.] - * - * I: any raised capabilities must be a subset of the (old current) permitted - * P: any raised capabilities must be a subset of the (old current) permitted - * E: must be set to a subset of (new target) permitted + * I: any raised capabilities must be a subset of the old permitted + * P: any raised capabilities must be a subset of the old permitted + * E: must be set to a subset of new permitted * * Returns 0 on success and < 0 on error. */ @@ -452,10 +280,13 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data) if (get_user(pid, &header->pid)) return -EFAULT; + /* may only affect current now */ + if (pid != 0 && pid != task_pid_vnr(current)) + return -EPERM; + if (copy_from_user(&kdata, data, tocopy - * sizeof(struct __user_cap_data_struct))) { + * sizeof(struct __user_cap_data_struct))) return -EFAULT; - } for (i = 0; i < tocopy; i++) { effective.cap[i] = kdata[i].effective; @@ -473,32 +304,20 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data) if (ret) return ret; - if (pid && (pid != task_pid_vnr(current))) - ret = do_sys_capset_other_tasks(pid, &effective, &inheritable, - &permitted); - else { - /* - * This lock is required even when filesystem - * capability support is configured - it protects the - * sys_capget() call from returning incorrect data in - * the case that the targeted process is not the - * current one. - */ - spin_lock(&task_capability_lock); - - ret = security_capset_check(current, &effective, &inheritable, - &permitted); - /* - * Having verified that the proposed changes are - * legal, we now put them into effect. - */ - if (!ret) - security_capset_set(current, &effective, &inheritable, - &permitted); - spin_unlock(&task_capability_lock); - } - + /* This lock is required even when filesystem capability support is + * configured - it protects the sys_capget() call from returning + * incorrect data in the case that the targeted process is not the + * current one. + */ + spin_lock(&task_capability_lock); + ret = security_capset_check(&effective, &inheritable, &permitted); + /* Having verified that the proposed changes are legal, we now put them + * into effect. + */ + if (!ret) + security_capset_set(&effective, &inheritable, &permitted); + spin_unlock(&task_capability_lock); return ret; } diff --git a/security/commoncap.c b/security/commoncap.c index 8283271f076..e3f36ef629f 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -96,15 +96,6 @@ int cap_capget (struct task_struct *target, kernel_cap_t *effective, #ifdef CONFIG_SECURITY_FILE_CAPABILITIES -static inline int cap_block_setpcap(struct task_struct *target) -{ - /* - * No support for remote process capability manipulation with - * filesystem capability support. - */ - return (target != current); -} - static inline int cap_inh_is_capped(void) { /* @@ -119,7 +110,6 @@ static inline int cap_limit_ptraced_target(void) { return 1; } #else /* ie., ndef CONFIG_SECURITY_FILE_CAPABILITIES */ -static inline int cap_block_setpcap(struct task_struct *t) { return 0; } static inline int cap_inh_is_capped(void) { return 1; } static inline int cap_limit_ptraced_target(void) { @@ -128,21 +118,18 @@ static inline int cap_limit_ptraced_target(void) #endif /* def CONFIG_SECURITY_FILE_CAPABILITIES */ -int cap_capset_check (struct task_struct *target, kernel_cap_t *effective, +int cap_capset_check (kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted) { - if (cap_block_setpcap(target)) { - return -EPERM; - } if (cap_inh_is_capped() && !cap_issubset(*inheritable, - cap_combine(target->cap_inheritable, + cap_combine(current->cap_inheritable, current->cap_permitted))) { /* incapable of using this inheritable set */ return -EPERM; } if (!cap_issubset(*inheritable, - cap_combine(target->cap_inheritable, + cap_combine(current->cap_inheritable, current->cap_bset))) { /* no new pI capabilities outside bounding set */ return -EPERM; @@ -150,7 +137,7 @@ int cap_capset_check (struct task_struct *target, kernel_cap_t *effective, /* verify restrictions on target's new Permitted set */ if (!cap_issubset (*permitted, - cap_combine (target->cap_permitted, + cap_combine (current->cap_permitted, current->cap_permitted))) { return -EPERM; } @@ -163,12 +150,12 @@ int cap_capset_check (struct task_struct *target, kernel_cap_t *effective, return 0; } -void cap_capset_set (struct task_struct *target, kernel_cap_t *effective, +void cap_capset_set (kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted) { - target->cap_effective = *effective; - target->cap_inheritable = *inheritable; - target->cap_permitted = *permitted; + current->cap_effective = *effective; + current->cap_inheritable = *inheritable; + current->cap_permitted = *permitted; } static inline void bprm_clear_caps(struct linux_binprm *bprm) diff --git a/security/security.c b/security/security.c index 346f21e0ec2..dca37381e2a 100644 --- a/security/security.c +++ b/security/security.c @@ -145,20 +145,18 @@ int security_capget(struct task_struct *target, return security_ops->capget(target, effective, inheritable, permitted); } -int security_capset_check(struct task_struct *target, - kernel_cap_t *effective, - kernel_cap_t *inheritable, - kernel_cap_t *permitted) +int security_capset_check(kernel_cap_t *effective, + kernel_cap_t *inheritable, + kernel_cap_t *permitted) { - return security_ops->capset_check(target, effective, inheritable, permitted); + return security_ops->capset_check(effective, inheritable, permitted); } -void security_capset_set(struct task_struct *target, - kernel_cap_t *effective, - kernel_cap_t *inheritable, - kernel_cap_t *permitted) +void security_capset_set(kernel_cap_t *effective, + kernel_cap_t *inheritable, + kernel_cap_t *permitted) { - security_ops->capset_set(target, effective, inheritable, permitted); + security_ops->capset_set(effective, inheritable, permitted); } int security_capable(struct task_struct *tsk, int cap) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 378dc53c08e..df9986940e9 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -1790,22 +1790,22 @@ static int selinux_capget(struct task_struct *target, kernel_cap_t *effective, return secondary_ops->capget(target, effective, inheritable, permitted); } -static int selinux_capset_check(struct task_struct *target, kernel_cap_t *effective, +static int selinux_capset_check(kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted) { int error; - error = secondary_ops->capset_check(target, effective, inheritable, permitted); + error = secondary_ops->capset_check(effective, inheritable, permitted); if (error) return error; - return task_has_perm(current, target, PROCESS__SETCAP); + return task_has_perm(current, current, PROCESS__SETCAP); } -static void selinux_capset_set(struct task_struct *target, kernel_cap_t *effective, +static void selinux_capset_set(kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted) { - secondary_ops->capset_set(target, effective, inheritable, permitted); + secondary_ops->capset_set(effective, inheritable, permitted); } static int selinux_capable(struct task_struct *tsk, int cap, int audit) -- cgit v1.2.3-70-g09d2 From b6dff3ec5e116e3af6f537d4caedcad6b9e5082a Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Nov 2008 10:39:16 +1100 Subject: CRED: Separate task security context from task_struct Separate the task security context from task_struct. At this point, the security data is temporarily embedded in the task_struct with two pointers pointing to it. Note that the Alpha arch is altered as it refers to (E)UID and (E)GID in entry.S via asm-offsets. With comment fixes Signed-off-by: Marc Dionne Signed-off-by: David Howells Acked-by: James Morris Acked-by: Serge Hallyn Signed-off-by: James Morris --- arch/alpha/kernel/asm-offsets.c | 11 +- arch/alpha/kernel/entry.S | 10 +- arch/ia64/ia32/sys_ia32.c | 8 +- arch/mips/kernel/kspd.c | 4 +- arch/s390/kernel/compat_linux.c | 28 ++--- drivers/connector/cn_proc.c | 8 +- fs/binfmt_elf.c | 12 +- fs/binfmt_elf_fdpic.c | 12 +- fs/exec.c | 4 +- fs/fcntl.c | 4 +- fs/file_table.c | 4 +- fs/fuse/dir.c | 12 +- fs/hugetlbfs/inode.c | 4 +- fs/ioprio.c | 12 +- fs/nfsd/auth.c | 22 ++-- fs/nfsd/nfs4recover.c | 12 +- fs/nfsd/nfsfh.c | 6 +- fs/open.c | 17 +-- fs/proc/array.c | 18 +-- fs/proc/base.c | 16 +-- fs/xfs/linux-2.6/xfs_cred.h | 6 +- fs/xfs/linux-2.6/xfs_globals.h | 2 +- fs/xfs/xfs_inode.h | 2 +- fs/xfs/xfs_vnodeops.h | 10 +- include/linux/cred.h | 155 +++++++++++++++++++---- include/linux/init_task.h | 24 ++-- include/linux/sched.h | 52 +------- include/linux/securebits.h | 2 +- ipc/mqueue.c | 2 +- ipc/shm.c | 4 +- kernel/auditsc.c | 52 ++++---- kernel/capability.c | 4 +- kernel/cgroup.c | 4 +- kernel/exit.c | 10 +- kernel/fork.c | 24 ++-- kernel/futex.c | 6 +- kernel/futex_compat.c | 5 +- kernel/ptrace.c | 19 +-- kernel/sched.c | 10 +- kernel/signal.c | 16 +-- kernel/sys.c | 266 ++++++++++++++++++++++----------------- kernel/trace/trace.c | 2 +- kernel/tsacct.c | 4 +- kernel/uid16.c | 28 ++--- kernel/user.c | 4 +- mm/mempolicy.c | 10 +- mm/migrate.c | 10 +- mm/oom_kill.c | 2 +- net/core/scm.c | 10 +- net/sunrpc/auth.c | 2 +- security/commoncap.c | 161 +++++++++++++----------- security/keys/keyctl.c | 25 ++-- security/keys/permission.c | 11 +- security/keys/process_keys.c | 98 ++++++++------- security/keys/request_key.c | 18 +-- security/keys/request_key_auth.c | 12 +- security/selinux/exports.c | 2 +- security/selinux/hooks.c | 116 ++++++++--------- security/selinux/selinuxfs.c | 2 +- security/selinux/xfrm.c | 6 +- security/smack/smack_access.c | 4 +- security/smack/smack_lsm.c | 77 ++++++------ security/smack/smackfs.c | 6 +- 63 files changed, 832 insertions(+), 677 deletions(-) (limited to 'kernel') diff --git a/arch/alpha/kernel/asm-offsets.c b/arch/alpha/kernel/asm-offsets.c index 4b18cd94d59..6ff8886e7e2 100644 --- a/arch/alpha/kernel/asm-offsets.c +++ b/arch/alpha/kernel/asm-offsets.c @@ -19,15 +19,18 @@ void foo(void) BLANK(); DEFINE(TASK_BLOCKED, offsetof(struct task_struct, blocked)); - DEFINE(TASK_UID, offsetof(struct task_struct, uid)); - DEFINE(TASK_EUID, offsetof(struct task_struct, euid)); - DEFINE(TASK_GID, offsetof(struct task_struct, gid)); - DEFINE(TASK_EGID, offsetof(struct task_struct, egid)); + DEFINE(TASK_CRED, offsetof(struct task_struct, cred)); DEFINE(TASK_REAL_PARENT, offsetof(struct task_struct, real_parent)); DEFINE(TASK_GROUP_LEADER, offsetof(struct task_struct, group_leader)); DEFINE(TASK_TGID, offsetof(struct task_struct, tgid)); BLANK(); + DEFINE(CRED_UID, offsetof(struct cred, uid)); + DEFINE(CRED_EUID, offsetof(struct cred, euid)); + DEFINE(CRED_GID, offsetof(struct cred, gid)); + DEFINE(CRED_EGID, offsetof(struct cred, egid)); + BLANK(); + DEFINE(SIZEOF_PT_REGS, sizeof(struct pt_regs)); DEFINE(PT_PTRACED, PT_PTRACED); DEFINE(CLONE_VM, CLONE_VM); diff --git a/arch/alpha/kernel/entry.S b/arch/alpha/kernel/entry.S index 5fc61e281ac..f77345bc66a 100644 --- a/arch/alpha/kernel/entry.S +++ b/arch/alpha/kernel/entry.S @@ -850,8 +850,9 @@ osf_getpriority: sys_getxuid: .prologue 0 ldq $2, TI_TASK($8) - ldl $0, TASK_UID($2) - ldl $1, TASK_EUID($2) + ldq $3, TASK_CRED($2) + ldl $0, CRED_UID($3) + ldl $1, CRED_EUID($3) stq $1, 80($sp) ret .end sys_getxuid @@ -862,8 +863,9 @@ sys_getxuid: sys_getxgid: .prologue 0 ldq $2, TI_TASK($8) - ldl $0, TASK_GID($2) - ldl $1, TASK_EGID($2) + ldq $3, TASK_CRED($2) + ldl $0, CRED_GID($3) + ldl $1, CRED_EGID($3) stq $1, 80($sp) ret .end sys_getxgid diff --git a/arch/ia64/ia32/sys_ia32.c b/arch/ia64/ia32/sys_ia32.c index 5e92ae00bdb..2445a9d3488 100644 --- a/arch/ia64/ia32/sys_ia32.c +++ b/arch/ia64/ia32/sys_ia32.c @@ -1772,20 +1772,20 @@ sys32_getgroups16 (int gidsetsize, short __user *grouplist) if (gidsetsize < 0) return -EINVAL; - get_group_info(current->group_info); - i = current->group_info->ngroups; + get_group_info(current->cred->group_info); + i = current->cred->group_info->ngroups; if (gidsetsize) { if (i > gidsetsize) { i = -EINVAL; goto out; } - if (groups16_to_user(grouplist, current->group_info)) { + if (groups16_to_user(grouplist, current->cred->group_info)) { i = -EFAULT; goto out; } } out: - put_group_info(current->group_info); + put_group_info(current->cred->group_info); return i; } diff --git a/arch/mips/kernel/kspd.c b/arch/mips/kernel/kspd.c index b0591ae0ce5..fd6e5122403 100644 --- a/arch/mips/kernel/kspd.c +++ b/arch/mips/kernel/kspd.c @@ -174,8 +174,8 @@ static unsigned int translate_open_flags(int flags) static void sp_setfsuidgid( uid_t uid, gid_t gid) { - current->fsuid = uid; - current->fsgid = gid; + current->cred->fsuid = uid; + current->cred->fsgid = gid; key_fsuid_changed(current); key_fsgid_changed(current); diff --git a/arch/s390/kernel/compat_linux.c b/arch/s390/kernel/compat_linux.c index 4646382af34..6cc87d8c868 100644 --- a/arch/s390/kernel/compat_linux.c +++ b/arch/s390/kernel/compat_linux.c @@ -148,9 +148,9 @@ asmlinkage long sys32_getresuid16(u16 __user *ruid, u16 __user *euid, u16 __user { int retval; - if (!(retval = put_user(high2lowuid(current->uid), ruid)) && - !(retval = put_user(high2lowuid(current->euid), euid))) - retval = put_user(high2lowuid(current->suid), suid); + if (!(retval = put_user(high2lowuid(current->cred->uid), ruid)) && + !(retval = put_user(high2lowuid(current->cred->euid), euid))) + retval = put_user(high2lowuid(current->cred->suid), suid); return retval; } @@ -165,9 +165,9 @@ asmlinkage long sys32_getresgid16(u16 __user *rgid, u16 __user *egid, u16 __user { int retval; - if (!(retval = put_user(high2lowgid(current->gid), rgid)) && - !(retval = put_user(high2lowgid(current->egid), egid))) - retval = put_user(high2lowgid(current->sgid), sgid); + if (!(retval = put_user(high2lowgid(current->cred->gid), rgid)) && + !(retval = put_user(high2lowgid(current->cred->egid), egid))) + retval = put_user(high2lowgid(current->cred->sgid), sgid); return retval; } @@ -217,20 +217,20 @@ asmlinkage long sys32_getgroups16(int gidsetsize, u16 __user *grouplist) if (gidsetsize < 0) return -EINVAL; - get_group_info(current->group_info); - i = current->group_info->ngroups; + get_group_info(current->cred->group_info); + i = current->cred->group_info->ngroups; if (gidsetsize) { if (i > gidsetsize) { i = -EINVAL; goto out; } - if (groups16_to_user(grouplist, current->group_info)) { + if (groups16_to_user(grouplist, current->cred->group_info)) { i = -EFAULT; goto out; } } out: - put_group_info(current->group_info); + put_group_info(current->cred->group_info); return i; } @@ -261,22 +261,22 @@ asmlinkage long sys32_setgroups16(int gidsetsize, u16 __user *grouplist) asmlinkage long sys32_getuid16(void) { - return high2lowuid(current->uid); + return high2lowuid(current->cred->uid); } asmlinkage long sys32_geteuid16(void) { - return high2lowuid(current->euid); + return high2lowuid(current->cred->euid); } asmlinkage long sys32_getgid16(void) { - return high2lowgid(current->gid); + return high2lowgid(current->cred->gid); } asmlinkage long sys32_getegid16(void) { - return high2lowgid(current->egid); + return high2lowgid(current->cred->egid); } /* diff --git a/drivers/connector/cn_proc.c b/drivers/connector/cn_proc.c index 5c9f67f98d1..354c1ff1715 100644 --- a/drivers/connector/cn_proc.c +++ b/drivers/connector/cn_proc.c @@ -116,11 +116,11 @@ void proc_id_connector(struct task_struct *task, int which_id) ev->event_data.id.process_pid = task->pid; ev->event_data.id.process_tgid = task->tgid; if (which_id == PROC_EVENT_UID) { - ev->event_data.id.r.ruid = task->uid; - ev->event_data.id.e.euid = task->euid; + ev->event_data.id.r.ruid = task->cred->uid; + ev->event_data.id.e.euid = task->cred->euid; } else if (which_id == PROC_EVENT_GID) { - ev->event_data.id.r.rgid = task->gid; - ev->event_data.id.e.egid = task->egid; + ev->event_data.id.r.rgid = task->cred->gid; + ev->event_data.id.e.egid = task->cred->egid; } else return; get_seq(&msg->seq, &ev->cpu); diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 8fcfa398d35..7a52477ce49 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -223,10 +223,10 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, NEW_AUX_ENT(AT_BASE, interp_load_addr); NEW_AUX_ENT(AT_FLAGS, 0); NEW_AUX_ENT(AT_ENTRY, exec->e_entry); - NEW_AUX_ENT(AT_UID, tsk->uid); - NEW_AUX_ENT(AT_EUID, tsk->euid); - NEW_AUX_ENT(AT_GID, tsk->gid); - NEW_AUX_ENT(AT_EGID, tsk->egid); + NEW_AUX_ENT(AT_UID, tsk->cred->uid); + NEW_AUX_ENT(AT_EUID, tsk->cred->euid); + NEW_AUX_ENT(AT_GID, tsk->cred->gid); + NEW_AUX_ENT(AT_EGID, tsk->cred->egid); NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm)); NEW_AUX_ENT(AT_EXECFN, bprm->exec); if (k_platform) { @@ -1388,8 +1388,8 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, psinfo->pr_zomb = psinfo->pr_sname == 'Z'; psinfo->pr_nice = task_nice(p); psinfo->pr_flag = p->flags; - SET_UID(psinfo->pr_uid, p->uid); - SET_GID(psinfo->pr_gid, p->gid); + SET_UID(psinfo->pr_uid, p->cred->uid); + SET_GID(psinfo->pr_gid, p->cred->gid); strncpy(psinfo->pr_fname, p->comm, sizeof(psinfo->pr_fname)); return 0; diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 488584c8751..9f67054c2c4 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -623,10 +623,10 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, NEW_AUX_ENT(AT_BASE, interp_params->elfhdr_addr); NEW_AUX_ENT(AT_FLAGS, 0); NEW_AUX_ENT(AT_ENTRY, exec_params->entry_addr); - NEW_AUX_ENT(AT_UID, (elf_addr_t) current_uid()); - NEW_AUX_ENT(AT_EUID, (elf_addr_t) current_euid()); - NEW_AUX_ENT(AT_GID, (elf_addr_t) current_gid()); - NEW_AUX_ENT(AT_EGID, (elf_addr_t) current_egid()); + NEW_AUX_ENT(AT_UID, (elf_addr_t) current->cred->uid); + NEW_AUX_ENT(AT_EUID, (elf_addr_t) current->cred->euid); + NEW_AUX_ENT(AT_GID, (elf_addr_t) current->cred->gid); + NEW_AUX_ENT(AT_EGID, (elf_addr_t) current->cred->egid); NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm)); NEW_AUX_ENT(AT_EXECFN, bprm->exec); @@ -1440,8 +1440,8 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, psinfo->pr_zomb = psinfo->pr_sname == 'Z'; psinfo->pr_nice = task_nice(p); psinfo->pr_flag = p->flags; - SET_UID(psinfo->pr_uid, p->uid); - SET_GID(psinfo->pr_gid, p->gid); + SET_UID(psinfo->pr_uid, p->cred->uid); + SET_GID(psinfo->pr_gid, p->cred->gid); strncpy(psinfo->pr_fname, p->comm, sizeof(psinfo->pr_fname)); return 0; diff --git a/fs/exec.c b/fs/exec.c index 604834f3b20..31149e430a8 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1738,7 +1738,7 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs) */ if (get_dumpable(mm) == 2) { /* Setuid core dump mode */ flag = O_EXCL; /* Stop rewrite attacks */ - current->fsuid = 0; /* Dump root private */ + current->cred->fsuid = 0; /* Dump root private */ } retval = coredump_wait(exit_code, &core_state); @@ -1834,7 +1834,7 @@ fail_unlock: if (helper_argv) argv_free(helper_argv); - current->fsuid = fsuid; + current->cred->fsuid = fsuid; coredump_finish(mm); fail: return retval; diff --git a/fs/fcntl.c b/fs/fcntl.c index bf049a805e5..63964d863ad 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -401,8 +401,8 @@ static inline int sigio_perm(struct task_struct *p, struct fown_struct *fown, int sig) { return (((fown->euid == 0) || - (fown->euid == p->suid) || (fown->euid == p->uid) || - (fown->uid == p->suid) || (fown->uid == p->uid)) && + (fown->euid == p->cred->suid) || (fown->euid == p->cred->uid) || + (fown->uid == p->cred->suid) || (fown->uid == p->cred->uid)) && !security_file_send_sigiotask(p, fown, sig)); } diff --git a/fs/file_table.c b/fs/file_table.c index 5ad0eca6eea..3152b53cfab 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -122,8 +122,8 @@ struct file *get_empty_filp(void) INIT_LIST_HEAD(&f->f_u.fu_list); atomic_long_set(&f->f_count, 1); rwlock_init(&f->f_owner.lock); - f->f_uid = tsk->fsuid; - f->f_gid = tsk->fsgid; + f->f_uid = tsk->cred->fsuid; + f->f_gid = tsk->cred->fsgid; eventpoll_init_file(f); /* f->f_version: 0 */ return f; diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index fd03330cade..e97a9898186 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -872,12 +872,12 @@ int fuse_allow_task(struct fuse_conn *fc, struct task_struct *task) if (fc->flags & FUSE_ALLOW_OTHER) return 1; - if (task->euid == fc->user_id && - task->suid == fc->user_id && - task->uid == fc->user_id && - task->egid == fc->group_id && - task->sgid == fc->group_id && - task->gid == fc->group_id) + if (task->cred->euid == fc->user_id && + task->cred->suid == fc->user_id && + task->cred->uid == fc->user_id && + task->cred->egid == fc->group_id && + task->cred->sgid == fc->group_id && + task->cred->gid == fc->group_id) return 1; return 0; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 08ad76c79b4..870a721b8bd 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -958,7 +958,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size) if (!can_do_hugetlb_shm()) return ERR_PTR(-EPERM); - if (!user_shm_lock(size, current->user)) + if (!user_shm_lock(size, current->cred->user)) return ERR_PTR(-ENOMEM); root = hugetlbfs_vfsmount->mnt_root; @@ -998,7 +998,7 @@ out_inode: out_dentry: dput(dentry); out_shm_unlock: - user_shm_unlock(size, current->user); + user_shm_unlock(size, current->cred->user); return ERR_PTR(error); } diff --git a/fs/ioprio.c b/fs/ioprio.c index 68d2cd80711..bb5210af77c 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c @@ -32,8 +32,8 @@ static int set_task_ioprio(struct task_struct *task, int ioprio) int err; struct io_context *ioc; - if (task->uid != current_euid() && - task->uid != current_uid() && !capable(CAP_SYS_NICE)) + if (task->cred->uid != current_euid() && + task->cred->uid != current_uid() && !capable(CAP_SYS_NICE)) return -EPERM; err = security_task_setioprio(task, ioprio); @@ -123,7 +123,7 @@ asmlinkage long sys_ioprio_set(int which, int who, int ioprio) break; case IOPRIO_WHO_USER: if (!who) - user = current->user; + user = current->cred->user; else user = find_user(who); @@ -131,7 +131,7 @@ asmlinkage long sys_ioprio_set(int which, int who, int ioprio) break; do_each_thread(g, p) { - if (p->uid != who) + if (p->cred->uid != who) continue; ret = set_task_ioprio(p, ioprio); if (ret) @@ -216,7 +216,7 @@ asmlinkage long sys_ioprio_get(int which, int who) break; case IOPRIO_WHO_USER: if (!who) - user = current->user; + user = current->cred->user; else user = find_user(who); @@ -224,7 +224,7 @@ asmlinkage long sys_ioprio_get(int which, int who) break; do_each_thread(g, p) { - if (p->uid != user->uid) + if (p->cred->uid != user->uid) continue; tmpio = get_task_ioprio(p); if (tmpio < 0) diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c index 294992e9bf6..808fc03a6fb 100644 --- a/fs/nfsd/auth.c +++ b/fs/nfsd/auth.c @@ -27,6 +27,7 @@ int nfsexp_flags(struct svc_rqst *rqstp, struct svc_export *exp) int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) { + struct cred *act_as = current->cred ; struct svc_cred cred = rqstp->rq_cred; int i; int flags = nfsexp_flags(rqstp, exp); @@ -55,25 +56,26 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) get_group_info(cred.cr_group_info); if (cred.cr_uid != (uid_t) -1) - current->fsuid = cred.cr_uid; + act_as->fsuid = cred.cr_uid; else - current->fsuid = exp->ex_anon_uid; + act_as->fsuid = exp->ex_anon_uid; if (cred.cr_gid != (gid_t) -1) - current->fsgid = cred.cr_gid; + act_as->fsgid = cred.cr_gid; else - current->fsgid = exp->ex_anon_gid; + act_as->fsgid = exp->ex_anon_gid; if (!cred.cr_group_info) return -ENOMEM; - ret = set_current_groups(cred.cr_group_info); + ret = set_groups(act_as, cred.cr_group_info); put_group_info(cred.cr_group_info); if ((cred.cr_uid)) { - current->cap_effective = - cap_drop_nfsd_set(current->cap_effective); + act_as->cap_effective = + cap_drop_nfsd_set(act_as->cap_effective); } else { - current->cap_effective = - cap_raise_nfsd_set(current->cap_effective, - current->cap_permitted); + act_as->cap_effective = + cap_raise_nfsd_set(act_as->cap_effective, + act_as->cap_permitted); } return ret; } + diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index bb93946ace2..a5e14e8695e 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -57,17 +57,17 @@ static int rec_dir_init = 0; static void nfs4_save_user(uid_t *saveuid, gid_t *savegid) { - *saveuid = current->fsuid; - *savegid = current->fsgid; - current->fsuid = 0; - current->fsgid = 0; + *saveuid = current->cred->fsuid; + *savegid = current->cred->fsgid; + current->cred->fsuid = 0; + current->cred->fsgid = 0; } static void nfs4_reset_user(uid_t saveuid, gid_t savegid) { - current->fsuid = saveuid; - current->fsgid = savegid; + current->cred->fsuid = saveuid; + current->cred->fsgid = savegid; } static void diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index cd25d91895a..e67cfaea086 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -186,9 +186,9 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp) * access control settings being in effect, we cannot * fix that case easily. */ - current->cap_effective = - cap_raise_nfsd_set(current->cap_effective, - current->cap_permitted); + current->cred->cap_effective = + cap_raise_nfsd_set(current->cred->cap_effective, + current->cred->cap_permitted); } else { error = nfsd_setuser_and_check_port(rqstp, exp); if (error) diff --git a/fs/open.c b/fs/open.c index 500cc0c5476..b1238e195e7 100644 --- a/fs/open.c +++ b/fs/open.c @@ -425,6 +425,7 @@ out: */ asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode) { + struct cred *cred = current->cred; struct path path; struct inode *inode; int old_fsuid, old_fsgid; @@ -434,18 +435,18 @@ asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode) if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */ return -EINVAL; - old_fsuid = current->fsuid; - old_fsgid = current->fsgid; + old_fsuid = cred->fsuid; + old_fsgid = cred->fsgid; - current->fsuid = current->uid; - current->fsgid = current->gid; + cred->fsuid = cred->uid; + cred->fsgid = cred->gid; if (!issecure(SECURE_NO_SETUID_FIXUP)) { /* Clear the capabilities if we switch to a non-root user */ - if (current->uid) + if (current->cred->uid) old_cap = cap_set_effective(__cap_empty_set); else - old_cap = cap_set_effective(current->cap_permitted); + old_cap = cap_set_effective(cred->cap_permitted); } res = user_path_at(dfd, filename, LOOKUP_FOLLOW, &path); @@ -484,8 +485,8 @@ asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode) out_path_release: path_put(&path); out: - current->fsuid = old_fsuid; - current->fsgid = old_fsgid; + cred->fsuid = old_fsuid; + cred->fsgid = old_fsgid; if (!issecure(SECURE_NO_SETUID_FIXUP)) cap_set_effective(old_cap); diff --git a/fs/proc/array.c b/fs/proc/array.c index 6af7fba7abb..62fe9b2009b 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -182,8 +182,8 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, task_tgid_nr_ns(p, ns), pid_nr_ns(pid, ns), ppid, tpid, - p->uid, p->euid, p->suid, p->fsuid, - p->gid, p->egid, p->sgid, p->fsgid); + p->cred->uid, p->cred->euid, p->cred->suid, p->cred->fsuid, + p->cred->gid, p->cred->egid, p->cred->sgid, p->cred->fsgid); task_lock(p); if (p->files) @@ -194,7 +194,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, fdt ? fdt->max_fds : 0); rcu_read_unlock(); - group_info = p->group_info; + group_info = p->cred->group_info; get_group_info(group_info); task_unlock(p); @@ -262,7 +262,7 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p) blocked = p->blocked; collect_sigign_sigcatch(p, &ignored, &caught); num_threads = atomic_read(&p->signal->count); - qsize = atomic_read(&p->user->sigpending); + qsize = atomic_read(&p->cred->user->sigpending); qlim = p->signal->rlim[RLIMIT_SIGPENDING].rlim_cur; unlock_task_sighand(p, &flags); } @@ -293,10 +293,12 @@ static void render_cap_t(struct seq_file *m, const char *header, static inline void task_cap(struct seq_file *m, struct task_struct *p) { - render_cap_t(m, "CapInh:\t", &p->cap_inheritable); - render_cap_t(m, "CapPrm:\t", &p->cap_permitted); - render_cap_t(m, "CapEff:\t", &p->cap_effective); - render_cap_t(m, "CapBnd:\t", &p->cap_bset); + struct cred *cred = p->cred; + + render_cap_t(m, "CapInh:\t", &cred->cap_inheritable); + render_cap_t(m, "CapPrm:\t", &cred->cap_permitted); + render_cap_t(m, "CapEff:\t", &cred->cap_effective); + render_cap_t(m, "CapBnd:\t", &cred->cap_bset); } static inline void task_context_switch_counts(struct seq_file *m, diff --git a/fs/proc/base.c b/fs/proc/base.c index 486cf3fe713..6862b360c36 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1428,8 +1428,8 @@ static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_st inode->i_uid = 0; inode->i_gid = 0; if (task_dumpable(task)) { - inode->i_uid = task->euid; - inode->i_gid = task->egid; + inode->i_uid = task->cred->euid; + inode->i_gid = task->cred->egid; } security_task_to_inode(task, inode); @@ -1454,8 +1454,8 @@ static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat if (task) { if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) || task_dumpable(task)) { - stat->uid = task->euid; - stat->gid = task->egid; + stat->uid = task->cred->euid; + stat->gid = task->cred->egid; } } rcu_read_unlock(); @@ -1486,8 +1486,8 @@ static int pid_revalidate(struct dentry *dentry, struct nameidata *nd) if (task) { if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) || task_dumpable(task)) { - inode->i_uid = task->euid; - inode->i_gid = task->egid; + inode->i_uid = task->cred->euid; + inode->i_gid = task->cred->egid; } else { inode->i_uid = 0; inode->i_gid = 0; @@ -1658,8 +1658,8 @@ static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd) rcu_read_unlock(); put_files_struct(files); if (task_dumpable(task)) { - inode->i_uid = task->euid; - inode->i_gid = task->egid; + inode->i_uid = task->cred->euid; + inode->i_gid = task->cred->egid; } else { inode->i_uid = 0; inode->i_gid = 0; diff --git a/fs/xfs/linux-2.6/xfs_cred.h b/fs/xfs/linux-2.6/xfs_cred.h index 293043a5573..8c022cd0ad6 100644 --- a/fs/xfs/linux-2.6/xfs_cred.h +++ b/fs/xfs/linux-2.6/xfs_cred.h @@ -23,11 +23,9 @@ /* * Credentials */ -typedef struct cred { - /* EMPTY */ -} cred_t; +typedef const struct cred cred_t; -extern struct cred *sys_cred; +extern cred_t *sys_cred; /* this is a hack.. (assumes sys_cred is the only cred_t in the system) */ static inline int capable_cred(cred_t *cr, int cid) diff --git a/fs/xfs/linux-2.6/xfs_globals.h b/fs/xfs/linux-2.6/xfs_globals.h index 2770b0085ee..6eda8a3eb6f 100644 --- a/fs/xfs/linux-2.6/xfs_globals.h +++ b/fs/xfs/linux-2.6/xfs_globals.h @@ -19,6 +19,6 @@ #define __XFS_GLOBALS_H__ extern uint64_t xfs_panic_mask; /* set to cause more panics */ -extern struct cred *sys_cred; +extern cred_t *sys_cred; #endif /* __XFS_GLOBALS_H__ */ diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 1420c49674d..6be310d41da 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -497,7 +497,7 @@ int xfs_iread(struct xfs_mount *, struct xfs_trans *, xfs_ino_t, xfs_inode_t **, xfs_daddr_t, uint); int xfs_iread_extents(struct xfs_trans *, xfs_inode_t *, int); int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, mode_t, - xfs_nlink_t, xfs_dev_t, struct cred *, xfs_prid_t, + xfs_nlink_t, xfs_dev_t, cred_t *, xfs_prid_t, int, struct xfs_buf **, boolean_t *, xfs_inode_t **); void xfs_dinode_from_disk(struct xfs_icdinode *, struct xfs_dinode_core *); diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h index e932a96bec5..7b0c2ab8833 100644 --- a/fs/xfs/xfs_vnodeops.h +++ b/fs/xfs/xfs_vnodeops.h @@ -16,7 +16,7 @@ struct xfs_iomap; int xfs_open(struct xfs_inode *ip); int xfs_setattr(struct xfs_inode *ip, struct iattr *vap, int flags, - struct cred *credp); + cred_t *credp); #define XFS_ATTR_DMI 0x01 /* invocation from a DMI function */ #define XFS_ATTR_NONBLOCK 0x02 /* return EAGAIN if operation would block */ #define XFS_ATTR_NOLOCK 0x04 /* Don't grab any conflicting locks */ @@ -28,24 +28,24 @@ int xfs_inactive(struct xfs_inode *ip); int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name, struct xfs_inode **ipp, struct xfs_name *ci_name); int xfs_create(struct xfs_inode *dp, struct xfs_name *name, mode_t mode, - xfs_dev_t rdev, struct xfs_inode **ipp, struct cred *credp); + xfs_dev_t rdev, struct xfs_inode **ipp, cred_t *credp); int xfs_remove(struct xfs_inode *dp, struct xfs_name *name, struct xfs_inode *ip); int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip, struct xfs_name *target_name); int xfs_mkdir(struct xfs_inode *dp, struct xfs_name *dir_name, - mode_t mode, struct xfs_inode **ipp, struct cred *credp); + mode_t mode, struct xfs_inode **ipp, cred_t *credp); int xfs_readdir(struct xfs_inode *dp, void *dirent, size_t bufsize, xfs_off_t *offset, filldir_t filldir); int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name, const char *target_path, mode_t mode, struct xfs_inode **ipp, - struct cred *credp); + cred_t *credp); int xfs_inode_flush(struct xfs_inode *ip, int flags); int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state); int xfs_reclaim(struct xfs_inode *ip); int xfs_change_file_space(struct xfs_inode *ip, int cmd, xfs_flock64_t *bf, xfs_off_t offset, - struct cred *credp, int attr_flags); + cred_t *credp, int attr_flags); int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name, struct xfs_inode *src_ip, struct xfs_inode *target_dp, struct xfs_name *target_name, struct xfs_inode *target_ip); diff --git a/include/linux/cred.h b/include/linux/cred.h index b69222cc1fd..3e65587a72e 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -12,39 +12,150 @@ #ifndef _LINUX_CRED_H #define _LINUX_CRED_H -#define get_current_user() (get_uid(current->user)) - -#define task_uid(task) ((task)->uid) -#define task_gid(task) ((task)->gid) -#define task_euid(task) ((task)->euid) -#define task_egid(task) ((task)->egid) - -#define current_uid() (current->uid) -#define current_gid() (current->gid) -#define current_euid() (current->euid) -#define current_egid() (current->egid) -#define current_suid() (current->suid) -#define current_sgid() (current->sgid) -#define current_fsuid() (current->fsuid) -#define current_fsgid() (current->fsgid) -#define current_cap() (current->cap_effective) +#include +#include +#include + +struct user_struct; +struct cred; + +/* + * COW Supplementary groups list + */ +#define NGROUPS_SMALL 32 +#define NGROUPS_PER_BLOCK ((unsigned int)(PAGE_SIZE / sizeof(gid_t))) + +struct group_info { + atomic_t usage; + int ngroups; + int nblocks; + gid_t small_block[NGROUPS_SMALL]; + gid_t *blocks[0]; +}; + +/** + * get_group_info - Get a reference to a group info structure + * @group_info: The group info to reference + * + * This must be called with the owning task locked (via task_lock()) when task + * != current. The reason being that the vast majority of callers are looking + * at current->group_info, which can not be changed except by the current task. + * Changing current->group_info requires the task lock, too. + */ +#define get_group_info(group_info) \ +do { \ + atomic_inc(&(group_info)->usage); \ +} while (0) + +/** + * put_group_info - Release a reference to a group info structure + * @group_info: The group info to release + */ +#define put_group_info(group_info) \ +do { \ + if (atomic_dec_and_test(&(group_info)->usage)) \ + groups_free(group_info); \ +} while (0) + +extern struct group_info *groups_alloc(int); +extern void groups_free(struct group_info *); +extern int set_current_groups(struct group_info *); +extern int set_groups(struct cred *, struct group_info *); +extern int groups_search(struct group_info *, gid_t); + +/* access the groups "array" with this macro */ +#define GROUP_AT(gi, i) \ + ((gi)->blocks[(i) / NGROUPS_PER_BLOCK][(i) % NGROUPS_PER_BLOCK]) + +extern int in_group_p(gid_t); +extern int in_egroup_p(gid_t); + +/* + * The security context of a task + * + * The parts of the context break down into two categories: + * + * (1) The objective context of a task. These parts are used when some other + * task is attempting to affect this one. + * + * (2) The subjective context. These details are used when the task is acting + * upon another object, be that a file, a task, a key or whatever. + * + * Note that some members of this structure belong to both categories - the + * LSM security pointer for instance. + * + * A task has two security pointers. task->real_cred points to the objective + * context that defines that task's actual details. The objective part of this + * context is used whenever that task is acted upon. + * + * task->cred points to the subjective context that defines the details of how + * that task is going to act upon another object. This may be overridden + * temporarily to point to another security context, but normally points to the + * same context as task->real_cred. + */ +struct cred { + atomic_t usage; + uid_t uid; /* real UID of the task */ + gid_t gid; /* real GID of the task */ + uid_t suid; /* saved UID of the task */ + gid_t sgid; /* saved GID of the task */ + uid_t euid; /* effective UID of the task */ + gid_t egid; /* effective GID of the task */ + uid_t fsuid; /* UID for VFS ops */ + gid_t fsgid; /* GID for VFS ops */ + unsigned securebits; /* SUID-less security management */ + kernel_cap_t cap_inheritable; /* caps our children can inherit */ + kernel_cap_t cap_permitted; /* caps we're permitted */ + kernel_cap_t cap_effective; /* caps we can actually use */ + kernel_cap_t cap_bset; /* capability bounding set */ +#ifdef CONFIG_KEYS + unsigned char jit_keyring; /* default keyring to attach requested + * keys to */ + struct key *thread_keyring; /* keyring private to this thread */ + struct key *request_key_auth; /* assumed request_key authority */ +#endif +#ifdef CONFIG_SECURITY + void *security; /* subjective LSM security */ +#endif + struct user_struct *user; /* real user ID subscription */ + struct group_info *group_info; /* supplementary groups for euid/fsgid */ + struct rcu_head rcu; /* RCU deletion hook */ + spinlock_t lock; /* lock for pointer changes */ +}; + +#define get_current_user() (get_uid(current->cred->user)) + +#define task_uid(task) ((task)->cred->uid) +#define task_gid(task) ((task)->cred->gid) +#define task_euid(task) ((task)->cred->euid) +#define task_egid(task) ((task)->cred->egid) + +#define current_uid() (current->cred->uid) +#define current_gid() (current->cred->gid) +#define current_euid() (current->cred->euid) +#define current_egid() (current->cred->egid) +#define current_suid() (current->cred->suid) +#define current_sgid() (current->cred->sgid) +#define current_fsuid() (current->cred->fsuid) +#define current_fsgid() (current->cred->fsgid) +#define current_cap() (current->cred->cap_effective) #define current_uid_gid(_uid, _gid) \ do { \ - *(_uid) = current->uid; \ - *(_gid) = current->gid; \ + *(_uid) = current->cred->uid; \ + *(_gid) = current->cred->gid; \ } while(0) #define current_euid_egid(_uid, _gid) \ do { \ - *(_uid) = current->euid; \ - *(_gid) = current->egid; \ + *(_uid) = current->cred->euid; \ + *(_gid) = current->cred->egid; \ } while(0) #define current_fsuid_fsgid(_uid, _gid) \ do { \ - *(_uid) = current->fsuid; \ - *(_gid) = current->fsgid; \ + *(_uid) = current->cred->fsuid; \ + *(_gid) = current->cred->fsgid; \ } while(0) #endif /* _LINUX_CRED_H */ diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 23fd8909b9e..9de41ccd67b 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -113,6 +113,21 @@ extern struct group_info init_groups; # define CAP_INIT_BSET CAP_INIT_EFF_SET #endif +extern struct cred init_cred; + +#define INIT_CRED(p) \ +{ \ + .usage = ATOMIC_INIT(3), \ + .securebits = SECUREBITS_DEFAULT, \ + .cap_inheritable = CAP_INIT_INH_SET, \ + .cap_permitted = CAP_FULL_SET, \ + .cap_effective = CAP_INIT_EFF_SET, \ + .cap_bset = CAP_INIT_BSET, \ + .user = INIT_USER, \ + .group_info = &init_groups, \ + .lock = __SPIN_LOCK_UNLOCKED(p.lock), \ +} + /* * INIT_TASK is used to set up the first task table, touch at * your own risk!. Base=0, limit=0x1fffff (=2MB) @@ -147,13 +162,8 @@ extern struct group_info init_groups; .children = LIST_HEAD_INIT(tsk.children), \ .sibling = LIST_HEAD_INIT(tsk.sibling), \ .group_leader = &tsk, \ - .group_info = &init_groups, \ - .cap_effective = CAP_INIT_EFF_SET, \ - .cap_inheritable = CAP_INIT_INH_SET, \ - .cap_permitted = CAP_FULL_SET, \ - .cap_bset = CAP_INIT_BSET, \ - .securebits = SECUREBITS_DEFAULT, \ - .user = INIT_USER, \ + .__temp_cred = INIT_CRED(tsk.__temp_cred), \ + .cred = &tsk.__temp_cred, \ .comm = "swapper", \ .thread = INIT_THREAD, \ .fs = &init_fs, \ diff --git a/include/linux/sched.h b/include/linux/sched.h index b483f39a711..c8b92502354 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -660,6 +660,7 @@ extern struct user_struct *find_user(uid_t); extern struct user_struct root_user; #define INIT_USER (&root_user) + struct backing_dev_info; struct reclaim_state; @@ -883,38 +884,7 @@ partition_sched_domains(int ndoms_new, cpumask_t *doms_new, #endif /* !CONFIG_SMP */ struct io_context; /* See blkdev.h */ -#define NGROUPS_SMALL 32 -#define NGROUPS_PER_BLOCK ((unsigned int)(PAGE_SIZE / sizeof(gid_t))) -struct group_info { - int ngroups; - atomic_t usage; - gid_t small_block[NGROUPS_SMALL]; - int nblocks; - gid_t *blocks[0]; -}; - -/* - * get_group_info() must be called with the owning task locked (via task_lock()) - * when task != current. The reason being that the vast majority of callers are - * looking at current->group_info, which can not be changed except by the - * current task. Changing current->group_info requires the task lock, too. - */ -#define get_group_info(group_info) do { \ - atomic_inc(&(group_info)->usage); \ -} while (0) -#define put_group_info(group_info) do { \ - if (atomic_dec_and_test(&(group_info)->usage)) \ - groups_free(group_info); \ -} while (0) - -extern struct group_info *groups_alloc(int gidsetsize); -extern void groups_free(struct group_info *group_info); -extern int set_current_groups(struct group_info *group_info); -extern int groups_search(struct group_info *group_info, gid_t grp); -/* access the groups "array" with this macro */ -#define GROUP_AT(gi, i) \ - ((gi)->blocks[(i)/NGROUPS_PER_BLOCK][(i)%NGROUPS_PER_BLOCK]) #ifdef ARCH_HAS_PREFETCH_SWITCH_STACK extern void prefetch_stack(struct task_struct *t); @@ -1181,17 +1151,9 @@ struct task_struct { struct list_head cpu_timers[3]; /* process credentials */ - uid_t uid,euid,suid,fsuid; - gid_t gid,egid,sgid,fsgid; - struct group_info *group_info; - kernel_cap_t cap_effective, cap_inheritable, cap_permitted, cap_bset; - struct user_struct *user; - unsigned securebits; -#ifdef CONFIG_KEYS - unsigned char jit_keyring; /* default keyring to attach requested keys to */ - struct key *request_key_auth; /* assumed request_key authority */ - struct key *thread_keyring; /* keyring private to this thread */ -#endif + struct cred __temp_cred __deprecated; /* temporary credentials to be removed */ + struct cred *cred; /* actual/objective task credentials */ + char comm[TASK_COMM_LEN]; /* executable name excluding path - access with [gs]et_task_comm (which lock it with task_lock()) @@ -1228,9 +1190,6 @@ struct task_struct { int (*notifier)(void *priv); void *notifier_data; sigset_t *notifier_mask; -#ifdef CONFIG_SECURITY - void *security; -#endif struct audit_context *audit_context; #ifdef CONFIG_AUDITSYSCALL uid_t loginuid; @@ -1787,9 +1746,6 @@ extern void wake_up_new_task(struct task_struct *tsk, extern void sched_fork(struct task_struct *p, int clone_flags); extern void sched_dead(struct task_struct *p); -extern int in_group_p(gid_t); -extern int in_egroup_p(gid_t); - extern void proc_caches_init(void); extern void flush_signals(struct task_struct *); extern void ignore_signals(struct task_struct *); diff --git a/include/linux/securebits.h b/include/linux/securebits.h index 92f09bdf117..6d389491bfa 100644 --- a/include/linux/securebits.h +++ b/include/linux/securebits.h @@ -32,7 +32,7 @@ setting is locked or not. A setting which is locked cannot be changed from user-level. */ #define issecure_mask(X) (1 << (X)) -#define issecure(X) (issecure_mask(X) & current->securebits) +#define issecure(X) (issecure_mask(X) & current->cred->securebits) #define SECURE_ALL_BITS (issecure_mask(SECURE_NOROOT) | \ issecure_mask(SECURE_NO_SETUID_FIXUP) | \ diff --git a/ipc/mqueue.c b/ipc/mqueue.c index abda5991d7e..e1885b494ba 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -126,7 +126,7 @@ static struct inode *mqueue_get_inode(struct super_block *sb, int mode, if (S_ISREG(mode)) { struct mqueue_inode_info *info; struct task_struct *p = current; - struct user_struct *u = p->user; + struct user_struct *u = p->cred->user; unsigned long mq_bytes, mq_msg_tblsz; inode->i_fop = &mqueue_file_operations; diff --git a/ipc/shm.c b/ipc/shm.c index 0c3debbe32d..264a9d33c5d 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -366,7 +366,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) if (shmflg & SHM_HUGETLB) { /* hugetlb_file_setup takes care of mlock user accounting */ file = hugetlb_file_setup(name, size); - shp->mlock_user = current->user; + shp->mlock_user = current->cred->user; } else { int acctflag = VM_ACCOUNT; /* @@ -767,7 +767,7 @@ asmlinkage long sys_shmctl(int shmid, int cmd, struct shmid_ds __user *buf) goto out_unlock; if(cmd==SHM_LOCK) { - struct user_struct * user = current->user; + struct user_struct *user = current->cred->user; if (!is_file_hugepages(shp->shm_file)) { err = shmem_lock(shp->shm_file, 1, user); if (!err && !(shp->shm_perm.mode & SHM_LOCKED)){ diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 9c7e47ae457..2febf5165fa 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -447,6 +447,7 @@ static int audit_filter_rules(struct task_struct *tsk, struct audit_names *name, enum audit_state *state) { + struct cred *cred = tsk->cred; int i, j, need_sid = 1; u32 sid; @@ -466,28 +467,28 @@ static int audit_filter_rules(struct task_struct *tsk, } break; case AUDIT_UID: - result = audit_comparator(tsk->uid, f->op, f->val); + result = audit_comparator(cred->uid, f->op, f->val); break; case AUDIT_EUID: - result = audit_comparator(tsk->euid, f->op, f->val); + result = audit_comparator(cred->euid, f->op, f->val); break; case AUDIT_SUID: - result = audit_comparator(tsk->suid, f->op, f->val); + result = audit_comparator(cred->suid, f->op, f->val); break; case AUDIT_FSUID: - result = audit_comparator(tsk->fsuid, f->op, f->val); + result = audit_comparator(cred->fsuid, f->op, f->val); break; case AUDIT_GID: - result = audit_comparator(tsk->gid, f->op, f->val); + result = audit_comparator(cred->gid, f->op, f->val); break; case AUDIT_EGID: - result = audit_comparator(tsk->egid, f->op, f->val); + result = audit_comparator(cred->egid, f->op, f->val); break; case AUDIT_SGID: - result = audit_comparator(tsk->sgid, f->op, f->val); + result = audit_comparator(cred->sgid, f->op, f->val); break; case AUDIT_FSGID: - result = audit_comparator(tsk->fsgid, f->op, f->val); + result = audit_comparator(cred->fsgid, f->op, f->val); break; case AUDIT_PERS: result = audit_comparator(tsk->personality, f->op, f->val); @@ -1228,6 +1229,7 @@ static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name) static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) { + struct cred *cred = tsk->cred; int i, call_panic = 0; struct audit_buffer *ab; struct audit_aux_data *aux; @@ -1237,14 +1239,14 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts context->pid = tsk->pid; if (!context->ppid) context->ppid = sys_getppid(); - context->uid = tsk->uid; - context->gid = tsk->gid; - context->euid = tsk->euid; - context->suid = tsk->suid; - context->fsuid = tsk->fsuid; - context->egid = tsk->egid; - context->sgid = tsk->sgid; - context->fsgid = tsk->fsgid; + context->uid = cred->uid; + context->gid = cred->gid; + context->euid = cred->euid; + context->suid = cred->suid; + context->fsuid = cred->fsuid; + context->egid = cred->egid; + context->sgid = cred->sgid; + context->fsgid = cred->fsgid; context->personality = tsk->personality; ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL); @@ -2086,7 +2088,7 @@ int audit_set_loginuid(struct task_struct *task, uid_t loginuid) audit_log_format(ab, "login pid=%d uid=%u " "old auid=%u new auid=%u" " old ses=%u new ses=%u", - task->pid, task->uid, + task->pid, task->cred->uid, task->loginuid, loginuid, task->sessionid, sessionid); audit_log_end(ab); @@ -2469,7 +2471,7 @@ void __audit_ptrace(struct task_struct *t) context->target_pid = t->pid; context->target_auid = audit_get_loginuid(t); - context->target_uid = t->uid; + context->target_uid = t->cred->uid; context->target_sessionid = audit_get_sessionid(t); security_task_getsecid(t, &context->target_sid); memcpy(context->target_comm, t->comm, TASK_COMM_LEN); @@ -2495,7 +2497,7 @@ int __audit_signal_info(int sig, struct task_struct *t) if (tsk->loginuid != -1) audit_sig_uid = tsk->loginuid; else - audit_sig_uid = tsk->uid; + audit_sig_uid = tsk->cred->uid; security_task_getsecid(tsk, &audit_sig_sid); } if (!audit_signals || audit_dummy_context()) @@ -2507,7 +2509,7 @@ int __audit_signal_info(int sig, struct task_struct *t) if (!ctx->target_pid) { ctx->target_pid = t->tgid; ctx->target_auid = audit_get_loginuid(t); - ctx->target_uid = t->uid; + ctx->target_uid = t->cred->uid; ctx->target_sessionid = audit_get_sessionid(t); security_task_getsecid(t, &ctx->target_sid); memcpy(ctx->target_comm, t->comm, TASK_COMM_LEN); @@ -2528,7 +2530,7 @@ int __audit_signal_info(int sig, struct task_struct *t) axp->target_pid[axp->pid_count] = t->tgid; axp->target_auid[axp->pid_count] = audit_get_loginuid(t); - axp->target_uid[axp->pid_count] = t->uid; + axp->target_uid[axp->pid_count] = t->cred->uid; axp->target_sessionid[axp->pid_count] = audit_get_sessionid(t); security_task_getsecid(t, &axp->target_sid[axp->pid_count]); memcpy(axp->target_comm[axp->pid_count], t->comm, TASK_COMM_LEN); @@ -2575,12 +2577,12 @@ void __audit_log_bprm_fcaps(struct linux_binprm *bprm, kernel_cap_t *pP, kernel_ ax->fcap_ver = (vcaps.magic_etc & VFS_CAP_REVISION_MASK) >> VFS_CAP_REVISION_SHIFT; ax->old_pcap.permitted = *pP; - ax->old_pcap.inheritable = current->cap_inheritable; + ax->old_pcap.inheritable = current->cred->cap_inheritable; ax->old_pcap.effective = *pE; - ax->new_pcap.permitted = current->cap_permitted; - ax->new_pcap.inheritable = current->cap_inheritable; - ax->new_pcap.effective = current->cap_effective; + ax->new_pcap.permitted = current->cred->cap_permitted; + ax->new_pcap.inheritable = current->cred->cap_inheritable; + ax->new_pcap.effective = current->cred->cap_effective; } /** diff --git a/kernel/capability.c b/kernel/capability.c index 58b00519624..a404b980b1b 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -171,8 +171,8 @@ kernel_cap_t cap_set_effective(const kernel_cap_t pE_new) spin_lock(&task_capability_lock); - pE_old = current->cap_effective; - current->cap_effective = pE_new; + pE_old = current->cred->cap_effective; + current->cred->cap_effective = pE_new; spin_unlock(&task_capability_lock); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 78f9b310c4f..e210526e640 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1293,7 +1293,9 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid) rcu_read_unlock(); euid = current_euid(); - if (euid && euid != tsk->uid && euid != tsk->suid) { + if (euid && + euid != tsk->cred->uid && + euid != tsk->cred->suid) { put_task_struct(tsk); return -EACCES; } diff --git a/kernel/exit.c b/kernel/exit.c index 80137a5d946..e0f6e1892fb 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -160,7 +160,7 @@ void release_task(struct task_struct * p) int zap_leader; repeat: tracehook_prepare_release_task(p); - atomic_dec(&p->user->processes); + atomic_dec(&p->cred->user->processes); proc_flush_task(p); write_lock_irq(&tasklist_lock); tracehook_finish_release_task(p); @@ -1272,7 +1272,7 @@ static int wait_task_zombie(struct task_struct *p, int options, return 0; if (unlikely(options & WNOWAIT)) { - uid_t uid = p->uid; + uid_t uid = p->cred->uid; int exit_code = p->exit_code; int why, status; @@ -1393,7 +1393,7 @@ static int wait_task_zombie(struct task_struct *p, int options, if (!retval && infop) retval = put_user(pid, &infop->si_pid); if (!retval && infop) - retval = put_user(p->uid, &infop->si_uid); + retval = put_user(p->cred->uid, &infop->si_uid); if (!retval) retval = pid; @@ -1458,7 +1458,7 @@ static int wait_task_stopped(int ptrace, struct task_struct *p, if (!unlikely(options & WNOWAIT)) p->exit_code = 0; - uid = p->uid; + uid = p->cred->uid; unlock_sig: spin_unlock_irq(&p->sighand->siglock); if (!exit_code) @@ -1535,7 +1535,7 @@ static int wait_task_continued(struct task_struct *p, int options, spin_unlock_irq(&p->sighand->siglock); pid = task_pid_vnr(p); - uid = p->uid; + uid = p->cred->uid; get_task_struct(p); read_unlock(&tasklist_lock); diff --git a/kernel/fork.c b/kernel/fork.c index f6083561dfe..81fdc773390 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -147,8 +147,8 @@ void __put_task_struct(struct task_struct *tsk) WARN_ON(tsk == current); security_task_free(tsk); - free_uid(tsk->user); - put_group_info(tsk->group_info); + free_uid(tsk->__temp_cred.user); + put_group_info(tsk->__temp_cred.group_info); delayacct_tsk_free(tsk); if (!profile_handoff_task(tsk)) @@ -969,17 +969,18 @@ static struct task_struct *copy_process(unsigned long clone_flags, DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); #endif + p->cred = &p->__temp_cred; retval = -EAGAIN; - if (atomic_read(&p->user->processes) >= + if (atomic_read(&p->cred->user->processes) >= p->signal->rlim[RLIMIT_NPROC].rlim_cur) { if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && - p->user != current->nsproxy->user_ns->root_user) + p->cred->user != current->nsproxy->user_ns->root_user) goto bad_fork_free; } - atomic_inc(&p->user->__count); - atomic_inc(&p->user->processes); - get_group_info(p->group_info); + atomic_inc(&p->cred->user->__count); + atomic_inc(&p->cred->user->processes); + get_group_info(p->cred->group_info); /* * If multiple threads are within copy_process(), then this check @@ -1035,9 +1036,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->real_start_time = p->start_time; monotonic_to_bootbased(&p->real_start_time); #ifdef CONFIG_SECURITY - p->security = NULL; + p->cred->security = NULL; #endif - p->cap_bset = current->cap_bset; p->io_context = NULL; p->audit_context = NULL; cgroup_fork(p); @@ -1298,9 +1298,9 @@ bad_fork_cleanup_cgroup: bad_fork_cleanup_put_domain: module_put(task_thread_info(p)->exec_domain->module); bad_fork_cleanup_count: - put_group_info(p->group_info); - atomic_dec(&p->user->processes); - free_uid(p->user); + put_group_info(p->cred->group_info); + atomic_dec(&p->cred->user->processes); + free_uid(p->cred->user); bad_fork_free: free_task(p); fork_out: diff --git a/kernel/futex.c b/kernel/futex.c index e06962132aa..28421d8210b 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -443,7 +443,8 @@ static struct task_struct * futex_find_get_task(pid_t pid) rcu_read_lock(); p = find_task_by_vpid(pid); - if (!p || (euid != p->euid && euid != p->uid)) + if (!p || (euid != p->cred->euid && + euid != p->cred->uid)) p = ERR_PTR(-ESRCH); else get_task_struct(p); @@ -1846,7 +1847,8 @@ sys_get_robust_list(int pid, struct robust_list_head __user * __user *head_ptr, if (!p) goto err_unlock; ret = -EPERM; - if (euid != p->euid && euid != p->uid && + if (euid != p->cred->euid && + euid != p->cred->uid && !capable(CAP_SYS_PTRACE)) goto err_unlock; head = p->robust_list; diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 3254d4e41e8..2c3fd5ed34f 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -151,8 +151,9 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, if (!p) goto err_unlock; ret = -EPERM; - if (euid != p->euid && euid != p->uid && - !capable(CAP_SYS_PTRACE)) + if (euid != p->cred->euid && + euid != p->cred->uid && + !capable(CAP_SYS_PTRACE)) goto err_unlock; head = p->compat_robust_list; read_unlock(&tasklist_lock); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 937f6b5b200..49849d12dd1 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -115,6 +115,8 @@ int ptrace_check_attach(struct task_struct *child, int kill) int __ptrace_may_access(struct task_struct *task, unsigned int mode) { + struct cred *cred = current->cred, *tcred = task->cred; + /* May we inspect the given task? * This check is used both for attaching with ptrace * and for allowing access to sensitive information in /proc. @@ -123,19 +125,18 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode) * because setting up the necessary parent/child relationship * or halting the specified task is impossible. */ - uid_t uid; - gid_t gid; + uid_t uid = cred->uid; + gid_t gid = cred->gid; int dumpable = 0; /* Don't let security modules deny introspection */ if (task == current) return 0; - current_uid_gid(&uid, &gid); - if ((uid != task->euid || - uid != task->suid || - uid != task->uid || - gid != task->egid || - gid != task->sgid || - gid != task->gid) && !capable(CAP_SYS_PTRACE)) + if ((uid != tcred->euid || + uid != tcred->suid || + uid != tcred->uid || + gid != tcred->egid || + gid != tcred->sgid || + gid != tcred->gid) && !capable(CAP_SYS_PTRACE)) return -EPERM; smp_rmb(); if (task->mm) diff --git a/kernel/sched.c b/kernel/sched.c index c3b8b1fcde0..733c59e645a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -345,7 +345,7 @@ static inline struct task_group *task_group(struct task_struct *p) struct task_group *tg; #ifdef CONFIG_USER_SCHED - tg = p->user->tg; + tg = p->cred->user->tg; #elif defined(CONFIG_CGROUP_SCHED) tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), struct task_group, css); @@ -5182,8 +5182,8 @@ recheck: /* can't change other user's priorities */ euid = current_euid(); - if (euid != p->euid && - euid != p->uid) + if (euid != p->cred->euid && + euid != p->cred->uid) return -EPERM; } @@ -5417,7 +5417,9 @@ long sched_setaffinity(pid_t pid, const cpumask_t *in_mask) euid = current_euid(); retval = -EPERM; - if (euid != p->euid && euid != p->uid && !capable(CAP_SYS_NICE)) + if (euid != p->cred->euid && + euid != p->cred->uid && + !capable(CAP_SYS_NICE)) goto out_unlock; retval = security_task_setscheduler(p, 0, NULL); diff --git a/kernel/signal.c b/kernel/signal.c index 167b535fe1a..80e8a6489f9 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -187,7 +187,7 @@ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags, * In order to avoid problems with "switch_user()", we want to make * sure that the compiler doesn't re-load "t->user" */ - user = t->user; + user = t->cred->user; barrier(); atomic_inc(&user->sigpending); if (override_rlimit || @@ -582,8 +582,8 @@ static int check_kill_permission(int sig, struct siginfo *info, uid = current_uid(); euid = current_euid(); - if ((euid ^ t->suid) && (euid ^ t->uid) && - (uid ^ t->suid) && (uid ^ t->uid) && + if ((euid ^ t->cred->suid) && (euid ^ t->cred->uid) && + (uid ^ t->cred->suid) && (uid ^ t->cred->uid) && !capable(CAP_KILL)) { switch (sig) { case SIGCONT: @@ -1100,8 +1100,8 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid, goto out_unlock; } if ((info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info))) - && (euid != p->suid) && (euid != p->uid) - && (uid != p->suid) && (uid != p->uid)) { + && (euid != p->cred->suid) && (euid != p->cred->uid) + && (uid != p->cred->suid) && (uid != p->cred->uid)) { ret = -EPERM; goto out_unlock; } @@ -1374,7 +1374,7 @@ int do_notify_parent(struct task_struct *tsk, int sig) info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns); rcu_read_unlock(); - info.si_uid = tsk->uid; + info.si_uid = tsk->cred->uid; thread_group_cputime(tsk, &cputime); info.si_utime = cputime_to_jiffies(cputime.utime); @@ -1445,7 +1445,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why) info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns); rcu_read_unlock(); - info.si_uid = tsk->uid; + info.si_uid = tsk->cred->uid; info.si_utime = cputime_to_clock_t(tsk->utime); info.si_stime = cputime_to_clock_t(tsk->stime); @@ -1713,7 +1713,7 @@ static int ptrace_signal(int signr, siginfo_t *info, info->si_errno = 0; info->si_code = SI_USER; info->si_pid = task_pid_vnr(current->parent); - info->si_uid = current->parent->uid; + info->si_uid = current->parent->cred->uid; } /* If the (new) signal is now blocked, requeue it. */ diff --git a/kernel/sys.c b/kernel/sys.c index ed5c29c748a..5d81f07c015 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -117,7 +117,9 @@ static int set_one_prio(struct task_struct *p, int niceval, int error) uid_t euid = current_euid(); int no_nice; - if (p->uid != euid && p->euid != euid && !capable(CAP_SYS_NICE)) { + if (p->cred->uid != euid && + p->cred->euid != euid && + !capable(CAP_SYS_NICE)) { error = -EPERM; goto out; } @@ -174,7 +176,7 @@ asmlinkage long sys_setpriority(int which, int who, int niceval) } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); break; case PRIO_USER: - user = current->user; + user = current->cred->user; if (!who) who = current_uid(); else @@ -182,7 +184,7 @@ asmlinkage long sys_setpriority(int which, int who, int niceval) goto out_unlock; /* No processes for this user */ do_each_thread(g, p) - if (p->uid == who) + if (p->cred->uid == who) error = set_one_prio(p, niceval, error); while_each_thread(g, p); if (who != current_uid()) @@ -236,7 +238,7 @@ asmlinkage long sys_getpriority(int which, int who) } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); break; case PRIO_USER: - user = current->user; + user = current->cred->user; if (!who) who = current_uid(); else @@ -244,7 +246,7 @@ asmlinkage long sys_getpriority(int which, int who) goto out_unlock; /* No processes for this user */ do_each_thread(g, p) - if (p->uid == who) { + if (p->cred->uid == who) { niceval = 20 - task_nice(p); if (niceval > retval) retval = niceval; @@ -472,8 +474,9 @@ void ctrl_alt_del(void) */ asmlinkage long sys_setregid(gid_t rgid, gid_t egid) { - int old_rgid = current->gid; - int old_egid = current->egid; + struct cred *cred = current->cred; + int old_rgid = cred->gid; + int old_egid = cred->egid; int new_rgid = old_rgid; int new_egid = old_egid; int retval; @@ -484,7 +487,7 @@ asmlinkage long sys_setregid(gid_t rgid, gid_t egid) if (rgid != (gid_t) -1) { if ((old_rgid == rgid) || - (current->egid==rgid) || + (cred->egid == rgid) || capable(CAP_SETGID)) new_rgid = rgid; else @@ -492,8 +495,8 @@ asmlinkage long sys_setregid(gid_t rgid, gid_t egid) } if (egid != (gid_t) -1) { if ((old_rgid == egid) || - (current->egid == egid) || - (current->sgid == egid) || + (cred->egid == egid) || + (cred->sgid == egid) || capable(CAP_SETGID)) new_egid = egid; else @@ -505,10 +508,10 @@ asmlinkage long sys_setregid(gid_t rgid, gid_t egid) } if (rgid != (gid_t) -1 || (egid != (gid_t) -1 && egid != old_rgid)) - current->sgid = new_egid; - current->fsgid = new_egid; - current->egid = new_egid; - current->gid = new_rgid; + cred->sgid = new_egid; + cred->fsgid = new_egid; + cred->egid = new_egid; + cred->gid = new_rgid; key_fsgid_changed(current); proc_id_connector(current, PROC_EVENT_GID); return 0; @@ -521,7 +524,8 @@ asmlinkage long sys_setregid(gid_t rgid, gid_t egid) */ asmlinkage long sys_setgid(gid_t gid) { - int old_egid = current->egid; + struct cred *cred = current->cred; + int old_egid = cred->egid; int retval; retval = security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_ID); @@ -533,13 +537,13 @@ asmlinkage long sys_setgid(gid_t gid) set_dumpable(current->mm, suid_dumpable); smp_wmb(); } - current->gid = current->egid = current->sgid = current->fsgid = gid; - } else if ((gid == current->gid) || (gid == current->sgid)) { + cred->gid = cred->egid = cred->sgid = cred->fsgid = gid; + } else if ((gid == cred->gid) || (gid == cred->sgid)) { if (old_egid != gid) { set_dumpable(current->mm, suid_dumpable); smp_wmb(); } - current->egid = current->fsgid = gid; + cred->egid = cred->fsgid = gid; } else return -EPERM; @@ -570,7 +574,7 @@ static int set_user(uid_t new_ruid, int dumpclear) set_dumpable(current->mm, suid_dumpable); smp_wmb(); } - current->uid = new_ruid; + current->cred->uid = new_ruid; return 0; } @@ -591,6 +595,7 @@ static int set_user(uid_t new_ruid, int dumpclear) */ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid) { + struct cred *cred = current->cred; int old_ruid, old_euid, old_suid, new_ruid, new_euid; int retval; @@ -598,14 +603,14 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid) if (retval) return retval; - new_ruid = old_ruid = current->uid; - new_euid = old_euid = current->euid; - old_suid = current->suid; + new_ruid = old_ruid = cred->uid; + new_euid = old_euid = cred->euid; + old_suid = cred->suid; if (ruid != (uid_t) -1) { new_ruid = ruid; if ((old_ruid != ruid) && - (current->euid != ruid) && + (cred->euid != ruid) && !capable(CAP_SETUID)) return -EPERM; } @@ -613,8 +618,8 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid) if (euid != (uid_t) -1) { new_euid = euid; if ((old_ruid != euid) && - (current->euid != euid) && - (current->suid != euid) && + (cred->euid != euid) && + (cred->suid != euid) && !capable(CAP_SETUID)) return -EPERM; } @@ -626,11 +631,11 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid) set_dumpable(current->mm, suid_dumpable); smp_wmb(); } - current->fsuid = current->euid = new_euid; + cred->fsuid = cred->euid = new_euid; if (ruid != (uid_t) -1 || (euid != (uid_t) -1 && euid != old_ruid)) - current->suid = current->euid; - current->fsuid = current->euid; + cred->suid = cred->euid; + cred->fsuid = cred->euid; key_fsuid_changed(current); proc_id_connector(current, PROC_EVENT_UID); @@ -653,7 +658,8 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid) */ asmlinkage long sys_setuid(uid_t uid) { - int old_euid = current->euid; + struct cred *cred = current->cred; + int old_euid = cred->euid; int old_ruid, old_suid, new_suid; int retval; @@ -661,23 +667,23 @@ asmlinkage long sys_setuid(uid_t uid) if (retval) return retval; - old_ruid = current->uid; - old_suid = current->suid; + old_ruid = cred->uid; + old_suid = cred->suid; new_suid = old_suid; if (capable(CAP_SETUID)) { if (uid != old_ruid && set_user(uid, old_euid != uid) < 0) return -EAGAIN; new_suid = uid; - } else if ((uid != current->uid) && (uid != new_suid)) + } else if ((uid != cred->uid) && (uid != new_suid)) return -EPERM; if (old_euid != uid) { set_dumpable(current->mm, suid_dumpable); smp_wmb(); } - current->fsuid = current->euid = uid; - current->suid = new_suid; + cred->fsuid = cred->euid = uid; + cred->suid = new_suid; key_fsuid_changed(current); proc_id_connector(current, PROC_EVENT_UID); @@ -692,9 +698,10 @@ asmlinkage long sys_setuid(uid_t uid) */ asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) { - int old_ruid = current->uid; - int old_euid = current->euid; - int old_suid = current->suid; + struct cred *cred = current->cred; + int old_ruid = cred->uid; + int old_euid = cred->euid; + int old_suid = cred->suid; int retval; retval = security_task_setuid(ruid, euid, suid, LSM_SETID_RES); @@ -702,30 +709,31 @@ asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) return retval; if (!capable(CAP_SETUID)) { - if ((ruid != (uid_t) -1) && (ruid != current->uid) && - (ruid != current->euid) && (ruid != current->suid)) + if ((ruid != (uid_t) -1) && (ruid != cred->uid) && + (ruid != cred->euid) && (ruid != cred->suid)) return -EPERM; - if ((euid != (uid_t) -1) && (euid != current->uid) && - (euid != current->euid) && (euid != current->suid)) + if ((euid != (uid_t) -1) && (euid != cred->uid) && + (euid != cred->euid) && (euid != cred->suid)) return -EPERM; - if ((suid != (uid_t) -1) && (suid != current->uid) && - (suid != current->euid) && (suid != current->suid)) + if ((suid != (uid_t) -1) && (suid != cred->uid) && + (suid != cred->euid) && (suid != cred->suid)) return -EPERM; } if (ruid != (uid_t) -1) { - if (ruid != current->uid && set_user(ruid, euid != current->euid) < 0) + if (ruid != cred->uid && + set_user(ruid, euid != cred->euid) < 0) return -EAGAIN; } if (euid != (uid_t) -1) { - if (euid != current->euid) { + if (euid != cred->euid) { set_dumpable(current->mm, suid_dumpable); smp_wmb(); } - current->euid = euid; + cred->euid = euid; } - current->fsuid = current->euid; + cred->fsuid = cred->euid; if (suid != (uid_t) -1) - current->suid = suid; + cred->suid = suid; key_fsuid_changed(current); proc_id_connector(current, PROC_EVENT_UID); @@ -735,11 +743,12 @@ asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) asmlinkage long sys_getresuid(uid_t __user *ruid, uid_t __user *euid, uid_t __user *suid) { + struct cred *cred = current->cred; int retval; - if (!(retval = put_user(current->uid, ruid)) && - !(retval = put_user(current->euid, euid))) - retval = put_user(current->suid, suid); + if (!(retval = put_user(cred->uid, ruid)) && + !(retval = put_user(cred->euid, euid))) + retval = put_user(cred->suid, suid); return retval; } @@ -749,6 +758,7 @@ asmlinkage long sys_getresuid(uid_t __user *ruid, uid_t __user *euid, uid_t __us */ asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid) { + struct cred *cred = current->cred; int retval; retval = security_task_setgid(rgid, egid, sgid, LSM_SETID_RES); @@ -756,28 +766,28 @@ asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid) return retval; if (!capable(CAP_SETGID)) { - if ((rgid != (gid_t) -1) && (rgid != current->gid) && - (rgid != current->egid) && (rgid != current->sgid)) + if ((rgid != (gid_t) -1) && (rgid != cred->gid) && + (rgid != cred->egid) && (rgid != cred->sgid)) return -EPERM; - if ((egid != (gid_t) -1) && (egid != current->gid) && - (egid != current->egid) && (egid != current->sgid)) + if ((egid != (gid_t) -1) && (egid != cred->gid) && + (egid != cred->egid) && (egid != cred->sgid)) return -EPERM; - if ((sgid != (gid_t) -1) && (sgid != current->gid) && - (sgid != current->egid) && (sgid != current->sgid)) + if ((sgid != (gid_t) -1) && (sgid != cred->gid) && + (sgid != cred->egid) && (sgid != cred->sgid)) return -EPERM; } if (egid != (gid_t) -1) { - if (egid != current->egid) { + if (egid != cred->egid) { set_dumpable(current->mm, suid_dumpable); smp_wmb(); } - current->egid = egid; + cred->egid = egid; } - current->fsgid = current->egid; + cred->fsgid = cred->egid; if (rgid != (gid_t) -1) - current->gid = rgid; + cred->gid = rgid; if (sgid != (gid_t) -1) - current->sgid = sgid; + cred->sgid = sgid; key_fsgid_changed(current); proc_id_connector(current, PROC_EVENT_GID); @@ -786,11 +796,12 @@ asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid) asmlinkage long sys_getresgid(gid_t __user *rgid, gid_t __user *egid, gid_t __user *sgid) { + struct cred *cred = current->cred; int retval; - if (!(retval = put_user(current->gid, rgid)) && - !(retval = put_user(current->egid, egid))) - retval = put_user(current->sgid, sgid); + if (!(retval = put_user(cred->gid, rgid)) && + !(retval = put_user(cred->egid, egid))) + retval = put_user(cred->sgid, sgid); return retval; } @@ -804,20 +815,21 @@ asmlinkage long sys_getresgid(gid_t __user *rgid, gid_t __user *egid, gid_t __us */ asmlinkage long sys_setfsuid(uid_t uid) { + struct cred *cred = current->cred; int old_fsuid; - old_fsuid = current->fsuid; + old_fsuid = cred->fsuid; if (security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS)) return old_fsuid; - if (uid == current->uid || uid == current->euid || - uid == current->suid || uid == current->fsuid || + if (uid == cred->uid || uid == cred->euid || + uid == cred->suid || uid == cred->fsuid || capable(CAP_SETUID)) { if (uid != old_fsuid) { set_dumpable(current->mm, suid_dumpable); smp_wmb(); } - current->fsuid = uid; + cred->fsuid = uid; } key_fsuid_changed(current); @@ -833,20 +845,21 @@ asmlinkage long sys_setfsuid(uid_t uid) */ asmlinkage long sys_setfsgid(gid_t gid) { + struct cred *cred = current->cred; int old_fsgid; - old_fsgid = current->fsgid; + old_fsgid = cred->fsgid; if (security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_FS)) return old_fsgid; - if (gid == current->gid || gid == current->egid || - gid == current->sgid || gid == current->fsgid || + if (gid == cred->gid || gid == cred->egid || + gid == cred->sgid || gid == cred->fsgid || capable(CAP_SETGID)) { if (gid != old_fsgid) { set_dumpable(current->mm, suid_dumpable); smp_wmb(); } - current->fsgid = gid; + cred->fsgid = gid; key_fsgid_changed(current); proc_id_connector(current, PROC_EVENT_GID); } @@ -1208,8 +1221,15 @@ int groups_search(struct group_info *group_info, gid_t grp) return 0; } -/* validate and set current->group_info */ -int set_current_groups(struct group_info *group_info) +/** + * set_groups - Change a group subscription in a security record + * @sec: The security record to alter + * @group_info: The group list to impose + * + * Validate a group subscription and, if valid, impose it upon a task security + * record. + */ +int set_groups(struct cred *cred, struct group_info *group_info) { int retval; struct group_info *old_info; @@ -1221,20 +1241,34 @@ int set_current_groups(struct group_info *group_info) groups_sort(group_info); get_group_info(group_info); - task_lock(current); - old_info = current->group_info; - current->group_info = group_info; - task_unlock(current); + spin_lock(&cred->lock); + old_info = cred->group_info; + cred->group_info = group_info; + spin_unlock(&cred->lock); put_group_info(old_info); - return 0; } +EXPORT_SYMBOL(set_groups); + +/** + * set_current_groups - Change current's group subscription + * @group_info: The group list to impose + * + * Validate a group subscription and, if valid, impose it upon current's task + * security record. + */ +int set_current_groups(struct group_info *group_info) +{ + return set_groups(current->cred, group_info); +} + EXPORT_SYMBOL(set_current_groups); asmlinkage long sys_getgroups(int gidsetsize, gid_t __user *grouplist) { + struct cred *cred = current->cred; int i = 0; /* @@ -1246,13 +1280,13 @@ asmlinkage long sys_getgroups(int gidsetsize, gid_t __user *grouplist) return -EINVAL; /* no need to grab task_lock here; it cannot change */ - i = current->group_info->ngroups; + i = cred->group_info->ngroups; if (gidsetsize) { if (i > gidsetsize) { i = -EINVAL; goto out; } - if (groups_to_user(grouplist, current->group_info)) { + if (groups_to_user(grouplist, cred->group_info)) { i = -EFAULT; goto out; } @@ -1296,9 +1330,10 @@ asmlinkage long sys_setgroups(int gidsetsize, gid_t __user *grouplist) */ int in_group_p(gid_t grp) { + struct cred *cred = current->cred; int retval = 1; - if (grp != current->fsgid) - retval = groups_search(current->group_info, grp); + if (grp != cred->fsgid) + retval = groups_search(cred->group_info, grp); return retval; } @@ -1306,9 +1341,10 @@ EXPORT_SYMBOL(in_group_p); int in_egroup_p(gid_t grp) { + struct cred *cred = current->cred; int retval = 1; - if (grp != current->egid) - retval = groups_search(current->group_info, grp); + if (grp != cred->egid) + retval = groups_search(cred->group_info, grp); return retval; } @@ -1624,7 +1660,9 @@ asmlinkage long sys_umask(int mask) asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) { - long error = 0; + struct task_struct *me = current; + unsigned char comm[sizeof(me->comm)]; + long error; if (security_task_prctl(option, arg2, arg3, arg4, arg5, &error)) return error; @@ -1635,39 +1673,41 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, error = -EINVAL; break; } - current->pdeath_signal = arg2; + me->pdeath_signal = arg2; + error = 0; break; case PR_GET_PDEATHSIG: - error = put_user(current->pdeath_signal, (int __user *)arg2); + error = put_user(me->pdeath_signal, (int __user *)arg2); break; case PR_GET_DUMPABLE: - error = get_dumpable(current->mm); + error = get_dumpable(me->mm); break; case PR_SET_DUMPABLE: if (arg2 < 0 || arg2 > 1) { error = -EINVAL; break; } - set_dumpable(current->mm, arg2); + set_dumpable(me->mm, arg2); + error = 0; break; case PR_SET_UNALIGN: - error = SET_UNALIGN_CTL(current, arg2); + error = SET_UNALIGN_CTL(me, arg2); break; case PR_GET_UNALIGN: - error = GET_UNALIGN_CTL(current, arg2); + error = GET_UNALIGN_CTL(me, arg2); break; case PR_SET_FPEMU: - error = SET_FPEMU_CTL(current, arg2); + error = SET_FPEMU_CTL(me, arg2); break; case PR_GET_FPEMU: - error = GET_FPEMU_CTL(current, arg2); + error = GET_FPEMU_CTL(me, arg2); break; case PR_SET_FPEXC: - error = SET_FPEXC_CTL(current, arg2); + error = SET_FPEXC_CTL(me, arg2); break; case PR_GET_FPEXC: - error = GET_FPEXC_CTL(current, arg2); + error = GET_FPEXC_CTL(me, arg2); break; case PR_GET_TIMING: error = PR_TIMING_STATISTICAL; @@ -1675,33 +1715,28 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, case PR_SET_TIMING: if (arg2 != PR_TIMING_STATISTICAL) error = -EINVAL; + else + error = 0; break; - case PR_SET_NAME: { - struct task_struct *me = current; - unsigned char ncomm[sizeof(me->comm)]; - - ncomm[sizeof(me->comm)-1] = 0; - if (strncpy_from_user(ncomm, (char __user *)arg2, - sizeof(me->comm)-1) < 0) + case PR_SET_NAME: + comm[sizeof(me->comm)-1] = 0; + if (strncpy_from_user(comm, (char __user *)arg2, + sizeof(me->comm) - 1) < 0) return -EFAULT; - set_task_comm(me, ncomm); + set_task_comm(me, comm); return 0; - } - case PR_GET_NAME: { - struct task_struct *me = current; - unsigned char tcomm[sizeof(me->comm)]; - - get_task_comm(tcomm, me); - if (copy_to_user((char __user *)arg2, tcomm, sizeof(tcomm))) + case PR_GET_NAME: + get_task_comm(comm, me); + if (copy_to_user((char __user *)arg2, comm, + sizeof(comm))) return -EFAULT; return 0; - } case PR_GET_ENDIAN: - error = GET_ENDIAN(current, arg2); + error = GET_ENDIAN(me, arg2); break; case PR_SET_ENDIAN: - error = SET_ENDIAN(current, arg2); + error = SET_ENDIAN(me, arg2); break; case PR_GET_SECCOMP: @@ -1725,6 +1760,7 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, current->default_timer_slack_ns; else current->timer_slack_ns = arg2; + error = 0; break; default: error = -EINVAL; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9f3b478f917..5c97c5b4ea8 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -246,7 +246,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) memcpy(data->comm, tsk->comm, TASK_COMM_LEN); data->pid = tsk->pid; - data->uid = tsk->uid; + data->uid = task_uid(tsk); data->nice = tsk->static_prio - 20 - MAX_RT_PRIO; data->policy = tsk->policy; data->rt_priority = tsk->rt_priority; diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 8ebcd8532df..6d1ed07bf31 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -53,8 +53,8 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk) stats->ac_flag |= AXSIG; stats->ac_nice = task_nice(tsk); stats->ac_sched = tsk->policy; - stats->ac_uid = tsk->uid; - stats->ac_gid = tsk->gid; + stats->ac_uid = tsk->cred->uid; + stats->ac_gid = tsk->cred->gid; stats->ac_pid = tsk->pid; rcu_read_lock(); stats->ac_ppid = pid_alive(tsk) ? diff --git a/kernel/uid16.c b/kernel/uid16.c index 3e41c1673e2..71f07fc39fe 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c @@ -86,9 +86,9 @@ asmlinkage long sys_getresuid16(old_uid_t __user *ruid, old_uid_t __user *euid, { int retval; - if (!(retval = put_user(high2lowuid(current->uid), ruid)) && - !(retval = put_user(high2lowuid(current->euid), euid))) - retval = put_user(high2lowuid(current->suid), suid); + if (!(retval = put_user(high2lowuid(current->cred->uid), ruid)) && + !(retval = put_user(high2lowuid(current->cred->euid), euid))) + retval = put_user(high2lowuid(current->cred->suid), suid); return retval; } @@ -106,9 +106,9 @@ asmlinkage long sys_getresgid16(old_gid_t __user *rgid, old_gid_t __user *egid, { int retval; - if (!(retval = put_user(high2lowgid(current->gid), rgid)) && - !(retval = put_user(high2lowgid(current->egid), egid))) - retval = put_user(high2lowgid(current->sgid), sgid); + if (!(retval = put_user(high2lowgid(current->cred->gid), rgid)) && + !(retval = put_user(high2lowgid(current->cred->egid), egid))) + retval = put_user(high2lowgid(current->cred->sgid), sgid); return retval; } @@ -166,20 +166,20 @@ asmlinkage long sys_getgroups16(int gidsetsize, old_gid_t __user *grouplist) if (gidsetsize < 0) return -EINVAL; - get_group_info(current->group_info); - i = current->group_info->ngroups; + get_group_info(current->cred->group_info); + i = current->cred->group_info->ngroups; if (gidsetsize) { if (i > gidsetsize) { i = -EINVAL; goto out; } - if (groups16_to_user(grouplist, current->group_info)) { + if (groups16_to_user(grouplist, current->cred->group_info)) { i = -EFAULT; goto out; } } out: - put_group_info(current->group_info); + put_group_info(current->cred->group_info); return i; } @@ -210,20 +210,20 @@ asmlinkage long sys_setgroups16(int gidsetsize, old_gid_t __user *grouplist) asmlinkage long sys_getuid16(void) { - return high2lowuid(current->uid); + return high2lowuid(current->cred->uid); } asmlinkage long sys_geteuid16(void) { - return high2lowuid(current->euid); + return high2lowuid(current->cred->euid); } asmlinkage long sys_getgid16(void) { - return high2lowgid(current->gid); + return high2lowgid(current->cred->gid); } asmlinkage long sys_getegid16(void) { - return high2lowgid(current->egid); + return high2lowgid(current->cred->egid); } diff --git a/kernel/user.c b/kernel/user.c index 39d6159fae4..104d22ac84d 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -457,11 +457,11 @@ void switch_uid(struct user_struct *new_user) * cheaply with the new uid cache, so if it matters * we should be checking for it. -DaveM */ - old_user = current->user; + old_user = current->cred->user; atomic_inc(&new_user->processes); atomic_dec(&old_user->processes); switch_uid_keyring(new_user); - current->user = new_user; + current->cred->user = new_user; sched_switch_user(current); /* diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 07a96474077..b23492ee3e5 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1110,12 +1110,12 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, const unsigned long __user *old_nodes, const unsigned long __user *new_nodes) { + struct cred *cred, *tcred; struct mm_struct *mm; struct task_struct *task; nodemask_t old; nodemask_t new; nodemask_t task_nodes; - uid_t uid, euid; int err; err = get_nodes(&old, old_nodes, maxnode); @@ -1145,10 +1145,10 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, * capabilities, superuser privileges or the same * userid as the target process. */ - uid = current_uid(); - euid = current_euid(); - if (euid != task->suid && euid != task->uid && - uid != task->suid && uid != task->uid && + cred = current->cred; + tcred = task->cred; + if (cred->euid != tcred->suid && cred->euid != tcred->uid && + cred->uid != tcred->suid && cred->uid != tcred->uid && !capable(CAP_SYS_NICE)) { err = -EPERM; goto out; diff --git a/mm/migrate.c b/mm/migrate.c index 6263c24c4af..794443da1b4 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1045,10 +1045,10 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, const int __user *nodes, int __user *status, int flags) { + struct cred *cred, *tcred; struct task_struct *task; struct mm_struct *mm; int err; - uid_t uid, euid; /* Check flags */ if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL)) @@ -1076,10 +1076,10 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, * capabilities, superuser privileges or the same * userid as the target process. */ - uid = current_uid(); - euid = current_euid(); - if (euid != task->suid && euid != task->uid && - uid != task->suid && uid != task->uid && + cred = current->cred; + tcred = task->cred; + if (cred->euid != tcred->suid && cred->euid != tcred->uid && + cred->uid != tcred->suid && cred->uid != tcred->uid && !capable(CAP_SYS_NICE)) { err = -EPERM; goto out; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 34a458aa799..3af787ba207 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -298,7 +298,7 @@ static void dump_tasks(const struct mem_cgroup *mem) task_lock(p); printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n", - p->pid, p->uid, p->tgid, p->mm->total_vm, + p->pid, p->cred->uid, p->tgid, p->mm->total_vm, get_mm_rss(p->mm), (int)task_cpu(p), p->oomkilladj, p->comm); task_unlock(p); diff --git a/net/core/scm.c b/net/core/scm.c index 4681d8f9b45..c28ca32a7d9 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -44,11 +44,13 @@ static __inline__ int scm_check_creds(struct ucred *creds) { + struct cred *cred = current->cred; + if ((creds->pid == task_tgid_vnr(current) || capable(CAP_SYS_ADMIN)) && - ((creds->uid == current_uid() || creds->uid == current_euid() || - creds->uid == current_suid()) || capable(CAP_SETUID)) && - ((creds->gid == current_gid() || creds->gid == current_egid() || - creds->gid == current_sgid()) || capable(CAP_SETGID))) { + ((creds->uid == cred->uid || creds->uid == cred->euid || + creds->uid == cred->suid) || capable(CAP_SETUID)) && + ((creds->gid == cred->gid || creds->gid == cred->egid || + creds->gid == cred->sgid) || capable(CAP_SETGID))) { return 0; } return -EPERM; diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index 8fc38057880..c7954321260 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -353,7 +353,7 @@ rpcauth_lookupcred(struct rpc_auth *auth, int flags) struct auth_cred acred = { .uid = current_fsuid(), .gid = current_fsgid(), - .group_info = current->group_info, + .group_info = current->cred->group_info, }; struct rpc_cred *ret; diff --git a/security/commoncap.c b/security/commoncap.c index fb4e240720d..fa61679f8c7 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -30,7 +30,7 @@ int cap_netlink_send(struct sock *sk, struct sk_buff *skb) { - NETLINK_CB(skb).eff_cap = current->cap_effective; + NETLINK_CB(skb).eff_cap = current_cap(); return 0; } @@ -52,7 +52,7 @@ EXPORT_SYMBOL(cap_netlink_recv); int cap_capable(struct task_struct *tsk, int cap, int audit) { /* Derived from include/linux/sched.h:capable. */ - if (cap_raised(tsk->cap_effective, cap)) + if (cap_raised(tsk->cred->cap_effective, cap)) return 0; return -EPERM; } @@ -67,7 +67,8 @@ int cap_settime(struct timespec *ts, struct timezone *tz) int cap_ptrace_may_access(struct task_struct *child, unsigned int mode) { /* Derived from arch/i386/kernel/ptrace.c:sys_ptrace. */ - if (cap_issubset(child->cap_permitted, current->cap_permitted)) + if (cap_issubset(child->cred->cap_permitted, + current->cred->cap_permitted)) return 0; if (capable(CAP_SYS_PTRACE)) return 0; @@ -76,8 +77,8 @@ int cap_ptrace_may_access(struct task_struct *child, unsigned int mode) int cap_ptrace_traceme(struct task_struct *parent) { - /* Derived from arch/i386/kernel/ptrace.c:sys_ptrace. */ - if (cap_issubset(current->cap_permitted, parent->cap_permitted)) + if (cap_issubset(current->cred->cap_permitted, + parent->cred->cap_permitted)) return 0; if (has_capability(parent, CAP_SYS_PTRACE)) return 0; @@ -87,10 +88,12 @@ int cap_ptrace_traceme(struct task_struct *parent) int cap_capget (struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted) { + struct cred *cred = target->cred; + /* Derived from kernel/capability.c:sys_capget. */ - *effective = target->cap_effective; - *inheritable = target->cap_inheritable; - *permitted = target->cap_permitted; + *effective = cred->cap_effective; + *inheritable = cred->cap_inheritable; + *permitted = cred->cap_permitted; return 0; } @@ -122,24 +125,26 @@ int cap_capset_check(const kernel_cap_t *effective, const kernel_cap_t *inheritable, const kernel_cap_t *permitted) { + const struct cred *cred = current->cred; + if (cap_inh_is_capped() && !cap_issubset(*inheritable, - cap_combine(current->cap_inheritable, - current->cap_permitted))) { + cap_combine(cred->cap_inheritable, + cred->cap_permitted))) { /* incapable of using this inheritable set */ return -EPERM; } if (!cap_issubset(*inheritable, - cap_combine(current->cap_inheritable, - current->cap_bset))) { + cap_combine(cred->cap_inheritable, + cred->cap_bset))) { /* no new pI capabilities outside bounding set */ return -EPERM; } /* verify restrictions on target's new Permitted set */ if (!cap_issubset (*permitted, - cap_combine (current->cap_permitted, - current->cap_permitted))) { + cap_combine (cred->cap_permitted, + cred->cap_permitted))) { return -EPERM; } @@ -155,9 +160,11 @@ void cap_capset_set(const kernel_cap_t *effective, const kernel_cap_t *inheritable, const kernel_cap_t *permitted) { - current->cap_effective = *effective; - current->cap_inheritable = *inheritable; - current->cap_permitted = *permitted; + struct cred *cred = current->cred; + + cred->cap_effective = *effective; + cred->cap_inheritable = *inheritable; + cred->cap_permitted = *permitted; } static inline void bprm_clear_caps(struct linux_binprm *bprm) @@ -211,8 +218,8 @@ static inline int bprm_caps_from_vfs_caps(struct cpu_vfs_cap_data *caps, * pP' = (X & fP) | (pI & fI) */ bprm->cap_post_exec_permitted.cap[i] = - (current->cap_bset.cap[i] & permitted) | - (current->cap_inheritable.cap[i] & inheritable); + (current->cred->cap_bset.cap[i] & permitted) | + (current->cred->cap_inheritable.cap[i] & inheritable); if (permitted & ~bprm->cap_post_exec_permitted.cap[i]) { /* @@ -354,8 +361,8 @@ int cap_bprm_set_security (struct linux_binprm *bprm) if (bprm->e_uid == 0 || current_uid() == 0) { /* pP' = (cap_bset & ~0) | (pI & ~0) */ bprm->cap_post_exec_permitted = cap_combine( - current->cap_bset, current->cap_inheritable - ); + current->cred->cap_bset, + current->cred->cap_inheritable); bprm->cap_effective = (bprm->e_uid == 0); ret = 0; } @@ -366,44 +373,39 @@ int cap_bprm_set_security (struct linux_binprm *bprm) void cap_bprm_apply_creds (struct linux_binprm *bprm, int unsafe) { - kernel_cap_t pP = current->cap_permitted; - kernel_cap_t pE = current->cap_effective; - uid_t uid; - gid_t gid; + struct cred *cred = current->cred; - current_uid_gid(&uid, &gid); - - if (bprm->e_uid != uid || bprm->e_gid != gid || + if (bprm->e_uid != cred->uid || bprm->e_gid != cred->gid || !cap_issubset(bprm->cap_post_exec_permitted, - current->cap_permitted)) { + cred->cap_permitted)) { set_dumpable(current->mm, suid_dumpable); current->pdeath_signal = 0; if (unsafe & ~LSM_UNSAFE_PTRACE_CAP) { if (!capable(CAP_SETUID)) { - bprm->e_uid = uid; - bprm->e_gid = gid; + bprm->e_uid = cred->uid; + bprm->e_gid = cred->gid; } if (cap_limit_ptraced_target()) { bprm->cap_post_exec_permitted = cap_intersect( bprm->cap_post_exec_permitted, - current->cap_permitted); + cred->cap_permitted); } } } - current->suid = current->euid = current->fsuid = bprm->e_uid; - current->sgid = current->egid = current->fsgid = bprm->e_gid; + cred->suid = cred->euid = cred->fsuid = bprm->e_uid; + cred->sgid = cred->egid = cred->fsgid = bprm->e_gid; /* For init, we want to retain the capabilities set * in the init_task struct. Thus we skip the usual * capability rules */ if (!is_global_init(current)) { - current->cap_permitted = bprm->cap_post_exec_permitted; + cred->cap_permitted = bprm->cap_post_exec_permitted; if (bprm->cap_effective) - current->cap_effective = bprm->cap_post_exec_permitted; + cred->cap_effective = bprm->cap_post_exec_permitted; else - cap_clear(current->cap_effective); + cap_clear(cred->cap_effective); } /* @@ -418,27 +420,30 @@ void cap_bprm_apply_creds (struct linux_binprm *bprm, int unsafe) * Number 1 above might fail if you don't have a full bset, but I think * that is interesting information to audit. */ - if (!cap_isclear(current->cap_effective)) { - if (!cap_issubset(CAP_FULL_SET, current->cap_effective) || - (bprm->e_uid != 0) || (current->uid != 0) || + if (!cap_isclear(cred->cap_effective)) { + if (!cap_issubset(CAP_FULL_SET, cred->cap_effective) || + (bprm->e_uid != 0) || (cred->uid != 0) || issecure(SECURE_NOROOT)) - audit_log_bprm_fcaps(bprm, &pP, &pE); + audit_log_bprm_fcaps(bprm, &cred->cap_permitted, + &cred->cap_effective); } - current->securebits &= ~issecure_mask(SECURE_KEEP_CAPS); + cred->securebits &= ~issecure_mask(SECURE_KEEP_CAPS); } int cap_bprm_secureexec (struct linux_binprm *bprm) { - if (current_uid() != 0) { + const struct cred *cred = current->cred; + + if (cred->uid != 0) { if (bprm->cap_effective) return 1; if (!cap_isclear(bprm->cap_post_exec_permitted)) return 1; } - return (current_euid() != current_uid() || - current_egid() != current_gid()); + return (cred->euid != cred->uid || + cred->egid != cred->gid); } int cap_inode_setxattr(struct dentry *dentry, const char *name, @@ -501,25 +506,27 @@ int cap_inode_removexattr(struct dentry *dentry, const char *name) static inline void cap_emulate_setxuid (int old_ruid, int old_euid, int old_suid) { - uid_t euid = current_euid(); + struct cred *cred = current->cred; if ((old_ruid == 0 || old_euid == 0 || old_suid == 0) && - (current_uid() != 0 && euid != 0 && current_suid() != 0) && + (cred->uid != 0 && cred->euid != 0 && cred->suid != 0) && !issecure(SECURE_KEEP_CAPS)) { - cap_clear (current->cap_permitted); - cap_clear (current->cap_effective); + cap_clear (cred->cap_permitted); + cap_clear (cred->cap_effective); } - if (old_euid == 0 && euid != 0) { - cap_clear (current->cap_effective); + if (old_euid == 0 && cred->euid != 0) { + cap_clear (cred->cap_effective); } - if (old_euid != 0 && euid == 0) { - current->cap_effective = current->cap_permitted; + if (old_euid != 0 && cred->euid == 0) { + cred->cap_effective = cred->cap_permitted; } } int cap_task_post_setuid (uid_t old_ruid, uid_t old_euid, uid_t old_suid, int flags) { + struct cred *cred = current->cred; + switch (flags) { case LSM_SETID_RE: case LSM_SETID_ID: @@ -541,16 +548,16 @@ int cap_task_post_setuid (uid_t old_ruid, uid_t old_euid, uid_t old_suid, */ if (!issecure (SECURE_NO_SETUID_FIXUP)) { - if (old_fsuid == 0 && current_fsuid() != 0) { - current->cap_effective = + if (old_fsuid == 0 && cred->fsuid != 0) { + cred->cap_effective = cap_drop_fs_set( - current->cap_effective); + cred->cap_effective); } - if (old_fsuid != 0 && current_fsuid() == 0) { - current->cap_effective = + if (old_fsuid != 0 && cred->fsuid == 0) { + cred->cap_effective = cap_raise_fs_set( - current->cap_effective, - current->cap_permitted); + cred->cap_effective, + cred->cap_permitted); } } break; @@ -575,7 +582,8 @@ int cap_task_post_setuid (uid_t old_ruid, uid_t old_euid, uid_t old_suid, */ static int cap_safe_nice(struct task_struct *p) { - if (!cap_issubset(p->cap_permitted, current->cap_permitted) && + if (!cap_issubset(p->cred->cap_permitted, + current->cred->cap_permitted) && !capable(CAP_SYS_NICE)) return -EPERM; return 0; @@ -610,7 +618,7 @@ static long cap_prctl_drop(unsigned long cap) return -EPERM; if (!cap_valid(cap)) return -EINVAL; - cap_lower(current->cap_bset, cap); + cap_lower(current->cred->cap_bset, cap); return 0; } @@ -633,6 +641,7 @@ int cap_task_setnice (struct task_struct *p, int nice) int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5, long *rc_p) { + struct cred *cred = current->cred; long error = 0; switch (option) { @@ -640,7 +649,7 @@ int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3, if (!cap_valid(arg2)) error = -EINVAL; else - error = !!cap_raised(current->cap_bset, arg2); + error = !!cap_raised(cred->cap_bset, arg2); break; #ifdef CONFIG_SECURITY_FILE_CAPABILITIES case PR_CAPBSET_DROP: @@ -667,9 +676,9 @@ int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3, * capability-based-privilege environment. */ case PR_SET_SECUREBITS: - if ((((current->securebits & SECURE_ALL_LOCKS) >> 1) - & (current->securebits ^ arg2)) /*[1]*/ - || ((current->securebits & SECURE_ALL_LOCKS + if ((((cred->securebits & SECURE_ALL_LOCKS) >> 1) + & (cred->securebits ^ arg2)) /*[1]*/ + || ((cred->securebits & SECURE_ALL_LOCKS & ~arg2)) /*[2]*/ || (arg2 & ~(SECURE_ALL_LOCKS | SECURE_ALL_BITS)) /*[3]*/ || (cap_capable(current, CAP_SETPCAP, SECURITY_CAP_AUDIT) != 0)) { /*[4]*/ @@ -682,11 +691,11 @@ int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3, */ error = -EPERM; /* cannot change a locked bit */ } else { - current->securebits = arg2; + cred->securebits = arg2; } break; case PR_GET_SECUREBITS: - error = current->securebits; + error = cred->securebits; break; #endif /* def CONFIG_SECURITY_FILE_CAPABILITIES */ @@ -701,10 +710,9 @@ int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3, else if (issecure(SECURE_KEEP_CAPS_LOCKED)) error = -EPERM; else if (arg2) - current->securebits |= issecure_mask(SECURE_KEEP_CAPS); + cred->securebits |= issecure_mask(SECURE_KEEP_CAPS); else - current->securebits &= - ~issecure_mask(SECURE_KEEP_CAPS); + cred->securebits &= ~issecure_mask(SECURE_KEEP_CAPS); break; default: @@ -719,11 +727,12 @@ int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3, void cap_task_reparent_to_init (struct task_struct *p) { - cap_set_init_eff(p->cap_effective); - cap_clear(p->cap_inheritable); - cap_set_full(p->cap_permitted); - p->securebits = SECUREBITS_DEFAULT; - return; + struct cred *cred = p->cred; + + cap_set_init_eff(cred->cap_effective); + cap_clear(cred->cap_inheritable); + cap_set_full(cred->cap_permitted); + p->cred->securebits = SECUREBITS_DEFAULT; } int cap_syslog (int type) diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c index fcce331eca7..8833b447ade 100644 --- a/security/keys/keyctl.c +++ b/security/keys/keyctl.c @@ -889,7 +889,7 @@ long keyctl_instantiate_key(key_serial_t id, /* the appropriate instantiation authorisation key must have been * assumed before calling this */ ret = -EPERM; - instkey = current->request_key_auth; + instkey = current->cred->request_key_auth; if (!instkey) goto error; @@ -932,8 +932,8 @@ long keyctl_instantiate_key(key_serial_t id, /* discard the assumed authority if it's just been disabled by * instantiation of the key */ if (ret == 0) { - key_put(current->request_key_auth); - current->request_key_auth = NULL; + key_put(current->cred->request_key_auth); + current->cred->request_key_auth = NULL; } error2: @@ -960,7 +960,7 @@ long keyctl_negate_key(key_serial_t id, unsigned timeout, key_serial_t ringid) /* the appropriate instantiation authorisation key must have been * assumed before calling this */ ret = -EPERM; - instkey = current->request_key_auth; + instkey = current->cred->request_key_auth; if (!instkey) goto error; @@ -983,8 +983,8 @@ long keyctl_negate_key(key_serial_t id, unsigned timeout, key_serial_t ringid) /* discard the assumed authority if it's just been disabled by * instantiation of the key */ if (ret == 0) { - key_put(current->request_key_auth); - current->request_key_auth = NULL; + key_put(current->cred->request_key_auth); + current->cred->request_key_auth = NULL; } error: @@ -999,6 +999,7 @@ error: */ long keyctl_set_reqkey_keyring(int reqkey_defl) { + struct cred *cred = current->cred; int ret; switch (reqkey_defl) { @@ -1018,10 +1019,10 @@ long keyctl_set_reqkey_keyring(int reqkey_defl) case KEY_REQKEY_DEFL_USER_KEYRING: case KEY_REQKEY_DEFL_USER_SESSION_KEYRING: set: - current->jit_keyring = reqkey_defl; + cred->jit_keyring = reqkey_defl; case KEY_REQKEY_DEFL_NO_CHANGE: - return current->jit_keyring; + return cred->jit_keyring; case KEY_REQKEY_DEFL_GROUP_KEYRING: default: @@ -1086,8 +1087,8 @@ long keyctl_assume_authority(key_serial_t id) /* we divest ourselves of authority if given an ID of 0 */ if (id == 0) { - key_put(current->request_key_auth); - current->request_key_auth = NULL; + key_put(current->cred->request_key_auth); + current->cred->request_key_auth = NULL; ret = 0; goto error; } @@ -1103,8 +1104,8 @@ long keyctl_assume_authority(key_serial_t id) goto error; } - key_put(current->request_key_auth); - current->request_key_auth = authkey; + key_put(current->cred->request_key_auth); + current->cred->request_key_auth = authkey; ret = authkey->serial; error: diff --git a/security/keys/permission.c b/security/keys/permission.c index 3b41f9b5253..baf3d5f31e7 100644 --- a/security/keys/permission.c +++ b/security/keys/permission.c @@ -22,6 +22,7 @@ int key_task_permission(const key_ref_t key_ref, struct task_struct *context, key_perm_t perm) { + struct cred *cred = context->cred; struct key *key; key_perm_t kperm; int ret; @@ -29,7 +30,7 @@ int key_task_permission(const key_ref_t key_ref, key = key_ref_to_ptr(key_ref); /* use the second 8-bits of permissions for keys the caller owns */ - if (key->uid == context->fsuid) { + if (key->uid == cred->fsuid) { kperm = key->perm >> 16; goto use_these_perms; } @@ -37,14 +38,14 @@ int key_task_permission(const key_ref_t key_ref, /* use the third 8-bits of permissions for keys the caller has a group * membership in common with */ if (key->gid != -1 && key->perm & KEY_GRP_ALL) { - if (key->gid == context->fsgid) { + if (key->gid == cred->fsgid) { kperm = key->perm >> 8; goto use_these_perms; } - task_lock(context); - ret = groups_search(context->group_info, key->gid); - task_unlock(context); + spin_lock(&cred->lock); + ret = groups_search(cred->group_info, key->gid); + spin_unlock(&cred->lock); if (ret) { kperm = key->perm >> 8; diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c index 1c793b7090a..b0904cdda2e 100644 --- a/security/keys/process_keys.c +++ b/security/keys/process_keys.c @@ -42,7 +42,7 @@ struct key_user root_key_user = { */ int install_user_keyrings(void) { - struct user_struct *user = current->user; + struct user_struct *user = current->cred->user; struct key *uid_keyring, *session_keyring; char buf[20]; int ret; @@ -156,7 +156,7 @@ int install_thread_keyring(void) sprintf(buf, "_tid.%u", tsk->pid); - keyring = keyring_alloc(buf, tsk->uid, tsk->gid, tsk, + keyring = keyring_alloc(buf, tsk->cred->uid, tsk->cred->gid, tsk, KEY_ALLOC_QUOTA_OVERRUN, NULL); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); @@ -164,8 +164,8 @@ int install_thread_keyring(void) } task_lock(tsk); - old = tsk->thread_keyring; - tsk->thread_keyring = keyring; + old = tsk->cred->thread_keyring; + tsk->cred->thread_keyring = keyring; task_unlock(tsk); ret = 0; @@ -192,7 +192,7 @@ int install_process_keyring(void) if (!tsk->signal->process_keyring) { sprintf(buf, "_pid.%u", tsk->tgid); - keyring = keyring_alloc(buf, tsk->uid, tsk->gid, tsk, + keyring = keyring_alloc(buf, tsk->cred->uid, tsk->cred->gid, tsk, KEY_ALLOC_QUOTA_OVERRUN, NULL); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); @@ -238,7 +238,7 @@ static int install_session_keyring(struct key *keyring) if (tsk->signal->session_keyring) flags = KEY_ALLOC_IN_QUOTA; - keyring = keyring_alloc(buf, tsk->uid, tsk->gid, tsk, + keyring = keyring_alloc(buf, tsk->cred->uid, tsk->cred->gid, tsk, flags, NULL); if (IS_ERR(keyring)) return PTR_ERR(keyring); @@ -292,14 +292,14 @@ int copy_thread_group_keys(struct task_struct *tsk) */ int copy_keys(unsigned long clone_flags, struct task_struct *tsk) { - key_check(tsk->thread_keyring); - key_check(tsk->request_key_auth); + key_check(tsk->cred->thread_keyring); + key_check(tsk->cred->request_key_auth); /* no thread keyring yet */ - tsk->thread_keyring = NULL; + tsk->cred->thread_keyring = NULL; /* copy the request_key() authorisation for this thread */ - key_get(tsk->request_key_auth); + key_get(tsk->cred->request_key_auth); return 0; @@ -322,8 +322,8 @@ void exit_thread_group_keys(struct signal_struct *tg) */ void exit_keys(struct task_struct *tsk) { - key_put(tsk->thread_keyring); - key_put(tsk->request_key_auth); + key_put(tsk->cred->thread_keyring); + key_put(tsk->cred->request_key_auth); } /* end exit_keys() */ @@ -337,8 +337,8 @@ int exec_keys(struct task_struct *tsk) /* newly exec'd tasks don't get a thread keyring */ task_lock(tsk); - old = tsk->thread_keyring; - tsk->thread_keyring = NULL; + old = tsk->cred->thread_keyring; + tsk->cred->thread_keyring = NULL; task_unlock(tsk); key_put(old); @@ -373,10 +373,11 @@ int suid_keys(struct task_struct *tsk) void key_fsuid_changed(struct task_struct *tsk) { /* update the ownership of the thread keyring */ - if (tsk->thread_keyring) { - down_write(&tsk->thread_keyring->sem); - tsk->thread_keyring->uid = tsk->fsuid; - up_write(&tsk->thread_keyring->sem); + BUG_ON(!tsk->cred); + if (tsk->cred->thread_keyring) { + down_write(&tsk->cred->thread_keyring->sem); + tsk->cred->thread_keyring->uid = tsk->cred->fsuid; + up_write(&tsk->cred->thread_keyring->sem); } } /* end key_fsuid_changed() */ @@ -388,10 +389,11 @@ void key_fsuid_changed(struct task_struct *tsk) void key_fsgid_changed(struct task_struct *tsk) { /* update the ownership of the thread keyring */ - if (tsk->thread_keyring) { - down_write(&tsk->thread_keyring->sem); - tsk->thread_keyring->gid = tsk->fsgid; - up_write(&tsk->thread_keyring->sem); + BUG_ON(!tsk->cred); + if (tsk->cred->thread_keyring) { + down_write(&tsk->cred->thread_keyring->sem); + tsk->cred->thread_keyring->gid = tsk->cred->fsgid; + up_write(&tsk->cred->thread_keyring->sem); } } /* end key_fsgid_changed() */ @@ -426,9 +428,9 @@ key_ref_t search_process_keyrings(struct key_type *type, err = ERR_PTR(-EAGAIN); /* search the thread keyring first */ - if (context->thread_keyring) { + if (context->cred->thread_keyring) { key_ref = keyring_search_aux( - make_key_ref(context->thread_keyring, 1), + make_key_ref(context->cred->thread_keyring, 1), context, type, description, match); if (!IS_ERR(key_ref)) goto found; @@ -493,9 +495,9 @@ key_ref_t search_process_keyrings(struct key_type *type, } } /* or search the user-session keyring */ - else if (context->user->session_keyring) { + else if (context->cred->user->session_keyring) { key_ref = keyring_search_aux( - make_key_ref(context->user->session_keyring, 1), + make_key_ref(context->cred->user->session_keyring, 1), context, type, description, match); if (!IS_ERR(key_ref)) goto found; @@ -517,20 +519,20 @@ key_ref_t search_process_keyrings(struct key_type *type, * search the keyrings of the process mentioned there * - we don't permit access to request_key auth keys via this method */ - if (context->request_key_auth && + if (context->cred->request_key_auth && context == current && type != &key_type_request_key_auth ) { /* defend against the auth key being revoked */ - down_read(&context->request_key_auth->sem); + down_read(&context->cred->request_key_auth->sem); - if (key_validate(context->request_key_auth) == 0) { - rka = context->request_key_auth->payload.data; + if (key_validate(context->cred->request_key_auth) == 0) { + rka = context->cred->request_key_auth->payload.data; key_ref = search_process_keyrings(type, description, match, rka->context); - up_read(&context->request_key_auth->sem); + up_read(&context->cred->request_key_auth->sem); if (!IS_ERR(key_ref)) goto found; @@ -547,7 +549,7 @@ key_ref_t search_process_keyrings(struct key_type *type, break; } } else { - up_read(&context->request_key_auth->sem); + up_read(&context->cred->request_key_auth->sem); } } @@ -580,15 +582,16 @@ key_ref_t lookup_user_key(key_serial_t id, int create, int partial, { struct request_key_auth *rka; struct task_struct *t = current; - key_ref_t key_ref, skey_ref; + struct cred *cred = t->cred; struct key *key; + key_ref_t key_ref, skey_ref; int ret; key_ref = ERR_PTR(-ENOKEY); switch (id) { case KEY_SPEC_THREAD_KEYRING: - if (!t->thread_keyring) { + if (!cred->thread_keyring) { if (!create) goto error; @@ -599,7 +602,7 @@ key_ref_t lookup_user_key(key_serial_t id, int create, int partial, } } - key = t->thread_keyring; + key = cred->thread_keyring; atomic_inc(&key->usage); key_ref = make_key_ref(key, 1); break; @@ -628,7 +631,8 @@ key_ref_t lookup_user_key(key_serial_t id, int create, int partial, ret = install_user_keyrings(); if (ret < 0) goto error; - ret = install_session_keyring(t->user->session_keyring); + ret = install_session_keyring( + cred->user->session_keyring); if (ret < 0) goto error; } @@ -641,25 +645,25 @@ key_ref_t lookup_user_key(key_serial_t id, int create, int partial, break; case KEY_SPEC_USER_KEYRING: - if (!t->user->uid_keyring) { + if (!cred->user->uid_keyring) { ret = install_user_keyrings(); if (ret < 0) goto error; } - key = t->user->uid_keyring; + key = cred->user->uid_keyring; atomic_inc(&key->usage); key_ref = make_key_ref(key, 1); break; case KEY_SPEC_USER_SESSION_KEYRING: - if (!t->user->session_keyring) { + if (!cred->user->session_keyring) { ret = install_user_keyrings(); if (ret < 0) goto error; } - key = t->user->session_keyring; + key = cred->user->session_keyring; atomic_inc(&key->usage); key_ref = make_key_ref(key, 1); break; @@ -670,7 +674,7 @@ key_ref_t lookup_user_key(key_serial_t id, int create, int partial, goto error; case KEY_SPEC_REQKEY_AUTH_KEY: - key = t->request_key_auth; + key = cred->request_key_auth; if (!key) goto error; @@ -679,19 +683,19 @@ key_ref_t lookup_user_key(key_serial_t id, int create, int partial, break; case KEY_SPEC_REQUESTOR_KEYRING: - if (!t->request_key_auth) + if (!cred->request_key_auth) goto error; - down_read(&t->request_key_auth->sem); - if (t->request_key_auth->flags & KEY_FLAG_REVOKED) { + down_read(&cred->request_key_auth->sem); + if (cred->request_key_auth->flags & KEY_FLAG_REVOKED) { key_ref = ERR_PTR(-EKEYREVOKED); key = NULL; } else { - rka = t->request_key_auth->payload.data; + rka = cred->request_key_auth->payload.data; key = rka->dest_keyring; atomic_inc(&key->usage); } - up_read(&t->request_key_auth->sem); + up_read(&cred->request_key_auth->sem); if (!key) goto error; key_ref = make_key_ref(key, 1); @@ -791,7 +795,7 @@ long join_session_keyring(const char *name) keyring = find_keyring_by_name(name, false); if (PTR_ERR(keyring) == -ENOKEY) { /* not found - try and create a new one */ - keyring = keyring_alloc(name, tsk->uid, tsk->gid, tsk, + keyring = keyring_alloc(name, tsk->cred->uid, tsk->cred->gid, tsk, KEY_ALLOC_IN_QUOTA, NULL); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); diff --git a/security/keys/request_key.c b/security/keys/request_key.c index 8e9d93b4a40..3e9b9eb1dd2 100644 --- a/security/keys/request_key.c +++ b/security/keys/request_key.c @@ -104,7 +104,8 @@ static int call_sbin_request_key(struct key_construction *cons, /* we specify the process's default keyrings */ sprintf(keyring_str[0], "%d", - tsk->thread_keyring ? tsk->thread_keyring->serial : 0); + tsk->cred->thread_keyring ? + tsk->cred->thread_keyring->serial : 0); prkey = 0; if (tsk->signal->process_keyring) @@ -117,7 +118,7 @@ static int call_sbin_request_key(struct key_construction *cons, sskey = rcu_dereference(tsk->signal->session_keyring)->serial; rcu_read_unlock(); } else { - sskey = tsk->user->session_keyring->serial; + sskey = tsk->cred->user->session_keyring->serial; } sprintf(keyring_str[2], "%d", sskey); @@ -232,11 +233,11 @@ static void construct_get_dest_keyring(struct key **_dest_keyring) } else { /* use a default keyring; falling through the cases until we * find one that we actually have */ - switch (tsk->jit_keyring) { + switch (tsk->cred->jit_keyring) { case KEY_REQKEY_DEFL_DEFAULT: case KEY_REQKEY_DEFL_REQUESTOR_KEYRING: - if (tsk->request_key_auth) { - authkey = tsk->request_key_auth; + if (tsk->cred->request_key_auth) { + authkey = tsk->cred->request_key_auth; down_read(&authkey->sem); rka = authkey->payload.data; if (!test_bit(KEY_FLAG_REVOKED, @@ -249,7 +250,7 @@ static void construct_get_dest_keyring(struct key **_dest_keyring) } case KEY_REQKEY_DEFL_THREAD_KEYRING: - dest_keyring = key_get(tsk->thread_keyring); + dest_keyring = key_get(tsk->cred->thread_keyring); if (dest_keyring) break; @@ -268,11 +269,12 @@ static void construct_get_dest_keyring(struct key **_dest_keyring) break; case KEY_REQKEY_DEFL_USER_SESSION_KEYRING: - dest_keyring = key_get(tsk->user->session_keyring); + dest_keyring = + key_get(tsk->cred->user->session_keyring); break; case KEY_REQKEY_DEFL_USER_KEYRING: - dest_keyring = key_get(tsk->user->uid_keyring); + dest_keyring = key_get(tsk->cred->user->uid_keyring); break; case KEY_REQKEY_DEFL_GROUP_KEYRING: diff --git a/security/keys/request_key_auth.c b/security/keys/request_key_auth.c index 1762d44711d..2125579d5d7 100644 --- a/security/keys/request_key_auth.c +++ b/security/keys/request_key_auth.c @@ -164,22 +164,22 @@ struct key *request_key_auth_new(struct key *target, const void *callout_info, /* see if the calling process is already servicing the key request of * another process */ - if (current->request_key_auth) { + if (current->cred->request_key_auth) { /* it is - use that instantiation context here too */ - down_read(¤t->request_key_auth->sem); + down_read(¤t->cred->request_key_auth->sem); /* if the auth key has been revoked, then the key we're * servicing is already instantiated */ if (test_bit(KEY_FLAG_REVOKED, - ¤t->request_key_auth->flags)) + ¤t->cred->request_key_auth->flags)) goto auth_key_revoked; - irka = current->request_key_auth->payload.data; + irka = current->cred->request_key_auth->payload.data; rka->context = irka->context; rka->pid = irka->pid; get_task_struct(rka->context); - up_read(¤t->request_key_auth->sem); + up_read(¤t->cred->request_key_auth->sem); } else { /* it isn't - use this process as the context */ @@ -214,7 +214,7 @@ struct key *request_key_auth_new(struct key *target, const void *callout_info, return authkey; auth_key_revoked: - up_read(¤t->request_key_auth->sem); + up_read(¤t->cred->request_key_auth->sem); kfree(rka->callout_info); kfree(rka); kleave("= -EKEYREVOKED"); diff --git a/security/selinux/exports.c b/security/selinux/exports.c index 64af2d3409e..cf02490cd1e 100644 --- a/security/selinux/exports.c +++ b/security/selinux/exports.c @@ -39,7 +39,7 @@ EXPORT_SYMBOL_GPL(selinux_string_to_sid); int selinux_secmark_relabel_packet_permission(u32 sid) { if (selinux_enabled) { - struct task_security_struct *tsec = current->security; + struct task_security_struct *tsec = current->cred->security; return avc_has_perm(tsec->sid, sid, SECCLASS_PACKET, PACKET__RELABELTO, NULL); diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 9f6da154cc8..328308f2882 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -167,21 +167,21 @@ static int task_alloc_security(struct task_struct *task) return -ENOMEM; tsec->osid = tsec->sid = SECINITSID_UNLABELED; - task->security = tsec; + task->cred->security = tsec; return 0; } static void task_free_security(struct task_struct *task) { - struct task_security_struct *tsec = task->security; - task->security = NULL; + struct task_security_struct *tsec = task->cred->security; + task->cred->security = NULL; kfree(tsec); } static int inode_alloc_security(struct inode *inode) { - struct task_security_struct *tsec = current->security; + struct task_security_struct *tsec = current->cred->security; struct inode_security_struct *isec; isec = kmem_cache_zalloc(sel_inode_cache, GFP_NOFS); @@ -215,7 +215,7 @@ static void inode_free_security(struct inode *inode) static int file_alloc_security(struct file *file) { - struct task_security_struct *tsec = current->security; + struct task_security_struct *tsec = current->cred->security; struct file_security_struct *fsec; fsec = kzalloc(sizeof(struct file_security_struct), GFP_KERNEL); @@ -554,7 +554,7 @@ static int selinux_set_mnt_opts(struct super_block *sb, struct security_mnt_opts *opts) { int rc = 0, i; - struct task_security_struct *tsec = current->security; + struct task_security_struct *tsec = current->cred->security; struct superblock_security_struct *sbsec = sb->s_security; const char *name = sb->s_type->name; struct inode *inode = sbsec->sb->s_root->d_inode; @@ -1353,8 +1353,8 @@ static int task_has_perm(struct task_struct *tsk1, { struct task_security_struct *tsec1, *tsec2; - tsec1 = tsk1->security; - tsec2 = tsk2->security; + tsec1 = tsk1->cred->security; + tsec2 = tsk2->cred->security; return avc_has_perm(tsec1->sid, tsec2->sid, SECCLASS_PROCESS, perms, NULL); } @@ -1374,7 +1374,7 @@ static int task_has_capability(struct task_struct *tsk, u32 av = CAP_TO_MASK(cap); int rc; - tsec = tsk->security; + tsec = tsk->cred->security; AVC_AUDIT_DATA_INIT(&ad, CAP); ad.tsk = tsk; @@ -1405,7 +1405,7 @@ static int task_has_system(struct task_struct *tsk, { struct task_security_struct *tsec; - tsec = tsk->security; + tsec = tsk->cred->security; return avc_has_perm(tsec->sid, SECINITSID_KERNEL, SECCLASS_SYSTEM, perms, NULL); @@ -1426,7 +1426,7 @@ static int inode_has_perm(struct task_struct *tsk, if (unlikely(IS_PRIVATE(inode))) return 0; - tsec = tsk->security; + tsec = tsk->cred->security; isec = inode->i_security; if (!adp) { @@ -1466,7 +1466,7 @@ static int file_has_perm(struct task_struct *tsk, struct file *file, u32 av) { - struct task_security_struct *tsec = tsk->security; + struct task_security_struct *tsec = tsk->cred->security; struct file_security_struct *fsec = file->f_security; struct inode *inode = file->f_path.dentry->d_inode; struct avc_audit_data ad; @@ -1503,7 +1503,7 @@ static int may_create(struct inode *dir, struct avc_audit_data ad; int rc; - tsec = current->security; + tsec = current->cred->security; dsec = dir->i_security; sbsec = dir->i_sb->s_security; @@ -1540,7 +1540,7 @@ static int may_create_key(u32 ksid, { struct task_security_struct *tsec; - tsec = ctx->security; + tsec = ctx->cred->security; return avc_has_perm(tsec->sid, ksid, SECCLASS_KEY, KEY__CREATE, NULL); } @@ -1561,7 +1561,7 @@ static int may_link(struct inode *dir, u32 av; int rc; - tsec = current->security; + tsec = current->cred->security; dsec = dir->i_security; isec = dentry->d_inode->i_security; @@ -1606,7 +1606,7 @@ static inline int may_rename(struct inode *old_dir, int old_is_dir, new_is_dir; int rc; - tsec = current->security; + tsec = current->cred->security; old_dsec = old_dir->i_security; old_isec = old_dentry->d_inode->i_security; old_is_dir = S_ISDIR(old_dentry->d_inode->i_mode); @@ -1659,7 +1659,7 @@ static int superblock_has_perm(struct task_struct *tsk, struct task_security_struct *tsec; struct superblock_security_struct *sbsec; - tsec = tsk->security; + tsec = tsk->cred->security; sbsec = sb->s_security; return avc_has_perm(tsec->sid, sbsec->sid, SECCLASS_FILESYSTEM, perms, ad); @@ -1758,8 +1758,8 @@ static int selinux_ptrace_may_access(struct task_struct *child, return rc; if (mode == PTRACE_MODE_READ) { - struct task_security_struct *tsec = current->security; - struct task_security_struct *csec = child->security; + struct task_security_struct *tsec = current->cred->security; + struct task_security_struct *csec = child->cred->security; return avc_has_perm(tsec->sid, csec->sid, SECCLASS_FILE, FILE__READ, NULL); } @@ -1874,7 +1874,7 @@ static int selinux_sysctl(ctl_table *table, int op) if (rc) return rc; - tsec = current->security; + tsec = current->cred->security; rc = selinux_sysctl_get_sid(table, (op == 0001) ? SECCLASS_DIR : SECCLASS_FILE, &tsid); @@ -2025,7 +2025,7 @@ static int selinux_bprm_set_security(struct linux_binprm *bprm) if (bsec->set) return 0; - tsec = current->security; + tsec = current->cred->security; isec = inode->i_security; /* Default to the current task SID. */ @@ -2090,7 +2090,7 @@ static int selinux_bprm_check_security(struct linux_binprm *bprm) static int selinux_bprm_secureexec(struct linux_binprm *bprm) { - struct task_security_struct *tsec = current->security; + struct task_security_struct *tsec = current->cred->security; int atsecure = 0; if (tsec->osid != tsec->sid) { @@ -2214,7 +2214,7 @@ static void selinux_bprm_apply_creds(struct linux_binprm *bprm, int unsafe) secondary_ops->bprm_apply_creds(bprm, unsafe); - tsec = current->security; + tsec = current->cred->security; bsec = bprm->security; sid = bsec->sid; @@ -2243,7 +2243,7 @@ static void selinux_bprm_apply_creds(struct linux_binprm *bprm, int unsafe) rcu_read_lock(); tracer = tracehook_tracer_task(current); if (likely(tracer != NULL)) { - sec = tracer->security; + sec = tracer->cred->security; ptsid = sec->sid; } rcu_read_unlock(); @@ -2274,7 +2274,7 @@ static void selinux_bprm_post_apply_creds(struct linux_binprm *bprm) int rc, i; unsigned long flags; - tsec = current->security; + tsec = current->cred->security; bsec = bprm->security; if (bsec->unsafe) { @@ -2521,7 +2521,7 @@ static int selinux_inode_init_security(struct inode *inode, struct inode *dir, int rc; char *namep = NULL, *context; - tsec = current->security; + tsec = current->cred->security; dsec = dir->i_security; sbsec = dir->i_sb->s_security; @@ -2706,7 +2706,7 @@ static int selinux_inode_setotherxattr(struct dentry *dentry, const char *name) static int selinux_inode_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { - struct task_security_struct *tsec = current->security; + struct task_security_struct *tsec = current->cred->security; struct inode *inode = dentry->d_inode; struct inode_security_struct *isec = inode->i_security; struct superblock_security_struct *sbsec; @@ -2918,7 +2918,7 @@ static int selinux_revalidate_file_permission(struct file *file, int mask) static int selinux_file_permission(struct file *file, int mask) { struct inode *inode = file->f_path.dentry->d_inode; - struct task_security_struct *tsec = current->security; + struct task_security_struct *tsec = current->cred->security; struct file_security_struct *fsec = file->f_security; struct inode_security_struct *isec = inode->i_security; @@ -2995,7 +2995,8 @@ static int selinux_file_mmap(struct file *file, unsigned long reqprot, unsigned long addr, unsigned long addr_only) { int rc = 0; - u32 sid = ((struct task_security_struct *)(current->security))->sid; + u32 sid = ((struct task_security_struct *) + (current->cred->security))->sid; if (addr < mmap_min_addr) rc = avc_has_perm(sid, sid, SECCLASS_MEMPROTECT, @@ -3107,7 +3108,7 @@ static int selinux_file_set_fowner(struct file *file) struct task_security_struct *tsec; struct file_security_struct *fsec; - tsec = current->security; + tsec = current->cred->security; fsec = file->f_security; fsec->fown_sid = tsec->sid; @@ -3125,7 +3126,7 @@ static int selinux_file_send_sigiotask(struct task_struct *tsk, /* struct fown_struct is never outside the context of a struct file */ file = container_of(fown, struct file, f_owner); - tsec = tsk->security; + tsec = tsk->cred->security; fsec = file->f_security; if (!signum) @@ -3188,12 +3189,12 @@ static int selinux_task_alloc_security(struct task_struct *tsk) struct task_security_struct *tsec1, *tsec2; int rc; - tsec1 = current->security; + tsec1 = current->cred->security; rc = task_alloc_security(tsk); if (rc) return rc; - tsec2 = tsk->security; + tsec2 = tsk->cred->security; tsec2->osid = tsec1->osid; tsec2->sid = tsec1->sid; @@ -3251,7 +3252,7 @@ static int selinux_task_getsid(struct task_struct *p) static void selinux_task_getsecid(struct task_struct *p, u32 *secid) { - struct task_security_struct *tsec = p->security; + struct task_security_struct *tsec = p->cred->security; *secid = tsec->sid; } @@ -3343,7 +3344,7 @@ static int selinux_task_kill(struct task_struct *p, struct siginfo *info, perm = PROCESS__SIGNULL; /* null signal; existence test */ else perm = signal_to_av(sig); - tsec = p->security; + tsec = p->cred->security; if (secid) rc = avc_has_perm(secid, tsec->sid, SECCLASS_PROCESS, perm, NULL); else @@ -3375,7 +3376,7 @@ static void selinux_task_reparent_to_init(struct task_struct *p) secondary_ops->task_reparent_to_init(p); - tsec = p->security; + tsec = p->cred->security; tsec->osid = tsec->sid; tsec->sid = SECINITSID_KERNEL; return; @@ -3384,7 +3385,7 @@ static void selinux_task_reparent_to_init(struct task_struct *p) static void selinux_task_to_inode(struct task_struct *p, struct inode *inode) { - struct task_security_struct *tsec = p->security; + struct task_security_struct *tsec = p->cred->security; struct inode_security_struct *isec = inode->i_security; isec->sid = tsec->sid; @@ -3632,7 +3633,7 @@ static int socket_has_perm(struct task_struct *task, struct socket *sock, struct avc_audit_data ad; int err = 0; - tsec = task->security; + tsec = task->cred->security; isec = SOCK_INODE(sock)->i_security; if (isec->sid == SECINITSID_KERNEL) @@ -3656,7 +3657,7 @@ static int selinux_socket_create(int family, int type, if (kern) goto out; - tsec = current->security; + tsec = current->cred->security; newsid = tsec->sockcreate_sid ? : tsec->sid; err = avc_has_perm(tsec->sid, newsid, socket_type_to_security_class(family, type, @@ -3677,7 +3678,7 @@ static int selinux_socket_post_create(struct socket *sock, int family, isec = SOCK_INODE(sock)->i_security; - tsec = current->security; + tsec = current->cred->security; newsid = tsec->sockcreate_sid ? : tsec->sid; isec->sclass = socket_type_to_security_class(family, type, protocol); isec->sid = kern ? SECINITSID_KERNEL : newsid; @@ -3723,7 +3724,7 @@ static int selinux_socket_bind(struct socket *sock, struct sockaddr *address, in struct sock *sk = sock->sk; u32 sid, node_perm; - tsec = current->security; + tsec = current->cred->security; isec = SOCK_INODE(sock)->i_security; if (family == PF_INET) { @@ -4764,7 +4765,7 @@ static int ipc_alloc_security(struct task_struct *task, struct kern_ipc_perm *perm, u16 sclass) { - struct task_security_struct *tsec = task->security; + struct task_security_struct *tsec = task->cred->security; struct ipc_security_struct *isec; isec = kzalloc(sizeof(struct ipc_security_struct), GFP_KERNEL); @@ -4814,7 +4815,7 @@ static int ipc_has_perm(struct kern_ipc_perm *ipc_perms, struct ipc_security_struct *isec; struct avc_audit_data ad; - tsec = current->security; + tsec = current->cred->security; isec = ipc_perms->security; AVC_AUDIT_DATA_INIT(&ad, IPC); @@ -4845,7 +4846,7 @@ static int selinux_msg_queue_alloc_security(struct msg_queue *msq) if (rc) return rc; - tsec = current->security; + tsec = current->cred->security; isec = msq->q_perm.security; AVC_AUDIT_DATA_INIT(&ad, IPC); @@ -4871,7 +4872,7 @@ static int selinux_msg_queue_associate(struct msg_queue *msq, int msqflg) struct ipc_security_struct *isec; struct avc_audit_data ad; - tsec = current->security; + tsec = current->cred->security; isec = msq->q_perm.security; AVC_AUDIT_DATA_INIT(&ad, IPC); @@ -4917,7 +4918,7 @@ static int selinux_msg_queue_msgsnd(struct msg_queue *msq, struct msg_msg *msg, struct avc_audit_data ad; int rc; - tsec = current->security; + tsec = current->cred->security; isec = msq->q_perm.security; msec = msg->security; @@ -4965,7 +4966,7 @@ static int selinux_msg_queue_msgrcv(struct msg_queue *msq, struct msg_msg *msg, struct avc_audit_data ad; int rc; - tsec = target->security; + tsec = target->cred->security; isec = msq->q_perm.security; msec = msg->security; @@ -4992,7 +4993,7 @@ static int selinux_shm_alloc_security(struct shmid_kernel *shp) if (rc) return rc; - tsec = current->security; + tsec = current->cred->security; isec = shp->shm_perm.security; AVC_AUDIT_DATA_INIT(&ad, IPC); @@ -5018,7 +5019,7 @@ static int selinux_shm_associate(struct shmid_kernel *shp, int shmflg) struct ipc_security_struct *isec; struct avc_audit_data ad; - tsec = current->security; + tsec = current->cred->security; isec = shp->shm_perm.security; AVC_AUDIT_DATA_INIT(&ad, IPC); @@ -5091,7 +5092,7 @@ static int selinux_sem_alloc_security(struct sem_array *sma) if (rc) return rc; - tsec = current->security; + tsec = current->cred->security; isec = sma->sem_perm.security; AVC_AUDIT_DATA_INIT(&ad, IPC); @@ -5117,7 +5118,7 @@ static int selinux_sem_associate(struct sem_array *sma, int semflg) struct ipc_security_struct *isec; struct avc_audit_data ad; - tsec = current->security; + tsec = current->cred->security; isec = sma->sem_perm.security; AVC_AUDIT_DATA_INIT(&ad, IPC); @@ -5224,7 +5225,7 @@ static int selinux_getprocattr(struct task_struct *p, return error; } - tsec = p->security; + tsec = p->cred->security; if (!strcmp(name, "current")) sid = tsec->sid; @@ -5308,7 +5309,7 @@ static int selinux_setprocattr(struct task_struct *p, operation. See selinux_bprm_set_security for the execve checks and may_create for the file creation checks. The operation will then fail if the context is not permitted. */ - tsec = p->security; + tsec = p->cred->security; if (!strcmp(name, "exec")) tsec->exec_sid = sid; else if (!strcmp(name, "fscreate")) @@ -5361,7 +5362,8 @@ boundary_ok: rcu_read_lock(); tracer = tracehook_tracer_task(p); if (tracer != NULL) { - struct task_security_struct *ptsec = tracer->security; + struct task_security_struct *ptsec = + tracer->cred->security; u32 ptsid = ptsec->sid; rcu_read_unlock(); error = avc_has_perm_noaudit(ptsid, sid, @@ -5405,7 +5407,7 @@ static void selinux_release_secctx(char *secdata, u32 seclen) static int selinux_key_alloc(struct key *k, struct task_struct *tsk, unsigned long flags) { - struct task_security_struct *tsec = tsk->security; + struct task_security_struct *tsec = tsk->cred->security; struct key_security_struct *ksec; ksec = kzalloc(sizeof(struct key_security_struct), GFP_KERNEL); @@ -5439,7 +5441,7 @@ static int selinux_key_permission(key_ref_t key_ref, key = key_ref_to_ptr(key_ref); - tsec = ctx->security; + tsec = ctx->cred->security; ksec = key->security; /* if no specific permissions are requested, we skip the @@ -5683,7 +5685,7 @@ static __init int selinux_init(void) /* Set the security state for the initial task. */ if (task_alloc_security(current)) panic("SELinux: Failed to initialize initial task.\n"); - tsec = current->security; + tsec = current->cred->security; tsec->osid = tsec->sid = SECINITSID_KERNEL; sel_inode_cache = kmem_cache_create("selinux_inode_security", diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c index 69c9dccc8cf..10715d1330b 100644 --- a/security/selinux/selinuxfs.c +++ b/security/selinux/selinuxfs.c @@ -97,7 +97,7 @@ static int task_has_security(struct task_struct *tsk, { struct task_security_struct *tsec; - tsec = tsk->security; + tsec = tsk->cred->security; if (!tsec) return -EACCES; diff --git a/security/selinux/xfrm.c b/security/selinux/xfrm.c index 8f17f542a11..d7db76617b0 100644 --- a/security/selinux/xfrm.c +++ b/security/selinux/xfrm.c @@ -197,7 +197,7 @@ static int selinux_xfrm_sec_ctx_alloc(struct xfrm_sec_ctx **ctxp, struct xfrm_user_sec_ctx *uctx, u32 sid) { int rc = 0; - struct task_security_struct *tsec = current->security; + struct task_security_struct *tsec = current->cred->security; struct xfrm_sec_ctx *ctx = NULL; char *ctx_str = NULL; u32 str_len; @@ -333,7 +333,7 @@ void selinux_xfrm_policy_free(struct xfrm_sec_ctx *ctx) */ int selinux_xfrm_policy_delete(struct xfrm_sec_ctx *ctx) { - struct task_security_struct *tsec = current->security; + struct task_security_struct *tsec = current->cred->security; int rc = 0; if (ctx) { @@ -378,7 +378,7 @@ void selinux_xfrm_state_free(struct xfrm_state *x) */ int selinux_xfrm_state_delete(struct xfrm_state *x) { - struct task_security_struct *tsec = current->security; + struct task_security_struct *tsec = current->cred->security; struct xfrm_sec_ctx *ctx = x->security; int rc = 0; diff --git a/security/smack/smack_access.c b/security/smack/smack_access.c index 79ff21ed4c3..b6dd4fc0fb0 100644 --- a/security/smack/smack_access.c +++ b/security/smack/smack_access.c @@ -164,7 +164,7 @@ int smk_curacc(char *obj_label, u32 mode) { int rc; - rc = smk_access(current->security, obj_label, mode); + rc = smk_access(current->cred->security, obj_label, mode); if (rc == 0) return 0; @@ -173,7 +173,7 @@ int smk_curacc(char *obj_label, u32 mode) * only one that gets privilege and current does not * have that label. */ - if (smack_onlycap != NULL && smack_onlycap != current->security) + if (smack_onlycap != NULL && smack_onlycap != current->cred->security) return rc; if (capable(CAP_MAC_OVERRIDE)) diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 6e2dc0bab70..791da238d04 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -102,7 +102,8 @@ static int smack_ptrace_may_access(struct task_struct *ctp, unsigned int mode) if (rc != 0) return rc; - rc = smk_access(current->security, ctp->security, MAY_READWRITE); + rc = smk_access(current->cred->security, ctp->cred->security, + MAY_READWRITE); if (rc != 0 && capable(CAP_MAC_OVERRIDE)) return 0; return rc; @@ -124,7 +125,8 @@ static int smack_ptrace_traceme(struct task_struct *ptp) if (rc != 0) return rc; - rc = smk_access(ptp->security, current->security, MAY_READWRITE); + rc = smk_access(ptp->cred->security, current->cred->security, + MAY_READWRITE); if (rc != 0 && has_capability(ptp, CAP_MAC_OVERRIDE)) return 0; return rc; @@ -141,7 +143,7 @@ static int smack_ptrace_traceme(struct task_struct *ptp) static int smack_syslog(int type) { int rc; - char *sp = current->security; + char *sp = current->cred->security; rc = cap_syslog(type); if (rc != 0) @@ -373,7 +375,7 @@ static int smack_sb_umount(struct vfsmount *mnt, int flags) */ static int smack_inode_alloc_security(struct inode *inode) { - inode->i_security = new_inode_smack(current->security); + inode->i_security = new_inode_smack(current->cred->security); if (inode->i_security == NULL) return -ENOMEM; return 0; @@ -818,7 +820,7 @@ static int smack_file_permission(struct file *file, int mask) */ static int smack_file_alloc_security(struct file *file) { - file->f_security = current->security; + file->f_security = current->cred->security; return 0; } @@ -916,7 +918,7 @@ static int smack_file_fcntl(struct file *file, unsigned int cmd, */ static int smack_file_set_fowner(struct file *file) { - file->f_security = current->security; + file->f_security = current->cred->security; return 0; } @@ -941,7 +943,7 @@ static int smack_file_send_sigiotask(struct task_struct *tsk, * struct fown_struct is never outside the context of a struct file */ file = container_of(fown, struct file, f_owner); - rc = smk_access(file->f_security, tsk->security, MAY_WRITE); + rc = smk_access(file->f_security, tsk->cred->security, MAY_WRITE); if (rc != 0 && has_capability(tsk, CAP_MAC_OVERRIDE)) return 0; return rc; @@ -984,7 +986,7 @@ static int smack_file_receive(struct file *file) */ static int smack_task_alloc_security(struct task_struct *tsk) { - tsk->security = current->security; + tsk->cred->security = current->cred->security; return 0; } @@ -999,7 +1001,7 @@ static int smack_task_alloc_security(struct task_struct *tsk) */ static void smack_task_free_security(struct task_struct *task) { - task->security = NULL; + task->cred->security = NULL; } /** @@ -1011,7 +1013,7 @@ static void smack_task_free_security(struct task_struct *task) */ static int smack_task_setpgid(struct task_struct *p, pid_t pgid) { - return smk_curacc(p->security, MAY_WRITE); + return smk_curacc(p->cred->security, MAY_WRITE); } /** @@ -1022,7 +1024,7 @@ static int smack_task_setpgid(struct task_struct *p, pid_t pgid) */ static int smack_task_getpgid(struct task_struct *p) { - return smk_curacc(p->security, MAY_READ); + return smk_curacc(p->cred->security, MAY_READ); } /** @@ -1033,7 +1035,7 @@ static int smack_task_getpgid(struct task_struct *p) */ static int smack_task_getsid(struct task_struct *p) { - return smk_curacc(p->security, MAY_READ); + return smk_curacc(p->cred->security, MAY_READ); } /** @@ -1045,7 +1047,7 @@ static int smack_task_getsid(struct task_struct *p) */ static void smack_task_getsecid(struct task_struct *p, u32 *secid) { - *secid = smack_to_secid(p->security); + *secid = smack_to_secid(p->cred->security); } /** @@ -1061,7 +1063,7 @@ static int smack_task_setnice(struct task_struct *p, int nice) rc = cap_task_setnice(p, nice); if (rc == 0) - rc = smk_curacc(p->security, MAY_WRITE); + rc = smk_curacc(p->cred->security, MAY_WRITE); return rc; } @@ -1078,7 +1080,7 @@ static int smack_task_setioprio(struct task_struct *p, int ioprio) rc = cap_task_setioprio(p, ioprio); if (rc == 0) - rc = smk_curacc(p->security, MAY_WRITE); + rc = smk_curacc(p->cred->security, MAY_WRITE); return rc; } @@ -1090,7 +1092,7 @@ static int smack_task_setioprio(struct task_struct *p, int ioprio) */ static int smack_task_getioprio(struct task_struct *p) { - return smk_curacc(p->security, MAY_READ); + return smk_curacc(p->cred->security, MAY_READ); } /** @@ -1108,7 +1110,7 @@ static int smack_task_setscheduler(struct task_struct *p, int policy, rc = cap_task_setscheduler(p, policy, lp); if (rc == 0) - rc = smk_curacc(p->security, MAY_WRITE); + rc = smk_curacc(p->cred->security, MAY_WRITE); return rc; } @@ -1120,7 +1122,7 @@ static int smack_task_setscheduler(struct task_struct *p, int policy, */ static int smack_task_getscheduler(struct task_struct *p) { - return smk_curacc(p->security, MAY_READ); + return smk_curacc(p->cred->security, MAY_READ); } /** @@ -1131,7 +1133,7 @@ static int smack_task_getscheduler(struct task_struct *p) */ static int smack_task_movememory(struct task_struct *p) { - return smk_curacc(p->security, MAY_WRITE); + return smk_curacc(p->cred->security, MAY_WRITE); } /** @@ -1154,13 +1156,13 @@ static int smack_task_kill(struct task_struct *p, struct siginfo *info, * can write the receiver. */ if (secid == 0) - return smk_curacc(p->security, MAY_WRITE); + return smk_curacc(p->cred->security, MAY_WRITE); /* * If the secid isn't 0 we're dealing with some USB IO * specific behavior. This is not clean. For one thing * we can't take privilege into account. */ - return smk_access(smack_from_secid(secid), p->security, MAY_WRITE); + return smk_access(smack_from_secid(secid), p->cred->security, MAY_WRITE); } /** @@ -1173,7 +1175,7 @@ static int smack_task_wait(struct task_struct *p) { int rc; - rc = smk_access(current->security, p->security, MAY_WRITE); + rc = smk_access(current->cred->security, p->cred->security, MAY_WRITE); if (rc == 0) return 0; @@ -1204,7 +1206,7 @@ static int smack_task_wait(struct task_struct *p) static void smack_task_to_inode(struct task_struct *p, struct inode *inode) { struct inode_smack *isp = inode->i_security; - isp->smk_inode = p->security; + isp->smk_inode = p->cred->security; } /* @@ -1223,7 +1225,7 @@ static void smack_task_to_inode(struct task_struct *p, struct inode *inode) */ static int smack_sk_alloc_security(struct sock *sk, int family, gfp_t gfp_flags) { - char *csp = current->security; + char *csp = current->cred->security; struct socket_smack *ssp; ssp = kzalloc(sizeof(struct socket_smack), gfp_flags); @@ -1448,7 +1450,7 @@ static int smack_flags_to_may(int flags) */ static int smack_msg_msg_alloc_security(struct msg_msg *msg) { - msg->security = current->security; + msg->security = current->cred->security; return 0; } @@ -1484,7 +1486,7 @@ static int smack_shm_alloc_security(struct shmid_kernel *shp) { struct kern_ipc_perm *isp = &shp->shm_perm; - isp->security = current->security; + isp->security = current->cred->security; return 0; } @@ -1593,7 +1595,7 @@ static int smack_sem_alloc_security(struct sem_array *sma) { struct kern_ipc_perm *isp = &sma->sem_perm; - isp->security = current->security; + isp->security = current->cred->security; return 0; } @@ -1697,7 +1699,7 @@ static int smack_msg_queue_alloc_security(struct msg_queue *msq) { struct kern_ipc_perm *kisp = &msq->q_perm; - kisp->security = current->security; + kisp->security = current->cred->security; return 0; } @@ -1852,7 +1854,7 @@ static void smack_d_instantiate(struct dentry *opt_dentry, struct inode *inode) struct super_block *sbp; struct superblock_smack *sbsp; struct inode_smack *isp; - char *csp = current->security; + char *csp = current->cred->security; char *fetched; char *final; struct dentry *dp; @@ -2009,7 +2011,7 @@ static int smack_getprocattr(struct task_struct *p, char *name, char **value) if (strcmp(name, "current") != 0) return -EINVAL; - cp = kstrdup(p->security, GFP_KERNEL); + cp = kstrdup(p->cred->security, GFP_KERNEL); if (cp == NULL) return -ENOMEM; @@ -2055,7 +2057,7 @@ static int smack_setprocattr(struct task_struct *p, char *name, if (newsmack == NULL) return -EINVAL; - p->security = newsmack; + p->cred->security = newsmack; return size; } @@ -2288,8 +2290,8 @@ static void smack_sock_graft(struct sock *sk, struct socket *parent) return; ssp = sk->sk_security; - ssp->smk_in = current->security; - ssp->smk_out = current->security; + ssp->smk_in = current->cred->security; + ssp->smk_out = current->cred->security; ssp->smk_packet[0] = '\0'; rc = smack_netlabel(sk); @@ -2362,7 +2364,7 @@ static int smack_inet_conn_request(struct sock *sk, struct sk_buff *skb, static int smack_key_alloc(struct key *key, struct task_struct *tsk, unsigned long flags) { - key->security = tsk->security; + key->security = tsk->cred->security; return 0; } @@ -2403,10 +2405,11 @@ static int smack_key_permission(key_ref_t key_ref, /* * This should not occur */ - if (context->security == NULL) + if (context->cred->security == NULL) return -EACCES; - return smk_access(context->security, keyp->security, MAY_READWRITE); + return smk_access(context->cred->security, keyp->security, + MAY_READWRITE); } #endif /* CONFIG_KEYS */ @@ -2726,7 +2729,7 @@ static __init int smack_init(void) /* * Set the security state for the initial task. */ - current->security = &smack_known_floor.smk_known; + current->cred->security = &smack_known_floor.smk_known; /* * Initialize locks diff --git a/security/smack/smackfs.c b/security/smack/smackfs.c index c21d8c8bf0c..c5ca279e050 100644 --- a/security/smack/smackfs.c +++ b/security/smack/smackfs.c @@ -336,7 +336,7 @@ static void smk_cipso_doi(void) audit_info.loginuid = audit_get_loginuid(current); audit_info.sessionid = audit_get_sessionid(current); - audit_info.secid = smack_to_secid(current->security); + audit_info.secid = smack_to_secid(current->cred->security); rc = netlbl_cfg_map_del(NULL, &audit_info); if (rc != 0) @@ -371,7 +371,7 @@ static void smk_unlbl_ambient(char *oldambient) audit_info.loginuid = audit_get_loginuid(current); audit_info.sessionid = audit_get_sessionid(current); - audit_info.secid = smack_to_secid(current->security); + audit_info.secid = smack_to_secid(current->cred->security); if (oldambient != NULL) { rc = netlbl_cfg_map_del(oldambient, &audit_info); @@ -843,7 +843,7 @@ static ssize_t smk_write_onlycap(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { char in[SMK_LABELLEN]; - char *sp = current->security; + char *sp = current->cred->security; if (!capable(CAP_MAC_ADMIN)) return -EPERM; -- cgit v1.2.3-70-g09d2 From f1752eec6145c97163dbce62d17cf5d928e28a27 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Nov 2008 10:39:17 +1100 Subject: CRED: Detach the credentials from task_struct Detach the credentials from task_struct, duplicating them in copy_process() and releasing them in __put_task_struct(). Signed-off-by: David Howells Acked-by: James Morris Acked-by: Serge Hallyn Signed-off-by: James Morris --- include/linux/cred.h | 29 ++++++++++++++ include/linux/init_task.h | 16 +------- include/linux/sched.h | 1 - include/linux/security.h | 26 ++++++------- kernel/Makefile | 2 +- kernel/cred.c | 96 ++++++++++++++++++++++++++++++++++++++++++++++ kernel/fork.c | 24 +++--------- security/capability.c | 8 ++-- security/security.c | 8 ++-- security/selinux/hooks.c | 32 ++++++++-------- security/smack/smack_lsm.c | 20 +++++----- 11 files changed, 179 insertions(+), 83 deletions(-) create mode 100644 kernel/cred.c (limited to 'kernel') diff --git a/include/linux/cred.h b/include/linux/cred.h index 3e65587a72e..a7a686074cb 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -158,4 +158,33 @@ do { \ *(_gid) = current->cred->fsgid; \ } while(0) +extern void __put_cred(struct cred *); +extern int copy_creds(struct task_struct *, unsigned long); + +/** + * get_cred - Get a reference on a set of credentials + * @cred: The credentials to reference + * + * Get a reference on the specified set of credentials. The caller must + * release the reference. + */ +static inline struct cred *get_cred(struct cred *cred) +{ + atomic_inc(&cred->usage); + return cred; +} + +/** + * put_cred - Release a reference to a set of credentials + * @cred: The credentials to release + * + * Release a reference to a set of credentials, deleting them when the last ref + * is released. + */ +static inline void put_cred(struct cred *cred) +{ + if (atomic_dec_and_test(&(cred)->usage)) + __put_cred(cred); +} + #endif /* _LINUX_CRED_H */ diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 9de41ccd67b..5e24c54b6df 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -115,19 +115,6 @@ extern struct group_info init_groups; extern struct cred init_cred; -#define INIT_CRED(p) \ -{ \ - .usage = ATOMIC_INIT(3), \ - .securebits = SECUREBITS_DEFAULT, \ - .cap_inheritable = CAP_INIT_INH_SET, \ - .cap_permitted = CAP_FULL_SET, \ - .cap_effective = CAP_INIT_EFF_SET, \ - .cap_bset = CAP_INIT_BSET, \ - .user = INIT_USER, \ - .group_info = &init_groups, \ - .lock = __SPIN_LOCK_UNLOCKED(p.lock), \ -} - /* * INIT_TASK is used to set up the first task table, touch at * your own risk!. Base=0, limit=0x1fffff (=2MB) @@ -162,8 +149,7 @@ extern struct cred init_cred; .children = LIST_HEAD_INIT(tsk.children), \ .sibling = LIST_HEAD_INIT(tsk.sibling), \ .group_leader = &tsk, \ - .__temp_cred = INIT_CRED(tsk.__temp_cred), \ - .cred = &tsk.__temp_cred, \ + .cred = &init_cred, \ .comm = "swapper", \ .thread = INIT_THREAD, \ .fs = &init_fs, \ diff --git a/include/linux/sched.h b/include/linux/sched.h index c8b92502354..740cf946c8c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1151,7 +1151,6 @@ struct task_struct { struct list_head cpu_timers[3]; /* process credentials */ - struct cred __temp_cred __deprecated; /* temporary credentials to be removed */ struct cred *cred; /* actual/objective task credentials */ char comm[TASK_COMM_LEN]; /* executable name excluding path diff --git a/include/linux/security.h b/include/linux/security.h index 9f305d4a31a..9239cc11eb9 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -593,15 +593,15 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * manual page for definitions of the @clone_flags. * @clone_flags contains the flags indicating what should be shared. * Return 0 if permission is granted. - * @task_alloc_security: - * @p contains the task_struct for child process. - * Allocate and attach a security structure to the p->security field. The - * security field is initialized to NULL when the task structure is + * @cred_alloc_security: + * @cred contains the cred struct for child process. + * Allocate and attach a security structure to the cred->security field. + * The security field is initialized to NULL when the task structure is * allocated. * Return 0 if operation was successful. - * @task_free_security: - * @p contains the task_struct for process. - * Deallocate and clear the p->security field. + * @cred_free: + * @cred points to the credentials. + * Deallocate and clear the cred->security field in a set of credentials. * @task_setuid: * Check permission before setting one or more of the user identity * attributes of the current process. The @flags parameter indicates @@ -1405,8 +1405,8 @@ struct security_operations { int (*dentry_open) (struct file *file); int (*task_create) (unsigned long clone_flags); - int (*task_alloc_security) (struct task_struct *p); - void (*task_free_security) (struct task_struct *p); + int (*cred_alloc_security) (struct cred *cred); + void (*cred_free) (struct cred *cred); int (*task_setuid) (uid_t id0, uid_t id1, uid_t id2, int flags); int (*task_post_setuid) (uid_t old_ruid /* or fsuid */ , uid_t old_euid, uid_t old_suid, int flags); @@ -1660,8 +1660,8 @@ int security_file_send_sigiotask(struct task_struct *tsk, int security_file_receive(struct file *file); int security_dentry_open(struct file *file); int security_task_create(unsigned long clone_flags); -int security_task_alloc(struct task_struct *p); -void security_task_free(struct task_struct *p); +int security_cred_alloc(struct cred *cred); +void security_cred_free(struct cred *cred); int security_task_setuid(uid_t id0, uid_t id1, uid_t id2, int flags); int security_task_post_setuid(uid_t old_ruid, uid_t old_euid, uid_t old_suid, int flags); @@ -2181,12 +2181,12 @@ static inline int security_task_create(unsigned long clone_flags) return 0; } -static inline int security_task_alloc(struct task_struct *p) +static inline int security_cred_alloc(struct cred *cred) { return 0; } -static inline void security_task_free(struct task_struct *p) +static inline void security_cred_free(struct cred *cred) { } static inline int security_task_setuid(uid_t id0, uid_t id1, uid_t id2, diff --git a/kernel/Makefile b/kernel/Makefile index 9a3ec66a9d8..5a6a612c302 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -9,7 +9,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \ rcupdate.o extable.o params.o posix-timers.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ - notifier.o ksysfs.o pm_qos_params.o sched_clock.o + notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o CFLAGS_REMOVE_sched.o = -mno-spe diff --git a/kernel/cred.c b/kernel/cred.c new file mode 100644 index 00000000000..833244a7cb0 --- /dev/null +++ b/kernel/cred.c @@ -0,0 +1,96 @@ +/* Task credentials management + * + * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ +#include +#include +#include +#include +#include +#include +#include + +/* + * The initial credentials for the initial task + */ +struct cred init_cred = { + .usage = ATOMIC_INIT(3), + .securebits = SECUREBITS_DEFAULT, + .cap_inheritable = CAP_INIT_INH_SET, + .cap_permitted = CAP_FULL_SET, + .cap_effective = CAP_INIT_EFF_SET, + .cap_bset = CAP_INIT_BSET, + .user = INIT_USER, + .group_info = &init_groups, +}; + +/* + * The RCU callback to actually dispose of a set of credentials + */ +static void put_cred_rcu(struct rcu_head *rcu) +{ + struct cred *cred = container_of(rcu, struct cred, rcu); + + BUG_ON(atomic_read(&cred->usage) != 0); + + key_put(cred->thread_keyring); + key_put(cred->request_key_auth); + put_group_info(cred->group_info); + free_uid(cred->user); + security_cred_free(cred); + kfree(cred); +} + +/** + * __put_cred - Destroy a set of credentials + * @sec: The record to release + * + * Destroy a set of credentials on which no references remain. + */ +void __put_cred(struct cred *cred) +{ + call_rcu(&cred->rcu, put_cred_rcu); +} +EXPORT_SYMBOL(__put_cred); + +/* + * Copy credentials for the new process created by fork() + */ +int copy_creds(struct task_struct *p, unsigned long clone_flags) +{ + struct cred *pcred; + int ret; + + pcred = kmemdup(p->cred, sizeof(*p->cred), GFP_KERNEL); + if (!pcred) + return -ENOMEM; + +#ifdef CONFIG_SECURITY + pcred->security = NULL; +#endif + + ret = security_cred_alloc(pcred); + if (ret < 0) { + kfree(pcred); + return ret; + } + + atomic_set(&pcred->usage, 1); + get_group_info(pcred->group_info); + get_uid(pcred->user); + key_get(pcred->thread_keyring); + key_get(pcred->request_key_auth); + + atomic_inc(&pcred->user->processes); + + /* RCU assignment is unneeded here as no-one can have accessed this + * pointer yet, barring us */ + p->cred = pcred; + return 0; +} diff --git a/kernel/fork.c b/kernel/fork.c index 81fdc773390..c932e283ddf 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -146,9 +146,7 @@ void __put_task_struct(struct task_struct *tsk) WARN_ON(atomic_read(&tsk->usage)); WARN_ON(tsk == current); - security_task_free(tsk); - free_uid(tsk->__temp_cred.user); - put_group_info(tsk->__temp_cred.group_info); + put_cred(tsk->cred); delayacct_tsk_free(tsk); if (!profile_handoff_task(tsk)) @@ -969,7 +967,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); #endif - p->cred = &p->__temp_cred; retval = -EAGAIN; if (atomic_read(&p->cred->user->processes) >= p->signal->rlim[RLIMIT_NPROC].rlim_cur) { @@ -978,9 +975,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, goto bad_fork_free; } - atomic_inc(&p->cred->user->__count); - atomic_inc(&p->cred->user->processes); - get_group_info(p->cred->group_info); + retval = copy_creds(p, clone_flags); + if (retval < 0) + goto bad_fork_free; /* * If multiple threads are within copy_process(), then this check @@ -1035,9 +1032,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, do_posix_clock_monotonic_gettime(&p->start_time); p->real_start_time = p->start_time; monotonic_to_bootbased(&p->real_start_time); -#ifdef CONFIG_SECURITY - p->cred->security = NULL; -#endif p->io_context = NULL; p->audit_context = NULL; cgroup_fork(p); @@ -1082,10 +1076,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, /* Perform scheduler related setup. Assign this task to a CPU. */ sched_fork(p, clone_flags); - if ((retval = security_task_alloc(p))) - goto bad_fork_cleanup_policy; if ((retval = audit_alloc(p))) - goto bad_fork_cleanup_security; + goto bad_fork_cleanup_policy; /* copy all the process information */ if ((retval = copy_semundo(clone_flags, p))) goto bad_fork_cleanup_audit; @@ -1284,8 +1276,6 @@ bad_fork_cleanup_semundo: exit_sem(p); bad_fork_cleanup_audit: audit_free(p); -bad_fork_cleanup_security: - security_task_free(p); bad_fork_cleanup_policy: #ifdef CONFIG_NUMA mpol_put(p->mempolicy); @@ -1298,9 +1288,7 @@ bad_fork_cleanup_cgroup: bad_fork_cleanup_put_domain: module_put(task_thread_info(p)->exec_domain->module); bad_fork_cleanup_count: - put_group_info(p->cred->group_info); - atomic_dec(&p->cred->user->processes); - free_uid(p->cred->user); + put_cred(p->cred); bad_fork_free: free_task(p); fork_out: diff --git a/security/capability.c b/security/capability.c index 24587481903..6c4b5137ca7 100644 --- a/security/capability.c +++ b/security/capability.c @@ -340,12 +340,12 @@ static int cap_task_create(unsigned long clone_flags) return 0; } -static int cap_task_alloc_security(struct task_struct *p) +static int cap_cred_alloc_security(struct cred *cred) { return 0; } -static void cap_task_free_security(struct task_struct *p) +static void cap_cred_free(struct cred *cred) { } @@ -890,8 +890,8 @@ void security_fixup_ops(struct security_operations *ops) set_to_cap_if_null(ops, file_receive); set_to_cap_if_null(ops, dentry_open); set_to_cap_if_null(ops, task_create); - set_to_cap_if_null(ops, task_alloc_security); - set_to_cap_if_null(ops, task_free_security); + set_to_cap_if_null(ops, cred_alloc_security); + set_to_cap_if_null(ops, cred_free); set_to_cap_if_null(ops, task_setuid); set_to_cap_if_null(ops, task_post_setuid); set_to_cap_if_null(ops, task_setgid); diff --git a/security/security.c b/security/security.c index 81c956a1230..d058f7d5b10 100644 --- a/security/security.c +++ b/security/security.c @@ -616,14 +616,14 @@ int security_task_create(unsigned long clone_flags) return security_ops->task_create(clone_flags); } -int security_task_alloc(struct task_struct *p) +int security_cred_alloc(struct cred *cred) { - return security_ops->task_alloc_security(p); + return security_ops->cred_alloc_security(cred); } -void security_task_free(struct task_struct *p) +void security_cred_free(struct cred *cred) { - security_ops->task_free_security(p); + security_ops->cred_free(cred); } int security_task_setuid(uid_t id0, uid_t id1, uid_t id2, int flags) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 328308f2882..658435dce37 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -158,7 +158,7 @@ static int selinux_secmark_enabled(void) /* Allocate and free functions for each kind of security blob. */ -static int task_alloc_security(struct task_struct *task) +static int cred_alloc_security(struct cred *cred) { struct task_security_struct *tsec; @@ -167,18 +167,11 @@ static int task_alloc_security(struct task_struct *task) return -ENOMEM; tsec->osid = tsec->sid = SECINITSID_UNLABELED; - task->cred->security = tsec; + cred->security = tsec; return 0; } -static void task_free_security(struct task_struct *task) -{ - struct task_security_struct *tsec = task->cred->security; - task->cred->security = NULL; - kfree(tsec); -} - static int inode_alloc_security(struct inode *inode) { struct task_security_struct *tsec = current->cred->security; @@ -3184,17 +3177,17 @@ static int selinux_task_create(unsigned long clone_flags) return task_has_perm(current, current, PROCESS__FORK); } -static int selinux_task_alloc_security(struct task_struct *tsk) +static int selinux_cred_alloc_security(struct cred *cred) { struct task_security_struct *tsec1, *tsec2; int rc; tsec1 = current->cred->security; - rc = task_alloc_security(tsk); + rc = cred_alloc_security(cred); if (rc) return rc; - tsec2 = tsk->cred->security; + tsec2 = cred->security; tsec2->osid = tsec1->osid; tsec2->sid = tsec1->sid; @@ -3208,9 +3201,14 @@ static int selinux_task_alloc_security(struct task_struct *tsk) return 0; } -static void selinux_task_free_security(struct task_struct *tsk) +/* + * detach and free the LSM part of a set of credentials + */ +static void selinux_cred_free(struct cred *cred) { - task_free_security(tsk); + struct task_security_struct *tsec = cred->security; + cred->security = NULL; + kfree(tsec); } static int selinux_task_setuid(uid_t id0, uid_t id1, uid_t id2, int flags) @@ -5552,8 +5550,8 @@ static struct security_operations selinux_ops = { .dentry_open = selinux_dentry_open, .task_create = selinux_task_create, - .task_alloc_security = selinux_task_alloc_security, - .task_free_security = selinux_task_free_security, + .cred_alloc_security = selinux_cred_alloc_security, + .cred_free = selinux_cred_free, .task_setuid = selinux_task_setuid, .task_post_setuid = selinux_task_post_setuid, .task_setgid = selinux_task_setgid, @@ -5683,7 +5681,7 @@ static __init int selinux_init(void) printk(KERN_INFO "SELinux: Initializing.\n"); /* Set the security state for the initial task. */ - if (task_alloc_security(current)) + if (cred_alloc_security(current->cred)) panic("SELinux: Failed to initialize initial task.\n"); tsec = current->cred->security; tsec->osid = tsec->sid = SECINITSID_KERNEL; diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 791da238d04..cc837314fb0 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -975,8 +975,8 @@ static int smack_file_receive(struct file *file) */ /** - * smack_task_alloc_security - "allocate" a task blob - * @tsk: the task in need of a blob + * smack_cred_alloc_security - "allocate" a task cred blob + * @cred: the task creds in need of a blob * * Smack isn't using copies of blobs. Everyone * points to an immutable list. No alloc required. @@ -984,24 +984,24 @@ static int smack_file_receive(struct file *file) * * Always returns 0 */ -static int smack_task_alloc_security(struct task_struct *tsk) +static int smack_cred_alloc_security(struct cred *cred) { - tsk->cred->security = current->cred->security; + cred->security = current->cred->security; return 0; } /** - * smack_task_free_security - "free" a task blob - * @task: the task with the blob + * smack_cred_free - "free" task-level security credentials + * @cred: the credentials in question * * Smack isn't using copies of blobs. Everyone * points to an immutable list. The blobs never go away. * There is no leak here. */ -static void smack_task_free_security(struct task_struct *task) +static void smack_cred_free(struct cred *cred) { - task->cred->security = NULL; + cred->security = NULL; } /** @@ -2630,8 +2630,8 @@ struct security_operations smack_ops = { .file_send_sigiotask = smack_file_send_sigiotask, .file_receive = smack_file_receive, - .task_alloc_security = smack_task_alloc_security, - .task_free_security = smack_task_free_security, + .cred_alloc_security = smack_cred_alloc_security, + .cred_free = smack_cred_free, .task_post_setuid = cap_task_post_setuid, .task_setpgid = smack_task_setpgid, .task_getpgid = smack_task_getpgid, -- cgit v1.2.3-70-g09d2 From 86a264abe542cfececb4df129bc45a0338d8cdb9 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Nov 2008 10:39:18 +1100 Subject: CRED: Wrap current->cred and a few other accessors Wrap current->cred and a few other accessors to hide their actual implementation. Signed-off-by: David Howells Acked-by: James Morris Acked-by: Serge Hallyn Signed-off-by: James Morris --- arch/ia64/ia32/sys_ia32.c | 7 +- drivers/net/tun.c | 8 +- drivers/usb/core/devio.c | 10 ++- fs/binfmt_elf.c | 10 +-- fs/binfmt_elf_fdpic.c | 9 +- fs/exec.c | 5 +- fs/fcntl.c | 3 +- fs/file_table.c | 7 +- fs/hugetlbfs/inode.c | 5 +- fs/ioprio.c | 4 +- fs/smbfs/dir.c | 3 +- include/linux/cred.h | 187 ++++++++++++++++++++++++++++++++---------- include/linux/securebits.h | 2 +- ipc/mqueue.c | 2 +- ipc/shm.c | 4 +- kernel/sys.c | 59 +++++++------ kernel/uid16.c | 31 +++---- net/core/scm.c | 2 +- net/sunrpc/auth.c | 14 ++-- security/commoncap.c | 2 +- security/keys/process_keys.c | 2 +- security/keys/request_key.c | 11 +-- security/selinux/exports.c | 8 +- security/selinux/xfrm.c | 6 +- security/smack/smack_access.c | 2 +- security/smack/smack_lsm.c | 26 +++--- security/smack/smackfs.c | 4 +- 27 files changed, 271 insertions(+), 162 deletions(-) (limited to 'kernel') diff --git a/arch/ia64/ia32/sys_ia32.c b/arch/ia64/ia32/sys_ia32.c index 2445a9d3488..16ef61a91d9 100644 --- a/arch/ia64/ia32/sys_ia32.c +++ b/arch/ia64/ia32/sys_ia32.c @@ -1767,25 +1767,24 @@ groups16_from_user(struct group_info *group_info, short __user *grouplist) asmlinkage long sys32_getgroups16 (int gidsetsize, short __user *grouplist) { + const struct cred *cred = current_cred(); int i; if (gidsetsize < 0) return -EINVAL; - get_group_info(current->cred->group_info); - i = current->cred->group_info->ngroups; + i = cred->group_info->ngroups; if (gidsetsize) { if (i > gidsetsize) { i = -EINVAL; goto out; } - if (groups16_to_user(grouplist, current->cred->group_info)) { + if (groups16_to_user(grouplist, cred->group_info)) { i = -EFAULT; goto out; } } out: - put_group_info(current->cred->group_info); return i; } diff --git a/drivers/net/tun.c b/drivers/net/tun.c index b14e2025e22..55dc70c6b4d 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -702,6 +702,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) struct tun_net *tn; struct tun_struct *tun; struct net_device *dev; + const struct cred *cred = current_cred(); int err; tn = net_generic(net, tun_net_id); @@ -712,11 +713,12 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) /* Check permissions */ if (((tun->owner != -1 && - current_euid() != tun->owner) || + cred->euid != tun->owner) || (tun->group != -1 && - current_egid() != tun->group)) && - !capable(CAP_NET_ADMIN)) + cred->egid != tun->group)) && + !capable(CAP_NET_ADMIN)) { return -EPERM; + } } else if (__dev_get_by_name(net, ifr->ifr_name)) return -EINVAL; diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c index 1aadb938702..aa79280df15 100644 --- a/drivers/usb/core/devio.c +++ b/drivers/usb/core/devio.c @@ -574,6 +574,7 @@ static int usbdev_open(struct inode *inode, struct file *file) { struct usb_device *dev = NULL; struct dev_state *ps; + const struct cred *cred = current_cred(); int ret; lock_kernel(); @@ -617,8 +618,8 @@ static int usbdev_open(struct inode *inode, struct file *file) init_waitqueue_head(&ps->wait); ps->discsignr = 0; ps->disc_pid = get_pid(task_pid(current)); - ps->disc_uid = current_uid(); - ps->disc_euid = current_euid(); + ps->disc_uid = cred->uid; + ps->disc_euid = cred->euid; ps->disccontext = NULL; ps->ifclaimed = 0; security_task_getsecid(current, &ps->secid); @@ -967,6 +968,7 @@ static int proc_do_submiturb(struct dev_state *ps, struct usbdevfs_urb *uurb, struct usb_host_endpoint *ep; struct async *as; struct usb_ctrlrequest *dr = NULL; + const struct cred *cred = current_cred(); unsigned int u, totlen, isofrmlen; int ret, ifnum = -1; int is_in; @@ -1174,8 +1176,8 @@ static int proc_do_submiturb(struct dev_state *ps, struct usbdevfs_urb *uurb, as->signr = uurb->signr; as->ifnum = ifnum; as->pid = get_pid(task_pid(current)); - as->uid = current_uid(); - as->euid = current_euid(); + as->uid = cred->uid; + as->euid = cred->euid; security_task_getsecid(current, &as->secid); if (!is_in) { if (copy_from_user(as->urb->transfer_buffer, uurb->buffer, diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 7a52477ce49..0e665561316 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -157,7 +157,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, int items; elf_addr_t *elf_info; int ei_index = 0; - struct task_struct *tsk = current; + const struct cred *cred = current_cred(); struct vm_area_struct *vma; /* @@ -223,10 +223,10 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, NEW_AUX_ENT(AT_BASE, interp_load_addr); NEW_AUX_ENT(AT_FLAGS, 0); NEW_AUX_ENT(AT_ENTRY, exec->e_entry); - NEW_AUX_ENT(AT_UID, tsk->cred->uid); - NEW_AUX_ENT(AT_EUID, tsk->cred->euid); - NEW_AUX_ENT(AT_GID, tsk->cred->gid); - NEW_AUX_ENT(AT_EGID, tsk->cred->egid); + NEW_AUX_ENT(AT_UID, cred->uid); + NEW_AUX_ENT(AT_EUID, cred->euid); + NEW_AUX_ENT(AT_GID, cred->gid); + NEW_AUX_ENT(AT_EGID, cred->egid); NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm)); NEW_AUX_ENT(AT_EXECFN, bprm->exec); if (k_platform) { diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 9f67054c2c4..1f6e8c023b4 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -475,6 +475,7 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, struct elf_fdpic_params *exec_params, struct elf_fdpic_params *interp_params) { + const struct cred *cred = current_cred(); unsigned long sp, csp, nitems; elf_caddr_t __user *argv, *envp; size_t platform_len = 0, len; @@ -623,10 +624,10 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, NEW_AUX_ENT(AT_BASE, interp_params->elfhdr_addr); NEW_AUX_ENT(AT_FLAGS, 0); NEW_AUX_ENT(AT_ENTRY, exec_params->entry_addr); - NEW_AUX_ENT(AT_UID, (elf_addr_t) current->cred->uid); - NEW_AUX_ENT(AT_EUID, (elf_addr_t) current->cred->euid); - NEW_AUX_ENT(AT_GID, (elf_addr_t) current->cred->gid); - NEW_AUX_ENT(AT_EGID, (elf_addr_t) current->cred->egid); + NEW_AUX_ENT(AT_UID, (elf_addr_t) cred->uid); + NEW_AUX_ENT(AT_EUID, (elf_addr_t) cred->euid); + NEW_AUX_ENT(AT_GID, (elf_addr_t) cred->gid); + NEW_AUX_ENT(AT_EGID, (elf_addr_t) cred->egid); NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm)); NEW_AUX_ENT(AT_EXECFN, bprm->exec); diff --git a/fs/exec.c b/fs/exec.c index 31149e430a8..a5330e1a221 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1388,6 +1388,7 @@ EXPORT_SYMBOL(set_binfmt); */ static int format_corename(char *corename, long signr) { + const struct cred *cred = current_cred(); const char *pat_ptr = core_pattern; int ispipe = (*pat_ptr == '|'); char *out_ptr = corename; @@ -1424,7 +1425,7 @@ static int format_corename(char *corename, long signr) /* uid */ case 'u': rc = snprintf(out_ptr, out_end - out_ptr, - "%d", current_uid()); + "%d", cred->uid); if (rc > out_end - out_ptr) goto out; out_ptr += rc; @@ -1432,7 +1433,7 @@ static int format_corename(char *corename, long signr) /* gid */ case 'g': rc = snprintf(out_ptr, out_end - out_ptr, - "%d", current_gid()); + "%d", cred->gid); if (rc > out_end - out_ptr) goto out; out_ptr += rc; diff --git a/fs/fcntl.c b/fs/fcntl.c index 63964d863ad..c594cc0e40f 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -205,13 +205,14 @@ static void f_modown(struct file *filp, struct pid *pid, enum pid_type type, int __f_setown(struct file *filp, struct pid *pid, enum pid_type type, int force) { + const struct cred *cred = current_cred(); int err; err = security_file_set_fowner(filp); if (err) return err; - f_modown(filp, pid, type, current_uid(), current_euid(), force); + f_modown(filp, pid, type, cred->uid, cred->euid, force); return 0; } EXPORT_SYMBOL(__f_setown); diff --git a/fs/file_table.c b/fs/file_table.c index 3152b53cfab..bc4563fe791 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -94,7 +94,7 @@ int proc_nr_files(ctl_table *table, int write, struct file *filp, */ struct file *get_empty_filp(void) { - struct task_struct *tsk; + const struct cred *cred = current_cred(); static int old_max; struct file * f; @@ -118,12 +118,11 @@ struct file *get_empty_filp(void) if (security_file_alloc(f)) goto fail_sec; - tsk = current; INIT_LIST_HEAD(&f->f_u.fu_list); atomic_long_set(&f->f_count, 1); rwlock_init(&f->f_owner.lock); - f->f_uid = tsk->cred->fsuid; - f->f_gid = tsk->cred->fsgid; + f->f_uid = cred->fsuid; + f->f_gid = cred->fsgid; eventpoll_init_file(f); /* f->f_version: 0 */ return f; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 870a721b8bd..7d479ce3ace 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -951,6 +951,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size) struct inode *inode; struct dentry *dentry, *root; struct qstr quick_string; + struct user_struct *user = current_user(); if (!hugetlbfs_vfsmount) return ERR_PTR(-ENOENT); @@ -958,7 +959,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size) if (!can_do_hugetlb_shm()) return ERR_PTR(-EPERM); - if (!user_shm_lock(size, current->cred->user)) + if (!user_shm_lock(size, user)) return ERR_PTR(-ENOMEM); root = hugetlbfs_vfsmount->mnt_root; @@ -998,7 +999,7 @@ out_inode: out_dentry: dput(dentry); out_shm_unlock: - user_shm_unlock(size, current->cred->user); + user_shm_unlock(size, user); return ERR_PTR(error); } diff --git a/fs/ioprio.c b/fs/ioprio.c index bb5210af77c..5112554fd21 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c @@ -123,7 +123,7 @@ asmlinkage long sys_ioprio_set(int which, int who, int ioprio) break; case IOPRIO_WHO_USER: if (!who) - user = current->cred->user; + user = current_user(); else user = find_user(who); @@ -216,7 +216,7 @@ asmlinkage long sys_ioprio_get(int which, int who) break; case IOPRIO_WHO_USER: if (!who) - user = current->cred->user; + user = current_user(); else user = find_user(who); diff --git a/fs/smbfs/dir.c b/fs/smbfs/dir.c index 9e9bb0db4f6..e7ddd0328dd 100644 --- a/fs/smbfs/dir.c +++ b/fs/smbfs/dir.c @@ -667,8 +667,7 @@ smb_make_node(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) attr.ia_valid = ATTR_MODE | ATTR_UID | ATTR_GID; attr.ia_mode = mode; - attr.ia_uid = current_euid(); - attr.ia_gid = current_egid(); + current_euid_egid(&attr.ia_uid, &attr.ia_gid); if (!new_valid_dev(dev)) return -EINVAL; diff --git a/include/linux/cred.h b/include/linux/cred.h index a7a686074cb..4221ec6000c 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -37,15 +37,16 @@ struct group_info { * get_group_info - Get a reference to a group info structure * @group_info: The group info to reference * - * This must be called with the owning task locked (via task_lock()) when task - * != current. The reason being that the vast majority of callers are looking - * at current->group_info, which can not be changed except by the current task. - * Changing current->group_info requires the task lock, too. + * This gets a reference to a set of supplementary groups. + * + * If the caller is accessing a task's credentials, they must hold the RCU read + * lock when reading. */ -#define get_group_info(group_info) \ -do { \ - atomic_inc(&(group_info)->usage); \ -} while (0) +static inline struct group_info *get_group_info(struct group_info *gi) +{ + atomic_inc(&gi->usage); + return gi; +} /** * put_group_info - Release a reference to a group info structure @@ -61,7 +62,7 @@ extern struct group_info *groups_alloc(int); extern void groups_free(struct group_info *); extern int set_current_groups(struct group_info *); extern int set_groups(struct cred *, struct group_info *); -extern int groups_search(struct group_info *, gid_t); +extern int groups_search(const struct group_info *, gid_t); /* access the groups "array" with this macro */ #define GROUP_AT(gi, i) \ @@ -123,41 +124,6 @@ struct cred { spinlock_t lock; /* lock for pointer changes */ }; -#define get_current_user() (get_uid(current->cred->user)) - -#define task_uid(task) ((task)->cred->uid) -#define task_gid(task) ((task)->cred->gid) -#define task_euid(task) ((task)->cred->euid) -#define task_egid(task) ((task)->cred->egid) - -#define current_uid() (current->cred->uid) -#define current_gid() (current->cred->gid) -#define current_euid() (current->cred->euid) -#define current_egid() (current->cred->egid) -#define current_suid() (current->cred->suid) -#define current_sgid() (current->cred->sgid) -#define current_fsuid() (current->cred->fsuid) -#define current_fsgid() (current->cred->fsgid) -#define current_cap() (current->cred->cap_effective) - -#define current_uid_gid(_uid, _gid) \ -do { \ - *(_uid) = current->cred->uid; \ - *(_gid) = current->cred->gid; \ -} while(0) - -#define current_euid_egid(_uid, _gid) \ -do { \ - *(_uid) = current->cred->euid; \ - *(_gid) = current->cred->egid; \ -} while(0) - -#define current_fsuid_fsgid(_uid, _gid) \ -do { \ - *(_uid) = current->cred->fsuid; \ - *(_gid) = current->cred->fsgid; \ -} while(0) - extern void __put_cred(struct cred *); extern int copy_creds(struct task_struct *, unsigned long); @@ -187,4 +153,137 @@ static inline void put_cred(struct cred *cred) __put_cred(cred); } +/** + * current_cred - Access the current task's credentials + * + * Access the credentials of the current task. + */ +#define current_cred() \ + (current->cred) + +/** + * __task_cred - Access another task's credentials + * @task: The task to query + * + * Access the credentials of another task. The caller must hold the + * RCU readlock. + * + * The caller must make sure task doesn't go away, either by holding a ref on + * task or by holding tasklist_lock to prevent it from being unlinked. + */ +#define __task_cred(task) \ + ((const struct cred *)(rcu_dereference((task)->cred))) + +/** + * get_task_cred - Get another task's credentials + * @task: The task to query + * + * Get the credentials of a task, pinning them so that they can't go away. + * Accessing a task's credentials directly is not permitted. + * + * The caller must make sure task doesn't go away, either by holding a ref on + * task or by holding tasklist_lock to prevent it from being unlinked. + */ +#define get_task_cred(task) \ +({ \ + struct cred *__cred; \ + rcu_read_lock(); \ + __cred = (struct cred *) __task_cred((task)); \ + get_cred(__cred); \ + rcu_read_unlock(); \ + __cred; \ +}) + +/** + * get_current_cred - Get the current task's credentials + * + * Get the credentials of the current task, pinning them so that they can't go + * away. Accessing the current task's credentials directly is not permitted. + */ +#define get_current_cred() \ + (get_cred(current_cred())) + +/** + * get_current_user - Get the current task's user_struct + * + * Get the user record of the current task, pinning it so that it can't go + * away. + */ +#define get_current_user() \ +({ \ + struct user_struct *__u; \ + struct cred *__cred; \ + __cred = (struct cred *) current_cred(); \ + __u = get_uid(__cred->user); \ + __u; \ +}) + +/** + * get_current_groups - Get the current task's supplementary group list + * + * Get the supplementary group list of the current task, pinning it so that it + * can't go away. + */ +#define get_current_groups() \ +({ \ + struct group_info *__groups; \ + struct cred *__cred; \ + __cred = (struct cred *) current_cred(); \ + __groups = get_group_info(__cred->group_info); \ + __groups; \ +}) + +#define task_cred_xxx(task, xxx) \ +({ \ + __typeof__(task->cred->xxx) ___val; \ + rcu_read_lock(); \ + ___val = __task_cred((task))->xxx; \ + rcu_read_unlock(); \ + ___val; \ +}) + +#define task_uid(task) (task_cred_xxx((task), uid)) +#define task_euid(task) (task_cred_xxx((task), euid)) + +#define current_cred_xxx(xxx) \ +({ \ + current->cred->xxx; \ +}) + +#define current_uid() (current_cred_xxx(uid)) +#define current_gid() (current_cred_xxx(gid)) +#define current_euid() (current_cred_xxx(euid)) +#define current_egid() (current_cred_xxx(egid)) +#define current_suid() (current_cred_xxx(suid)) +#define current_sgid() (current_cred_xxx(sgid)) +#define current_fsuid() (current_cred_xxx(fsuid)) +#define current_fsgid() (current_cred_xxx(fsgid)) +#define current_cap() (current_cred_xxx(cap_effective)) +#define current_user() (current_cred_xxx(user)) +#define current_security() (current_cred_xxx(security)) + +#define current_uid_gid(_uid, _gid) \ +do { \ + const struct cred *__cred; \ + __cred = current_cred(); \ + *(_uid) = __cred->uid; \ + *(_gid) = __cred->gid; \ +} while(0) + +#define current_euid_egid(_euid, _egid) \ +do { \ + const struct cred *__cred; \ + __cred = current_cred(); \ + *(_euid) = __cred->euid; \ + *(_egid) = __cred->egid; \ +} while(0) + +#define current_fsuid_fsgid(_fsuid, _fsgid) \ +do { \ + const struct cred *__cred; \ + __cred = current_cred(); \ + *(_fsuid) = __cred->fsuid; \ + *(_fsgid) = __cred->fsgid; \ +} while(0) + #endif /* _LINUX_CRED_H */ diff --git a/include/linux/securebits.h b/include/linux/securebits.h index 6d389491bfa..d2c5ed845bc 100644 --- a/include/linux/securebits.h +++ b/include/linux/securebits.h @@ -32,7 +32,7 @@ setting is locked or not. A setting which is locked cannot be changed from user-level. */ #define issecure_mask(X) (1 << (X)) -#define issecure(X) (issecure_mask(X) & current->cred->securebits) +#define issecure(X) (issecure_mask(X) & current_cred_xxx(securebits)) #define SECURE_ALL_BITS (issecure_mask(SECURE_NOROOT) | \ issecure_mask(SECURE_NO_SETUID_FIXUP) | \ diff --git a/ipc/mqueue.c b/ipc/mqueue.c index e1885b494ba..1151881ccb9 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -112,6 +112,7 @@ static inline struct mqueue_inode_info *MQUEUE_I(struct inode *inode) static struct inode *mqueue_get_inode(struct super_block *sb, int mode, struct mq_attr *attr) { + struct user_struct *u = current_user(); struct inode *inode; inode = new_inode(sb); @@ -126,7 +127,6 @@ static struct inode *mqueue_get_inode(struct super_block *sb, int mode, if (S_ISREG(mode)) { struct mqueue_inode_info *info; struct task_struct *p = current; - struct user_struct *u = p->cred->user; unsigned long mq_bytes, mq_msg_tblsz; inode->i_fop = &mqueue_file_operations; diff --git a/ipc/shm.c b/ipc/shm.c index 264a9d33c5d..38a055758a9 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -366,7 +366,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) if (shmflg & SHM_HUGETLB) { /* hugetlb_file_setup takes care of mlock user accounting */ file = hugetlb_file_setup(name, size); - shp->mlock_user = current->cred->user; + shp->mlock_user = current_user(); } else { int acctflag = VM_ACCOUNT; /* @@ -767,7 +767,7 @@ asmlinkage long sys_shmctl(int shmid, int cmd, struct shmid_ds __user *buf) goto out_unlock; if(cmd==SHM_LOCK) { - struct user_struct *user = current->cred->user; + struct user_struct *user = current_user(); if (!is_file_hugepages(shp->shm_file)) { err = shmem_lock(shp->shm_file, 1, user); if (!err && !(shp->shm_perm.mode & SHM_LOCKED)){ diff --git a/kernel/sys.c b/kernel/sys.c index 5d81f07c015..c4d6b59553e 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -143,6 +143,7 @@ asmlinkage long sys_setpriority(int which, int who, int niceval) { struct task_struct *g, *p; struct user_struct *user; + const struct cred *cred = current_cred(); int error = -EINVAL; struct pid *pgrp; @@ -176,18 +177,18 @@ asmlinkage long sys_setpriority(int which, int who, int niceval) } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); break; case PRIO_USER: - user = current->cred->user; + user = cred->user; if (!who) - who = current_uid(); - else - if (who != current_uid() && !(user = find_user(who))) - goto out_unlock; /* No processes for this user */ + who = cred->uid; + else if ((who != cred->uid) && + !(user = find_user(who))) + goto out_unlock; /* No processes for this user */ do_each_thread(g, p) - if (p->cred->uid == who) + if (__task_cred(p)->uid == who) error = set_one_prio(p, niceval, error); while_each_thread(g, p); - if (who != current_uid()) + if (who != cred->uid) free_uid(user); /* For find_user() */ break; } @@ -207,6 +208,7 @@ asmlinkage long sys_getpriority(int which, int who) { struct task_struct *g, *p; struct user_struct *user; + const struct cred *cred = current_cred(); long niceval, retval = -ESRCH; struct pid *pgrp; @@ -238,21 +240,21 @@ asmlinkage long sys_getpriority(int which, int who) } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); break; case PRIO_USER: - user = current->cred->user; + user = (struct user_struct *) cred->user; if (!who) - who = current_uid(); - else - if (who != current_uid() && !(user = find_user(who))) - goto out_unlock; /* No processes for this user */ + who = cred->uid; + else if ((who != cred->uid) && + !(user = find_user(who))) + goto out_unlock; /* No processes for this user */ do_each_thread(g, p) - if (p->cred->uid == who) { + if (__task_cred(p)->uid == who) { niceval = 20 - task_nice(p); if (niceval > retval) retval = niceval; } while_each_thread(g, p); - if (who != current_uid()) + if (who != cred->uid) free_uid(user); /* for find_user() */ break; } @@ -743,11 +745,11 @@ asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) asmlinkage long sys_getresuid(uid_t __user *ruid, uid_t __user *euid, uid_t __user *suid) { - struct cred *cred = current->cred; + const struct cred *cred = current_cred(); int retval; - if (!(retval = put_user(cred->uid, ruid)) && - !(retval = put_user(cred->euid, euid))) + if (!(retval = put_user(cred->uid, ruid)) && + !(retval = put_user(cred->euid, euid))) retval = put_user(cred->suid, suid); return retval; @@ -796,11 +798,11 @@ asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid) asmlinkage long sys_getresgid(gid_t __user *rgid, gid_t __user *egid, gid_t __user *sgid) { - struct cred *cred = current->cred; + const struct cred *cred = current_cred(); int retval; - if (!(retval = put_user(cred->gid, rgid)) && - !(retval = put_user(cred->egid, egid))) + if (!(retval = put_user(cred->gid, rgid)) && + !(retval = put_user(cred->egid, egid))) retval = put_user(cred->sgid, sgid); return retval; @@ -1199,7 +1201,7 @@ static void groups_sort(struct group_info *group_info) } /* a simple bsearch */ -int groups_search(struct group_info *group_info, gid_t grp) +int groups_search(const struct group_info *group_info, gid_t grp) { unsigned int left, right; @@ -1268,13 +1270,8 @@ EXPORT_SYMBOL(set_current_groups); asmlinkage long sys_getgroups(int gidsetsize, gid_t __user *grouplist) { - struct cred *cred = current->cred; - int i = 0; - - /* - * SMP: Nobody else can change our grouplist. Thus we are - * safe. - */ + const struct cred *cred = current_cred(); + int i; if (gidsetsize < 0) return -EINVAL; @@ -1330,8 +1327,9 @@ asmlinkage long sys_setgroups(int gidsetsize, gid_t __user *grouplist) */ int in_group_p(gid_t grp) { - struct cred *cred = current->cred; + const struct cred *cred = current_cred(); int retval = 1; + if (grp != cred->fsgid) retval = groups_search(cred->group_info, grp); return retval; @@ -1341,8 +1339,9 @@ EXPORT_SYMBOL(in_group_p); int in_egroup_p(gid_t grp) { - struct cred *cred = current->cred; + const struct cred *cred = current_cred(); int retval = 1; + if (grp != cred->egid) retval = groups_search(cred->group_info, grp); return retval; diff --git a/kernel/uid16.c b/kernel/uid16.c index 71f07fc39fe..2460c3199b5 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c @@ -84,11 +84,12 @@ asmlinkage long sys_setresuid16(old_uid_t ruid, old_uid_t euid, old_uid_t suid) asmlinkage long sys_getresuid16(old_uid_t __user *ruid, old_uid_t __user *euid, old_uid_t __user *suid) { + const struct cred *cred = current_cred(); int retval; - if (!(retval = put_user(high2lowuid(current->cred->uid), ruid)) && - !(retval = put_user(high2lowuid(current->cred->euid), euid))) - retval = put_user(high2lowuid(current->cred->suid), suid); + if (!(retval = put_user(high2lowuid(cred->uid), ruid)) && + !(retval = put_user(high2lowuid(cred->euid), euid))) + retval = put_user(high2lowuid(cred->suid), suid); return retval; } @@ -104,11 +105,12 @@ asmlinkage long sys_setresgid16(old_gid_t rgid, old_gid_t egid, old_gid_t sgid) asmlinkage long sys_getresgid16(old_gid_t __user *rgid, old_gid_t __user *egid, old_gid_t __user *sgid) { + const struct cred *cred = current_cred(); int retval; - if (!(retval = put_user(high2lowgid(current->cred->gid), rgid)) && - !(retval = put_user(high2lowgid(current->cred->egid), egid))) - retval = put_user(high2lowgid(current->cred->sgid), sgid); + if (!(retval = put_user(high2lowgid(cred->gid), rgid)) && + !(retval = put_user(high2lowgid(cred->egid), egid))) + retval = put_user(high2lowgid(cred->sgid), sgid); return retval; } @@ -161,25 +163,24 @@ static int groups16_from_user(struct group_info *group_info, asmlinkage long sys_getgroups16(int gidsetsize, old_gid_t __user *grouplist) { - int i = 0; + const struct cred *cred = current_cred(); + int i; if (gidsetsize < 0) return -EINVAL; - get_group_info(current->cred->group_info); - i = current->cred->group_info->ngroups; + i = cred->group_info->ngroups; if (gidsetsize) { if (i > gidsetsize) { i = -EINVAL; goto out; } - if (groups16_to_user(grouplist, current->cred->group_info)) { + if (groups16_to_user(grouplist, cred->group_info)) { i = -EFAULT; goto out; } } out: - put_group_info(current->cred->group_info); return i; } @@ -210,20 +211,20 @@ asmlinkage long sys_setgroups16(int gidsetsize, old_gid_t __user *grouplist) asmlinkage long sys_getuid16(void) { - return high2lowuid(current->cred->uid); + return high2lowuid(current_uid()); } asmlinkage long sys_geteuid16(void) { - return high2lowuid(current->cred->euid); + return high2lowuid(current_euid()); } asmlinkage long sys_getgid16(void) { - return high2lowgid(current->cred->gid); + return high2lowgid(current_gid()); } asmlinkage long sys_getegid16(void) { - return high2lowgid(current->cred->egid); + return high2lowgid(current_egid()); } diff --git a/net/core/scm.c b/net/core/scm.c index c28ca32a7d9..f73c44b17dd 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -44,7 +44,7 @@ static __inline__ int scm_check_creds(struct ucred *creds) { - struct cred *cred = current->cred; + const struct cred *cred = current_cred(); if ((creds->pid == task_tgid_vnr(current) || capable(CAP_SYS_ADMIN)) && ((creds->uid == cred->uid || creds->uid == cred->euid || diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index c7954321260..0443f834945 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -350,16 +350,18 @@ EXPORT_SYMBOL_GPL(rpcauth_lookup_credcache); struct rpc_cred * rpcauth_lookupcred(struct rpc_auth *auth, int flags) { - struct auth_cred acred = { - .uid = current_fsuid(), - .gid = current_fsgid(), - .group_info = current->cred->group_info, - }; + struct auth_cred acred; struct rpc_cred *ret; + const struct cred *cred = current_cred(); dprintk("RPC: looking up %s cred\n", auth->au_ops->au_name); - get_group_info(acred.group_info); + + memset(&acred, 0, sizeof(acred)); + acred.uid = cred->fsuid; + acred.gid = cred->fsgid; + acred.group_info = get_group_info(((struct cred *)cred)->group_info); + ret = auth->au_ops->lookup_cred(auth, &acred, flags); put_group_info(acred.group_info); return ret; diff --git a/security/commoncap.c b/security/commoncap.c index fa61679f8c7..61307f59000 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -641,7 +641,7 @@ int cap_task_setnice (struct task_struct *p, int nice) int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5, long *rc_p) { - struct cred *cred = current->cred; + struct cred *cred = current_cred(); long error = 0; switch (option) { diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c index b0904cdda2e..ce8ac6073d5 100644 --- a/security/keys/process_keys.c +++ b/security/keys/process_keys.c @@ -582,7 +582,7 @@ key_ref_t lookup_user_key(key_serial_t id, int create, int partial, { struct request_key_auth *rka; struct task_struct *t = current; - struct cred *cred = t->cred; + struct cred *cred = current_cred(); struct key *key; key_ref_t key_ref, skey_ref; int ret; diff --git a/security/keys/request_key.c b/security/keys/request_key.c index 3e9b9eb1dd2..0488b0af5bd 100644 --- a/security/keys/request_key.c +++ b/security/keys/request_key.c @@ -67,6 +67,7 @@ static int call_sbin_request_key(struct key_construction *cons, void *aux) { struct task_struct *tsk = current; + const struct cred *cred = current_cred(); key_serial_t prkey, sskey; struct key *key = cons->key, *authkey = cons->authkey, *keyring; char *argv[9], *envp[3], uid_str[12], gid_str[12]; @@ -96,16 +97,16 @@ static int call_sbin_request_key(struct key_construction *cons, goto error_link; /* record the UID and GID */ - sprintf(uid_str, "%d", current_fsuid()); - sprintf(gid_str, "%d", current_fsgid()); + sprintf(uid_str, "%d", cred->fsuid); + sprintf(gid_str, "%d", cred->fsgid); /* we say which key is under construction */ sprintf(key_str, "%d", key->serial); /* we specify the process's default keyrings */ sprintf(keyring_str[0], "%d", - tsk->cred->thread_keyring ? - tsk->cred->thread_keyring->serial : 0); + cred->thread_keyring ? + cred->thread_keyring->serial : 0); prkey = 0; if (tsk->signal->process_keyring) @@ -118,7 +119,7 @@ static int call_sbin_request_key(struct key_construction *cons, sskey = rcu_dereference(tsk->signal->session_keyring)->serial; rcu_read_unlock(); } else { - sskey = tsk->cred->user->session_keyring->serial; + sskey = cred->user->session_keyring->serial; } sprintf(keyring_str[2], "%d", sskey); diff --git a/security/selinux/exports.c b/security/selinux/exports.c index cf02490cd1e..c73aeaa008e 100644 --- a/security/selinux/exports.c +++ b/security/selinux/exports.c @@ -39,9 +39,13 @@ EXPORT_SYMBOL_GPL(selinux_string_to_sid); int selinux_secmark_relabel_packet_permission(u32 sid) { if (selinux_enabled) { - struct task_security_struct *tsec = current->cred->security; + const struct task_security_struct *__tsec; + u32 tsid; - return avc_has_perm(tsec->sid, sid, SECCLASS_PACKET, + __tsec = current_security(); + tsid = __tsec->sid; + + return avc_has_perm(tsid, sid, SECCLASS_PACKET, PACKET__RELABELTO, NULL); } return 0; diff --git a/security/selinux/xfrm.c b/security/selinux/xfrm.c index d7db76617b0..c0eb72013d6 100644 --- a/security/selinux/xfrm.c +++ b/security/selinux/xfrm.c @@ -197,7 +197,7 @@ static int selinux_xfrm_sec_ctx_alloc(struct xfrm_sec_ctx **ctxp, struct xfrm_user_sec_ctx *uctx, u32 sid) { int rc = 0; - struct task_security_struct *tsec = current->cred->security; + const struct task_security_struct *tsec = current_security(); struct xfrm_sec_ctx *ctx = NULL; char *ctx_str = NULL; u32 str_len; @@ -333,7 +333,7 @@ void selinux_xfrm_policy_free(struct xfrm_sec_ctx *ctx) */ int selinux_xfrm_policy_delete(struct xfrm_sec_ctx *ctx) { - struct task_security_struct *tsec = current->cred->security; + const struct task_security_struct *tsec = current_security(); int rc = 0; if (ctx) { @@ -378,7 +378,7 @@ void selinux_xfrm_state_free(struct xfrm_state *x) */ int selinux_xfrm_state_delete(struct xfrm_state *x) { - struct task_security_struct *tsec = current->cred->security; + const struct task_security_struct *tsec = current_security(); struct xfrm_sec_ctx *ctx = x->security; int rc = 0; diff --git a/security/smack/smack_access.c b/security/smack/smack_access.c index b6dd4fc0fb0..247cec3b5a4 100644 --- a/security/smack/smack_access.c +++ b/security/smack/smack_access.c @@ -164,7 +164,7 @@ int smk_curacc(char *obj_label, u32 mode) { int rc; - rc = smk_access(current->cred->security, obj_label, mode); + rc = smk_access(current_security(), obj_label, mode); if (rc == 0) return 0; diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index cc837314fb0..e8a4fcb1ad0 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -143,7 +143,7 @@ static int smack_ptrace_traceme(struct task_struct *ptp) static int smack_syslog(int type) { int rc; - char *sp = current->cred->security; + char *sp = current_security(); rc = cap_syslog(type); if (rc != 0) @@ -375,7 +375,7 @@ static int smack_sb_umount(struct vfsmount *mnt, int flags) */ static int smack_inode_alloc_security(struct inode *inode) { - inode->i_security = new_inode_smack(current->cred->security); + inode->i_security = new_inode_smack(current_security()); if (inode->i_security == NULL) return -ENOMEM; return 0; @@ -820,7 +820,7 @@ static int smack_file_permission(struct file *file, int mask) */ static int smack_file_alloc_security(struct file *file) { - file->f_security = current->cred->security; + file->f_security = current_security(); return 0; } @@ -918,7 +918,7 @@ static int smack_file_fcntl(struct file *file, unsigned int cmd, */ static int smack_file_set_fowner(struct file *file) { - file->f_security = current->cred->security; + file->f_security = current_security(); return 0; } @@ -986,8 +986,7 @@ static int smack_file_receive(struct file *file) */ static int smack_cred_alloc_security(struct cred *cred) { - cred->security = current->cred->security; - + cred->security = current_security(); return 0; } @@ -1225,7 +1224,7 @@ static void smack_task_to_inode(struct task_struct *p, struct inode *inode) */ static int smack_sk_alloc_security(struct sock *sk, int family, gfp_t gfp_flags) { - char *csp = current->cred->security; + char *csp = current_security(); struct socket_smack *ssp; ssp = kzalloc(sizeof(struct socket_smack), gfp_flags); @@ -1450,7 +1449,7 @@ static int smack_flags_to_may(int flags) */ static int smack_msg_msg_alloc_security(struct msg_msg *msg) { - msg->security = current->cred->security; + msg->security = current_security(); return 0; } @@ -1486,7 +1485,7 @@ static int smack_shm_alloc_security(struct shmid_kernel *shp) { struct kern_ipc_perm *isp = &shp->shm_perm; - isp->security = current->cred->security; + isp->security = current_security(); return 0; } @@ -1595,7 +1594,7 @@ static int smack_sem_alloc_security(struct sem_array *sma) { struct kern_ipc_perm *isp = &sma->sem_perm; - isp->security = current->cred->security; + isp->security = current_security(); return 0; } @@ -1699,7 +1698,7 @@ static int smack_msg_queue_alloc_security(struct msg_queue *msq) { struct kern_ipc_perm *kisp = &msq->q_perm; - kisp->security = current->cred->security; + kisp->security = current_security(); return 0; } @@ -1854,7 +1853,7 @@ static void smack_d_instantiate(struct dentry *opt_dentry, struct inode *inode) struct super_block *sbp; struct superblock_smack *sbsp; struct inode_smack *isp; - char *csp = current->cred->security; + char *csp = current_security(); char *fetched; char *final; struct dentry *dp; @@ -2290,8 +2289,7 @@ static void smack_sock_graft(struct sock *sk, struct socket *parent) return; ssp = sk->sk_security; - ssp->smk_in = current->cred->security; - ssp->smk_out = current->cred->security; + ssp->smk_in = ssp->smk_out = current_security(); ssp->smk_packet[0] = '\0'; rc = smack_netlabel(sk); diff --git a/security/smack/smackfs.c b/security/smack/smackfs.c index c5ca279e050..ca257dfdc75 100644 --- a/security/smack/smackfs.c +++ b/security/smack/smackfs.c @@ -336,7 +336,7 @@ static void smk_cipso_doi(void) audit_info.loginuid = audit_get_loginuid(current); audit_info.sessionid = audit_get_sessionid(current); - audit_info.secid = smack_to_secid(current->cred->security); + audit_info.secid = smack_to_secid(current_security()); rc = netlbl_cfg_map_del(NULL, &audit_info); if (rc != 0) @@ -371,7 +371,7 @@ static void smk_unlbl_ambient(char *oldambient) audit_info.loginuid = audit_get_loginuid(current); audit_info.sessionid = audit_get_sessionid(current); - audit_info.secid = smack_to_secid(current->cred->security); + audit_info.secid = smack_to_secid(current_security()); if (oldambient != NULL) { rc = netlbl_cfg_map_del(oldambient, &audit_info); -- cgit v1.2.3-70-g09d2 From c69e8d9c01db2adc503464993c358901c9af9de4 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Nov 2008 10:39:19 +1100 Subject: CRED: Use RCU to access another task's creds and to release a task's own creds Use RCU to access another task's creds and to release a task's own creds. This means that it will be possible for the credentials of a task to be replaced without another task (a) requiring a full lock to read them, and (b) seeing deallocated memory. Signed-off-by: David Howells Acked-by: James Morris Acked-by: Serge Hallyn Signed-off-by: James Morris --- arch/ia64/kernel/perfmon.c | 32 +++++++++++++--------- drivers/connector/cn_proc.c | 16 +++++++---- fs/binfmt_elf.c | 8 ++++-- fs/binfmt_elf_fdpic.c | 8 ++++-- fs/fcntl.c | 15 ++++++++--- fs/fuse/dir.c | 23 ++++++++++------ fs/ioprio.c | 14 +++++++--- fs/proc/array.c | 32 ++++++++++++++-------- fs/proc/base.c | 32 ++++++++++++++++------ include/linux/cred.h | 3 ++- kernel/auditsc.c | 33 +++++++++++++---------- kernel/cgroup.c | 16 +++++------ kernel/exit.c | 14 ++++++---- kernel/futex.c | 22 +++++++++------ kernel/futex_compat.c | 7 ++--- kernel/ptrace.c | 22 ++++++++------- kernel/sched.c | 31 ++++++++++++++------- kernel/signal.c | 49 ++++++++++++++++++++------------- kernel/sys.c | 11 +++++--- kernel/tsacct.c | 6 +++-- mm/mempolicy.c | 8 +++--- mm/migrate.c | 8 +++--- mm/oom_kill.c | 6 ++--- security/commoncap.c | 64 +++++++++++++++++++++++++++----------------- security/keys/permission.c | 10 ++++--- security/keys/process_keys.c | 24 ++++++++++------- security/selinux/selinuxfs.c | 13 ++++++--- security/smack/smack_lsm.c | 32 +++++++++++----------- 28 files changed, 355 insertions(+), 204 deletions(-) (limited to 'kernel') diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index dd38db46a77..0e499757309 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -2399,25 +2399,33 @@ error_kmem: static int pfm_bad_permissions(struct task_struct *task) { + const struct cred *tcred; uid_t uid = current_uid(); gid_t gid = current_gid(); + int ret; + + rcu_read_lock(); + tcred = __task_cred(task); /* inspired by ptrace_attach() */ DPRINT(("cur: uid=%d gid=%d task: euid=%d suid=%d uid=%d egid=%d sgid=%d\n", uid, gid, - task->euid, - task->suid, - task->uid, - task->egid, - task->sgid)); - - return (uid != task->euid) - || (uid != task->suid) - || (uid != task->uid) - || (gid != task->egid) - || (gid != task->sgid) - || (gid != task->gid)) && !capable(CAP_SYS_PTRACE); + tcred->euid, + tcred->suid, + tcred->uid, + tcred->egid, + tcred->sgid)); + + ret = ((uid != tcred->euid) + || (uid != tcred->suid) + || (uid != tcred->uid) + || (gid != tcred->egid) + || (gid != tcred->sgid) + || (gid != tcred->gid)) && !capable(CAP_SYS_PTRACE); + + rcu_read_unlock(); + return ret; } static int diff --git a/drivers/connector/cn_proc.c b/drivers/connector/cn_proc.c index 354c1ff1715..c5afc98e267 100644 --- a/drivers/connector/cn_proc.c +++ b/drivers/connector/cn_proc.c @@ -106,6 +106,7 @@ void proc_id_connector(struct task_struct *task, int which_id) struct proc_event *ev; __u8 buffer[CN_PROC_MSG_SIZE]; struct timespec ts; + const struct cred *cred; if (atomic_read(&proc_event_num_listeners) < 1) return; @@ -115,14 +116,19 @@ void proc_id_connector(struct task_struct *task, int which_id) ev->what = which_id; ev->event_data.id.process_pid = task->pid; ev->event_data.id.process_tgid = task->tgid; + rcu_read_lock(); + cred = __task_cred(task); if (which_id == PROC_EVENT_UID) { - ev->event_data.id.r.ruid = task->cred->uid; - ev->event_data.id.e.euid = task->cred->euid; + ev->event_data.id.r.ruid = cred->uid; + ev->event_data.id.e.euid = cred->euid; } else if (which_id == PROC_EVENT_GID) { - ev->event_data.id.r.rgid = task->cred->gid; - ev->event_data.id.e.egid = task->cred->egid; - } else + ev->event_data.id.r.rgid = cred->gid; + ev->event_data.id.e.egid = cred->egid; + } else { + rcu_read_unlock(); return; + } + rcu_read_unlock(); get_seq(&msg->seq, &ev->cpu); ktime_get_ts(&ts); /* get high res monotonic timestamp */ put_unaligned(timespec_to_ns(&ts), (__u64 *)&ev->timestamp_ns); diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 0e665561316..9142ff5dc8e 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1361,6 +1361,7 @@ static void fill_prstatus(struct elf_prstatus *prstatus, static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, struct mm_struct *mm) { + const struct cred *cred; unsigned int i, len; /* first copy the parameters from user space */ @@ -1388,8 +1389,11 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, psinfo->pr_zomb = psinfo->pr_sname == 'Z'; psinfo->pr_nice = task_nice(p); psinfo->pr_flag = p->flags; - SET_UID(psinfo->pr_uid, p->cred->uid); - SET_GID(psinfo->pr_gid, p->cred->gid); + rcu_read_lock(); + cred = __task_cred(p); + SET_UID(psinfo->pr_uid, cred->uid); + SET_GID(psinfo->pr_gid, cred->gid); + rcu_read_unlock(); strncpy(psinfo->pr_fname, p->comm, sizeof(psinfo->pr_fname)); return 0; diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 1f6e8c023b4..45dabd59936 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1414,6 +1414,7 @@ static void fill_prstatus(struct elf_prstatus *prstatus, static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, struct mm_struct *mm) { + const struct cred *cred; unsigned int i, len; /* first copy the parameters from user space */ @@ -1441,8 +1442,11 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, psinfo->pr_zomb = psinfo->pr_sname == 'Z'; psinfo->pr_nice = task_nice(p); psinfo->pr_flag = p->flags; - SET_UID(psinfo->pr_uid, p->cred->uid); - SET_GID(psinfo->pr_gid, p->cred->gid); + rcu_read_lock(); + cred = __task_cred(p); + SET_UID(psinfo->pr_uid, cred->uid); + SET_GID(psinfo->pr_gid, cred->gid); + rcu_read_unlock(); strncpy(psinfo->pr_fname, p->comm, sizeof(psinfo->pr_fname)); return 0; diff --git a/fs/fcntl.c b/fs/fcntl.c index c594cc0e40f..87c39f1f081 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -401,10 +401,17 @@ static const long band_table[NSIGPOLL] = { static inline int sigio_perm(struct task_struct *p, struct fown_struct *fown, int sig) { - return (((fown->euid == 0) || - (fown->euid == p->cred->suid) || (fown->euid == p->cred->uid) || - (fown->uid == p->cred->suid) || (fown->uid == p->cred->uid)) && - !security_file_send_sigiotask(p, fown, sig)); + const struct cred *cred; + int ret; + + rcu_read_lock(); + cred = __task_cred(p); + ret = ((fown->euid == 0 || + fown->euid == cred->suid || fown->euid == cred->uid || + fown->uid == cred->suid || fown->uid == cred->uid) && + !security_file_send_sigiotask(p, fown, sig)); + rcu_read_unlock(); + return ret; } static void send_sigio_to_task(struct task_struct *p, diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index e97a9898186..95bc22bdd06 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -869,18 +869,25 @@ int fuse_update_attributes(struct inode *inode, struct kstat *stat, */ int fuse_allow_task(struct fuse_conn *fc, struct task_struct *task) { + const struct cred *cred; + int ret; + if (fc->flags & FUSE_ALLOW_OTHER) return 1; - if (task->cred->euid == fc->user_id && - task->cred->suid == fc->user_id && - task->cred->uid == fc->user_id && - task->cred->egid == fc->group_id && - task->cred->sgid == fc->group_id && - task->cred->gid == fc->group_id) - return 1; + rcu_read_lock(); + ret = 0; + cred = __task_cred(task); + if (cred->euid == fc->user_id && + cred->suid == fc->user_id && + cred->uid == fc->user_id && + cred->egid == fc->group_id && + cred->sgid == fc->group_id && + cred->gid == fc->group_id) + ret = 1; + rcu_read_unlock(); - return 0; + return ret; } static int fuse_access(struct inode *inode, int mask) diff --git a/fs/ioprio.c b/fs/ioprio.c index 5112554fd21..3569e0ad86a 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c @@ -31,10 +31,16 @@ static int set_task_ioprio(struct task_struct *task, int ioprio) { int err; struct io_context *ioc; + const struct cred *cred = current_cred(), *tcred; - if (task->cred->uid != current_euid() && - task->cred->uid != current_uid() && !capable(CAP_SYS_NICE)) + rcu_read_lock(); + tcred = __task_cred(task); + if (tcred->uid != cred->euid && + tcred->uid != cred->uid && !capable(CAP_SYS_NICE)) { + rcu_read_unlock(); return -EPERM; + } + rcu_read_unlock(); err = security_task_setioprio(task, ioprio); if (err) @@ -131,7 +137,7 @@ asmlinkage long sys_ioprio_set(int which, int who, int ioprio) break; do_each_thread(g, p) { - if (p->cred->uid != who) + if (__task_cred(p)->uid != who) continue; ret = set_task_ioprio(p, ioprio); if (ret) @@ -224,7 +230,7 @@ asmlinkage long sys_ioprio_get(int which, int who) break; do_each_thread(g, p) { - if (p->cred->uid != user->uid) + if (__task_cred(p)->uid != user->uid) continue; tmpio = get_task_ioprio(p); if (tmpio < 0) diff --git a/fs/proc/array.c b/fs/proc/array.c index 62fe9b2009b..7e4877d9dcb 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -159,6 +159,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, struct group_info *group_info; int g; struct fdtable *fdt = NULL; + const struct cred *cred; pid_t ppid, tpid; rcu_read_lock(); @@ -170,6 +171,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, if (tracer) tpid = task_pid_nr_ns(tracer, ns); } + cred = get_cred((struct cred *) __task_cred(p)); seq_printf(m, "State:\t%s\n" "Tgid:\t%d\n" @@ -182,8 +184,8 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, task_tgid_nr_ns(p, ns), pid_nr_ns(pid, ns), ppid, tpid, - p->cred->uid, p->cred->euid, p->cred->suid, p->cred->fsuid, - p->cred->gid, p->cred->egid, p->cred->sgid, p->cred->fsgid); + cred->uid, cred->euid, cred->suid, cred->fsuid, + cred->gid, cred->egid, cred->sgid, cred->fsgid); task_lock(p); if (p->files) @@ -194,13 +196,12 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, fdt ? fdt->max_fds : 0); rcu_read_unlock(); - group_info = p->cred->group_info; - get_group_info(group_info); + group_info = cred->group_info; task_unlock(p); for (g = 0; g < min(group_info->ngroups, NGROUPS_SMALL); g++) seq_printf(m, "%d ", GROUP_AT(group_info, g)); - put_group_info(group_info); + put_cred(cred); seq_printf(m, "\n"); } @@ -262,7 +263,7 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p) blocked = p->blocked; collect_sigign_sigcatch(p, &ignored, &caught); num_threads = atomic_read(&p->signal->count); - qsize = atomic_read(&p->cred->user->sigpending); + qsize = atomic_read(&__task_cred(p)->user->sigpending); qlim = p->signal->rlim[RLIMIT_SIGPENDING].rlim_cur; unlock_task_sighand(p, &flags); } @@ -293,12 +294,21 @@ static void render_cap_t(struct seq_file *m, const char *header, static inline void task_cap(struct seq_file *m, struct task_struct *p) { - struct cred *cred = p->cred; + const struct cred *cred; + kernel_cap_t cap_inheritable, cap_permitted, cap_effective, cap_bset; - render_cap_t(m, "CapInh:\t", &cred->cap_inheritable); - render_cap_t(m, "CapPrm:\t", &cred->cap_permitted); - render_cap_t(m, "CapEff:\t", &cred->cap_effective); - render_cap_t(m, "CapBnd:\t", &cred->cap_bset); + rcu_read_lock(); + cred = __task_cred(p); + cap_inheritable = cred->cap_inheritable; + cap_permitted = cred->cap_permitted; + cap_effective = cred->cap_effective; + cap_bset = cred->cap_bset; + rcu_read_unlock(); + + render_cap_t(m, "CapInh:\t", &cap_inheritable); + render_cap_t(m, "CapPrm:\t", &cap_permitted); + render_cap_t(m, "CapEff:\t", &cap_effective); + render_cap_t(m, "CapBnd:\t", &cap_bset); } static inline void task_context_switch_counts(struct seq_file *m, diff --git a/fs/proc/base.c b/fs/proc/base.c index 6862b360c36..cf42c42cbfb 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1406,6 +1406,7 @@ static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_st { struct inode * inode; struct proc_inode *ei; + const struct cred *cred; /* We need a new inode */ @@ -1428,8 +1429,11 @@ static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_st inode->i_uid = 0; inode->i_gid = 0; if (task_dumpable(task)) { - inode->i_uid = task->cred->euid; - inode->i_gid = task->cred->egid; + rcu_read_lock(); + cred = __task_cred(task); + inode->i_uid = cred->euid; + inode->i_gid = cred->egid; + rcu_read_unlock(); } security_task_to_inode(task, inode); @@ -1445,6 +1449,8 @@ static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat { struct inode *inode = dentry->d_inode; struct task_struct *task; + const struct cred *cred; + generic_fillattr(inode, stat); rcu_read_lock(); @@ -1454,8 +1460,9 @@ static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat if (task) { if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) || task_dumpable(task)) { - stat->uid = task->cred->euid; - stat->gid = task->cred->egid; + cred = __task_cred(task); + stat->uid = cred->euid; + stat->gid = cred->egid; } } rcu_read_unlock(); @@ -1483,11 +1490,16 @@ static int pid_revalidate(struct dentry *dentry, struct nameidata *nd) { struct inode *inode = dentry->d_inode; struct task_struct *task = get_proc_task(inode); + const struct cred *cred; + if (task) { if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) || task_dumpable(task)) { - inode->i_uid = task->cred->euid; - inode->i_gid = task->cred->egid; + rcu_read_lock(); + cred = __task_cred(task); + inode->i_uid = cred->euid; + inode->i_gid = cred->egid; + rcu_read_unlock(); } else { inode->i_uid = 0; inode->i_gid = 0; @@ -1649,6 +1661,7 @@ static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd) struct task_struct *task = get_proc_task(inode); int fd = proc_fd(inode); struct files_struct *files; + const struct cred *cred; if (task) { files = get_files_struct(task); @@ -1658,8 +1671,11 @@ static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd) rcu_read_unlock(); put_files_struct(files); if (task_dumpable(task)) { - inode->i_uid = task->cred->euid; - inode->i_gid = task->cred->egid; + rcu_read_lock(); + cred = __task_cred(task); + inode->i_uid = cred->euid; + inode->i_gid = cred->egid; + rcu_read_unlock(); } else { inode->i_uid = 0; inode->i_gid = 0; diff --git a/include/linux/cred.h b/include/linux/cred.h index 4221ec6000c..166ce4ddba6 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -147,8 +147,9 @@ static inline struct cred *get_cred(struct cred *cred) * Release a reference to a set of credentials, deleting them when the last ref * is released. */ -static inline void put_cred(struct cred *cred) +static inline void put_cred(const struct cred *_cred) { + struct cred *cred = (struct cred *) _cred; if (atomic_dec_and_test(&(cred)->usage)) __put_cred(cred); } diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 2febf5165fa..ae8ef88ade3 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -447,7 +447,7 @@ static int audit_filter_rules(struct task_struct *tsk, struct audit_names *name, enum audit_state *state) { - struct cred *cred = tsk->cred; + const struct cred *cred = get_task_cred(tsk); int i, j, need_sid = 1; u32 sid; @@ -642,8 +642,10 @@ static int audit_filter_rules(struct task_struct *tsk, break; } - if (!result) + if (!result) { + put_cred(cred); return 0; + } } if (rule->filterkey && ctx) ctx->filterkey = kstrdup(rule->filterkey, GFP_ATOMIC); @@ -651,6 +653,7 @@ static int audit_filter_rules(struct task_struct *tsk, case AUDIT_NEVER: *state = AUDIT_DISABLED; break; case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; } + put_cred(cred); return 1; } @@ -1229,7 +1232,7 @@ static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name) static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) { - struct cred *cred = tsk->cred; + const struct cred *cred; int i, call_panic = 0; struct audit_buffer *ab; struct audit_aux_data *aux; @@ -1239,13 +1242,14 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts context->pid = tsk->pid; if (!context->ppid) context->ppid = sys_getppid(); - context->uid = cred->uid; - context->gid = cred->gid; - context->euid = cred->euid; - context->suid = cred->suid; + cred = current_cred(); + context->uid = cred->uid; + context->gid = cred->gid; + context->euid = cred->euid; + context->suid = cred->suid; context->fsuid = cred->fsuid; - context->egid = cred->egid; - context->sgid = cred->sgid; + context->egid = cred->egid; + context->sgid = cred->sgid; context->fsgid = cred->fsgid; context->personality = tsk->personality; @@ -2088,7 +2092,7 @@ int audit_set_loginuid(struct task_struct *task, uid_t loginuid) audit_log_format(ab, "login pid=%d uid=%u " "old auid=%u new auid=%u" " old ses=%u new ses=%u", - task->pid, task->cred->uid, + task->pid, task_uid(task), task->loginuid, loginuid, task->sessionid, sessionid); audit_log_end(ab); @@ -2471,7 +2475,7 @@ void __audit_ptrace(struct task_struct *t) context->target_pid = t->pid; context->target_auid = audit_get_loginuid(t); - context->target_uid = t->cred->uid; + context->target_uid = task_uid(t); context->target_sessionid = audit_get_sessionid(t); security_task_getsecid(t, &context->target_sid); memcpy(context->target_comm, t->comm, TASK_COMM_LEN); @@ -2490,6 +2494,7 @@ int __audit_signal_info(int sig, struct task_struct *t) struct audit_aux_data_pids *axp; struct task_struct *tsk = current; struct audit_context *ctx = tsk->audit_context; + uid_t uid = current_uid(), t_uid = task_uid(t); if (audit_pid && t->tgid == audit_pid) { if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) { @@ -2497,7 +2502,7 @@ int __audit_signal_info(int sig, struct task_struct *t) if (tsk->loginuid != -1) audit_sig_uid = tsk->loginuid; else - audit_sig_uid = tsk->cred->uid; + audit_sig_uid = uid; security_task_getsecid(tsk, &audit_sig_sid); } if (!audit_signals || audit_dummy_context()) @@ -2509,7 +2514,7 @@ int __audit_signal_info(int sig, struct task_struct *t) if (!ctx->target_pid) { ctx->target_pid = t->tgid; ctx->target_auid = audit_get_loginuid(t); - ctx->target_uid = t->cred->uid; + ctx->target_uid = t_uid; ctx->target_sessionid = audit_get_sessionid(t); security_task_getsecid(t, &ctx->target_sid); memcpy(ctx->target_comm, t->comm, TASK_COMM_LEN); @@ -2530,7 +2535,7 @@ int __audit_signal_info(int sig, struct task_struct *t) axp->target_pid[axp->pid_count] = t->tgid; axp->target_auid[axp->pid_count] = audit_get_loginuid(t); - axp->target_uid[axp->pid_count] = t->cred->uid; + axp->target_uid[axp->pid_count] = t_uid; axp->target_sessionid[axp->pid_count] = audit_get_sessionid(t); security_task_getsecid(t, &axp->target_sid[axp->pid_count]); memcpy(axp->target_comm[axp->pid_count], t->comm, TASK_COMM_LEN); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e210526e640..a512a75a556 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1279,7 +1279,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) static int attach_task_by_pid(struct cgroup *cgrp, u64 pid) { struct task_struct *tsk; - uid_t euid; + const struct cred *cred = current_cred(), *tcred; int ret; if (pid) { @@ -1289,16 +1289,16 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid) rcu_read_unlock(); return -ESRCH; } - get_task_struct(tsk); - rcu_read_unlock(); - euid = current_euid(); - if (euid && - euid != tsk->cred->uid && - euid != tsk->cred->suid) { - put_task_struct(tsk); + tcred = __task_cred(tsk); + if (cred->euid && + cred->euid != tcred->uid && + cred->euid != tcred->suid) { + rcu_read_unlock(); return -EACCES; } + get_task_struct(tsk); + rcu_read_unlock(); } else { tsk = current; get_task_struct(tsk); diff --git a/kernel/exit.c b/kernel/exit.c index e0f6e1892fb..bbc22530f2c 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -160,7 +160,10 @@ void release_task(struct task_struct * p) int zap_leader; repeat: tracehook_prepare_release_task(p); - atomic_dec(&p->cred->user->processes); + /* don't need to get the RCU readlock here - the process is dead and + * can't be modifying its own credentials */ + atomic_dec(&__task_cred(p)->user->processes); + proc_flush_task(p); write_lock_irq(&tasklist_lock); tracehook_finish_release_task(p); @@ -1267,12 +1270,12 @@ static int wait_task_zombie(struct task_struct *p, int options, unsigned long state; int retval, status, traced; pid_t pid = task_pid_vnr(p); + uid_t uid = __task_cred(p)->uid; if (!likely(options & WEXITED)) return 0; if (unlikely(options & WNOWAIT)) { - uid_t uid = p->cred->uid; int exit_code = p->exit_code; int why, status; @@ -1393,7 +1396,7 @@ static int wait_task_zombie(struct task_struct *p, int options, if (!retval && infop) retval = put_user(pid, &infop->si_pid); if (!retval && infop) - retval = put_user(p->cred->uid, &infop->si_uid); + retval = put_user(uid, &infop->si_uid); if (!retval) retval = pid; @@ -1458,7 +1461,8 @@ static int wait_task_stopped(int ptrace, struct task_struct *p, if (!unlikely(options & WNOWAIT)) p->exit_code = 0; - uid = p->cred->uid; + /* don't need the RCU readlock here as we're holding a spinlock */ + uid = __task_cred(p)->uid; unlock_sig: spin_unlock_irq(&p->sighand->siglock); if (!exit_code) @@ -1532,10 +1536,10 @@ static int wait_task_continued(struct task_struct *p, int options, } if (!unlikely(options & WNOWAIT)) p->signal->flags &= ~SIGNAL_STOP_CONTINUED; + uid = __task_cred(p)->uid; spin_unlock_irq(&p->sighand->siglock); pid = task_pid_vnr(p); - uid = p->cred->uid; get_task_struct(p); read_unlock(&tasklist_lock); diff --git a/kernel/futex.c b/kernel/futex.c index 28421d8210b..4fe790e89d0 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -439,15 +439,20 @@ static void free_pi_state(struct futex_pi_state *pi_state) static struct task_struct * futex_find_get_task(pid_t pid) { struct task_struct *p; - uid_t euid = current_euid(); + const struct cred *cred = current_cred(), *pcred; rcu_read_lock(); p = find_task_by_vpid(pid); - if (!p || (euid != p->cred->euid && - euid != p->cred->uid)) + if (!p) { p = ERR_PTR(-ESRCH); - else - get_task_struct(p); + } else { + pcred = __task_cred(p); + if (cred->euid != pcred->euid && + cred->euid != pcred->uid) + p = ERR_PTR(-ESRCH); + else + get_task_struct(p); + } rcu_read_unlock(); @@ -1831,7 +1836,7 @@ sys_get_robust_list(int pid, struct robust_list_head __user * __user *head_ptr, { struct robust_list_head __user *head; unsigned long ret; - uid_t euid = current_euid(); + const struct cred *cred = current_cred(), *pcred; if (!futex_cmpxchg_enabled) return -ENOSYS; @@ -1847,8 +1852,9 @@ sys_get_robust_list(int pid, struct robust_list_head __user * __user *head_ptr, if (!p) goto err_unlock; ret = -EPERM; - if (euid != p->cred->euid && - euid != p->cred->uid && + pcred = __task_cred(p); + if (cred->euid != pcred->euid && + cred->euid != pcred->uid && !capable(CAP_SYS_PTRACE)) goto err_unlock; head = p->robust_list; diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 2c3fd5ed34f..d607a5b9ee2 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -135,7 +135,7 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, { struct compat_robust_list_head __user *head; unsigned long ret; - uid_t euid = current_euid(); + const struct cred *cred = current_cred(), *pcred; if (!futex_cmpxchg_enabled) return -ENOSYS; @@ -151,8 +151,9 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, if (!p) goto err_unlock; ret = -EPERM; - if (euid != p->cred->euid && - euid != p->cred->uid && + pcred = __task_cred(p); + if (cred->euid != pcred->euid && + cred->euid != pcred->uid && !capable(CAP_SYS_PTRACE)) goto err_unlock; head = p->compat_robust_list; diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 49849d12dd1..b9d5f4e4f6a 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -115,7 +115,7 @@ int ptrace_check_attach(struct task_struct *child, int kill) int __ptrace_may_access(struct task_struct *task, unsigned int mode) { - struct cred *cred = current->cred, *tcred = task->cred; + const struct cred *cred = current_cred(), *tcred; /* May we inspect the given task? * This check is used both for attaching with ptrace @@ -125,19 +125,23 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode) * because setting up the necessary parent/child relationship * or halting the specified task is impossible. */ - uid_t uid = cred->uid; - gid_t gid = cred->gid; int dumpable = 0; /* Don't let security modules deny introspection */ if (task == current) return 0; - if ((uid != tcred->euid || - uid != tcred->suid || - uid != tcred->uid || - gid != tcred->egid || - gid != tcred->sgid || - gid != tcred->gid) && !capable(CAP_SYS_PTRACE)) + rcu_read_lock(); + tcred = __task_cred(task); + if ((cred->uid != tcred->euid || + cred->uid != tcred->suid || + cred->uid != tcred->uid || + cred->gid != tcred->egid || + cred->gid != tcred->sgid || + cred->gid != tcred->gid) && + !capable(CAP_SYS_PTRACE)) { + rcu_read_unlock(); return -EPERM; + } + rcu_read_unlock(); smp_rmb(); if (task->mm) dumpable = get_dumpable(task->mm); diff --git a/kernel/sched.c b/kernel/sched.c index 733c59e645a..92992e287b1 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -345,7 +345,9 @@ static inline struct task_group *task_group(struct task_struct *p) struct task_group *tg; #ifdef CONFIG_USER_SCHED - tg = p->cred->user->tg; + rcu_read_lock(); + tg = __task_cred(p)->user->tg; + rcu_read_unlock(); #elif defined(CONFIG_CGROUP_SCHED) tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), struct task_group, css); @@ -5121,6 +5123,22 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) set_load_weight(p); } +/* + * check the target process has a UID that matches the current process's + */ +static bool check_same_owner(struct task_struct *p) +{ + const struct cred *cred = current_cred(), *pcred; + bool match; + + rcu_read_lock(); + pcred = __task_cred(p); + match = (cred->euid == pcred->euid || + cred->euid == pcred->uid); + rcu_read_unlock(); + return match; +} + static int __sched_setscheduler(struct task_struct *p, int policy, struct sched_param *param, bool user) { @@ -5128,7 +5146,6 @@ static int __sched_setscheduler(struct task_struct *p, int policy, unsigned long flags; const struct sched_class *prev_class = p->sched_class; struct rq *rq; - uid_t euid; /* may grab non-irq protected spin_locks */ BUG_ON(in_interrupt()); @@ -5181,9 +5198,7 @@ recheck: return -EPERM; /* can't change other user's priorities */ - euid = current_euid(); - if (euid != p->cred->euid && - euid != p->cred->uid) + if (!check_same_owner(p)) return -EPERM; } @@ -5394,7 +5409,6 @@ long sched_setaffinity(pid_t pid, const cpumask_t *in_mask) cpumask_t cpus_allowed; cpumask_t new_mask = *in_mask; struct task_struct *p; - uid_t euid; int retval; get_online_cpus(); @@ -5415,11 +5429,8 @@ long sched_setaffinity(pid_t pid, const cpumask_t *in_mask) get_task_struct(p); read_unlock(&tasklist_lock); - euid = current_euid(); retval = -EPERM; - if (euid != p->cred->euid && - euid != p->cred->uid && - !capable(CAP_SYS_NICE)) + if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) goto out_unlock; retval = security_task_setscheduler(p, 0, NULL); diff --git a/kernel/signal.c b/kernel/signal.c index 80e8a6489f9..84989124baf 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -177,6 +177,11 @@ int next_signal(struct sigpending *pending, sigset_t *mask) return sig; } +/* + * allocate a new signal queue record + * - this may be called without locks if and only if t == current, otherwise an + * appopriate lock must be held to protect t's user_struct + */ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags, int override_rlimit) { @@ -184,11 +189,12 @@ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags, struct user_struct *user; /* - * In order to avoid problems with "switch_user()", we want to make - * sure that the compiler doesn't re-load "t->user" + * We won't get problems with the target's UID changing under us + * because changing it requires RCU be used, and if t != current, the + * caller must be holding the RCU readlock (by way of a spinlock) and + * we use RCU protection here */ - user = t->cred->user; - barrier(); + user = __task_cred(t)->user; atomic_inc(&user->sigpending); if (override_rlimit || atomic_read(&user->sigpending) <= @@ -562,12 +568,13 @@ static int rm_from_queue(unsigned long mask, struct sigpending *s) /* * Bad permissions for sending the signal + * - the caller must hold at least the RCU read lock */ static int check_kill_permission(int sig, struct siginfo *info, struct task_struct *t) { + const struct cred *cred = current_cred(), *tcred; struct pid *sid; - uid_t uid, euid; int error; if (!valid_signal(sig)) @@ -580,10 +587,11 @@ static int check_kill_permission(int sig, struct siginfo *info, if (error) return error; - uid = current_uid(); - euid = current_euid(); - if ((euid ^ t->cred->suid) && (euid ^ t->cred->uid) && - (uid ^ t->cred->suid) && (uid ^ t->cred->uid) && + tcred = __task_cred(t); + if ((cred->euid ^ tcred->suid) && + (cred->euid ^ tcred->uid) && + (cred->uid ^ tcred->suid) && + (cred->uid ^ tcred->uid) && !capable(CAP_KILL)) { switch (sig) { case SIGCONT: @@ -1011,6 +1019,10 @@ struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long return sighand; } +/* + * send signal info to all the members of a group + * - the caller must hold the RCU read lock at least + */ int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) { unsigned long flags; @@ -1032,8 +1044,8 @@ int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) /* * __kill_pgrp_info() sends a signal to a process group: this is what the tty * control characters do (^C, ^Z etc) + * - the caller must hold at least a readlock on tasklist_lock */ - int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp) { struct task_struct *p = NULL; @@ -1089,6 +1101,7 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid, { int ret = -EINVAL; struct task_struct *p; + const struct cred *pcred; if (!valid_signal(sig)) return ret; @@ -1099,9 +1112,11 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid, ret = -ESRCH; goto out_unlock; } - if ((info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info))) - && (euid != p->cred->suid) && (euid != p->cred->uid) - && (uid != p->cred->suid) && (uid != p->cred->uid)) { + pcred = __task_cred(p); + if ((info == SEND_SIG_NOINFO || + (!is_si_special(info) && SI_FROMUSER(info))) && + euid != pcred->suid && euid != pcred->uid && + uid != pcred->suid && uid != pcred->uid) { ret = -EPERM; goto out_unlock; } @@ -1372,10 +1387,9 @@ int do_notify_parent(struct task_struct *tsk, int sig) */ rcu_read_lock(); info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns); + info.si_uid = __task_cred(tsk)->uid; rcu_read_unlock(); - info.si_uid = tsk->cred->uid; - thread_group_cputime(tsk, &cputime); info.si_utime = cputime_to_jiffies(cputime.utime); info.si_stime = cputime_to_jiffies(cputime.stime); @@ -1443,10 +1457,9 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why) */ rcu_read_lock(); info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns); + info.si_uid = __task_cred(tsk)->uid; rcu_read_unlock(); - info.si_uid = tsk->cred->uid; - info.si_utime = cputime_to_clock_t(tsk->utime); info.si_stime = cputime_to_clock_t(tsk->stime); @@ -1713,7 +1726,7 @@ static int ptrace_signal(int signr, siginfo_t *info, info->si_errno = 0; info->si_code = SI_USER; info->si_pid = task_pid_vnr(current->parent); - info->si_uid = current->parent->cred->uid; + info->si_uid = task_uid(current->parent); } /* If the (new) signal is now blocked, requeue it. */ diff --git a/kernel/sys.c b/kernel/sys.c index c4d6b59553e..ccc9eb736d3 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -112,14 +112,17 @@ EXPORT_SYMBOL(cad_pid); void (*pm_power_off_prepare)(void); +/* + * set the priority of a task + * - the caller must hold the RCU read lock + */ static int set_one_prio(struct task_struct *p, int niceval, int error) { - uid_t euid = current_euid(); + const struct cred *cred = current_cred(), *pcred = __task_cred(p); int no_nice; - if (p->cred->uid != euid && - p->cred->euid != euid && - !capable(CAP_SYS_NICE)) { + if (pcred->uid != cred->euid && + pcred->euid != cred->euid && !capable(CAP_SYS_NICE)) { error = -EPERM; goto out; } diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 6d1ed07bf31..2dc06ab3571 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -27,6 +27,7 @@ */ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk) { + const struct cred *tcred; struct timespec uptime, ts; u64 ac_etime; @@ -53,10 +54,11 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk) stats->ac_flag |= AXSIG; stats->ac_nice = task_nice(tsk); stats->ac_sched = tsk->policy; - stats->ac_uid = tsk->cred->uid; - stats->ac_gid = tsk->cred->gid; stats->ac_pid = tsk->pid; rcu_read_lock(); + tcred = __task_cred(tsk); + stats->ac_uid = tcred->uid; + stats->ac_gid = tcred->gid; stats->ac_ppid = pid_alive(tsk) ? rcu_dereference(tsk->real_parent)->tgid : 0; rcu_read_unlock(); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index b23492ee3e5..7555219c535 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1110,7 +1110,7 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, const unsigned long __user *old_nodes, const unsigned long __user *new_nodes) { - struct cred *cred, *tcred; + const struct cred *cred = current_cred(), *tcred; struct mm_struct *mm; struct task_struct *task; nodemask_t old; @@ -1145,14 +1145,16 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, * capabilities, superuser privileges or the same * userid as the target process. */ - cred = current->cred; - tcred = task->cred; + rcu_read_lock(); + tcred = __task_cred(task); if (cred->euid != tcred->suid && cred->euid != tcred->uid && cred->uid != tcred->suid && cred->uid != tcred->uid && !capable(CAP_SYS_NICE)) { + rcu_read_unlock(); err = -EPERM; goto out; } + rcu_read_unlock(); task_nodes = cpuset_mems_allowed(task); /* Is the user allowed to access the target nodes? */ diff --git a/mm/migrate.c b/mm/migrate.c index 794443da1b4..142284229ce 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1045,7 +1045,7 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, const int __user *nodes, int __user *status, int flags) { - struct cred *cred, *tcred; + const struct cred *cred = current_cred(), *tcred; struct task_struct *task; struct mm_struct *mm; int err; @@ -1076,14 +1076,16 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, * capabilities, superuser privileges or the same * userid as the target process. */ - cred = current->cred; - tcred = task->cred; + rcu_read_lock(); + tcred = __task_cred(task); if (cred->euid != tcred->suid && cred->euid != tcred->uid && cred->uid != tcred->suid && cred->uid != tcred->uid && !capable(CAP_SYS_NICE)) { + rcu_read_unlock(); err = -EPERM; goto out; } + rcu_read_unlock(); err = security_task_movememory(task); if (err) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 3af787ba207..0e0b282a207 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -298,9 +298,9 @@ static void dump_tasks(const struct mem_cgroup *mem) task_lock(p); printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n", - p->pid, p->cred->uid, p->tgid, p->mm->total_vm, - get_mm_rss(p->mm), (int)task_cpu(p), p->oomkilladj, - p->comm); + p->pid, __task_cred(p)->uid, p->tgid, + p->mm->total_vm, get_mm_rss(p->mm), (int)task_cpu(p), + p->oomkilladj, p->comm); task_unlock(p); } while_each_thread(g, p); } diff --git a/security/commoncap.c b/security/commoncap.c index 61307f59000..0384bf95db6 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -51,10 +51,13 @@ EXPORT_SYMBOL(cap_netlink_recv); */ int cap_capable(struct task_struct *tsk, int cap, int audit) { + __u32 cap_raised; + /* Derived from include/linux/sched.h:capable. */ - if (cap_raised(tsk->cred->cap_effective, cap)) - return 0; - return -EPERM; + rcu_read_lock(); + cap_raised = cap_raised(__task_cred(tsk)->cap_effective, cap); + rcu_read_unlock(); + return cap_raised ? 0 : -EPERM; } int cap_settime(struct timespec *ts, struct timezone *tz) @@ -66,34 +69,42 @@ int cap_settime(struct timespec *ts, struct timezone *tz) int cap_ptrace_may_access(struct task_struct *child, unsigned int mode) { - /* Derived from arch/i386/kernel/ptrace.c:sys_ptrace. */ - if (cap_issubset(child->cred->cap_permitted, - current->cred->cap_permitted)) - return 0; - if (capable(CAP_SYS_PTRACE)) - return 0; - return -EPERM; + int ret = 0; + + rcu_read_lock(); + if (!cap_issubset(child->cred->cap_permitted, + current->cred->cap_permitted) && + !capable(CAP_SYS_PTRACE)) + ret = -EPERM; + rcu_read_unlock(); + return ret; } int cap_ptrace_traceme(struct task_struct *parent) { - if (cap_issubset(current->cred->cap_permitted, - parent->cred->cap_permitted)) - return 0; - if (has_capability(parent, CAP_SYS_PTRACE)) - return 0; - return -EPERM; + int ret = 0; + + rcu_read_lock(); + if (!cap_issubset(current->cred->cap_permitted, + parent->cred->cap_permitted) && + !has_capability(parent, CAP_SYS_PTRACE)) + ret = -EPERM; + rcu_read_unlock(); + return ret; } int cap_capget (struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted) { - struct cred *cred = target->cred; + const struct cred *cred; /* Derived from kernel/capability.c:sys_capget. */ + rcu_read_lock(); + cred = __task_cred(target); *effective = cred->cap_effective; *inheritable = cred->cap_inheritable; *permitted = cred->cap_permitted; + rcu_read_unlock(); return 0; } @@ -433,7 +444,7 @@ void cap_bprm_apply_creds (struct linux_binprm *bprm, int unsafe) int cap_bprm_secureexec (struct linux_binprm *bprm) { - const struct cred *cred = current->cred; + const struct cred *cred = current_cred(); if (cred->uid != 0) { if (bprm->cap_effective) @@ -511,11 +522,11 @@ static inline void cap_emulate_setxuid (int old_ruid, int old_euid, if ((old_ruid == 0 || old_euid == 0 || old_suid == 0) && (cred->uid != 0 && cred->euid != 0 && cred->suid != 0) && !issecure(SECURE_KEEP_CAPS)) { - cap_clear (cred->cap_permitted); - cap_clear (cred->cap_effective); + cap_clear(cred->cap_permitted); + cap_clear(cred->cap_effective); } if (old_euid == 0 && cred->euid != 0) { - cap_clear (cred->cap_effective); + cap_clear(cred->cap_effective); } if (old_euid != 0 && cred->euid == 0) { cred->cap_effective = cred->cap_permitted; @@ -582,9 +593,14 @@ int cap_task_post_setuid (uid_t old_ruid, uid_t old_euid, uid_t old_suid, */ static int cap_safe_nice(struct task_struct *p) { - if (!cap_issubset(p->cred->cap_permitted, - current->cred->cap_permitted) && - !capable(CAP_SYS_NICE)) + int is_subset; + + rcu_read_lock(); + is_subset = cap_issubset(__task_cred(p)->cap_permitted, + current_cred()->cap_permitted); + rcu_read_unlock(); + + if (!is_subset && !capable(CAP_SYS_NICE)) return -EPERM; return 0; } diff --git a/security/keys/permission.c b/security/keys/permission.c index baf3d5f31e7..13c36164f28 100644 --- a/security/keys/permission.c +++ b/security/keys/permission.c @@ -22,13 +22,16 @@ int key_task_permission(const key_ref_t key_ref, struct task_struct *context, key_perm_t perm) { - struct cred *cred = context->cred; + const struct cred *cred; struct key *key; key_perm_t kperm; int ret; key = key_ref_to_ptr(key_ref); + rcu_read_lock(); + cred = __task_cred(context); + /* use the second 8-bits of permissions for keys the caller owns */ if (key->uid == cred->fsuid) { kperm = key->perm >> 16; @@ -43,10 +46,7 @@ int key_task_permission(const key_ref_t key_ref, goto use_these_perms; } - spin_lock(&cred->lock); ret = groups_search(cred->group_info, key->gid); - spin_unlock(&cred->lock); - if (ret) { kperm = key->perm >> 8; goto use_these_perms; @@ -57,6 +57,8 @@ int key_task_permission(const key_ref_t key_ref, kperm = key->perm; use_these_perms: + rcu_read_lock(); + /* use the top 8-bits of permissions for keys the caller possesses * - possessor permissions are additive with other permissions */ diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c index ce8ac6073d5..212601ebaa4 100644 --- a/security/keys/process_keys.c +++ b/security/keys/process_keys.c @@ -412,10 +412,13 @@ key_ref_t search_process_keyrings(struct key_type *type, struct task_struct *context) { struct request_key_auth *rka; + struct cred *cred; key_ref_t key_ref, ret, err; might_sleep(); + cred = get_task_cred(context); + /* we want to return -EAGAIN or -ENOKEY if any of the keyrings were * searchable, but we failed to find a key or we found a negative key; * otherwise we want to return a sample error (probably -EACCES) if @@ -428,9 +431,9 @@ key_ref_t search_process_keyrings(struct key_type *type, err = ERR_PTR(-EAGAIN); /* search the thread keyring first */ - if (context->cred->thread_keyring) { + if (cred->thread_keyring) { key_ref = keyring_search_aux( - make_key_ref(context->cred->thread_keyring, 1), + make_key_ref(cred->thread_keyring, 1), context, type, description, match); if (!IS_ERR(key_ref)) goto found; @@ -495,9 +498,9 @@ key_ref_t search_process_keyrings(struct key_type *type, } } /* or search the user-session keyring */ - else if (context->cred->user->session_keyring) { + else if (cred->user->session_keyring) { key_ref = keyring_search_aux( - make_key_ref(context->cred->user->session_keyring, 1), + make_key_ref(cred->user->session_keyring, 1), context, type, description, match); if (!IS_ERR(key_ref)) goto found; @@ -519,20 +522,20 @@ key_ref_t search_process_keyrings(struct key_type *type, * search the keyrings of the process mentioned there * - we don't permit access to request_key auth keys via this method */ - if (context->cred->request_key_auth && + if (cred->request_key_auth && context == current && type != &key_type_request_key_auth ) { /* defend against the auth key being revoked */ - down_read(&context->cred->request_key_auth->sem); + down_read(&cred->request_key_auth->sem); - if (key_validate(context->cred->request_key_auth) == 0) { - rka = context->cred->request_key_auth->payload.data; + if (key_validate(cred->request_key_auth) == 0) { + rka = cred->request_key_auth->payload.data; key_ref = search_process_keyrings(type, description, match, rka->context); - up_read(&context->cred->request_key_auth->sem); + up_read(&cred->request_key_auth->sem); if (!IS_ERR(key_ref)) goto found; @@ -549,7 +552,7 @@ key_ref_t search_process_keyrings(struct key_type *type, break; } } else { - up_read(&context->cred->request_key_auth->sem); + up_read(&cred->request_key_auth->sem); } } @@ -557,6 +560,7 @@ key_ref_t search_process_keyrings(struct key_type *type, key_ref = ret ? ret : err; found: + put_cred(cred); return key_ref; } /* end search_process_keyrings() */ diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c index 10715d1330b..c8630363823 100644 --- a/security/selinux/selinuxfs.c +++ b/security/selinux/selinuxfs.c @@ -95,13 +95,18 @@ extern void selnl_notify_setenforce(int val); static int task_has_security(struct task_struct *tsk, u32 perms) { - struct task_security_struct *tsec; - - tsec = tsk->cred->security; + const struct task_security_struct *tsec; + u32 sid = 0; + + rcu_read_lock(); + tsec = __task_cred(tsk)->security; + if (tsec) + sid = tsec->sid; + rcu_read_unlock(); if (!tsec) return -EACCES; - return avc_has_perm(tsec->sid, SECINITSID_SECURITY, + return avc_has_perm(sid, SECINITSID_SECURITY, SECCLASS_SECURITY, perms, NULL); } diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index e8a4fcb1ad0..11167fd567b 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -30,6 +30,8 @@ #include "smack.h" +#define task_security(task) (task_cred_xxx((task), security)) + /* * I hope these are the hokeyist lines of code in the module. Casey. */ @@ -1012,7 +1014,7 @@ static void smack_cred_free(struct cred *cred) */ static int smack_task_setpgid(struct task_struct *p, pid_t pgid) { - return smk_curacc(p->cred->security, MAY_WRITE); + return smk_curacc(task_security(p), MAY_WRITE); } /** @@ -1023,7 +1025,7 @@ static int smack_task_setpgid(struct task_struct *p, pid_t pgid) */ static int smack_task_getpgid(struct task_struct *p) { - return smk_curacc(p->cred->security, MAY_READ); + return smk_curacc(task_security(p), MAY_READ); } /** @@ -1034,7 +1036,7 @@ static int smack_task_getpgid(struct task_struct *p) */ static int smack_task_getsid(struct task_struct *p) { - return smk_curacc(p->cred->security, MAY_READ); + return smk_curacc(task_security(p), MAY_READ); } /** @@ -1046,7 +1048,7 @@ static int smack_task_getsid(struct task_struct *p) */ static void smack_task_getsecid(struct task_struct *p, u32 *secid) { - *secid = smack_to_secid(p->cred->security); + *secid = smack_to_secid(task_security(p)); } /** @@ -1062,7 +1064,7 @@ static int smack_task_setnice(struct task_struct *p, int nice) rc = cap_task_setnice(p, nice); if (rc == 0) - rc = smk_curacc(p->cred->security, MAY_WRITE); + rc = smk_curacc(task_security(p), MAY_WRITE); return rc; } @@ -1079,7 +1081,7 @@ static int smack_task_setioprio(struct task_struct *p, int ioprio) rc = cap_task_setioprio(p, ioprio); if (rc == 0) - rc = smk_curacc(p->cred->security, MAY_WRITE); + rc = smk_curacc(task_security(p), MAY_WRITE); return rc; } @@ -1091,7 +1093,7 @@ static int smack_task_setioprio(struct task_struct *p, int ioprio) */ static int smack_task_getioprio(struct task_struct *p) { - return smk_curacc(p->cred->security, MAY_READ); + return smk_curacc(task_security(p), MAY_READ); } /** @@ -1109,7 +1111,7 @@ static int smack_task_setscheduler(struct task_struct *p, int policy, rc = cap_task_setscheduler(p, policy, lp); if (rc == 0) - rc = smk_curacc(p->cred->security, MAY_WRITE); + rc = smk_curacc(task_security(p), MAY_WRITE); return rc; } @@ -1121,7 +1123,7 @@ static int smack_task_setscheduler(struct task_struct *p, int policy, */ static int smack_task_getscheduler(struct task_struct *p) { - return smk_curacc(p->cred->security, MAY_READ); + return smk_curacc(task_security(p), MAY_READ); } /** @@ -1132,7 +1134,7 @@ static int smack_task_getscheduler(struct task_struct *p) */ static int smack_task_movememory(struct task_struct *p) { - return smk_curacc(p->cred->security, MAY_WRITE); + return smk_curacc(task_security(p), MAY_WRITE); } /** @@ -1155,13 +1157,13 @@ static int smack_task_kill(struct task_struct *p, struct siginfo *info, * can write the receiver. */ if (secid == 0) - return smk_curacc(p->cred->security, MAY_WRITE); + return smk_curacc(task_security(p), MAY_WRITE); /* * If the secid isn't 0 we're dealing with some USB IO * specific behavior. This is not clean. For one thing * we can't take privilege into account. */ - return smk_access(smack_from_secid(secid), p->cred->security, MAY_WRITE); + return smk_access(smack_from_secid(secid), task_security(p), MAY_WRITE); } /** @@ -1174,7 +1176,7 @@ static int smack_task_wait(struct task_struct *p) { int rc; - rc = smk_access(current->cred->security, p->cred->security, MAY_WRITE); + rc = smk_access(current_security(), task_security(p), MAY_WRITE); if (rc == 0) return 0; @@ -1205,7 +1207,7 @@ static int smack_task_wait(struct task_struct *p) static void smack_task_to_inode(struct task_struct *p, struct inode *inode) { struct inode_smack *isp = inode->i_security; - isp->smk_inode = p->cred->security; + isp->smk_inode = task_security(p); } /* @@ -2010,7 +2012,7 @@ static int smack_getprocattr(struct task_struct *p, char *name, char **value) if (strcmp(name, "current") != 0) return -EINVAL; - cp = kstrdup(p->cred->security, GFP_KERNEL); + cp = kstrdup(task_security(p), GFP_KERNEL); if (cp == NULL) return -ENOMEM; -- cgit v1.2.3-70-g09d2 From bb952bb98a7e479262c7eb25d5592545a3af147d Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Nov 2008 10:39:20 +1100 Subject: CRED: Separate per-task-group keyrings from signal_struct Separate per-task-group keyrings from signal_struct and dangle their anchor from the cred struct rather than the signal_struct. Signed-off-by: David Howells Reviewed-by: James Morris Signed-off-by: James Morris --- include/linux/cred.h | 16 +++++++ include/linux/key.h | 8 +--- include/linux/sched.h | 6 --- kernel/cred.c | 63 +++++++++++++++++++++++++++ kernel/fork.c | 7 --- security/keys/process_keys.c | 100 +++++++++++++++++-------------------------- security/keys/request_key.c | 34 ++++++--------- 7 files changed, 135 insertions(+), 99 deletions(-) (limited to 'kernel') diff --git a/include/linux/cred.h b/include/linux/cred.h index 166ce4ddba6..62b9e532422 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -71,6 +71,21 @@ extern int groups_search(const struct group_info *, gid_t); extern int in_group_p(gid_t); extern int in_egroup_p(gid_t); +/* + * The common credentials for a thread group + * - shared by CLONE_THREAD + */ +#ifdef CONFIG_KEYS +struct thread_group_cred { + atomic_t usage; + pid_t tgid; /* thread group process ID */ + spinlock_t lock; + struct key *session_keyring; /* keyring inherited over fork */ + struct key *process_keyring; /* keyring private to this process */ + struct rcu_head rcu; /* RCU deletion hook */ +}; +#endif + /* * The security context of a task * @@ -114,6 +129,7 @@ struct cred { * keys to */ struct key *thread_keyring; /* keyring private to this thread */ struct key *request_key_auth; /* assumed request_key authority */ + struct thread_group_cred *tgcred; /* thread-group shared credentials */ #endif #ifdef CONFIG_SECURITY void *security; /* subjective LSM security */ diff --git a/include/linux/key.h b/include/linux/key.h index df709e1af3c..0836cc838b0 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -278,9 +278,7 @@ extern ctl_table key_sysctls[]; */ extern void switch_uid_keyring(struct user_struct *new_user); extern int copy_keys(unsigned long clone_flags, struct task_struct *tsk); -extern int copy_thread_group_keys(struct task_struct *tsk); extern void exit_keys(struct task_struct *tsk); -extern void exit_thread_group_keys(struct signal_struct *tg); extern int suid_keys(struct task_struct *tsk); extern int exec_keys(struct task_struct *tsk); extern void key_fsuid_changed(struct task_struct *tsk); @@ -289,8 +287,8 @@ extern void key_init(void); #define __install_session_keyring(keyring) \ ({ \ - struct key *old_session = current->signal->session_keyring; \ - current->signal->session_keyring = keyring; \ + struct key *old_session = current->cred->tgcred->session_keyring; \ + current->cred->tgcred->session_keyring = keyring; \ old_session; \ }) @@ -308,9 +306,7 @@ extern void key_init(void); #define switch_uid_keyring(u) do { } while(0) #define __install_session_keyring(k) ({ NULL; }) #define copy_keys(f,t) 0 -#define copy_thread_group_keys(t) 0 #define exit_keys(t) do { } while(0) -#define exit_thread_group_keys(tg) do { } while(0) #define suid_keys(t) do { } while(0) #define exec_keys(t) do { } while(0) #define key_fsuid_changed(t) do { } while(0) diff --git a/include/linux/sched.h b/include/linux/sched.h index 740cf946c8c..2913252989b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -571,12 +571,6 @@ struct signal_struct { */ struct rlimit rlim[RLIM_NLIMITS]; - /* keep the process-shared keyrings here so that they do the right - * thing in threads created with CLONE_THREAD */ -#ifdef CONFIG_KEYS - struct key *session_keyring; /* keyring inherited over fork */ - struct key *process_keyring; /* keyring private to this process */ -#endif #ifdef CONFIG_BSD_PROCESS_ACCT struct pacct_struct pacct; /* per-process accounting information */ #endif diff --git a/kernel/cred.c b/kernel/cred.c index 833244a7cb0..ac73e361768 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -16,6 +16,17 @@ #include #include +/* + * The common credentials for the initial task's thread group + */ +#ifdef CONFIG_KEYS +static struct thread_group_cred init_tgcred = { + .usage = ATOMIC_INIT(2), + .tgid = 0, + .lock = SPIN_LOCK_UNLOCKED, +}; +#endif + /* * The initial credentials for the initial task */ @@ -28,8 +39,41 @@ struct cred init_cred = { .cap_bset = CAP_INIT_BSET, .user = INIT_USER, .group_info = &init_groups, +#ifdef CONFIG_KEYS + .tgcred = &init_tgcred, +#endif }; +/* + * Dispose of the shared task group credentials + */ +#ifdef CONFIG_KEYS +static void release_tgcred_rcu(struct rcu_head *rcu) +{ + struct thread_group_cred *tgcred = + container_of(rcu, struct thread_group_cred, rcu); + + BUG_ON(atomic_read(&tgcred->usage) != 0); + + key_put(tgcred->session_keyring); + key_put(tgcred->process_keyring); + kfree(tgcred); +} +#endif + +/* + * Release a set of thread group credentials. + */ +static void release_tgcred(struct cred *cred) +{ +#ifdef CONFIG_KEYS + struct thread_group_cred *tgcred = cred->tgcred; + + if (atomic_dec_and_test(&tgcred->usage)) + call_rcu(&tgcred->rcu, release_tgcred_rcu); +#endif +} + /* * The RCU callback to actually dispose of a set of credentials */ @@ -41,6 +85,7 @@ static void put_cred_rcu(struct rcu_head *rcu) key_put(cred->thread_keyring); key_put(cred->request_key_auth); + release_tgcred(cred); put_group_info(cred->group_info); free_uid(cred->user); security_cred_free(cred); @@ -71,12 +116,30 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) if (!pcred) return -ENOMEM; +#ifdef CONFIG_KEYS + if (clone_flags & CLONE_THREAD) { + atomic_inc(&pcred->tgcred->usage); + } else { + pcred->tgcred = kmalloc(sizeof(struct cred), GFP_KERNEL); + if (!pcred->tgcred) { + kfree(pcred); + return -ENOMEM; + } + atomic_set(&pcred->tgcred->usage, 1); + spin_lock_init(&pcred->tgcred->lock); + pcred->tgcred->process_keyring = NULL; + pcred->tgcred->session_keyring = + key_get(p->cred->tgcred->session_keyring); + } +#endif + #ifdef CONFIG_SECURITY pcred->security = NULL; #endif ret = security_cred_alloc(pcred); if (ret < 0) { + release_tgcred(pcred); kfree(pcred); return ret; } diff --git a/kernel/fork.c b/kernel/fork.c index c932e283ddf..ded1972672a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -802,12 +802,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) if (!sig) return -ENOMEM; - ret = copy_thread_group_keys(tsk); - if (ret < 0) { - kmem_cache_free(signal_cachep, sig); - return ret; - } - atomic_set(&sig->count, 1); atomic_set(&sig->live, 1); init_waitqueue_head(&sig->wait_chldexit); @@ -852,7 +846,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) void __cleanup_signal(struct signal_struct *sig) { thread_group_cputime_free(sig); - exit_thread_group_keys(sig); tty_kref_put(sig->tty); kmem_cache_free(signal_cachep, sig); } diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c index 212601ebaa4..70ee93406f3 100644 --- a/security/keys/process_keys.c +++ b/security/keys/process_keys.c @@ -189,7 +189,7 @@ int install_process_keyring(void) might_sleep(); - if (!tsk->signal->process_keyring) { + if (!tsk->cred->tgcred->process_keyring) { sprintf(buf, "_pid.%u", tsk->tgid); keyring = keyring_alloc(buf, tsk->cred->uid, tsk->cred->gid, tsk, @@ -200,12 +200,12 @@ int install_process_keyring(void) } /* attach keyring */ - spin_lock_irq(&tsk->sighand->siglock); - if (!tsk->signal->process_keyring) { - tsk->signal->process_keyring = keyring; + spin_lock_irq(&tsk->cred->tgcred->lock); + if (!tsk->cred->tgcred->process_keyring) { + tsk->cred->tgcred->process_keyring = keyring; keyring = NULL; } - spin_unlock_irq(&tsk->sighand->siglock); + spin_unlock_irq(&tsk->cred->tgcred->lock); key_put(keyring); } @@ -235,11 +235,11 @@ static int install_session_keyring(struct key *keyring) sprintf(buf, "_ses.%u", tsk->tgid); flags = KEY_ALLOC_QUOTA_OVERRUN; - if (tsk->signal->session_keyring) + if (tsk->cred->tgcred->session_keyring) flags = KEY_ALLOC_IN_QUOTA; - keyring = keyring_alloc(buf, tsk->cred->uid, tsk->cred->gid, tsk, - flags, NULL); + keyring = keyring_alloc(buf, tsk->cred->uid, tsk->cred->gid, + tsk, flags, NULL); if (IS_ERR(keyring)) return PTR_ERR(keyring); } @@ -248,10 +248,10 @@ static int install_session_keyring(struct key *keyring) } /* install the keyring */ - spin_lock_irq(&tsk->sighand->siglock); - old = tsk->signal->session_keyring; - rcu_assign_pointer(tsk->signal->session_keyring, keyring); - spin_unlock_irq(&tsk->sighand->siglock); + spin_lock_irq(&tsk->cred->tgcred->lock); + old = tsk->cred->tgcred->session_keyring; + rcu_assign_pointer(tsk->cred->tgcred->session_keyring, keyring); + spin_unlock_irq(&tsk->cred->tgcred->lock); /* we're using RCU on the pointer, but there's no point synchronising * on it if it didn't previously point to anything */ @@ -264,28 +264,6 @@ static int install_session_keyring(struct key *keyring) } /* end install_session_keyring() */ -/*****************************************************************************/ -/* - * copy the keys in a thread group for fork without CLONE_THREAD - */ -int copy_thread_group_keys(struct task_struct *tsk) -{ - key_check(current->thread_group->session_keyring); - key_check(current->thread_group->process_keyring); - - /* no process keyring yet */ - tsk->signal->process_keyring = NULL; - - /* same session keyring */ - rcu_read_lock(); - tsk->signal->session_keyring = - key_get(rcu_dereference(current->signal->session_keyring)); - rcu_read_unlock(); - - return 0; - -} /* end copy_thread_group_keys() */ - /*****************************************************************************/ /* * copy the keys for fork @@ -305,17 +283,6 @@ int copy_keys(unsigned long clone_flags, struct task_struct *tsk) } /* end copy_keys() */ -/*****************************************************************************/ -/* - * dispose of thread group keys upon thread group destruction - */ -void exit_thread_group_keys(struct signal_struct *tg) -{ - key_put(tg->session_keyring); - key_put(tg->process_keyring); - -} /* end exit_thread_group_keys() */ - /*****************************************************************************/ /* * dispose of per-thread keys upon thread exit @@ -344,10 +311,10 @@ int exec_keys(struct task_struct *tsk) key_put(old); /* discard the process keyring from a newly exec'd task */ - spin_lock_irq(&tsk->sighand->siglock); - old = tsk->signal->process_keyring; - tsk->signal->process_keyring = NULL; - spin_unlock_irq(&tsk->sighand->siglock); + spin_lock_irq(&tsk->cred->tgcred->lock); + old = tsk->cred->tgcred->process_keyring; + tsk->cred->tgcred->process_keyring = NULL; + spin_unlock_irq(&tsk->cred->tgcred->lock); key_put(old); @@ -452,9 +419,9 @@ key_ref_t search_process_keyrings(struct key_type *type, } /* search the process keyring second */ - if (context->signal->process_keyring) { + if (cred->tgcred->process_keyring) { key_ref = keyring_search_aux( - make_key_ref(context->signal->process_keyring, 1), + make_key_ref(cred->tgcred->process_keyring, 1), context, type, description, match); if (!IS_ERR(key_ref)) goto found; @@ -473,11 +440,11 @@ key_ref_t search_process_keyrings(struct key_type *type, } /* search the session keyring */ - if (context->signal->session_keyring) { + if (cred->tgcred->session_keyring) { rcu_read_lock(); key_ref = keyring_search_aux( make_key_ref(rcu_dereference( - context->signal->session_keyring), + cred->tgcred->session_keyring), 1), context, type, description, match); rcu_read_unlock(); @@ -586,11 +553,13 @@ key_ref_t lookup_user_key(key_serial_t id, int create, int partial, { struct request_key_auth *rka; struct task_struct *t = current; - struct cred *cred = current_cred(); + struct cred *cred; struct key *key; key_ref_t key_ref, skey_ref; int ret; +try_again: + cred = get_current_cred(); key_ref = ERR_PTR(-ENOKEY); switch (id) { @@ -604,6 +573,7 @@ key_ref_t lookup_user_key(key_serial_t id, int create, int partial, key = ERR_PTR(ret); goto error; } + goto reget_creds; } key = cred->thread_keyring; @@ -612,7 +582,7 @@ key_ref_t lookup_user_key(key_serial_t id, int create, int partial, break; case KEY_SPEC_PROCESS_KEYRING: - if (!t->signal->process_keyring) { + if (!cred->tgcred->process_keyring) { if (!create) goto error; @@ -621,15 +591,16 @@ key_ref_t lookup_user_key(key_serial_t id, int create, int partial, key = ERR_PTR(ret); goto error; } + goto reget_creds; } - key = t->signal->process_keyring; + key = cred->tgcred->process_keyring; atomic_inc(&key->usage); key_ref = make_key_ref(key, 1); break; case KEY_SPEC_SESSION_KEYRING: - if (!t->signal->session_keyring) { + if (!cred->tgcred->session_keyring) { /* always install a session keyring upon access if one * doesn't exist yet */ ret = install_user_keyrings(); @@ -639,10 +610,11 @@ key_ref_t lookup_user_key(key_serial_t id, int create, int partial, cred->user->session_keyring); if (ret < 0) goto error; + goto reget_creds; } rcu_read_lock(); - key = rcu_dereference(t->signal->session_keyring); + key = rcu_dereference(cred->tgcred->session_keyring); atomic_inc(&key->usage); rcu_read_unlock(); key_ref = make_key_ref(key, 1); @@ -758,6 +730,7 @@ key_ref_t lookup_user_key(key_serial_t id, int create, int partial, goto invalid_key; error: + put_cred(cred); return key_ref; invalid_key: @@ -765,6 +738,12 @@ invalid_key: key_ref = ERR_PTR(ret); goto error; + /* if we attempted to install a keyring, then it may have caused new + * creds to be installed */ +reget_creds: + put_cred(cred); + goto try_again; + } /* end lookup_user_key() */ /*****************************************************************************/ @@ -777,6 +756,7 @@ invalid_key: long join_session_keyring(const char *name) { struct task_struct *tsk = current; + struct cred *cred = current->cred; struct key *keyring; long ret; @@ -787,7 +767,7 @@ long join_session_keyring(const char *name) goto error; rcu_read_lock(); - ret = rcu_dereference(tsk->signal->session_keyring)->serial; + ret = rcu_dereference(cred->tgcred->session_keyring)->serial; rcu_read_unlock(); goto error; } @@ -799,7 +779,7 @@ long join_session_keyring(const char *name) keyring = find_keyring_by_name(name, false); if (PTR_ERR(keyring) == -ENOKEY) { /* not found - try and create a new one */ - keyring = keyring_alloc(name, tsk->cred->uid, tsk->cred->gid, tsk, + keyring = keyring_alloc(name, cred->uid, cred->gid, tsk, KEY_ALLOC_IN_QUOTA, NULL); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); diff --git a/security/keys/request_key.c b/security/keys/request_key.c index 0488b0af5bd..3d12558362d 100644 --- a/security/keys/request_key.c +++ b/security/keys/request_key.c @@ -66,7 +66,6 @@ static int call_sbin_request_key(struct key_construction *cons, const char *op, void *aux) { - struct task_struct *tsk = current; const struct cred *cred = current_cred(); key_serial_t prkey, sskey; struct key *key = cons->key, *authkey = cons->authkey, *keyring; @@ -109,18 +108,13 @@ static int call_sbin_request_key(struct key_construction *cons, cred->thread_keyring->serial : 0); prkey = 0; - if (tsk->signal->process_keyring) - prkey = tsk->signal->process_keyring->serial; + if (cred->tgcred->process_keyring) + prkey = cred->tgcred->process_keyring->serial; - sprintf(keyring_str[1], "%d", prkey); - - if (tsk->signal->session_keyring) { - rcu_read_lock(); - sskey = rcu_dereference(tsk->signal->session_keyring)->serial; - rcu_read_unlock(); - } else { + if (cred->tgcred->session_keyring) + sskey = rcu_dereference(cred->tgcred->session_keyring)->serial; + else sskey = cred->user->session_keyring->serial; - } sprintf(keyring_str[2], "%d", sskey); @@ -222,7 +216,7 @@ static int construct_key(struct key *key, const void *callout_info, static void construct_get_dest_keyring(struct key **_dest_keyring) { struct request_key_auth *rka; - struct task_struct *tsk = current; + const struct cred *cred = current_cred(); struct key *dest_keyring = *_dest_keyring, *authkey; kenter("%p", dest_keyring); @@ -234,11 +228,11 @@ static void construct_get_dest_keyring(struct key **_dest_keyring) } else { /* use a default keyring; falling through the cases until we * find one that we actually have */ - switch (tsk->cred->jit_keyring) { + switch (cred->jit_keyring) { case KEY_REQKEY_DEFL_DEFAULT: case KEY_REQKEY_DEFL_REQUESTOR_KEYRING: - if (tsk->cred->request_key_auth) { - authkey = tsk->cred->request_key_auth; + if (cred->request_key_auth) { + authkey = cred->request_key_auth; down_read(&authkey->sem); rka = authkey->payload.data; if (!test_bit(KEY_FLAG_REVOKED, @@ -251,19 +245,19 @@ static void construct_get_dest_keyring(struct key **_dest_keyring) } case KEY_REQKEY_DEFL_THREAD_KEYRING: - dest_keyring = key_get(tsk->cred->thread_keyring); + dest_keyring = key_get(cred->thread_keyring); if (dest_keyring) break; case KEY_REQKEY_DEFL_PROCESS_KEYRING: - dest_keyring = key_get(tsk->signal->process_keyring); + dest_keyring = key_get(cred->tgcred->process_keyring); if (dest_keyring) break; case KEY_REQKEY_DEFL_SESSION_KEYRING: rcu_read_lock(); dest_keyring = key_get( - rcu_dereference(tsk->signal->session_keyring)); + rcu_dereference(cred->tgcred->session_keyring)); rcu_read_unlock(); if (dest_keyring) @@ -271,11 +265,11 @@ static void construct_get_dest_keyring(struct key **_dest_keyring) case KEY_REQKEY_DEFL_USER_SESSION_KEYRING: dest_keyring = - key_get(tsk->cred->user->session_keyring); + key_get(cred->user->session_keyring); break; case KEY_REQKEY_DEFL_USER_KEYRING: - dest_keyring = key_get(tsk->cred->user->uid_keyring); + dest_keyring = key_get(cred->user->uid_keyring); break; case KEY_REQKEY_DEFL_GROUP_KEYRING: -- cgit v1.2.3-70-g09d2 From 6cc88bc45ce8043171089c9592da223dfab91823 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Nov 2008 10:39:21 +1100 Subject: CRED: Rename is_single_threaded() to is_wq_single_threaded() Rename is_single_threaded() to is_wq_single_threaded() so that a new is_single_threaded() can be created that refers to tasks rather than waitqueues. Signed-off-by: David Howells Reviewed-by: James Morris Signed-off-by: James Morris --- kernel/workqueue.c | 8 ++++---- lib/is_single_threaded.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 4 deletions(-) create mode 100644 lib/is_single_threaded.c (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f928f2a87b9..f12ab5c4dec 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -84,21 +84,21 @@ static cpumask_t cpu_singlethread_map __read_mostly; static cpumask_t cpu_populated_map __read_mostly; /* If it's single threaded, it isn't in the list of workqueues. */ -static inline int is_single_threaded(struct workqueue_struct *wq) +static inline int is_wq_single_threaded(struct workqueue_struct *wq) { return wq->singlethread; } static const cpumask_t *wq_cpu_map(struct workqueue_struct *wq) { - return is_single_threaded(wq) + return is_wq_single_threaded(wq) ? &cpu_singlethread_map : &cpu_populated_map; } static struct cpu_workqueue_struct *wq_per_cpu(struct workqueue_struct *wq, int cpu) { - if (unlikely(is_single_threaded(wq))) + if (unlikely(is_wq_single_threaded(wq))) cpu = singlethread_cpu; return per_cpu_ptr(wq->cpu_wq, cpu); } @@ -769,7 +769,7 @@ static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) { struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; struct workqueue_struct *wq = cwq->wq; - const char *fmt = is_single_threaded(wq) ? "%s" : "%s/%d"; + const char *fmt = is_wq_single_threaded(wq) ? "%s" : "%s/%d"; struct task_struct *p; p = kthread_create(worker_thread, cwq, fmt, wq->name, cpu); diff --git a/lib/is_single_threaded.c b/lib/is_single_threaded.c new file mode 100644 index 00000000000..f1ed2fe76c6 --- /dev/null +++ b/lib/is_single_threaded.c @@ -0,0 +1,45 @@ +/* Function to determine if a thread group is single threaded or not + * + * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * - Derived from security/selinux/hooks.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include + +/** + * is_single_threaded - Determine if a thread group is single-threaded or not + * @p: A task in the thread group in question + * + * This returns true if the thread group to which a task belongs is single + * threaded, false if it is not. + */ +bool is_single_threaded(struct task_struct *p) +{ + struct task_struct *g, *t; + struct mm_struct *mm = p->mm; + + if (atomic_read(&p->signal->count) != 1) + goto no; + + if (atomic_read(&p->mm->mm_users) != 1) { + read_lock(&tasklist_lock); + do_each_thread(g, t) { + if (t->mm == mm && t != p) + goto no_unlock; + } while_each_thread(g, t); + read_unlock(&tasklist_lock); + } + + return true; + +no_unlock: + read_unlock(&tasklist_lock); +no: + return false; +} -- cgit v1.2.3-70-g09d2 From d84f4f992cbd76e8f39c488cf0c5d123843923b1 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Nov 2008 10:39:23 +1100 Subject: CRED: Inaugurate COW credentials Inaugurate copy-on-write credentials management. This uses RCU to manage the credentials pointer in the task_struct with respect to accesses by other tasks. A process may only modify its own credentials, and so does not need locking to access or modify its own credentials. A mutex (cred_replace_mutex) is added to the task_struct to control the effect of PTRACE_ATTACHED on credential calculations, particularly with respect to execve(). With this patch, the contents of an active credentials struct may not be changed directly; rather a new set of credentials must be prepared, modified and committed using something like the following sequence of events: struct cred *new = prepare_creds(); int ret = blah(new); if (ret < 0) { abort_creds(new); return ret; } return commit_creds(new); There are some exceptions to this rule: the keyrings pointed to by the active credentials may be instantiated - keyrings violate the COW rule as managing COW keyrings is tricky, given that it is possible for a task to directly alter the keys in a keyring in use by another task. To help enforce this, various pointers to sets of credentials, such as those in the task_struct, are declared const. The purpose of this is compile-time discouragement of altering credentials through those pointers. Once a set of credentials has been made public through one of these pointers, it may not be modified, except under special circumstances: (1) Its reference count may incremented and decremented. (2) The keyrings to which it points may be modified, but not replaced. The only safe way to modify anything else is to create a replacement and commit using the functions described in Documentation/credentials.txt (which will be added by a later patch). This patch and the preceding patches have been tested with the LTP SELinux testsuite. This patch makes several logical sets of alteration: (1) execve(). This now prepares and commits credentials in various places in the security code rather than altering the current creds directly. (2) Temporary credential overrides. do_coredump() and sys_faccessat() now prepare their own credentials and temporarily override the ones currently on the acting thread, whilst preventing interference from other threads by holding cred_replace_mutex on the thread being dumped. This will be replaced in a future patch by something that hands down the credentials directly to the functions being called, rather than altering the task's objective credentials. (3) LSM interface. A number of functions have been changed, added or removed: (*) security_capset_check(), ->capset_check() (*) security_capset_set(), ->capset_set() Removed in favour of security_capset(). (*) security_capset(), ->capset() New. This is passed a pointer to the new creds, a pointer to the old creds and the proposed capability sets. It should fill in the new creds or return an error. All pointers, barring the pointer to the new creds, are now const. (*) security_bprm_apply_creds(), ->bprm_apply_creds() Changed; now returns a value, which will cause the process to be killed if it's an error. (*) security_task_alloc(), ->task_alloc_security() Removed in favour of security_prepare_creds(). (*) security_cred_free(), ->cred_free() New. Free security data attached to cred->security. (*) security_prepare_creds(), ->cred_prepare() New. Duplicate any security data attached to cred->security. (*) security_commit_creds(), ->cred_commit() New. Apply any security effects for the upcoming installation of new security by commit_creds(). (*) security_task_post_setuid(), ->task_post_setuid() Removed in favour of security_task_fix_setuid(). (*) security_task_fix_setuid(), ->task_fix_setuid() Fix up the proposed new credentials for setuid(). This is used by cap_set_fix_setuid() to implicitly adjust capabilities in line with setuid() changes. Changes are made to the new credentials, rather than the task itself as in security_task_post_setuid(). (*) security_task_reparent_to_init(), ->task_reparent_to_init() Removed. Instead the task being reparented to init is referred directly to init's credentials. NOTE! This results in the loss of some state: SELinux's osid no longer records the sid of the thread that forked it. (*) security_key_alloc(), ->key_alloc() (*) security_key_permission(), ->key_permission() Changed. These now take cred pointers rather than task pointers to refer to the security context. (4) sys_capset(). This has been simplified and uses less locking. The LSM functions it calls have been merged. (5) reparent_to_kthreadd(). This gives the current thread the same credentials as init by simply using commit_thread() to point that way. (6) __sigqueue_alloc() and switch_uid() __sigqueue_alloc() can't stop the target task from changing its creds beneath it, so this function gets a reference to the currently applicable user_struct which it then passes into the sigqueue struct it returns if successful. switch_uid() is now called from commit_creds(), and possibly should be folded into that. commit_creds() should take care of protecting __sigqueue_alloc(). (7) [sg]et[ug]id() and co and [sg]et_current_groups. The set functions now all use prepare_creds(), commit_creds() and abort_creds() to build and check a new set of credentials before applying it. security_task_set[ug]id() is called inside the prepared section. This guarantees that nothing else will affect the creds until we've finished. The calling of set_dumpable() has been moved into commit_creds(). Much of the functionality of set_user() has been moved into commit_creds(). The get functions all simply access the data directly. (8) security_task_prctl() and cap_task_prctl(). security_task_prctl() has been modified to return -ENOSYS if it doesn't want to handle a function, or otherwise return the return value directly rather than through an argument. Additionally, cap_task_prctl() now prepares a new set of credentials, even if it doesn't end up using it. (9) Keyrings. A number of changes have been made to the keyrings code: (a) switch_uid_keyring(), copy_keys(), exit_keys() and suid_keys() have all been dropped and built in to the credentials functions directly. They may want separating out again later. (b) key_alloc() and search_process_keyrings() now take a cred pointer rather than a task pointer to specify the security context. (c) copy_creds() gives a new thread within the same thread group a new thread keyring if its parent had one, otherwise it discards the thread keyring. (d) The authorisation key now points directly to the credentials to extend the search into rather pointing to the task that carries them. (e) Installing thread, process or session keyrings causes a new set of credentials to be created, even though it's not strictly necessary for process or session keyrings (they're shared). (10) Usermode helper. The usermode helper code now carries a cred struct pointer in its subprocess_info struct instead of a new session keyring pointer. This set of credentials is derived from init_cred and installed on the new process after it has been cloned. call_usermodehelper_setup() allocates the new credentials and call_usermodehelper_freeinfo() discards them if they haven't been used. A special cred function (prepare_usermodeinfo_creds()) is provided specifically for call_usermodehelper_setup() to call. call_usermodehelper_setkeys() adjusts the credentials to sport the supplied keyring as the new session keyring. (11) SELinux. SELinux has a number of changes, in addition to those to support the LSM interface changes mentioned above: (a) selinux_setprocattr() no longer does its check for whether the current ptracer can access processes with the new SID inside the lock that covers getting the ptracer's SID. Whilst this lock ensures that the check is done with the ptracer pinned, the result is only valid until the lock is released, so there's no point doing it inside the lock. (12) is_single_threaded(). This function has been extracted from selinux_setprocattr() and put into a file of its own in the lib/ directory as join_session_keyring() now wants to use it too. The code in SELinux just checked to see whether a task shared mm_structs with other tasks (CLONE_VM), but that isn't good enough. We really want to know if they're part of the same thread group (CLONE_THREAD). (13) nfsd. The NFS server daemon now has to use the COW credentials to set the credentials it is going to use. It really needs to pass the credentials down to the functions it calls, but it can't do that until other patches in this series have been applied. Signed-off-by: David Howells Acked-by: James Morris Signed-off-by: James Morris --- fs/exec.c | 31 ++- fs/nfsd/auth.c | 92 ++++---- fs/nfsd/nfs4recover.c | 68 +++--- fs/nfsd/nfsfh.c | 11 +- fs/open.c | 31 ++- include/linux/audit.h | 22 +- include/linux/capability.h | 2 - include/linux/cred.h | 44 +++- include/linux/init_task.h | 2 + include/linux/key.h | 22 +- include/linux/sched.h | 6 +- include/linux/security.h | 178 +++++++--------- init/main.c | 1 + kernel/auditsc.c | 42 ++-- kernel/capability.c | 78 +++---- kernel/cred-internals.h | 21 ++ kernel/cred.c | 321 ++++++++++++++++++++++++---- kernel/exit.c | 9 +- kernel/fork.c | 7 +- kernel/kmod.c | 30 ++- kernel/ptrace.c | 9 + kernel/signal.c | 10 +- kernel/sys.c | 450 +++++++++++++++++++++------------------ kernel/user.c | 37 +--- kernel/user_namespace.c | 12 +- lib/Makefile | 2 +- net/rxrpc/ar-key.c | 6 +- security/capability.c | 21 +- security/commoncap.c | 265 +++++++++++------------ security/keys/internal.h | 17 +- security/keys/key.c | 25 +-- security/keys/keyctl.c | 95 ++++++--- security/keys/keyring.c | 14 +- security/keys/permission.c | 24 ++- security/keys/proc.c | 8 +- security/keys/process_keys.c | 333 ++++++++++++++--------------- security/keys/request_key.c | 29 ++- security/keys/request_key_auth.c | 41 ++-- security/security.c | 58 +++-- security/selinux/hooks.c | 286 ++++++++++++------------- security/smack/smack_lsm.c | 82 ++++--- 41 files changed, 1603 insertions(+), 1239 deletions(-) create mode 100644 kernel/cred-internals.h (limited to 'kernel') diff --git a/fs/exec.c b/fs/exec.c index a5330e1a221..9bd3559ddec 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1007,13 +1007,12 @@ int flush_old_exec(struct linux_binprm * bprm) */ current->mm->task_size = TASK_SIZE; - if (bprm->e_uid != current_euid() || bprm->e_gid != current_egid()) { - suid_keys(current); + if (bprm->e_uid != current_euid() || + bprm->e_gid != current_egid()) { set_dumpable(current->mm, suid_dumpable); current->pdeath_signal = 0; } else if (file_permission(bprm->file, MAY_READ) || (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP)) { - suid_keys(current); set_dumpable(current->mm, suid_dumpable); } @@ -1096,10 +1095,8 @@ void compute_creds(struct linux_binprm *bprm) { int unsafe; - if (bprm->e_uid != current_uid()) { - suid_keys(current); + if (bprm->e_uid != current_uid()) current->pdeath_signal = 0; - } exec_keys(current); task_lock(current); @@ -1709,8 +1706,9 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs) struct linux_binfmt * binfmt; struct inode * inode; struct file * file; + const struct cred *old_cred; + struct cred *cred; int retval = 0; - int fsuid = current_fsuid(); int flag = 0; int ispipe = 0; unsigned long core_limit = current->signal->rlim[RLIMIT_CORE].rlim_cur; @@ -1723,12 +1721,20 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs) binfmt = current->binfmt; if (!binfmt || !binfmt->core_dump) goto fail; + + cred = prepare_creds(); + if (!cred) { + retval = -ENOMEM; + goto fail; + } + down_write(&mm->mmap_sem); /* * If another thread got here first, or we are not dumpable, bail out. */ if (mm->core_state || !get_dumpable(mm)) { up_write(&mm->mmap_sem); + put_cred(cred); goto fail; } @@ -1739,12 +1745,16 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs) */ if (get_dumpable(mm) == 2) { /* Setuid core dump mode */ flag = O_EXCL; /* Stop rewrite attacks */ - current->cred->fsuid = 0; /* Dump root private */ + cred->fsuid = 0; /* Dump root private */ } retval = coredump_wait(exit_code, &core_state); - if (retval < 0) + if (retval < 0) { + put_cred(cred); goto fail; + } + + old_cred = override_creds(cred); /* * Clear any false indication of pending signals that might @@ -1835,7 +1845,8 @@ fail_unlock: if (helper_argv) argv_free(helper_argv); - current->cred->fsuid = fsuid; + revert_creds(old_cred); + put_cred(cred); coredump_finish(mm); fail: return retval; diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c index 808fc03a6fb..836ffa1047d 100644 --- a/fs/nfsd/auth.c +++ b/fs/nfsd/auth.c @@ -27,55 +27,67 @@ int nfsexp_flags(struct svc_rqst *rqstp, struct svc_export *exp) int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) { - struct cred *act_as = current->cred ; - struct svc_cred cred = rqstp->rq_cred; + struct group_info *rqgi; + struct group_info *gi; + struct cred *new; int i; int flags = nfsexp_flags(rqstp, exp); int ret; + new = prepare_creds(); + if (!new) + return -ENOMEM; + + new->fsuid = rqstp->rq_cred.cr_uid; + new->fsgid = rqstp->rq_cred.cr_gid; + + rqgi = rqstp->rq_cred.cr_group_info; + if (flags & NFSEXP_ALLSQUASH) { - cred.cr_uid = exp->ex_anon_uid; - cred.cr_gid = exp->ex_anon_gid; - cred.cr_group_info = groups_alloc(0); + new->fsuid = exp->ex_anon_uid; + new->fsgid = exp->ex_anon_gid; + gi = groups_alloc(0); } else if (flags & NFSEXP_ROOTSQUASH) { - struct group_info *gi; - if (!cred.cr_uid) - cred.cr_uid = exp->ex_anon_uid; - if (!cred.cr_gid) - cred.cr_gid = exp->ex_anon_gid; - gi = groups_alloc(cred.cr_group_info->ngroups); - if (gi) - for (i = 0; i < cred.cr_group_info->ngroups; i++) { - if (!GROUP_AT(cred.cr_group_info, i)) - GROUP_AT(gi, i) = exp->ex_anon_gid; - else - GROUP_AT(gi, i) = GROUP_AT(cred.cr_group_info, i); - } - cred.cr_group_info = gi; - } else - get_group_info(cred.cr_group_info); - - if (cred.cr_uid != (uid_t) -1) - act_as->fsuid = cred.cr_uid; - else - act_as->fsuid = exp->ex_anon_uid; - if (cred.cr_gid != (gid_t) -1) - act_as->fsgid = cred.cr_gid; - else - act_as->fsgid = exp->ex_anon_gid; + if (!new->fsuid) + new->fsuid = exp->ex_anon_uid; + if (!new->fsgid) + new->fsgid = exp->ex_anon_gid; - if (!cred.cr_group_info) - return -ENOMEM; - ret = set_groups(act_as, cred.cr_group_info); - put_group_info(cred.cr_group_info); - if ((cred.cr_uid)) { - act_as->cap_effective = - cap_drop_nfsd_set(act_as->cap_effective); + gi = groups_alloc(rqgi->ngroups); + if (!gi) + goto oom; + + for (i = 0; i < rqgi->ngroups; i++) { + if (!GROUP_AT(rqgi, i)) + GROUP_AT(gi, i) = exp->ex_anon_gid; + else + GROUP_AT(gi, i) = GROUP_AT(rqgi, i); + } } else { - act_as->cap_effective = - cap_raise_nfsd_set(act_as->cap_effective, - act_as->cap_permitted); + gi = get_group_info(rqgi); } + + if (new->fsuid == (uid_t) -1) + new->fsuid = exp->ex_anon_uid; + if (new->fsgid == (gid_t) -1) + new->fsgid = exp->ex_anon_gid; + + ret = set_groups(new, gi); + put_group_info(gi); + if (!ret) + goto error; + + if (new->uid) + new->cap_effective = cap_drop_nfsd_set(new->cap_effective); + else + new->cap_effective = cap_raise_nfsd_set(new->cap_effective, + new->cap_permitted); + return commit_creds(new); + +oom: + ret = -ENOMEM; +error: + abort_creds(new); return ret; } diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 632a50b4b37..9371ea12d7f 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -54,20 +54,26 @@ static struct path rec_dir; static int rec_dir_init = 0; -static void -nfs4_save_user(uid_t *saveuid, gid_t *savegid) +static int +nfs4_save_creds(const struct cred **original_creds) { - *saveuid = current->cred->fsuid; - *savegid = current->cred->fsgid; - current->cred->fsuid = 0; - current->cred->fsgid = 0; + struct cred *new; + + new = prepare_creds(); + if (!new) + return -ENOMEM; + + new->fsuid = 0; + new->fsgid = 0; + *original_creds = override_creds(new); + put_cred(new); + return 0; } static void -nfs4_reset_user(uid_t saveuid, gid_t savegid) +nfs4_reset_creds(const struct cred *original) { - current->cred->fsuid = saveuid; - current->cred->fsgid = savegid; + revert_creds(original); } static void @@ -129,10 +135,9 @@ nfsd4_sync_rec_dir(void) int nfsd4_create_clid_dir(struct nfs4_client *clp) { + const struct cred *original_cred; char *dname = clp->cl_recdir; struct dentry *dentry; - uid_t uid; - gid_t gid; int status; dprintk("NFSD: nfsd4_create_clid_dir for \"%s\"\n", dname); @@ -140,7 +145,9 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) if (!rec_dir_init || clp->cl_firststate) return 0; - nfs4_save_user(&uid, &gid); + status = nfs4_save_creds(&original_cred); + if (status < 0) + return status; /* lock the parent */ mutex_lock(&rec_dir.dentry->d_inode->i_mutex); @@ -168,7 +175,7 @@ out_unlock: clp->cl_firststate = 1; nfsd4_sync_rec_dir(); } - nfs4_reset_user(uid, gid); + nfs4_reset_creds(original_cred); dprintk("NFSD: nfsd4_create_clid_dir returns %d\n", status); return status; } @@ -211,20 +218,21 @@ nfsd4_build_dentrylist(void *arg, const char *name, int namlen, static int nfsd4_list_rec_dir(struct dentry *dir, recdir_func *f) { + const struct cred *original_cred; struct file *filp; struct dentry_list_arg dla = { .parent = dir, }; struct list_head *dentries = &dla.dentries; struct dentry_list *child; - uid_t uid; - gid_t gid; int status; if (!rec_dir_init) return 0; - nfs4_save_user(&uid, &gid); + status = nfs4_save_creds(&original_cred); + if (status < 0) + return status; filp = dentry_open(dget(dir), mntget(rec_dir.mnt), O_RDONLY, current_cred()); @@ -250,7 +258,7 @@ out: dput(child->dentry); kfree(child); } - nfs4_reset_user(uid, gid); + nfs4_reset_creds(original_cred); return status; } @@ -312,8 +320,7 @@ out: void nfsd4_remove_clid_dir(struct nfs4_client *clp) { - uid_t uid; - gid_t gid; + const struct cred *original_cred; int status; if (!rec_dir_init || !clp->cl_firststate) @@ -323,9 +330,13 @@ nfsd4_remove_clid_dir(struct nfs4_client *clp) if (status) goto out; clp->cl_firststate = 0; - nfs4_save_user(&uid, &gid); + + status = nfs4_save_creds(&original_cred); + if (status < 0) + goto out; + status = nfsd4_unlink_clid_dir(clp->cl_recdir, HEXDIR_LEN-1); - nfs4_reset_user(uid, gid); + nfs4_reset_creds(original_cred); if (status == 0) nfsd4_sync_rec_dir(); mnt_drop_write(rec_dir.mnt); @@ -402,16 +413,21 @@ nfsd4_recdir_load(void) { void nfsd4_init_recdir(char *rec_dirname) { - uid_t uid = 0; - gid_t gid = 0; - int status; + const struct cred *original_cred; + int status; printk("NFSD: Using %s as the NFSv4 state recovery directory\n", rec_dirname); BUG_ON(rec_dir_init); - nfs4_save_user(&uid, &gid); + status = nfs4_save_creds(&original_cred); + if (status < 0) { + printk("NFSD: Unable to change credentials to find recovery" + " directory: error %d\n", + status); + return; + } status = kern_path(rec_dirname, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &rec_dir); @@ -421,7 +437,7 @@ nfsd4_init_recdir(char *rec_dirname) if (!status) rec_dir_init = 1; - nfs4_reset_user(uid, gid); + nfs4_reset_creds(original_cred); } void diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index e67cfaea086..f0da7d9c3a9 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -186,9 +186,14 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp) * access control settings being in effect, we cannot * fix that case easily. */ - current->cred->cap_effective = - cap_raise_nfsd_set(current->cred->cap_effective, - current->cred->cap_permitted); + struct cred *new = prepare_creds(); + if (!new) + return nfserrno(-ENOMEM); + new->cap_effective = + cap_raise_nfsd_set(new->cap_effective, + new->cap_permitted); + put_cred(override_creds(new)); + put_cred(new); } else { error = nfsd_setuser_and_check_port(rqstp, exp); if (error) diff --git a/fs/open.c b/fs/open.c index f96eaab280a..c0a426d5766 100644 --- a/fs/open.c +++ b/fs/open.c @@ -425,30 +425,33 @@ out: */ asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode) { - struct cred *cred = current->cred; + const struct cred *old_cred; + struct cred *override_cred; struct path path; struct inode *inode; - int old_fsuid, old_fsgid; - kernel_cap_t uninitialized_var(old_cap); /* !SECURE_NO_SETUID_FIXUP */ int res; if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */ return -EINVAL; - old_fsuid = cred->fsuid; - old_fsgid = cred->fsgid; + override_cred = prepare_creds(); + if (!override_cred) + return -ENOMEM; - cred->fsuid = cred->uid; - cred->fsgid = cred->gid; + override_cred->fsuid = override_cred->uid; + override_cred->fsgid = override_cred->gid; if (!issecure(SECURE_NO_SETUID_FIXUP)) { /* Clear the capabilities if we switch to a non-root user */ - if (current->cred->uid) - old_cap = cap_set_effective(__cap_empty_set); + if (override_cred->uid) + cap_clear(override_cred->cap_effective); else - old_cap = cap_set_effective(cred->cap_permitted); + override_cred->cap_effective = + override_cred->cap_permitted; } + old_cred = override_creds(override_cred); + res = user_path_at(dfd, filename, LOOKUP_FOLLOW, &path); if (res) goto out; @@ -485,12 +488,8 @@ asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode) out_path_release: path_put(&path); out: - cred->fsuid = old_fsuid; - cred->fsgid = old_fsgid; - - if (!issecure(SECURE_NO_SETUID_FIXUP)) - cap_set_effective(old_cap); - + revert_creds(old_cred); + put_cred(override_cred); return res; } diff --git a/include/linux/audit.h b/include/linux/audit.h index 6fbebac7b1b..0b2fcb698a6 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -454,8 +454,10 @@ extern int __audit_mq_timedsend(mqd_t mqdes, size_t msg_len, unsigned int msg_pr extern int __audit_mq_timedreceive(mqd_t mqdes, size_t msg_len, unsigned int __user *u_msg_prio, const struct timespec __user *u_abs_timeout); extern int __audit_mq_notify(mqd_t mqdes, const struct sigevent __user *u_notification); extern int __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat); -extern void __audit_log_bprm_fcaps(struct linux_binprm *bprm, kernel_cap_t *pP, kernel_cap_t *pE); -extern int __audit_log_capset(pid_t pid, kernel_cap_t *eff, kernel_cap_t *inh, kernel_cap_t *perm); +extern int __audit_log_bprm_fcaps(struct linux_binprm *bprm, + const struct cred *new, + const struct cred *old); +extern int __audit_log_capset(pid_t pid, const struct cred *new, const struct cred *old); static inline int audit_ipc_obj(struct kern_ipc_perm *ipcp) { @@ -522,16 +524,20 @@ static inline int audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat) * * -Eric */ -static inline void audit_log_bprm_fcaps(struct linux_binprm *bprm, kernel_cap_t *pP, kernel_cap_t *pE) +static inline int audit_log_bprm_fcaps(struct linux_binprm *bprm, + const struct cred *new, + const struct cred *old) { if (unlikely(!audit_dummy_context())) - __audit_log_bprm_fcaps(bprm, pP, pE); + return __audit_log_bprm_fcaps(bprm, new, old); + return 0; } -static inline int audit_log_capset(pid_t pid, kernel_cap_t *eff, kernel_cap_t *inh, kernel_cap_t *perm) +static inline int audit_log_capset(pid_t pid, const struct cred *new, + const struct cred *old) { if (unlikely(!audit_dummy_context())) - return __audit_log_capset(pid, eff, inh, perm); + return __audit_log_capset(pid, new, old); return 0; } @@ -566,8 +572,8 @@ extern int audit_signals; #define audit_mq_timedreceive(d,l,p,t) ({ 0; }) #define audit_mq_notify(d,n) ({ 0; }) #define audit_mq_getsetattr(d,s) ({ 0; }) -#define audit_log_bprm_fcaps(b, p, e) do { ; } while (0) -#define audit_log_capset(pid, e, i, p) ({ 0; }) +#define audit_log_bprm_fcaps(b, ncr, ocr) ({ 0; }) +#define audit_log_capset(pid, ncr, ocr) ({ 0; }) #define audit_ptrace(t) ((void)0) #define audit_n_rules 0 #define audit_signals 0 diff --git a/include/linux/capability.h b/include/linux/capability.h index 7f26580a5a4..e22f48c2a46 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -519,8 +519,6 @@ extern const kernel_cap_t __cap_empty_set; extern const kernel_cap_t __cap_full_set; extern const kernel_cap_t __cap_init_eff_set; -kernel_cap_t cap_set_effective(const kernel_cap_t pE_new); - /** * has_capability - Determine if a task has a superior capability available * @t: The task in question diff --git a/include/linux/cred.h b/include/linux/cred.h index 62b9e532422..eaf6fa695a0 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -84,6 +84,8 @@ struct thread_group_cred { struct key *process_keyring; /* keyring private to this process */ struct rcu_head rcu; /* RCU deletion hook */ }; + +extern void release_tgcred(struct cred *cred); #endif /* @@ -137,11 +139,30 @@ struct cred { struct user_struct *user; /* real user ID subscription */ struct group_info *group_info; /* supplementary groups for euid/fsgid */ struct rcu_head rcu; /* RCU deletion hook */ - spinlock_t lock; /* lock for pointer changes */ }; extern void __put_cred(struct cred *); extern int copy_creds(struct task_struct *, unsigned long); +extern struct cred *prepare_creds(void); +extern struct cred *prepare_usermodehelper_creds(void); +extern int commit_creds(struct cred *); +extern void abort_creds(struct cred *); +extern const struct cred *override_creds(const struct cred *) __deprecated; +extern void revert_creds(const struct cred *) __deprecated; +extern void __init cred_init(void); + +/** + * get_new_cred - Get a reference on a new set of credentials + * @cred: The new credentials to reference + * + * Get a reference on the specified set of new credentials. The caller must + * release the reference. + */ +static inline struct cred *get_new_cred(struct cred *cred) +{ + atomic_inc(&cred->usage); + return cred; +} /** * get_cred - Get a reference on a set of credentials @@ -150,10 +171,9 @@ extern int copy_creds(struct task_struct *, unsigned long); * Get a reference on the specified set of credentials. The caller must * release the reference. */ -static inline struct cred *get_cred(struct cred *cred) +static inline const struct cred *get_cred(const struct cred *cred) { - atomic_inc(&cred->usage); - return cred; + return get_new_cred((struct cred *) cred); } /** @@ -166,6 +186,8 @@ static inline struct cred *get_cred(struct cred *cred) static inline void put_cred(const struct cred *_cred) { struct cred *cred = (struct cred *) _cred; + + BUG_ON(atomic_read(&(cred)->usage) <= 0); if (atomic_dec_and_test(&(cred)->usage)) __put_cred(cred); } @@ -250,13 +272,13 @@ static inline void put_cred(const struct cred *_cred) __groups; \ }) -#define task_cred_xxx(task, xxx) \ -({ \ - __typeof__(task->cred->xxx) ___val; \ - rcu_read_lock(); \ - ___val = __task_cred((task))->xxx; \ - rcu_read_unlock(); \ - ___val; \ +#define task_cred_xxx(task, xxx) \ +({ \ + __typeof__(((struct cred *)NULL)->xxx) ___val; \ + rcu_read_lock(); \ + ___val = __task_cred((task))->xxx; \ + rcu_read_unlock(); \ + ___val; \ }) #define task_uid(task) (task_cred_xxx((task), uid)) diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 5e24c54b6df..08c3b24ad9a 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -150,6 +150,8 @@ extern struct cred init_cred; .sibling = LIST_HEAD_INIT(tsk.sibling), \ .group_leader = &tsk, \ .cred = &init_cred, \ + .cred_exec_mutex = \ + __MUTEX_INITIALIZER(tsk.cred_exec_mutex), \ .comm = "swapper", \ .thread = INIT_THREAD, \ .fs = &init_fs, \ diff --git a/include/linux/key.h b/include/linux/key.h index 0836cc838b0..69ecf0934b0 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -73,6 +73,7 @@ struct key; struct seq_file; struct user_struct; struct signal_struct; +struct cred; struct key_type; struct key_owner; @@ -181,7 +182,7 @@ struct key { extern struct key *key_alloc(struct key_type *type, const char *desc, uid_t uid, gid_t gid, - struct task_struct *ctx, + const struct cred *cred, key_perm_t perm, unsigned long flags); @@ -249,7 +250,7 @@ extern int key_unlink(struct key *keyring, struct key *key); extern struct key *keyring_alloc(const char *description, uid_t uid, gid_t gid, - struct task_struct *ctx, + const struct cred *cred, unsigned long flags, struct key *dest); @@ -276,22 +277,12 @@ extern ctl_table key_sysctls[]; /* * the userspace interface */ -extern void switch_uid_keyring(struct user_struct *new_user); -extern int copy_keys(unsigned long clone_flags, struct task_struct *tsk); -extern void exit_keys(struct task_struct *tsk); -extern int suid_keys(struct task_struct *tsk); +extern int install_thread_keyring_to_cred(struct cred *cred); extern int exec_keys(struct task_struct *tsk); extern void key_fsuid_changed(struct task_struct *tsk); extern void key_fsgid_changed(struct task_struct *tsk); extern void key_init(void); -#define __install_session_keyring(keyring) \ -({ \ - struct key *old_session = current->cred->tgcred->session_keyring; \ - current->cred->tgcred->session_keyring = keyring; \ - old_session; \ -}) - #else /* CONFIG_KEYS */ #define key_validate(k) 0 @@ -303,11 +294,6 @@ extern void key_init(void); #define make_key_ref(k, p) NULL #define key_ref_to_ptr(k) NULL #define is_key_possessed(k) 0 -#define switch_uid_keyring(u) do { } while(0) -#define __install_session_keyring(k) ({ NULL; }) -#define copy_keys(f,t) 0 -#define exit_keys(t) do { } while(0) -#define suid_keys(t) do { } while(0) #define exec_keys(t) do { } while(0) #define key_fsuid_changed(t) do { } while(0) #define key_fsgid_changed(t) do { } while(0) diff --git a/include/linux/sched.h b/include/linux/sched.h index 2913252989b..121d655e460 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1145,7 +1145,8 @@ struct task_struct { struct list_head cpu_timers[3]; /* process credentials */ - struct cred *cred; /* actual/objective task credentials */ + const struct cred *cred; /* actual/objective task credentials (COW) */ + struct mutex cred_exec_mutex; /* execve vs ptrace cred calculation mutex */ char comm[TASK_COMM_LEN]; /* executable name excluding path - access with [gs]et_task_comm (which lock @@ -1720,7 +1721,6 @@ static inline struct user_struct *get_uid(struct user_struct *u) return u; } extern void free_uid(struct user_struct *); -extern void switch_uid(struct user_struct *); extern void release_uids(struct user_namespace *ns); #include @@ -1870,6 +1870,8 @@ static inline unsigned long wait_task_inactive(struct task_struct *p, #define for_each_process(p) \ for (p = &init_task ; (p = next_task(p)) != &init_task ; ) +extern bool is_single_threaded(struct task_struct *); + /* * Careful: do_each_thread/while_each_thread is a double loop so * 'break' will not work as expected - use goto instead. diff --git a/include/linux/security.h b/include/linux/security.h index 7e9fe046a0d..68be1125144 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -53,24 +53,21 @@ extern int cap_settime(struct timespec *ts, struct timezone *tz); extern int cap_ptrace_may_access(struct task_struct *child, unsigned int mode); extern int cap_ptrace_traceme(struct task_struct *parent); extern int cap_capget(struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted); -extern int cap_capset_check(const kernel_cap_t *effective, - const kernel_cap_t *inheritable, - const kernel_cap_t *permitted); -extern void cap_capset_set(const kernel_cap_t *effective, - const kernel_cap_t *inheritable, - const kernel_cap_t *permitted); +extern int cap_capset(struct cred *new, const struct cred *old, + const kernel_cap_t *effective, + const kernel_cap_t *inheritable, + const kernel_cap_t *permitted); extern int cap_bprm_set_security(struct linux_binprm *bprm); -extern void cap_bprm_apply_creds(struct linux_binprm *bprm, int unsafe); +extern int cap_bprm_apply_creds(struct linux_binprm *bprm, int unsafe); extern int cap_bprm_secureexec(struct linux_binprm *bprm); extern int cap_inode_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags); extern int cap_inode_removexattr(struct dentry *dentry, const char *name); extern int cap_inode_need_killpriv(struct dentry *dentry); extern int cap_inode_killpriv(struct dentry *dentry); -extern int cap_task_post_setuid(uid_t old_ruid, uid_t old_euid, uid_t old_suid, int flags); -extern void cap_task_reparent_to_init(struct task_struct *p); +extern int cap_task_fix_setuid(struct cred *new, const struct cred *old, int flags); extern int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3, - unsigned long arg4, unsigned long arg5, long *rc_p); + unsigned long arg4, unsigned long arg5); extern int cap_task_setscheduler(struct task_struct *p, int policy, struct sched_param *lp); extern int cap_task_setioprio(struct task_struct *p, int ioprio); extern int cap_task_setnice(struct task_struct *p, int nice); @@ -170,8 +167,8 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * Compute and set the security attributes of a process being transformed * by an execve operation based on the old attributes (current->security) * and the information saved in @bprm->security by the set_security hook. - * Since this hook function (and its caller) are void, this hook can not - * return an error. However, it can leave the security attributes of the + * Since this function may return an error, in which case the process will + * be killed. However, it can leave the security attributes of the * process unchanged if an access failure occurs at this point. * bprm_apply_creds is called under task_lock. @unsafe indicates various * reasons why it may be unsafe to change security state. @@ -593,15 +590,18 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * manual page for definitions of the @clone_flags. * @clone_flags contains the flags indicating what should be shared. * Return 0 if permission is granted. - * @cred_alloc_security: - * @cred contains the cred struct for child process. - * Allocate and attach a security structure to the cred->security field. - * The security field is initialized to NULL when the task structure is - * allocated. - * Return 0 if operation was successful. * @cred_free: * @cred points to the credentials. * Deallocate and clear the cred->security field in a set of credentials. + * @cred_prepare: + * @new points to the new credentials. + * @old points to the original credentials. + * @gfp indicates the atomicity of any memory allocations. + * Prepare a new set of credentials by copying the data from the old set. + * @cred_commit: + * @new points to the new credentials. + * @old points to the original credentials. + * Install a new set of credentials. * @task_setuid: * Check permission before setting one or more of the user identity * attributes of the current process. The @flags parameter indicates @@ -614,15 +614,13 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * @id2 contains a uid. * @flags contains one of the LSM_SETID_* values. * Return 0 if permission is granted. - * @task_post_setuid: + * @task_fix_setuid: * Update the module's state after setting one or more of the user * identity attributes of the current process. The @flags parameter * indicates which of the set*uid system calls invoked this hook. If - * @flags is LSM_SETID_FS, then @old_ruid is the old fs uid and the other - * parameters are not used. - * @old_ruid contains the old real uid (or fs uid if LSM_SETID_FS). - * @old_euid contains the old effective uid (or -1 if LSM_SETID_FS). - * @old_suid contains the old saved uid (or -1 if LSM_SETID_FS). + * @new is the set of credentials that will be installed. Modifications + * should be made to this rather than to @current->cred. + * @old is the set of credentials that are being replaces * @flags contains one of the LSM_SETID_* values. * Return 0 on success. * @task_setgid: @@ -725,13 +723,8 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * @arg3 contains a argument. * @arg4 contains a argument. * @arg5 contains a argument. - * @rc_p contains a pointer to communicate back the forced return code - * Return 0 if permission is granted, and non-zero if the security module - * has taken responsibility (setting *rc_p) for the prctl call. - * @task_reparent_to_init: - * Set the security attributes in @p->security for a kernel thread that - * is being reparented to the init task. - * @p contains the task_struct for the kernel thread. + * Return -ENOSYS if no-one wanted to handle this op, any other value to + * cause prctl() to return immediately with that value. * @task_to_inode: * Set the security attributes for an inode based on an associated task's * security attributes, e.g. for /proc/pid inodes. @@ -1008,7 +1001,7 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * See whether a specific operational right is granted to a process on a * key. * @key_ref refers to the key (key pointer + possession attribute bit). - * @context points to the process to provide the context against which to + * @cred points to the credentials to provide the context against which to * evaluate the security data on the key. * @perm describes the combination of permissions required of this key. * Return 1 if permission granted, 0 if permission denied and -ve it the @@ -1170,6 +1163,7 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * @child process. * Security modules may also want to perform a process tracing check * during an execve in the set_security or apply_creds hooks of + * tracing check during an execve in the bprm_set_creds hook of * binprm_security_ops if the process is being traced and its security * attributes would be changed by the execve. * @child contains the task_struct structure for the target process. @@ -1193,19 +1187,15 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * @inheritable contains the inheritable capability set. * @permitted contains the permitted capability set. * Return 0 if the capability sets were successfully obtained. - * @capset_check: - * Check permission before setting the @effective, @inheritable, and - * @permitted capability sets for the current process. - * @effective contains the effective capability set. - * @inheritable contains the inheritable capability set. - * @permitted contains the permitted capability set. - * Return 0 if permission is granted. - * @capset_set: + * @capset: * Set the @effective, @inheritable, and @permitted capability sets for * the current process. + * @new contains the new credentials structure for target process. + * @old contains the current credentials structure for target process. * @effective contains the effective capability set. * @inheritable contains the inheritable capability set. * @permitted contains the permitted capability set. + * Return 0 and update @new if permission is granted. * @capable: * Check whether the @tsk process has the @cap capability. * @tsk contains the task_struct for the process. @@ -1297,12 +1287,11 @@ struct security_operations { int (*capget) (struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted); - int (*capset_check) (const kernel_cap_t *effective, - const kernel_cap_t *inheritable, - const kernel_cap_t *permitted); - void (*capset_set) (const kernel_cap_t *effective, - const kernel_cap_t *inheritable, - const kernel_cap_t *permitted); + int (*capset) (struct cred *new, + const struct cred *old, + const kernel_cap_t *effective, + const kernel_cap_t *inheritable, + const kernel_cap_t *permitted); int (*capable) (struct task_struct *tsk, int cap, int audit); int (*acct) (struct file *file); int (*sysctl) (struct ctl_table *table, int op); @@ -1314,7 +1303,7 @@ struct security_operations { int (*bprm_alloc_security) (struct linux_binprm *bprm); void (*bprm_free_security) (struct linux_binprm *bprm); - void (*bprm_apply_creds) (struct linux_binprm *bprm, int unsafe); + int (*bprm_apply_creds) (struct linux_binprm *bprm, int unsafe); void (*bprm_post_apply_creds) (struct linux_binprm *bprm); int (*bprm_set_security) (struct linux_binprm *bprm); int (*bprm_check_security) (struct linux_binprm *bprm); @@ -1405,11 +1394,13 @@ struct security_operations { int (*dentry_open) (struct file *file, const struct cred *cred); int (*task_create) (unsigned long clone_flags); - int (*cred_alloc_security) (struct cred *cred); void (*cred_free) (struct cred *cred); + int (*cred_prepare)(struct cred *new, const struct cred *old, + gfp_t gfp); + void (*cred_commit)(struct cred *new, const struct cred *old); int (*task_setuid) (uid_t id0, uid_t id1, uid_t id2, int flags); - int (*task_post_setuid) (uid_t old_ruid /* or fsuid */ , - uid_t old_euid, uid_t old_suid, int flags); + int (*task_fix_setuid) (struct cred *new, const struct cred *old, + int flags); int (*task_setgid) (gid_t id0, gid_t id1, gid_t id2, int flags); int (*task_setpgid) (struct task_struct *p, pid_t pgid); int (*task_getpgid) (struct task_struct *p); @@ -1429,8 +1420,7 @@ struct security_operations { int (*task_wait) (struct task_struct *p); int (*task_prctl) (int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, - unsigned long arg5, long *rc_p); - void (*task_reparent_to_init) (struct task_struct *p); + unsigned long arg5); void (*task_to_inode) (struct task_struct *p, struct inode *inode); int (*ipc_permission) (struct kern_ipc_perm *ipcp, short flag); @@ -1535,10 +1525,10 @@ struct security_operations { /* key management security hooks */ #ifdef CONFIG_KEYS - int (*key_alloc) (struct key *key, struct task_struct *tsk, unsigned long flags); + int (*key_alloc) (struct key *key, const struct cred *cred, unsigned long flags); void (*key_free) (struct key *key); int (*key_permission) (key_ref_t key_ref, - struct task_struct *context, + const struct cred *cred, key_perm_t perm); int (*key_getsecurity)(struct key *key, char **_buffer); #endif /* CONFIG_KEYS */ @@ -1564,12 +1554,10 @@ int security_capget(struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted); -int security_capset_check(const kernel_cap_t *effective, - const kernel_cap_t *inheritable, - const kernel_cap_t *permitted); -void security_capset_set(const kernel_cap_t *effective, - const kernel_cap_t *inheritable, - const kernel_cap_t *permitted); +int security_capset(struct cred *new, const struct cred *old, + const kernel_cap_t *effective, + const kernel_cap_t *inheritable, + const kernel_cap_t *permitted); int security_capable(struct task_struct *tsk, int cap); int security_capable_noaudit(struct task_struct *tsk, int cap); int security_acct(struct file *file); @@ -1583,7 +1571,7 @@ int security_vm_enough_memory_mm(struct mm_struct *mm, long pages); int security_vm_enough_memory_kern(long pages); int security_bprm_alloc(struct linux_binprm *bprm); void security_bprm_free(struct linux_binprm *bprm); -void security_bprm_apply_creds(struct linux_binprm *bprm, int unsafe); +int security_bprm_apply_creds(struct linux_binprm *bprm, int unsafe); void security_bprm_post_apply_creds(struct linux_binprm *bprm); int security_bprm_set(struct linux_binprm *bprm); int security_bprm_check(struct linux_binprm *bprm); @@ -1660,11 +1648,12 @@ int security_file_send_sigiotask(struct task_struct *tsk, int security_file_receive(struct file *file); int security_dentry_open(struct file *file, const struct cred *cred); int security_task_create(unsigned long clone_flags); -int security_cred_alloc(struct cred *cred); void security_cred_free(struct cred *cred); +int security_prepare_creds(struct cred *new, const struct cred *old, gfp_t gfp); +void security_commit_creds(struct cred *new, const struct cred *old); int security_task_setuid(uid_t id0, uid_t id1, uid_t id2, int flags); -int security_task_post_setuid(uid_t old_ruid, uid_t old_euid, - uid_t old_suid, int flags); +int security_task_fix_setuid(struct cred *new, const struct cred *old, + int flags); int security_task_setgid(gid_t id0, gid_t id1, gid_t id2, int flags); int security_task_setpgid(struct task_struct *p, pid_t pgid); int security_task_getpgid(struct task_struct *p); @@ -1683,8 +1672,7 @@ int security_task_kill(struct task_struct *p, struct siginfo *info, int sig, u32 secid); int security_task_wait(struct task_struct *p); int security_task_prctl(int option, unsigned long arg2, unsigned long arg3, - unsigned long arg4, unsigned long arg5, long *rc_p); -void security_task_reparent_to_init(struct task_struct *p); + unsigned long arg4, unsigned long arg5); void security_task_to_inode(struct task_struct *p, struct inode *inode); int security_ipc_permission(struct kern_ipc_perm *ipcp, short flag); void security_ipc_getsecid(struct kern_ipc_perm *ipcp, u32 *secid); @@ -1759,18 +1747,13 @@ static inline int security_capget(struct task_struct *target, return cap_capget(target, effective, inheritable, permitted); } -static inline int security_capset_check(const kernel_cap_t *effective, - const kernel_cap_t *inheritable, - const kernel_cap_t *permitted) +static inline int security_capset(struct cred *new, + const struct cred *old, + const kernel_cap_t *effective, + const kernel_cap_t *inheritable, + const kernel_cap_t *permitted) { - return cap_capset_check(effective, inheritable, permitted); -} - -static inline void security_capset_set(const kernel_cap_t *effective, - const kernel_cap_t *inheritable, - const kernel_cap_t *permitted) -{ - cap_capset_set(effective, inheritable, permitted); + return cap_capset(new, old, effective, inheritable, permitted); } static inline int security_capable(struct task_struct *tsk, int cap) @@ -1837,9 +1820,9 @@ static inline int security_bprm_alloc(struct linux_binprm *bprm) static inline void security_bprm_free(struct linux_binprm *bprm) { } -static inline void security_bprm_apply_creds(struct linux_binprm *bprm, int unsafe) +static inline int security_bprm_apply_creds(struct linux_binprm *bprm, int unsafe) { - cap_bprm_apply_creds(bprm, unsafe); + return cap_bprm_apply_creds(bprm, unsafe); } static inline void security_bprm_post_apply_creds(struct linux_binprm *bprm) @@ -2182,13 +2165,20 @@ static inline int security_task_create(unsigned long clone_flags) return 0; } -static inline int security_cred_alloc(struct cred *cred) +static inline void security_cred_free(struct cred *cred) +{ } + +static inline int security_prepare_creds(struct cred *new, + const struct cred *old, + gfp_t gfp) { return 0; } -static inline void security_cred_free(struct cred *cred) -{ } +static inline void security_commit_creds(struct cred *new, + const struct cred *old) +{ +} static inline int security_task_setuid(uid_t id0, uid_t id1, uid_t id2, int flags) @@ -2196,10 +2186,11 @@ static inline int security_task_setuid(uid_t id0, uid_t id1, uid_t id2, return 0; } -static inline int security_task_post_setuid(uid_t old_ruid, uid_t old_euid, - uid_t old_suid, int flags) +static inline int security_task_fix_setuid(struct cred *new, + const struct cred *old, + int flags) { - return cap_task_post_setuid(old_ruid, old_euid, old_suid, flags); + return cap_task_fix_setuid(new, old, flags); } static inline int security_task_setgid(gid_t id0, gid_t id1, gid_t id2, @@ -2286,14 +2277,9 @@ static inline int security_task_wait(struct task_struct *p) static inline int security_task_prctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, - unsigned long arg5, long *rc_p) -{ - return cap_task_prctl(option, arg2, arg3, arg3, arg5, rc_p); -} - -static inline void security_task_reparent_to_init(struct task_struct *p) + unsigned long arg5) { - cap_task_reparent_to_init(p); + return cap_task_prctl(option, arg2, arg3, arg3, arg5); } static inline void security_task_to_inode(struct task_struct *p, struct inode *inode) @@ -2719,16 +2705,16 @@ static inline void security_skb_classify_flow(struct sk_buff *skb, struct flowi #ifdef CONFIG_KEYS #ifdef CONFIG_SECURITY -int security_key_alloc(struct key *key, struct task_struct *tsk, unsigned long flags); +int security_key_alloc(struct key *key, const struct cred *cred, unsigned long flags); void security_key_free(struct key *key); int security_key_permission(key_ref_t key_ref, - struct task_struct *context, key_perm_t perm); + const struct cred *cred, key_perm_t perm); int security_key_getsecurity(struct key *key, char **_buffer); #else static inline int security_key_alloc(struct key *key, - struct task_struct *tsk, + const struct cred *cred, unsigned long flags) { return 0; @@ -2739,7 +2725,7 @@ static inline void security_key_free(struct key *key) } static inline int security_key_permission(key_ref_t key_ref, - struct task_struct *context, + const struct cred *cred, key_perm_t perm) { return 0; diff --git a/init/main.c b/init/main.c index 7e117a231af..db843bff573 100644 --- a/init/main.c +++ b/init/main.c @@ -669,6 +669,7 @@ asmlinkage void __init start_kernel(void) efi_enter_virtual_mode(); #endif thread_info_cache_init(); + cred_init(); fork_init(num_physpages); proc_caches_init(); buffer_init(); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index ae8ef88ade3..bc1e2d854bf 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2546,18 +2546,17 @@ int __audit_signal_info(int sig, struct task_struct *t) /** * __audit_log_bprm_fcaps - store information about a loading bprm and relevant fcaps - * @bprm pointer to the bprm being processed - * @caps the caps read from the disk + * @bprm: pointer to the bprm being processed + * @new: the proposed new credentials + * @old: the old credentials * * Simply check if the proc already has the caps given by the file and if not * store the priv escalation info for later auditing at the end of the syscall * - * this can fail and we don't care. See the note in audit.h for - * audit_log_bprm_fcaps() for my explaination.... - * * -Eric */ -void __audit_log_bprm_fcaps(struct linux_binprm *bprm, kernel_cap_t *pP, kernel_cap_t *pE) +int __audit_log_bprm_fcaps(struct linux_binprm *bprm, + const struct cred *new, const struct cred *old) { struct audit_aux_data_bprm_fcaps *ax; struct audit_context *context = current->audit_context; @@ -2566,7 +2565,7 @@ void __audit_log_bprm_fcaps(struct linux_binprm *bprm, kernel_cap_t *pP, kernel_ ax = kmalloc(sizeof(*ax), GFP_KERNEL); if (!ax) - return; + return -ENOMEM; ax->d.type = AUDIT_BPRM_FCAPS; ax->d.next = context->aux; @@ -2581,26 +2580,27 @@ void __audit_log_bprm_fcaps(struct linux_binprm *bprm, kernel_cap_t *pP, kernel_ ax->fcap.fE = !!(vcaps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE); ax->fcap_ver = (vcaps.magic_etc & VFS_CAP_REVISION_MASK) >> VFS_CAP_REVISION_SHIFT; - ax->old_pcap.permitted = *pP; - ax->old_pcap.inheritable = current->cred->cap_inheritable; - ax->old_pcap.effective = *pE; + ax->old_pcap.permitted = old->cap_permitted; + ax->old_pcap.inheritable = old->cap_inheritable; + ax->old_pcap.effective = old->cap_effective; - ax->new_pcap.permitted = current->cred->cap_permitted; - ax->new_pcap.inheritable = current->cred->cap_inheritable; - ax->new_pcap.effective = current->cred->cap_effective; + ax->new_pcap.permitted = new->cap_permitted; + ax->new_pcap.inheritable = new->cap_inheritable; + ax->new_pcap.effective = new->cap_effective; + return 0; } /** * __audit_log_capset - store information about the arguments to the capset syscall - * @pid target pid of the capset call - * @eff effective cap set - * @inh inheritible cap set - * @perm permited cap set + * @pid: target pid of the capset call + * @new: the new credentials + * @old: the old (current) credentials * * Record the aguments userspace sent to sys_capset for later printing by the * audit system if applicable */ -int __audit_log_capset(pid_t pid, kernel_cap_t *eff, kernel_cap_t *inh, kernel_cap_t *perm) +int __audit_log_capset(pid_t pid, + const struct cred *new, const struct cred *old) { struct audit_aux_data_capset *ax; struct audit_context *context = current->audit_context; @@ -2617,9 +2617,9 @@ int __audit_log_capset(pid_t pid, kernel_cap_t *eff, kernel_cap_t *inh, kernel_c context->aux = (void *)ax; ax->pid = pid; - ax->cap.effective = *eff; - ax->cap.inheritable = *eff; - ax->cap.permitted = *perm; + ax->cap.effective = new->cap_effective; + ax->cap.inheritable = new->cap_effective; + ax->cap.permitted = new->cap_permitted; return 0; } diff --git a/kernel/capability.c b/kernel/capability.c index a404b980b1b..36b4b4daebe 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -15,12 +15,7 @@ #include #include #include - -/* - * This lock protects task->cap_* for all tasks including current. - * Locking rule: acquire this prior to tasklist_lock. - */ -static DEFINE_SPINLOCK(task_capability_lock); +#include "cred-internals.h" /* * Leveraged for setting/resetting capabilities @@ -128,12 +123,11 @@ static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy) } /* - * If we have configured with filesystem capability support, then the - * only thing that can change the capabilities of the current process - * is the current process. As such, we can't be in this code at the - * same time as we are in the process of setting capabilities in this - * process. The net result is that we can limit our use of locks to - * when we are reading the caps of another process. + * The only thing that can change the capabilities of the current + * process is the current process. As such, we can't be in this code + * at the same time as we are in the process of setting capabilities + * in this process. The net result is that we can limit our use of + * locks to when we are reading the caps of another process. */ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp, kernel_cap_t *pIp, kernel_cap_t *pPp) @@ -143,7 +137,6 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp, if (pid && (pid != task_pid_vnr(current))) { struct task_struct *target; - spin_lock(&task_capability_lock); read_lock(&tasklist_lock); target = find_task_by_vpid(pid); @@ -153,34 +146,12 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp, ret = security_capget(target, pEp, pIp, pPp); read_unlock(&tasklist_lock); - spin_unlock(&task_capability_lock); } else ret = security_capget(current, pEp, pIp, pPp); return ret; } -/* - * Atomically modify the effective capabilities returning the original - * value. No permission check is performed here - it is assumed that the - * caller is permitted to set the desired effective capabilities. - */ -kernel_cap_t cap_set_effective(const kernel_cap_t pE_new) -{ - kernel_cap_t pE_old; - - spin_lock(&task_capability_lock); - - pE_old = current->cred->cap_effective; - current->cred->cap_effective = pE_new; - - spin_unlock(&task_capability_lock); - - return pE_old; -} - -EXPORT_SYMBOL(cap_set_effective); - /** * sys_capget - get the capabilities of a given process. * @header: pointer to struct that contains capability version and @@ -208,7 +179,6 @@ asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr) return -EINVAL; ret = cap_get_target_pid(pid, &pE, &pI, &pP); - if (!ret) { struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S]; unsigned i; @@ -270,6 +240,7 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data) struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S]; unsigned i, tocopy; kernel_cap_t inheritable, permitted, effective; + struct cred *new; int ret; pid_t pid; @@ -284,8 +255,8 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data) if (pid != 0 && pid != task_pid_vnr(current)) return -EPERM; - if (copy_from_user(&kdata, data, tocopy - * sizeof(struct __user_cap_data_struct))) + if (copy_from_user(&kdata, data, + tocopy * sizeof(struct __user_cap_data_struct))) return -EFAULT; for (i = 0; i < tocopy; i++) { @@ -300,24 +271,23 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data) i++; } - ret = audit_log_capset(pid, &effective, &inheritable, &permitted); - if (ret) + new = prepare_creds(); + if (!new) + return -ENOMEM; + + ret = security_capset(new, current_cred(), + &effective, &inheritable, &permitted); + if (ret < 0) + goto error; + + ret = audit_log_capset(pid, new, current_cred()); + if (ret < 0) return ret; - /* This lock is required even when filesystem capability support is - * configured - it protects the sys_capget() call from returning - * incorrect data in the case that the targeted process is not the - * current one. - */ - spin_lock(&task_capability_lock); - - ret = security_capset_check(&effective, &inheritable, &permitted); - /* Having verified that the proposed changes are legal, we now put them - * into effect. - */ - if (!ret) - security_capset_set(&effective, &inheritable, &permitted); - spin_unlock(&task_capability_lock); + return commit_creds(new); + +error: + abort_creds(new); return ret; } diff --git a/kernel/cred-internals.h b/kernel/cred-internals.h new file mode 100644 index 00000000000..2dc4fc2d0bf --- /dev/null +++ b/kernel/cred-internals.h @@ -0,0 +1,21 @@ +/* Internal credentials stuff + * + * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +/* + * user.c + */ +static inline void sched_switch_user(struct task_struct *p) +{ +#ifdef CONFIG_USER_SCHED + sched_move_task(p); +#endif /* CONFIG_USER_SCHED */ +} + diff --git a/kernel/cred.c b/kernel/cred.c index ac73e361768..cb6b5eda978 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -15,6 +15,10 @@ #include #include #include +#include +#include "cred-internals.h" + +static struct kmem_cache *cred_jar; /* * The common credentials for the initial task's thread group @@ -64,7 +68,7 @@ static void release_tgcred_rcu(struct rcu_head *rcu) /* * Release a set of thread group credentials. */ -static void release_tgcred(struct cred *cred) +void release_tgcred(struct cred *cred) { #ifdef CONFIG_KEYS struct thread_group_cred *tgcred = cred->tgcred; @@ -81,79 +85,322 @@ static void put_cred_rcu(struct rcu_head *rcu) { struct cred *cred = container_of(rcu, struct cred, rcu); - BUG_ON(atomic_read(&cred->usage) != 0); + if (atomic_read(&cred->usage) != 0) + panic("CRED: put_cred_rcu() sees %p with usage %d\n", + cred, atomic_read(&cred->usage)); + security_cred_free(cred); key_put(cred->thread_keyring); key_put(cred->request_key_auth); release_tgcred(cred); put_group_info(cred->group_info); free_uid(cred->user); - security_cred_free(cred); - kfree(cred); + kmem_cache_free(cred_jar, cred); } /** * __put_cred - Destroy a set of credentials - * @sec: The record to release + * @cred: The record to release * * Destroy a set of credentials on which no references remain. */ void __put_cred(struct cred *cred) { + BUG_ON(atomic_read(&cred->usage) != 0); + call_rcu(&cred->rcu, put_cred_rcu); } EXPORT_SYMBOL(__put_cred); +/** + * prepare_creds - Prepare a new set of credentials for modification + * + * Prepare a new set of task credentials for modification. A task's creds + * shouldn't generally be modified directly, therefore this function is used to + * prepare a new copy, which the caller then modifies and then commits by + * calling commit_creds(). + * + * Returns a pointer to the new creds-to-be if successful, NULL otherwise. + * + * Call commit_creds() or abort_creds() to clean up. + */ +struct cred *prepare_creds(void) +{ + struct task_struct *task = current; + const struct cred *old; + struct cred *new; + + BUG_ON(atomic_read(&task->cred->usage) < 1); + + new = kmem_cache_alloc(cred_jar, GFP_KERNEL); + if (!new) + return NULL; + + old = task->cred; + memcpy(new, old, sizeof(struct cred)); + + atomic_set(&new->usage, 1); + get_group_info(new->group_info); + get_uid(new->user); + +#ifdef CONFIG_KEYS + key_get(new->thread_keyring); + key_get(new->request_key_auth); + atomic_inc(&new->tgcred->usage); +#endif + +#ifdef CONFIG_SECURITY + new->security = NULL; +#endif + + if (security_prepare_creds(new, old, GFP_KERNEL) < 0) + goto error; + return new; + +error: + abort_creds(new); + return NULL; +} +EXPORT_SYMBOL(prepare_creds); + +/* + * prepare new credentials for the usermode helper dispatcher + */ +struct cred *prepare_usermodehelper_creds(void) +{ +#ifdef CONFIG_KEYS + struct thread_group_cred *tgcred = NULL; +#endif + struct cred *new; + +#ifdef CONFIG_KEYS + tgcred = kzalloc(sizeof(*new->tgcred), GFP_ATOMIC); + if (!tgcred) + return NULL; +#endif + + new = kmem_cache_alloc(cred_jar, GFP_ATOMIC); + if (!new) + return NULL; + + memcpy(new, &init_cred, sizeof(struct cred)); + + atomic_set(&new->usage, 1); + get_group_info(new->group_info); + get_uid(new->user); + +#ifdef CONFIG_KEYS + new->thread_keyring = NULL; + new->request_key_auth = NULL; + new->jit_keyring = KEY_REQKEY_DEFL_DEFAULT; + + atomic_set(&tgcred->usage, 1); + spin_lock_init(&tgcred->lock); + new->tgcred = tgcred; +#endif + +#ifdef CONFIG_SECURITY + new->security = NULL; +#endif + if (security_prepare_creds(new, &init_cred, GFP_ATOMIC) < 0) + goto error; + + BUG_ON(atomic_read(&new->usage) != 1); + return new; + +error: + put_cred(new); + return NULL; +} + /* * Copy credentials for the new process created by fork() + * + * We share if we can, but under some circumstances we have to generate a new + * set. */ int copy_creds(struct task_struct *p, unsigned long clone_flags) { - struct cred *pcred; - int ret; +#ifdef CONFIG_KEYS + struct thread_group_cred *tgcred; +#endif + struct cred *new; + + mutex_init(&p->cred_exec_mutex); - pcred = kmemdup(p->cred, sizeof(*p->cred), GFP_KERNEL); - if (!pcred) + if ( +#ifdef CONFIG_KEYS + !p->cred->thread_keyring && +#endif + clone_flags & CLONE_THREAD + ) { + get_cred(p->cred); + atomic_inc(&p->cred->user->processes); + return 0; + } + + new = prepare_creds(); + if (!new) return -ENOMEM; #ifdef CONFIG_KEYS - if (clone_flags & CLONE_THREAD) { - atomic_inc(&pcred->tgcred->usage); - } else { - pcred->tgcred = kmalloc(sizeof(struct cred), GFP_KERNEL); - if (!pcred->tgcred) { - kfree(pcred); + /* new threads get their own thread keyrings if their parent already + * had one */ + if (new->thread_keyring) { + key_put(new->thread_keyring); + new->thread_keyring = NULL; + if (clone_flags & CLONE_THREAD) + install_thread_keyring_to_cred(new); + } + + /* we share the process and session keyrings between all the threads in + * a process - this is slightly icky as we violate COW credentials a + * bit */ + if (!(clone_flags & CLONE_THREAD)) { + tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL); + if (!tgcred) { + put_cred(new); return -ENOMEM; } - atomic_set(&pcred->tgcred->usage, 1); - spin_lock_init(&pcred->tgcred->lock); - pcred->tgcred->process_keyring = NULL; - pcred->tgcred->session_keyring = - key_get(p->cred->tgcred->session_keyring); + atomic_set(&tgcred->usage, 1); + spin_lock_init(&tgcred->lock); + tgcred->process_keyring = NULL; + tgcred->session_keyring = key_get(new->tgcred->session_keyring); + + release_tgcred(new); + new->tgcred = tgcred; } #endif -#ifdef CONFIG_SECURITY - pcred->security = NULL; -#endif + atomic_inc(&new->user->processes); + p->cred = new; + return 0; +} - ret = security_cred_alloc(pcred); - if (ret < 0) { - release_tgcred(pcred); - kfree(pcred); - return ret; +/** + * commit_creds - Install new credentials upon the current task + * @new: The credentials to be assigned + * + * Install a new set of credentials to the current task, using RCU to replace + * the old set. + * + * This function eats the caller's reference to the new credentials. + * + * Always returns 0 thus allowing this function to be tail-called at the end + * of, say, sys_setgid(). + */ +int commit_creds(struct cred *new) +{ + struct task_struct *task = current; + const struct cred *old; + + BUG_ON(atomic_read(&new->usage) < 1); + BUG_ON(atomic_read(&task->cred->usage) < 1); + + old = task->cred; + security_commit_creds(new, old); + + /* dumpability changes */ + if (old->euid != new->euid || + old->egid != new->egid || + old->fsuid != new->fsuid || + old->fsgid != new->fsgid || + !cap_issubset(new->cap_permitted, old->cap_permitted)) { + set_dumpable(task->mm, suid_dumpable); + task->pdeath_signal = 0; + smp_wmb(); } - atomic_set(&pcred->usage, 1); - get_group_info(pcred->group_info); - get_uid(pcred->user); - key_get(pcred->thread_keyring); - key_get(pcred->request_key_auth); + /* alter the thread keyring */ + if (new->fsuid != old->fsuid) + key_fsuid_changed(task); + if (new->fsgid != old->fsgid) + key_fsgid_changed(task); + + /* do it + * - What if a process setreuid()'s and this brings the + * new uid over his NPROC rlimit? We can check this now + * cheaply with the new uid cache, so if it matters + * we should be checking for it. -DaveM + */ + if (new->user != old->user) + atomic_inc(&new->user->processes); + rcu_assign_pointer(task->cred, new); + if (new->user != old->user) + atomic_dec(&old->user->processes); + + sched_switch_user(task); + + /* send notifications */ + if (new->uid != old->uid || + new->euid != old->euid || + new->suid != old->suid || + new->fsuid != old->fsuid) + proc_id_connector(task, PROC_EVENT_UID); - atomic_inc(&pcred->user->processes); + if (new->gid != old->gid || + new->egid != old->egid || + new->sgid != old->sgid || + new->fsgid != old->fsgid) + proc_id_connector(task, PROC_EVENT_GID); - /* RCU assignment is unneeded here as no-one can have accessed this - * pointer yet, barring us */ - p->cred = pcred; + put_cred(old); return 0; } +EXPORT_SYMBOL(commit_creds); + +/** + * abort_creds - Discard a set of credentials and unlock the current task + * @new: The credentials that were going to be applied + * + * Discard a set of credentials that were under construction and unlock the + * current task. + */ +void abort_creds(struct cred *new) +{ + BUG_ON(atomic_read(&new->usage) < 1); + put_cred(new); +} +EXPORT_SYMBOL(abort_creds); + +/** + * override_creds - Temporarily override the current process's credentials + * @new: The credentials to be assigned + * + * Install a set of temporary override credentials on the current process, + * returning the old set for later reversion. + */ +const struct cred *override_creds(const struct cred *new) +{ + const struct cred *old = current->cred; + + rcu_assign_pointer(current->cred, get_cred(new)); + return old; +} +EXPORT_SYMBOL(override_creds); + +/** + * revert_creds - Revert a temporary credentials override + * @old: The credentials to be restored + * + * Revert a temporary set of override credentials to an old set, discarding the + * override set. + */ +void revert_creds(const struct cred *old) +{ + const struct cred *override = current->cred; + + rcu_assign_pointer(current->cred, old); + put_cred(override); +} +EXPORT_SYMBOL(revert_creds); + +/* + * initialise the credentials stuff + */ +void __init cred_init(void) +{ + /* allocate a slab in which we can store credentials */ + cred_jar = kmem_cache_create("cred_jar", sizeof(struct cred), + 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); +} diff --git a/kernel/exit.c b/kernel/exit.c index bbc22530f2c..c0711da1548 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -47,12 +47,14 @@ #include #include #include +#include #include #include #include #include #include +#include "cred-internals.h" static void exit_mm(struct task_struct * tsk); @@ -338,12 +340,12 @@ static void reparent_to_kthreadd(void) /* cpus_allowed? */ /* rt_priority? */ /* signals? */ - security_task_reparent_to_init(current); memcpy(current->signal->rlim, init_task.signal->rlim, sizeof(current->signal->rlim)); - atomic_inc(&(INIT_USER->__count)); + + atomic_inc(&init_cred.usage); + commit_creds(&init_cred); write_unlock_irq(&tasklist_lock); - switch_uid(INIT_USER); } void __set_special_pids(struct pid *pid) @@ -1085,7 +1087,6 @@ NORET_TYPE void do_exit(long code) check_stack_usage(); exit_thread(); cgroup_exit(tsk, 1); - exit_keys(tsk); if (group_dead && tsk->signal->leader) disassociate_ctty(1); diff --git a/kernel/fork.c b/kernel/fork.c index ded1972672a..82a7948a664 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1084,10 +1084,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, goto bad_fork_cleanup_sighand; if ((retval = copy_mm(clone_flags, p))) goto bad_fork_cleanup_signal; - if ((retval = copy_keys(clone_flags, p))) - goto bad_fork_cleanup_mm; if ((retval = copy_namespaces(clone_flags, p))) - goto bad_fork_cleanup_keys; + goto bad_fork_cleanup_mm; if ((retval = copy_io(clone_flags, p))) goto bad_fork_cleanup_namespaces; retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs); @@ -1252,8 +1250,6 @@ bad_fork_cleanup_io: put_io_context(p->io_context); bad_fork_cleanup_namespaces: exit_task_namespaces(p); -bad_fork_cleanup_keys: - exit_keys(p); bad_fork_cleanup_mm: if (p->mm) mmput(p->mm); @@ -1281,6 +1277,7 @@ bad_fork_cleanup_cgroup: bad_fork_cleanup_put_domain: module_put(task_thread_info(p)->exec_domain->module); bad_fork_cleanup_count: + atomic_dec(&p->cred->user->processes); put_cred(p->cred); bad_fork_free: free_task(p); diff --git a/kernel/kmod.c b/kernel/kmod.c index f044f8f5770..b46dbb90866 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -118,10 +118,10 @@ EXPORT_SYMBOL(request_module); struct subprocess_info { struct work_struct work; struct completion *complete; + struct cred *cred; char *path; char **argv; char **envp; - struct key *ring; enum umh_wait wait; int retval; struct file *stdin; @@ -134,19 +134,20 @@ struct subprocess_info { static int ____call_usermodehelper(void *data) { struct subprocess_info *sub_info = data; - struct key *new_session, *old_session; int retval; - /* Unblock all signals and set the session keyring. */ - new_session = key_get(sub_info->ring); + BUG_ON(atomic_read(&sub_info->cred->usage) != 1); + + /* Unblock all signals */ spin_lock_irq(¤t->sighand->siglock); - old_session = __install_session_keyring(new_session); flush_signal_handlers(current, 1); sigemptyset(¤t->blocked); recalc_sigpending(); spin_unlock_irq(¤t->sighand->siglock); - key_put(old_session); + /* Install the credentials */ + commit_creds(sub_info->cred); + sub_info->cred = NULL; /* Install input pipe when needed */ if (sub_info->stdin) { @@ -185,6 +186,8 @@ void call_usermodehelper_freeinfo(struct subprocess_info *info) { if (info->cleanup) (*info->cleanup)(info->argv, info->envp); + if (info->cred) + put_cred(info->cred); kfree(info); } EXPORT_SYMBOL(call_usermodehelper_freeinfo); @@ -240,6 +243,8 @@ static void __call_usermodehelper(struct work_struct *work) pid_t pid; enum umh_wait wait = sub_info->wait; + BUG_ON(atomic_read(&sub_info->cred->usage) != 1); + /* CLONE_VFORK: wait until the usermode helper has execve'd * successfully We need the data structures to stay around * until that is done. */ @@ -362,6 +367,9 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv, sub_info->path = path; sub_info->argv = argv; sub_info->envp = envp; + sub_info->cred = prepare_usermodehelper_creds(); + if (!sub_info->cred) + return NULL; out: return sub_info; @@ -376,7 +384,13 @@ EXPORT_SYMBOL(call_usermodehelper_setup); void call_usermodehelper_setkeys(struct subprocess_info *info, struct key *session_keyring) { - info->ring = session_keyring; +#ifdef CONFIG_KEYS + struct thread_group_cred *tgcred = info->cred->tgcred; + key_put(tgcred->session_keyring); + tgcred->session_keyring = key_get(session_keyring); +#else + BUG(); +#endif } EXPORT_SYMBOL(call_usermodehelper_setkeys); @@ -444,6 +458,8 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, DECLARE_COMPLETION_ONSTACK(done); int retval = 0; + BUG_ON(atomic_read(&sub_info->cred->usage) != 1); + helper_lock(); if (sub_info->path[0] == '\0') goto out; diff --git a/kernel/ptrace.c b/kernel/ptrace.c index b9d5f4e4f6a..f764b880695 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -171,6 +171,14 @@ int ptrace_attach(struct task_struct *task) if (same_thread_group(task, current)) goto out; + /* Protect exec's credential calculations against our interference; + * SUID, SGID and LSM creds get determined differently under ptrace. + */ + retval = mutex_lock_interruptible(¤t->cred_exec_mutex); + if (retval < 0) + goto out; + + retval = -EPERM; repeat: /* * Nasty, nasty. @@ -210,6 +218,7 @@ repeat: bad: write_unlock_irqrestore(&tasklist_lock, flags); task_unlock(task); + mutex_unlock(¤t->cred_exec_mutex); out: return retval; } diff --git a/kernel/signal.c b/kernel/signal.c index 84989124baf..2a64304ed54 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -180,7 +180,7 @@ int next_signal(struct sigpending *pending, sigset_t *mask) /* * allocate a new signal queue record * - this may be called without locks if and only if t == current, otherwise an - * appopriate lock must be held to protect t's user_struct + * appopriate lock must be held to stop the target task from exiting */ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags, int override_rlimit) @@ -194,7 +194,7 @@ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags, * caller must be holding the RCU readlock (by way of a spinlock) and * we use RCU protection here */ - user = __task_cred(t)->user; + user = get_uid(__task_cred(t)->user); atomic_inc(&user->sigpending); if (override_rlimit || atomic_read(&user->sigpending) <= @@ -202,12 +202,14 @@ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags, q = kmem_cache_alloc(sigqueue_cachep, flags); if (unlikely(q == NULL)) { atomic_dec(&user->sigpending); + free_uid(user); } else { INIT_LIST_HEAD(&q->list); q->flags = 0; - q->user = get_uid(user); + q->user = user; } - return(q); + + return q; } static void __sigqueue_free(struct sigqueue *q) diff --git a/kernel/sys.c b/kernel/sys.c index ccc9eb736d3..ab735040468 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -180,7 +180,7 @@ asmlinkage long sys_setpriority(int which, int who, int niceval) } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); break; case PRIO_USER: - user = cred->user; + user = (struct user_struct *) cred->user; if (!who) who = cred->uid; else if ((who != cred->uid) && @@ -479,47 +479,48 @@ void ctrl_alt_del(void) */ asmlinkage long sys_setregid(gid_t rgid, gid_t egid) { - struct cred *cred = current->cred; - int old_rgid = cred->gid; - int old_egid = cred->egid; - int new_rgid = old_rgid; - int new_egid = old_egid; + const struct cred *old; + struct cred *new; int retval; + new = prepare_creds(); + if (!new) + return -ENOMEM; + old = current_cred(); + retval = security_task_setgid(rgid, egid, (gid_t)-1, LSM_SETID_RE); if (retval) - return retval; + goto error; + retval = -EPERM; if (rgid != (gid_t) -1) { - if ((old_rgid == rgid) || - (cred->egid == rgid) || + if (old->gid == rgid || + old->egid == rgid || capable(CAP_SETGID)) - new_rgid = rgid; + new->gid = rgid; else - return -EPERM; + goto error; } if (egid != (gid_t) -1) { - if ((old_rgid == egid) || - (cred->egid == egid) || - (cred->sgid == egid) || + if (old->gid == egid || + old->egid == egid || + old->sgid == egid || capable(CAP_SETGID)) - new_egid = egid; + new->egid = egid; else - return -EPERM; - } - if (new_egid != old_egid) { - set_dumpable(current->mm, suid_dumpable); - smp_wmb(); + goto error; } + if (rgid != (gid_t) -1 || - (egid != (gid_t) -1 && egid != old_rgid)) - cred->sgid = new_egid; - cred->fsgid = new_egid; - cred->egid = new_egid; - cred->gid = new_rgid; - key_fsgid_changed(current); - proc_id_connector(current, PROC_EVENT_GID); - return 0; + (egid != (gid_t) -1 && egid != old->gid)) + new->sgid = new->egid; + new->fsgid = new->egid; + + return commit_creds(new); + +error: + abort_creds(new); + return retval; } /* @@ -529,40 +530,42 @@ asmlinkage long sys_setregid(gid_t rgid, gid_t egid) */ asmlinkage long sys_setgid(gid_t gid) { - struct cred *cred = current->cred; - int old_egid = cred->egid; + const struct cred *old; + struct cred *new; int retval; + new = prepare_creds(); + if (!new) + return -ENOMEM; + old = current_cred(); + retval = security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_ID); if (retval) - return retval; + goto error; - if (capable(CAP_SETGID)) { - if (old_egid != gid) { - set_dumpable(current->mm, suid_dumpable); - smp_wmb(); - } - cred->gid = cred->egid = cred->sgid = cred->fsgid = gid; - } else if ((gid == cred->gid) || (gid == cred->sgid)) { - if (old_egid != gid) { - set_dumpable(current->mm, suid_dumpable); - smp_wmb(); - } - cred->egid = cred->fsgid = gid; - } + retval = -EPERM; + if (capable(CAP_SETGID)) + new->gid = new->egid = new->sgid = new->fsgid = gid; + else if (gid == old->gid || gid == old->sgid) + new->egid = new->fsgid = gid; else - return -EPERM; + goto error; - key_fsgid_changed(current); - proc_id_connector(current, PROC_EVENT_GID); - return 0; + return commit_creds(new); + +error: + abort_creds(new); + return retval; } -static int set_user(uid_t new_ruid, int dumpclear) +/* + * change the user struct in a credentials set to match the new UID + */ +static int set_user(struct cred *new) { struct user_struct *new_user; - new_user = alloc_uid(current->nsproxy->user_ns, new_ruid); + new_user = alloc_uid(current->nsproxy->user_ns, new->uid); if (!new_user) return -EAGAIN; @@ -573,13 +576,8 @@ static int set_user(uid_t new_ruid, int dumpclear) return -EAGAIN; } - switch_uid(new_user); - - if (dumpclear) { - set_dumpable(current->mm, suid_dumpable); - smp_wmb(); - } - current->cred->uid = new_ruid; + free_uid(new->user); + new->user = new_user; return 0; } @@ -600,55 +598,56 @@ static int set_user(uid_t new_ruid, int dumpclear) */ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid) { - struct cred *cred = current->cred; - int old_ruid, old_euid, old_suid, new_ruid, new_euid; + const struct cred *old; + struct cred *new; int retval; + new = prepare_creds(); + if (!new) + return -ENOMEM; + old = current_cred(); + retval = security_task_setuid(ruid, euid, (uid_t)-1, LSM_SETID_RE); if (retval) - return retval; - - new_ruid = old_ruid = cred->uid; - new_euid = old_euid = cred->euid; - old_suid = cred->suid; + goto error; + retval = -EPERM; if (ruid != (uid_t) -1) { - new_ruid = ruid; - if ((old_ruid != ruid) && - (cred->euid != ruid) && + new->uid = ruid; + if (old->uid != ruid && + old->euid != ruid && !capable(CAP_SETUID)) - return -EPERM; + goto error; } if (euid != (uid_t) -1) { - new_euid = euid; - if ((old_ruid != euid) && - (cred->euid != euid) && - (cred->suid != euid) && + new->euid = euid; + if (old->uid != euid && + old->euid != euid && + old->suid != euid && !capable(CAP_SETUID)) - return -EPERM; + goto error; } - if (new_ruid != old_ruid && set_user(new_ruid, new_euid != old_euid) < 0) - return -EAGAIN; + retval = -EAGAIN; + if (new->uid != old->uid && set_user(new) < 0) + goto error; - if (new_euid != old_euid) { - set_dumpable(current->mm, suid_dumpable); - smp_wmb(); - } - cred->fsuid = cred->euid = new_euid; if (ruid != (uid_t) -1 || - (euid != (uid_t) -1 && euid != old_ruid)) - cred->suid = cred->euid; - cred->fsuid = cred->euid; - - key_fsuid_changed(current); - proc_id_connector(current, PROC_EVENT_UID); + (euid != (uid_t) -1 && euid != old->uid)) + new->suid = new->euid; + new->fsuid = new->euid; - return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_RE); -} + retval = security_task_fix_setuid(new, old, LSM_SETID_RE); + if (retval < 0) + goto error; + return commit_creds(new); +error: + abort_creds(new); + return retval; +} /* * setuid() is implemented like SysV with SAVED_IDS @@ -663,37 +662,41 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid) */ asmlinkage long sys_setuid(uid_t uid) { - struct cred *cred = current->cred; - int old_euid = cred->euid; - int old_ruid, old_suid, new_suid; + const struct cred *old; + struct cred *new; int retval; + new = prepare_creds(); + if (!new) + return -ENOMEM; + old = current_cred(); + retval = security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_ID); if (retval) - return retval; + goto error; - old_ruid = cred->uid; - old_suid = cred->suid; - new_suid = old_suid; - + retval = -EPERM; if (capable(CAP_SETUID)) { - if (uid != old_ruid && set_user(uid, old_euid != uid) < 0) - return -EAGAIN; - new_suid = uid; - } else if ((uid != cred->uid) && (uid != new_suid)) - return -EPERM; - - if (old_euid != uid) { - set_dumpable(current->mm, suid_dumpable); - smp_wmb(); + new->suid = new->uid = uid; + if (uid != old->uid && set_user(new) < 0) { + retval = -EAGAIN; + goto error; + } + } else if (uid != old->uid && uid != new->suid) { + goto error; } - cred->fsuid = cred->euid = uid; - cred->suid = new_suid; - key_fsuid_changed(current); - proc_id_connector(current, PROC_EVENT_UID); + new->fsuid = new->euid = uid; + + retval = security_task_fix_setuid(new, old, LSM_SETID_ID); + if (retval < 0) + goto error; + + return commit_creds(new); - return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_ID); +error: + abort_creds(new); + return retval; } @@ -703,47 +706,53 @@ asmlinkage long sys_setuid(uid_t uid) */ asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) { - struct cred *cred = current->cred; - int old_ruid = cred->uid; - int old_euid = cred->euid; - int old_suid = cred->suid; + const struct cred *old; + struct cred *new; int retval; + new = prepare_creds(); + if (!new) + return -ENOMEM; + retval = security_task_setuid(ruid, euid, suid, LSM_SETID_RES); if (retval) - return retval; + goto error; + old = current_cred(); + retval = -EPERM; if (!capable(CAP_SETUID)) { - if ((ruid != (uid_t) -1) && (ruid != cred->uid) && - (ruid != cred->euid) && (ruid != cred->suid)) - return -EPERM; - if ((euid != (uid_t) -1) && (euid != cred->uid) && - (euid != cred->euid) && (euid != cred->suid)) - return -EPERM; - if ((suid != (uid_t) -1) && (suid != cred->uid) && - (suid != cred->euid) && (suid != cred->suid)) - return -EPERM; + if (ruid != (uid_t) -1 && ruid != old->uid && + ruid != old->euid && ruid != old->suid) + goto error; + if (euid != (uid_t) -1 && euid != old->uid && + euid != old->euid && euid != old->suid) + goto error; + if (suid != (uid_t) -1 && suid != old->uid && + suid != old->euid && suid != old->suid) + goto error; } + + retval = -EAGAIN; if (ruid != (uid_t) -1) { - if (ruid != cred->uid && - set_user(ruid, euid != cred->euid) < 0) - return -EAGAIN; + new->uid = ruid; + if (ruid != old->uid && set_user(new) < 0) + goto error; } - if (euid != (uid_t) -1) { - if (euid != cred->euid) { - set_dumpable(current->mm, suid_dumpable); - smp_wmb(); - } - cred->euid = euid; - } - cred->fsuid = cred->euid; + if (euid != (uid_t) -1) + new->euid = euid; if (suid != (uid_t) -1) - cred->suid = suid; + new->suid = suid; + new->fsuid = new->euid; - key_fsuid_changed(current); - proc_id_connector(current, PROC_EVENT_UID); + retval = security_task_fix_setuid(new, old, LSM_SETID_RES); + if (retval < 0) + goto error; - return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_RES); + return commit_creds(new); + +error: + abort_creds(new); + return retval; } asmlinkage long sys_getresuid(uid_t __user *ruid, uid_t __user *euid, uid_t __user *suid) @@ -763,40 +772,45 @@ asmlinkage long sys_getresuid(uid_t __user *ruid, uid_t __user *euid, uid_t __us */ asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid) { - struct cred *cred = current->cred; + const struct cred *old; + struct cred *new; int retval; + new = prepare_creds(); + if (!new) + return -ENOMEM; + old = current_cred(); + retval = security_task_setgid(rgid, egid, sgid, LSM_SETID_RES); if (retval) - return retval; + goto error; + retval = -EPERM; if (!capable(CAP_SETGID)) { - if ((rgid != (gid_t) -1) && (rgid != cred->gid) && - (rgid != cred->egid) && (rgid != cred->sgid)) - return -EPERM; - if ((egid != (gid_t) -1) && (egid != cred->gid) && - (egid != cred->egid) && (egid != cred->sgid)) - return -EPERM; - if ((sgid != (gid_t) -1) && (sgid != cred->gid) && - (sgid != cred->egid) && (sgid != cred->sgid)) - return -EPERM; + if (rgid != (gid_t) -1 && rgid != old->gid && + rgid != old->egid && rgid != old->sgid) + goto error; + if (egid != (gid_t) -1 && egid != old->gid && + egid != old->egid && egid != old->sgid) + goto error; + if (sgid != (gid_t) -1 && sgid != old->gid && + sgid != old->egid && sgid != old->sgid) + goto error; } - if (egid != (gid_t) -1) { - if (egid != cred->egid) { - set_dumpable(current->mm, suid_dumpable); - smp_wmb(); - } - cred->egid = egid; - } - cred->fsgid = cred->egid; + if (rgid != (gid_t) -1) - cred->gid = rgid; + new->gid = rgid; + if (egid != (gid_t) -1) + new->egid = egid; if (sgid != (gid_t) -1) - cred->sgid = sgid; + new->sgid = sgid; + new->fsgid = new->egid; - key_fsgid_changed(current); - proc_id_connector(current, PROC_EVENT_GID); - return 0; + return commit_creds(new); + +error: + abort_creds(new); + return retval; } asmlinkage long sys_getresgid(gid_t __user *rgid, gid_t __user *egid, gid_t __user *sgid) @@ -820,28 +834,35 @@ asmlinkage long sys_getresgid(gid_t __user *rgid, gid_t __user *egid, gid_t __us */ asmlinkage long sys_setfsuid(uid_t uid) { - struct cred *cred = current->cred; - int old_fsuid; + const struct cred *old; + struct cred *new; + uid_t old_fsuid; + + new = prepare_creds(); + if (!new) + return current_fsuid(); + old = current_cred(); + old_fsuid = old->fsuid; - old_fsuid = cred->fsuid; - if (security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS)) - return old_fsuid; + if (security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS) < 0) + goto error; - if (uid == cred->uid || uid == cred->euid || - uid == cred->suid || uid == cred->fsuid || + if (uid == old->uid || uid == old->euid || + uid == old->suid || uid == old->fsuid || capable(CAP_SETUID)) { if (uid != old_fsuid) { - set_dumpable(current->mm, suid_dumpable); - smp_wmb(); + new->fsuid = uid; + if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0) + goto change_okay; } - cred->fsuid = uid; } - key_fsuid_changed(current); - proc_id_connector(current, PROC_EVENT_UID); - - security_task_post_setuid(old_fsuid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS); +error: + abort_creds(new); + return old_fsuid; +change_okay: + commit_creds(new); return old_fsuid; } @@ -850,24 +871,34 @@ asmlinkage long sys_setfsuid(uid_t uid) */ asmlinkage long sys_setfsgid(gid_t gid) { - struct cred *cred = current->cred; - int old_fsgid; + const struct cred *old; + struct cred *new; + gid_t old_fsgid; + + new = prepare_creds(); + if (!new) + return current_fsgid(); + old = current_cred(); + old_fsgid = old->fsgid; - old_fsgid = cred->fsgid; if (security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_FS)) - return old_fsgid; + goto error; - if (gid == cred->gid || gid == cred->egid || - gid == cred->sgid || gid == cred->fsgid || + if (gid == old->gid || gid == old->egid || + gid == old->sgid || gid == old->fsgid || capable(CAP_SETGID)) { if (gid != old_fsgid) { - set_dumpable(current->mm, suid_dumpable); - smp_wmb(); + new->fsgid = gid; + goto change_okay; } - cred->fsgid = gid; - key_fsgid_changed(current); - proc_id_connector(current, PROC_EVENT_GID); } + +error: + abort_creds(new); + return old_fsgid; + +change_okay: + commit_creds(new); return old_fsgid; } @@ -1136,7 +1167,7 @@ EXPORT_SYMBOL(groups_free); /* export the group_info to a user-space array */ static int groups_to_user(gid_t __user *grouplist, - struct group_info *group_info) + const struct group_info *group_info) { int i; unsigned int count = group_info->ngroups; @@ -1227,31 +1258,25 @@ int groups_search(const struct group_info *group_info, gid_t grp) } /** - * set_groups - Change a group subscription in a security record - * @sec: The security record to alter - * @group_info: The group list to impose + * set_groups - Change a group subscription in a set of credentials + * @new: The newly prepared set of credentials to alter + * @group_info: The group list to install * - * Validate a group subscription and, if valid, impose it upon a task security - * record. + * Validate a group subscription and, if valid, insert it into a set + * of credentials. */ -int set_groups(struct cred *cred, struct group_info *group_info) +int set_groups(struct cred *new, struct group_info *group_info) { int retval; - struct group_info *old_info; retval = security_task_setgroups(group_info); if (retval) return retval; + put_group_info(new->group_info); groups_sort(group_info); get_group_info(group_info); - - spin_lock(&cred->lock); - old_info = cred->group_info; - cred->group_info = group_info; - spin_unlock(&cred->lock); - - put_group_info(old_info); + new->group_info = group_info; return 0; } @@ -1266,7 +1291,20 @@ EXPORT_SYMBOL(set_groups); */ int set_current_groups(struct group_info *group_info) { - return set_groups(current->cred, group_info); + struct cred *new; + int ret; + + new = prepare_creds(); + if (!new) + return -ENOMEM; + + ret = set_groups(new, group_info); + if (ret < 0) { + abort_creds(new); + return ret; + } + + return commit_creds(new); } EXPORT_SYMBOL(set_current_groups); @@ -1666,9 +1704,11 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, unsigned char comm[sizeof(me->comm)]; long error; - if (security_task_prctl(option, arg2, arg3, arg4, arg5, &error)) + error = security_task_prctl(option, arg2, arg3, arg4, arg5); + if (error != -ENOSYS) return error; + error = 0; switch (option) { case PR_SET_PDEATHSIG: if (!valid_signal(arg2)) { diff --git a/kernel/user.c b/kernel/user.c index 104d22ac84d..d476307dd4b 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -16,6 +16,7 @@ #include #include #include +#include "cred-internals.h" struct user_namespace init_user_ns = { .kref = { @@ -104,16 +105,10 @@ static int sched_create_user(struct user_struct *up) return rc; } -static void sched_switch_user(struct task_struct *p) -{ - sched_move_task(p); -} - #else /* CONFIG_USER_SCHED */ static void sched_destroy_user(struct user_struct *up) { } static int sched_create_user(struct user_struct *up) { return 0; } -static void sched_switch_user(struct task_struct *p) { } #endif /* CONFIG_USER_SCHED */ @@ -448,36 +443,6 @@ out_unlock: return NULL; } -void switch_uid(struct user_struct *new_user) -{ - struct user_struct *old_user; - - /* What if a process setreuid()'s and this brings the - * new uid over his NPROC rlimit? We can check this now - * cheaply with the new uid cache, so if it matters - * we should be checking for it. -DaveM - */ - old_user = current->cred->user; - atomic_inc(&new_user->processes); - atomic_dec(&old_user->processes); - switch_uid_keyring(new_user); - current->cred->user = new_user; - sched_switch_user(current); - - /* - * We need to synchronize with __sigqueue_alloc() - * doing a get_uid(p->user).. If that saw the old - * user value, we need to wait until it has exited - * its critical region before we can free the old - * structure. - */ - smp_mb(); - spin_unlock_wait(¤t->sighand->siglock); - - free_uid(old_user); - suid_keys(current); -} - #ifdef CONFIG_USER_NS void release_uids(struct user_namespace *ns) { diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index f82730adea0..0d9c51d6733 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -19,6 +19,7 @@ static struct user_namespace *clone_user_ns(struct user_namespace *old_ns) { struct user_namespace *ns; struct user_struct *new_user; + struct cred *new; int n; ns = kmalloc(sizeof(struct user_namespace), GFP_KERNEL); @@ -45,7 +46,16 @@ static struct user_namespace *clone_user_ns(struct user_namespace *old_ns) return ERR_PTR(-ENOMEM); } - switch_uid(new_user); + /* Install the new user */ + new = prepare_creds(); + if (!new) { + free_uid(new_user); + free_uid(ns->root_user); + kfree(ns); + } + free_uid(new->user); + new->user = new_user; + commit_creds(new); return ns; } diff --git a/lib/Makefile b/lib/Makefile index 7cb65d85aeb..80fe8a3ec12 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -11,7 +11,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \ rbtree.o radix-tree.o dump_stack.o \ idr.o int_sqrt.o extable.o prio_tree.o \ sha1.o irq_regs.o reciprocal_div.o argv_split.o \ - proportions.o prio_heap.o ratelimit.o show_mem.o + proportions.o prio_heap.o ratelimit.o show_mem.o is_single_threaded.o lib-$(CONFIG_MMU) += ioremap.o lib-$(CONFIG_SMP) += cpumask.o diff --git a/net/rxrpc/ar-key.c b/net/rxrpc/ar-key.c index 9a8ff684da7..ad8c7a782da 100644 --- a/net/rxrpc/ar-key.c +++ b/net/rxrpc/ar-key.c @@ -287,6 +287,7 @@ int rxrpc_get_server_data_key(struct rxrpc_connection *conn, time_t expiry, u32 kvno) { + const struct cred *cred = current_cred(); struct key *key; int ret; @@ -297,7 +298,7 @@ int rxrpc_get_server_data_key(struct rxrpc_connection *conn, _enter(""); - key = key_alloc(&key_type_rxrpc, "x", 0, 0, current, 0, + key = key_alloc(&key_type_rxrpc, "x", 0, 0, cred, 0, KEY_ALLOC_NOT_IN_QUOTA); if (IS_ERR(key)) { _leave(" = -ENOMEM [alloc %ld]", PTR_ERR(key)); @@ -340,10 +341,11 @@ EXPORT_SYMBOL(rxrpc_get_server_data_key); */ struct key *rxrpc_get_null_key(const char *keyname) { + const struct cred *cred = current_cred(); struct key *key; int ret; - key = key_alloc(&key_type_rxrpc, keyname, 0, 0, current, + key = key_alloc(&key_type_rxrpc, keyname, 0, 0, cred, KEY_POS_SEARCH, KEY_ALLOC_NOT_IN_QUOTA); if (IS_ERR(key)) return key; diff --git a/security/capability.c b/security/capability.c index fac2f61b69a..efeb6d9e0e6 100644 --- a/security/capability.c +++ b/security/capability.c @@ -340,12 +340,16 @@ static int cap_task_create(unsigned long clone_flags) return 0; } -static int cap_cred_alloc_security(struct cred *cred) +static void cap_cred_free(struct cred *cred) +{ +} + +static int cap_cred_prepare(struct cred *new, const struct cred *old, gfp_t gfp) { return 0; } -static void cap_cred_free(struct cred *cred) +static void cap_cred_commit(struct cred *new, const struct cred *old) { } @@ -750,7 +754,7 @@ static void cap_release_secctx(char *secdata, u32 seclen) } #ifdef CONFIG_KEYS -static int cap_key_alloc(struct key *key, struct task_struct *ctx, +static int cap_key_alloc(struct key *key, const struct cred *cred, unsigned long flags) { return 0; @@ -760,7 +764,7 @@ static void cap_key_free(struct key *key) { } -static int cap_key_permission(key_ref_t key_ref, struct task_struct *context, +static int cap_key_permission(key_ref_t key_ref, const struct cred *cred, key_perm_t perm) { return 0; @@ -814,8 +818,7 @@ void security_fixup_ops(struct security_operations *ops) set_to_cap_if_null(ops, ptrace_may_access); set_to_cap_if_null(ops, ptrace_traceme); set_to_cap_if_null(ops, capget); - set_to_cap_if_null(ops, capset_check); - set_to_cap_if_null(ops, capset_set); + set_to_cap_if_null(ops, capset); set_to_cap_if_null(ops, acct); set_to_cap_if_null(ops, capable); set_to_cap_if_null(ops, quotactl); @@ -890,10 +893,11 @@ void security_fixup_ops(struct security_operations *ops) set_to_cap_if_null(ops, file_receive); set_to_cap_if_null(ops, dentry_open); set_to_cap_if_null(ops, task_create); - set_to_cap_if_null(ops, cred_alloc_security); set_to_cap_if_null(ops, cred_free); + set_to_cap_if_null(ops, cred_prepare); + set_to_cap_if_null(ops, cred_commit); set_to_cap_if_null(ops, task_setuid); - set_to_cap_if_null(ops, task_post_setuid); + set_to_cap_if_null(ops, task_fix_setuid); set_to_cap_if_null(ops, task_setgid); set_to_cap_if_null(ops, task_setpgid); set_to_cap_if_null(ops, task_getpgid); @@ -910,7 +914,6 @@ void security_fixup_ops(struct security_operations *ops) set_to_cap_if_null(ops, task_wait); set_to_cap_if_null(ops, task_kill); set_to_cap_if_null(ops, task_prctl); - set_to_cap_if_null(ops, task_reparent_to_init); set_to_cap_if_null(ops, task_to_inode); set_to_cap_if_null(ops, ipc_permission); set_to_cap_if_null(ops, ipc_getsecid); diff --git a/security/commoncap.c b/security/commoncap.c index 0384bf95db6..b5419273f92 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -72,8 +72,8 @@ int cap_ptrace_may_access(struct task_struct *child, unsigned int mode) int ret = 0; rcu_read_lock(); - if (!cap_issubset(child->cred->cap_permitted, - current->cred->cap_permitted) && + if (!cap_issubset(__task_cred(child)->cap_permitted, + current_cred()->cap_permitted) && !capable(CAP_SYS_PTRACE)) ret = -EPERM; rcu_read_unlock(); @@ -85,8 +85,8 @@ int cap_ptrace_traceme(struct task_struct *parent) int ret = 0; rcu_read_lock(); - if (!cap_issubset(current->cred->cap_permitted, - parent->cred->cap_permitted) && + if (!cap_issubset(current_cred()->cap_permitted, + __task_cred(parent)->cap_permitted) && !has_capability(parent, CAP_SYS_PTRACE)) ret = -EPERM; rcu_read_unlock(); @@ -117,7 +117,7 @@ static inline int cap_inh_is_capped(void) * to the old permitted set. That is, if the current task * does *not* possess the CAP_SETPCAP capability. */ - return (cap_capable(current, CAP_SETPCAP, SECURITY_CAP_AUDIT) != 0); + return cap_capable(current, CAP_SETPCAP, SECURITY_CAP_AUDIT) != 0; } static inline int cap_limit_ptraced_target(void) { return 1; } @@ -132,52 +132,39 @@ static inline int cap_limit_ptraced_target(void) #endif /* def CONFIG_SECURITY_FILE_CAPABILITIES */ -int cap_capset_check(const kernel_cap_t *effective, - const kernel_cap_t *inheritable, - const kernel_cap_t *permitted) +int cap_capset(struct cred *new, + const struct cred *old, + const kernel_cap_t *effective, + const kernel_cap_t *inheritable, + const kernel_cap_t *permitted) { - const struct cred *cred = current->cred; - - if (cap_inh_is_capped() - && !cap_issubset(*inheritable, - cap_combine(cred->cap_inheritable, - cred->cap_permitted))) { + if (cap_inh_is_capped() && + !cap_issubset(*inheritable, + cap_combine(old->cap_inheritable, + old->cap_permitted))) /* incapable of using this inheritable set */ return -EPERM; - } + if (!cap_issubset(*inheritable, - cap_combine(cred->cap_inheritable, - cred->cap_bset))) { + cap_combine(old->cap_inheritable, + old->cap_bset))) /* no new pI capabilities outside bounding set */ return -EPERM; - } /* verify restrictions on target's new Permitted set */ - if (!cap_issubset (*permitted, - cap_combine (cred->cap_permitted, - cred->cap_permitted))) { + if (!cap_issubset(*permitted, old->cap_permitted)) return -EPERM; - } /* verify the _new_Effective_ is a subset of the _new_Permitted_ */ - if (!cap_issubset (*effective, *permitted)) { + if (!cap_issubset(*effective, *permitted)) return -EPERM; - } + new->cap_effective = *effective; + new->cap_inheritable = *inheritable; + new->cap_permitted = *permitted; return 0; } -void cap_capset_set(const kernel_cap_t *effective, - const kernel_cap_t *inheritable, - const kernel_cap_t *permitted) -{ - struct cred *cred = current->cred; - - cred->cap_effective = *effective; - cred->cap_inheritable = *inheritable; - cred->cap_permitted = *permitted; -} - static inline void bprm_clear_caps(struct linux_binprm *bprm) { cap_clear(bprm->cap_post_exec_permitted); @@ -382,41 +369,46 @@ int cap_bprm_set_security (struct linux_binprm *bprm) return ret; } -void cap_bprm_apply_creds (struct linux_binprm *bprm, int unsafe) +int cap_bprm_apply_creds (struct linux_binprm *bprm, int unsafe) { - struct cred *cred = current->cred; + const struct cred *old = current_cred(); + struct cred *new; + + new = prepare_creds(); + if (!new) + return -ENOMEM; - if (bprm->e_uid != cred->uid || bprm->e_gid != cred->gid || + if (bprm->e_uid != old->uid || bprm->e_gid != old->gid || !cap_issubset(bprm->cap_post_exec_permitted, - cred->cap_permitted)) { + old->cap_permitted)) { set_dumpable(current->mm, suid_dumpable); current->pdeath_signal = 0; if (unsafe & ~LSM_UNSAFE_PTRACE_CAP) { if (!capable(CAP_SETUID)) { - bprm->e_uid = cred->uid; - bprm->e_gid = cred->gid; + bprm->e_uid = old->uid; + bprm->e_gid = old->gid; } if (cap_limit_ptraced_target()) { bprm->cap_post_exec_permitted = cap_intersect( bprm->cap_post_exec_permitted, - cred->cap_permitted); + new->cap_permitted); } } } - cred->suid = cred->euid = cred->fsuid = bprm->e_uid; - cred->sgid = cred->egid = cred->fsgid = bprm->e_gid; + new->suid = new->euid = new->fsuid = bprm->e_uid; + new->sgid = new->egid = new->fsgid = bprm->e_gid; /* For init, we want to retain the capabilities set * in the init_task struct. Thus we skip the usual * capability rules */ if (!is_global_init(current)) { - cred->cap_permitted = bprm->cap_post_exec_permitted; + new->cap_permitted = bprm->cap_post_exec_permitted; if (bprm->cap_effective) - cred->cap_effective = bprm->cap_post_exec_permitted; + new->cap_effective = bprm->cap_post_exec_permitted; else - cap_clear(cred->cap_effective); + cap_clear(new->cap_effective); } /* @@ -431,15 +423,15 @@ void cap_bprm_apply_creds (struct linux_binprm *bprm, int unsafe) * Number 1 above might fail if you don't have a full bset, but I think * that is interesting information to audit. */ - if (!cap_isclear(cred->cap_effective)) { - if (!cap_issubset(CAP_FULL_SET, cred->cap_effective) || - (bprm->e_uid != 0) || (cred->uid != 0) || + if (!cap_isclear(new->cap_effective)) { + if (!cap_issubset(CAP_FULL_SET, new->cap_effective) || + bprm->e_uid != 0 || new->uid != 0 || issecure(SECURE_NOROOT)) - audit_log_bprm_fcaps(bprm, &cred->cap_permitted, - &cred->cap_effective); + audit_log_bprm_fcaps(bprm, new, old); } - cred->securebits &= ~issecure_mask(SECURE_KEEP_CAPS); + new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS); + return commit_creds(new); } int cap_bprm_secureexec (struct linux_binprm *bprm) @@ -514,65 +506,49 @@ int cap_inode_removexattr(struct dentry *dentry, const char *name) * files.. * Thanks to Olaf Kirch and Peter Benie for spotting this. */ -static inline void cap_emulate_setxuid (int old_ruid, int old_euid, - int old_suid) +static inline void cap_emulate_setxuid(struct cred *new, const struct cred *old) { - struct cred *cred = current->cred; - - if ((old_ruid == 0 || old_euid == 0 || old_suid == 0) && - (cred->uid != 0 && cred->euid != 0 && cred->suid != 0) && + if ((old->uid == 0 || old->euid == 0 || old->suid == 0) && + (new->uid != 0 && new->euid != 0 && new->suid != 0) && !issecure(SECURE_KEEP_CAPS)) { - cap_clear(cred->cap_permitted); - cap_clear(cred->cap_effective); - } - if (old_euid == 0 && cred->euid != 0) { - cap_clear(cred->cap_effective); - } - if (old_euid != 0 && cred->euid == 0) { - cred->cap_effective = cred->cap_permitted; + cap_clear(new->cap_permitted); + cap_clear(new->cap_effective); } + if (old->euid == 0 && new->euid != 0) + cap_clear(new->cap_effective); + if (old->euid != 0 && new->euid == 0) + new->cap_effective = new->cap_permitted; } -int cap_task_post_setuid (uid_t old_ruid, uid_t old_euid, uid_t old_suid, - int flags) +int cap_task_fix_setuid(struct cred *new, const struct cred *old, int flags) { - struct cred *cred = current->cred; - switch (flags) { case LSM_SETID_RE: case LSM_SETID_ID: case LSM_SETID_RES: /* Copied from kernel/sys.c:setreuid/setuid/setresuid. */ - if (!issecure (SECURE_NO_SETUID_FIXUP)) { - cap_emulate_setxuid (old_ruid, old_euid, old_suid); - } + if (!issecure(SECURE_NO_SETUID_FIXUP)) + cap_emulate_setxuid(new, old); break; case LSM_SETID_FS: - { - uid_t old_fsuid = old_ruid; - - /* Copied from kernel/sys.c:setfsuid. */ + /* Copied from kernel/sys.c:setfsuid. */ - /* - * FIXME - is fsuser used for all CAP_FS_MASK capabilities? - * if not, we might be a bit too harsh here. - */ - - if (!issecure (SECURE_NO_SETUID_FIXUP)) { - if (old_fsuid == 0 && cred->fsuid != 0) { - cred->cap_effective = - cap_drop_fs_set( - cred->cap_effective); - } - if (old_fsuid != 0 && cred->fsuid == 0) { - cred->cap_effective = - cap_raise_fs_set( - cred->cap_effective, - cred->cap_permitted); - } + /* + * FIXME - is fsuser used for all CAP_FS_MASK capabilities? + * if not, we might be a bit too harsh here. + */ + if (!issecure(SECURE_NO_SETUID_FIXUP)) { + if (old->fsuid == 0 && new->fsuid != 0) { + new->cap_effective = + cap_drop_fs_set(new->cap_effective); + } + if (old->fsuid != 0 && new->fsuid == 0) { + new->cap_effective = + cap_raise_fs_set(new->cap_effective, + new->cap_permitted); } - break; } + break; default: return -EINVAL; } @@ -628,13 +604,14 @@ int cap_task_setnice (struct task_struct *p, int nice) * this task could get inconsistent info. There can be no * racing writer bc a task can only change its own caps. */ -static long cap_prctl_drop(unsigned long cap) +static long cap_prctl_drop(struct cred *new, unsigned long cap) { if (!capable(CAP_SETPCAP)) return -EPERM; if (!cap_valid(cap)) return -EINVAL; - cap_lower(current->cred->cap_bset, cap); + + cap_lower(new->cap_bset, cap); return 0; } @@ -655,22 +632,29 @@ int cap_task_setnice (struct task_struct *p, int nice) #endif int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3, - unsigned long arg4, unsigned long arg5, long *rc_p) + unsigned long arg4, unsigned long arg5) { - struct cred *cred = current_cred(); + struct cred *new; long error = 0; + new = prepare_creds(); + if (!new) + return -ENOMEM; + switch (option) { case PR_CAPBSET_READ: + error = -EINVAL; if (!cap_valid(arg2)) - error = -EINVAL; - else - error = !!cap_raised(cred->cap_bset, arg2); - break; + goto error; + error = !!cap_raised(new->cap_bset, arg2); + goto no_change; + #ifdef CONFIG_SECURITY_FILE_CAPABILITIES case PR_CAPBSET_DROP: - error = cap_prctl_drop(arg2); - break; + error = cap_prctl_drop(new, arg2); + if (error < 0) + goto error; + goto changed; /* * The next four prctl's remain to assist with transitioning a @@ -692,12 +676,12 @@ int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3, * capability-based-privilege environment. */ case PR_SET_SECUREBITS: - if ((((cred->securebits & SECURE_ALL_LOCKS) >> 1) - & (cred->securebits ^ arg2)) /*[1]*/ - || ((cred->securebits & SECURE_ALL_LOCKS - & ~arg2)) /*[2]*/ - || (arg2 & ~(SECURE_ALL_LOCKS | SECURE_ALL_BITS)) /*[3]*/ - || (cap_capable(current, CAP_SETPCAP, SECURITY_CAP_AUDIT) != 0)) { /*[4]*/ + error = -EPERM; + if ((((new->securebits & SECURE_ALL_LOCKS) >> 1) + & (new->securebits ^ arg2)) /*[1]*/ + || ((new->securebits & SECURE_ALL_LOCKS & ~arg2)) /*[2]*/ + || (arg2 & ~(SECURE_ALL_LOCKS | SECURE_ALL_BITS)) /*[3]*/ + || (cap_capable(current, CAP_SETPCAP, SECURITY_CAP_AUDIT) != 0) /*[4]*/ /* * [1] no changing of bits that are locked * [2] no unlocking of locks @@ -705,50 +689,51 @@ int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3, * [4] doing anything requires privilege (go read about * the "sendmail capabilities bug") */ - error = -EPERM; /* cannot change a locked bit */ - } else { - cred->securebits = arg2; - } - break; + ) + /* cannot change a locked bit */ + goto error; + new->securebits = arg2; + goto changed; + case PR_GET_SECUREBITS: - error = cred->securebits; - break; + error = new->securebits; + goto no_change; #endif /* def CONFIG_SECURITY_FILE_CAPABILITIES */ case PR_GET_KEEPCAPS: if (issecure(SECURE_KEEP_CAPS)) error = 1; - break; + goto no_change; + case PR_SET_KEEPCAPS: + error = -EINVAL; if (arg2 > 1) /* Note, we rely on arg2 being unsigned here */ - error = -EINVAL; - else if (issecure(SECURE_KEEP_CAPS_LOCKED)) - error = -EPERM; - else if (arg2) - cred->securebits |= issecure_mask(SECURE_KEEP_CAPS); + goto error; + error = -EPERM; + if (issecure(SECURE_KEEP_CAPS_LOCKED)) + goto error; + if (arg2) + new->securebits |= issecure_mask(SECURE_KEEP_CAPS); else - cred->securebits &= ~issecure_mask(SECURE_KEEP_CAPS); - break; + new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS); + goto changed; default: /* No functionality available - continue with default */ - return 0; + error = -ENOSYS; + goto error; } /* Functionality provided */ - *rc_p = error; - return 1; -} - -void cap_task_reparent_to_init (struct task_struct *p) -{ - struct cred *cred = p->cred; - - cap_set_init_eff(cred->cap_effective); - cap_clear(cred->cap_inheritable); - cap_set_full(cred->cap_permitted); - p->cred->securebits = SECUREBITS_DEFAULT; +changed: + return commit_creds(new); + +no_change: + error = 0; +error: + abort_creds(new); + return error; } int cap_syslog (int type) diff --git a/security/keys/internal.h b/security/keys/internal.h index d1586c62978..81932abefe7 100644 --- a/security/keys/internal.h +++ b/security/keys/internal.h @@ -12,6 +12,7 @@ #ifndef _INTERNAL_H #define _INTERNAL_H +#include #include static inline __attribute__((format(printf, 1, 2))) @@ -25,7 +26,7 @@ void no_printk(const char *fmt, ...) #define kleave(FMT, ...) \ printk(KERN_DEBUG "<== %s()"FMT"\n", __func__, ##__VA_ARGS__) #define kdebug(FMT, ...) \ - printk(KERN_DEBUG "xxx" FMT"yyy\n", ##__VA_ARGS__) + printk(KERN_DEBUG " "FMT"\n", ##__VA_ARGS__) #else #define kenter(FMT, ...) \ no_printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__) @@ -97,7 +98,7 @@ extern struct key *keyring_search_instkey(struct key *keyring, typedef int (*key_match_func_t)(const struct key *, const void *); extern key_ref_t keyring_search_aux(key_ref_t keyring_ref, - struct task_struct *tsk, + const struct cred *cred, struct key_type *type, const void *description, key_match_func_t match); @@ -105,13 +106,13 @@ extern key_ref_t keyring_search_aux(key_ref_t keyring_ref, extern key_ref_t search_process_keyrings(struct key_type *type, const void *description, key_match_func_t match, - struct task_struct *tsk); + const struct cred *cred); extern struct key *find_keyring_by_name(const char *name, bool skip_perm_check); extern int install_user_keyrings(void); -extern int install_thread_keyring(void); -extern int install_process_keyring(void); +extern int install_thread_keyring_to_cred(struct cred *); +extern int install_process_keyring_to_cred(struct cred *); extern struct key *request_key_and_link(struct key_type *type, const char *description, @@ -130,12 +131,12 @@ extern long join_session_keyring(const char *name); * check to see whether permission is granted to use a key in the desired way */ extern int key_task_permission(const key_ref_t key_ref, - struct task_struct *context, + const struct cred *cred, key_perm_t perm); static inline int key_permission(const key_ref_t key_ref, key_perm_t perm) { - return key_task_permission(key_ref, current, perm); + return key_task_permission(key_ref, current_cred(), perm); } /* required permissions */ @@ -153,7 +154,7 @@ static inline int key_permission(const key_ref_t key_ref, key_perm_t perm) struct request_key_auth { struct key *target_key; struct key *dest_keyring; - struct task_struct *context; + const struct cred *cred; void *callout_info; size_t callout_len; pid_t pid; diff --git a/security/keys/key.c b/security/keys/key.c index a6ca39ed3b0..f76c8a546fd 100644 --- a/security/keys/key.c +++ b/security/keys/key.c @@ -218,7 +218,7 @@ serial_exists: * instantiate the key or discard it before returning */ struct key *key_alloc(struct key_type *type, const char *desc, - uid_t uid, gid_t gid, struct task_struct *ctx, + uid_t uid, gid_t gid, const struct cred *cred, key_perm_t perm, unsigned long flags) { struct key_user *user = NULL; @@ -294,7 +294,7 @@ struct key *key_alloc(struct key_type *type, const char *desc, #endif /* let the security module know about the key */ - ret = security_key_alloc(key, ctx, flags); + ret = security_key_alloc(key, cred, flags); if (ret < 0) goto security_error; @@ -391,7 +391,7 @@ static int __key_instantiate_and_link(struct key *key, const void *data, size_t datalen, struct key *keyring, - struct key *instkey) + struct key *authkey) { int ret, awaken; @@ -421,8 +421,8 @@ static int __key_instantiate_and_link(struct key *key, ret = __key_link(keyring, key); /* disable the authorisation key */ - if (instkey) - key_revoke(instkey); + if (authkey) + key_revoke(authkey); } } @@ -444,14 +444,14 @@ int key_instantiate_and_link(struct key *key, const void *data, size_t datalen, struct key *keyring, - struct key *instkey) + struct key *authkey) { int ret; if (keyring) down_write(&keyring->sem); - ret = __key_instantiate_and_link(key, data, datalen, keyring, instkey); + ret = __key_instantiate_and_link(key, data, datalen, keyring, authkey); if (keyring) up_write(&keyring->sem); @@ -469,7 +469,7 @@ EXPORT_SYMBOL(key_instantiate_and_link); int key_negate_and_link(struct key *key, unsigned timeout, struct key *keyring, - struct key *instkey) + struct key *authkey) { struct timespec now; int ret, awaken; @@ -504,8 +504,8 @@ int key_negate_and_link(struct key *key, ret = __key_link(keyring, key); /* disable the authorisation key */ - if (instkey) - key_revoke(instkey); + if (authkey) + key_revoke(authkey); } mutex_unlock(&key_construction_mutex); @@ -743,6 +743,7 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref, key_perm_t perm, unsigned long flags) { + const struct cred *cred = current_cred(); struct key_type *ktype; struct key *keyring, *key = NULL; key_ref_t key_ref; @@ -802,8 +803,8 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref, } /* allocate a new key */ - key = key_alloc(ktype, description, current_fsuid(), current_fsgid(), - current, perm, flags); + key = key_alloc(ktype, description, cred->fsuid, cred->fsgid, cred, + perm, flags); if (IS_ERR(key)) { key_ref = ERR_CAST(key); goto error_3; diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c index 8833b447ade..7c72baa02f2 100644 --- a/security/keys/keyctl.c +++ b/security/keys/keyctl.c @@ -866,6 +866,23 @@ static long get_instantiation_keyring(key_serial_t ringid, return -ENOKEY; } +/* + * change the request_key authorisation key on the current process + */ +static int keyctl_change_reqkey_auth(struct key *key) +{ + struct cred *new; + + new = prepare_creds(); + if (!new) + return -ENOMEM; + + key_put(new->request_key_auth); + new->request_key_auth = key_get(key); + + return commit_creds(new); +} + /*****************************************************************************/ /* * instantiate the key with the specified payload, and, if one is given, link @@ -876,12 +893,15 @@ long keyctl_instantiate_key(key_serial_t id, size_t plen, key_serial_t ringid) { + const struct cred *cred = current_cred(); struct request_key_auth *rka; struct key *instkey, *dest_keyring; void *payload; long ret; bool vm = false; + kenter("%d,,%zu,%d", id, plen, ringid); + ret = -EINVAL; if (plen > 1024 * 1024 - 1) goto error; @@ -889,7 +909,7 @@ long keyctl_instantiate_key(key_serial_t id, /* the appropriate instantiation authorisation key must have been * assumed before calling this */ ret = -EPERM; - instkey = current->cred->request_key_auth; + instkey = cred->request_key_auth; if (!instkey) goto error; @@ -931,10 +951,8 @@ long keyctl_instantiate_key(key_serial_t id, /* discard the assumed authority if it's just been disabled by * instantiation of the key */ - if (ret == 0) { - key_put(current->cred->request_key_auth); - current->cred->request_key_auth = NULL; - } + if (ret == 0) + keyctl_change_reqkey_auth(NULL); error2: if (!vm) @@ -953,14 +971,17 @@ error: */ long keyctl_negate_key(key_serial_t id, unsigned timeout, key_serial_t ringid) { + const struct cred *cred = current_cred(); struct request_key_auth *rka; struct key *instkey, *dest_keyring; long ret; + kenter("%d,%u,%d", id, timeout, ringid); + /* the appropriate instantiation authorisation key must have been * assumed before calling this */ ret = -EPERM; - instkey = current->cred->request_key_auth; + instkey = cred->request_key_auth; if (!instkey) goto error; @@ -982,10 +1003,8 @@ long keyctl_negate_key(key_serial_t id, unsigned timeout, key_serial_t ringid) /* discard the assumed authority if it's just been disabled by * instantiation of the key */ - if (ret == 0) { - key_put(current->cred->request_key_auth); - current->cred->request_key_auth = NULL; - } + if (ret == 0) + keyctl_change_reqkey_auth(NULL); error: return ret; @@ -999,36 +1018,56 @@ error: */ long keyctl_set_reqkey_keyring(int reqkey_defl) { - struct cred *cred = current->cred; - int ret; + struct cred *new; + int ret, old_setting; + + old_setting = current_cred_xxx(jit_keyring); + + if (reqkey_defl == KEY_REQKEY_DEFL_NO_CHANGE) + return old_setting; + + new = prepare_creds(); + if (!new) + return -ENOMEM; switch (reqkey_defl) { case KEY_REQKEY_DEFL_THREAD_KEYRING: - ret = install_thread_keyring(); + ret = install_thread_keyring_to_cred(new); if (ret < 0) - return ret; + goto error; goto set; case KEY_REQKEY_DEFL_PROCESS_KEYRING: - ret = install_process_keyring(); - if (ret < 0) - return ret; + ret = install_process_keyring_to_cred(new); + if (ret < 0) { + if (ret != -EEXIST) + goto error; + ret = 0; + } + goto set; case KEY_REQKEY_DEFL_DEFAULT: case KEY_REQKEY_DEFL_SESSION_KEYRING: case KEY_REQKEY_DEFL_USER_KEYRING: case KEY_REQKEY_DEFL_USER_SESSION_KEYRING: - set: - cred->jit_keyring = reqkey_defl; + case KEY_REQKEY_DEFL_REQUESTOR_KEYRING: + goto set; case KEY_REQKEY_DEFL_NO_CHANGE: - return cred->jit_keyring; - case KEY_REQKEY_DEFL_GROUP_KEYRING: default: - return -EINVAL; + ret = -EINVAL; + goto error; } +set: + new->jit_keyring = reqkey_defl; + commit_creds(new); + return old_setting; +error: + abort_creds(new); + return -EINVAL; + } /* end keyctl_set_reqkey_keyring() */ /*****************************************************************************/ @@ -1087,9 +1126,7 @@ long keyctl_assume_authority(key_serial_t id) /* we divest ourselves of authority if given an ID of 0 */ if (id == 0) { - key_put(current->cred->request_key_auth); - current->cred->request_key_auth = NULL; - ret = 0; + ret = keyctl_change_reqkey_auth(NULL); goto error; } @@ -1104,10 +1141,12 @@ long keyctl_assume_authority(key_serial_t id) goto error; } - key_put(current->cred->request_key_auth); - current->cred->request_key_auth = authkey; - ret = authkey->serial; + ret = keyctl_change_reqkey_auth(authkey); + if (ret < 0) + goto error; + key_put(authkey); + ret = authkey->serial; error: return ret; diff --git a/security/keys/keyring.c b/security/keys/keyring.c index fdf75f90199..ed851574d07 100644 --- a/security/keys/keyring.c +++ b/security/keys/keyring.c @@ -245,14 +245,14 @@ static long keyring_read(const struct key *keyring, * allocate a keyring and link into the destination keyring */ struct key *keyring_alloc(const char *description, uid_t uid, gid_t gid, - struct task_struct *ctx, unsigned long flags, + const struct cred *cred, unsigned long flags, struct key *dest) { struct key *keyring; int ret; keyring = key_alloc(&key_type_keyring, description, - uid, gid, ctx, + uid, gid, cred, (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_ALL, flags); @@ -281,7 +281,7 @@ struct key *keyring_alloc(const char *description, uid_t uid, gid_t gid, * - we propagate the possession attribute from the keyring ref to the key ref */ key_ref_t keyring_search_aux(key_ref_t keyring_ref, - struct task_struct *context, + const struct cred *cred, struct key_type *type, const void *description, key_match_func_t match) @@ -304,7 +304,7 @@ key_ref_t keyring_search_aux(key_ref_t keyring_ref, key_check(keyring); /* top keyring must have search permission to begin the search */ - err = key_task_permission(keyring_ref, context, KEY_SEARCH); + err = key_task_permission(keyring_ref, cred, KEY_SEARCH); if (err < 0) { key_ref = ERR_PTR(err); goto error; @@ -377,7 +377,7 @@ descend: /* key must have search permissions */ if (key_task_permission(make_key_ref(key, possessed), - context, KEY_SEARCH) < 0) + cred, KEY_SEARCH) < 0) continue; /* we set a different error code if we pass a negative key */ @@ -404,7 +404,7 @@ ascend: continue; if (key_task_permission(make_key_ref(key, possessed), - context, KEY_SEARCH) < 0) + cred, KEY_SEARCH) < 0) continue; /* stack the current position */ @@ -459,7 +459,7 @@ key_ref_t keyring_search(key_ref_t keyring, if (!type->match) return ERR_PTR(-ENOKEY); - return keyring_search_aux(keyring, current, + return keyring_search_aux(keyring, current->cred, type, description, type->match); } /* end keyring_search() */ diff --git a/security/keys/permission.c b/security/keys/permission.c index 13c36164f28..5d9fc7b93f2 100644 --- a/security/keys/permission.c +++ b/security/keys/permission.c @@ -14,24 +14,27 @@ #include "internal.h" /*****************************************************************************/ -/* - * check to see whether permission is granted to use a key in the desired way, - * but permit the security modules to override +/** + * key_task_permission - Check a key can be used + * @key_ref: The key to check + * @cred: The credentials to use + * @perm: The permissions to check for + * + * Check to see whether permission is granted to use a key in the desired way, + * but permit the security modules to override. + * + * The caller must hold either a ref on cred or must hold the RCU readlock or a + * spinlock. */ -int key_task_permission(const key_ref_t key_ref, - struct task_struct *context, +int key_task_permission(const key_ref_t key_ref, const struct cred *cred, key_perm_t perm) { - const struct cred *cred; struct key *key; key_perm_t kperm; int ret; key = key_ref_to_ptr(key_ref); - rcu_read_lock(); - cred = __task_cred(context); - /* use the second 8-bits of permissions for keys the caller owns */ if (key->uid == cred->fsuid) { kperm = key->perm >> 16; @@ -57,7 +60,6 @@ int key_task_permission(const key_ref_t key_ref, kperm = key->perm; use_these_perms: - rcu_read_lock(); /* use the top 8-bits of permissions for keys the caller possesses * - possessor permissions are additive with other permissions @@ -71,7 +73,7 @@ use_these_perms: return -EACCES; /* let LSM be the final arbiter */ - return security_key_permission(key_ref, context, perm); + return security_key_permission(key_ref, cred, perm); } /* end key_task_permission() */ diff --git a/security/keys/proc.c b/security/keys/proc.c index f619170da76..7f508def50e 100644 --- a/security/keys/proc.c +++ b/security/keys/proc.c @@ -136,8 +136,12 @@ static int proc_keys_show(struct seq_file *m, void *v) int rc; /* check whether the current task is allowed to view the key (assuming - * non-possession) */ - rc = key_task_permission(make_key_ref(key, 0), current, KEY_VIEW); + * non-possession) + * - the caller holds a spinlock, and thus the RCU read lock, making our + * access to __current_cred() safe + */ + rc = key_task_permission(make_key_ref(key, 0), current_cred(), + KEY_VIEW); if (rc < 0) return 0; diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c index 70ee93406f3..df329f684a6 100644 --- a/security/keys/process_keys.c +++ b/security/keys/process_keys.c @@ -42,11 +42,15 @@ struct key_user root_key_user = { */ int install_user_keyrings(void) { - struct user_struct *user = current->cred->user; + struct user_struct *user; + const struct cred *cred; struct key *uid_keyring, *session_keyring; char buf[20]; int ret; + cred = current_cred(); + user = cred->user; + kenter("%p{%u}", user, user->uid); if (user->uid_keyring) { @@ -67,7 +71,7 @@ int install_user_keyrings(void) uid_keyring = find_keyring_by_name(buf, true); if (IS_ERR(uid_keyring)) { uid_keyring = keyring_alloc(buf, user->uid, (gid_t) -1, - current, KEY_ALLOC_IN_QUOTA, + cred, KEY_ALLOC_IN_QUOTA, NULL); if (IS_ERR(uid_keyring)) { ret = PTR_ERR(uid_keyring); @@ -83,8 +87,7 @@ int install_user_keyrings(void) if (IS_ERR(session_keyring)) { session_keyring = keyring_alloc(buf, user->uid, (gid_t) -1, - current, KEY_ALLOC_IN_QUOTA, - NULL); + cred, KEY_ALLOC_IN_QUOTA, NULL); if (IS_ERR(session_keyring)) { ret = PTR_ERR(session_keyring); goto error_release; @@ -116,142 +119,128 @@ error: return ret; } -/*****************************************************************************/ /* - * deal with the UID changing + * install a fresh thread keyring directly to new credentials */ -void switch_uid_keyring(struct user_struct *new_user) +int install_thread_keyring_to_cred(struct cred *new) { -#if 0 /* do nothing for now */ - struct key *old; - - /* switch to the new user's session keyring if we were running under - * root's default session keyring */ - if (new_user->uid != 0 && - current->session_keyring == &root_session_keyring - ) { - atomic_inc(&new_user->session_keyring->usage); - - task_lock(current); - old = current->session_keyring; - current->session_keyring = new_user->session_keyring; - task_unlock(current); + struct key *keyring; - key_put(old); - } -#endif + keyring = keyring_alloc("_tid", new->uid, new->gid, new, + KEY_ALLOC_QUOTA_OVERRUN, NULL); + if (IS_ERR(keyring)) + return PTR_ERR(keyring); -} /* end switch_uid_keyring() */ + new->thread_keyring = keyring; + return 0; +} -/*****************************************************************************/ /* * install a fresh thread keyring, discarding the old one */ -int install_thread_keyring(void) +static int install_thread_keyring(void) { - struct task_struct *tsk = current; - struct key *keyring, *old; - char buf[20]; + struct cred *new; int ret; - sprintf(buf, "_tid.%u", tsk->pid); + new = prepare_creds(); + if (!new) + return -ENOMEM; - keyring = keyring_alloc(buf, tsk->cred->uid, tsk->cred->gid, tsk, - KEY_ALLOC_QUOTA_OVERRUN, NULL); - if (IS_ERR(keyring)) { - ret = PTR_ERR(keyring); - goto error; + BUG_ON(new->thread_keyring); + + ret = install_thread_keyring_to_cred(new); + if (ret < 0) { + abort_creds(new); + return ret; } - task_lock(tsk); - old = tsk->cred->thread_keyring; - tsk->cred->thread_keyring = keyring; - task_unlock(tsk); + return commit_creds(new); +} - ret = 0; +/* + * install a process keyring directly to a credentials struct + * - returns -EEXIST if there was already a process keyring, 0 if one installed, + * and other -ve on any other error + */ +int install_process_keyring_to_cred(struct cred *new) +{ + struct key *keyring; + int ret; - key_put(old); -error: + if (new->tgcred->process_keyring) + return -EEXIST; + + keyring = keyring_alloc("_pid", new->uid, new->gid, + new, KEY_ALLOC_QUOTA_OVERRUN, NULL); + if (IS_ERR(keyring)) + return PTR_ERR(keyring); + + spin_lock_irq(&new->tgcred->lock); + if (!new->tgcred->process_keyring) { + new->tgcred->process_keyring = keyring; + keyring = NULL; + ret = 0; + } else { + ret = -EEXIST; + } + spin_unlock_irq(&new->tgcred->lock); + key_put(keyring); return ret; +} -} /* end install_thread_keyring() */ - -/*****************************************************************************/ /* * make sure a process keyring is installed + * - we */ -int install_process_keyring(void) +static int install_process_keyring(void) { - struct task_struct *tsk = current; - struct key *keyring; - char buf[20]; + struct cred *new; int ret; - might_sleep(); - - if (!tsk->cred->tgcred->process_keyring) { - sprintf(buf, "_pid.%u", tsk->tgid); - - keyring = keyring_alloc(buf, tsk->cred->uid, tsk->cred->gid, tsk, - KEY_ALLOC_QUOTA_OVERRUN, NULL); - if (IS_ERR(keyring)) { - ret = PTR_ERR(keyring); - goto error; - } - - /* attach keyring */ - spin_lock_irq(&tsk->cred->tgcred->lock); - if (!tsk->cred->tgcred->process_keyring) { - tsk->cred->tgcred->process_keyring = keyring; - keyring = NULL; - } - spin_unlock_irq(&tsk->cred->tgcred->lock); + new = prepare_creds(); + if (!new) + return -ENOMEM; - key_put(keyring); + ret = install_process_keyring_to_cred(new); + if (ret < 0) { + abort_creds(new); + return ret != -EEXIST ?: 0; } - ret = 0; -error: - return ret; - -} /* end install_process_keyring() */ + return commit_creds(new); +} -/*****************************************************************************/ /* - * install a session keyring, discarding the old one - * - if a keyring is not supplied, an empty one is invented + * install a session keyring directly to a credentials struct */ -static int install_session_keyring(struct key *keyring) +static int install_session_keyring_to_cred(struct cred *cred, + struct key *keyring) { - struct task_struct *tsk = current; unsigned long flags; struct key *old; - char buf[20]; might_sleep(); /* create an empty session keyring */ if (!keyring) { - sprintf(buf, "_ses.%u", tsk->tgid); - flags = KEY_ALLOC_QUOTA_OVERRUN; - if (tsk->cred->tgcred->session_keyring) + if (cred->tgcred->session_keyring) flags = KEY_ALLOC_IN_QUOTA; - keyring = keyring_alloc(buf, tsk->cred->uid, tsk->cred->gid, - tsk, flags, NULL); + keyring = keyring_alloc("_ses", cred->uid, cred->gid, + cred, flags, NULL); if (IS_ERR(keyring)) return PTR_ERR(keyring); - } - else { + } else { atomic_inc(&keyring->usage); } /* install the keyring */ - spin_lock_irq(&tsk->cred->tgcred->lock); - old = tsk->cred->tgcred->session_keyring; - rcu_assign_pointer(tsk->cred->tgcred->session_keyring, keyring); - spin_unlock_irq(&tsk->cred->tgcred->lock); + spin_lock_irq(&cred->tgcred->lock); + old = cred->tgcred->session_keyring; + rcu_assign_pointer(cred->tgcred->session_keyring, keyring); + spin_unlock_irq(&cred->tgcred->lock); /* we're using RCU on the pointer, but there's no point synchronising * on it if it didn't previously point to anything */ @@ -261,38 +250,29 @@ static int install_session_keyring(struct key *keyring) } return 0; +} -} /* end install_session_keyring() */ - -/*****************************************************************************/ /* - * copy the keys for fork + * install a session keyring, discarding the old one + * - if a keyring is not supplied, an empty one is invented */ -int copy_keys(unsigned long clone_flags, struct task_struct *tsk) +static int install_session_keyring(struct key *keyring) { - key_check(tsk->cred->thread_keyring); - key_check(tsk->cred->request_key_auth); - - /* no thread keyring yet */ - tsk->cred->thread_keyring = NULL; - - /* copy the request_key() authorisation for this thread */ - key_get(tsk->cred->request_key_auth); - - return 0; + struct cred *new; + int ret; -} /* end copy_keys() */ + new = prepare_creds(); + if (!new) + return -ENOMEM; -/*****************************************************************************/ -/* - * dispose of per-thread keys upon thread exit - */ -void exit_keys(struct task_struct *tsk) -{ - key_put(tsk->cred->thread_keyring); - key_put(tsk->cred->request_key_auth); + ret = install_session_keyring_to_cred(new, NULL); + if (ret < 0) { + abort_creds(new); + return ret; + } -} /* end exit_keys() */ + return commit_creds(new); +} /*****************************************************************************/ /* @@ -300,38 +280,41 @@ void exit_keys(struct task_struct *tsk) */ int exec_keys(struct task_struct *tsk) { - struct key *old; + struct thread_group_cred *tgcred = NULL; + struct cred *new; - /* newly exec'd tasks don't get a thread keyring */ - task_lock(tsk); - old = tsk->cred->thread_keyring; - tsk->cred->thread_keyring = NULL; - task_unlock(tsk); +#ifdef CONFIG_KEYS + tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL); + if (!tgcred) + return -ENOMEM; +#endif - key_put(old); + new = prepare_creds(); + if (new < 0) + return -ENOMEM; - /* discard the process keyring from a newly exec'd task */ - spin_lock_irq(&tsk->cred->tgcred->lock); - old = tsk->cred->tgcred->process_keyring; - tsk->cred->tgcred->process_keyring = NULL; - spin_unlock_irq(&tsk->cred->tgcred->lock); + /* newly exec'd tasks don't get a thread keyring */ + key_put(new->thread_keyring); + new->thread_keyring = NULL; - key_put(old); + /* create a new per-thread-group creds for all this set of threads to + * share */ + memcpy(tgcred, new->tgcred, sizeof(struct thread_group_cred)); - return 0; + atomic_set(&tgcred->usage, 1); + spin_lock_init(&tgcred->lock); -} /* end exec_keys() */ + /* inherit the session keyring; new process keyring */ + key_get(tgcred->session_keyring); + tgcred->process_keyring = NULL; -/*****************************************************************************/ -/* - * deal with SUID programs - * - we might want to make this invent a new session keyring - */ -int suid_keys(struct task_struct *tsk) -{ + release_tgcred(new); + new->tgcred = tgcred; + + commit_creds(new); return 0; -} /* end suid_keys() */ +} /* end exec_keys() */ /*****************************************************************************/ /* @@ -376,16 +359,13 @@ void key_fsgid_changed(struct task_struct *tsk) key_ref_t search_process_keyrings(struct key_type *type, const void *description, key_match_func_t match, - struct task_struct *context) + const struct cred *cred) { struct request_key_auth *rka; - struct cred *cred; key_ref_t key_ref, ret, err; might_sleep(); - cred = get_task_cred(context); - /* we want to return -EAGAIN or -ENOKEY if any of the keyrings were * searchable, but we failed to find a key or we found a negative key; * otherwise we want to return a sample error (probably -EACCES) if @@ -401,7 +381,7 @@ key_ref_t search_process_keyrings(struct key_type *type, if (cred->thread_keyring) { key_ref = keyring_search_aux( make_key_ref(cred->thread_keyring, 1), - context, type, description, match); + cred, type, description, match); if (!IS_ERR(key_ref)) goto found; @@ -422,7 +402,7 @@ key_ref_t search_process_keyrings(struct key_type *type, if (cred->tgcred->process_keyring) { key_ref = keyring_search_aux( make_key_ref(cred->tgcred->process_keyring, 1), - context, type, description, match); + cred, type, description, match); if (!IS_ERR(key_ref)) goto found; @@ -446,7 +426,7 @@ key_ref_t search_process_keyrings(struct key_type *type, make_key_ref(rcu_dereference( cred->tgcred->session_keyring), 1), - context, type, description, match); + cred, type, description, match); rcu_read_unlock(); if (!IS_ERR(key_ref)) @@ -468,7 +448,7 @@ key_ref_t search_process_keyrings(struct key_type *type, else if (cred->user->session_keyring) { key_ref = keyring_search_aux( make_key_ref(cred->user->session_keyring, 1), - context, type, description, match); + cred, type, description, match); if (!IS_ERR(key_ref)) goto found; @@ -490,7 +470,7 @@ key_ref_t search_process_keyrings(struct key_type *type, * - we don't permit access to request_key auth keys via this method */ if (cred->request_key_auth && - context == current && + cred == current_cred() && type != &key_type_request_key_auth ) { /* defend against the auth key being revoked */ @@ -500,7 +480,7 @@ key_ref_t search_process_keyrings(struct key_type *type, rka = cred->request_key_auth->payload.data; key_ref = search_process_keyrings(type, description, - match, rka->context); + match, rka->cred); up_read(&cred->request_key_auth->sem); @@ -527,7 +507,6 @@ key_ref_t search_process_keyrings(struct key_type *type, key_ref = ret ? ret : err; found: - put_cred(cred); return key_ref; } /* end search_process_keyrings() */ @@ -552,8 +531,7 @@ key_ref_t lookup_user_key(key_serial_t id, int create, int partial, key_perm_t perm) { struct request_key_auth *rka; - struct task_struct *t = current; - struct cred *cred; + const struct cred *cred; struct key *key; key_ref_t key_ref, skey_ref; int ret; @@ -608,6 +586,7 @@ try_again: goto error; ret = install_session_keyring( cred->user->session_keyring); + if (ret < 0) goto error; goto reget_creds; @@ -693,7 +672,7 @@ try_again: /* check to see if we possess the key */ skey_ref = search_process_keyrings(key->type, key, lookup_user_key_possessed, - current); + cred); if (!IS_ERR(skey_ref)) { key_put(key); @@ -725,7 +704,7 @@ try_again: goto invalid_key; /* check the permissions */ - ret = key_task_permission(key_ref, t, perm); + ret = key_task_permission(key_ref, cred, perm); if (ret < 0) goto invalid_key; @@ -755,21 +734,33 @@ reget_creds: */ long join_session_keyring(const char *name) { - struct task_struct *tsk = current; - struct cred *cred = current->cred; + const struct cred *old; + struct cred *new; struct key *keyring; - long ret; + long ret, serial; + + /* only permit this if there's a single thread in the thread group - + * this avoids us having to adjust the creds on all threads and risking + * ENOMEM */ + if (!is_single_threaded(current)) + return -EMLINK; + + new = prepare_creds(); + if (!new) + return -ENOMEM; + old = current_cred(); /* if no name is provided, install an anonymous keyring */ if (!name) { - ret = install_session_keyring(NULL); + ret = install_session_keyring_to_cred(new, NULL); if (ret < 0) goto error; - rcu_read_lock(); - ret = rcu_dereference(cred->tgcred->session_keyring)->serial; - rcu_read_unlock(); - goto error; + serial = new->tgcred->session_keyring->serial; + ret = commit_creds(new); + if (ret == 0) + ret = serial; + goto okay; } /* allow the user to join or create a named keyring */ @@ -779,29 +770,33 @@ long join_session_keyring(const char *name) keyring = find_keyring_by_name(name, false); if (PTR_ERR(keyring) == -ENOKEY) { /* not found - try and create a new one */ - keyring = keyring_alloc(name, cred->uid, cred->gid, tsk, + keyring = keyring_alloc(name, old->uid, old->gid, old, KEY_ALLOC_IN_QUOTA, NULL); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto error2; } - } - else if (IS_ERR(keyring)) { + } else if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto error2; } /* we've got a keyring - now to install it */ - ret = install_session_keyring(keyring); + ret = install_session_keyring_to_cred(new, keyring); if (ret < 0) goto error2; + commit_creds(new); + mutex_unlock(&key_session_mutex); + ret = keyring->serial; key_put(keyring); +okay: + return ret; error2: mutex_unlock(&key_session_mutex); error: + abort_creds(new); return ret; - -} /* end join_session_keyring() */ +} diff --git a/security/keys/request_key.c b/security/keys/request_key.c index 3d12558362d..0e04f72ef2d 100644 --- a/security/keys/request_key.c +++ b/security/keys/request_key.c @@ -83,8 +83,10 @@ static int call_sbin_request_key(struct key_construction *cons, /* allocate a new session keyring */ sprintf(desc, "_req.%u", key->serial); - keyring = keyring_alloc(desc, current_fsuid(), current_fsgid(), current, + cred = get_current_cred(); + keyring = keyring_alloc(desc, cred->fsuid, cred->fsgid, cred, KEY_ALLOC_QUOTA_OVERRUN, NULL); + put_cred(cred); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto error_alloc; @@ -104,8 +106,7 @@ static int call_sbin_request_key(struct key_construction *cons, /* we specify the process's default keyrings */ sprintf(keyring_str[0], "%d", - cred->thread_keyring ? - cred->thread_keyring->serial : 0); + cred->thread_keyring ? cred->thread_keyring->serial : 0); prkey = 0; if (cred->tgcred->process_keyring) @@ -155,8 +156,8 @@ error_link: key_put(keyring); error_alloc: - kleave(" = %d", ret); complete_request_key(cons, ret); + kleave(" = %d", ret); return ret; } @@ -295,6 +296,7 @@ static int construct_alloc_key(struct key_type *type, struct key_user *user, struct key **_key) { + const struct cred *cred = current_cred(); struct key *key; key_ref_t key_ref; @@ -302,9 +304,8 @@ static int construct_alloc_key(struct key_type *type, mutex_lock(&user->cons_lock); - key = key_alloc(type, description, - current_fsuid(), current_fsgid(), current, KEY_POS_ALL, - flags); + key = key_alloc(type, description, cred->fsuid, cred->fsgid, cred, + KEY_POS_ALL, flags); if (IS_ERR(key)) goto alloc_failed; @@ -317,8 +318,7 @@ static int construct_alloc_key(struct key_type *type, * waited for locks */ mutex_lock(&key_construction_mutex); - key_ref = search_process_keyrings(type, description, type->match, - current); + key_ref = search_process_keyrings(type, description, type->match, cred); if (!IS_ERR(key_ref)) goto key_already_present; @@ -363,6 +363,8 @@ static struct key *construct_key_and_link(struct key_type *type, struct key *key; int ret; + kenter(""); + user = key_user_lookup(current_fsuid()); if (!user) return ERR_PTR(-ENOMEM); @@ -376,17 +378,21 @@ static struct key *construct_key_and_link(struct key_type *type, if (ret == 0) { ret = construct_key(key, callout_info, callout_len, aux, dest_keyring); - if (ret < 0) + if (ret < 0) { + kdebug("cons failed"); goto construction_failed; + } } key_put(dest_keyring); + kleave(" = key %d", key_serial(key)); return key; construction_failed: key_negate_and_link(key, key_negative_timeout, NULL, NULL); key_put(key); key_put(dest_keyring); + kleave(" = %d", ret); return ERR_PTR(ret); } @@ -405,6 +411,7 @@ struct key *request_key_and_link(struct key_type *type, struct key *dest_keyring, unsigned long flags) { + const struct cred *cred = current_cred(); struct key *key; key_ref_t key_ref; @@ -414,7 +421,7 @@ struct key *request_key_and_link(struct key_type *type, /* search all the process keyrings for a key */ key_ref = search_process_keyrings(type, description, type->match, - current); + cred); if (!IS_ERR(key_ref)) { key = key_ref_to_ptr(key_ref); diff --git a/security/keys/request_key_auth.c b/security/keys/request_key_auth.c index 2125579d5d7..86747151ee5 100644 --- a/security/keys/request_key_auth.c +++ b/security/keys/request_key_auth.c @@ -105,9 +105,9 @@ static void request_key_auth_revoke(struct key *key) kenter("{%d}", key->serial); - if (rka->context) { - put_task_struct(rka->context); - rka->context = NULL; + if (rka->cred) { + put_cred(rka->cred); + rka->cred = NULL; } } /* end request_key_auth_revoke() */ @@ -122,9 +122,9 @@ static void request_key_auth_destroy(struct key *key) kenter("{%d}", key->serial); - if (rka->context) { - put_task_struct(rka->context); - rka->context = NULL; + if (rka->cred) { + put_cred(rka->cred); + rka->cred = NULL; } key_put(rka->target_key); @@ -143,6 +143,7 @@ struct key *request_key_auth_new(struct key *target, const void *callout_info, size_t callout_len, struct key *dest_keyring) { struct request_key_auth *rka, *irka; + const struct cred *cred = current->cred; struct key *authkey = NULL; char desc[20]; int ret; @@ -164,28 +165,25 @@ struct key *request_key_auth_new(struct key *target, const void *callout_info, /* see if the calling process is already servicing the key request of * another process */ - if (current->cred->request_key_auth) { + if (cred->request_key_auth) { /* it is - use that instantiation context here too */ - down_read(¤t->cred->request_key_auth->sem); + down_read(&cred->request_key_auth->sem); /* if the auth key has been revoked, then the key we're * servicing is already instantiated */ - if (test_bit(KEY_FLAG_REVOKED, - ¤t->cred->request_key_auth->flags)) + if (test_bit(KEY_FLAG_REVOKED, &cred->request_key_auth->flags)) goto auth_key_revoked; - irka = current->cred->request_key_auth->payload.data; - rka->context = irka->context; + irka = cred->request_key_auth->payload.data; + rka->cred = get_cred(irka->cred); rka->pid = irka->pid; - get_task_struct(rka->context); - up_read(¤t->cred->request_key_auth->sem); + up_read(&cred->request_key_auth->sem); } else { /* it isn't - use this process as the context */ - rka->context = current; + rka->cred = get_cred(cred); rka->pid = current->pid; - get_task_struct(rka->context); } rka->target_key = key_get(target); @@ -197,7 +195,7 @@ struct key *request_key_auth_new(struct key *target, const void *callout_info, sprintf(desc, "%x", target->serial); authkey = key_alloc(&key_type_request_key_auth, desc, - current_fsuid(), current_fsgid(), current, + cred->fsuid, cred->fsgid, cred, KEY_POS_VIEW | KEY_POS_READ | KEY_POS_SEARCH | KEY_USR_VIEW, KEY_ALLOC_NOT_IN_QUOTA); if (IS_ERR(authkey)) { @@ -205,16 +203,16 @@ struct key *request_key_auth_new(struct key *target, const void *callout_info, goto error_alloc; } - /* construct and attach to the keyring */ + /* construct the auth key */ ret = key_instantiate_and_link(authkey, rka, 0, NULL, NULL); if (ret < 0) goto error_inst; - kleave(" = {%d}", authkey->serial); + kleave(" = {%d,%d}", authkey->serial, atomic_read(&authkey->usage)); return authkey; auth_key_revoked: - up_read(¤t->cred->request_key_auth->sem); + up_read(&cred->request_key_auth->sem); kfree(rka->callout_info); kfree(rka); kleave("= -EKEYREVOKED"); @@ -257,6 +255,7 @@ static int key_get_instantiation_authkey_match(const struct key *key, */ struct key *key_get_instantiation_authkey(key_serial_t target_id) { + const struct cred *cred = current_cred(); struct key *authkey; key_ref_t authkey_ref; @@ -264,7 +263,7 @@ struct key *key_get_instantiation_authkey(key_serial_t target_id) &key_type_request_key_auth, (void *) (unsigned long) target_id, key_get_instantiation_authkey_match, - current); + cred); if (IS_ERR(authkey_ref)) { authkey = ERR_CAST(authkey_ref); diff --git a/security/security.c b/security/security.c index f40a0a04c3c..a55d739c686 100644 --- a/security/security.c +++ b/security/security.c @@ -145,18 +145,13 @@ int security_capget(struct task_struct *target, return security_ops->capget(target, effective, inheritable, permitted); } -int security_capset_check(const kernel_cap_t *effective, - const kernel_cap_t *inheritable, - const kernel_cap_t *permitted) +int security_capset(struct cred *new, const struct cred *old, + const kernel_cap_t *effective, + const kernel_cap_t *inheritable, + const kernel_cap_t *permitted) { - return security_ops->capset_check(effective, inheritable, permitted); -} - -void security_capset_set(const kernel_cap_t *effective, - const kernel_cap_t *inheritable, - const kernel_cap_t *permitted) -{ - security_ops->capset_set(effective, inheritable, permitted); + return security_ops->capset(new, old, + effective, inheritable, permitted); } int security_capable(struct task_struct *tsk, int cap) @@ -228,9 +223,9 @@ void security_bprm_free(struct linux_binprm *bprm) security_ops->bprm_free_security(bprm); } -void security_bprm_apply_creds(struct linux_binprm *bprm, int unsafe) +int security_bprm_apply_creds(struct linux_binprm *bprm, int unsafe) { - security_ops->bprm_apply_creds(bprm, unsafe); + return security_ops->bprm_apply_creds(bprm, unsafe); } void security_bprm_post_apply_creds(struct linux_binprm *bprm) @@ -616,14 +611,19 @@ int security_task_create(unsigned long clone_flags) return security_ops->task_create(clone_flags); } -int security_cred_alloc(struct cred *cred) +void security_cred_free(struct cred *cred) { - return security_ops->cred_alloc_security(cred); + security_ops->cred_free(cred); } -void security_cred_free(struct cred *cred) +int security_prepare_creds(struct cred *new, const struct cred *old, gfp_t gfp) { - security_ops->cred_free(cred); + return security_ops->cred_prepare(new, old, gfp); +} + +void security_commit_creds(struct cred *new, const struct cred *old) +{ + return security_ops->cred_commit(new, old); } int security_task_setuid(uid_t id0, uid_t id1, uid_t id2, int flags) @@ -631,10 +631,10 @@ int security_task_setuid(uid_t id0, uid_t id1, uid_t id2, int flags) return security_ops->task_setuid(id0, id1, id2, flags); } -int security_task_post_setuid(uid_t old_ruid, uid_t old_euid, - uid_t old_suid, int flags) +int security_task_fix_setuid(struct cred *new, const struct cred *old, + int flags) { - return security_ops->task_post_setuid(old_ruid, old_euid, old_suid, flags); + return security_ops->task_fix_setuid(new, old, flags); } int security_task_setgid(gid_t id0, gid_t id1, gid_t id2, int flags) @@ -716,14 +716,9 @@ int security_task_wait(struct task_struct *p) } int security_task_prctl(int option, unsigned long arg2, unsigned long arg3, - unsigned long arg4, unsigned long arg5, long *rc_p) -{ - return security_ops->task_prctl(option, arg2, arg3, arg4, arg5, rc_p); -} - -void security_task_reparent_to_init(struct task_struct *p) + unsigned long arg4, unsigned long arg5) { - security_ops->task_reparent_to_init(p); + return security_ops->task_prctl(option, arg2, arg3, arg4, arg5); } void security_task_to_inode(struct task_struct *p, struct inode *inode) @@ -1123,9 +1118,10 @@ EXPORT_SYMBOL(security_skb_classify_flow); #ifdef CONFIG_KEYS -int security_key_alloc(struct key *key, struct task_struct *tsk, unsigned long flags) +int security_key_alloc(struct key *key, const struct cred *cred, + unsigned long flags) { - return security_ops->key_alloc(key, tsk, flags); + return security_ops->key_alloc(key, cred, flags); } void security_key_free(struct key *key) @@ -1134,9 +1130,9 @@ void security_key_free(struct key *key) } int security_key_permission(key_ref_t key_ref, - struct task_struct *context, key_perm_t perm) + const struct cred *cred, key_perm_t perm) { - return security_ops->key_permission(key_ref, context, perm); + return security_ops->key_permission(key_ref, cred, perm); } int security_key_getsecurity(struct key *key, char **_buffer) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index f20cbd681ba..c71bba78872 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -156,20 +156,20 @@ static int selinux_secmark_enabled(void) return (atomic_read(&selinux_secmark_refcount) > 0); } -/* Allocate and free functions for each kind of security blob. */ - -static int cred_alloc_security(struct cred *cred) +/* + * initialise the security for the init task + */ +static void cred_init_security(void) { + struct cred *cred = (struct cred *) current->cred; struct task_security_struct *tsec; tsec = kzalloc(sizeof(struct task_security_struct), GFP_KERNEL); if (!tsec) - return -ENOMEM; + panic("SELinux: Failed to initialize initial task.\n"); - tsec->osid = tsec->sid = SECINITSID_UNLABELED; + tsec->osid = tsec->sid = SECINITSID_KERNEL; cred->security = tsec; - - return 0; } /* @@ -1378,6 +1378,19 @@ static inline u32 signal_to_av(int sig) return perm; } +/* + * Check permission between a pair of credentials + * fork check, ptrace check, etc. + */ +static int cred_has_perm(const struct cred *actor, + const struct cred *target, + u32 perms) +{ + u32 asid = cred_sid(actor), tsid = cred_sid(target); + + return avc_has_perm(asid, tsid, SECCLASS_PROCESS, perms, NULL); +} + /* * Check permission between a pair of tasks, e.g. signal checks, * fork check, ptrace check, etc. @@ -1820,24 +1833,19 @@ static int selinux_capget(struct task_struct *target, kernel_cap_t *effective, return secondary_ops->capget(target, effective, inheritable, permitted); } -static int selinux_capset_check(const kernel_cap_t *effective, - const kernel_cap_t *inheritable, - const kernel_cap_t *permitted) +static int selinux_capset(struct cred *new, const struct cred *old, + const kernel_cap_t *effective, + const kernel_cap_t *inheritable, + const kernel_cap_t *permitted) { int error; - error = secondary_ops->capset_check(effective, inheritable, permitted); + error = secondary_ops->capset(new, old, + effective, inheritable, permitted); if (error) return error; - return task_has_perm(current, current, PROCESS__SETCAP); -} - -static void selinux_capset_set(const kernel_cap_t *effective, - const kernel_cap_t *inheritable, - const kernel_cap_t *permitted) -{ - secondary_ops->capset_set(effective, inheritable, permitted); + return cred_has_perm(old, new, PROCESS__SETCAP); } static int selinux_capable(struct task_struct *tsk, int cap, int audit) @@ -2244,16 +2252,23 @@ static inline void flush_unauthorized_files(const struct cred *cred, spin_unlock(&files->file_lock); } -static void selinux_bprm_apply_creds(struct linux_binprm *bprm, int unsafe) +static int selinux_bprm_apply_creds(struct linux_binprm *bprm, int unsafe) { struct task_security_struct *tsec; struct bprm_security_struct *bsec; + struct cred *new; u32 sid; int rc; - secondary_ops->bprm_apply_creds(bprm, unsafe); + rc = secondary_ops->bprm_apply_creds(bprm, unsafe); + if (rc < 0) + return rc; - tsec = current_security(); + new = prepare_creds(); + if (!new) + return -ENOMEM; + + tsec = new->security; bsec = bprm->security; sid = bsec->sid; @@ -2268,7 +2283,7 @@ static void selinux_bprm_apply_creds(struct linux_binprm *bprm, int unsafe) PROCESS__SHARE, NULL); if (rc) { bsec->unsafe = 1; - return; + goto out; } } @@ -2292,12 +2307,16 @@ static void selinux_bprm_apply_creds(struct linux_binprm *bprm, int unsafe) PROCESS__PTRACE, NULL); if (rc) { bsec->unsafe = 1; - return; + goto out; } } } tsec->sid = sid; } + +out: + commit_creds(new); + return 0; } /* @@ -3021,6 +3040,7 @@ static int selinux_file_ioctl(struct file *file, unsigned int cmd, static int file_map_prot_check(struct file *file, unsigned long prot, int shared) { const struct cred *cred = current_cred(); + int rc = 0; #ifndef CONFIG_PPC32 if ((prot & PROT_EXEC) && (!file || (!shared && (prot & PROT_WRITE)))) { @@ -3029,9 +3049,9 @@ static int file_map_prot_check(struct file *file, unsigned long prot, int shared * private file mapping that will also be writable. * This has an additional check. */ - int rc = task_has_perm(current, current, PROCESS__EXECMEM); + rc = cred_has_perm(cred, cred, PROCESS__EXECMEM); if (rc) - return rc; + goto error; } #endif @@ -3048,7 +3068,9 @@ static int file_map_prot_check(struct file *file, unsigned long prot, int shared return file_has_perm(cred, file, av); } - return 0; + +error: + return rc; } static int selinux_file_mmap(struct file *file, unsigned long reqprot, @@ -3090,8 +3112,7 @@ static int selinux_file_mprotect(struct vm_area_struct *vma, rc = 0; if (vma->vm_start >= vma->vm_mm->start_brk && vma->vm_end <= vma->vm_mm->brk) { - rc = task_has_perm(current, current, - PROCESS__EXECHEAP); + rc = cred_has_perm(cred, cred, PROCESS__EXECHEAP); } else if (!vma->vm_file && vma->vm_start <= vma->vm_mm->start_stack && vma->vm_end >= vma->vm_mm->start_stack) { @@ -3104,8 +3125,7 @@ static int selinux_file_mprotect(struct vm_area_struct *vma, * modified content. This typically should only * occur for text relocations. */ - rc = file_has_perm(cred, vma->vm_file, - FILE__EXECMOD); + rc = file_has_perm(cred, vma->vm_file, FILE__EXECMOD); } if (rc) return rc; @@ -3211,6 +3231,7 @@ static int selinux_dentry_open(struct file *file, const struct cred *cred) struct file_security_struct *fsec; struct inode *inode; struct inode_security_struct *isec; + inode = file->f_path.dentry->d_inode; fsec = file->f_security; isec = inode->i_security; @@ -3247,38 +3268,41 @@ static int selinux_task_create(unsigned long clone_flags) return task_has_perm(current, current, PROCESS__FORK); } -static int selinux_cred_alloc_security(struct cred *cred) +/* + * detach and free the LSM part of a set of credentials + */ +static void selinux_cred_free(struct cred *cred) { - struct task_security_struct *tsec1, *tsec2; - int rc; - - tsec1 = current_security(); + struct task_security_struct *tsec = cred->security; + cred->security = NULL; + kfree(tsec); +} - rc = cred_alloc_security(cred); - if (rc) - return rc; - tsec2 = cred->security; +/* + * prepare a new set of credentials for modification + */ +static int selinux_cred_prepare(struct cred *new, const struct cred *old, + gfp_t gfp) +{ + const struct task_security_struct *old_tsec; + struct task_security_struct *tsec; - tsec2->osid = tsec1->osid; - tsec2->sid = tsec1->sid; + old_tsec = old->security; - /* Retain the exec, fs, key, and sock SIDs across fork */ - tsec2->exec_sid = tsec1->exec_sid; - tsec2->create_sid = tsec1->create_sid; - tsec2->keycreate_sid = tsec1->keycreate_sid; - tsec2->sockcreate_sid = tsec1->sockcreate_sid; + tsec = kmemdup(old_tsec, sizeof(struct task_security_struct), gfp); + if (!tsec) + return -ENOMEM; + new->security = tsec; return 0; } /* - * detach and free the LSM part of a set of credentials + * commit new credentials */ -static void selinux_cred_free(struct cred *cred) +static void selinux_cred_commit(struct cred *new, const struct cred *old) { - struct task_security_struct *tsec = cred->security; - cred->security = NULL; - kfree(tsec); + secondary_ops->cred_commit(new, old); } static int selinux_task_setuid(uid_t id0, uid_t id1, uid_t id2, int flags) @@ -3292,9 +3316,10 @@ static int selinux_task_setuid(uid_t id0, uid_t id1, uid_t id2, int flags) return 0; } -static int selinux_task_post_setuid(uid_t id0, uid_t id1, uid_t id2, int flags) +static int selinux_task_fix_setuid(struct cred *new, const struct cred *old, + int flags) { - return secondary_ops->task_post_setuid(id0, id1, id2, flags); + return secondary_ops->task_fix_setuid(new, old, flags); } static int selinux_task_setgid(gid_t id0, gid_t id1, gid_t id2, int flags) @@ -3368,7 +3393,7 @@ static int selinux_task_setrlimit(unsigned int resource, struct rlimit *new_rlim /* Control the ability to change the hard limit (whether lowering or raising it), so that the hard limit can later be used as a safe reset point for the soft limit - upon context transitions. See selinux_bprm_apply_creds. */ + upon context transitions. See selinux_bprm_committing_creds. */ if (old_rlim->rlim_max != new_rlim->rlim_max) return task_has_perm(current, current, PROCESS__SETRLIMIT); @@ -3422,13 +3447,12 @@ static int selinux_task_prctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, - unsigned long arg5, - long *rc_p) + unsigned long arg5) { /* The current prctl operations do not appear to require any SELinux controls since they merely observe or modify the state of the current process. */ - return secondary_ops->task_prctl(option, arg2, arg3, arg4, arg5, rc_p); + return secondary_ops->task_prctl(option, arg2, arg3, arg4, arg5); } static int selinux_task_wait(struct task_struct *p) @@ -3436,18 +3460,6 @@ static int selinux_task_wait(struct task_struct *p) return task_has_perm(p, current, PROCESS__SIGCHLD); } -static void selinux_task_reparent_to_init(struct task_struct *p) -{ - struct task_security_struct *tsec; - - secondary_ops->task_reparent_to_init(p); - - tsec = p->cred->security; - tsec->osid = tsec->sid; - tsec->sid = SECINITSID_KERNEL; - return; -} - static void selinux_task_to_inode(struct task_struct *p, struct inode *inode) { @@ -5325,7 +5337,8 @@ static int selinux_setprocattr(struct task_struct *p, { struct task_security_struct *tsec; struct task_struct *tracer; - u32 sid = 0; + struct cred *new; + u32 sid = 0, ptsid; int error; char *str = value; @@ -5372,86 +5385,75 @@ static int selinux_setprocattr(struct task_struct *p, return error; } + new = prepare_creds(); + if (!new) + return -ENOMEM; + /* Permission checking based on the specified context is performed during the actual operation (execve, open/mkdir/...), when we know the full context of the - operation. See selinux_bprm_set_security for the execve + operation. See selinux_bprm_set_creds for the execve checks and may_create for the file creation checks. The operation will then fail if the context is not permitted. */ - tsec = p->cred->security; - if (!strcmp(name, "exec")) + tsec = new->security; + if (!strcmp(name, "exec")) { tsec->exec_sid = sid; - else if (!strcmp(name, "fscreate")) + } else if (!strcmp(name, "fscreate")) { tsec->create_sid = sid; - else if (!strcmp(name, "keycreate")) { + } else if (!strcmp(name, "keycreate")) { error = may_create_key(sid, p); if (error) - return error; + goto abort_change; tsec->keycreate_sid = sid; - } else if (!strcmp(name, "sockcreate")) + } else if (!strcmp(name, "sockcreate")) { tsec->sockcreate_sid = sid; - else if (!strcmp(name, "current")) { - struct av_decision avd; - + } else if (!strcmp(name, "current")) { + error = -EINVAL; if (sid == 0) - return -EINVAL; - /* - * SELinux allows to change context in the following case only. - * - Single threaded processes. - * - Multi threaded processes intend to change its context into - * more restricted domain (defined by TYPEBOUNDS statement). - */ - if (atomic_read(&p->mm->mm_users) != 1) { - struct task_struct *g, *t; - struct mm_struct *mm = p->mm; - read_lock(&tasklist_lock); - do_each_thread(g, t) { - if (t->mm == mm && t != p) { - read_unlock(&tasklist_lock); - error = security_bounded_transition(tsec->sid, sid); - if (!error) - goto boundary_ok; - - return error; - } - } while_each_thread(g, t); - read_unlock(&tasklist_lock); + goto abort_change; + + /* Only allow single threaded processes to change context */ + error = -EPERM; + if (!is_single_threaded(p)) { + error = security_bounded_transition(tsec->sid, sid); + if (error) + goto abort_change; } -boundary_ok: /* Check permissions for the transition. */ error = avc_has_perm(tsec->sid, sid, SECCLASS_PROCESS, PROCESS__DYNTRANSITION, NULL); if (error) - return error; + goto abort_change; /* Check for ptracing, and update the task SID if ok. Otherwise, leave SID unchanged and fail. */ + ptsid = 0; task_lock(p); - rcu_read_lock(); tracer = tracehook_tracer_task(p); - if (tracer != NULL) { - u32 ptsid = task_sid(tracer); - rcu_read_unlock(); - error = avc_has_perm_noaudit(ptsid, sid, - SECCLASS_PROCESS, - PROCESS__PTRACE, 0, &avd); - if (!error) - tsec->sid = sid; - task_unlock(p); - avc_audit(ptsid, sid, SECCLASS_PROCESS, - PROCESS__PTRACE, &avd, error, NULL); + if (tracer) + ptsid = task_sid(tracer); + task_unlock(p); + + if (tracer) { + error = avc_has_perm(ptsid, sid, SECCLASS_PROCESS, + PROCESS__PTRACE, NULL); if (error) - return error; - } else { - rcu_read_unlock(); - tsec->sid = sid; - task_unlock(p); + goto abort_change; } - } else - return -EINVAL; + tsec->sid = sid; + } else { + error = -EINVAL; + goto abort_change; + } + + commit_creds(new); return size; + +abort_change: + abort_creds(new); + return error; } static int selinux_secid_to_secctx(u32 secid, char **secdata, u32 *seclen) @@ -5471,23 +5473,21 @@ static void selinux_release_secctx(char *secdata, u32 seclen) #ifdef CONFIG_KEYS -static int selinux_key_alloc(struct key *k, struct task_struct *tsk, +static int selinux_key_alloc(struct key *k, const struct cred *cred, unsigned long flags) { - const struct task_security_struct *__tsec; + const struct task_security_struct *tsec; struct key_security_struct *ksec; ksec = kzalloc(sizeof(struct key_security_struct), GFP_KERNEL); if (!ksec) return -ENOMEM; - rcu_read_lock(); - __tsec = __task_cred(tsk)->security; - if (__tsec->keycreate_sid) - ksec->sid = __tsec->keycreate_sid; + tsec = cred->security; + if (tsec->keycreate_sid) + ksec->sid = tsec->keycreate_sid; else - ksec->sid = __tsec->sid; - rcu_read_unlock(); + ksec->sid = tsec->sid; k->security = ksec; return 0; @@ -5502,8 +5502,8 @@ static void selinux_key_free(struct key *k) } static int selinux_key_permission(key_ref_t key_ref, - struct task_struct *ctx, - key_perm_t perm) + const struct cred *cred, + key_perm_t perm) { struct key *key; struct key_security_struct *ksec; @@ -5515,7 +5515,7 @@ static int selinux_key_permission(key_ref_t key_ref, if (perm == 0) return 0; - sid = task_sid(ctx); + sid = cred_sid(cred); key = key_ref_to_ptr(key_ref); ksec = key->security; @@ -5545,8 +5545,7 @@ static struct security_operations selinux_ops = { .ptrace_may_access = selinux_ptrace_may_access, .ptrace_traceme = selinux_ptrace_traceme, .capget = selinux_capget, - .capset_check = selinux_capset_check, - .capset_set = selinux_capset_set, + .capset = selinux_capset, .sysctl = selinux_sysctl, .capable = selinux_capable, .quotactl = selinux_quotactl, @@ -5621,10 +5620,11 @@ static struct security_operations selinux_ops = { .dentry_open = selinux_dentry_open, .task_create = selinux_task_create, - .cred_alloc_security = selinux_cred_alloc_security, .cred_free = selinux_cred_free, + .cred_prepare = selinux_cred_prepare, + .cred_commit = selinux_cred_commit, .task_setuid = selinux_task_setuid, - .task_post_setuid = selinux_task_post_setuid, + .task_fix_setuid = selinux_task_fix_setuid, .task_setgid = selinux_task_setgid, .task_setpgid = selinux_task_setpgid, .task_getpgid = selinux_task_getpgid, @@ -5641,7 +5641,6 @@ static struct security_operations selinux_ops = { .task_kill = selinux_task_kill, .task_wait = selinux_task_wait, .task_prctl = selinux_task_prctl, - .task_reparent_to_init = selinux_task_reparent_to_init, .task_to_inode = selinux_task_to_inode, .ipc_permission = selinux_ipc_permission, @@ -5737,8 +5736,6 @@ static struct security_operations selinux_ops = { static __init int selinux_init(void) { - struct task_security_struct *tsec; - if (!security_module_enable(&selinux_ops)) { selinux_enabled = 0; return 0; @@ -5752,10 +5749,7 @@ static __init int selinux_init(void) printk(KERN_INFO "SELinux: Initializing.\n"); /* Set the security state for the initial task. */ - if (cred_alloc_security(current->cred)) - panic("SELinux: Failed to initialize initial task.\n"); - tsec = current->cred->security; - tsec->osid = tsec->sid = SECINITSID_KERNEL; + cred_init_security(); sel_inode_cache = kmem_cache_create("selinux_inode_security", sizeof(struct inode_security_struct), diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 11167fd567b..e952b397153 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -104,8 +104,7 @@ static int smack_ptrace_may_access(struct task_struct *ctp, unsigned int mode) if (rc != 0) return rc; - rc = smk_access(current->cred->security, ctp->cred->security, - MAY_READWRITE); + rc = smk_access(current_security(), task_security(ctp), MAY_READWRITE); if (rc != 0 && capable(CAP_MAC_OVERRIDE)) return 0; return rc; @@ -127,8 +126,7 @@ static int smack_ptrace_traceme(struct task_struct *ptp) if (rc != 0) return rc; - rc = smk_access(ptp->cred->security, current->cred->security, - MAY_READWRITE); + rc = smk_access(task_security(ptp), current_security(), MAY_READWRITE); if (rc != 0 && has_capability(ptp, CAP_MAC_OVERRIDE)) return 0; return rc; @@ -976,22 +974,6 @@ static int smack_file_receive(struct file *file) * Task hooks */ -/** - * smack_cred_alloc_security - "allocate" a task cred blob - * @cred: the task creds in need of a blob - * - * Smack isn't using copies of blobs. Everyone - * points to an immutable list. No alloc required. - * No data copy required. - * - * Always returns 0 - */ -static int smack_cred_alloc_security(struct cred *cred) -{ - cred->security = current_security(); - return 0; -} - /** * smack_cred_free - "free" task-level security credentials * @cred: the credentials in question @@ -1005,6 +987,30 @@ static void smack_cred_free(struct cred *cred) cred->security = NULL; } +/** + * smack_cred_prepare - prepare new set of credentials for modification + * @new: the new credentials + * @old: the original credentials + * @gfp: the atomicity of any memory allocations + * + * Prepare a new set of credentials for modification. + */ +static int smack_cred_prepare(struct cred *new, const struct cred *old, + gfp_t gfp) +{ + new->security = old->security; + return 0; +} + +/* + * commit new credentials + * @new: the new credentials + * @old: the original credentials + */ +static void smack_cred_commit(struct cred *new, const struct cred *old) +{ +} + /** * smack_task_setpgid - Smack check on setting pgid * @p: the task object @@ -2036,6 +2042,7 @@ static int smack_getprocattr(struct task_struct *p, char *name, char **value) static int smack_setprocattr(struct task_struct *p, char *name, void *value, size_t size) { + struct cred *new; char *newsmack; /* @@ -2058,7 +2065,11 @@ static int smack_setprocattr(struct task_struct *p, char *name, if (newsmack == NULL) return -EINVAL; - p->cred->security = newsmack; + new = prepare_creds(); + if (!new) + return -ENOMEM; + new->security = newsmack; + commit_creds(new); return size; } @@ -2354,17 +2365,17 @@ static int smack_inet_conn_request(struct sock *sk, struct sk_buff *skb, /** * smack_key_alloc - Set the key security blob * @key: object - * @tsk: the task associated with the key + * @cred: the credentials to use * @flags: unused * * No allocation required * * Returns 0 */ -static int smack_key_alloc(struct key *key, struct task_struct *tsk, +static int smack_key_alloc(struct key *key, const struct cred *cred, unsigned long flags) { - key->security = tsk->cred->security; + key->security = cred->security; return 0; } @@ -2382,14 +2393,14 @@ static void smack_key_free(struct key *key) /* * smack_key_permission - Smack access on a key * @key_ref: gets to the object - * @context: task involved + * @cred: the credentials to use * @perm: unused * * Return 0 if the task has read and write to the object, * an error code otherwise */ static int smack_key_permission(key_ref_t key_ref, - struct task_struct *context, key_perm_t perm) + const struct cred *cred, key_perm_t perm) { struct key *keyp; @@ -2405,11 +2416,10 @@ static int smack_key_permission(key_ref_t key_ref, /* * This should not occur */ - if (context->cred->security == NULL) + if (cred->security == NULL) return -EACCES; - return smk_access(context->cred->security, keyp->security, - MAY_READWRITE); + return smk_access(cred->security, keyp->security, MAY_READWRITE); } #endif /* CONFIG_KEYS */ @@ -2580,8 +2590,7 @@ struct security_operations smack_ops = { .ptrace_may_access = smack_ptrace_may_access, .ptrace_traceme = smack_ptrace_traceme, .capget = cap_capget, - .capset_check = cap_capset_check, - .capset_set = cap_capset_set, + .capset = cap_capset, .capable = cap_capable, .syslog = smack_syslog, .settime = cap_settime, @@ -2630,9 +2639,10 @@ struct security_operations smack_ops = { .file_send_sigiotask = smack_file_send_sigiotask, .file_receive = smack_file_receive, - .cred_alloc_security = smack_cred_alloc_security, .cred_free = smack_cred_free, - .task_post_setuid = cap_task_post_setuid, + .cred_prepare = smack_cred_prepare, + .cred_commit = smack_cred_commit, + .task_fix_setuid = cap_task_fix_setuid, .task_setpgid = smack_task_setpgid, .task_getpgid = smack_task_getpgid, .task_getsid = smack_task_getsid, @@ -2645,7 +2655,6 @@ struct security_operations smack_ops = { .task_movememory = smack_task_movememory, .task_kill = smack_task_kill, .task_wait = smack_task_wait, - .task_reparent_to_init = cap_task_reparent_to_init, .task_to_inode = smack_task_to_inode, .task_prctl = cap_task_prctl, @@ -2721,6 +2730,8 @@ struct security_operations smack_ops = { */ static __init int smack_init(void) { + struct cred *cred; + if (!security_module_enable(&smack_ops)) return 0; @@ -2729,7 +2740,8 @@ static __init int smack_init(void) /* * Set the security state for the initial task. */ - current->cred->security = &smack_known_floor.smk_known; + cred = (struct cred *) current->cred; + cred->security = &smack_known_floor.smk_known; /* * Initialize locks -- cgit v1.2.3-70-g09d2 From a6f76f23d297f70e2a6b3ec607f7aeeea9e37e8d Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Nov 2008 10:39:24 +1100 Subject: CRED: Make execve() take advantage of copy-on-write credentials Make execve() take advantage of copy-on-write credentials, allowing it to set up the credentials in advance, and then commit the whole lot after the point of no return. This patch and the preceding patches have been tested with the LTP SELinux testsuite. This patch makes several logical sets of alteration: (1) execve(). The credential bits from struct linux_binprm are, for the most part, replaced with a single credentials pointer (bprm->cred). This means that all the creds can be calculated in advance and then applied at the point of no return with no possibility of failure. I would like to replace bprm->cap_effective with: cap_isclear(bprm->cap_effective) but this seems impossible due to special behaviour for processes of pid 1 (they always retain their parent's capability masks where normally they'd be changed - see cap_bprm_set_creds()). The following sequence of events now happens: (a) At the start of do_execve, the current task's cred_exec_mutex is locked to prevent PTRACE_ATTACH from obsoleting the calculation of creds that we make. (a) prepare_exec_creds() is then called to make a copy of the current task's credentials and prepare it. This copy is then assigned to bprm->cred. This renders security_bprm_alloc() and security_bprm_free() unnecessary, and so they've been removed. (b) The determination of unsafe execution is now performed immediately after (a) rather than later on in the code. The result is stored in bprm->unsafe for future reference. (c) prepare_binprm() is called, possibly multiple times. (i) This applies the result of set[ug]id binaries to the new creds attached to bprm->cred. Personality bit clearance is recorded, but now deferred on the basis that the exec procedure may yet fail. (ii) This then calls the new security_bprm_set_creds(). This should calculate the new LSM and capability credentials into *bprm->cred. This folds together security_bprm_set() and parts of security_bprm_apply_creds() (these two have been removed). Anything that might fail must be done at this point. (iii) bprm->cred_prepared is set to 1. bprm->cred_prepared is 0 on the first pass of the security calculations, and 1 on all subsequent passes. This allows SELinux in (ii) to base its calculations only on the initial script and not on the interpreter. (d) flush_old_exec() is called to commit the task to execution. This performs the following steps with regard to credentials: (i) Clear pdeath_signal and set dumpable on certain circumstances that may not be covered by commit_creds(). (ii) Clear any bits in current->personality that were deferred from (c.i). (e) install_exec_creds() [compute_creds() as was] is called to install the new credentials. This performs the following steps with regard to credentials: (i) Calls security_bprm_committing_creds() to apply any security requirements, such as flushing unauthorised files in SELinux, that must be done before the credentials are changed. This is made up of bits of security_bprm_apply_creds() and security_bprm_post_apply_creds(), both of which have been removed. This function is not allowed to fail; anything that might fail must have been done in (c.ii). (ii) Calls commit_creds() to apply the new credentials in a single assignment (more or less). Possibly pdeath_signal and dumpable should be part of struct creds. (iii) Unlocks the task's cred_replace_mutex, thus allowing PTRACE_ATTACH to take place. (iv) Clears The bprm->cred pointer as the credentials it was holding are now immutable. (v) Calls security_bprm_committed_creds() to apply any security alterations that must be done after the creds have been changed. SELinux uses this to flush signals and signal handlers. (f) If an error occurs before (d.i), bprm_free() will call abort_creds() to destroy the proposed new credentials and will then unlock cred_replace_mutex. No changes to the credentials will have been made. (2) LSM interface. A number of functions have been changed, added or removed: (*) security_bprm_alloc(), ->bprm_alloc_security() (*) security_bprm_free(), ->bprm_free_security() Removed in favour of preparing new credentials and modifying those. (*) security_bprm_apply_creds(), ->bprm_apply_creds() (*) security_bprm_post_apply_creds(), ->bprm_post_apply_creds() Removed; split between security_bprm_set_creds(), security_bprm_committing_creds() and security_bprm_committed_creds(). (*) security_bprm_set(), ->bprm_set_security() Removed; folded into security_bprm_set_creds(). (*) security_bprm_set_creds(), ->bprm_set_creds() New. The new credentials in bprm->creds should be checked and set up as appropriate. bprm->cred_prepared is 0 on the first call, 1 on the second and subsequent calls. (*) security_bprm_committing_creds(), ->bprm_committing_creds() (*) security_bprm_committed_creds(), ->bprm_committed_creds() New. Apply the security effects of the new credentials. This includes closing unauthorised files in SELinux. This function may not fail. When the former is called, the creds haven't yet been applied to the process; when the latter is called, they have. The former may access bprm->cred, the latter may not. (3) SELinux. SELinux has a number of changes, in addition to those to support the LSM interface changes mentioned above: (a) The bprm_security_struct struct has been removed in favour of using the credentials-under-construction approach. (c) flush_unauthorized_files() now takes a cred pointer and passes it on to inode_has_perm(), file_has_perm() and dentry_open(). Signed-off-by: David Howells Acked-by: James Morris Acked-by: Serge Hallyn Signed-off-by: James Morris --- arch/x86/ia32/ia32_aout.c | 2 +- fs/binfmt_aout.c | 2 +- fs/binfmt_elf.c | 2 +- fs/binfmt_elf_fdpic.c | 2 +- fs/binfmt_flat.c | 2 +- fs/binfmt_som.c | 2 +- fs/compat.c | 42 +++--- fs/exec.c | 149 +++++++++++--------- fs/internal.h | 6 + include/linux/audit.h | 16 --- include/linux/binfmts.h | 16 ++- include/linux/cred.h | 3 +- include/linux/key.h | 2 - include/linux/security.h | 103 +++++--------- kernel/cred.c | 46 ++++++- security/capability.c | 19 +-- security/commoncap.c | 152 ++++++++++---------- security/keys/process_keys.c | 42 ------ security/root_plug.c | 13 +- security/security.c | 26 ++-- security/selinux/hooks.c | 283 ++++++++++++++++---------------------- security/selinux/include/objsec.h | 11 -- security/smack/smack_lsm.c | 3 +- 23 files changed, 429 insertions(+), 515 deletions(-) (limited to 'kernel') diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index 127ec3f0721..2a4d073d2cf 100644 --- a/arch/x86/ia32/ia32_aout.c +++ b/arch/x86/ia32/ia32_aout.c @@ -327,7 +327,7 @@ static int load_aout_binary(struct linux_binprm *bprm, struct pt_regs *regs) current->mm->cached_hole_size = 0; current->mm->mmap = NULL; - compute_creds(bprm); + install_exec_creds(bprm); current->flags &= ~PF_FORKNOEXEC; if (N_MAGIC(ex) == OMAGIC) { diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index 204cfd1d767..f1f3f4192a6 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -320,7 +320,7 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) current->mm->free_area_cache = current->mm->mmap_base; current->mm->cached_hole_size = 0; - compute_creds(bprm); + install_exec_creds(bprm); current->flags &= ~PF_FORKNOEXEC; #ifdef __sparc__ if (N_MAGIC(ex) == NMAGIC) { diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 9142ff5dc8e..f458c1217c5 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -956,7 +956,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) } #endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */ - compute_creds(bprm); + install_exec_creds(bprm); current->flags &= ~PF_FORKNOEXEC; retval = create_elf_tables(bprm, &loc->elf_ex, load_addr, interp_load_addr); diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 45dabd59936..aa5b43205e3 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -404,7 +404,7 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, current->mm->start_stack = current->mm->start_brk + stack_size; #endif - compute_creds(bprm); + install_exec_creds(bprm); current->flags &= ~PF_FORKNOEXEC; if (create_elf_fdpic_tables(bprm, current->mm, &exec_params, &interp_params) < 0) diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index ccb781a6a80..7bbd5c6b372 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -880,7 +880,7 @@ static int load_flat_binary(struct linux_binprm * bprm, struct pt_regs * regs) (libinfo.lib_list[j].loaded)? libinfo.lib_list[j].start_data:UNLOADED_LIB; - compute_creds(bprm); + install_exec_creds(bprm); current->flags &= ~PF_FORKNOEXEC; set_binfmt(&flat_format); diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c index 74e587a5279..08644a61616 100644 --- a/fs/binfmt_som.c +++ b/fs/binfmt_som.c @@ -255,7 +255,7 @@ load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs) kfree(hpuxhdr); set_binfmt(&som_format); - compute_creds(bprm); + install_exec_creds(bprm); setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT); create_som_tables(bprm); diff --git a/fs/compat.c b/fs/compat.c index e5f49f53850..d1ece79b641 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1393,10 +1393,20 @@ int compat_do_execve(char * filename, if (!bprm) goto out_ret; + retval = mutex_lock_interruptible(¤t->cred_exec_mutex); + if (retval < 0) + goto out_free; + + retval = -ENOMEM; + bprm->cred = prepare_exec_creds(); + if (!bprm->cred) + goto out_unlock; + check_unsafe_exec(bprm); + file = open_exec(filename); retval = PTR_ERR(file); if (IS_ERR(file)) - goto out_kfree; + goto out_unlock; sched_exec(); @@ -1410,14 +1420,10 @@ int compat_do_execve(char * filename, bprm->argc = compat_count(argv, MAX_ARG_STRINGS); if ((retval = bprm->argc) < 0) - goto out_mm; + goto out; bprm->envc = compat_count(envp, MAX_ARG_STRINGS); if ((retval = bprm->envc) < 0) - goto out_mm; - - retval = security_bprm_alloc(bprm); - if (retval) goto out; retval = prepare_binprm(bprm); @@ -1438,19 +1444,16 @@ int compat_do_execve(char * filename, goto out; retval = search_binary_handler(bprm, regs); - if (retval >= 0) { - /* execve success */ - security_bprm_free(bprm); - acct_update_integrals(current); - free_bprm(bprm); - return retval; - } + if (retval < 0) + goto out; -out: - if (bprm->security) - security_bprm_free(bprm); + /* execve succeeded */ + mutex_unlock(¤t->cred_exec_mutex); + acct_update_integrals(current); + free_bprm(bprm); + return retval; -out_mm: +out: if (bprm->mm) mmput(bprm->mm); @@ -1460,7 +1463,10 @@ out_file: fput(bprm->file); } -out_kfree: +out_unlock: + mutex_unlock(¤t->cred_exec_mutex); + +out_free: free_bprm(bprm); out_ret: diff --git a/fs/exec.c b/fs/exec.c index 9bd3559ddec..32f13e29941 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -55,6 +55,7 @@ #include #include #include +#include "internal.h" #ifdef __alpha__ /* for /sbin/loader handling in search_binary_handler() */ @@ -1007,15 +1008,17 @@ int flush_old_exec(struct linux_binprm * bprm) */ current->mm->task_size = TASK_SIZE; - if (bprm->e_uid != current_euid() || - bprm->e_gid != current_egid()) { - set_dumpable(current->mm, suid_dumpable); + /* install the new credentials */ + if (bprm->cred->uid != current_euid() || + bprm->cred->gid != current_egid()) { current->pdeath_signal = 0; } else if (file_permission(bprm->file, MAY_READ) || - (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP)) { + bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP) { set_dumpable(current->mm, suid_dumpable); } + current->personality &= ~bprm->per_clear; + /* An exec changes our domain. We are no longer part of the thread group */ @@ -1032,13 +1035,50 @@ out: EXPORT_SYMBOL(flush_old_exec); +/* + * install the new credentials for this executable + */ +void install_exec_creds(struct linux_binprm *bprm) +{ + security_bprm_committing_creds(bprm); + + commit_creds(bprm->cred); + bprm->cred = NULL; + + /* cred_exec_mutex must be held at least to this point to prevent + * ptrace_attach() from altering our determination of the task's + * credentials; any time after this it may be unlocked */ + + security_bprm_committed_creds(bprm); +} +EXPORT_SYMBOL(install_exec_creds); + +/* + * determine how safe it is to execute the proposed program + * - the caller must hold current->cred_exec_mutex to protect against + * PTRACE_ATTACH + */ +void check_unsafe_exec(struct linux_binprm *bprm) +{ + struct task_struct *p = current; + + bprm->unsafe = tracehook_unsafe_exec(p); + + if (atomic_read(&p->fs->count) > 1 || + atomic_read(&p->files->count) > 1 || + atomic_read(&p->sighand->count) > 1) + bprm->unsafe |= LSM_UNSAFE_SHARE; +} + /* * Fill the binprm structure from the inode. * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes + * + * This may be called multiple times for binary chains (scripts for example). */ int prepare_binprm(struct linux_binprm *bprm) { - int mode; + umode_t mode; struct inode * inode = bprm->file->f_path.dentry->d_inode; int retval; @@ -1046,14 +1086,15 @@ int prepare_binprm(struct linux_binprm *bprm) if (bprm->file->f_op == NULL) return -EACCES; - bprm->e_uid = current_euid(); - bprm->e_gid = current_egid(); + /* clear any previous set[ug]id data from a previous binary */ + bprm->cred->euid = current_euid(); + bprm->cred->egid = current_egid(); - if(!(bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID)) { + if (!(bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID)) { /* Set-uid? */ if (mode & S_ISUID) { - current->personality &= ~PER_CLEAR_ON_SETID; - bprm->e_uid = inode->i_uid; + bprm->per_clear |= PER_CLEAR_ON_SETID; + bprm->cred->euid = inode->i_uid; } /* Set-gid? */ @@ -1063,50 +1104,23 @@ int prepare_binprm(struct linux_binprm *bprm) * executable. */ if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) { - current->personality &= ~PER_CLEAR_ON_SETID; - bprm->e_gid = inode->i_gid; + bprm->per_clear |= PER_CLEAR_ON_SETID; + bprm->cred->egid = inode->i_gid; } } /* fill in binprm security blob */ - retval = security_bprm_set(bprm); + retval = security_bprm_set_creds(bprm); if (retval) return retval; + bprm->cred_prepared = 1; - memset(bprm->buf,0,BINPRM_BUF_SIZE); - return kernel_read(bprm->file,0,bprm->buf,BINPRM_BUF_SIZE); + memset(bprm->buf, 0, BINPRM_BUF_SIZE); + return kernel_read(bprm->file, 0, bprm->buf, BINPRM_BUF_SIZE); } EXPORT_SYMBOL(prepare_binprm); -static int unsafe_exec(struct task_struct *p) -{ - int unsafe = tracehook_unsafe_exec(p); - - if (atomic_read(&p->fs->count) > 1 || - atomic_read(&p->files->count) > 1 || - atomic_read(&p->sighand->count) > 1) - unsafe |= LSM_UNSAFE_SHARE; - - return unsafe; -} - -void compute_creds(struct linux_binprm *bprm) -{ - int unsafe; - - if (bprm->e_uid != current_uid()) - current->pdeath_signal = 0; - exec_keys(current); - - task_lock(current); - unsafe = unsafe_exec(current); - security_bprm_apply_creds(bprm, unsafe); - task_unlock(current); - security_bprm_post_apply_creds(bprm); -} -EXPORT_SYMBOL(compute_creds); - /* * Arguments are '\0' separated strings found at the location bprm->p * points to; chop off the first by relocating brpm->p to right after @@ -1259,6 +1273,8 @@ EXPORT_SYMBOL(search_binary_handler); void free_bprm(struct linux_binprm *bprm) { free_arg_pages(bprm); + if (bprm->cred) + abort_creds(bprm->cred); kfree(bprm); } @@ -1284,10 +1300,20 @@ int do_execve(char * filename, if (!bprm) goto out_files; + retval = mutex_lock_interruptible(¤t->cred_exec_mutex); + if (retval < 0) + goto out_free; + + retval = -ENOMEM; + bprm->cred = prepare_exec_creds(); + if (!bprm->cred) + goto out_unlock; + check_unsafe_exec(bprm); + file = open_exec(filename); retval = PTR_ERR(file); if (IS_ERR(file)) - goto out_kfree; + goto out_unlock; sched_exec(); @@ -1301,14 +1327,10 @@ int do_execve(char * filename, bprm->argc = count(argv, MAX_ARG_STRINGS); if ((retval = bprm->argc) < 0) - goto out_mm; + goto out; bprm->envc = count(envp, MAX_ARG_STRINGS); if ((retval = bprm->envc) < 0) - goto out_mm; - - retval = security_bprm_alloc(bprm); - if (retval) goto out; retval = prepare_binprm(bprm); @@ -1330,21 +1352,18 @@ int do_execve(char * filename, current->flags &= ~PF_KTHREAD; retval = search_binary_handler(bprm,regs); - if (retval >= 0) { - /* execve success */ - security_bprm_free(bprm); - acct_update_integrals(current); - free_bprm(bprm); - if (displaced) - put_files_struct(displaced); - return retval; - } + if (retval < 0) + goto out; -out: - if (bprm->security) - security_bprm_free(bprm); + /* execve succeeded */ + mutex_unlock(¤t->cred_exec_mutex); + acct_update_integrals(current); + free_bprm(bprm); + if (displaced) + put_files_struct(displaced); + return retval; -out_mm: +out: if (bprm->mm) mmput (bprm->mm); @@ -1353,7 +1372,11 @@ out_file: allow_write_access(bprm->file); fput(bprm->file); } -out_kfree: + +out_unlock: + mutex_unlock(¤t->cred_exec_mutex); + +out_free: free_bprm(bprm); out_files: diff --git a/fs/internal.h b/fs/internal.h index 80aa9a02337..53af885f173 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -10,6 +10,7 @@ */ struct super_block; +struct linux_binprm; /* * block_dev.c @@ -39,6 +40,11 @@ static inline int sb_is_blkdev_sb(struct super_block *sb) */ extern void __init chrdev_init(void); +/* + * exec.c + */ +extern void check_unsafe_exec(struct linux_binprm *); + /* * namespace.c */ diff --git a/include/linux/audit.h b/include/linux/audit.h index 0b2fcb698a6..e8ce2c4c7ac 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -508,22 +508,6 @@ static inline int audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat) return 0; } -/* - * ieieeeeee, an audit function without a return code! - * - * This function might fail! I decided that it didn't matter. We are too late - * to fail the syscall and the information isn't REQUIRED for any purpose. It's - * just nice to have. We should be able to look at past audit logs to figure - * out this process's current cap set along with the fcaps from the PATH record - * and use that to come up with the final set. Yeah, its ugly, but all the info - * is still in the audit log. So I'm not going to bother mentioning we failed - * if we couldn't allocate memory. - * - * If someone changes their mind they could create the aux record earlier and - * then search here and use that earlier allocation. But I don't wanna. - * - * -Eric - */ static inline int audit_log_bprm_fcaps(struct linux_binprm *bprm, const struct cred *new, const struct cred *old) diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 7394b5b349f..6cbfbe29718 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -35,16 +35,20 @@ struct linux_binprm{ struct mm_struct *mm; unsigned long p; /* current top of mem */ unsigned int sh_bang:1, - misc_bang:1; + misc_bang:1, + cred_prepared:1,/* true if creds already prepared (multiple + * preps happen for interpreters) */ + cap_effective:1;/* true if has elevated effective capabilities, + * false if not; except for init which inherits + * its parent's caps anyway */ #ifdef __alpha__ unsigned int taso:1; #endif unsigned int recursion_depth; struct file * file; - int e_uid, e_gid; - kernel_cap_t cap_post_exec_permitted; - bool cap_effective; - void *security; + struct cred *cred; /* new credentials */ + int unsafe; /* how unsafe this exec is (mask of LSM_UNSAFE_*) */ + unsigned int per_clear; /* bits to clear in current->personality */ int argc, envc; char * filename; /* Name of binary as seen by procps */ char * interp; /* Name of the binary really executed. Most @@ -101,7 +105,7 @@ extern int setup_arg_pages(struct linux_binprm * bprm, int executable_stack); extern int bprm_mm_init(struct linux_binprm *bprm); extern int copy_strings_kernel(int argc,char ** argv,struct linux_binprm *bprm); -extern void compute_creds(struct linux_binprm *binprm); +extern void install_exec_creds(struct linux_binprm *bprm); extern int do_coredump(long signr, int exit_code, struct pt_regs * regs); extern int set_binfmt(struct linux_binfmt *new); extern void free_bprm(struct linux_binprm *); diff --git a/include/linux/cred.h b/include/linux/cred.h index eaf6fa695a0..8edb4d1d542 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -84,8 +84,6 @@ struct thread_group_cred { struct key *process_keyring; /* keyring private to this process */ struct rcu_head rcu; /* RCU deletion hook */ }; - -extern void release_tgcred(struct cred *cred); #endif /* @@ -144,6 +142,7 @@ struct cred { extern void __put_cred(struct cred *); extern int copy_creds(struct task_struct *, unsigned long); extern struct cred *prepare_creds(void); +extern struct cred *prepare_exec_creds(void); extern struct cred *prepare_usermodehelper_creds(void); extern int commit_creds(struct cred *); extern void abort_creds(struct cred *); diff --git a/include/linux/key.h b/include/linux/key.h index 69ecf0934b0..21d32a142c0 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -278,7 +278,6 @@ extern ctl_table key_sysctls[]; * the userspace interface */ extern int install_thread_keyring_to_cred(struct cred *cred); -extern int exec_keys(struct task_struct *tsk); extern void key_fsuid_changed(struct task_struct *tsk); extern void key_fsgid_changed(struct task_struct *tsk); extern void key_init(void); @@ -294,7 +293,6 @@ extern void key_init(void); #define make_key_ref(k, p) NULL #define key_ref_to_ptr(k) NULL #define is_key_possessed(k) 0 -#define exec_keys(t) do { } while(0) #define key_fsuid_changed(t) do { } while(0) #define key_fsgid_changed(t) do { } while(0) #define key_init() do { } while(0) diff --git a/include/linux/security.h b/include/linux/security.h index 68be1125144..56a0eed6567 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -57,8 +57,7 @@ extern int cap_capset(struct cred *new, const struct cred *old, const kernel_cap_t *effective, const kernel_cap_t *inheritable, const kernel_cap_t *permitted); -extern int cap_bprm_set_security(struct linux_binprm *bprm); -extern int cap_bprm_apply_creds(struct linux_binprm *bprm, int unsafe); +extern int cap_bprm_set_creds(struct linux_binprm *bprm); extern int cap_bprm_secureexec(struct linux_binprm *bprm); extern int cap_inode_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags); @@ -110,7 +109,7 @@ extern unsigned long mmap_min_addr; struct sched_param; struct request_sock; -/* bprm_apply_creds unsafe reasons */ +/* bprm->unsafe reasons */ #define LSM_UNSAFE_SHARE 1 #define LSM_UNSAFE_PTRACE 2 #define LSM_UNSAFE_PTRACE_CAP 4 @@ -154,36 +153,7 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * * Security hooks for program execution operations. * - * @bprm_alloc_security: - * Allocate and attach a security structure to the @bprm->security field. - * The security field is initialized to NULL when the bprm structure is - * allocated. - * @bprm contains the linux_binprm structure to be modified. - * Return 0 if operation was successful. - * @bprm_free_security: - * @bprm contains the linux_binprm structure to be modified. - * Deallocate and clear the @bprm->security field. - * @bprm_apply_creds: - * Compute and set the security attributes of a process being transformed - * by an execve operation based on the old attributes (current->security) - * and the information saved in @bprm->security by the set_security hook. - * Since this function may return an error, in which case the process will - * be killed. However, it can leave the security attributes of the - * process unchanged if an access failure occurs at this point. - * bprm_apply_creds is called under task_lock. @unsafe indicates various - * reasons why it may be unsafe to change security state. - * @bprm contains the linux_binprm structure. - * @bprm_post_apply_creds: - * Runs after bprm_apply_creds with the task_lock dropped, so that - * functions which cannot be called safely under the task_lock can - * be used. This hook is a good place to perform state changes on - * the process such as closing open file descriptors to which access - * is no longer granted if the attributes were changed. - * Note that a security module might need to save state between - * bprm_apply_creds and bprm_post_apply_creds to store the decision - * on whether the process may proceed. - * @bprm contains the linux_binprm structure. - * @bprm_set_security: + * @bprm_set_creds: * Save security information in the bprm->security field, typically based * on information about the bprm->file, for later use by the apply_creds * hook. This hook may also optionally check permissions (e.g. for @@ -196,15 +166,30 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * @bprm contains the linux_binprm structure. * Return 0 if the hook is successful and permission is granted. * @bprm_check_security: - * This hook mediates the point when a search for a binary handler will - * begin. It allows a check the @bprm->security value which is set in - * the preceding set_security call. The primary difference from - * set_security is that the argv list and envp list are reliably - * available in @bprm. This hook may be called multiple times - * during a single execve; and in each pass set_security is called - * first. + * This hook mediates the point when a search for a binary handler will + * begin. It allows a check the @bprm->security value which is set in the + * preceding set_creds call. The primary difference from set_creds is + * that the argv list and envp list are reliably available in @bprm. This + * hook may be called multiple times during a single execve; and in each + * pass set_creds is called first. * @bprm contains the linux_binprm structure. * Return 0 if the hook is successful and permission is granted. + * @bprm_committing_creds: + * Prepare to install the new security attributes of a process being + * transformed by an execve operation, based on the old credentials + * pointed to by @current->cred and the information set in @bprm->cred by + * the bprm_set_creds hook. @bprm points to the linux_binprm structure. + * This hook is a good place to perform state changes on the process such + * as closing open file descriptors to which access will no longer be + * granted when the attributes are changed. This is called immediately + * before commit_creds(). + * @bprm_committed_creds: + * Tidy up after the installation of the new security attributes of a + * process being transformed by an execve operation. The new credentials + * have, by this point, been set to @current->cred. @bprm points to the + * linux_binprm structure. This hook is a good place to perform state + * changes on the process such as clearing out non-inheritable signal + * state. This is called immediately after commit_creds(). * @bprm_secureexec: * Return a boolean value (0 or 1) indicating whether a "secure exec" * is required. The flag is passed in the auxiliary table @@ -1301,13 +1286,11 @@ struct security_operations { int (*settime) (struct timespec *ts, struct timezone *tz); int (*vm_enough_memory) (struct mm_struct *mm, long pages); - int (*bprm_alloc_security) (struct linux_binprm *bprm); - void (*bprm_free_security) (struct linux_binprm *bprm); - int (*bprm_apply_creds) (struct linux_binprm *bprm, int unsafe); - void (*bprm_post_apply_creds) (struct linux_binprm *bprm); - int (*bprm_set_security) (struct linux_binprm *bprm); + int (*bprm_set_creds) (struct linux_binprm *bprm); int (*bprm_check_security) (struct linux_binprm *bprm); int (*bprm_secureexec) (struct linux_binprm *bprm); + void (*bprm_committing_creds) (struct linux_binprm *bprm); + void (*bprm_committed_creds) (struct linux_binprm *bprm); int (*sb_alloc_security) (struct super_block *sb); void (*sb_free_security) (struct super_block *sb); @@ -1569,12 +1552,10 @@ int security_settime(struct timespec *ts, struct timezone *tz); int security_vm_enough_memory(long pages); int security_vm_enough_memory_mm(struct mm_struct *mm, long pages); int security_vm_enough_memory_kern(long pages); -int security_bprm_alloc(struct linux_binprm *bprm); -void security_bprm_free(struct linux_binprm *bprm); -int security_bprm_apply_creds(struct linux_binprm *bprm, int unsafe); -void security_bprm_post_apply_creds(struct linux_binprm *bprm); -int security_bprm_set(struct linux_binprm *bprm); +int security_bprm_set_creds(struct linux_binprm *bprm); int security_bprm_check(struct linux_binprm *bprm); +void security_bprm_committing_creds(struct linux_binprm *bprm); +void security_bprm_committed_creds(struct linux_binprm *bprm); int security_bprm_secureexec(struct linux_binprm *bprm); int security_sb_alloc(struct super_block *sb); void security_sb_free(struct super_block *sb); @@ -1812,32 +1793,22 @@ static inline int security_vm_enough_memory_mm(struct mm_struct *mm, long pages) return cap_vm_enough_memory(mm, pages); } -static inline int security_bprm_alloc(struct linux_binprm *bprm) -{ - return 0; -} - -static inline void security_bprm_free(struct linux_binprm *bprm) -{ } - -static inline int security_bprm_apply_creds(struct linux_binprm *bprm, int unsafe) +static inline int security_bprm_set_creds(struct linux_binprm *bprm) { - return cap_bprm_apply_creds(bprm, unsafe); + return cap_bprm_set_creds(bprm); } -static inline void security_bprm_post_apply_creds(struct linux_binprm *bprm) +static inline int security_bprm_check(struct linux_binprm *bprm) { - return; + return 0; } -static inline int security_bprm_set(struct linux_binprm *bprm) +static inline void security_bprm_committing_creds(struct linux_binprm *bprm) { - return cap_bprm_set_security(bprm); } -static inline int security_bprm_check(struct linux_binprm *bprm) +static inline void security_bprm_committed_creds(struct linux_binprm *bprm) { - return 0; } static inline int security_bprm_secureexec(struct linux_binprm *bprm) diff --git a/kernel/cred.c b/kernel/cred.c index cb6b5eda978..e6fcdd67b2e 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -68,7 +68,7 @@ static void release_tgcred_rcu(struct rcu_head *rcu) /* * Release a set of thread group credentials. */ -void release_tgcred(struct cred *cred) +static void release_tgcred(struct cred *cred) { #ifdef CONFIG_KEYS struct thread_group_cred *tgcred = cred->tgcred; @@ -163,6 +163,50 @@ error: } EXPORT_SYMBOL(prepare_creds); +/* + * Prepare credentials for current to perform an execve() + * - The caller must hold current->cred_exec_mutex + */ +struct cred *prepare_exec_creds(void) +{ + struct thread_group_cred *tgcred = NULL; + struct cred *new; + +#ifdef CONFIG_KEYS + tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL); + if (!tgcred) + return NULL; +#endif + + new = prepare_creds(); + if (!new) { + kfree(tgcred); + return new; + } + +#ifdef CONFIG_KEYS + /* newly exec'd tasks don't get a thread keyring */ + key_put(new->thread_keyring); + new->thread_keyring = NULL; + + /* create a new per-thread-group creds for all this set of threads to + * share */ + memcpy(tgcred, new->tgcred, sizeof(struct thread_group_cred)); + + atomic_set(&tgcred->usage, 1); + spin_lock_init(&tgcred->lock); + + /* inherit the session keyring; new process keyring */ + key_get(tgcred->session_keyring); + tgcred->process_keyring = NULL; + + release_tgcred(new); + new->tgcred = tgcred; +#endif + + return new; +} + /* * prepare new credentials for the usermode helper dispatcher */ diff --git a/security/capability.c b/security/capability.c index efeb6d9e0e6..185804f99ad 100644 --- a/security/capability.c +++ b/security/capability.c @@ -32,24 +32,19 @@ static int cap_quota_on(struct dentry *dentry) return 0; } -static int cap_bprm_alloc_security(struct linux_binprm *bprm) +static int cap_bprm_check_security (struct linux_binprm *bprm) { return 0; } -static void cap_bprm_free_security(struct linux_binprm *bprm) +static void cap_bprm_committing_creds(struct linux_binprm *bprm) { } -static void cap_bprm_post_apply_creds(struct linux_binprm *bprm) +static void cap_bprm_committed_creds(struct linux_binprm *bprm) { } -static int cap_bprm_check_security(struct linux_binprm *bprm) -{ - return 0; -} - static int cap_sb_alloc_security(struct super_block *sb) { return 0; @@ -827,11 +822,9 @@ void security_fixup_ops(struct security_operations *ops) set_to_cap_if_null(ops, syslog); set_to_cap_if_null(ops, settime); set_to_cap_if_null(ops, vm_enough_memory); - set_to_cap_if_null(ops, bprm_alloc_security); - set_to_cap_if_null(ops, bprm_free_security); - set_to_cap_if_null(ops, bprm_apply_creds); - set_to_cap_if_null(ops, bprm_post_apply_creds); - set_to_cap_if_null(ops, bprm_set_security); + set_to_cap_if_null(ops, bprm_set_creds); + set_to_cap_if_null(ops, bprm_committing_creds); + set_to_cap_if_null(ops, bprm_committed_creds); set_to_cap_if_null(ops, bprm_check_security); set_to_cap_if_null(ops, bprm_secureexec); set_to_cap_if_null(ops, sb_alloc_security); diff --git a/security/commoncap.c b/security/commoncap.c index b5419273f92..51dfa11e8e5 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -167,7 +167,7 @@ int cap_capset(struct cred *new, static inline void bprm_clear_caps(struct linux_binprm *bprm) { - cap_clear(bprm->cap_post_exec_permitted); + cap_clear(bprm->cred->cap_permitted); bprm->cap_effective = false; } @@ -198,15 +198,15 @@ int cap_inode_killpriv(struct dentry *dentry) } static inline int bprm_caps_from_vfs_caps(struct cpu_vfs_cap_data *caps, - struct linux_binprm *bprm) + struct linux_binprm *bprm, + bool *effective) { + struct cred *new = bprm->cred; unsigned i; int ret = 0; if (caps->magic_etc & VFS_CAP_FLAGS_EFFECTIVE) - bprm->cap_effective = true; - else - bprm->cap_effective = false; + *effective = true; CAP_FOR_EACH_U32(i) { __u32 permitted = caps->permitted.cap[i]; @@ -215,16 +215,13 @@ static inline int bprm_caps_from_vfs_caps(struct cpu_vfs_cap_data *caps, /* * pP' = (X & fP) | (pI & fI) */ - bprm->cap_post_exec_permitted.cap[i] = - (current->cred->cap_bset.cap[i] & permitted) | - (current->cred->cap_inheritable.cap[i] & inheritable); + new->cap_permitted.cap[i] = + (new->cap_bset.cap[i] & permitted) | + (new->cap_inheritable.cap[i] & inheritable); - if (permitted & ~bprm->cap_post_exec_permitted.cap[i]) { - /* - * insufficient to execute correctly - */ + if (permitted & ~new->cap_permitted.cap[i]) + /* insufficient to execute correctly */ ret = -EPERM; - } } /* @@ -232,7 +229,7 @@ static inline int bprm_caps_from_vfs_caps(struct cpu_vfs_cap_data *caps, * do not have enough capabilities, we return an error if they are * missing some "forced" (aka file-permitted) capabilities. */ - return bprm->cap_effective ? ret : 0; + return *effective ? ret : 0; } int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps) @@ -250,10 +247,9 @@ int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data size = inode->i_op->getxattr((struct dentry *)dentry, XATTR_NAME_CAPS, &caps, XATTR_CAPS_SZ); - if (size == -ENODATA || size == -EOPNOTSUPP) { + if (size == -ENODATA || size == -EOPNOTSUPP) /* no data, that's ok */ return -ENODATA; - } if (size < 0) return size; @@ -262,7 +258,7 @@ int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data cpu_caps->magic_etc = magic_etc = le32_to_cpu(caps.magic_etc); - switch ((magic_etc & VFS_CAP_REVISION_MASK)) { + switch (magic_etc & VFS_CAP_REVISION_MASK) { case VFS_CAP_REVISION_1: if (size != XATTR_CAPS_SZ_1) return -EINVAL; @@ -283,11 +279,12 @@ int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data cpu_caps->permitted.cap[i] = le32_to_cpu(caps.data[i].permitted); cpu_caps->inheritable.cap[i] = le32_to_cpu(caps.data[i].inheritable); } + return 0; } /* Locate any VFS capabilities: */ -static int get_file_caps(struct linux_binprm *bprm) +static int get_file_caps(struct linux_binprm *bprm, bool *effective) { struct dentry *dentry; int rc = 0; @@ -313,7 +310,10 @@ static int get_file_caps(struct linux_binprm *bprm) goto out; } - rc = bprm_caps_from_vfs_caps(&vcaps, bprm); + rc = bprm_caps_from_vfs_caps(&vcaps, bprm, effective); + if (rc == -EINVAL) + printk(KERN_NOTICE "%s: cap_from_disk returned %d for %s\n", + __func__, rc, bprm->filename); out: dput(dentry); @@ -334,18 +334,27 @@ int cap_inode_killpriv(struct dentry *dentry) return 0; } -static inline int get_file_caps(struct linux_binprm *bprm) +static inline int get_file_caps(struct linux_binprm *bprm, bool *effective) { bprm_clear_caps(bprm); return 0; } #endif -int cap_bprm_set_security (struct linux_binprm *bprm) +/* + * set up the new credentials for an exec'd task + */ +int cap_bprm_set_creds(struct linux_binprm *bprm) { + const struct cred *old = current_cred(); + struct cred *new = bprm->cred; + bool effective; int ret; - ret = get_file_caps(bprm); + effective = false; + ret = get_file_caps(bprm, &effective); + if (ret < 0) + return ret; if (!issecure(SECURE_NOROOT)) { /* @@ -353,63 +362,47 @@ int cap_bprm_set_security (struct linux_binprm *bprm) * executables under compatibility mode, we override the * capability sets for the file. * - * If only the real uid is 0, we do not set the effective - * bit. + * If only the real uid is 0, we do not set the effective bit. */ - if (bprm->e_uid == 0 || current_uid() == 0) { + if (new->euid == 0 || new->uid == 0) { /* pP' = (cap_bset & ~0) | (pI & ~0) */ - bprm->cap_post_exec_permitted = cap_combine( - current->cred->cap_bset, - current->cred->cap_inheritable); - bprm->cap_effective = (bprm->e_uid == 0); - ret = 0; + new->cap_permitted = cap_combine(old->cap_bset, + old->cap_inheritable); } + if (new->euid == 0) + effective = true; } - return ret; -} - -int cap_bprm_apply_creds (struct linux_binprm *bprm, int unsafe) -{ - const struct cred *old = current_cred(); - struct cred *new; - - new = prepare_creds(); - if (!new) - return -ENOMEM; - - if (bprm->e_uid != old->uid || bprm->e_gid != old->gid || - !cap_issubset(bprm->cap_post_exec_permitted, - old->cap_permitted)) { - set_dumpable(current->mm, suid_dumpable); - current->pdeath_signal = 0; - - if (unsafe & ~LSM_UNSAFE_PTRACE_CAP) { - if (!capable(CAP_SETUID)) { - bprm->e_uid = old->uid; - bprm->e_gid = old->gid; - } - if (cap_limit_ptraced_target()) { - bprm->cap_post_exec_permitted = cap_intersect( - bprm->cap_post_exec_permitted, - new->cap_permitted); - } + /* Don't let someone trace a set[ug]id/setpcap binary with the revised + * credentials unless they have the appropriate permit + */ + if ((new->euid != old->uid || + new->egid != old->gid || + !cap_issubset(new->cap_permitted, old->cap_permitted)) && + bprm->unsafe & ~LSM_UNSAFE_PTRACE_CAP) { + /* downgrade; they get no more than they had, and maybe less */ + if (!capable(CAP_SETUID)) { + new->euid = new->uid; + new->egid = new->gid; } + if (cap_limit_ptraced_target()) + new->cap_permitted = cap_intersect(new->cap_permitted, + old->cap_permitted); } - new->suid = new->euid = new->fsuid = bprm->e_uid; - new->sgid = new->egid = new->fsgid = bprm->e_gid; + new->suid = new->fsuid = new->euid; + new->sgid = new->fsgid = new->egid; - /* For init, we want to retain the capabilities set - * in the init_task struct. Thus we skip the usual - * capability rules */ + /* For init, we want to retain the capabilities set in the initial + * task. Thus we skip the usual capability rules + */ if (!is_global_init(current)) { - new->cap_permitted = bprm->cap_post_exec_permitted; - if (bprm->cap_effective) - new->cap_effective = bprm->cap_post_exec_permitted; + if (effective) + new->cap_effective = new->cap_permitted; else cap_clear(new->cap_effective); } + bprm->cap_effective = effective; /* * Audit candidate if current->cap_effective is set @@ -425,23 +418,31 @@ int cap_bprm_apply_creds (struct linux_binprm *bprm, int unsafe) */ if (!cap_isclear(new->cap_effective)) { if (!cap_issubset(CAP_FULL_SET, new->cap_effective) || - bprm->e_uid != 0 || new->uid != 0 || - issecure(SECURE_NOROOT)) - audit_log_bprm_fcaps(bprm, new, old); + new->euid != 0 || new->uid != 0 || + issecure(SECURE_NOROOT)) { + ret = audit_log_bprm_fcaps(bprm, new, old); + if (ret < 0) + return ret; + } } new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS); - return commit_creds(new); + return 0; } -int cap_bprm_secureexec (struct linux_binprm *bprm) +/* + * determine whether a secure execution is required + * - the creds have been committed at this point, and are no longer available + * through bprm + */ +int cap_bprm_secureexec(struct linux_binprm *bprm) { const struct cred *cred = current_cred(); if (cred->uid != 0) { if (bprm->cap_effective) return 1; - if (!cap_isclear(bprm->cap_post_exec_permitted)) + if (!cap_isclear(cred->cap_permitted)) return 1; } @@ -477,7 +478,7 @@ int cap_inode_removexattr(struct dentry *dentry, const char *name) } /* moved from kernel/sys.c. */ -/* +/* * cap_emulate_setxuid() fixes the effective / permitted capabilities of * a process after a call to setuid, setreuid, or setresuid. * @@ -491,10 +492,10 @@ int cap_inode_removexattr(struct dentry *dentry, const char *name) * 3) When set*uiding _from_ euid != 0 _to_ euid == 0, the effective * capabilities are set to the permitted capabilities. * - * fsuid is handled elsewhere. fsuid == 0 and {r,e,s}uid!= 0 should + * fsuid is handled elsewhere. fsuid == 0 and {r,e,s}uid!= 0 should * never happen. * - * -astor + * -astor * * cevans - New behaviour, Oct '99 * A process may, via prctl(), elect to keep its capabilities when it @@ -751,4 +752,3 @@ int cap_vm_enough_memory(struct mm_struct *mm, long pages) cap_sys_admin = 1; return __vm_enough_memory(mm, pages, cap_sys_admin); } - diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c index df329f684a6..2f5d89e92b8 100644 --- a/security/keys/process_keys.c +++ b/security/keys/process_keys.c @@ -274,48 +274,6 @@ static int install_session_keyring(struct key *keyring) return commit_creds(new); } -/*****************************************************************************/ -/* - * deal with execve() - */ -int exec_keys(struct task_struct *tsk) -{ - struct thread_group_cred *tgcred = NULL; - struct cred *new; - -#ifdef CONFIG_KEYS - tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL); - if (!tgcred) - return -ENOMEM; -#endif - - new = prepare_creds(); - if (new < 0) - return -ENOMEM; - - /* newly exec'd tasks don't get a thread keyring */ - key_put(new->thread_keyring); - new->thread_keyring = NULL; - - /* create a new per-thread-group creds for all this set of threads to - * share */ - memcpy(tgcred, new->tgcred, sizeof(struct thread_group_cred)); - - atomic_set(&tgcred->usage, 1); - spin_lock_init(&tgcred->lock); - - /* inherit the session keyring; new process keyring */ - key_get(tgcred->session_keyring); - tgcred->process_keyring = NULL; - - release_tgcred(new); - new->tgcred = tgcred; - - commit_creds(new); - return 0; - -} /* end exec_keys() */ - /*****************************************************************************/ /* * the filesystem user ID changed diff --git a/security/root_plug.c b/security/root_plug.c index c3f68b5b372..40fb4f15e27 100644 --- a/security/root_plug.c +++ b/security/root_plug.c @@ -55,9 +55,9 @@ static int rootplug_bprm_check_security (struct linux_binprm *bprm) struct usb_device *dev; root_dbg("file %s, e_uid = %d, e_gid = %d\n", - bprm->filename, bprm->e_uid, bprm->e_gid); + bprm->filename, bprm->cred->euid, bprm->cred->egid); - if (bprm->e_gid == 0) { + if (bprm->cred->egid == 0) { dev = usb_find_device(vendor_id, product_id); if (!dev) { root_dbg("e_gid = 0, and device not found, " @@ -75,15 +75,12 @@ static struct security_operations rootplug_security_ops = { .ptrace_may_access = cap_ptrace_may_access, .ptrace_traceme = cap_ptrace_traceme, .capget = cap_capget, - .capset_check = cap_capset_check, - .capset_set = cap_capset_set, + .capset = cap_capset, .capable = cap_capable, - .bprm_apply_creds = cap_bprm_apply_creds, - .bprm_set_security = cap_bprm_set_security, + .bprm_set_creds = cap_bprm_set_creds, - .task_post_setuid = cap_task_post_setuid, - .task_reparent_to_init = cap_task_reparent_to_init, + .task_fix_setuid = cap_task_fix_setuid, .task_prctl = cap_task_prctl, .bprm_check_security = rootplug_bprm_check_security, diff --git a/security/security.c b/security/security.c index a55d739c686..dc5babb2d6d 100644 --- a/security/security.c +++ b/security/security.c @@ -213,34 +213,24 @@ int security_vm_enough_memory_kern(long pages) return security_ops->vm_enough_memory(current->mm, pages); } -int security_bprm_alloc(struct linux_binprm *bprm) +int security_bprm_set_creds(struct linux_binprm *bprm) { - return security_ops->bprm_alloc_security(bprm); + return security_ops->bprm_set_creds(bprm); } -void security_bprm_free(struct linux_binprm *bprm) -{ - security_ops->bprm_free_security(bprm); -} - -int security_bprm_apply_creds(struct linux_binprm *bprm, int unsafe) -{ - return security_ops->bprm_apply_creds(bprm, unsafe); -} - -void security_bprm_post_apply_creds(struct linux_binprm *bprm) +int security_bprm_check(struct linux_binprm *bprm) { - security_ops->bprm_post_apply_creds(bprm); + return security_ops->bprm_check_security(bprm); } -int security_bprm_set(struct linux_binprm *bprm) +void security_bprm_committing_creds(struct linux_binprm *bprm) { - return security_ops->bprm_set_security(bprm); + return security_ops->bprm_committing_creds(bprm); } -int security_bprm_check(struct linux_binprm *bprm) +void security_bprm_committed_creds(struct linux_binprm *bprm) { - return security_ops->bprm_check_security(bprm); + return security_ops->bprm_committed_creds(bprm); } int security_bprm_secureexec(struct linux_binprm *bprm) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index c71bba78872..21a59218463 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -2029,59 +2029,45 @@ static int selinux_vm_enough_memory(struct mm_struct *mm, long pages) /* binprm security operations */ -static int selinux_bprm_alloc_security(struct linux_binprm *bprm) +static int selinux_bprm_set_creds(struct linux_binprm *bprm) { - struct bprm_security_struct *bsec; - - bsec = kzalloc(sizeof(struct bprm_security_struct), GFP_KERNEL); - if (!bsec) - return -ENOMEM; - - bsec->sid = SECINITSID_UNLABELED; - bsec->set = 0; - - bprm->security = bsec; - return 0; -} - -static int selinux_bprm_set_security(struct linux_binprm *bprm) -{ - struct task_security_struct *tsec; - struct inode *inode = bprm->file->f_path.dentry->d_inode; + const struct task_security_struct *old_tsec; + struct task_security_struct *new_tsec; struct inode_security_struct *isec; - struct bprm_security_struct *bsec; - u32 newsid; struct avc_audit_data ad; + struct inode *inode = bprm->file->f_path.dentry->d_inode; int rc; - rc = secondary_ops->bprm_set_security(bprm); + rc = secondary_ops->bprm_set_creds(bprm); if (rc) return rc; - bsec = bprm->security; - - if (bsec->set) + /* SELinux context only depends on initial program or script and not + * the script interpreter */ + if (bprm->cred_prepared) return 0; - tsec = current_security(); + old_tsec = current_security(); + new_tsec = bprm->cred->security; isec = inode->i_security; /* Default to the current task SID. */ - bsec->sid = tsec->sid; + new_tsec->sid = old_tsec->sid; + new_tsec->osid = old_tsec->sid; /* Reset fs, key, and sock SIDs on execve. */ - tsec->create_sid = 0; - tsec->keycreate_sid = 0; - tsec->sockcreate_sid = 0; + new_tsec->create_sid = 0; + new_tsec->keycreate_sid = 0; + new_tsec->sockcreate_sid = 0; - if (tsec->exec_sid) { - newsid = tsec->exec_sid; + if (old_tsec->exec_sid) { + new_tsec->sid = old_tsec->exec_sid; /* Reset exec SID on execve. */ - tsec->exec_sid = 0; + new_tsec->exec_sid = 0; } else { /* Check for a default transition on this program. */ - rc = security_transition_sid(tsec->sid, isec->sid, - SECCLASS_PROCESS, &newsid); + rc = security_transition_sid(old_tsec->sid, isec->sid, + SECCLASS_PROCESS, &new_tsec->sid); if (rc) return rc; } @@ -2090,33 +2076,63 @@ static int selinux_bprm_set_security(struct linux_binprm *bprm) ad.u.fs.path = bprm->file->f_path; if (bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID) - newsid = tsec->sid; + new_tsec->sid = old_tsec->sid; - if (tsec->sid == newsid) { - rc = avc_has_perm(tsec->sid, isec->sid, + if (new_tsec->sid == old_tsec->sid) { + rc = avc_has_perm(old_tsec->sid, isec->sid, SECCLASS_FILE, FILE__EXECUTE_NO_TRANS, &ad); if (rc) return rc; } else { /* Check permissions for the transition. */ - rc = avc_has_perm(tsec->sid, newsid, + rc = avc_has_perm(old_tsec->sid, new_tsec->sid, SECCLASS_PROCESS, PROCESS__TRANSITION, &ad); if (rc) return rc; - rc = avc_has_perm(newsid, isec->sid, + rc = avc_has_perm(new_tsec->sid, isec->sid, SECCLASS_FILE, FILE__ENTRYPOINT, &ad); if (rc) return rc; - /* Clear any possibly unsafe personality bits on exec: */ - current->personality &= ~PER_CLEAR_ON_SETID; + /* Check for shared state */ + if (bprm->unsafe & LSM_UNSAFE_SHARE) { + rc = avc_has_perm(old_tsec->sid, new_tsec->sid, + SECCLASS_PROCESS, PROCESS__SHARE, + NULL); + if (rc) + return -EPERM; + } + + /* Make sure that anyone attempting to ptrace over a task that + * changes its SID has the appropriate permit */ + if (bprm->unsafe & + (LSM_UNSAFE_PTRACE | LSM_UNSAFE_PTRACE_CAP)) { + struct task_struct *tracer; + struct task_security_struct *sec; + u32 ptsid = 0; + + rcu_read_lock(); + tracer = tracehook_tracer_task(current); + if (likely(tracer != NULL)) { + sec = __task_cred(tracer)->security; + ptsid = sec->sid; + } + rcu_read_unlock(); + + if (ptsid != 0) { + rc = avc_has_perm(ptsid, new_tsec->sid, + SECCLASS_PROCESS, + PROCESS__PTRACE, NULL); + if (rc) + return -EPERM; + } + } - /* Set the security field to the new SID. */ - bsec->sid = newsid; + /* Clear any possibly unsafe personality bits on exec: */ + bprm->per_clear |= PER_CLEAR_ON_SETID; } - bsec->set = 1; return 0; } @@ -2125,7 +2141,6 @@ static int selinux_bprm_check_security(struct linux_binprm *bprm) return secondary_ops->bprm_check_security(bprm); } - static int selinux_bprm_secureexec(struct linux_binprm *bprm) { const struct cred *cred = current_cred(); @@ -2141,19 +2156,13 @@ static int selinux_bprm_secureexec(struct linux_binprm *bprm) the noatsecure permission is granted between the two SIDs, i.e. ahp returns 0. */ atsecure = avc_has_perm(osid, sid, - SECCLASS_PROCESS, - PROCESS__NOATSECURE, NULL); + SECCLASS_PROCESS, + PROCESS__NOATSECURE, NULL); } return (atsecure || secondary_ops->bprm_secureexec(bprm)); } -static void selinux_bprm_free_security(struct linux_binprm *bprm) -{ - kfree(bprm->security); - bprm->security = NULL; -} - extern struct vfsmount *selinuxfs_mount; extern struct dentry *selinux_null; @@ -2252,108 +2261,78 @@ static inline void flush_unauthorized_files(const struct cred *cred, spin_unlock(&files->file_lock); } -static int selinux_bprm_apply_creds(struct linux_binprm *bprm, int unsafe) +/* + * Prepare a process for imminent new credential changes due to exec + */ +static void selinux_bprm_committing_creds(struct linux_binprm *bprm) { - struct task_security_struct *tsec; - struct bprm_security_struct *bsec; - struct cred *new; - u32 sid; - int rc; - - rc = secondary_ops->bprm_apply_creds(bprm, unsafe); - if (rc < 0) - return rc; - - new = prepare_creds(); - if (!new) - return -ENOMEM; + struct task_security_struct *new_tsec; + struct rlimit *rlim, *initrlim; + int rc, i; - tsec = new->security; + secondary_ops->bprm_committing_creds(bprm); - bsec = bprm->security; - sid = bsec->sid; - - tsec->osid = tsec->sid; - bsec->unsafe = 0; - if (tsec->sid != sid) { - /* Check for shared state. If not ok, leave SID - unchanged and kill. */ - if (unsafe & LSM_UNSAFE_SHARE) { - rc = avc_has_perm(tsec->sid, sid, SECCLASS_PROCESS, - PROCESS__SHARE, NULL); - if (rc) { - bsec->unsafe = 1; - goto out; - } - } + new_tsec = bprm->cred->security; + if (new_tsec->sid == new_tsec->osid) + return; - /* Check for ptracing, and update the task SID if ok. - Otherwise, leave SID unchanged and kill. */ - if (unsafe & (LSM_UNSAFE_PTRACE | LSM_UNSAFE_PTRACE_CAP)) { - struct task_struct *tracer; - struct task_security_struct *sec; - u32 ptsid = 0; + /* Close files for which the new task SID is not authorized. */ + flush_unauthorized_files(bprm->cred, current->files); - rcu_read_lock(); - tracer = tracehook_tracer_task(current); - if (likely(tracer != NULL)) { - sec = __task_cred(tracer)->security; - ptsid = sec->sid; - } - rcu_read_unlock(); + /* Always clear parent death signal on SID transitions. */ + current->pdeath_signal = 0; - if (ptsid != 0) { - rc = avc_has_perm(ptsid, sid, SECCLASS_PROCESS, - PROCESS__PTRACE, NULL); - if (rc) { - bsec->unsafe = 1; - goto out; - } - } + /* Check whether the new SID can inherit resource limits from the old + * SID. If not, reset all soft limits to the lower of the current + * task's hard limit and the init task's soft limit. + * + * Note that the setting of hard limits (even to lower them) can be + * controlled by the setrlimit check. The inclusion of the init task's + * soft limit into the computation is to avoid resetting soft limits + * higher than the default soft limit for cases where the default is + * lower than the hard limit, e.g. RLIMIT_CORE or RLIMIT_STACK. + */ + rc = avc_has_perm(new_tsec->osid, new_tsec->sid, SECCLASS_PROCESS, + PROCESS__RLIMITINH, NULL); + if (rc) { + for (i = 0; i < RLIM_NLIMITS; i++) { + rlim = current->signal->rlim + i; + initrlim = init_task.signal->rlim + i; + rlim->rlim_cur = min(rlim->rlim_max, initrlim->rlim_cur); } - tsec->sid = sid; + update_rlimit_cpu(rlim->rlim_cur); } - -out: - commit_creds(new); - return 0; } /* - * called after apply_creds without the task lock held + * Clean up the process immediately after the installation of new credentials + * due to exec */ -static void selinux_bprm_post_apply_creds(struct linux_binprm *bprm) +static void selinux_bprm_committed_creds(struct linux_binprm *bprm) { - const struct cred *cred = current_cred(); - struct task_security_struct *tsec; - struct rlimit *rlim, *initrlim; + const struct task_security_struct *tsec = current_security(); struct itimerval itimer; - struct bprm_security_struct *bsec; struct sighand_struct *psig; + u32 osid, sid; int rc, i; unsigned long flags; - tsec = current_security(); - bsec = bprm->security; + secondary_ops->bprm_committed_creds(bprm); - if (bsec->unsafe) { - force_sig_specific(SIGKILL, current); - return; - } - if (tsec->osid == tsec->sid) + osid = tsec->osid; + sid = tsec->sid; + + if (sid == osid) return; - /* Close files for which the new task SID is not authorized. */ - flush_unauthorized_files(cred, current->files); - - /* Check whether the new SID can inherit signal state - from the old SID. If not, clear itimers to avoid - subsequent signal generation and flush and unblock - signals. This must occur _after_ the task SID has - been updated so that any kill done after the flush - will be checked against the new SID. */ - rc = avc_has_perm(tsec->osid, tsec->sid, SECCLASS_PROCESS, - PROCESS__SIGINH, NULL); + /* Check whether the new SID can inherit signal state from the old SID. + * If not, clear itimers to avoid subsequent signal generation and + * flush and unblock signals. + * + * This must occur _after_ the task SID has been updated so that any + * kill done after the flush will be checked against the new SID. + */ + rc = avc_has_perm(osid, sid, SECCLASS_PROCESS, PROCESS__SIGINH, NULL); if (rc) { memset(&itimer, 0, sizeof itimer); for (i = 0; i < 3; i++) @@ -2366,32 +2345,8 @@ static void selinux_bprm_post_apply_creds(struct linux_binprm *bprm) spin_unlock_irq(¤t->sighand->siglock); } - /* Always clear parent death signal on SID transitions. */ - current->pdeath_signal = 0; - - /* Check whether the new SID can inherit resource limits - from the old SID. If not, reset all soft limits to - the lower of the current task's hard limit and the init - task's soft limit. Note that the setting of hard limits - (even to lower them) can be controlled by the setrlimit - check. The inclusion of the init task's soft limit into - the computation is to avoid resetting soft limits higher - than the default soft limit for cases where the default - is lower than the hard limit, e.g. RLIMIT_CORE or - RLIMIT_STACK.*/ - rc = avc_has_perm(tsec->osid, tsec->sid, SECCLASS_PROCESS, - PROCESS__RLIMITINH, NULL); - if (rc) { - for (i = 0; i < RLIM_NLIMITS; i++) { - rlim = current->signal->rlim + i; - initrlim = init_task.signal->rlim+i; - rlim->rlim_cur = min(rlim->rlim_max, initrlim->rlim_cur); - } - update_rlimit_cpu(rlim->rlim_cur); - } - - /* Wake up the parent if it is waiting so that it can - recheck wait permission to the new task SID. */ + /* Wake up the parent if it is waiting so that it can recheck + * wait permission to the new task SID. */ read_lock_irq(&tasklist_lock); psig = current->parent->sighand; spin_lock_irqsave(&psig->siglock, flags); @@ -5556,12 +5511,10 @@ static struct security_operations selinux_ops = { .netlink_send = selinux_netlink_send, .netlink_recv = selinux_netlink_recv, - .bprm_alloc_security = selinux_bprm_alloc_security, - .bprm_free_security = selinux_bprm_free_security, - .bprm_apply_creds = selinux_bprm_apply_creds, - .bprm_post_apply_creds = selinux_bprm_post_apply_creds, - .bprm_set_security = selinux_bprm_set_security, + .bprm_set_creds = selinux_bprm_set_creds, .bprm_check_security = selinux_bprm_check_security, + .bprm_committing_creds = selinux_bprm_committing_creds, + .bprm_committed_creds = selinux_bprm_committed_creds, .bprm_secureexec = selinux_bprm_secureexec, .sb_alloc_security = selinux_sb_alloc_security, diff --git a/security/selinux/include/objsec.h b/security/selinux/include/objsec.h index f8be8d7fa26..3cc45168f67 100644 --- a/security/selinux/include/objsec.h +++ b/security/selinux/include/objsec.h @@ -77,17 +77,6 @@ struct ipc_security_struct { u32 sid; /* SID of IPC resource */ }; -struct bprm_security_struct { - u32 sid; /* SID for transformed process */ - unsigned char set; - - /* - * unsafe is used to share failure information from bprm_apply_creds() - * to bprm_post_apply_creds(). - */ - char unsafe; -}; - struct netif_security_struct { int ifindex; /* device index */ u32 sid; /* SID for this interface */ diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index e952b397153..de396742abf 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -2596,8 +2596,7 @@ struct security_operations smack_ops = { .settime = cap_settime, .vm_enough_memory = cap_vm_enough_memory, - .bprm_apply_creds = cap_bprm_apply_creds, - .bprm_set_security = cap_bprm_set_security, + .bprm_set_creds = cap_bprm_set_creds, .bprm_secureexec = cap_bprm_secureexec, .sb_alloc_security = smack_sb_alloc_security, -- cgit v1.2.3-70-g09d2 From 98870ab0a5a3f1822aee681d2997017e1c87d026 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Nov 2008 10:39:26 +1100 Subject: CRED: Documentation Document credentials and the new credentials API. Signed-off-by: David Howells Signed-off-by: James Morris --- Documentation/credentials.txt | 582 ++++++++++++++++++++++++++++++++++++++++++ include/linux/cred.h | 12 +- kernel/cred.c | 2 +- 3 files changed, 594 insertions(+), 2 deletions(-) create mode 100644 Documentation/credentials.txt (limited to 'kernel') diff --git a/Documentation/credentials.txt b/Documentation/credentials.txt new file mode 100644 index 00000000000..df03169782e --- /dev/null +++ b/Documentation/credentials.txt @@ -0,0 +1,582 @@ + ==================== + CREDENTIALS IN LINUX + ==================== + +By: David Howells + +Contents: + + (*) Overview. + + (*) Types of credentials. + + (*) File markings. + + (*) Task credentials. + + - Immutable credentials. + - Accessing task credentials. + - Accessing another task's credentials. + - Altering credentials. + - Managing credentials. + + (*) Open file credentials. + + (*) Overriding the VFS's use of credentials. + + +======== +OVERVIEW +======== + +There are several parts to the security check performed by Linux when one +object acts upon another: + + (1) Objects. + + Objects are things in the system that may be acted upon directly by + userspace programs. Linux has a variety of actionable objects, including: + + - Tasks + - Files/inodes + - Sockets + - Message queues + - Shared memory segments + - Semaphores + - Keys + + As a part of the description of all these objects there is a set of + credentials. What's in the set depends on the type of object. + + (2) Object ownership. + + Amongst the credentials of most objects, there will be a subset that + indicates the ownership of that object. This is used for resource + accounting and limitation (disk quotas and task rlimits for example). + + In a standard UNIX filesystem, for instance, this will be defined by the + UID marked on the inode. + + (3) The objective context. + + Also amongst the credentials of those objects, there will be a subset that + indicates the 'objective context' of that object. This may or may not be + the same set as in (2) - in standard UNIX files, for instance, this is the + defined by the UID and the GID marked on the inode. + + The objective context is used as part of the security calculation that is + carried out when an object is acted upon. + + (4) Subjects. + + A subject is an object that is acting upon another object. + + Most of the objects in the system are inactive: they don't act on other + objects within the system. Processes/tasks are the obvious exception: + they do stuff; they access and manipulate things. + + Objects other than tasks may under some circumstances also be subjects. + For instance an open file may send SIGIO to a task using the UID and EUID + given to it by a task that called fcntl(F_SETOWN) upon it. In this case, + the file struct will have a subjective context too. + + (5) The subjective context. + + A subject has an additional interpretation of its credentials. A subset + of its credentials forms the 'subjective context'. The subjective context + is used as part of the security calculation that is carried out when a + subject acts. + + A Linux task, for example, has the FSUID, FSGID and the supplementary + group list for when it is acting upon a file - which are quite separate + from the real UID and GID that normally form the objective context of the + task. + + (6) Actions. + + Linux has a number of actions available that a subject may perform upon an + object. The set of actions available depends on the nature of the subject + and the object. + + Actions include reading, writing, creating and deleting files; forking or + signalling and tracing tasks. + + (7) Rules, access control lists and security calculations. + + When a subject acts upon an object, a security calculation is made. This + involves taking the subjective context, the objective context and the + action, and searching one or more sets of rules to see whether the subject + is granted or denied permission to act in the desired manner on the + object, given those contexts. + + There are two main sources of rules: + + (a) Discretionary access control (DAC): + + Sometimes the object will include sets of rules as part of its + description. This is an 'Access Control List' or 'ACL'. A Linux + file may supply more than one ACL. + + A traditional UNIX file, for example, includes a permissions mask that + is an abbreviated ACL with three fixed classes of subject ('user', + 'group' and 'other'), each of which may be granted certain privileges + ('read', 'write' and 'execute' - whatever those map to for the object + in question). UNIX file permissions do not allow the arbitrary + specification of subjects, however, and so are of limited use. + + A Linux file might also sport a POSIX ACL. This is a list of rules + that grants various permissions to arbitrary subjects. + + (b) Mandatory access control (MAC): + + The system as a whole may have one or more sets of rules that get + applied to all subjects and objects, regardless of their source. + SELinux and Smack are examples of this. + + In the case of SELinux and Smack, each object is given a label as part + of its credentials. When an action is requested, they take the + subject label, the object label and the action and look for a rule + that says that this action is either granted or denied. + + +==================== +TYPES OF CREDENTIALS +==================== + +The Linux kernel supports the following types of credentials: + + (1) Traditional UNIX credentials. + + Real User ID + Real Group ID + + The UID and GID are carried by most, if not all, Linux objects, even if in + some cases it has to be invented (FAT or CIFS files for example, which are + derived from Windows). These (mostly) define the objective context of + that object, with tasks being slightly different in some cases. + + Effective, Saved and FS User ID + Effective, Saved and FS Group ID + Supplementary groups + + These are additional credentials used by tasks only. Usually, an + EUID/EGID/GROUPS will be used as the subjective context, and real UID/GID + will be used as the objective. For tasks, it should be noted that this is + not always true. + + (2) Capabilities. + + Set of permitted capabilities + Set of inheritable capabilities + Set of effective capabilities + Capability bounding set + + These are only carried by tasks. They indicate superior capabilities + granted piecemeal to a task that an ordinary task wouldn't otherwise have. + These are manipulated implicitly by changes to the traditional UNIX + credentials, but can also be manipulated directly by the capset() system + call. + + The permitted capabilities are those caps that the process might grant + itself to its effective or permitted sets through capset(). This + inheritable set might also be so constrained. + + The effective capabilities are the ones that a task is actually allowed to + make use of itself. + + The inheritable capabilities are the ones that may get passed across + execve(). + + The bounding set limits the capabilities that may be inherited across + execve(), especially when a binary is executed that will execute as UID 0. + + (3) Secure management flags (securebits). + + These are only carried by tasks. These govern the way the above + credentials are manipulated and inherited over certain operations such as + execve(). They aren't used directly as objective or subjective + credentials. + + (4) Keys and keyrings. + + These are only carried by tasks. They carry and cache security tokens + that don't fit into the other standard UNIX credentials. They are for + making such things as network filesystem keys available to the file + accesses performed by processes, without the necessity of ordinary + programs having to know about security details involved. + + Keyrings are a special type of key. They carry sets of other keys and can + be searched for the desired key. Each process may subscribe to a number + of keyrings: + + Per-thread keying + Per-process keyring + Per-session keyring + + When a process accesses a key, if not already present, it will normally be + cached on one of these keyrings for future accesses to find. + + For more information on using keys, see Documentation/keys.txt. + + (5) LSM + + The Linux Security Module allows extra controls to be placed over the + operations that a task may do. Currently Linux supports two main + alternate LSM options: SELinux and Smack. + + Both work by labelling the objects in a system and then applying sets of + rules (policies) that say what operations a task with one label may do to + an object with another label. + + (6) AF_KEY + + This is a socket-based approach to credential management for networking + stacks [RFC 2367]. It isn't discussed by this document as it doesn't + interact directly with task and file credentials; rather it keeps system + level credentials. + + +When a file is opened, part of the opening task's subjective context is +recorded in the file struct created. This allows operations using that file +struct to use those credentials instead of the subjective context of the task +that issued the operation. An example of this would be a file opened on a +network filesystem where the credentials of the opened file should be presented +to the server, regardless of who is actually doing a read or a write upon it. + + +============= +FILE MARKINGS +============= + +Files on disk or obtained over the network may have annotations that form the +objective security context of that file. Depending on the type of filesystem, +this may include one or more of the following: + + (*) UNIX UID, GID, mode; + + (*) Windows user ID; + + (*) Access control list; + + (*) LSM security label; + + (*) UNIX exec privilege escalation bits (SUID/SGID); + + (*) File capabilities exec privilege escalation bits. + +These are compared to the task's subjective security context, and certain +operations allowed or disallowed as a result. In the case of execve(), the +privilege escalation bits come into play, and may allow the resulting process +extra privileges, based on the annotations on the executable file. + + +================ +TASK CREDENTIALS +================ + +In Linux, all of a task's credentials are held in (uid, gid) or through +(groups, keys, LSM security) a refcounted structure of type 'struct cred'. +Each task points to its credentials by a pointer called 'cred' in its +task_struct. + +Once a set of credentials has been prepared and committed, it may not be +changed, barring the following exceptions: + + (1) its reference count may be changed; + + (2) the reference count on the group_info struct it points to may be changed; + + (3) the reference count on the security data it points to may be changed; + + (4) the reference count on any keyrings it points to may be changed; + + (5) any keyrings it points to may be revoked, expired or have their security + attributes changed; and + + (6) the contents of any keyrings to which it points may be changed (the whole + point of keyrings being a shared set of credentials, modifiable by anyone + with appropriate access). + +To alter anything in the cred struct, the copy-and-replace principle must be +adhered to. First take a copy, then alter the copy and then use RCU to change +the task pointer to make it point to the new copy. There are wrappers to aid +with this (see below). + +A task may only alter its _own_ credentials; it is no longer permitted for a +task to alter another's credentials. This means the capset() system call is no +longer permitted to take any PID other than the one of the current process. +Also keyctl_instantiate() and keyctl_negate() functions no longer permit +attachment to process-specific keyrings in the requesting process as the +instantiating process may need to create them. + + +IMMUTABLE CREDENTIALS +--------------------- + +Once a set of credentials has been made public (by calling commit_creds() for +example), it must be considered immutable, barring two exceptions: + + (1) The reference count may be altered. + + (2) Whilst the keyring subscriptions of a set of credentials may not be + changed, the keyrings subscribed to may have their contents altered. + +To catch accidental credential alteration at compile time, struct task_struct +has _const_ pointers to its credential sets, as does struct file. Furthermore, +certain functions such as get_cred() and put_cred() operate on const pointers, +thus rendering casts unnecessary, but require to temporarily ditch the const +qualification to be able to alter the reference count. + + +ACCESSING TASK CREDENTIALS +-------------------------- + +A task being able to alter only its own credentials permits the current process +to read or replace its own credentials without the need for any form of locking +- which simplifies things greatly. It can just call: + + const struct cred *current_cred() + +to get a pointer to its credentials structure, and it doesn't have to release +it afterwards. + +There are convenience wrappers for retrieving specific aspects of a task's +credentials (the value is simply returned in each case): + + uid_t current_uid(void) Current's real UID + gid_t current_gid(void) Current's real GID + uid_t current_euid(void) Current's effective UID + gid_t current_egid(void) Current's effective GID + uid_t current_fsuid(void) Current's file access UID + gid_t current_fsgid(void) Current's file access GID + kernel_cap_t current_cap(void) Current's effective capabilities + void *current_security(void) Current's LSM security pointer + struct user_struct *current_user(void) Current's user account + +There are also convenience wrappers for retrieving specific associated pairs of +a task's credentials: + + void current_uid_gid(uid_t *, gid_t *); + void current_euid_egid(uid_t *, gid_t *); + void current_fsuid_fsgid(uid_t *, gid_t *); + +which return these pairs of values through their arguments after retrieving +them from the current task's credentials. + + +In addition, there is a function for obtaining a reference on the current +process's current set of credentials: + + const struct cred *get_current_cred(void); + +and functions for getting references to one of the credentials that don't +actually live in struct cred: + + struct user_struct *get_current_user(void); + struct group_info *get_current_groups(void); + +which get references to the current process's user accounting structure and +supplementary groups list respectively. + +Once a reference has been obtained, it must be released with put_cred(), +free_uid() or put_group_info() as appropriate. + + +ACCESSING ANOTHER TASK'S CREDENTIALS +------------------------------------ + +Whilst a task may access its own credentials without the need for locking, the +same is not true of a task wanting to access another task's credentials. It +must use the RCU read lock and rcu_dereference(). + +The rcu_dereference() is wrapped by: + + const struct cred *__task_cred(struct task_struct *task); + +This should be used inside the RCU read lock, as in the following example: + + void foo(struct task_struct *t, struct foo_data *f) + { + const struct cred *tcred; + ... + rcu_read_lock(); + tcred = __task_cred(t); + f->uid = tcred->uid; + f->gid = tcred->gid; + f->groups = get_group_info(tcred->groups); + rcu_read_unlock(); + ... + } + +A function need not get RCU read lock to use __task_cred() if it is holding a +spinlock at the time as this implicitly holds the RCU read lock. + +Should it be necessary to hold another task's credentials for a long period of +time, and possibly to sleep whilst doing so, then the caller should get a +reference on them using: + + const struct cred *get_task_cred(struct task_struct *task); + +This does all the RCU magic inside of it. The caller must call put_cred() on +the credentials so obtained when they're finished with. + +There are a couple of convenience functions to access bits of another task's +credentials, hiding the RCU magic from the caller: + + uid_t task_uid(task) Task's real UID + uid_t task_euid(task) Task's effective UID + +If the caller is holding a spinlock or the RCU read lock at the time anyway, +then: + + __task_cred(task)->uid + __task_cred(task)->euid + +should be used instead. Similarly, if multiple aspects of a task's credentials +need to be accessed, RCU read lock or a spinlock should be used, __task_cred() +called, the result stored in a temporary pointer and then the credential +aspects called from that before dropping the lock. This prevents the +potentially expensive RCU magic from being invoked multiple times. + +Should some other single aspect of another task's credentials need to be +accessed, then this can be used: + + task_cred_xxx(task, member) + +where 'member' is a non-pointer member of the cred struct. For instance: + + uid_t task_cred_xxx(task, suid); + +will retrieve 'struct cred::suid' from the task, doing the appropriate RCU +magic. This may not be used for pointer members as what they point to may +disappear the moment the RCU read lock is dropped. + + +ALTERING CREDENTIALS +-------------------- + +As previously mentioned, a task may only alter its own credentials, and may not +alter those of another task. This means that it doesn't need to use any +locking to alter its own credentials. + +To alter the current process's credentials, a function should first prepare a +new set of credentials by calling: + + struct cred *prepare_creds(void); + +this locks current->cred_replace_mutex and then allocates and constructs a +duplicate of the current process's credentials, returning with the mutex still +held if successful. It returns NULL if not successful (out of memory). + +The mutex prevents ptrace() from altering the ptrace state of a process whilst +security checks on credentials construction and changing is taking place as +the ptrace state may alter the outcome, particularly in the case of execve(). + +The new credentials set should be altered appropriately, and any security +checks and hooks done. Both the current and the proposed sets of credentials +are available for this purpose as current_cred() will return the current set +still at this point. + + +When the credential set is ready, it should be committed to the current process +by calling: + + int commit_creds(struct cred *new); + +This will alter various aspects of the credentials and the process, giving the +LSM a chance to do likewise, then it will use rcu_assign_pointer() to actually +commit the new credentials to current->cred, it will release +current->cred_replace_mutex to allow ptrace() to take place, and it will notify +the scheduler and others of the changes. + +This function is guaranteed to return 0, so that it can be tail-called at the +end of such functions as sys_setresuid(). + +Note that this function consumes the caller's reference to the new credentials. +The caller should _not_ call put_cred() on the new credentials afterwards. + +Furthermore, once this function has been called on a new set of credentials, +those credentials may _not_ be changed further. + + +Should the security checks fail or some other error occur after prepare_creds() +has been called, then the following function should be invoked: + + void abort_creds(struct cred *new); + +This releases the lock on current->cred_replace_mutex that prepare_creds() got +and then releases the new credentials. + + +A typical credentials alteration function would look something like this: + + int alter_suid(uid_t suid) + { + struct cred *new; + int ret; + + new = prepare_creds(); + if (!new) + return -ENOMEM; + + new->suid = suid; + ret = security_alter_suid(new); + if (ret < 0) { + abort_creds(new); + return ret; + } + + return commit_creds(new); + } + + +MANAGING CREDENTIALS +-------------------- + +There are some functions to help manage credentials: + + (*) void put_cred(const struct cred *cred); + + This releases a reference to the given set of credentials. If the + reference count reaches zero, the credentials will be scheduled for + destruction by the RCU system. + + (*) const struct cred *get_cred(const struct cred *cred); + + This gets a reference on a live set of credentials, returning a pointer to + that set of credentials. + + (*) struct cred *get_new_cred(struct cred *cred); + + This gets a reference on a set of credentials that is under construction + and is thus still mutable, returning a pointer to that set of credentials. + + +===================== +OPEN FILE CREDENTIALS +===================== + +When a new file is opened, a reference is obtained on the opening task's +credentials and this is attached to the file struct as 'f_cred' in place of +'f_uid' and 'f_gid'. Code that used to access file->f_uid and file->f_gid +should now access file->f_cred->fsuid and file->f_cred->fsgid. + +It is safe to access f_cred without the use of RCU or locking because the +pointer will not change over the lifetime of the file struct, and nor will the +contents of the cred struct pointed to, barring the exceptions listed above +(see the Task Credentials section). + + +======================================= +OVERRIDING THE VFS'S USE OF CREDENTIALS +======================================= + +Under some circumstances it is desirable to override the credentials used by +the VFS, and that can be done by calling into such as vfs_mkdir() with a +different set of credentials. This is done in the following places: + + (*) sys_faccessat(). + + (*) do_coredump(). + + (*) nfs4recover.c. diff --git a/include/linux/cred.h b/include/linux/cred.h index 8edb4d1d542..794aab5c66e 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -1,4 +1,4 @@ -/* Credentials management +/* Credentials management - see Documentation/credentials.txt * * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) @@ -169,6 +169,12 @@ static inline struct cred *get_new_cred(struct cred *cred) * * Get a reference on the specified set of credentials. The caller must * release the reference. + * + * This is used to deal with a committed set of credentials. Although the + * pointer is const, this will temporarily discard the const and increment the + * usage count. The purpose of this is to attempt to catch at compile time the + * accidental alteration of a set of credentials that should be considered + * immutable. */ static inline const struct cred *get_cred(const struct cred *cred) { @@ -181,6 +187,10 @@ static inline const struct cred *get_cred(const struct cred *cred) * * Release a reference to a set of credentials, deleting them when the last ref * is released. + * + * This takes a const pointer to a set of credentials because the credentials + * on task_struct are attached by const pointers to prevent accidental + * alteration of otherwise immutable credential sets. */ static inline void put_cred(const struct cred *_cred) { diff --git a/kernel/cred.c b/kernel/cred.c index e6fcdd67b2e..b8bd2f99d8c 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -1,4 +1,4 @@ -/* Task credentials management +/* Task credentials management - see Documentation/credentials.txt * * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) -- cgit v1.2.3-70-g09d2 From 3b11a1decef07c19443d24ae926982bc8ec9f4c0 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Nov 2008 10:39:26 +1100 Subject: CRED: Differentiate objective and effective subjective credentials on a task Differentiate the objective and real subjective credentials from the effective subjective credentials on a task by introducing a second credentials pointer into the task_struct. task_struct::real_cred then refers to the objective and apparent real subjective credentials of a task, as perceived by the other tasks in the system. task_struct::cred then refers to the effective subjective credentials of a task, as used by that task when it's actually running. These are not visible to the other tasks in the system. __task_cred(task) then refers to the objective/real credentials of the task in question. current_cred() refers to the effective subjective credentials of the current task. prepare_creds() uses the objective creds as a base and commit_creds() changes both pointers in the task_struct (indeed commit_creds() requires them to be the same). override_creds() and revert_creds() change the subjective creds pointer only, and the former returns the old subjective creds. These are used by NFSD, faccessat() and do_coredump(), and will by used by CacheFiles. In SELinux, current_has_perm() is provided as an alternative to task_has_perm(). This uses the effective subjective context of current, whereas task_has_perm() uses the objective/real context of the subject. Signed-off-by: David Howells Signed-off-by: James Morris --- fs/nfsd/auth.c | 5 +++- include/linux/cred.h | 29 +++++++++++---------- include/linux/init_task.h | 1 + include/linux/sched.h | 5 +++- kernel/cred.c | 38 ++++++++++++++++++--------- kernel/fork.c | 6 +++-- security/selinux/hooks.c | 65 ++++++++++++++++++++++++++++++----------------- 7 files changed, 95 insertions(+), 54 deletions(-) (limited to 'kernel') diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c index 836ffa1047d..0184fe9b514 100644 --- a/fs/nfsd/auth.c +++ b/fs/nfsd/auth.c @@ -34,6 +34,8 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) int flags = nfsexp_flags(rqstp, exp); int ret; + /* discard any old override before preparing the new set */ + revert_creds(get_cred(current->real_cred)); new = prepare_creds(); if (!new) return -ENOMEM; @@ -82,7 +84,8 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) else new->cap_effective = cap_raise_nfsd_set(new->cap_effective, new->cap_permitted); - return commit_creds(new); + put_cred(override_creds(new)); + return 0; oom: ret = -ENOMEM; diff --git a/include/linux/cred.h b/include/linux/cred.h index 794aab5c66e..55a9c995d69 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -146,8 +146,8 @@ extern struct cred *prepare_exec_creds(void); extern struct cred *prepare_usermodehelper_creds(void); extern int commit_creds(struct cred *); extern void abort_creds(struct cred *); -extern const struct cred *override_creds(const struct cred *) __deprecated; -extern void revert_creds(const struct cred *) __deprecated; +extern const struct cred *override_creds(const struct cred *); +extern void revert_creds(const struct cred *); extern void __init cred_init(void); /** @@ -202,32 +202,32 @@ static inline void put_cred(const struct cred *_cred) } /** - * current_cred - Access the current task's credentials + * current_cred - Access the current task's subjective credentials * - * Access the credentials of the current task. + * Access the subjective credentials of the current task. */ #define current_cred() \ (current->cred) /** - * __task_cred - Access another task's credentials + * __task_cred - Access a task's objective credentials * @task: The task to query * - * Access the credentials of another task. The caller must hold the - * RCU readlock. + * Access the objective credentials of a task. The caller must hold the RCU + * readlock. * * The caller must make sure task doesn't go away, either by holding a ref on * task or by holding tasklist_lock to prevent it from being unlinked. */ #define __task_cred(task) \ - ((const struct cred *)(rcu_dereference((task)->cred))) + ((const struct cred *)(rcu_dereference((task)->real_cred))) /** - * get_task_cred - Get another task's credentials + * get_task_cred - Get another task's objective credentials * @task: The task to query * - * Get the credentials of a task, pinning them so that they can't go away. - * Accessing a task's credentials directly is not permitted. + * Get the objective credentials of a task, pinning them so that they can't go + * away. Accessing a task's credentials directly is not permitted. * * The caller must make sure task doesn't go away, either by holding a ref on * task or by holding tasklist_lock to prevent it from being unlinked. @@ -243,10 +243,11 @@ static inline void put_cred(const struct cred *_cred) }) /** - * get_current_cred - Get the current task's credentials + * get_current_cred - Get the current task's subjective credentials * - * Get the credentials of the current task, pinning them so that they can't go - * away. Accessing the current task's credentials directly is not permitted. + * Get the subjective credentials of the current task, pinning them so that + * they can't go away. Accessing the current task's credentials directly is + * not permitted. */ #define get_current_cred() \ (get_cred(current_cred())) diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 08c3b24ad9a..2597858035c 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -149,6 +149,7 @@ extern struct cred init_cred; .children = LIST_HEAD_INIT(tsk.children), \ .sibling = LIST_HEAD_INIT(tsk.sibling), \ .group_leader = &tsk, \ + .real_cred = &init_cred, \ .cred = &init_cred, \ .cred_exec_mutex = \ __MUTEX_INITIALIZER(tsk.cred_exec_mutex), \ diff --git a/include/linux/sched.h b/include/linux/sched.h index 121d655e460..3443123b070 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1145,7 +1145,10 @@ struct task_struct { struct list_head cpu_timers[3]; /* process credentials */ - const struct cred *cred; /* actual/objective task credentials (COW) */ + const struct cred *real_cred; /* objective and real subjective task + * credentials (COW) */ + const struct cred *cred; /* effective (overridable) subjective task + * credentials (COW) */ struct mutex cred_exec_mutex; /* execve vs ptrace cred calculation mutex */ char comm[TASK_COMM_LEN]; /* executable name excluding path diff --git a/kernel/cred.c b/kernel/cred.c index b8bd2f99d8c..f3ca1066061 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -35,7 +35,7 @@ static struct thread_group_cred init_tgcred = { * The initial credentials for the initial task */ struct cred init_cred = { - .usage = ATOMIC_INIT(3), + .usage = ATOMIC_INIT(4), .securebits = SECUREBITS_DEFAULT, .cap_inheritable = CAP_INIT_INH_SET, .cap_permitted = CAP_FULL_SET, @@ -120,6 +120,8 @@ EXPORT_SYMBOL(__put_cred); * prepare a new copy, which the caller then modifies and then commits by * calling commit_creds(). * + * Preparation involves making a copy of the objective creds for modification. + * * Returns a pointer to the new creds-to-be if successful, NULL otherwise. * * Call commit_creds() or abort_creds() to clean up. @@ -130,7 +132,7 @@ struct cred *prepare_creds(void) const struct cred *old; struct cred *new; - BUG_ON(atomic_read(&task->cred->usage) < 1); + BUG_ON(atomic_read(&task->real_cred->usage) < 1); new = kmem_cache_alloc(cred_jar, GFP_KERNEL); if (!new) @@ -262,6 +264,9 @@ error: * * We share if we can, but under some circumstances we have to generate a new * set. + * + * The new process gets the current process's subjective credentials as its + * objective and subjective credentials */ int copy_creds(struct task_struct *p, unsigned long clone_flags) { @@ -278,6 +283,7 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) #endif clone_flags & CLONE_THREAD ) { + p->real_cred = get_cred(p->cred); get_cred(p->cred); atomic_inc(&p->cred->user->processes); return 0; @@ -317,7 +323,7 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) #endif atomic_inc(&new->user->processes); - p->cred = new; + p->cred = p->real_cred = get_cred(new); return 0; } @@ -326,7 +332,9 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) * @new: The credentials to be assigned * * Install a new set of credentials to the current task, using RCU to replace - * the old set. + * the old set. Both the objective and the subjective credentials pointers are + * updated. This function may not be called if the subjective credentials are + * in an overridden state. * * This function eats the caller's reference to the new credentials. * @@ -338,12 +346,15 @@ int commit_creds(struct cred *new) struct task_struct *task = current; const struct cred *old; + BUG_ON(task->cred != task->real_cred); + BUG_ON(atomic_read(&task->real_cred->usage) < 2); BUG_ON(atomic_read(&new->usage) < 1); - BUG_ON(atomic_read(&task->cred->usage) < 1); - old = task->cred; + old = task->real_cred; security_commit_creds(new, old); + get_cred(new); /* we will require a ref for the subj creds too */ + /* dumpability changes */ if (old->euid != new->euid || old->egid != new->egid || @@ -369,6 +380,7 @@ int commit_creds(struct cred *new) */ if (new->user != old->user) atomic_inc(&new->user->processes); + rcu_assign_pointer(task->real_cred, new); rcu_assign_pointer(task->cred, new); if (new->user != old->user) atomic_dec(&old->user->processes); @@ -388,6 +400,8 @@ int commit_creds(struct cred *new) new->fsgid != old->fsgid) proc_id_connector(task, PROC_EVENT_GID); + /* release the old obj and subj refs both */ + put_cred(old); put_cred(old); return 0; } @@ -408,11 +422,11 @@ void abort_creds(struct cred *new) EXPORT_SYMBOL(abort_creds); /** - * override_creds - Temporarily override the current process's credentials + * override_creds - Override the current process's subjective credentials * @new: The credentials to be assigned * - * Install a set of temporary override credentials on the current process, - * returning the old set for later reversion. + * Install a set of temporary override subjective credentials on the current + * process, returning the old set for later reversion. */ const struct cred *override_creds(const struct cred *new) { @@ -424,11 +438,11 @@ const struct cred *override_creds(const struct cred *new) EXPORT_SYMBOL(override_creds); /** - * revert_creds - Revert a temporary credentials override + * revert_creds - Revert a temporary subjective credentials override * @old: The credentials to be restored * - * Revert a temporary set of override credentials to an old set, discarding the - * override set. + * Revert a temporary set of override subjective credentials to an old set, + * discarding the override set. */ void revert_creds(const struct cred *old) { diff --git a/kernel/fork.c b/kernel/fork.c index 82a7948a664..af0d0f04585 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -146,6 +146,7 @@ void __put_task_struct(struct task_struct *tsk) WARN_ON(atomic_read(&tsk->usage)); WARN_ON(tsk == current); + put_cred(tsk->real_cred); put_cred(tsk->cred); delayacct_tsk_free(tsk); @@ -961,10 +962,10 @@ static struct task_struct *copy_process(unsigned long clone_flags, DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); #endif retval = -EAGAIN; - if (atomic_read(&p->cred->user->processes) >= + if (atomic_read(&p->real_cred->user->processes) >= p->signal->rlim[RLIMIT_NPROC].rlim_cur) { if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && - p->cred->user != current->nsproxy->user_ns->root_user) + p->real_cred->user != current->nsproxy->user_ns->root_user) goto bad_fork_free; } @@ -1278,6 +1279,7 @@ bad_fork_cleanup_put_domain: module_put(task_thread_info(p)->exec_domain->module); bad_fork_cleanup_count: atomic_dec(&p->cred->user->processes); + put_cred(p->real_cred); put_cred(p->cred); bad_fork_free: free_task(p); diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 21a59218463..91b06f2aa96 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -161,7 +161,7 @@ static int selinux_secmark_enabled(void) */ static void cred_init_security(void) { - struct cred *cred = (struct cred *) current->cred; + struct cred *cred = (struct cred *) current->real_cred; struct task_security_struct *tsec; tsec = kzalloc(sizeof(struct task_security_struct), GFP_KERNEL); @@ -184,7 +184,7 @@ static inline u32 cred_sid(const struct cred *cred) } /* - * get the security ID of a task + * get the objective security ID of a task */ static inline u32 task_sid(const struct task_struct *task) { @@ -197,7 +197,7 @@ static inline u32 task_sid(const struct task_struct *task) } /* - * get the security ID of the current task + * get the subjective security ID of the current task */ static inline u32 current_sid(void) { @@ -1395,6 +1395,7 @@ static int cred_has_perm(const struct cred *actor, * Check permission between a pair of tasks, e.g. signal checks, * fork check, ptrace check, etc. * tsk1 is the actor and tsk2 is the target + * - this uses the default subjective creds of tsk1 */ static int task_has_perm(const struct task_struct *tsk1, const struct task_struct *tsk2, @@ -1410,6 +1411,22 @@ static int task_has_perm(const struct task_struct *tsk1, return avc_has_perm(sid1, sid2, SECCLASS_PROCESS, perms, NULL); } +/* + * Check permission between current and another task, e.g. signal checks, + * fork check, ptrace check, etc. + * current is the actor and tsk2 is the target + * - this uses current's subjective creds + */ +static int current_has_perm(const struct task_struct *tsk, + u32 perms) +{ + u32 sid, tsid; + + sid = current_sid(); + tsid = task_sid(tsk); + return avc_has_perm(sid, tsid, SECCLASS_PROCESS, perms, NULL); +} + #if CAP_LAST_CAP > 63 #error Fix SELinux to handle capabilities > 63. #endif @@ -1807,7 +1824,7 @@ static int selinux_ptrace_may_access(struct task_struct *child, return avc_has_perm(sid, csid, SECCLASS_FILE, FILE__READ, NULL); } - return task_has_perm(current, child, PROCESS__PTRACE); + return current_has_perm(child, PROCESS__PTRACE); } static int selinux_ptrace_traceme(struct task_struct *parent) @@ -1826,7 +1843,7 @@ static int selinux_capget(struct task_struct *target, kernel_cap_t *effective, { int error; - error = task_has_perm(current, target, PROCESS__GETCAP); + error = current_has_perm(target, PROCESS__GETCAP); if (error) return error; @@ -3071,7 +3088,7 @@ static int selinux_file_mprotect(struct vm_area_struct *vma, } else if (!vma->vm_file && vma->vm_start <= vma->vm_mm->start_stack && vma->vm_end >= vma->vm_mm->start_stack) { - rc = task_has_perm(current, current, PROCESS__EXECSTACK); + rc = current_has_perm(current, PROCESS__EXECSTACK); } else if (vma->vm_file && vma->anon_vma) { /* * We are making executable a file mapping that has @@ -3220,7 +3237,7 @@ static int selinux_task_create(unsigned long clone_flags) if (rc) return rc; - return task_has_perm(current, current, PROCESS__FORK); + return current_has_perm(current, PROCESS__FORK); } /* @@ -3285,17 +3302,17 @@ static int selinux_task_setgid(gid_t id0, gid_t id1, gid_t id2, int flags) static int selinux_task_setpgid(struct task_struct *p, pid_t pgid) { - return task_has_perm(current, p, PROCESS__SETPGID); + return current_has_perm(p, PROCESS__SETPGID); } static int selinux_task_getpgid(struct task_struct *p) { - return task_has_perm(current, p, PROCESS__GETPGID); + return current_has_perm(p, PROCESS__GETPGID); } static int selinux_task_getsid(struct task_struct *p) { - return task_has_perm(current, p, PROCESS__GETSESSION); + return current_has_perm(p, PROCESS__GETSESSION); } static void selinux_task_getsecid(struct task_struct *p, u32 *secid) @@ -3317,7 +3334,7 @@ static int selinux_task_setnice(struct task_struct *p, int nice) if (rc) return rc; - return task_has_perm(current, p, PROCESS__SETSCHED); + return current_has_perm(p, PROCESS__SETSCHED); } static int selinux_task_setioprio(struct task_struct *p, int ioprio) @@ -3328,12 +3345,12 @@ static int selinux_task_setioprio(struct task_struct *p, int ioprio) if (rc) return rc; - return task_has_perm(current, p, PROCESS__SETSCHED); + return current_has_perm(p, PROCESS__SETSCHED); } static int selinux_task_getioprio(struct task_struct *p) { - return task_has_perm(current, p, PROCESS__GETSCHED); + return current_has_perm(p, PROCESS__GETSCHED); } static int selinux_task_setrlimit(unsigned int resource, struct rlimit *new_rlim) @@ -3350,7 +3367,7 @@ static int selinux_task_setrlimit(unsigned int resource, struct rlimit *new_rlim later be used as a safe reset point for the soft limit upon context transitions. See selinux_bprm_committing_creds. */ if (old_rlim->rlim_max != new_rlim->rlim_max) - return task_has_perm(current, current, PROCESS__SETRLIMIT); + return current_has_perm(current, PROCESS__SETRLIMIT); return 0; } @@ -3363,17 +3380,17 @@ static int selinux_task_setscheduler(struct task_struct *p, int policy, struct s if (rc) return rc; - return task_has_perm(current, p, PROCESS__SETSCHED); + return current_has_perm(p, PROCESS__SETSCHED); } static int selinux_task_getscheduler(struct task_struct *p) { - return task_has_perm(current, p, PROCESS__GETSCHED); + return current_has_perm(p, PROCESS__GETSCHED); } static int selinux_task_movememory(struct task_struct *p) { - return task_has_perm(current, p, PROCESS__SETSCHED); + return current_has_perm(p, PROCESS__SETSCHED); } static int selinux_task_kill(struct task_struct *p, struct siginfo *info, @@ -3394,7 +3411,7 @@ static int selinux_task_kill(struct task_struct *p, struct siginfo *info, rc = avc_has_perm(secid, task_sid(p), SECCLASS_PROCESS, perm, NULL); else - rc = task_has_perm(current, p, perm); + rc = current_has_perm(p, perm); return rc; } @@ -5250,7 +5267,7 @@ static int selinux_getprocattr(struct task_struct *p, unsigned len; if (current != p) { - error = task_has_perm(current, p, PROCESS__GETATTR); + error = current_has_perm(p, PROCESS__GETATTR); if (error) return error; } @@ -5309,15 +5326,15 @@ static int selinux_setprocattr(struct task_struct *p, * above restriction is ever removed. */ if (!strcmp(name, "exec")) - error = task_has_perm(current, p, PROCESS__SETEXEC); + error = current_has_perm(p, PROCESS__SETEXEC); else if (!strcmp(name, "fscreate")) - error = task_has_perm(current, p, PROCESS__SETFSCREATE); + error = current_has_perm(p, PROCESS__SETFSCREATE); else if (!strcmp(name, "keycreate")) - error = task_has_perm(current, p, PROCESS__SETKEYCREATE); + error = current_has_perm(p, PROCESS__SETKEYCREATE); else if (!strcmp(name, "sockcreate")) - error = task_has_perm(current, p, PROCESS__SETSOCKCREATE); + error = current_has_perm(p, PROCESS__SETSOCKCREATE); else if (!strcmp(name, "current")) - error = task_has_perm(current, p, PROCESS__SETCURRENT); + error = current_has_perm(p, PROCESS__SETCURRENT); else error = -EINVAL; if (error) -- cgit v1.2.3-70-g09d2 From 3a3b7ce9336952ea7b9564d976d068a238976c9d Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Nov 2008 10:39:28 +1100 Subject: CRED: Allow kernel services to override LSM settings for task actions Allow kernel services to override LSM settings appropriate to the actions performed by a task by duplicating a set of credentials, modifying it and then using task_struct::cred to point to it when performing operations on behalf of a task. This is used, for example, by CacheFiles which has to transparently access the cache on behalf of a process that thinks it is doing, say, NFS accesses with a potentially inappropriate (with respect to accessing the cache) set of credentials. This patch provides two LSM hooks for modifying a task security record: (*) security_kernel_act_as() which allows modification of the security datum with which a task acts on other objects (most notably files). (*) security_kernel_create_files_as() which allows modification of the security datum that is used to initialise the security data on a file that a task creates. The patch also provides four new credentials handling functions, which wrap the LSM functions: (1) prepare_kernel_cred() Prepare a set of credentials for a kernel service to use, based either on a daemon's credentials or on init_cred. All the keyrings are cleared. (2) set_security_override() Set the LSM security ID in a set of credentials to a specific security context, assuming permission from the LSM policy. (3) set_security_override_from_ctx() As (2), but takes the security context as a string. (4) set_create_files_as() Set the file creation LSM security ID in a set of credentials to be the same as that on a particular inode. Signed-off-by: Casey Schaufler [Smack changes] Signed-off-by: David Howells Signed-off-by: James Morris --- include/linux/cred.h | 6 +++ include/linux/security.h | 28 +++++++++++ kernel/cred.c | 113 +++++++++++++++++++++++++++++++++++++++++++++ security/capability.c | 12 +++++ security/security.c | 10 ++++ security/selinux/hooks.c | 46 ++++++++++++++++++ security/smack/smack_lsm.c | 37 +++++++++++++++ 7 files changed, 252 insertions(+) (limited to 'kernel') diff --git a/include/linux/cred.h b/include/linux/cred.h index 55a9c995d69..26c1ab17994 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -18,6 +18,7 @@ struct user_struct; struct cred; +struct inode; /* * COW Supplementary groups list @@ -148,6 +149,11 @@ extern int commit_creds(struct cred *); extern void abort_creds(struct cred *); extern const struct cred *override_creds(const struct cred *); extern void revert_creds(const struct cred *); +extern struct cred *prepare_kernel_cred(struct task_struct *); +extern int change_create_files_as(struct cred *, struct inode *); +extern int set_security_override(struct cred *, u32); +extern int set_security_override_from_ctx(struct cred *, const char *); +extern int set_create_files_as(struct cred *, struct inode *); extern void __init cred_init(void); /** diff --git a/include/linux/security.h b/include/linux/security.h index 56a0eed6567..59a11e19b61 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -587,6 +587,19 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * @new points to the new credentials. * @old points to the original credentials. * Install a new set of credentials. + * @kernel_act_as: + * Set the credentials for a kernel service to act as (subjective context). + * @new points to the credentials to be modified. + * @secid specifies the security ID to be set + * The current task must be the one that nominated @secid. + * Return 0 if successful. + * @kernel_create_files_as: + * Set the file creation context in a set of credentials to be the same as + * the objective context of the specified inode. + * @new points to the credentials to be modified. + * @inode points to the inode to use as a reference. + * The current task must be the one that nominated @inode. + * Return 0 if successful. * @task_setuid: * Check permission before setting one or more of the user identity * attributes of the current process. The @flags parameter indicates @@ -1381,6 +1394,8 @@ struct security_operations { int (*cred_prepare)(struct cred *new, const struct cred *old, gfp_t gfp); void (*cred_commit)(struct cred *new, const struct cred *old); + int (*kernel_act_as)(struct cred *new, u32 secid); + int (*kernel_create_files_as)(struct cred *new, struct inode *inode); int (*task_setuid) (uid_t id0, uid_t id1, uid_t id2, int flags); int (*task_fix_setuid) (struct cred *new, const struct cred *old, int flags); @@ -1632,6 +1647,8 @@ int security_task_create(unsigned long clone_flags); void security_cred_free(struct cred *cred); int security_prepare_creds(struct cred *new, const struct cred *old, gfp_t gfp); void security_commit_creds(struct cred *new, const struct cred *old); +int security_kernel_act_as(struct cred *new, u32 secid); +int security_kernel_create_files_as(struct cred *new, struct inode *inode); int security_task_setuid(uid_t id0, uid_t id1, uid_t id2, int flags); int security_task_fix_setuid(struct cred *new, const struct cred *old, int flags); @@ -2151,6 +2168,17 @@ static inline void security_commit_creds(struct cred *new, { } +static inline int security_kernel_act_as(struct cred *cred, u32 secid) +{ + return 0; +} + +static inline int security_kernel_create_files_as(struct cred *cred, + struct inode *inode) +{ + return 0; +} + static inline int security_task_setuid(uid_t id0, uid_t id1, uid_t id2, int flags) { diff --git a/kernel/cred.c b/kernel/cred.c index f3ca1066061..13697ca2bb3 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -462,3 +462,116 @@ void __init cred_init(void) cred_jar = kmem_cache_create("cred_jar", sizeof(struct cred), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); } + +/** + * prepare_kernel_cred - Prepare a set of credentials for a kernel service + * @daemon: A userspace daemon to be used as a reference + * + * Prepare a set of credentials for a kernel service. This can then be used to + * override a task's own credentials so that work can be done on behalf of that + * task that requires a different subjective context. + * + * @daemon is used to provide a base for the security record, but can be NULL. + * If @daemon is supplied, then the security data will be derived from that; + * otherwise they'll be set to 0 and no groups, full capabilities and no keys. + * + * The caller may change these controls afterwards if desired. + * + * Returns the new credentials or NULL if out of memory. + * + * Does not take, and does not return holding current->cred_replace_mutex. + */ +struct cred *prepare_kernel_cred(struct task_struct *daemon) +{ + const struct cred *old; + struct cred *new; + + new = kmem_cache_alloc(cred_jar, GFP_KERNEL); + if (!new) + return NULL; + + if (daemon) + old = get_task_cred(daemon); + else + old = get_cred(&init_cred); + + get_uid(new->user); + get_group_info(new->group_info); + +#ifdef CONFIG_KEYS + atomic_inc(&init_tgcred.usage); + new->tgcred = &init_tgcred; + new->request_key_auth = NULL; + new->thread_keyring = NULL; + new->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; +#endif + +#ifdef CONFIG_SECURITY + new->security = NULL; +#endif + if (security_prepare_creds(new, old, GFP_KERNEL) < 0) + goto error; + + atomic_set(&new->usage, 1); + put_cred(old); + return new; + +error: + put_cred(new); + return NULL; +} +EXPORT_SYMBOL(prepare_kernel_cred); + +/** + * set_security_override - Set the security ID in a set of credentials + * @new: The credentials to alter + * @secid: The LSM security ID to set + * + * Set the LSM security ID in a set of credentials so that the subjective + * security is overridden when an alternative set of credentials is used. + */ +int set_security_override(struct cred *new, u32 secid) +{ + return security_kernel_act_as(new, secid); +} +EXPORT_SYMBOL(set_security_override); + +/** + * set_security_override_from_ctx - Set the security ID in a set of credentials + * @new: The credentials to alter + * @secctx: The LSM security context to generate the security ID from. + * + * Set the LSM security ID in a set of credentials so that the subjective + * security is overridden when an alternative set of credentials is used. The + * security ID is specified in string form as a security context to be + * interpreted by the LSM. + */ +int set_security_override_from_ctx(struct cred *new, const char *secctx) +{ + u32 secid; + int ret; + + ret = security_secctx_to_secid(secctx, strlen(secctx), &secid); + if (ret < 0) + return ret; + + return set_security_override(new, secid); +} +EXPORT_SYMBOL(set_security_override_from_ctx); + +/** + * set_create_files_as - Set the LSM file create context in a set of credentials + * @new: The credentials to alter + * @inode: The inode to take the context from + * + * Change the LSM file creation context in a set of credentials to be the same + * as the object context of the specified inode, so that the new inodes have + * the same MAC context as that inode. + */ +int set_create_files_as(struct cred *new, struct inode *inode) +{ + new->fsuid = inode->i_uid; + new->fsgid = inode->i_gid; + return security_kernel_create_files_as(new, inode); +} +EXPORT_SYMBOL(set_create_files_as); diff --git a/security/capability.c b/security/capability.c index 185804f99ad..b9e391425e6 100644 --- a/security/capability.c +++ b/security/capability.c @@ -348,6 +348,16 @@ static void cap_cred_commit(struct cred *new, const struct cred *old) { } +static int cap_kernel_act_as(struct cred *new, u32 secid) +{ + return 0; +} + +static int cap_kernel_create_files_as(struct cred *new, struct inode *inode) +{ + return 0; +} + static int cap_task_setuid(uid_t id0, uid_t id1, uid_t id2, int flags) { return 0; @@ -889,6 +899,8 @@ void security_fixup_ops(struct security_operations *ops) set_to_cap_if_null(ops, cred_free); set_to_cap_if_null(ops, cred_prepare); set_to_cap_if_null(ops, cred_commit); + set_to_cap_if_null(ops, kernel_act_as); + set_to_cap_if_null(ops, kernel_create_files_as); set_to_cap_if_null(ops, task_setuid); set_to_cap_if_null(ops, task_fix_setuid); set_to_cap_if_null(ops, task_setgid); diff --git a/security/security.c b/security/security.c index dc5babb2d6d..038ef04b2c7 100644 --- a/security/security.c +++ b/security/security.c @@ -616,6 +616,16 @@ void security_commit_creds(struct cred *new, const struct cred *old) return security_ops->cred_commit(new, old); } +int security_kernel_act_as(struct cred *new, u32 secid) +{ + return security_ops->kernel_act_as(new, secid); +} + +int security_kernel_create_files_as(struct cred *new, struct inode *inode) +{ + return security_ops->kernel_create_files_as(new, inode); +} + int security_task_setuid(uid_t id0, uid_t id1, uid_t id2, int flags) { return security_ops->task_setuid(id0, id1, id2, flags); diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 91b06f2aa96..520f82ab3fb 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -3277,6 +3277,50 @@ static void selinux_cred_commit(struct cred *new, const struct cred *old) secondary_ops->cred_commit(new, old); } +/* + * set the security data for a kernel service + * - all the creation contexts are set to unlabelled + */ +static int selinux_kernel_act_as(struct cred *new, u32 secid) +{ + struct task_security_struct *tsec = new->security; + u32 sid = current_sid(); + int ret; + + ret = avc_has_perm(sid, secid, + SECCLASS_KERNEL_SERVICE, + KERNEL_SERVICE__USE_AS_OVERRIDE, + NULL); + if (ret == 0) { + tsec->sid = secid; + tsec->create_sid = 0; + tsec->keycreate_sid = 0; + tsec->sockcreate_sid = 0; + } + return ret; +} + +/* + * set the file creation context in a security record to the same as the + * objective context of the specified inode + */ +static int selinux_kernel_create_files_as(struct cred *new, struct inode *inode) +{ + struct inode_security_struct *isec = inode->i_security; + struct task_security_struct *tsec = new->security; + u32 sid = current_sid(); + int ret; + + ret = avc_has_perm(sid, isec->sid, + SECCLASS_KERNEL_SERVICE, + KERNEL_SERVICE__CREATE_FILES_AS, + NULL); + + if (ret == 0) + tsec->create_sid = isec->sid; + return 0; +} + static int selinux_task_setuid(uid_t id0, uid_t id1, uid_t id2, int flags) { /* Since setuid only affects the current process, and @@ -5593,6 +5637,8 @@ static struct security_operations selinux_ops = { .cred_free = selinux_cred_free, .cred_prepare = selinux_cred_prepare, .cred_commit = selinux_cred_commit, + .kernel_act_as = selinux_kernel_act_as, + .kernel_create_files_as = selinux_kernel_create_files_as, .task_setuid = selinux_task_setuid, .task_fix_setuid = selinux_task_fix_setuid, .task_setgid = selinux_task_setgid, diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index de396742abf..8ad48161cef 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -1011,6 +1011,41 @@ static void smack_cred_commit(struct cred *new, const struct cred *old) { } +/** + * smack_kernel_act_as - Set the subjective context in a set of credentials + * @new points to the set of credentials to be modified. + * @secid specifies the security ID to be set + * + * Set the security data for a kernel service. + */ +static int smack_kernel_act_as(struct cred *new, u32 secid) +{ + char *smack = smack_from_secid(secid); + + if (smack == NULL) + return -EINVAL; + + new->security = smack; + return 0; +} + +/** + * smack_kernel_create_files_as - Set the file creation label in a set of creds + * @new points to the set of credentials to be modified + * @inode points to the inode to use as a reference + * + * Set the file creation context in a set of credentials to the same + * as the objective context of the specified inode + */ +static int smack_kernel_create_files_as(struct cred *new, + struct inode *inode) +{ + struct inode_smack *isp = inode->i_security; + + new->security = isp->smk_inode; + return 0; +} + /** * smack_task_setpgid - Smack check on setting pgid * @p: the task object @@ -2641,6 +2676,8 @@ struct security_operations smack_ops = { .cred_free = smack_cred_free, .cred_prepare = smack_cred_prepare, .cred_commit = smack_cred_commit, + .kernel_act_as = smack_kernel_act_as, + .kernel_create_files_as = smack_kernel_create_files_as, .task_fix_setuid = cap_task_fix_setuid, .task_setpgid = smack_task_setpgid, .task_getpgid = smack_task_getpgid, -- cgit v1.2.3-70-g09d2 From 1acdac104668a0834cfa267de9946fac7764d486 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Nov 2008 10:02:53 -0800 Subject: futex: make clock selectable for FUTEX_WAIT_BITSET FUTEX_WAIT_BITSET could be used instead of FUTEX_WAIT by setting the bit set to FUTEX_BITSET_MATCH_ANY, but FUTEX_WAIT uses CLOCK_REALTIME while FUTEX_WAIT_BITSET uses CLOCK_MONOTONIC. Add a flag to select CLOCK_REALTIME for FUTEX_WAIT_BITSET so glibc can replace the FUTEX_WAIT logic which needs to do gettimeofday() calls before and after the syscall to convert the absolute timeout to a relative timeout for FUTEX_WAIT. Signed-off-by: Thomas Gleixner Cc: Ulrich Drepper --- include/linux/futex.h | 3 ++- kernel/futex.c | 24 +++++++++++++++++------- 2 files changed, 19 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/futex.h b/include/linux/futex.h index 8f627b9ae2b..3bf5bb5a34f 100644 --- a/include/linux/futex.h +++ b/include/linux/futex.h @@ -25,7 +25,8 @@ union ktime; #define FUTEX_WAKE_BITSET 10 #define FUTEX_PRIVATE_FLAG 128 -#define FUTEX_CMD_MASK ~FUTEX_PRIVATE_FLAG +#define FUTEX_CLOCK_REALTIME 256 +#define FUTEX_CMD_MASK ~(FUTEX_PRIVATE_FLAG | FUTEX_CLOCK_REALTIME) #define FUTEX_WAIT_PRIVATE (FUTEX_WAIT | FUTEX_PRIVATE_FLAG) #define FUTEX_WAKE_PRIVATE (FUTEX_WAKE | FUTEX_PRIVATE_FLAG) diff --git a/kernel/futex.c b/kernel/futex.c index e10c5c8786a..ba0d3b83c09 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1142,12 +1142,13 @@ handle_fault: * In case we must use restart_block to restart a futex_wait, * we encode in the 'flags' shared capability */ -#define FLAGS_SHARED 1 +#define FLAGS_SHARED 0x01 +#define FLAGS_CLOCKRT 0x02 static long futex_wait_restart(struct restart_block *restart); static int futex_wait(u32 __user *uaddr, int fshared, - u32 val, ktime_t *abs_time, u32 bitset) + u32 val, ktime_t *abs_time, u32 bitset, int clockrt) { struct task_struct *curr = current; DECLARE_WAITQUEUE(wait, curr); @@ -1233,8 +1234,10 @@ static int futex_wait(u32 __user *uaddr, int fshared, slack = current->timer_slack_ns; if (rt_task(current)) slack = 0; - hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, - HRTIMER_MODE_ABS); + hrtimer_init_on_stack(&t.timer, + clockrt ? CLOCK_REALTIME : + CLOCK_MONOTONIC, + HRTIMER_MODE_ABS); hrtimer_init_sleeper(&t, current); hrtimer_set_expires_range_ns(&t.timer, *abs_time, slack); @@ -1289,6 +1292,8 @@ static int futex_wait(u32 __user *uaddr, int fshared, if (fshared) restart->futex.flags |= FLAGS_SHARED; + if (clockrt) + restart->futex.flags |= FLAGS_CLOCKRT; return -ERESTART_RESTARTBLOCK; } @@ -1312,7 +1317,8 @@ static long futex_wait_restart(struct restart_block *restart) if (restart->futex.flags & FLAGS_SHARED) fshared = 1; return (long)futex_wait(uaddr, fshared, restart->futex.val, &t, - restart->futex.bitset); + restart->futex.bitset, + restart->futex.flags & FLAGS_CLOCKRT); } @@ -1905,18 +1911,22 @@ void exit_robust_list(struct task_struct *curr) long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, u32 __user *uaddr2, u32 val2, u32 val3) { - int ret = -ENOSYS; + int clockrt, ret = -ENOSYS; int cmd = op & FUTEX_CMD_MASK; int fshared = 0; if (!(op & FUTEX_PRIVATE_FLAG)) fshared = 1; + clockrt = op & FUTEX_CLOCK_REALTIME; + if (clockrt && cmd != FUTEX_WAIT_BITSET) + return -ENOSYS; + switch (cmd) { case FUTEX_WAIT: val3 = FUTEX_BITSET_MATCH_ANY; case FUTEX_WAIT_BITSET: - ret = futex_wait(uaddr, fshared, val, timeout, val3); + ret = futex_wait(uaddr, fshared, val, timeout, val3, clockrt); break; case FUTEX_WAKE: val3 = FUTEX_BITSET_MATCH_ANY; -- cgit v1.2.3-70-g09d2 From 18b6e0414e42d95183f07d8177e3ff0241abd825 Mon Sep 17 00:00:00 2001 From: Serge Hallyn Date: Wed, 15 Oct 2008 16:38:45 -0500 Subject: User namespaces: set of cleanups (v2) The user_ns is moved from nsproxy to user_struct, so that a struct cred by itself is sufficient to determine access (which it otherwise would not be). Corresponding ecryptfs fixes (by David Howells) are here as well. Fix refcounting. The following rules now apply: 1. The task pins the user struct. 2. The user struct pins its user namespace. 3. The user namespace pins the struct user which created it. User namespaces are cloned during copy_creds(). Unsharing a new user_ns is no longer possible. (We could re-add that, but it'll cause code duplication and doesn't seem useful if PAM doesn't need to clone user namespaces). When a user namespace is created, its first user (uid 0) gets empty keyrings and a clean group_info. This incorporates a previous patch by David Howells. Here is his original patch description: >I suggest adding the attached incremental patch. It makes the following >changes: > > (1) Provides a current_user_ns() macro to wrap accesses to current's user > namespace. > > (2) Fixes eCryptFS. > > (3) Renames create_new_userns() to create_user_ns() to be more consistent > with the other associated functions and because the 'new' in the name is > superfluous. > > (4) Moves the argument and permission checks made for CLONE_NEWUSER to the > beginning of do_fork() so that they're done prior to making any attempts > at allocation. > > (5) Calls create_user_ns() after prepare_creds(), and gives it the new creds > to fill in rather than have it return the new root user. I don't imagine > the new root user being used for anything other than filling in a cred > struct. > > This also permits me to get rid of a get_uid() and a free_uid(), as the > reference the creds were holding on the old user_struct can just be > transferred to the new namespace's creator pointer. > > (6) Makes create_user_ns() reset the UIDs and GIDs of the creds under > preparation rather than doing it in copy_creds(). > >David >Signed-off-by: David Howells Changelog: Oct 20: integrate dhowells comments 1. leave thread_keyring alone 2. use current_user_ns() in set_user() Signed-off-by: Serge Hallyn --- fs/ecryptfs/messaging.c | 13 ++++---- fs/ecryptfs/miscdev.c | 19 ++++------- include/linux/cred.h | 2 ++ include/linux/init_task.h | 1 - include/linux/nsproxy.h | 1 - include/linux/sched.h | 1 + include/linux/user_namespace.h | 13 +++----- kernel/cred.c | 15 +++++++-- kernel/fork.c | 19 +++++++++-- kernel/nsproxy.c | 15 ++------- kernel/sys.c | 4 +-- kernel/user.c | 47 ++++++++------------------ kernel/user_namespace.c | 75 +++++++++++++++++------------------------- 13 files changed, 96 insertions(+), 129 deletions(-) (limited to 'kernel') diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c index e0b0a4e28b9..6913f727624 100644 --- a/fs/ecryptfs/messaging.c +++ b/fs/ecryptfs/messaging.c @@ -360,7 +360,7 @@ int ecryptfs_process_response(struct ecryptfs_message *msg, uid_t euid, struct ecryptfs_msg_ctx *msg_ctx; size_t msg_size; struct nsproxy *nsproxy; - struct user_namespace *current_user_ns; + struct user_namespace *tsk_user_ns; uid_t ctx_euid; int rc; @@ -385,9 +385,9 @@ int ecryptfs_process_response(struct ecryptfs_message *msg, uid_t euid, mutex_unlock(&ecryptfs_daemon_hash_mux); goto wake_up; } - current_user_ns = nsproxy->user_ns; + tsk_user_ns = __task_cred(msg_ctx->task)->user->user_ns; ctx_euid = task_euid(msg_ctx->task); - rc = ecryptfs_find_daemon_by_euid(&daemon, ctx_euid, current_user_ns); + rc = ecryptfs_find_daemon_by_euid(&daemon, ctx_euid, tsk_user_ns); rcu_read_unlock(); mutex_unlock(&ecryptfs_daemon_hash_mux); if (rc) { @@ -405,11 +405,11 @@ int ecryptfs_process_response(struct ecryptfs_message *msg, uid_t euid, euid, ctx_euid); goto unlock; } - if (current_user_ns != user_ns) { + if (tsk_user_ns != user_ns) { rc = -EBADMSG; printk(KERN_WARNING "%s: Received message from user_ns " "[0x%p]; expected message from user_ns [0x%p]\n", - __func__, user_ns, nsproxy->user_ns); + __func__, user_ns, tsk_user_ns); goto unlock; } if (daemon->pid != pid) { @@ -468,8 +468,7 @@ ecryptfs_send_message_locked(char *data, int data_len, u8 msg_type, uid_t euid = current_euid(); int rc; - rc = ecryptfs_find_daemon_by_euid(&daemon, euid, - current->nsproxy->user_ns); + rc = ecryptfs_find_daemon_by_euid(&daemon, euid, current_user_ns()); if (rc || !daemon) { rc = -ENOTCONN; printk(KERN_ERR "%s: User [%d] does not have a daemon " diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c index 047ac609695..efd95a0ed1e 100644 --- a/fs/ecryptfs/miscdev.c +++ b/fs/ecryptfs/miscdev.c @@ -47,8 +47,7 @@ ecryptfs_miscdev_poll(struct file *file, poll_table *pt) mutex_lock(&ecryptfs_daemon_hash_mux); /* TODO: Just use file->private_data? */ - rc = ecryptfs_find_daemon_by_euid(&daemon, euid, - current->nsproxy->user_ns); + rc = ecryptfs_find_daemon_by_euid(&daemon, euid, current_user_ns()); BUG_ON(rc || !daemon); mutex_lock(&daemon->mux); mutex_unlock(&ecryptfs_daemon_hash_mux); @@ -95,11 +94,9 @@ ecryptfs_miscdev_open(struct inode *inode, struct file *file) "count; rc = [%d]\n", __func__, rc); goto out_unlock_daemon_list; } - rc = ecryptfs_find_daemon_by_euid(&daemon, euid, - current->nsproxy->user_ns); + rc = ecryptfs_find_daemon_by_euid(&daemon, euid, current_user_ns()); if (rc || !daemon) { - rc = ecryptfs_spawn_daemon(&daemon, euid, - current->nsproxy->user_ns, + rc = ecryptfs_spawn_daemon(&daemon, euid, current_user_ns(), task_pid(current)); if (rc) { printk(KERN_ERR "%s: Error attempting to spawn daemon; " @@ -153,8 +150,7 @@ ecryptfs_miscdev_release(struct inode *inode, struct file *file) int rc; mutex_lock(&ecryptfs_daemon_hash_mux); - rc = ecryptfs_find_daemon_by_euid(&daemon, euid, - current->nsproxy->user_ns); + rc = ecryptfs_find_daemon_by_euid(&daemon, euid, current_user_ns()); BUG_ON(rc || !daemon); mutex_lock(&daemon->mux); BUG_ON(daemon->pid != task_pid(current)); @@ -254,8 +250,7 @@ ecryptfs_miscdev_read(struct file *file, char __user *buf, size_t count, mutex_lock(&ecryptfs_daemon_hash_mux); /* TODO: Just use file->private_data? */ - rc = ecryptfs_find_daemon_by_euid(&daemon, euid, - current->nsproxy->user_ns); + rc = ecryptfs_find_daemon_by_euid(&daemon, euid, current_user_ns()); BUG_ON(rc || !daemon); mutex_lock(&daemon->mux); if (daemon->flags & ECRYPTFS_DAEMON_ZOMBIE) { @@ -295,7 +290,7 @@ check_list: goto check_list; } BUG_ON(euid != daemon->euid); - BUG_ON(current->nsproxy->user_ns != daemon->user_ns); + BUG_ON(current_user_ns() != daemon->user_ns); BUG_ON(task_pid(current) != daemon->pid); msg_ctx = list_first_entry(&daemon->msg_ctx_out_queue, struct ecryptfs_msg_ctx, daemon_out_list); @@ -468,7 +463,7 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf, goto out_free; } rc = ecryptfs_miscdev_response(&data[i], packet_size, - euid, current->nsproxy->user_ns, + euid, current_user_ns(), task_pid(current), seq); if (rc) printk(KERN_WARNING "%s: Failed to deliver miscdev " diff --git a/include/linux/cred.h b/include/linux/cred.h index 26c1ab17994..3282ee4318e 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -60,6 +60,7 @@ do { \ } while (0) extern struct group_info *groups_alloc(int); +extern struct group_info init_groups; extern void groups_free(struct group_info *); extern int set_current_groups(struct group_info *); extern int set_groups(struct cred *, struct group_info *); @@ -315,6 +316,7 @@ static inline void put_cred(const struct cred *_cred) #define current_fsgid() (current_cred_xxx(fsgid)) #define current_cap() (current_cred_xxx(cap_effective)) #define current_user() (current_cred_xxx(user)) +#define current_user_ns() (current_cred_xxx(user)->user_ns) #define current_security() (current_cred_xxx(security)) #define current_uid_gid(_uid, _gid) \ diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 2597858035c..959f5522d10 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -57,7 +57,6 @@ extern struct nsproxy init_nsproxy; .mnt_ns = NULL, \ INIT_NET_NS(net_ns) \ INIT_IPC_NS(ipc_ns) \ - .user_ns = &init_user_ns, \ } #define INIT_SIGHAND(sighand) { \ diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h index c8a768e5964..afad7dec1b3 100644 --- a/include/linux/nsproxy.h +++ b/include/linux/nsproxy.h @@ -27,7 +27,6 @@ struct nsproxy { struct ipc_namespace *ipc_ns; struct mnt_namespace *mnt_ns; struct pid_namespace *pid_ns; - struct user_namespace *user_ns; struct net *net_ns; }; extern struct nsproxy init_nsproxy; diff --git a/include/linux/sched.h b/include/linux/sched.h index 2036e9f2602..7f8015a3082 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -638,6 +638,7 @@ struct user_struct { /* Hash table maintenance information */ struct hlist_node uidhash_node; uid_t uid; + struct user_namespace *user_ns; #ifdef CONFIG_USER_SCHED struct task_group *tg; diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index b5f41d4c2ee..315bcd37522 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -12,7 +12,7 @@ struct user_namespace { struct kref kref; struct hlist_head uidhash_table[UIDHASH_SZ]; - struct user_struct *root_user; + struct user_struct *creator; }; extern struct user_namespace init_user_ns; @@ -26,8 +26,7 @@ static inline struct user_namespace *get_user_ns(struct user_namespace *ns) return ns; } -extern struct user_namespace *copy_user_ns(int flags, - struct user_namespace *old_ns); +extern int create_user_ns(struct cred *new); extern void free_user_ns(struct kref *kref); static inline void put_user_ns(struct user_namespace *ns) @@ -43,13 +42,9 @@ static inline struct user_namespace *get_user_ns(struct user_namespace *ns) return &init_user_ns; } -static inline struct user_namespace *copy_user_ns(int flags, - struct user_namespace *old_ns) +static inline int create_user_ns(struct cred *new) { - if (flags & CLONE_NEWUSER) - return ERR_PTR(-EINVAL); - - return old_ns; + return -EINVAL; } static inline void put_user_ns(struct user_namespace *ns) diff --git a/kernel/cred.c b/kernel/cred.c index 13697ca2bb3..ff7bc071991 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -274,6 +274,7 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) struct thread_group_cred *tgcred; #endif struct cred *new; + int ret; mutex_init(&p->cred_exec_mutex); @@ -293,6 +294,12 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) if (!new) return -ENOMEM; + if (clone_flags & CLONE_NEWUSER) { + ret = create_user_ns(new); + if (ret < 0) + goto error_put; + } + #ifdef CONFIG_KEYS /* new threads get their own thread keyrings if their parent already * had one */ @@ -309,8 +316,8 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) if (!(clone_flags & CLONE_THREAD)) { tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL); if (!tgcred) { - put_cred(new); - return -ENOMEM; + ret = -ENOMEM; + goto error_put; } atomic_set(&tgcred->usage, 1); spin_lock_init(&tgcred->lock); @@ -325,6 +332,10 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) atomic_inc(&new->user->processes); p->cred = p->real_cred = get_cred(new); return 0; + +error_put: + put_cred(new); + return ret; } /** diff --git a/kernel/fork.c b/kernel/fork.c index 29c18c14812..1dd89451fae 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -976,7 +976,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (atomic_read(&p->real_cred->user->processes) >= p->signal->rlim[RLIMIT_NPROC].rlim_cur) { if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && - p->real_cred->user != current->nsproxy->user_ns->root_user) + p->real_cred->user != INIT_USER) goto bad_fork_free; } @@ -1334,6 +1334,20 @@ long do_fork(unsigned long clone_flags, int trace = 0; long nr; + /* + * Do some preliminary argument and permissions checking before we + * actually start allocating stuff + */ + if (clone_flags & CLONE_NEWUSER) { + if (clone_flags & CLONE_THREAD) + return -EINVAL; + /* hopefully this check will go away when userns support is + * complete + */ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + } + /* * We hope to recycle these flags after 2.6.26 */ @@ -1581,8 +1595,7 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) err = -EINVAL; if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| - CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWUSER| - CLONE_NEWNET)) + CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET)) goto bad_unshare_out; /* diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 1d3ef29a258..63598dca2d0 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -80,12 +80,6 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, goto out_pid; } - new_nsp->user_ns = copy_user_ns(flags, tsk->nsproxy->user_ns); - if (IS_ERR(new_nsp->user_ns)) { - err = PTR_ERR(new_nsp->user_ns); - goto out_user; - } - new_nsp->net_ns = copy_net_ns(flags, tsk->nsproxy->net_ns); if (IS_ERR(new_nsp->net_ns)) { err = PTR_ERR(new_nsp->net_ns); @@ -95,9 +89,6 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, return new_nsp; out_net: - if (new_nsp->user_ns) - put_user_ns(new_nsp->user_ns); -out_user: if (new_nsp->pid_ns) put_pid_ns(new_nsp->pid_ns); out_pid: @@ -130,7 +121,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) get_nsproxy(old_ns); if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | - CLONE_NEWUSER | CLONE_NEWPID | CLONE_NEWNET))) + CLONE_NEWPID | CLONE_NEWNET))) return 0; if (!capable(CAP_SYS_ADMIN)) { @@ -173,8 +164,6 @@ void free_nsproxy(struct nsproxy *ns) put_ipc_ns(ns->ipc_ns); if (ns->pid_ns) put_pid_ns(ns->pid_ns); - if (ns->user_ns) - put_user_ns(ns->user_ns); put_net(ns->net_ns); kmem_cache_free(nsproxy_cachep, ns); } @@ -189,7 +178,7 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags, int err = 0; if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | - CLONE_NEWUSER | CLONE_NEWNET))) + CLONE_NEWNET))) return 0; if (!capable(CAP_SYS_ADMIN)) diff --git a/kernel/sys.c b/kernel/sys.c index ab735040468..ebe65c2c987 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -565,13 +565,13 @@ static int set_user(struct cred *new) { struct user_struct *new_user; - new_user = alloc_uid(current->nsproxy->user_ns, new->uid); + new_user = alloc_uid(current_user_ns(), new->uid); if (!new_user) return -EAGAIN; if (atomic_read(&new_user->processes) >= current->signal->rlim[RLIMIT_NPROC].rlim_cur && - new_user != current->nsproxy->user_ns->root_user) { + new_user != INIT_USER) { free_uid(new_user); return -EAGAIN; } diff --git a/kernel/user.c b/kernel/user.c index d476307dd4b..c0ef3a46443 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -20,9 +20,9 @@ struct user_namespace init_user_ns = { .kref = { - .refcount = ATOMIC_INIT(2), + .refcount = ATOMIC_INIT(1), }, - .root_user = &root_user, + .creator = &root_user, }; EXPORT_SYMBOL_GPL(init_user_ns); @@ -48,12 +48,14 @@ static struct kmem_cache *uid_cachep; */ static DEFINE_SPINLOCK(uidhash_lock); +/* root_user.__count is 2, 1 for init task cred, 1 for init_user_ns->creator */ struct user_struct root_user = { - .__count = ATOMIC_INIT(1), + .__count = ATOMIC_INIT(2), .processes = ATOMIC_INIT(1), .files = ATOMIC_INIT(0), .sigpending = ATOMIC_INIT(0), .locked_shm = 0, + .user_ns = &init_user_ns, #ifdef CONFIG_USER_SCHED .tg = &init_task_group, #endif @@ -314,12 +316,13 @@ done: * IRQ state (as stored in flags) is restored and uidhash_lock released * upon function exit. */ -static inline void free_user(struct user_struct *up, unsigned long flags) +static void free_user(struct user_struct *up, unsigned long flags) { /* restore back the count */ atomic_inc(&up->__count); spin_unlock_irqrestore(&uidhash_lock, flags); + put_user_ns(up->user_ns); INIT_WORK(&up->work, remove_user_sysfs_dir); schedule_work(&up->work); } @@ -335,13 +338,14 @@ static inline void uids_mutex_unlock(void) { } * IRQ state (as stored in flags) is restored and uidhash_lock released * upon function exit. */ -static inline void free_user(struct user_struct *up, unsigned long flags) +static void free_user(struct user_struct *up, unsigned long flags) { uid_hash_remove(up); spin_unlock_irqrestore(&uidhash_lock, flags); sched_destroy_user(up); key_put(up->uid_keyring); key_put(up->session_keyring); + put_user_ns(up->user_ns); kmem_cache_free(uid_cachep, up); } @@ -357,7 +361,7 @@ struct user_struct *find_user(uid_t uid) { struct user_struct *ret; unsigned long flags; - struct user_namespace *ns = current->nsproxy->user_ns; + struct user_namespace *ns = current_user()->user_ns; spin_lock_irqsave(&uidhash_lock, flags); ret = uid_hash_find(uid, uidhashentry(ns, uid)); @@ -404,6 +408,8 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid) if (sched_create_user(new) < 0) goto out_free_user; + new->user_ns = get_user_ns(ns); + if (uids_user_create(new)) goto out_destoy_sched; @@ -427,7 +433,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid) up = new; } spin_unlock_irq(&uidhash_lock); - } uids_mutex_unlock(); @@ -436,6 +441,7 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid) out_destoy_sched: sched_destroy_user(new); + put_user_ns(new->user_ns); out_free_user: kmem_cache_free(uid_cachep, new); out_unlock: @@ -443,33 +449,6 @@ out_unlock: return NULL; } -#ifdef CONFIG_USER_NS -void release_uids(struct user_namespace *ns) -{ - int i; - unsigned long flags; - struct hlist_head *head; - struct hlist_node *nd; - - spin_lock_irqsave(&uidhash_lock, flags); - /* - * collapse the chains so that the user_struct-s will - * be still alive, but not in hashes. subsequent free_uid() - * will free them. - */ - for (i = 0; i < UIDHASH_SZ; i++) { - head = ns->uidhash_table + i; - while (!hlist_empty(head)) { - nd = head->first; - hlist_del_init(nd); - } - } - spin_unlock_irqrestore(&uidhash_lock, flags); - - free_uid(ns->root_user); -} -#endif - static int __init uid_cache_init(void) { int n; diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 0d9c51d6733..79084311ee5 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -9,70 +9,55 @@ #include #include #include +#include /* - * Clone a new ns copying an original user ns, setting refcount to 1 - * @old_ns: namespace to clone - * Return NULL on error (failure to kmalloc), new ns otherwise + * Create a new user namespace, deriving the creator from the user in the + * passed credentials, and replacing that user with the new root user for the + * new namespace. + * + * This is called by copy_creds(), which will finish setting the target task's + * credentials. */ -static struct user_namespace *clone_user_ns(struct user_namespace *old_ns) +int create_user_ns(struct cred *new) { struct user_namespace *ns; - struct user_struct *new_user; - struct cred *new; + struct user_struct *root_user; int n; ns = kmalloc(sizeof(struct user_namespace), GFP_KERNEL); if (!ns) - return ERR_PTR(-ENOMEM); + return -ENOMEM; kref_init(&ns->kref); for (n = 0; n < UIDHASH_SZ; ++n) INIT_HLIST_HEAD(ns->uidhash_table + n); - /* Insert new root user. */ - ns->root_user = alloc_uid(ns, 0); - if (!ns->root_user) { + /* Alloc new root user. */ + root_user = alloc_uid(ns, 0); + if (!root_user) { kfree(ns); - return ERR_PTR(-ENOMEM); + return -ENOMEM; } - /* Reset current->user with a new one */ - new_user = alloc_uid(ns, current_uid()); - if (!new_user) { - free_uid(ns->root_user); - kfree(ns); - return ERR_PTR(-ENOMEM); - } - - /* Install the new user */ - new = prepare_creds(); - if (!new) { - free_uid(new_user); - free_uid(ns->root_user); - kfree(ns); - } - free_uid(new->user); - new->user = new_user; - commit_creds(new); - return ns; -} - -struct user_namespace * copy_user_ns(int flags, struct user_namespace *old_ns) -{ - struct user_namespace *new_ns; - - BUG_ON(!old_ns); - get_user_ns(old_ns); - - if (!(flags & CLONE_NEWUSER)) - return old_ns; + /* set the new root user in the credentials under preparation */ + ns->creator = new->user; + new->user = root_user; + new->uid = new->euid = new->suid = new->fsuid = 0; + new->gid = new->egid = new->sgid = new->fsgid = 0; + put_group_info(new->group_info); + new->group_info = get_group_info(&init_groups); +#ifdef CONFIG_KEYS + key_put(new->request_key_auth); + new->request_key_auth = NULL; +#endif + /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */ - new_ns = clone_user_ns(old_ns); + /* alloc_uid() incremented the userns refcount. Just set it to 1 */ + kref_set(&ns->kref, 1); - put_user_ns(old_ns); - return new_ns; + return 0; } void free_user_ns(struct kref *kref) @@ -80,7 +65,7 @@ void free_user_ns(struct kref *kref) struct user_namespace *ns; ns = container_of(kref, struct user_namespace, kref); - release_uids(ns); + free_uid(ns->creator); kfree(ns); } EXPORT_SYMBOL(free_user_ns); -- cgit v1.2.3-70-g09d2 From 6ded6ab9be4f6164aef1c527407c1b94f0929799 Mon Sep 17 00:00:00 2001 From: Serge Hallyn Date: Mon, 24 Nov 2008 16:24:10 -0500 Subject: User namespaces: use the current_user_ns() macro Fix up the last current_user()->user_ns instance to use current_user_ns(). Signed-off-by: Serge E. Hallyn --- kernel/user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/user.c b/kernel/user.c index c0ef3a46443..97202cb29ad 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -361,7 +361,7 @@ struct user_struct *find_user(uid_t uid) { struct user_struct *ret; unsigned long flags; - struct user_namespace *ns = current_user()->user_ns; + struct user_namespace *ns = current_user_ns(); spin_lock_irqsave(&uidhash_lock, flags); ret = uid_hash_find(uid, uidhashentry(ns, uid)); -- cgit v1.2.3-70-g09d2 From 7807fafa52b990abb321f1212416c71e64523ecb Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 25 Nov 2008 08:44:24 +0100 Subject: lockdep: fix unused function warning in kernel/lockdep.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Impact: fix build warning this warning: kernel/lockdep.c:584: warning: ‘print_lock_dependencies’ defined but not used triggers because print_lock_dependencies() is only used if both CONFIG_TRACE_IRQFLAGS and CONFIG_PROVE_LOCKING are enabled. But adding #ifdefs is not an option here - it would spread out to 4-5 other helper functions and uglify the file. So mark this function as __used - it's static and the compiler can eliminate it just fine. Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index a4285830323..c137953420e 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -580,7 +580,8 @@ static void print_lock_class_header(struct lock_class *class, int depth) /* * printk all lock dependencies starting at : */ -static void print_lock_dependencies(struct lock_class *class, int depth) +static void __used +print_lock_dependencies(struct lock_class *class, int depth) { struct lock_list *entry; -- cgit v1.2.3-70-g09d2 From ca109491f612aab5c8152207631c0444f63da97f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 25 Nov 2008 12:43:51 +0100 Subject: hrtimer: removing all ur callback modes Impact: cleanup, move all hrtimer processing into hardirq context This is an attempt at removing some of the hrtimer complexity by reducing the number of callback modes to 1. This means that all hrtimer callback functions will be ran from HARD-irq context. I went through all the 30 odd hrtimer callback functions in the kernel and saw only one that I'm not quite sure of, which is the one in net/can/bcm.c - hence I'm CC-ing the folks responsible for that code. Furthermore, the hrtimer core now calls callbacks directly with IRQs disabled in case you try to enqueue an expired timer. If this timer is a periodic timer (which should use hrtimer_forward() to advance its time) then it might be possible to end up in an inf. recursive loop due to the fact that hrtimer_forward() doesn't round up to the next timer granularity, and therefore keeps on calling the callback - obviously this needs a fix. Aside from that, this seems to compile and actually boot on my dual core test box - although I'm sure there are some bugs in, me not hitting any makes me certain :-) Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- drivers/input/touchscreen/ads7846.c | 4 +- include/linux/hrtimer.h | 34 +---- include/linux/interrupt.h | 3 - kernel/hrtimer.c | 280 ++++-------------------------------- kernel/sched.c | 2 - kernel/time/ntp.c | 4 +- kernel/time/tick-sched.c | 1 - kernel/trace/trace_sysprof.c | 1 - sound/drivers/pcsp/pcsp.c | 1 - 9 files changed, 37 insertions(+), 293 deletions(-) (limited to 'kernel') diff --git a/drivers/input/touchscreen/ads7846.c b/drivers/input/touchscreen/ads7846.c index b9b7fc6ff1e..e1ece89fe92 100644 --- a/drivers/input/touchscreen/ads7846.c +++ b/drivers/input/touchscreen/ads7846.c @@ -697,7 +697,7 @@ static enum hrtimer_restart ads7846_timer(struct hrtimer *handle) struct ads7846 *ts = container_of(handle, struct ads7846, timer); int status = 0; - spin_lock_irq(&ts->lock); + spin_lock(&ts->lock); if (unlikely(!get_pendown_state(ts) || device_suspended(&ts->spi->dev))) { @@ -728,7 +728,7 @@ static enum hrtimer_restart ads7846_timer(struct hrtimer *handle) dev_err(&ts->spi->dev, "spi_async --> %d\n", status); } - spin_unlock_irq(&ts->lock); + spin_unlock(&ts->lock); return HRTIMER_NORESTART; } diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 3eba43878dc..bd37078c2d7 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -42,26 +42,6 @@ enum hrtimer_restart { HRTIMER_RESTART, /* Timer must be restarted */ }; -/* - * hrtimer callback modes: - * - * HRTIMER_CB_SOFTIRQ: Callback must run in softirq context - * HRTIMER_CB_IRQSAFE_PERCPU: Callback must run in hardirq context - * Special mode for tick emulation and - * scheduler timer. Such timers are per - * cpu and not allowed to be migrated on - * cpu unplug. - * HRTIMER_CB_IRQSAFE_UNLOCKED: Callback should run in hardirq context - * with timer->base lock unlocked - * used for timers which call wakeup to - * avoid lock order problems with rq->lock - */ -enum hrtimer_cb_mode { - HRTIMER_CB_SOFTIRQ, - HRTIMER_CB_IRQSAFE_PERCPU, - HRTIMER_CB_IRQSAFE_UNLOCKED, -}; - /* * Values to track state of the timer * @@ -70,7 +50,6 @@ enum hrtimer_cb_mode { * 0x00 inactive * 0x01 enqueued into rbtree * 0x02 callback function running - * 0x04 callback pending (high resolution mode) * * Special cases: * 0x03 callback function running and enqueued @@ -92,8 +71,7 @@ enum hrtimer_cb_mode { #define HRTIMER_STATE_INACTIVE 0x00 #define HRTIMER_STATE_ENQUEUED 0x01 #define HRTIMER_STATE_CALLBACK 0x02 -#define HRTIMER_STATE_PENDING 0x04 -#define HRTIMER_STATE_MIGRATE 0x08 +#define HRTIMER_STATE_MIGRATE 0x04 /** * struct hrtimer - the basic hrtimer structure @@ -109,8 +87,6 @@ enum hrtimer_cb_mode { * @function: timer expiry callback function * @base: pointer to the timer base (per cpu and per clock) * @state: state information (See bit values above) - * @cb_mode: high resolution timer feature to select the callback execution - * mode * @cb_entry: list head to enqueue an expired timer into the callback list * @start_site: timer statistics field to store the site where the timer * was started @@ -129,7 +105,6 @@ struct hrtimer { struct hrtimer_clock_base *base; unsigned long state; struct list_head cb_entry; - enum hrtimer_cb_mode cb_mode; #ifdef CONFIG_TIMER_STATS int start_pid; void *start_site; @@ -188,15 +163,11 @@ struct hrtimer_clock_base { * @check_clocks: Indictator, when set evaluate time source and clock * event devices whether high resolution mode can be * activated. - * @cb_pending: Expired timers are moved from the rbtree to this - * list in the timer interrupt. The list is processed - * in the softirq. * @nr_events: Total number of timer interrupt events */ struct hrtimer_cpu_base { spinlock_t lock; struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES]; - struct list_head cb_pending; #ifdef CONFIG_HIGH_RES_TIMERS ktime_t expires_next; int hres_active; @@ -404,8 +375,7 @@ static inline int hrtimer_active(const struct hrtimer *timer) */ static inline int hrtimer_is_queued(struct hrtimer *timer) { - return timer->state & - (HRTIMER_STATE_ENQUEUED | HRTIMER_STATE_PENDING); + return timer->state & HRTIMER_STATE_ENQUEUED; } /* diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index f58a0cf8929..d6210a97a8c 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -251,9 +251,6 @@ enum BLOCK_SOFTIRQ, TASKLET_SOFTIRQ, SCHED_SOFTIRQ, -#ifdef CONFIG_HIGH_RES_TIMERS - HRTIMER_SOFTIRQ, -#endif RCU_SOFTIRQ, /* Preferable RCU should always be the last softirq */ NR_SOFTIRQS diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 47e63349d1b..efd6f41e1c1 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -442,22 +442,6 @@ static inline void debug_hrtimer_activate(struct hrtimer *timer) { } static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { } #endif -/* - * Check, whether the timer is on the callback pending list - */ -static inline int hrtimer_cb_pending(const struct hrtimer *timer) -{ - return timer->state & HRTIMER_STATE_PENDING; -} - -/* - * Remove a timer from the callback pending list - */ -static inline void hrtimer_remove_cb_pending(struct hrtimer *timer) -{ - list_del_init(&timer->cb_entry); -} - /* High resolution timer related functions */ #ifdef CONFIG_HIGH_RES_TIMERS @@ -651,6 +635,8 @@ static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { } +static void __run_hrtimer(struct hrtimer *timer); + /* * When High resolution timers are active, try to reprogram. Note, that in case * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry @@ -661,31 +647,14 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, struct hrtimer_clock_base *base) { if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) { - - /* Timer is expired, act upon the callback mode */ - switch(timer->cb_mode) { - case HRTIMER_CB_IRQSAFE_PERCPU: - case HRTIMER_CB_IRQSAFE_UNLOCKED: - /* - * This is solely for the sched tick emulation with - * dynamic tick support to ensure that we do not - * restart the tick right on the edge and end up with - * the tick timer in the softirq ! The calling site - * takes care of this. Also used for hrtimer sleeper ! - */ - debug_hrtimer_deactivate(timer); - return 1; - case HRTIMER_CB_SOFTIRQ: - /* - * Move everything else into the softirq pending list ! - */ - list_add_tail(&timer->cb_entry, - &base->cpu_base->cb_pending); - timer->state = HRTIMER_STATE_PENDING; - return 1; - default: - BUG(); - } + /* + * XXX: recursion check? + * hrtimer_forward() should round up with timer granularity + * so that we never get into inf recursion here, + * it doesn't do that though + */ + __run_hrtimer(timer); + return 1; } return 0; } @@ -724,11 +693,6 @@ static int hrtimer_switch_to_hres(void) return 1; } -static inline void hrtimer_raise_softirq(void) -{ - raise_softirq(HRTIMER_SOFTIRQ); -} - #else static inline int hrtimer_hres_active(void) { return 0; } @@ -747,7 +711,6 @@ static inline int hrtimer_reprogram(struct hrtimer *timer, { return 0; } -static inline void hrtimer_raise_softirq(void) { } #endif /* CONFIG_HIGH_RES_TIMERS */ @@ -890,10 +853,7 @@ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, unsigned long newstate, int reprogram) { - /* High res. callback list. NOP for !HIGHRES */ - if (hrtimer_cb_pending(timer)) - hrtimer_remove_cb_pending(timer); - else { + if (timer->state & HRTIMER_STATE_ENQUEUED) { /* * Remove the timer from the rbtree and replace the * first entry pointer if necessary. @@ -953,7 +913,7 @@ hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_n { struct hrtimer_clock_base *base, *new_base; unsigned long flags; - int ret, raise; + int ret; base = lock_hrtimer_base(timer, &flags); @@ -988,26 +948,8 @@ hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_n enqueue_hrtimer(timer, new_base, new_base->cpu_base == &__get_cpu_var(hrtimer_bases)); - /* - * The timer may be expired and moved to the cb_pending - * list. We can not raise the softirq with base lock held due - * to a possible deadlock with runqueue lock. - */ - raise = timer->state == HRTIMER_STATE_PENDING; - - /* - * We use preempt_disable to prevent this task from migrating after - * setting up the softirq and raising it. Otherwise, if me migrate - * we will raise the softirq on the wrong CPU. - */ - preempt_disable(); - unlock_hrtimer_base(timer, &flags); - if (raise) - hrtimer_raise_softirq(); - preempt_enable(); - return ret; } EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); @@ -1192,75 +1134,6 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) } EXPORT_SYMBOL_GPL(hrtimer_get_res); -static void run_hrtimer_pending(struct hrtimer_cpu_base *cpu_base) -{ - spin_lock_irq(&cpu_base->lock); - - while (!list_empty(&cpu_base->cb_pending)) { - enum hrtimer_restart (*fn)(struct hrtimer *); - struct hrtimer *timer; - int restart; - int emulate_hardirq_ctx = 0; - - timer = list_entry(cpu_base->cb_pending.next, - struct hrtimer, cb_entry); - - debug_hrtimer_deactivate(timer); - timer_stats_account_hrtimer(timer); - - fn = timer->function; - /* - * A timer might have been added to the cb_pending list - * when it was migrated during a cpu-offline operation. - * Emulate hardirq context for such timers. - */ - if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU || - timer->cb_mode == HRTIMER_CB_IRQSAFE_UNLOCKED) - emulate_hardirq_ctx = 1; - - __remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0); - spin_unlock_irq(&cpu_base->lock); - - if (unlikely(emulate_hardirq_ctx)) { - local_irq_disable(); - restart = fn(timer); - local_irq_enable(); - } else - restart = fn(timer); - - spin_lock_irq(&cpu_base->lock); - - timer->state &= ~HRTIMER_STATE_CALLBACK; - if (restart == HRTIMER_RESTART) { - BUG_ON(hrtimer_active(timer)); - /* - * Enqueue the timer, allow reprogramming of the event - * device - */ - enqueue_hrtimer(timer, timer->base, 1); - } else if (hrtimer_active(timer)) { - /* - * If the timer was rearmed on another CPU, reprogram - * the event device. - */ - struct hrtimer_clock_base *base = timer->base; - - if (base->first == &timer->node && - hrtimer_reprogram(timer, base)) { - /* - * Timer is expired. Thus move it from tree to - * pending list again. - */ - __remove_hrtimer(timer, base, - HRTIMER_STATE_PENDING, 0); - list_add_tail(&timer->cb_entry, - &base->cpu_base->cb_pending); - } - } - } - spin_unlock_irq(&cpu_base->lock); -} - static void __run_hrtimer(struct hrtimer *timer) { struct hrtimer_clock_base *base = timer->base; @@ -1268,25 +1141,21 @@ static void __run_hrtimer(struct hrtimer *timer) enum hrtimer_restart (*fn)(struct hrtimer *); int restart; + WARN_ON(!irqs_disabled()); + debug_hrtimer_deactivate(timer); __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0); timer_stats_account_hrtimer(timer); - fn = timer->function; - if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU || - timer->cb_mode == HRTIMER_CB_IRQSAFE_UNLOCKED) { - /* - * Used for scheduler timers, avoid lock inversion with - * rq->lock and tasklist_lock. - * - * These timers are required to deal with enqueue expiry - * themselves and are not allowed to migrate. - */ - spin_unlock(&cpu_base->lock); - restart = fn(timer); - spin_lock(&cpu_base->lock); - } else - restart = fn(timer); + + /* + * Because we run timers from hardirq context, there is no chance + * they get migrated to another cpu, therefore its safe to unlock + * the timer base. + */ + spin_unlock(&cpu_base->lock); + restart = fn(timer); + spin_lock(&cpu_base->lock); /* * Note: We clear the CALLBACK bit after enqueue_hrtimer to avoid @@ -1311,7 +1180,7 @@ void hrtimer_interrupt(struct clock_event_device *dev) struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); struct hrtimer_clock_base *base; ktime_t expires_next, now; - int i, raise = 0; + int i; BUG_ON(!cpu_base->hres_active); cpu_base->nr_events++; @@ -1360,16 +1229,6 @@ void hrtimer_interrupt(struct clock_event_device *dev) break; } - /* Move softirq callbacks to the pending list */ - if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) { - __remove_hrtimer(timer, base, - HRTIMER_STATE_PENDING, 0); - list_add_tail(&timer->cb_entry, - &base->cpu_base->cb_pending); - raise = 1; - continue; - } - __run_hrtimer(timer); } spin_unlock(&cpu_base->lock); @@ -1383,10 +1242,6 @@ void hrtimer_interrupt(struct clock_event_device *dev) if (tick_program_event(expires_next, 0)) goto retry; } - - /* Raise softirq ? */ - if (raise) - raise_softirq(HRTIMER_SOFTIRQ); } /** @@ -1413,11 +1268,6 @@ void hrtimer_peek_ahead_timers(void) local_irq_restore(flags); } -static void run_hrtimer_softirq(struct softirq_action *h) -{ - run_hrtimer_pending(&__get_cpu_var(hrtimer_bases)); -} - #endif /* CONFIG_HIGH_RES_TIMERS */ /* @@ -1429,8 +1279,6 @@ static void run_hrtimer_softirq(struct softirq_action *h) */ void hrtimer_run_pending(void) { - struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); - if (hrtimer_hres_active()) return; @@ -1444,8 +1292,6 @@ void hrtimer_run_pending(void) */ if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) hrtimer_switch_to_hres(); - - run_hrtimer_pending(cpu_base); } /* @@ -1482,14 +1328,6 @@ void hrtimer_run_queues(void) hrtimer_get_expires_tv64(timer)) break; - if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) { - __remove_hrtimer(timer, base, - HRTIMER_STATE_PENDING, 0); - list_add_tail(&timer->cb_entry, - &base->cpu_base->cb_pending); - continue; - } - __run_hrtimer(timer); } spin_unlock(&cpu_base->lock); @@ -1516,9 +1354,6 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task) { sl->timer.function = hrtimer_wakeup; sl->task = task; -#ifdef CONFIG_HIGH_RES_TIMERS - sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED; -#endif } static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode) @@ -1655,36 +1490,22 @@ static void __cpuinit init_hrtimers_cpu(int cpu) for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) cpu_base->clock_base[i].cpu_base = cpu_base; - INIT_LIST_HEAD(&cpu_base->cb_pending); hrtimer_init_hres(cpu_base); } #ifdef CONFIG_HOTPLUG_CPU -static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base, - struct hrtimer_clock_base *new_base, int dcpu) +static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, + struct hrtimer_clock_base *new_base, int dcpu) { struct hrtimer *timer; struct rb_node *node; - int raise = 0; while ((node = rb_first(&old_base->active))) { timer = rb_entry(node, struct hrtimer, node); BUG_ON(hrtimer_callback_running(timer)); debug_hrtimer_deactivate(timer); - /* - * Should not happen. Per CPU timers should be - * canceled _before_ the migration code is called - */ - if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU) { - __remove_hrtimer(timer, old_base, - HRTIMER_STATE_INACTIVE, 0); - WARN(1, "hrtimer (%p %p)active but cpu %d dead\n", - timer, timer->function, dcpu); - continue; - } - /* * Mark it as STATE_MIGRATE not INACTIVE otherwise the * timer could be seen as !active and just vanish away @@ -1708,48 +1529,19 @@ static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base, * otherwise we end up with a stale timer. */ if (timer->state == HRTIMER_STATE_MIGRATE) { - timer->state = HRTIMER_STATE_PENDING; - list_add_tail(&timer->cb_entry, - &new_base->cpu_base->cb_pending); - raise = 1; + /* XXX: running on offline cpu */ + __run_hrtimer(timer); } #endif /* Clear the migration state bit */ timer->state &= ~HRTIMER_STATE_MIGRATE; } - return raise; } -#ifdef CONFIG_HIGH_RES_TIMERS -static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base, - struct hrtimer_cpu_base *new_base) -{ - struct hrtimer *timer; - int raise = 0; - - while (!list_empty(&old_base->cb_pending)) { - timer = list_entry(old_base->cb_pending.next, - struct hrtimer, cb_entry); - - __remove_hrtimer(timer, timer->base, HRTIMER_STATE_PENDING, 0); - timer->base = &new_base->clock_base[timer->base->index]; - list_add_tail(&timer->cb_entry, &new_base->cb_pending); - raise = 1; - } - return raise; -} -#else -static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base, - struct hrtimer_cpu_base *new_base) -{ - return 0; -} -#endif - static void migrate_hrtimers(int cpu) { struct hrtimer_cpu_base *old_base, *new_base; - int i, raise = 0; + int i; BUG_ON(cpu_online(cpu)); old_base = &per_cpu(hrtimer_bases, cpu); @@ -1764,20 +1556,13 @@ static void migrate_hrtimers(int cpu) spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { - if (migrate_hrtimer_list(&old_base->clock_base[i], - &new_base->clock_base[i], cpu)) - raise = 1; + migrate_hrtimer_list(&old_base->clock_base[i], + &new_base->clock_base[i], cpu); } - if (migrate_hrtimer_pending(old_base, new_base)) - raise = 1; - spin_unlock(&old_base->lock); spin_unlock_irq(&new_base->lock); put_cpu_var(hrtimer_bases); - - if (raise) - hrtimer_raise_softirq(); } #endif /* CONFIG_HOTPLUG_CPU */ @@ -1817,9 +1602,6 @@ void __init hrtimers_init(void) hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, (void *)(long)smp_processor_id()); register_cpu_notifier(&hrtimers_nb); -#ifdef CONFIG_HIGH_RES_TIMERS - open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq); -#endif } /** diff --git a/kernel/sched.c b/kernel/sched.c index 9b1e79371c2..5ac5e953616 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -203,7 +203,6 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); rt_b->rt_period_timer.function = sched_rt_period_timer; - rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED; } static inline int rt_bandwidth_enabled(void) @@ -1139,7 +1138,6 @@ static void init_rq_hrtick(struct rq *rq) hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); rq->hrtick_timer.function = hrtick; - rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU; } #else /* CONFIG_SCHED_HRTICK */ static inline void hrtick_clear(struct rq *rq) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 8ff15e5d486..f5f793d9241 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -131,7 +131,7 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer) { enum hrtimer_restart res = HRTIMER_NORESTART; - write_seqlock_irq(&xtime_lock); + write_seqlock(&xtime_lock); switch (time_state) { case TIME_OK: @@ -164,7 +164,7 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer) } update_vsyscall(&xtime, clock); - write_sequnlock_irq(&xtime_lock); + write_sequnlock(&xtime_lock); return res; } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 342fc9ccab4..502a81e2639 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -681,7 +681,6 @@ void tick_setup_sched_timer(void) */ hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); ts->sched_timer.function = tick_sched_timer; - ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU; /* Get the next period (per cpu) */ hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index 9587d3bcba5..ae542e2e38d 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -202,7 +202,6 @@ static void start_stack_timer(int cpu) hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hrtimer->function = stack_trace_timer_fn; - hrtimer->cb_mode = HRTIMER_CB_IRQSAFE_PERCPU; hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL); } diff --git a/sound/drivers/pcsp/pcsp.c b/sound/drivers/pcsp/pcsp.c index 1899cf0685b..8e52b2a8a13 100644 --- a/sound/drivers/pcsp/pcsp.c +++ b/sound/drivers/pcsp/pcsp.c @@ -96,7 +96,6 @@ static int __devinit snd_card_pcsp_probe(int devnum, struct device *dev) return -EINVAL; hrtimer_init(&pcsp_chip.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - pcsp_chip.timer.cb_mode = HRTIMER_CB_SOFTIRQ; pcsp_chip.timer.function = pcsp_do_timer; card = snd_card_new(index, id, THIS_MODULE, 0); -- cgit v1.2.3-70-g09d2 From 8b752e3ef6e3f5cde87afc649dd51d92b1e549c1 Mon Sep 17 00:00:00 2001 From: Liming Wang Date: Fri, 28 Nov 2008 09:52:40 +0800 Subject: softirq: remove useless function __local_bh_enable Impact: remove unused code __local_bh_enable has been replaced with _local_bh_enable. As comments says "it always nests inside local_bh_enable() sections" has not been valid now. Also there is no reason to use __local_bh_enable anywhere, so we can remove this useless function. Signed-off-by: Liming Wang Signed-off-by: Ingo Molnar --- include/linux/bottom_half.h | 1 - kernel/softirq.c | 14 -------------- 2 files changed, 15 deletions(-) (limited to 'kernel') diff --git a/include/linux/bottom_half.h b/include/linux/bottom_half.h index 777dbf695d4..27b1bcffe40 100644 --- a/include/linux/bottom_half.h +++ b/include/linux/bottom_half.h @@ -2,7 +2,6 @@ #define _LINUX_BH_H extern void local_bh_disable(void); -extern void __local_bh_enable(void); extern void _local_bh_enable(void); extern void local_bh_enable(void); extern void local_bh_enable_ip(unsigned long ip); diff --git a/kernel/softirq.c b/kernel/softirq.c index e7c69a720d6..8d9934b4162 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -102,20 +102,6 @@ void local_bh_disable(void) EXPORT_SYMBOL(local_bh_disable); -void __local_bh_enable(void) -{ - WARN_ON_ONCE(in_irq()); - - /* - * softirqs should never be enabled by __local_bh_enable(), - * it always nests inside local_bh_enable() sections: - */ - WARN_ON_ONCE(softirq_count() == SOFTIRQ_OFFSET); - - sub_preempt_count(SOFTIRQ_OFFSET); -} -EXPORT_SYMBOL_GPL(__local_bh_enable); - /* * Special-case - softirqs can safely be enabled in * cond_resched_softirq(), or by __do_softirq(), -- cgit v1.2.3-70-g09d2 From 74853dba2f7a1a9b0905a09abcf65c1f3ce0b14f Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Fri, 28 Nov 2008 08:35:25 -0800 Subject: debug warnings: consolidate warn_slowpath and warn_on_slowpath Impact: cleanup, code reduction warn_slowpath is a superset of warn_on_slowpath; just have warn_on_slowpath call warn_slowpath with a NULL 3rd argument. Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar --- kernel/panic.c | 32 ++++++++++++-------------------- 1 file changed, 12 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 6513aac8e99..6bbf7b905c7 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -320,23 +320,6 @@ void oops_exit(void) } #ifdef WANT_WARN_ON_SLOWPATH -void warn_on_slowpath(const char *file, int line) -{ - char function[KSYM_SYMBOL_LEN]; - unsigned long caller = (unsigned long) __builtin_return_address(0); - sprint_symbol(function, caller); - - printk(KERN_WARNING "------------[ cut here ]------------\n"); - printk(KERN_WARNING "WARNING: at %s:%d %s()\n", file, - line, function); - print_modules(); - dump_stack(); - print_oops_end_marker(); - add_taint(TAINT_WARN); -} -EXPORT_SYMBOL(warn_on_slowpath); - - void warn_slowpath(const char *file, int line, const char *fmt, ...) { va_list args; @@ -347,9 +330,12 @@ void warn_slowpath(const char *file, int line, const char *fmt, ...) printk(KERN_WARNING "------------[ cut here ]------------\n"); printk(KERN_WARNING "WARNING: at %s:%d %s()\n", file, line, function); - va_start(args, fmt); - vprintk(fmt, args); - va_end(args); + + if (fmt) { + va_start(args, fmt); + vprintk(fmt, args); + va_end(args); + } print_modules(); dump_stack(); @@ -357,6 +343,12 @@ void warn_slowpath(const char *file, int line, const char *fmt, ...) add_taint(TAINT_WARN); } EXPORT_SYMBOL(warn_slowpath); + +void warn_on_slowpath(const char *file, int line) +{ + warn_slowpath(file, line, NULL); +} +EXPORT_SYMBOL(warn_on_slowpath); #endif #ifdef CONFIG_CC_STACKPROTECTOR -- cgit v1.2.3-70-g09d2 From bd89bb29a01503c5cffa367eccb0b356f910cb8d Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Fri, 28 Nov 2008 08:36:09 -0800 Subject: debug warnings: print the DMI board info name in a WARN/WARN_ON Impact: extend WARN_ON() output with DMI_PRODUCT_NAME It's very useful for many low level WARN_ON's to find out which motherboard has the broken BIOS etc... this patch adds a printk to the WARN_ON code for this. On architectures without DMI, gcc should optimize the code out. Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar --- kernel/panic.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 6bbf7b905c7..73d365199c3 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -21,6 +21,7 @@ #include #include #include +#include int panic_on_oops; static unsigned long tainted_mask; @@ -325,11 +326,16 @@ void warn_slowpath(const char *file, int line, const char *fmt, ...) va_list args; char function[KSYM_SYMBOL_LEN]; unsigned long caller = (unsigned long)__builtin_return_address(0); + const char *board; + sprint_symbol(function, caller); printk(KERN_WARNING "------------[ cut here ]------------\n"); printk(KERN_WARNING "WARNING: at %s:%d %s()\n", file, line, function); + board = dmi_get_system_info(DMI_PRODUCT_NAME); + if (board) + printk(KERN_WARNING "Hardware name: %s\n", board); if (fmt) { va_start(args, fmt); -- cgit v1.2.3-70-g09d2 From ec5679e513305f1411753e5f5489935bd638af23 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 28 Nov 2008 17:56:14 +0100 Subject: debug warnings: eliminate warn_on_slowpath() Impact: cleanup, eliminate code now that warn_on_slowpath() uses warn_slowpath(...,NULL), we can eliminate warn_on_slowpath() altogether and use warn_slowpath(). Signed-off-by: Ingo Molnar --- include/asm-generic/bug.h | 7 +++---- kernel/panic.c | 6 ------ 2 files changed, 3 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index 12c07c1866b..b8ba6941f58 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -33,15 +33,14 @@ struct bug_entry { #ifndef __WARN #ifndef __ASSEMBLY__ -extern void warn_on_slowpath(const char *file, const int line); extern void warn_slowpath(const char *file, const int line, const char *fmt, ...) __attribute__((format(printf, 3, 4))); #define WANT_WARN_ON_SLOWPATH #endif -#define __WARN() warn_on_slowpath(__FILE__, __LINE__) -#define __WARN_printf(arg...) warn_slowpath(__FILE__, __LINE__, arg) +#define __WARN() warn_slowpath(__FILE__, __LINE__, NULL) +#define __WARN_printf(arg...) warn_slowpath(__FILE__, __LINE__, arg) #else -#define __WARN_printf(arg...) do { printk(arg); __WARN(); } while (0) +#define __WARN_printf(arg...) do { printk(arg); __WARN(); } while (0) #endif #ifndef WARN_ON diff --git a/kernel/panic.c b/kernel/panic.c index 73d365199c3..50349a41fba 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -349,12 +349,6 @@ void warn_slowpath(const char *file, int line, const char *fmt, ...) add_taint(TAINT_WARN); } EXPORT_SYMBOL(warn_slowpath); - -void warn_on_slowpath(const char *file, int line) -{ - warn_slowpath(file, line, NULL); -} -EXPORT_SYMBOL(warn_on_slowpath); #endif #ifdef CONFIG_CC_STACKPROTECTOR -- cgit v1.2.3-70-g09d2 From f2b662da8d6bd44673537f3f64220afefdca369f Mon Sep 17 00:00:00 2001 From: David Brownell Date: Mon, 1 Dec 2008 14:31:38 -0800 Subject: genirq: record IRQ_LEVEL in irq_desc[] Impact: fix __irq_set_trigger() for IRQ_LEVEL When recording the irq trigger type, let's also make sure that IRQ_LEVEL gets set correctly. Signed-off-by: David Brownell Signed-off-by: Andrew Morton Acked-by: Benjamin Herrenschmidt Signed-off-by: Ingo Molnar --- kernel/irq/chip.c | 1 + kernel/irq/manage.c | 15 +++++++++------ 2 files changed, 10 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 10b5092e9bf..7765d4c80c3 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -124,6 +124,7 @@ int set_irq_type(unsigned int irq, unsigned int type) return -ENODEV; } + type &= IRQ_TYPE_SENSE_MASK; if (type == IRQ_TYPE_NONE) return 0; diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 801addda3c4..46953a06f4a 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -370,16 +370,18 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, return 0; } - ret = chip->set_type(irq, flags & IRQF_TRIGGER_MASK); + /* caller masked out all except trigger mode flags */ + ret = chip->set_type(irq, flags); if (ret) pr_err("setting trigger mode %d for irq %u failed (%pF)\n", - (int)(flags & IRQF_TRIGGER_MASK), - irq, chip->set_type); + (int)flags, irq, chip->set_type); else { + if (flags & (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH)) + flags |= IRQ_LEVEL; /* note that IRQF_TRIGGER_MASK == IRQ_TYPE_SENSE_MASK */ - desc->status &= ~IRQ_TYPE_SENSE_MASK; - desc->status |= flags & IRQ_TYPE_SENSE_MASK; + desc->status &= ~(IRQ_LEVEL | IRQ_TYPE_SENSE_MASK); + desc->status |= flags; } return ret; @@ -459,7 +461,8 @@ __setup_irq(unsigned int irq, struct irq_desc * desc, struct irqaction *new) /* Setup the type (level, edge polarity) if configured: */ if (new->flags & IRQF_TRIGGER_MASK) { - ret = __irq_set_trigger(desc, irq, new->flags); + ret = __irq_set_trigger(desc, irq, + new->flags & IRQF_TRIGGER_MASK); if (ret) { spin_unlock_irqrestore(&desc->lock, flags); -- cgit v1.2.3-70-g09d2 From 470c66239ef0336429b35345f3f615d47341e13b Mon Sep 17 00:00:00 2001 From: David Brownell Date: Mon, 1 Dec 2008 14:31:37 -0800 Subject: genirq: warn when IRQF_DISABLED may be ignored Impact: emit new warning We periodically waste time tracking down problems from the genirq framework not respecting IRQF_DISABLED for some shared IRQ cases. Linus views this as "will not fix", but we're still left with the bugs caused by this misbehavior. This patch adds a nag message in request_irq(), so that drivers can fix their IRQ handlers to avoid this problem. Note that developers will never see the relevant bugs when they run with LOCKDEP, so it's no wonder these bugs are hard to find. (That also means LOCKDEP is overlooking some IRQ-related bugs involving IRQ handlers that don't set IRQF_DISABLED...) Signed-off-by: David Brownell Signed-off-by: Andrew Morton Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/irq/manage.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index c498a1b8c62..7fd891c3a33 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -635,6 +635,18 @@ int request_irq(unsigned int irq, irq_handler_t handler, struct irq_desc *desc; int retval; + /* + * handle_IRQ_event() always ignores IRQF_DISABLED except for + * the _first_ irqaction (sigh). That can cause oopsing, but + * the behavior is classified as "will not fix" so we need to + * start nudging drivers away from using that idiom. + */ + if ((irqflags & (IRQF_SHARED|IRQF_DISABLED)) + == (IRQF_SHARED|IRQF_DISABLED)) + pr_warning("IRQ %d/%s: IRQF_DISABLED is not " + "guaranteed on shared IRQs\n", + irq, devname); + #ifdef CONFIG_LOCKDEP /* * Lockdep wants atomic interrupt handlers: -- cgit v1.2.3-70-g09d2 From 00ef9f7348dfd2fc223ec42aceb30836e86b367f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 4 Dec 2008 09:00:17 +0100 Subject: lockdep: change a held lock's class Impact: introduce new lockdep API Allow to change a held lock's class. Basically the same as the existing code to change a subclass therefore reuse all that. The XFS code will be able to use this to annotate their inode locking. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/lockdep.h | 12 ++++++++++-- kernel/lockdep.c | 24 +++++++++--------------- 2 files changed, 19 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 8956daf64ab..37a0361f468 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -314,8 +314,15 @@ extern void lock_acquire(struct lockdep_map *lock, unsigned int subclass, extern void lock_release(struct lockdep_map *lock, int nested, unsigned long ip); -extern void lock_set_subclass(struct lockdep_map *lock, unsigned int subclass, - unsigned long ip); +extern void lock_set_class(struct lockdep_map *lock, const char *name, + struct lock_class_key *key, unsigned int subclass, + unsigned long ip); + +static inline void lock_set_subclass(struct lockdep_map *lock, + unsigned int subclass, unsigned long ip) +{ + lock_set_class(lock, lock->name, lock->key, subclass, ip); +} # define INIT_LOCKDEP .lockdep_recursion = 0, @@ -333,6 +340,7 @@ static inline void lockdep_on(void) # define lock_acquire(l, s, t, r, c, n, i) do { } while (0) # define lock_release(l, n, i) do { } while (0) +# define lock_set_class(l, n, k, s, i) do { } while (0) # define lock_set_subclass(l, s, i) do { } while (0) # define lockdep_init() do { } while (0) # define lockdep_info() do { } while (0) diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 90f3fb64dbc..4fa6eeb4e8a 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -291,14 +291,12 @@ void lockdep_off(void) { current->lockdep_recursion++; } - EXPORT_SYMBOL(lockdep_off); void lockdep_on(void) { current->lockdep_recursion--; } - EXPORT_SYMBOL(lockdep_on); /* @@ -2513,7 +2511,6 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name, if (subclass) register_lock_class(lock, subclass, 1); } - EXPORT_SYMBOL_GPL(lockdep_init_map); /* @@ -2694,8 +2691,9 @@ static int check_unlock(struct task_struct *curr, struct lockdep_map *lock, } static int -__lock_set_subclass(struct lockdep_map *lock, - unsigned int subclass, unsigned long ip) +__lock_set_class(struct lockdep_map *lock, const char *name, + struct lock_class_key *key, unsigned int subclass, + unsigned long ip) { struct task_struct *curr = current; struct held_lock *hlock, *prev_hlock; @@ -2722,6 +2720,7 @@ __lock_set_subclass(struct lockdep_map *lock, return print_unlock_inbalance_bug(curr, lock, ip); found_it: + lockdep_init_map(lock, name, key, 0); class = register_lock_class(lock, subclass, 0); hlock->class_idx = class - lock_classes + 1; @@ -2906,9 +2905,9 @@ static void check_flags(unsigned long flags) #endif } -void -lock_set_subclass(struct lockdep_map *lock, - unsigned int subclass, unsigned long ip) +void lock_set_class(struct lockdep_map *lock, const char *name, + struct lock_class_key *key, unsigned int subclass, + unsigned long ip) { unsigned long flags; @@ -2918,13 +2917,12 @@ lock_set_subclass(struct lockdep_map *lock, raw_local_irq_save(flags); current->lockdep_recursion = 1; check_flags(flags); - if (__lock_set_subclass(lock, subclass, ip)) + if (__lock_set_class(lock, name, key, subclass, ip)) check_chain_key(current); current->lockdep_recursion = 0; raw_local_irq_restore(flags); } - -EXPORT_SYMBOL_GPL(lock_set_subclass); +EXPORT_SYMBOL_GPL(lock_set_class); /* * We are not always called with irqs disabled - do that here, @@ -2948,7 +2946,6 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, current->lockdep_recursion = 0; raw_local_irq_restore(flags); } - EXPORT_SYMBOL_GPL(lock_acquire); void lock_release(struct lockdep_map *lock, int nested, @@ -2966,7 +2963,6 @@ void lock_release(struct lockdep_map *lock, int nested, current->lockdep_recursion = 0; raw_local_irq_restore(flags); } - EXPORT_SYMBOL_GPL(lock_release); #ifdef CONFIG_LOCK_STAT @@ -3451,7 +3447,6 @@ retry: if (unlock) read_unlock(&tasklist_lock); } - EXPORT_SYMBOL_GPL(debug_show_all_locks); /* @@ -3472,7 +3467,6 @@ void debug_show_held_locks(struct task_struct *task) { __debug_show_held_locks(task); } - EXPORT_SYMBOL_GPL(debug_show_held_locks); void lockdep_sys_exit(void) -- cgit v1.2.3-70-g09d2 From 37810659ea7d9572c5ac284ade272f806ef8f788 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 4 Dec 2008 11:17:10 +0100 Subject: hrtimer: removing all ur callback modes, fix hotplug Impact: fix hrtimer locking (reported by lockdep) in the CPU hotplug case This addition fixes the hotplug locking issue on my machine Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/hrtimer.c | 65 ++++++++++++++++++++++++++++++++------------------------ 1 file changed, 37 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index efd6f41e1c1..b09c7a27631 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1496,7 +1496,7 @@ static void __cpuinit init_hrtimers_cpu(int cpu) #ifdef CONFIG_HOTPLUG_CPU static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, - struct hrtimer_clock_base *new_base, int dcpu) + struct hrtimer_clock_base *new_base) { struct hrtimer *timer; struct rb_node *node; @@ -1514,40 +1514,34 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0); timer->base = new_base; /* - * Enqueue the timer. Allow reprogramming of the event device + * Enqueue the timers on the new cpu, but do not reprogram + * the timer as that would enable a deadlock between + * hrtimer_enqueue_reprogramm() running the timer and us still + * holding a nested base lock. + * + * Instead we tickle the hrtimer interrupt after the migration + * is done, which will run all expired timers and re-programm + * the timer device. */ - enqueue_hrtimer(timer, new_base, 1); + enqueue_hrtimer(timer, new_base, 0); -#ifdef CONFIG_HIGH_RES_TIMERS - /* - * Happens with high res enabled when the timer was - * already expired and the callback mode is - * HRTIMER_CB_IRQSAFE_UNLOCKED (hrtimer_sleeper). The - * enqueue code does not move them to the soft irq - * pending list for performance/latency reasons, but - * in the migration state, we need to do that - * otherwise we end up with a stale timer. - */ - if (timer->state == HRTIMER_STATE_MIGRATE) { - /* XXX: running on offline cpu */ - __run_hrtimer(timer); - } -#endif /* Clear the migration state bit */ timer->state &= ~HRTIMER_STATE_MIGRATE; } } -static void migrate_hrtimers(int cpu) +static int migrate_hrtimers(int scpu) { struct hrtimer_cpu_base *old_base, *new_base; - int i; + int dcpu, i; - BUG_ON(cpu_online(cpu)); - old_base = &per_cpu(hrtimer_bases, cpu); + BUG_ON(cpu_online(scpu)); + old_base = &per_cpu(hrtimer_bases, scpu); new_base = &get_cpu_var(hrtimer_bases); - tick_cancel_sched_timer(cpu); + dcpu = smp_processor_id(); + + tick_cancel_sched_timer(scpu); /* * The caller is globally serialized and nobody else * takes two locks at once, deadlock is not possible. @@ -1557,32 +1551,47 @@ static void migrate_hrtimers(int cpu) for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { migrate_hrtimer_list(&old_base->clock_base[i], - &new_base->clock_base[i], cpu); + &new_base->clock_base[i]); } spin_unlock(&old_base->lock); spin_unlock_irq(&new_base->lock); put_cpu_var(hrtimer_bases); + + return dcpu; +} + +static void tickle_timers(void *arg) +{ + hrtimer_peek_ahead_timers(); } + #endif /* CONFIG_HOTPLUG_CPU */ static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { - unsigned int cpu = (long)hcpu; + int dcpu = -1, scpu = (long)hcpu; switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: - init_hrtimers_cpu(cpu); + init_hrtimers_cpu(scpu); break; #ifdef CONFIG_HOTPLUG_CPU case CPU_DEAD: case CPU_DEAD_FROZEN: - clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &cpu); - migrate_hrtimers(cpu); + clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &scpu); + dcpu = migrate_hrtimers(scpu); + break; + + case CPU_POST_DEAD: + if (dcpu == -1) + break; + + smp_call_function_single(dcpu, tickle_timers, NULL, 0); break; #endif -- cgit v1.2.3-70-g09d2 From 0871420fad5844cb63cfcf85508c17bd9b15c08f Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sun, 16 Nov 2008 23:49:24 -0800 Subject: sparc64: Add tsb-ratio sysctl. Add a sysctl to tweak the RSS limit used to decide when to grow the TSB for an address space. In order to avoid expensive divides and multiplies only simply positive and negative powers of two are supported. The function computed takes the number of TSB translations that will fit at one time in the TSB of a given size, and either adds or subtracts a percentage of entries. This final value is the RSS limit. See tsb_size_to_rss_limit(). Signed-off-by: David S. Miller --- arch/sparc/mm/tsb.c | 21 ++++++++++++++------- kernel/sysctl.c | 14 ++++++++++++++ 2 files changed, 28 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/arch/sparc/mm/tsb.c b/arch/sparc/mm/tsb.c index f0282fad632..36a0813f951 100644 --- a/arch/sparc/mm/tsb.c +++ b/arch/sparc/mm/tsb.c @@ -265,6 +265,18 @@ void __init pgtable_cache_init(void) } } +int sysctl_tsb_ratio = -2; + +static unsigned long tsb_size_to_rss_limit(unsigned long new_size) +{ + unsigned long num_ents = (new_size / sizeof(struct tsb)); + + if (sysctl_tsb_ratio < 0) + return num_ents - (num_ents >> -sysctl_tsb_ratio); + else + return num_ents + (num_ents >> sysctl_tsb_ratio); +} + /* When the RSS of an address space exceeds tsb_rss_limit for a TSB, * do_sparc64_fault() invokes this routine to try and grow it. * @@ -295,19 +307,14 @@ void tsb_grow(struct mm_struct *mm, unsigned long tsb_index, unsigned long rss) new_cache_index = 0; for (new_size = 8192; new_size < max_tsb_size; new_size <<= 1UL) { - unsigned long n_entries = new_size / sizeof(struct tsb); - - n_entries = (n_entries * 3) / 4; - if (n_entries > rss) + new_rss_limit = tsb_size_to_rss_limit(new_size); + if (new_rss_limit > rss) break; - new_cache_index++; } if (new_size == max_tsb_size) new_rss_limit = ~0UL; - else - new_rss_limit = ((new_size / sizeof(struct tsb)) * 3) / 4; retry_tsb_alloc: gfp_flags = GFP_KERNEL; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 3d56fe7570d..4e2ac0aec9b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -121,6 +121,10 @@ extern int sg_big_buff; #include #endif +#ifdef CONFIG_SPARC64 +extern int sysctl_tsb_ratio; +#endif + #ifdef __hppa__ extern int pwrsw_enabled; extern int unaligned_enabled; @@ -451,6 +455,16 @@ static struct ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif +#ifdef CONFIG_SPARC64 + { + .ctl_name = CTL_UNNUMBERED, + .procname = "tsb-ratio", + .data = &sysctl_tsb_ratio, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif #ifdef __hppa__ { .ctl_name = KERN_HPPA_PWRSW, -- cgit v1.2.3-70-g09d2 From c37bbb0fdcc01610fd55604eb6927210a1d20044 Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Wed, 3 Dec 2008 13:17:06 -0600 Subject: user namespaces: let user_ns be cloned with fairsched (These two patches are in the next-unacked branch of git://git.kernel.org/pub/scm/linux/kernel/git/sergeh/userns-2.6. If they get some ACKs, then I hope to feed this into security-next. After these two, I think we're ready to tackle userns+capabilities) Fairsched creates a per-uid directory under /sys/kernel/uids/. So when you clone(CLONE_NEWUSER), it tries to create /sys/kernel/uids/0, which already exists, and you get back -ENOMEM. This was supposed to be fixed by sysfs tagging, but that was postponed (ok, rejected until sysfs locking is fixed). So, just as with network namespaces, we just don't create those directories for user namespaces other than the init. Signed-off-by: Serge E. Hallyn Signed-off-by: James Morris --- kernel/user.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/user.c b/kernel/user.c index 97202cb29ad..6c924bc48c0 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -246,6 +246,8 @@ static int uids_user_create(struct user_struct *up) int error; memset(kobj, 0, sizeof(struct kobject)); + if (up->user_ns != &init_user_ns) + return 0; kobj->kset = uids_kset; error = kobject_init_and_add(kobj, &uids_ktype, NULL, "%d", up->uid); if (error) { @@ -281,6 +283,8 @@ static void remove_user_sysfs_dir(struct work_struct *w) unsigned long flags; int remove_user = 0; + if (up->user_ns != &init_user_ns) + return; /* Make uid_hash_remove() + sysfs_remove_file() + kobject_del() * atomic. */ -- cgit v1.2.3-70-g09d2 From 7657d90497f98426af17f0ac633a9b335bb7a8fb Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Wed, 3 Dec 2008 13:17:33 -0600 Subject: user namespaces: require cap_set{ug}id for CLONE_NEWUSER While ideally CLONE_NEWUSER will eventually require no privilege, the required permission checks are currently not there. As a result, CLONE_NEWUSER has the same effect as a setuid(0)+setgroups(1,"0"). While we already require CAP_SYS_ADMIN, requiring CAP_SETUID and CAP_SETGID seems appropriate. Signed-off-by: Serge E. Hallyn Acked-by: "Eric W. Biederman" Signed-off-by: James Morris --- kernel/fork.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 1dd89451fae..e3a85b33107 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1344,7 +1344,8 @@ long do_fork(unsigned long clone_flags, /* hopefully this check will go away when userns support is * complete */ - if (!capable(CAP_SYS_ADMIN)) + if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SETUID) || + !capable(CAP_SETGID)) return -EPERM; } -- cgit v1.2.3-70-g09d2 From a0a99b227da57f81319dd239bc4de811b0f530ec Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 8 Dec 2008 17:13:02 +0100 Subject: hrtimer: removing all ur callback modes, fix > Ingo, this addition fixes the hotplug issue on my machine And because we're all human... Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/hrtimer.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index b09c7a27631..b741f850426 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1571,7 +1571,7 @@ static void tickle_timers(void *arg) static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { - int dcpu = -1, scpu = (long)hcpu; + int dcpu, scpu = (long)hcpu; switch (action) { @@ -1585,12 +1585,6 @@ static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self, case CPU_DEAD_FROZEN: clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &scpu); dcpu = migrate_hrtimers(scpu); - break; - - case CPU_POST_DEAD: - if (dcpu == -1) - break; - smp_call_function_single(dcpu, tickle_timers, NULL, 0); break; #endif -- cgit v1.2.3-70-g09d2 From 94d6a5f7341ebaff53d4e41cc81fab37f0d9fbed Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Mon, 8 Dec 2008 15:52:21 -0600 Subject: user namespaces: document CFS behavior Documented the currently bogus state of support for CFS user groups with user namespaces. In particular, all users in a user namespace should be children of the user which created the user namespace. This is yet to be implemented. Signed-off-by: Serge E. Hallyn Acked-by: Dhaval Giani Signed-off-by: Serge E. Hallyn Signed-off-by: James Morris --- Documentation/scheduler/sched-design-CFS.txt | 21 +++++++++++++++++++++ kernel/user.c | 8 +++++++- 2 files changed, 28 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/Documentation/scheduler/sched-design-CFS.txt b/Documentation/scheduler/sched-design-CFS.txt index eb471c7a905..8398ca4ff4e 100644 --- a/Documentation/scheduler/sched-design-CFS.txt +++ b/Documentation/scheduler/sched-design-CFS.txt @@ -273,3 +273,24 @@ task groups and modify their CPU share using the "cgroups" pseudo filesystem. # #Launch gmplayer (or your favourite movie player) # echo > multimedia/tasks + +8. Implementation note: user namespaces + +User namespaces are intended to be hierarchical. But they are currently +only partially implemented. Each of those has ramifications for CFS. + +First, since user namespaces are hierarchical, the /sys/kernel/uids +presentation is inadequate. Eventually we will likely want to use sysfs +tagging to provide private views of /sys/kernel/uids within each user +namespace. + +Second, the hierarchical nature is intended to support completely +unprivileged use of user namespaces. So if using user groups, then +we want the users in a user namespace to be children of the user +who created it. + +That is currently unimplemented. So instead, every user in a new +user namespace will receive 1024 shares just like any user in the +initial user namespace. Note that at the moment creation of a new +user namespace requires each of CAP_SYS_ADMIN, CAP_SETUID, and +CAP_SETGID. diff --git a/kernel/user.c b/kernel/user.c index 6c924bc48c0..6608a3d8ca6 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -239,7 +239,13 @@ static struct kobj_type uids_ktype = { .release = uids_release, }; -/* create /sys/kernel/uids//cpu_share file for this user */ +/* + * Create /sys/kernel/uids//cpu_share file for this user + * We do not create this file for users in a user namespace (until + * sysfs tagging is implemented). + * + * See Documentation/scheduler/sched-design-CFS.txt for ramifications. + */ static int uids_user_create(struct user_struct *up) { struct kobject *kobj = &up->kobj; -- cgit v1.2.3-70-g09d2 From 68814b58c52077da9561b544089fe532a0842f71 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Mon, 24 Nov 2008 12:24:12 +0100 Subject: ring_buffer: update description for ring_buffer_alloc() Trivial patch. Cc: Steven Rostedt Signed-off-by: Robert Richter --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 668bbb5ef2b..c8996d239e4 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -381,7 +381,7 @@ extern int ring_buffer_page_too_big(void); /** * ring_buffer_alloc - allocate a new ring_buffer - * @size: the size in bytes that is needed. + * @size: the size in bytes per cpu that is needed. * @flags: attributes to set for the ring buffer. * * Currently the only flag that is available is the RB_FL_OVERWRITE -- cgit v1.2.3-70-g09d2 From e2ac8ef576e45d9db7264abc51383e68d26067bb Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 12 Nov 2008 12:59:32 +0100 Subject: ftrace: remove unused function arg in trace_iterator_increment() This removes the unused cpu function parameter. Cc: Steven Rostedt Signed-off-by: Robert Richter --- kernel/trace/trace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d86e3252f30..a96b335fe75 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -914,7 +914,7 @@ enum trace_file_type { TRACE_FILE_LAT_FMT = 1, }; -static void trace_iterator_increment(struct trace_iterator *iter, int cpu) +static void trace_iterator_increment(struct trace_iterator *iter) { /* Don't allow ftrace to trace into the ring buffers */ ftrace_disable_cpu(); @@ -993,7 +993,7 @@ static void *find_next_entry_inc(struct trace_iterator *iter) iter->ent = __find_next_entry(iter, &iter->cpu, &iter->ts); if (iter->ent) - trace_iterator_increment(iter, iter->cpu); + trace_iterator_increment(iter); return iter->ent ? iter : NULL; } -- cgit v1.2.3-70-g09d2 From c4f50183f90fb1fd99aa5941f01b90cd1b882d2e Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 11 Dec 2008 16:49:22 +0100 Subject: ring_buffer: adding EXPORT_SYMBOLs I added EXPORT_SYMBOL_GPLs for all functions part of the API (ring_buffer.h). This is required since oprofile is using the ring buffer and the compilation as modules would fail otherwise. Signed-off-by: Robert Richter Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index c8996d239e4..30d57dd01a8 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -31,6 +31,7 @@ void tracing_on(void) { ring_buffers_off = 0; } +EXPORT_SYMBOL_GPL(tracing_on); /** * tracing_off - turn off all tracing buffers @@ -44,6 +45,7 @@ void tracing_off(void) { ring_buffers_off = 1; } +EXPORT_SYMBOL_GPL(tracing_off); /* Up this if you want to test the TIME_EXTENTS and normalization */ #define DEBUG_SHIFT 0 @@ -60,12 +62,14 @@ u64 ring_buffer_time_stamp(int cpu) return time; } +EXPORT_SYMBOL_GPL(ring_buffer_time_stamp); void ring_buffer_normalize_time_stamp(int cpu, u64 *ts) { /* Just stupid testing the normalize function and deltas */ *ts >>= DEBUG_SHIFT; } +EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp); #define RB_EVNT_HDR_SIZE (sizeof(struct ring_buffer_event)) #define RB_ALIGNMENT_SHIFT 2 @@ -115,6 +119,7 @@ unsigned ring_buffer_event_length(struct ring_buffer_event *event) { return rb_event_length(event); } +EXPORT_SYMBOL_GPL(ring_buffer_event_length); /* inline for ring buffer fast paths */ static inline void * @@ -136,6 +141,7 @@ void *ring_buffer_event_data(struct ring_buffer_event *event) { return rb_event_data(event); } +EXPORT_SYMBOL_GPL(ring_buffer_event_data); #define for_each_buffer_cpu(buffer, cpu) \ for_each_cpu_mask(cpu, buffer->cpumask) @@ -444,6 +450,7 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags) kfree(buffer); return NULL; } +EXPORT_SYMBOL_GPL(ring_buffer_alloc); /** * ring_buffer_free - free a ring buffer. @@ -459,6 +466,7 @@ ring_buffer_free(struct ring_buffer *buffer) kfree(buffer); } +EXPORT_SYMBOL_GPL(ring_buffer_free); static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer); @@ -620,6 +628,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) mutex_unlock(&buffer->mutex); return -ENOMEM; } +EXPORT_SYMBOL_GPL(ring_buffer_resize); static inline int rb_null_event(struct ring_buffer_event *event) { @@ -1220,6 +1229,7 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, preempt_enable_notrace(); return NULL; } +EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve); static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event) @@ -1269,6 +1279,7 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer, return 0; } +EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); /** * ring_buffer_write - write data to the buffer without reserving @@ -1334,6 +1345,7 @@ int ring_buffer_write(struct ring_buffer *buffer, return ret; } +EXPORT_SYMBOL_GPL(ring_buffer_write); static inline int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) { @@ -1360,6 +1372,7 @@ void ring_buffer_record_disable(struct ring_buffer *buffer) { atomic_inc(&buffer->record_disabled); } +EXPORT_SYMBOL_GPL(ring_buffer_record_disable); /** * ring_buffer_record_enable - enable writes to the buffer @@ -1372,6 +1385,7 @@ void ring_buffer_record_enable(struct ring_buffer *buffer) { atomic_dec(&buffer->record_disabled); } +EXPORT_SYMBOL_GPL(ring_buffer_record_enable); /** * ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer @@ -1393,6 +1407,7 @@ void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu) cpu_buffer = buffer->buffers[cpu]; atomic_inc(&cpu_buffer->record_disabled); } +EXPORT_SYMBOL_GPL(ring_buffer_record_disable_cpu); /** * ring_buffer_record_enable_cpu - enable writes to the buffer @@ -1412,6 +1427,7 @@ void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu) cpu_buffer = buffer->buffers[cpu]; atomic_dec(&cpu_buffer->record_disabled); } +EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu); /** * ring_buffer_entries_cpu - get the number of entries in a cpu buffer @@ -1428,6 +1444,7 @@ unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu) cpu_buffer = buffer->buffers[cpu]; return cpu_buffer->entries; } +EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu); /** * ring_buffer_overrun_cpu - get the number of overruns in a cpu_buffer @@ -1444,6 +1461,7 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu) cpu_buffer = buffer->buffers[cpu]; return cpu_buffer->overrun; } +EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu); /** * ring_buffer_entries - get the number of entries in a buffer @@ -1466,6 +1484,7 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer) return entries; } +EXPORT_SYMBOL_GPL(ring_buffer_entries); /** * ring_buffer_overrun_cpu - get the number of overruns in buffer @@ -1488,6 +1507,7 @@ unsigned long ring_buffer_overruns(struct ring_buffer *buffer) return overruns; } +EXPORT_SYMBOL_GPL(ring_buffer_overruns); /** * ring_buffer_iter_reset - reset an iterator @@ -1513,6 +1533,7 @@ void ring_buffer_iter_reset(struct ring_buffer_iter *iter) else iter->read_stamp = iter->head_page->time_stamp; } +EXPORT_SYMBOL_GPL(ring_buffer_iter_reset); /** * ring_buffer_iter_empty - check if an iterator has no more to read @@ -1527,6 +1548,7 @@ int ring_buffer_iter_empty(struct ring_buffer_iter *iter) return iter->head_page == cpu_buffer->commit_page && iter->head == rb_commit_index(cpu_buffer); } +EXPORT_SYMBOL_GPL(ring_buffer_iter_empty); static void rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer, @@ -1797,6 +1819,7 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) return NULL; } +EXPORT_SYMBOL_GPL(ring_buffer_peek); /** * ring_buffer_iter_peek - peek at the next event to be read @@ -1867,6 +1890,7 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts) return NULL; } +EXPORT_SYMBOL_GPL(ring_buffer_iter_peek); /** * ring_buffer_consume - return an event and consume it @@ -1894,6 +1918,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) return event; } +EXPORT_SYMBOL_GPL(ring_buffer_consume); /** * ring_buffer_read_start - start a non consuming read of the buffer @@ -1934,6 +1959,7 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu) return iter; } +EXPORT_SYMBOL_GPL(ring_buffer_read_start); /** * ring_buffer_finish - finish reading the iterator of the buffer @@ -1950,6 +1976,7 @@ ring_buffer_read_finish(struct ring_buffer_iter *iter) atomic_dec(&cpu_buffer->record_disabled); kfree(iter); } +EXPORT_SYMBOL_GPL(ring_buffer_read_finish); /** * ring_buffer_read - read the next item in the ring buffer by the iterator @@ -1971,6 +1998,7 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts) return event; } +EXPORT_SYMBOL_GPL(ring_buffer_read); /** * ring_buffer_size - return the size of the ring buffer (in bytes) @@ -1980,6 +2008,7 @@ unsigned long ring_buffer_size(struct ring_buffer *buffer) { return BUF_PAGE_SIZE * buffer->pages; } +EXPORT_SYMBOL_GPL(ring_buffer_size); static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) @@ -2022,6 +2051,7 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) spin_unlock_irqrestore(&cpu_buffer->lock, flags); } +EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); /** * ring_buffer_reset - reset a ring buffer @@ -2034,6 +2064,7 @@ void ring_buffer_reset(struct ring_buffer *buffer) for_each_buffer_cpu(buffer, cpu) ring_buffer_reset_cpu(buffer, cpu); } +EXPORT_SYMBOL_GPL(ring_buffer_reset); /** * rind_buffer_empty - is the ring buffer empty? @@ -2052,6 +2083,7 @@ int ring_buffer_empty(struct ring_buffer *buffer) } return 1; } +EXPORT_SYMBOL_GPL(ring_buffer_empty); /** * ring_buffer_empty_cpu - is a cpu buffer of a ring buffer empty? @@ -2068,6 +2100,7 @@ int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu) cpu_buffer = buffer->buffers[cpu]; return rb_per_cpu_empty(cpu_buffer); } +EXPORT_SYMBOL_GPL(ring_buffer_empty_cpu); /** * ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers @@ -2117,6 +2150,7 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, return 0; } +EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); static ssize_t rb_simple_read(struct file *filp, char __user *ubuf, -- cgit v1.2.3-70-g09d2 From fa116ea35ec7f40e890972324409e99eed008d56 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 11 Dec 2008 17:04:11 +0100 Subject: nohz: no softirq pending warnings for offline cpus Impact: remove false positive warning After a cpu was taken down during cpu hotplug (read: disabled for interrupts) it still might have pending softirqs. However take_cpu_down makes sure that the idle task will run next instead of ksoftirqd on the taken down cpu. The idle task will call tick_nohz_stop_sched_tick which might warn about pending softirqs just before the cpu kills itself completely. However the pending softirqs on the dead cpu aren't a problem because they will be moved to an online cpu during CPU_DEAD handling. So make sure we warn only for online cpus. Signed-off-by: Heiko Carstens Signed-off-by: Ingo Molnar --- kernel/time/tick-sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 342fc9ccab4..dc17ffcf191 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -247,7 +247,7 @@ void tick_nohz_stop_sched_tick(int inidle) if (need_resched()) goto end; - if (unlikely(local_softirq_pending())) { + if (unlikely(local_softirq_pending() && cpu_online(cpu))) { static int ratelimit; if (ratelimit < 10) { -- cgit v1.2.3-70-g09d2 From a93751cab71d63126687551823ed3e70cd85854a Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Thu, 11 Dec 2008 13:53:26 +0100 Subject: x86, bts, ftrace: adapt the hw-branch-tracer to the ds.c interface Impact: restructure code, cleanup Remove BTS bits from the hw-branch-tracer (renamed from bts-tracer) and use the ds interface. Signed-off-by: Markus Metzger Signed-off-by: Ingo Molnar --- kernel/trace/Kconfig | 4 +- kernel/trace/Makefile | 2 +- kernel/trace/trace.c | 2 +- kernel/trace/trace.h | 14 +- kernel/trace/trace_bts.c | 276 --------------------------------------- kernel/trace/trace_hw_branches.c | 205 +++++++++++++++++++++++++++++ 6 files changed, 215 insertions(+), 288 deletions(-) delete mode 100644 kernel/trace/trace_bts.c create mode 100644 kernel/trace/trace_hw_branches.c (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index bde6f03512d..d8bae6f4219 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -251,9 +251,9 @@ config STACK_TRACER Say N if unsure. -config BTS_TRACER +config HW_BRANCH_TRACER depends on HAVE_HW_BRANCH_TRACER - bool "Trace branches" + bool "Trace hw branches" select TRACING help This tracer records all branches on the system in a circular diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 62dc561b667..349d5a93653 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -31,7 +31,7 @@ obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o obj-$(CONFIG_BOOT_TRACER) += trace_boot.o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o -obj-$(CONFIG_BTS_TRACER) += trace_bts.o +obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o obj-$(CONFIG_POWER_TRACER) += trace_power.o libftrace-y := ftrace.o diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8ebe0070c47..639344a4d3a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2425,7 +2425,7 @@ __tracing_open(struct inode *inode, struct file *file, int *ret) /* Notify the tracer early; before we stop tracing. */ if (iter->trace && iter->trace->open) - iter->trace->open(iter); + iter->trace->open(iter); /* Annotate start of buffers if we had overruns */ if (ring_buffer_overruns(iter->tr->buffer)) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 5ac697065a4..f07c246dd73 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -28,7 +28,7 @@ enum trace_type { TRACE_GRAPH_RET, TRACE_GRAPH_ENT, TRACE_USER_STACK, - TRACE_BTS, + TRACE_HW_BRANCHES, TRACE_POWER, __TRACE_LAST_TYPE @@ -159,10 +159,10 @@ struct trace_branch { char correct; }; -struct bts_entry { +struct hw_branch_entry { struct trace_entry ent; - unsigned long from; - unsigned long to; + u64 from; + u64 to; }; struct trace_power { @@ -278,7 +278,7 @@ extern void __ftrace_bad_type(void); TRACE_GRAPH_ENT); \ IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \ TRACE_GRAPH_RET); \ - IF_ASSIGN(var, ent, struct bts_entry, TRACE_BTS);\ + IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\ IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER); \ __ftrace_bad_type(); \ } while (0) @@ -414,9 +414,7 @@ void trace_function(struct trace_array *tr, void trace_graph_return(struct ftrace_graph_ret *trace); int trace_graph_entry(struct ftrace_graph_ent *trace); -void trace_bts(struct trace_array *tr, - unsigned long from, - unsigned long to); +void trace_hw_branch(struct trace_array *tr, u64 from, u64 to); void tracing_start_cmdline_record(void); void tracing_stop_cmdline_record(void); diff --git a/kernel/trace/trace_bts.c b/kernel/trace/trace_bts.c deleted file mode 100644 index 23b76e4690e..00000000000 --- a/kernel/trace/trace_bts.c +++ /dev/null @@ -1,276 +0,0 @@ -/* - * BTS tracer - * - * Copyright (C) 2008 Markus Metzger - * - */ - -#include -#include -#include -#include -#include - -#include - -#include "trace.h" - - -#define SIZEOF_BTS (1 << 13) - -static DEFINE_PER_CPU(struct bts_tracer *, tracer); -static DEFINE_PER_CPU(unsigned char[SIZEOF_BTS], buffer); - -#define this_tracer per_cpu(tracer, smp_processor_id()) -#define this_buffer per_cpu(buffer, smp_processor_id()) - - -/* - * Information to interpret a BTS record. - * This will go into an in-kernel BTS interface. - */ -static unsigned char sizeof_field; -static unsigned long debugctl_mask; - -#define sizeof_bts (3 * sizeof_field) - -static void bts_trace_cpuinit(struct cpuinfo_x86 *c) -{ - switch (c->x86) { - case 0x6: - switch (c->x86_model) { - case 0x0 ... 0xC: - break; - case 0xD: - case 0xE: /* Pentium M */ - sizeof_field = sizeof(long); - debugctl_mask = (1<<6)|(1<<7); - break; - default: - sizeof_field = 8; - debugctl_mask = (1<<6)|(1<<7); - break; - } - break; - case 0xF: - switch (c->x86_model) { - case 0x0: - case 0x1: - case 0x2: /* Netburst */ - sizeof_field = sizeof(long); - debugctl_mask = (1<<2)|(1<<3); - break; - default: - /* sorry, don't know about them */ - break; - } - break; - default: - /* sorry, don't know about them */ - break; - } -} - -static inline void bts_enable(void) -{ - unsigned long debugctl; - - rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); - wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl | debugctl_mask); -} - -static inline void bts_disable(void) -{ - unsigned long debugctl; - - rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); - wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl & ~debugctl_mask); -} - -static void bts_trace_reset(struct trace_array *tr) -{ - int cpu; - - tr->time_start = ftrace_now(tr->cpu); - - for_each_online_cpu(cpu) - tracing_reset(tr, cpu); -} - -static void bts_trace_start_cpu(void *arg) -{ - this_tracer = - ds_request_bts(/* task = */ NULL, this_buffer, SIZEOF_BTS, - /* ovfl = */ NULL, /* th = */ (size_t)-1); - if (IS_ERR(this_tracer)) { - this_tracer = NULL; - return; - } - - bts_enable(); -} - -static void bts_trace_start(struct trace_array *tr) -{ - int cpu; - - bts_trace_reset(tr); - - for_each_cpu_mask(cpu, cpu_possible_map) - smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1); -} - -static void bts_trace_stop_cpu(void *arg) -{ - if (this_tracer) { - bts_disable(); - - ds_release_bts(this_tracer); - this_tracer = NULL; - } -} - -static void bts_trace_stop(struct trace_array *tr) -{ - int cpu; - - for_each_cpu_mask(cpu, cpu_possible_map) - smp_call_function_single(cpu, bts_trace_stop_cpu, NULL, 1); -} - -static int bts_trace_init(struct trace_array *tr) -{ - bts_trace_cpuinit(&boot_cpu_data); - bts_trace_reset(tr); - bts_trace_start(tr); - - return 0; -} - -static void bts_trace_print_header(struct seq_file *m) -{ -#ifdef __i386__ - seq_puts(m, "# CPU# FROM TO FUNCTION\n"); - seq_puts(m, "# | | | |\n"); -#else - seq_puts(m, - "# CPU# FROM TO FUNCTION\n"); - seq_puts(m, - "# | | | |\n"); -#endif -} - -static enum print_line_t bts_trace_print_line(struct trace_iterator *iter) -{ - struct trace_entry *entry = iter->ent; - struct trace_seq *seq = &iter->seq; - struct bts_entry *it; - - trace_assign_type(it, entry); - - if (entry->type == TRACE_BTS) { - int ret; -#ifdef CONFIG_KALLSYMS - char function[KSYM_SYMBOL_LEN]; - sprint_symbol(function, it->from); -#else - char *function = ""; -#endif - - ret = trace_seq_printf(seq, "%4d 0x%lx -> 0x%lx [%s]\n", - entry->cpu, it->from, it->to, function); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE;; - return TRACE_TYPE_HANDLED; - } - return TRACE_TYPE_UNHANDLED; -} - -void trace_bts(struct trace_array *tr, unsigned long from, unsigned long to) -{ - struct ring_buffer_event *event; - struct bts_entry *entry; - unsigned long irq; - - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), &irq); - if (!event) - return; - entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, 0, from); - entry->ent.type = TRACE_BTS; - entry->ent.cpu = smp_processor_id(); - entry->from = from; - entry->to = to; - ring_buffer_unlock_commit(tr->buffer, event, irq); -} - -static void trace_bts_at(struct trace_array *tr, size_t index) -{ - const void *raw = NULL; - unsigned long from, to; - int err; - - err = ds_access_bts(this_tracer, index, &raw); - if (err < 0) - return; - - from = *(const unsigned long *)raw; - to = *(const unsigned long *)((const char *)raw + sizeof_field); - - trace_bts(tr, from, to); -} - -static void trace_bts_cpu(void *arg) -{ - struct trace_array *tr = (struct trace_array *) arg; - size_t index = 0, end = 0, i; - int err; - - if (!this_tracer) - return; - - bts_disable(); - - err = ds_get_bts_index(this_tracer, &index); - if (err < 0) - goto out; - - err = ds_get_bts_end(this_tracer, &end); - if (err < 0) - goto out; - - for (i = index; i < end; i++) - trace_bts_at(tr, i); - - for (i = 0; i < index; i++) - trace_bts_at(tr, i); - -out: - bts_enable(); -} - -static void trace_bts_prepare(struct trace_iterator *iter) -{ - int cpu; - - for_each_cpu_mask(cpu, cpu_possible_map) - smp_call_function_single(cpu, trace_bts_cpu, iter->tr, 1); -} - -struct tracer bts_tracer __read_mostly = -{ - .name = "bts", - .init = bts_trace_init, - .reset = bts_trace_stop, - .print_header = bts_trace_print_header, - .print_line = bts_trace_print_line, - .start = bts_trace_start, - .stop = bts_trace_stop, - .open = trace_bts_prepare -}; - -__init static int init_bts_trace(void) -{ - return register_tracer(&bts_tracer); -} -device_initcall(init_bts_trace); diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c new file mode 100644 index 00000000000..ee29e012aa9 --- /dev/null +++ b/kernel/trace/trace_hw_branches.c @@ -0,0 +1,205 @@ +/* + * h/w branch tracer for x86 based on bts + * + * Copyright (C) 2008 Markus Metzger + * + */ + +#include +#include +#include +#include +#include + +#include + +#include "trace.h" + + +#define SIZEOF_BTS (1 << 13) + +static DEFINE_PER_CPU(struct bts_tracer *, tracer); +static DEFINE_PER_CPU(unsigned char[SIZEOF_BTS], buffer); + +#define this_tracer per_cpu(tracer, smp_processor_id()) +#define this_buffer per_cpu(buffer, smp_processor_id()) + + +static void bts_trace_reset(struct trace_array *tr) +{ + int cpu; + + tr->time_start = ftrace_now(tr->cpu); + + for_each_online_cpu(cpu) + tracing_reset(tr, cpu); +} + +static void bts_trace_start_cpu(void *arg) +{ + if (this_tracer) + ds_release_bts(this_tracer); + + this_tracer = + ds_request_bts(/* task = */ NULL, this_buffer, SIZEOF_BTS, + /* ovfl = */ NULL, /* th = */ (size_t)-1, + BTS_KERNEL); + if (IS_ERR(this_tracer)) { + this_tracer = NULL; + return; + } +} + +static void bts_trace_start(struct trace_array *tr) +{ + int cpu; + + bts_trace_reset(tr); + + for_each_cpu_mask(cpu, cpu_possible_map) + smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1); +} + +static void bts_trace_stop_cpu(void *arg) +{ + if (this_tracer) { + ds_release_bts(this_tracer); + this_tracer = NULL; + } +} + +static void bts_trace_stop(struct trace_array *tr) +{ + int cpu; + + for_each_cpu_mask(cpu, cpu_possible_map) + smp_call_function_single(cpu, bts_trace_stop_cpu, NULL, 1); +} + +static int bts_trace_init(struct trace_array *tr) +{ + bts_trace_reset(tr); + bts_trace_start(tr); + + return 0; +} + +static void bts_trace_print_header(struct seq_file *m) +{ + seq_puts(m, + "# CPU# FROM TO FUNCTION\n"); + seq_puts(m, + "# | | | |\n"); +} + +static enum print_line_t bts_trace_print_line(struct trace_iterator *iter) +{ + struct trace_entry *entry = iter->ent; + struct trace_seq *seq = &iter->seq; + struct hw_branch_entry *it; + + trace_assign_type(it, entry); + + if (entry->type == TRACE_HW_BRANCHES) { + if (trace_seq_printf(seq, "%4d ", entry->cpu) && + trace_seq_printf(seq, "0x%016llx -> 0x%016llx ", + it->from, it->to) && + (!it->from || + seq_print_ip_sym(seq, it->from, /* sym_flags = */ 0)) && + trace_seq_printf(seq, "\n")) + return TRACE_TYPE_HANDLED; + return TRACE_TYPE_PARTIAL_LINE;; + } + return TRACE_TYPE_UNHANDLED; +} + +void trace_hw_branch(struct trace_array *tr, u64 from, u64 to) +{ + struct ring_buffer_event *event; + struct hw_branch_entry *entry; + unsigned long irq; + + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), &irq); + if (!event) + return; + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, 0, from); + entry->ent.type = TRACE_HW_BRANCHES; + entry->ent.cpu = smp_processor_id(); + entry->from = from; + entry->to = to; + ring_buffer_unlock_commit(tr->buffer, event, irq); +} + +static void trace_bts_at(struct trace_array *tr, + const struct bts_trace *trace, void *at) +{ + struct bts_struct bts; + int err = 0; + + WARN_ON_ONCE(!trace->read); + if (!trace->read) + return; + + err = trace->read(this_tracer, at, &bts); + if (err < 0) + return; + + switch (bts.qualifier) { + case BTS_BRANCH: + trace_hw_branch(tr, bts.variant.lbr.from, bts.variant.lbr.to); + break; + } +} + +static void trace_bts_cpu(void *arg) +{ + struct trace_array *tr = (struct trace_array *) arg; + const struct bts_trace *trace; + unsigned char *at; + + if (!this_tracer) + return; + + ds_suspend_bts(this_tracer); + trace = ds_read_bts(this_tracer); + if (!trace) + goto out; + + for (at = trace->ds.top; (void *)at < trace->ds.end; + at += trace->ds.size) + trace_bts_at(tr, trace, at); + + for (at = trace->ds.begin; (void *)at < trace->ds.top; + at += trace->ds.size) + trace_bts_at(tr, trace, at); + +out: + ds_resume_bts(this_tracer); +} + +static void trace_bts_prepare(struct trace_iterator *iter) +{ + int cpu; + + for_each_cpu_mask(cpu, cpu_possible_map) + smp_call_function_single(cpu, trace_bts_cpu, iter->tr, 1); +} + +struct tracer bts_tracer __read_mostly = +{ + .name = "hw-branch-tracer", + .init = bts_trace_init, + .reset = bts_trace_stop, + .print_header = bts_trace_print_header, + .print_line = bts_trace_print_line, + .start = bts_trace_start, + .stop = bts_trace_stop, + .open = trace_bts_prepare +}; + +__init static int init_bts_trace(void) +{ + return register_tracer(&bts_tracer); +} +device_initcall(init_bts_trace); -- cgit v1.2.3-70-g09d2 From f8b755ac8e0cc3f330269e4c4504514f987167a2 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 9 Dec 2008 23:55:25 +0100 Subject: tracing/function-graph-tracer: Output arrows signal on hardirq call/return Impact: make more obvious the hardirq calls in the output When a hardirq is triggered inside the codeflow on output, we have now two arrows that indicate the entry and return of the hardirq. 0) | bit_waitqueue() { 0) 0.880 us | __phys_addr(); 0) 2.699 us | } 0) | __wake_up_bit() { 0) ==========> | smp_apic_timer_interrupt() { 0) 0.797 us | native_apic_mem_write(); 0) 0.715 us | exit_idle(); 0) | irq_enter() { 0) 0.722 us | idle_cpu(); 0) 5.519 us | } 0) | hrtimer_interrupt() { 0) | ktime_get() { 0) | ktime_get_ts() { 0) 0.805 us | getnstimeofday(); [...] 0) ! 108.528 us | } 0) | irq_exit() { 0) | do_softirq() { 0) | __do_softirq() { 0) 0.895 us | __local_bh_disable(); 0) | run_timer_softirq() { 0) 0.827 us | hrtimer_run_pending(); 0) 1.226 us | _spin_lock_irq(); 0) | _spin_unlock_irq() { 0) 6.550 us | } 0) 0.924 us | _local_bh_enable(); 0) + 12.129 us | } 0) + 13.911 us | } 0) 0.707 us | idle_cpu(); 0) + 17.009 us | } 0) ! 137.419 us | } 0) <========== | 0) 1.045 us | } 0) ! 148.908 us | } 0) ! 151.022 us | } 0) ! 153.022 us | } 0) 0.963 us | journal_mark_dirty(); 0) 0.925 us | __brelse(); Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/trace_functions_graph.c | 66 +++++++++++++++++++++++++++++++++--- 1 file changed, 62 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index af60eef4cbc..4bf39fcae97 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -231,6 +231,49 @@ trace_branch_is_leaf(struct trace_iterator *iter, return true; } +static enum print_line_t +print_graph_irq(struct trace_seq *s, unsigned long addr, + enum trace_type type, int cpu, pid_t pid) +{ + int ret; + + if (addr < (unsigned long)__irqentry_text_start || + addr >= (unsigned long)__irqentry_text_end) + return TRACE_TYPE_UNHANDLED; + + if (type == TRACE_GRAPH_ENT) { + ret = trace_seq_printf(s, "==========> | "); + } else { + /* Cpu */ + if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { + ret = print_graph_cpu(s, cpu); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + } + /* Proc */ + if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { + ret = print_graph_proc(s, pid); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + + ret = trace_seq_printf(s, " | "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + /* No overhead */ + if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { + ret = trace_seq_printf(s, " "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + ret = trace_seq_printf(s, "<========== |\n"); + } + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + return TRACE_TYPE_HANDLED; +} static enum print_line_t print_graph_duration(unsigned long long duration, struct trace_seq *s) @@ -344,7 +387,7 @@ print_graph_entry_leaf(struct trace_iterator *iter, static enum print_line_t print_graph_entry_nested(struct ftrace_graph_ent_entry *entry, - struct trace_seq *s) + struct trace_seq *s, pid_t pid, int cpu) { int i; int ret; @@ -357,8 +400,18 @@ print_graph_entry_nested(struct ftrace_graph_ent_entry *entry, return TRACE_TYPE_PARTIAL_LINE; } - /* No time */ - ret = trace_seq_printf(s, " | "); + /* Interrupt */ + ret = print_graph_irq(s, call->func, TRACE_GRAPH_ENT, cpu, pid); + if (ret == TRACE_TYPE_UNHANDLED) { + /* No time */ + ret = trace_seq_printf(s, " | "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } else { + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + } + /* Function */ for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { @@ -410,7 +463,7 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, if (trace_branch_is_leaf(iter, field)) return print_graph_entry_leaf(iter, field, s); else - return print_graph_entry_nested(field, s); + return print_graph_entry_nested(field, s, iter->ent->pid, cpu); } @@ -474,6 +527,11 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, if (!ret) return TRACE_TYPE_PARTIAL_LINE; } + + ret = print_graph_irq(s, trace->func, TRACE_GRAPH_RET, cpu, ent->pid); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + return TRACE_TYPE_HANDLED; } -- cgit v1.2.3-70-g09d2 From cbc34ed1ac36690f75fd272e19e7b4fc29aae5a2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 10 Dec 2008 08:08:22 +0100 Subject: sched: fix tracepoints in scheduler The trace point only caught one of many places where a task changes cpu, put it in the right place to we get all of them. Change the signature while we're at it. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/trace/sched.h | 4 ++-- kernel/sched.c | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/trace/sched.h b/include/trace/sched.h index 9b2854abf7e..f4549d506b1 100644 --- a/include/trace/sched.h +++ b/include/trace/sched.h @@ -30,8 +30,8 @@ DECLARE_TRACE(sched_switch, TPARGS(rq, prev, next)); DECLARE_TRACE(sched_migrate_task, - TPPROTO(struct rq *rq, struct task_struct *p, int dest_cpu), - TPARGS(rq, p, dest_cpu)); + TPPROTO(struct task_struct *p, int orig_cpu, int dest_cpu), + TPARGS(p, orig_cpu, dest_cpu)); DECLARE_TRACE(sched_process_free, TPPROTO(struct task_struct *p), diff --git a/kernel/sched.c b/kernel/sched.c index 7729c4bbc8b..d377097572f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1851,6 +1851,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) clock_offset = old_rq->clock - new_rq->clock; + trace_sched_migrate_task(p, task_cpu(p), new_cpu); + #ifdef CONFIG_SCHEDSTATS if (p->se.wait_start) p->se.wait_start -= clock_offset; @@ -2868,7 +2870,6 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu) || unlikely(!cpu_active(dest_cpu))) goto out; - trace_sched_migrate_task(rq, p, dest_cpu); /* force the process onto the specified CPU */ if (migrate_task(p, dest_cpu, &req)) { /* Need to wait for migration thread (might exit: take ref). */ -- cgit v1.2.3-70-g09d2 From 001474491fabeca233168a8598f721c808040f90 Mon Sep 17 00:00:00 2001 From: "Woodruff, Richard" Date: Mon, 1 Dec 2008 14:18:11 -0800 Subject: nohz: suppress needless timer reprogramming In my device I get many interrupts from a high speed USB device in a very short period of time. The system spends a lot of time reprogramming the hardware timer which is in a slower timing domain as compared to the CPU. This results in the CPU spending a huge amount of time waiting for the timer posting to be done. All of this reprogramming is useless as the wake up time has not changed. As measured using ETM trace this drops my reprogramming penalty from almost 60% CPU load down to 15% during high interrupt rate. I can send traces to show this. Suppress setting of duplicate timer event when timer already stopped. Timer programming can be very costly and can result in long cpu stall/wait times. [akpm@linux-foundation.org: coding-style fixes] [tglx@linutronix.de: move the check to the right place and avoid raising the softirq for nothing] Signed-off-by: Richard Woodruff Cc: johnstul@us.ibm.com Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- kernel/time/tick-sched.c | 41 ++++++++++++++++++++++++----------------- 1 file changed, 24 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index dc17ffcf191..87fc34f21db 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -282,8 +282,31 @@ void tick_nohz_stop_sched_tick(int inidle) /* Schedule the tick, if we are at least one jiffie off */ if ((long)delta_jiffies >= 1) { + /* + * calculate the expiry time for the next timer wheel + * timer + */ + expires = ktime_add_ns(last_update, tick_period.tv64 * + delta_jiffies); + + /* + * If this cpu is the one which updates jiffies, then + * give up the assignment and let it be taken by the + * cpu which runs the tick timer next, which might be + * this cpu as well. If we don't drop this here the + * jiffies might be stale and do_timer() never + * invoked. + */ + if (cpu == tick_do_timer_cpu) + tick_do_timer_cpu = TICK_DO_TIMER_NONE; + if (delta_jiffies > 1) cpu_set(cpu, nohz_cpu_mask); + + /* Skip reprogram of event if its not changed */ + if (ts->tick_stopped && ktime_equal(expires, dev->next_event)) + goto out; + /* * nohz_stop_sched_tick can be called several times before * the nohz_restart_sched_tick is called. This happens when @@ -306,17 +329,6 @@ void tick_nohz_stop_sched_tick(int inidle) rcu_enter_nohz(); } - /* - * If this cpu is the one which updates jiffies, then - * give up the assignment and let it be taken by the - * cpu which runs the tick timer next, which might be - * this cpu as well. If we don't drop this here the - * jiffies might be stale and do_timer() never - * invoked. - */ - if (cpu == tick_do_timer_cpu) - tick_do_timer_cpu = TICK_DO_TIMER_NONE; - ts->idle_sleeps++; /* @@ -332,12 +344,7 @@ void tick_nohz_stop_sched_tick(int inidle) goto out; } - /* - * calculate the expiry time for the next timer wheel - * timer - */ - expires = ktime_add_ns(last_update, tick_period.tv64 * - delta_jiffies); + /* Mark expiries */ ts->idle_expires = expires; if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { -- cgit v1.2.3-70-g09d2 From 27af4245b6ce99e08c6a7c38825383bf51119cc9 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 1 Dec 2008 14:18:13 -0800 Subject: posix-timers: use "struct pid*" instead of "struct task_struct*" Impact: restructure, clean up code k_itimer holds the ref to the ->it_process until sys_timer_delete(). This allows to pin up to RLIMIT_SIGPENDING dead task_struct's. Change the code to use "struct pid *" instead. The patch doesn't kill ->it_process, it places ->it_pid into the union. ->it_process is still used by do_cpu_nanosleep() as before. It would be trivial to change the nanosleep code as well, but since it uses it_process in a special way I think it is better to keep this field for grep. The patch bloats the kernel by 104 bytes and it also adds the new pointer, ->it_signal, to k_itimer. It is used by lock_timer() to verify that the found timer was not created by another process. It is not clear why do we use the global database (and thus the global idr_lock) for posix timers. We still need the signal_struct->posix_timers which contains all useable timers, perhaps it is better to use some form of per-process array instead. Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- include/linux/posix-timers.h | 6 +++++- kernel/posix-timers.c | 43 +++++++++++++++++++++++-------------------- 2 files changed, 28 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index a7c72135554..4f71bf4e628 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -45,7 +45,11 @@ struct k_itimer { int it_requeue_pending; /* waiting to requeue this timer */ #define REQUEUE_PENDING 1 int it_sigev_notify; /* notify word of sigevent struct */ - struct task_struct *it_process; /* process to send signal to */ + struct signal_struct *it_signal; + union { + struct pid *it_pid; /* pid of process to send signal to */ + struct task_struct *it_process; /* for clock_nanosleep */ + }; struct sigqueue *sigq; /* signal queue entry. */ union { struct { diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 5e79c662294..42a39afd694 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -116,7 +116,7 @@ static DEFINE_SPINLOCK(idr_lock); * must supply functions here, even if the function just returns * ENOSYS. The standard POSIX timer management code assumes the * following: 1.) The k_itimer struct (sched.h) is used for the - * timer. 2.) The list, it_lock, it_clock, it_id and it_process + * timer. 2.) The list, it_lock, it_clock, it_id and it_pid * fields are not modified by timer code. * * At this time all functions EXCEPT clock_nanosleep can be @@ -313,7 +313,8 @@ void do_schedule_next_timer(struct siginfo *info) int posix_timer_event(struct k_itimer *timr, int si_private) { - int shared, ret; + struct task_struct *task; + int shared, ret = -1; /* * FIXME: if ->sigq is queued we can race with * dequeue_signal()->do_schedule_next_timer(). @@ -327,8 +328,13 @@ int posix_timer_event(struct k_itimer *timr, int si_private) */ timr->sigq->info.si_sys_private = si_private; - shared = !(timr->it_sigev_notify & SIGEV_THREAD_ID); - ret = send_sigqueue(timr->sigq, timr->it_process, shared); + rcu_read_lock(); + task = pid_task(timr->it_pid, PIDTYPE_PID); + if (task) { + shared = !(timr->it_sigev_notify & SIGEV_THREAD_ID); + ret = send_sigqueue(timr->sigq, task, shared); + } + rcu_read_unlock(); /* If we failed to send the signal the timer stops. */ return ret > 0; } @@ -405,7 +411,7 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) return ret; } -static struct task_struct * good_sigevent(sigevent_t * event) +static struct pid *good_sigevent(sigevent_t * event) { struct task_struct *rtn = current->group_leader; @@ -419,7 +425,7 @@ static struct task_struct * good_sigevent(sigevent_t * event) ((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX))) return NULL; - return rtn; + return task_pid(rtn); } void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock) @@ -471,7 +477,7 @@ sys_timer_create(const clockid_t which_clock, { struct k_itimer *new_timer; int error, new_timer_id; - struct task_struct *process; + struct pid *it_pid; sigevent_t event; int it_id_set = IT_ID_NOT_SET; @@ -525,11 +531,9 @@ sys_timer_create(const clockid_t which_clock, goto out; } rcu_read_lock(); - process = good_sigevent(&event); - if (process) - get_task_struct(process); + it_pid = get_pid(good_sigevent(&event)); rcu_read_unlock(); - if (!process) { + if (!it_pid) { error = -EINVAL; goto out; } @@ -537,8 +541,7 @@ sys_timer_create(const clockid_t which_clock, event.sigev_notify = SIGEV_SIGNAL; event.sigev_signo = SIGALRM; event.sigev_value.sival_int = new_timer->it_id; - process = current->group_leader; - get_task_struct(process); + it_pid = get_pid(task_tgid(current)); } new_timer->it_sigev_notify = event.sigev_notify; @@ -548,7 +551,8 @@ sys_timer_create(const clockid_t which_clock, new_timer->sigq->info.si_code = SI_TIMER; spin_lock_irq(¤t->sighand->siglock); - new_timer->it_process = process; + new_timer->it_pid = it_pid; + new_timer->it_signal = current->signal; list_add(&new_timer->list, ¤t->signal->posix_timers); spin_unlock_irq(¤t->sighand->siglock); @@ -583,8 +587,7 @@ static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags) timr = idr_find(&posix_timers_id, (int)timer_id); if (timr) { spin_lock(&timr->it_lock); - if (timr->it_process && - same_thread_group(timr->it_process, current)) { + if (timr->it_pid && timr->it_signal == current->signal) { spin_unlock(&idr_lock); return timr; } @@ -831,8 +834,8 @@ retry_delete: * This keeps any tasks waiting on the spin lock from thinking * they got something (see the lock code above). */ - put_task_struct(timer->it_process); - timer->it_process = NULL; + put_pid(timer->it_pid); + timer->it_pid = NULL; unlock_timer(timer, flags); release_posix_timer(timer, IT_ID_SET); @@ -858,8 +861,8 @@ retry_delete: * This keeps any tasks waiting on the spin lock from thinking * they got something (see the lock code above). */ - put_task_struct(timer->it_process); - timer->it_process = NULL; + put_pid(timer->it_pid); + timer->it_pid = NULL; unlock_timer(timer, flags); release_posix_timer(timer, IT_ID_SET); -- cgit v1.2.3-70-g09d2 From 899921025b406a71a8aeb2d7855658ea0cf0ed23 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 1 Dec 2008 14:18:15 -0800 Subject: posix-timers: check ->it_signal instead of ->it_pid to validate the timer Impact: clean up, speed up ->it_pid (was ->it_process) has also a special meaning: if it is NULL, the timer is under deletion or it wasn't initialized yet. We can check ->it_signal != NULL instead, this way we can - simplify sys_timer_create() a bit - remove yet another check from lock_timer() - move put_pid(->it_pid) into release_posix_timer() which runs outside of ->it_lock Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- kernel/posix-timers.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 42a39afd694..aa922bbb640 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -464,6 +464,7 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set) idr_remove(&posix_timers_id, tmr->it_id); spin_unlock_irqrestore(&idr_lock, flags); } + put_pid(tmr->it_pid); sigqueue_free(tmr->sigq); kmem_cache_free(posix_timers_cache, tmr); } @@ -477,7 +478,6 @@ sys_timer_create(const clockid_t which_clock, { struct k_itimer *new_timer; int error, new_timer_id; - struct pid *it_pid; sigevent_t event; int it_id_set = IT_ID_NOT_SET; @@ -531,9 +531,9 @@ sys_timer_create(const clockid_t which_clock, goto out; } rcu_read_lock(); - it_pid = get_pid(good_sigevent(&event)); + new_timer->it_pid = get_pid(good_sigevent(&event)); rcu_read_unlock(); - if (!it_pid) { + if (!new_timer->it_pid) { error = -EINVAL; goto out; } @@ -541,7 +541,7 @@ sys_timer_create(const clockid_t which_clock, event.sigev_notify = SIGEV_SIGNAL; event.sigev_signo = SIGALRM; event.sigev_value.sival_int = new_timer->it_id; - it_pid = get_pid(task_tgid(current)); + new_timer->it_pid = get_pid(task_tgid(current)); } new_timer->it_sigev_notify = event.sigev_notify; @@ -551,7 +551,6 @@ sys_timer_create(const clockid_t which_clock, new_timer->sigq->info.si_code = SI_TIMER; spin_lock_irq(¤t->sighand->siglock); - new_timer->it_pid = it_pid; new_timer->it_signal = current->signal; list_add(&new_timer->list, ¤t->signal->posix_timers); spin_unlock_irq(¤t->sighand->siglock); @@ -587,7 +586,7 @@ static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags) timr = idr_find(&posix_timers_id, (int)timer_id); if (timr) { spin_lock(&timr->it_lock); - if (timr->it_pid && timr->it_signal == current->signal) { + if (timr->it_signal == current->signal) { spin_unlock(&idr_lock); return timr; } @@ -834,8 +833,7 @@ retry_delete: * This keeps any tasks waiting on the spin lock from thinking * they got something (see the lock code above). */ - put_pid(timer->it_pid); - timer->it_pid = NULL; + timer->it_signal = NULL; unlock_timer(timer, flags); release_posix_timer(timer, IT_ID_SET); @@ -861,8 +859,7 @@ retry_delete: * This keeps any tasks waiting on the spin lock from thinking * they got something (see the lock code above). */ - put_pid(timer->it_pid); - timer->it_pid = NULL; + timer->it_signal = NULL; unlock_timer(timer, flags); release_posix_timer(timer, IT_ID_SET); -- cgit v1.2.3-70-g09d2 From 03e89e4574a680af15f59329b061f35d9813aff4 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Tue, 16 Dec 2008 08:45:30 +0100 Subject: sched: fix wakeup preemption clock Impact: sharpen the wakeup-granularity to always be against current scheduler time It was possible to do the preemption check against an old time stamp. Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- kernel/sched_fair.c | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index ad7b93be569..88215066efa 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2266,6 +2266,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) smp_wmb(); rq = task_rq_lock(p, &flags); + update_rq_clock(rq); old_state = p->state; if (!(old_state & state)) goto out; @@ -2323,7 +2324,6 @@ out_activate: schedstat_inc(p, se.nr_wakeups_local); else schedstat_inc(p, se.nr_wakeups_remote); - update_rq_clock(rq); activate_task(rq, p, 1); success = 1; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 98345e45b05..928cd74cff0 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1345,12 +1345,11 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) { struct task_struct *curr = rq->curr; struct sched_entity *se = &curr->se, *pse = &p->se; + struct cfs_rq *cfs_rq = task_cfs_rq(curr); - if (unlikely(rt_prio(p->prio))) { - struct cfs_rq *cfs_rq = task_cfs_rq(curr); + update_curr(cfs_rq); - update_rq_clock(rq); - update_curr(cfs_rq); + if (unlikely(rt_prio(p->prio))) { resched_task(curr); return; } -- cgit v1.2.3-70-g09d2 From 34f28ecd0f4bdc733c681294d02d9fab5880591b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 16 Dec 2008 08:45:31 +0100 Subject: sched: optimize update_curr() Impact: micro-optimization Skip the hard work when there is none. Signed-off-by: Peter Zijlstra Acked-by: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 928cd74cff0..5ad4440f0fc 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -492,6 +492,8 @@ static void update_curr(struct cfs_rq *cfs_rq) * overflow on 32 bits): */ delta_exec = (unsigned long)(now - curr->exec_start); + if (!delta_exec) + return; __update_curr(cfs_rq, curr, delta_exec); curr->exec_start = now; -- cgit v1.2.3-70-g09d2 From 720f54988e17b33f3f477010b3a68ee872d20d5a Mon Sep 17 00:00:00 2001 From: Ken Chen Date: Mon, 15 Dec 2008 22:02:01 -0800 Subject: sched, cpuacct: refactoring cpuusage_read / cpuusage_write Impact: micro-optimize the code on 64-bit architectures In the thread regarding to 'export percpu cpuacct cgroup stats' http://lkml.org/lkml/2008/12/7/13 akpm pointed out that current cpuacct code is inefficient. This patch refactoring the following: * make cpu_rq locking only on 32-bit * change iterator to each_present_cpu instead of each_possible_cpu to make it hotplug friendly. It's a bit of code churn, but I was rewarded with 160 byte code size saving on x86-64 arch and zero code size change on i386. Signed-off-by: Ken Chen Cc: Paul Menage Cc: Li Zefan Signed-off-by: Ingo Molnar --- kernel/sched.c | 56 +++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 39 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 88215066efa..41b7e2d524d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -9275,6 +9275,41 @@ cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) kfree(ca); } +static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) +{ + u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); + u64 data; + +#ifndef CONFIG_64BIT + /* + * Take rq->lock to make 64-bit read safe on 32-bit platforms. + */ + spin_lock_irq(&cpu_rq(cpu)->lock); + data = *cpuusage; + spin_unlock_irq(&cpu_rq(cpu)->lock); +#else + data = *cpuusage; +#endif + + return data; +} + +static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) +{ + u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); + +#ifndef CONFIG_64BIT + /* + * Take rq->lock to make 64-bit write safe on 32-bit platforms. + */ + spin_lock_irq(&cpu_rq(cpu)->lock); + *cpuusage = val; + spin_unlock_irq(&cpu_rq(cpu)->lock); +#else + *cpuusage = val; +#endif +} + /* return total cpu usage (in nanoseconds) of a group */ static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) { @@ -9282,17 +9317,8 @@ static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) u64 totalcpuusage = 0; int i; - for_each_possible_cpu(i) { - u64 *cpuusage = percpu_ptr(ca->cpuusage, i); - - /* - * Take rq->lock to make 64-bit addition safe on 32-bit - * platforms. - */ - spin_lock_irq(&cpu_rq(i)->lock); - totalcpuusage += *cpuusage; - spin_unlock_irq(&cpu_rq(i)->lock); - } + for_each_present_cpu(i) + totalcpuusage += cpuacct_cpuusage_read(ca, i); return totalcpuusage; } @@ -9309,13 +9335,9 @@ static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, goto out; } - for_each_possible_cpu(i) { - u64 *cpuusage = percpu_ptr(ca->cpuusage, i); + for_each_present_cpu(i) + cpuacct_cpuusage_write(ca, i, 0); - spin_lock_irq(&cpu_rq(i)->lock); - *cpuusage = 0; - spin_unlock_irq(&cpu_rq(i)->lock); - } out: return err; } -- cgit v1.2.3-70-g09d2 From e9515c3c9feecd74174c2998add0db51e02abb8d Mon Sep 17 00:00:00 2001 From: Ken Chen Date: Mon, 15 Dec 2008 22:04:15 -0800 Subject: sched, cpuacct: export percpu cpuacct cgroup stats This patch export per-cpu CPU cycle usage for a given cpuacct cgroup. There is a need for a user space monitor daemon to track group CPU usage on per-cpu base. It is also useful for monitoring CFS load balancer behavior by tracking per CPU group usage. Signed-off-by: Ken Chen Reviewed-by: Li Zefan Reviewed-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/sched.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 41b7e2d524d..f53e2b8ef52 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -9342,12 +9342,32 @@ out: return err; } +static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, + struct seq_file *m) +{ + struct cpuacct *ca = cgroup_ca(cgroup); + u64 percpu; + int i; + + for_each_present_cpu(i) { + percpu = cpuacct_cpuusage_read(ca, i); + seq_printf(m, "%llu ", (unsigned long long) percpu); + } + seq_printf(m, "\n"); + return 0; +} + static struct cftype files[] = { { .name = "usage", .read_u64 = cpuusage_read, .write_u64 = cpuusage_write, }, + { + .name = "usage_percpu", + .read_seq_string = cpuacct_percpu_seq_read, + }, + }; static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) -- cgit v1.2.3-70-g09d2 From 80f40ee4a07530cc3acbc239a9299ec47025825b Mon Sep 17 00:00:00 2001 From: Bharata B Rao Date: Mon, 15 Dec 2008 11:56:48 +0530 Subject: sched: use RCU variant of list traversal in for_each_leaf_rt_rq() Impact: fix potential of rare crash for_each_leaf_rt_rq() walks an RCU protected list (rq->leaf_rt_rq_list), but doesn't use list_for_each_entry_rcu(). Fix this. Signed-off-by: Bharata B Rao Cc: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index d9ba9d5f99d..7bdf84c85cc 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -77,7 +77,7 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq) } #define for_each_leaf_rt_rq(rt_rq, rq) \ - list_for_each_entry(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list) + list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list) static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) { -- cgit v1.2.3-70-g09d2 From 3ac52669c7a24b93663acfcab606d1065ed1accd Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sat, 13 Dec 2008 09:15:27 -0800 Subject: resources: skip sanity check of busy resources Impact: reduce false positives in iomem_map_sanity_check() Some drivers (vesafb) only map/reserve a portion of a resource. If then some other driver comes in and maps the whole resource, the current code WARN_ON's. This is not the intent of the checks in iomem_map_sanity_check(); rather these checks want to warn when crossing *hardware* resources only. This patch skips BUSY resources as suggested by Linus. Note: having two drivers talk to the same hardware at the same time is obviously not optimal behavior, but that's a separate story. Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar --- kernel/resource.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 4337063663e..e633106b12f 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -853,6 +853,15 @@ int iomem_map_sanity_check(resource_size_t addr, unsigned long size) if (PFN_DOWN(p->start) <= PFN_DOWN(addr) && PFN_DOWN(p->end) >= PFN_DOWN(addr + size - 1)) continue; + /* + * if a resource is "BUSY", it's not a hardware resource + * but a driver mapping of such a resource; we don't want + * to warn for those; some drivers legitimately map only + * partial hardware resources. (example: vesafb) + */ + if (p->flags & IORESOURCE_BUSY) + continue; + printk(KERN_WARNING "resource map sanity check conflict: " "0x%llx 0x%llx 0x%llx 0x%llx %s\n", (unsigned long long)addr, -- cgit v1.2.3-70-g09d2 From 343e9099c8152daff20e10d6269edec21da44fc0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 15 Dec 2008 16:13:07 -0800 Subject: rcu: fix rcutorture behavior during reboot Impact: fix very rare reboot hang Because rcutorture ignored all signals, it does not terminate in response to the signals sent at shutdown time. This can cause strange failures due to its continuing to make use of kernel function too late in the shutdown sequence. This patch therefore adds a shutdown notifier to rcutorture, causing it to shut down in response to a reboot or an orderly shutdown. Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- kernel/rcutorture.c | 66 +++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 56 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 85cb90588a5..b3106552210 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include #include @@ -108,7 +109,6 @@ struct rcu_torture { int rtort_mbtest; }; -static int fullstop = 0; /* stop generating callbacks at test end. */ static LIST_HEAD(rcu_torture_freelist); static struct rcu_torture *rcu_torture_current = NULL; static long rcu_torture_current_version = 0; @@ -136,6 +136,30 @@ static int stutter_pause_test = 0; #endif int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; +#define FULLSTOP_SIGNALED 1 /* Bail due to signal. */ +#define FULLSTOP_CLEANUP 2 /* Orderly shutdown. */ +static int fullstop; /* stop generating callbacks at test end. */ +DEFINE_MUTEX(fullstop_mutex); /* protect fullstop transitions and */ + /* spawning of kthreads. */ + +/* + * Detect and respond to a signal-based shutdown. + */ +static int +rcutorture_shutdown_notify(struct notifier_block *unused1, + unsigned long unused2, void *unused3) +{ + if (fullstop) + return NOTIFY_DONE; + if (signal_pending(current)) { + mutex_lock(&fullstop_mutex); + if (!ACCESS_ONCE(fullstop)) + fullstop = FULLSTOP_SIGNALED; + mutex_unlock(&fullstop_mutex); + } + return NOTIFY_DONE; +} + /* * Allocate an element from the rcu_tortures pool. */ @@ -199,11 +223,12 @@ rcu_random(struct rcu_random_state *rrsp) static void rcu_stutter_wait(void) { - while (stutter_pause_test || !rcutorture_runnable) + while ((stutter_pause_test || !rcutorture_runnable) && !fullstop) { if (rcutorture_runnable) schedule_timeout_interruptible(1); else schedule_timeout_interruptible(round_jiffies_relative(HZ)); + } } /* @@ -599,7 +624,7 @@ rcu_torture_writer(void *arg) rcu_stutter_wait(); } while (!kthread_should_stop() && !fullstop); VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); - while (!kthread_should_stop()) + while (!kthread_should_stop() && fullstop != FULLSTOP_SIGNALED) schedule_timeout_uninterruptible(1); return 0; } @@ -624,7 +649,7 @@ rcu_torture_fakewriter(void *arg) } while (!kthread_should_stop() && !fullstop); VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping"); - while (!kthread_should_stop()) + while (!kthread_should_stop() && fullstop != FULLSTOP_SIGNALED) schedule_timeout_uninterruptible(1); return 0; } @@ -734,7 +759,7 @@ rcu_torture_reader(void *arg) VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); if (irqreader && cur_ops->irqcapable) del_timer_sync(&t); - while (!kthread_should_stop()) + while (!kthread_should_stop() && fullstop != FULLSTOP_SIGNALED) schedule_timeout_uninterruptible(1); return 0; } @@ -831,7 +856,7 @@ rcu_torture_stats(void *arg) do { schedule_timeout_interruptible(stat_interval * HZ); rcu_torture_stats_print(); - } while (!kthread_should_stop()); + } while (!kthread_should_stop() && !fullstop); VERBOSE_PRINTK_STRING("rcu_torture_stats task stopping"); return 0; } @@ -899,7 +924,7 @@ rcu_torture_shuffle(void *arg) do { schedule_timeout_interruptible(shuffle_interval * HZ); rcu_torture_shuffle_tasks(); - } while (!kthread_should_stop()); + } while (!kthread_should_stop() && !fullstop); VERBOSE_PRINTK_STRING("rcu_torture_shuffle task stopping"); return 0; } @@ -914,10 +939,10 @@ rcu_torture_stutter(void *arg) do { schedule_timeout_interruptible(stutter * HZ); stutter_pause_test = 1; - if (!kthread_should_stop()) + if (!kthread_should_stop() && !fullstop) schedule_timeout_interruptible(stutter * HZ); stutter_pause_test = 0; - } while (!kthread_should_stop()); + } while (!kthread_should_stop() && !fullstop); VERBOSE_PRINTK_STRING("rcu_torture_stutter task stopping"); return 0; } @@ -934,12 +959,27 @@ rcu_torture_print_module_parms(char *tag) stutter, irqreader); } +static struct notifier_block rcutorture_nb = { + .notifier_call = rcutorture_shutdown_notify, +}; + static void rcu_torture_cleanup(void) { int i; - fullstop = 1; + mutex_lock(&fullstop_mutex); + if (!fullstop) { + /* If being signaled, let it happen, then exit. */ + mutex_unlock(&fullstop_mutex); + schedule_timeout_interruptible(10 * HZ); + if (cur_ops->cb_barrier != NULL) + cur_ops->cb_barrier(); + return; + } + fullstop = FULLSTOP_CLEANUP; + mutex_unlock(&fullstop_mutex); + unregister_reboot_notifier(&rcutorture_nb); if (stutter_task) { VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); kthread_stop(stutter_task); @@ -1015,6 +1055,8 @@ rcu_torture_init(void) { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops, &srcu_ops, &sched_ops, &sched_ops_sync, }; + mutex_lock(&fullstop_mutex); + /* Process args and tell the world that the torturer is on the job. */ for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { cur_ops = torture_ops[i]; @@ -1024,6 +1066,7 @@ rcu_torture_init(void) if (i == ARRAY_SIZE(torture_ops)) { printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n", torture_type); + mutex_unlock(&fullstop_mutex); return (-EINVAL); } if (cur_ops->init) @@ -1146,9 +1189,12 @@ rcu_torture_init(void) goto unwind; } } + register_reboot_notifier(&rcutorture_nb); + mutex_unlock(&fullstop_mutex); return 0; unwind: + mutex_unlock(&fullstop_mutex); rcu_torture_cleanup(); return firsterr; } -- cgit v1.2.3-70-g09d2 From 2c2d7329d8afa9efa3ec24e19a53e7be9d14f242 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 16 Dec 2008 22:08:58 +0100 Subject: tracing/ftrace: use preempt_enable_no_resched_notrace in ring_buffer_time_stamp() Impact: prevent a trace recursion After some tests with function graph tracer under x86-32, I saw some recursions caused by ring_buffer_time_stamp() that calls preempt_enable_no_notrace() which calls preempt_schedule() which is traced itself. This patch re-enables preemption without rescheduling. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 7f69cfeaadf..eab81f918f6 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -107,7 +107,7 @@ u64 ring_buffer_time_stamp(int cpu) preempt_disable_notrace(); /* shift to debug/test normalization and TIME_EXTENTS */ time = sched_clock() << DEBUG_SHIFT; - preempt_enable_notrace(); + preempt_enable_no_resched_notrace(); return time; } -- cgit v1.2.3-70-g09d2 From 66896a85cf2890b6bbbc4c9ccdcd296600ffbf89 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 13 Dec 2008 20:18:13 +0100 Subject: tracing/ftrace: add the printk-msg-only option Impact: display ftrace_printk messages "as is" By default, ftrace_printk() messages find their output with some other informations like pid, caller, ... Sometimes a developer just want to have the ftrace_printk left "as is", without other information. This is done by providing a default-off option called printk-msg-only. To enable it, just do `echo printk-msg-only > /debugfs/tracing/trace_options` Before the patch: <...>-2739 [000] 145.692153: __might_sleep: I'm an ftrace_printk msg in __might_sleep <...>-2739 [000] 145.692155: __might_sleep: I'm another ftrace_printk msg in __might_sleep After the patch and the printk-msg-only option enabled: I'm an ftrace_printk msg in __might_sleep I'm another ftrace_printk msg in __might_sleep Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 25 +++++++++++++++++++++++++ kernel/trace/trace.h | 3 ++- 2 files changed, 27 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 639344a4d3a..1a3d6b32978 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -287,6 +287,7 @@ static const char *trace_options[] = { "annotate", "userstacktrace", "sym-userobj", + "printk-msg-only", NULL }; @@ -2265,6 +2266,25 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter) return TRACE_TYPE_HANDLED; } +static enum print_line_t print_printk_msg_only(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + struct trace_entry *entry = iter->ent; + struct print_entry *field; + int ret; + + trace_assign_type(field, entry); + + ret = trace_seq_printf(s, field->buf); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + if (entry->flags & TRACE_FLAG_CONT) + trace_seq_print_cont(s, iter); + + return TRACE_TYPE_HANDLED; +} + static enum print_line_t print_bin_fmt(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; @@ -2345,6 +2365,11 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter) return ret; } + if (iter->ent->type == TRACE_PRINT && + trace_flags & TRACE_ITER_PRINTK && + trace_flags & TRACE_ITER_PRINTK_MSGONLY) + return print_printk_msg_only(iter); + if (trace_flags & TRACE_ITER_BIN) return print_bin_fmt(iter); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f07c246dd73..fc75dce7a66 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -578,7 +578,8 @@ enum trace_iterator_flags { TRACE_ITER_BRANCH = 0x1000, TRACE_ITER_ANNOTATE = 0x2000, TRACE_ITER_USERSTACKTRACE = 0x4000, - TRACE_ITER_SYM_USEROBJ = 0x8000 + TRACE_ITER_SYM_USEROBJ = 0x8000, + TRACE_ITER_PRINTK_MSGONLY = 0x10000 }; /* -- cgit v1.2.3-70-g09d2 From 73500ac545d24610eb2cf8579ffc88957e9c5847 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Wed, 17 Dec 2008 17:29:56 -0800 Subject: futex: rename field in futex_q to clarify single waiter semantics Impact: simplify code I've tripped over the naming of this field a couple times. The futex_q uses a "waiters" list to represent a single blocked task and then calles wake_up_all(). This can lead to confusion in trying to understand the intent of the code, which is to have a single futex_q for every task waiting on a futex. This patch corrects the problem, using a single pointer to the waiting task, and an appropriate call to wake_up, rather than wake_up_all. Compile and boot tested on an 8way x86_64 machine. Signed-off-by: Darren Hart Acked-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/futex.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index ba0d3b83c09..99f8acce08b 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -92,11 +92,12 @@ struct futex_pi_state { * A futex_q has a woken state, just like tasks have TASK_RUNNING. * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0. * The order of wakup is always to make the first condition true, then - * wake up q->waiters, then make the second condition true. + * wake up q->waiter, then make the second condition true. */ struct futex_q { struct plist_node list; - wait_queue_head_t waiters; + /* There can only be a single waiter */ + wait_queue_head_t waiter; /* Which hash list lock to use: */ spinlock_t *lock_ptr; @@ -573,7 +574,7 @@ static void wake_futex(struct futex_q *q) * The lock in wake_up_all() is a crucial memory barrier after the * plist_del() and also before assigning to q->lock_ptr. */ - wake_up_all(&q->waiters); + wake_up(&q->waiter); /* * The waiting task can free the futex_q as soon as this is written, * without taking any locks. This must come last. @@ -930,7 +931,7 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) { struct futex_hash_bucket *hb; - init_waitqueue_head(&q->waiters); + init_waitqueue_head(&q->waiter); get_futex_key_refs(&q->key); hb = hash_futex(&q->key); @@ -1221,7 +1222,7 @@ static int futex_wait(u32 __user *uaddr, int fshared, /* add_wait_queue is the barrier after __set_current_state. */ __set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&q.waiters, &wait); + add_wait_queue(&q.waiter, &wait); /* * !plist_node_empty() is safe here without any lock. * q.lock_ptr != 0 is not safe, because of ordering against wakeup. -- cgit v1.2.3-70-g09d2 From f38f1d2aa5a3520cf05da7cd6bd12fe2b0c509b7 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 16 Dec 2008 23:06:40 -0500 Subject: trace: add a way to enable or disable the stack tracer Impact: enhancement to stack tracer The stack tracer currently is either on when configured in or off when it is not. It can not be disabled when it is configured on. (besides disabling the function tracer that it uses) This patch adds a way to enable or disable the stack tracer at run time. It defaults off on bootup, but a kernel parameter 'stacktrace' has been added to enable it on bootup. A new sysctl has been added "kernel.stack_tracer_enabled" to let the user enable or disable the stack tracer at run time. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- Documentation/kernel-parameters.txt | 4 +++ include/linux/ftrace.h | 8 ++++++ kernel/sysctl.c | 10 +++++++ kernel/trace/Kconfig | 13 +++++++--- kernel/trace/trace_stack.c | 52 ++++++++++++++++++++++++++++++++++--- 5 files changed, 79 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 2919a2e9193..edab81c1318 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -89,6 +89,7 @@ parameter is applicable: SPARC Sparc architecture is enabled. SWSUSP Software suspend (hibernation) is enabled. SUSPEND System suspend states are enabled. + FTRACE Function tracing enabled. TS Appropriate touchscreen support is enabled. USB USB support is enabled. USBHID USB Human Interface Device support is enabled. @@ -2173,6 +2174,9 @@ and is between 256 and 4096 characters. It is defined in the file st= [HW,SCSI] SCSI tape parameters (buffers, etc.) See Documentation/scsi/st.txt. + stacktrace [FTRACE] + Enabled the stack tracer on boot up. + sti= [PARISC,HW] Format: Set the STI (builtin display/keyboard on the HP-PARISC diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 44020f31bd8..6b0db53caa7 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -86,6 +86,14 @@ static inline void ftrace_stop(void) { } static inline void ftrace_start(void) { } #endif /* CONFIG_FUNCTION_TRACER */ +#ifdef CONFIG_STACK_TRACER +extern int stack_tracer_enabled; +int +stack_trace_sysctl(struct ctl_table *table, int write, + struct file *file, void __user *buffer, size_t *lenp, + loff_t *ppos); +#endif + #ifdef CONFIG_DYNAMIC_FTRACE /* asm/ftrace.h must be defined for archs supporting dynamic ftrace */ #include diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c83f566e940..6ac501a2dcc 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -487,6 +487,16 @@ static struct ctl_table kern_table[] = { .proc_handler = &ftrace_enable_sysctl, }, #endif +#ifdef CONFIG_STACK_TRACER + { + .ctl_name = CTL_UNNUMBERED, + .procname = "stack_tracer_enabled", + .data = &stack_tracer_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &stack_trace_sysctl, + }, +#endif #ifdef CONFIG_TRACING { .ctl_name = CTL_UNNUMBERED, diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index d8bae6f4219..e2a4ff6fc3a 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -244,10 +244,15 @@ config STACK_TRACER This tracer works by hooking into every function call that the kernel executes, and keeping a maximum stack depth value and - stack-trace saved. Because this logic has to execute in every - kernel function, all the time, this option can slow down the - kernel measurably and is generally intended for kernel - developers only. + stack-trace saved. If this is configured with DYNAMIC_FTRACE + then it will not have any overhead while the stack tracer + is disabled. + + To enable the stack tracer on bootup, pass in 'stacktrace' + on the kernel command line. + + The stack tracer can also be enabled or disabled via the + sysctl kernel.stack_tracer_enabled Say N if unsure. diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 0b863f2cbc8..4842c969c78 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include "trace.h" @@ -31,6 +32,10 @@ static raw_spinlock_t max_stack_lock = static int stack_trace_disabled __read_mostly; static DEFINE_PER_CPU(int, trace_active); +static DEFINE_MUTEX(stack_sysctl_mutex); + +int stack_tracer_enabled; +static int last_stack_tracer_enabled; static inline void check_stack(void) { @@ -174,7 +179,7 @@ stack_max_size_write(struct file *filp, const char __user *ubuf, return count; } -static struct file_operations stack_max_size_fops = { +static const struct file_operations stack_max_size_fops = { .open = tracing_open_generic, .read = stack_max_size_read, .write = stack_max_size_write, @@ -272,7 +277,7 @@ static int t_show(struct seq_file *m, void *v) return 0; } -static struct seq_operations stack_trace_seq_ops = { +static const struct seq_operations stack_trace_seq_ops = { .start = t_start, .next = t_next, .stop = t_stop, @@ -288,12 +293,48 @@ static int stack_trace_open(struct inode *inode, struct file *file) return ret; } -static struct file_operations stack_trace_fops = { +static const struct file_operations stack_trace_fops = { .open = stack_trace_open, .read = seq_read, .llseek = seq_lseek, }; +int +stack_trace_sysctl(struct ctl_table *table, int write, + struct file *file, void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + + mutex_lock(&stack_sysctl_mutex); + + ret = proc_dointvec(table, write, file, buffer, lenp, ppos); + + if (ret || !write || + (last_stack_tracer_enabled == stack_tracer_enabled)) + goto out; + + last_stack_tracer_enabled = stack_tracer_enabled; + + if (stack_tracer_enabled) + register_ftrace_function(&trace_ops); + else + unregister_ftrace_function(&trace_ops); + + out: + mutex_unlock(&stack_sysctl_mutex); + return ret; +} + +static int start_stack_trace __initdata; + +static __init int enable_stacktrace(char *str) +{ + start_stack_trace = 1; + return 1; +} +__setup("stacktrace", enable_stacktrace); + static __init int stack_trace_init(void) { struct dentry *d_tracer; @@ -311,7 +352,10 @@ static __init int stack_trace_init(void) if (!entry) pr_warning("Could not create debugfs 'stack_trace' entry\n"); - register_ftrace_function(&trace_ops); + if (start_stack_trace) { + register_ftrace_function(&trace_ops); + stack_tracer_enabled = 1; + } return 0; } -- cgit v1.2.3-70-g09d2 From e05a43b744fb9518cbf8539a7ef33164ac60a70f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 17 Dec 2008 09:43:00 -0500 Subject: trace: better use of stack_trace_enabled for boot up code Impact: clean up Andrew Morton suggested to use the stack_tracer_enabled variable to decide whether or not to start stack tracing on bootup. This lets us remove the start_stack_trace variable. Reported-by: Andrew Morton Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace_stack.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 4842c969c78..d0871bc0aca 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -308,7 +308,7 @@ stack_trace_sysctl(struct ctl_table *table, int write, mutex_lock(&stack_sysctl_mutex); - ret = proc_dointvec(table, write, file, buffer, lenp, ppos); + ret = proc_dointvec(table, write, file, buffer, lenp, ppos); if (ret || !write || (last_stack_tracer_enabled == stack_tracer_enabled)) @@ -326,11 +326,10 @@ stack_trace_sysctl(struct ctl_table *table, int write, return ret; } -static int start_stack_trace __initdata; - static __init int enable_stacktrace(char *str) { - start_stack_trace = 1; + stack_tracer_enabled = 1; + last_stack_tracer_enabled = 1; return 1; } __setup("stacktrace", enable_stacktrace); @@ -352,10 +351,8 @@ static __init int stack_trace_init(void) if (!entry) pr_warning("Could not create debugfs 'stack_trace' entry\n"); - if (start_stack_trace) { + if (stack_tracer_enabled) register_ftrace_function(&trace_ops); - stack_tracer_enabled = 1; - } return 0; } -- cgit v1.2.3-70-g09d2 From ea3a6d6d60b2504c573fe3415f6617e8310c0236 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 17 Dec 2008 15:05:36 -0500 Subject: ftrace: add not to regex on filtering functions Impact: enhancement Ingo Molnar has asked about a way to remove items from the filter lists. Currently, you can only add or replace items. The way items are added to the list is through opening one of the list files (set_ftrace_filter or set_ftrace_notrace) via append. If the file is opened for truncate, the list is cleared. echo spin_lock > /debug/tracing/set_ftrace_filter The above will replace the list with only spin_lock echo spin_lock >> /debug/tracing/set_ftrace_filter The above will add spin_lock to the list. Now this patch adds: echo '!spin_lock' >> /debug/tracing/set_ftrace_filter This will remove spin_lock from the list. The limited glob features of these lists also can be notted. echo '!spin_*' >> /debug/tracing/set_ftrace_filter This will remove all functions that start with 'spin_' Note: echo '!spin_*' > /debug/tracing/set_ftrace_filter will simply clear out the list (notice the '>' instead of '>>') Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index a12f80efcea..2f32969c09d 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1047,6 +1047,13 @@ ftrace_match(unsigned char *buff, int len, int enable) int type = MATCH_FULL; unsigned long flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; unsigned i, match = 0, search_len = 0; + int not = 0; + + if (buff[0] == '!') { + not = 1; + buff++; + len--; + } for (i = 0; i < len; i++) { if (buff[i] == '*') { @@ -1100,8 +1107,12 @@ ftrace_match(unsigned char *buff, int len, int enable) matched = 1; break; } - if (matched) - rec->flags |= flag; + if (matched) { + if (not) + rec->flags &= ~flag; + else + rec->flags |= flag; + } } pg = pg->next; } -- cgit v1.2.3-70-g09d2 From 3d9101e92529e1ff6014f95a69afc82f37b9b13a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 17 Dec 2008 22:34:13 +0100 Subject: trace: fix task state printout Impact: fix occasionally incorrect trace output The tracing code has interesting varieties of printing out task state. Unfortunalely only one of the instances is correct as it copies the code from sched.c:sched_show_task(). The others are plain wrong as they treatthe bitfield as an integer offset into the character array. Also the size check of the character array is wrong as it includes the trailing \0. Use a common state decoder inline which does the Right Thing. Signed-off-by: Thomas Gleixner Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 39 +++++++++++++++++---------------------- 1 file changed, 17 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d86e3252f30..803100518f1 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1301,6 +1301,13 @@ lat_print_timestamp(struct trace_seq *s, u64 abs_usecs, static const char state_to_char[] = TASK_STATE_TO_CHAR_STR; +static int task_state_char(unsigned long state) +{ + int bit = state ? __ffs(state) + 1 : 0; + + return bit < sizeof(state_to_char) - 1 ? state_to_char[bit] : '?'; +} + /* * The message is supposed to contain an ending newline. * If the printing stops prematurely, try to add a newline of our own. @@ -1396,12 +1403,8 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) trace_assign_type(field, entry); - T = field->next_state < sizeof(state_to_char) ? - state_to_char[field->next_state] : 'X'; - - state = field->prev_state ? - __ffs(field->prev_state) + 1 : 0; - S = state < sizeof(state_to_char) - 1 ? state_to_char[state] : 'X'; + T = task_state_char(field->next_state); + S = task_state_char(field->prev_state); comm = trace_find_cmdline(field->next_pid); trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", field->prev_pid, @@ -1519,10 +1522,8 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) trace_assign_type(field, entry); - S = field->prev_state < sizeof(state_to_char) ? - state_to_char[field->prev_state] : 'X'; - T = field->next_state < sizeof(state_to_char) ? - state_to_char[field->next_state] : 'X'; + T = task_state_char(field->next_state); + S = task_state_char(field->prev_state); ret = trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c\n", field->prev_pid, field->prev_prio, @@ -1621,12 +1622,9 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter) trace_assign_type(field, entry); - S = field->prev_state < sizeof(state_to_char) ? - state_to_char[field->prev_state] : 'X'; - T = field->next_state < sizeof(state_to_char) ? - state_to_char[field->next_state] : 'X'; - if (entry->type == TRACE_WAKE) - S = '+'; + T = task_state_char(field->next_state); + S = entry->type == TRACE_WAKE ? '+' : + task_state_char(field->prev_state); ret = trace_seq_printf(s, "%d %d %c %d %d %d %c\n", field->prev_pid, field->prev_prio, @@ -1712,12 +1710,9 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter) trace_assign_type(field, entry); - S = field->prev_state < sizeof(state_to_char) ? - state_to_char[field->prev_state] : 'X'; - T = field->next_state < sizeof(state_to_char) ? - state_to_char[field->next_state] : 'X'; - if (entry->type == TRACE_WAKE) - S = '+'; + T = task_state_char(field->next_state); + S = entry->type == TRACE_WAKE ? '+' : + task_state_char(field->prev_state); SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid); SEQ_PUT_HEX_FIELD_RET(s, field->prev_prio); SEQ_PUT_HEX_FIELD_RET(s, S); -- cgit v1.2.3-70-g09d2 From 6d102bc68f3dd2ae0e305b09170b1751aa67baeb Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 17 Dec 2008 17:48:23 +0800 Subject: tracing/ring-buffer: remove unused ring_buffer size Impact: remove dead code struct ring_buffer.size is not set after ring_buffer is initialized or resized. it is always 0. we can use "buffer->pages * PAGE_SIZE" to get ring_buffer's size Signed-off-by: Lai Jiangshan Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index eab81f918f6..bb6922a931b 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -258,7 +258,6 @@ struct ring_buffer_per_cpu { }; struct ring_buffer { - unsigned long size; unsigned pages; unsigned flags; int cpus; @@ -2210,8 +2209,7 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, return -EINVAL; /* At least make sure the two buffers are somewhat the same */ - if (buffer_a->size != buffer_b->size || - buffer_a->pages != buffer_b->pages) + if (buffer_a->pages != buffer_b->pages) return -EINVAL; cpu_buffer_a = buffer_a->buffers[cpu]; -- cgit v1.2.3-70-g09d2 From 9c2c48020ec0dd6ecd27e5a1298f73b40d85a595 Mon Sep 17 00:00:00 2001 From: Ken Chen Date: Tue, 16 Dec 2008 23:41:22 -0800 Subject: schedstat: consolidate per-task cpu runtime stats Impact: simplify code When we turn on CONFIG_SCHEDSTATS, per-task cpu runtime is accumulated twice. Once in task->se.sum_exec_runtime and once in sched_info.cpu_time. These two stats are exactly the same. Given that task->se.sum_exec_runtime is always accumulated by the core scheduler, sched_info can reuse that data instead of duplicate the accounting. Signed-off-by: Ken Chen Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- fs/proc/base.c | 2 +- include/linux/sched.h | 3 +-- kernel/delayacct.c | 2 +- kernel/sched.c | 2 ++ kernel/sched_stats.h | 5 ++--- 5 files changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/fs/proc/base.c b/fs/proc/base.c index d4677603c88..4d745bac768 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -347,7 +347,7 @@ static int proc_pid_wchan(struct task_struct *task, char *buffer) static int proc_pid_schedstat(struct task_struct *task, char *buffer) { return sprintf(buffer, "%llu %llu %lu\n", - task->sched_info.cpu_time, + task->se.sum_exec_runtime, task->sched_info.run_delay, task->sched_info.pcount); } diff --git a/include/linux/sched.h b/include/linux/sched.h index 8cccd6dc5d6..2d1e840ddd3 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -670,8 +670,7 @@ struct reclaim_state; struct sched_info { /* cumulative counters */ unsigned long pcount; /* # of times run on this cpu */ - unsigned long long cpu_time, /* time spent on the cpu */ - run_delay; /* time spent waiting on a runqueue */ + unsigned long long run_delay; /* time spent waiting on a runqueue */ /* timestamps */ unsigned long long last_arrival,/* when we last ran on a cpu */ diff --git a/kernel/delayacct.c b/kernel/delayacct.c index b3179dad71b..abb6e17505e 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -127,7 +127,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) */ t1 = tsk->sched_info.pcount; t2 = tsk->sched_info.run_delay; - t3 = tsk->sched_info.cpu_time; + t3 = tsk->se.sum_exec_runtime; d->cpu_count += t1; diff --git a/kernel/sched.c b/kernel/sched.c index f53e2b8ef52..fd835fc320b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -596,6 +596,8 @@ struct rq { #ifdef CONFIG_SCHEDSTATS /* latency stats */ struct sched_info rq_sched_info; + unsigned long long rq_cpu_time; + /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ /* sys_sched_yield() stats */ unsigned int yld_exp_empty; diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index 7dbf72a2b02..3b01098164c 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -31,7 +31,7 @@ static int show_schedstat(struct seq_file *seq, void *v) rq->yld_act_empty, rq->yld_exp_empty, rq->yld_count, rq->sched_switch, rq->sched_count, rq->sched_goidle, rq->ttwu_count, rq->ttwu_local, - rq->rq_sched_info.cpu_time, + rq->rq_cpu_time, rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount); seq_printf(seq, "\n"); @@ -123,7 +123,7 @@ static inline void rq_sched_info_depart(struct rq *rq, unsigned long long delta) { if (rq) - rq->rq_sched_info.cpu_time += delta; + rq->rq_cpu_time += delta; } static inline void @@ -236,7 +236,6 @@ static inline void sched_info_depart(struct task_struct *t) unsigned long long delta = task_rq(t)->clock - t->sched_info.last_arrival; - t->sched_info.cpu_time += delta; rq_sched_info_depart(task_rq(t), delta); if (t->state == TASK_RUNNING) -- cgit v1.2.3-70-g09d2 From 74c8a6130486bed224e960790f4aa72dd09c061e Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Wed, 17 Dec 2008 19:40:33 +0900 Subject: locking, irq: enclose irq_desc_lock_class in CONFIG_LOCKDEP Impact: simplify code commit "08678b0: generic: sparse irqs: use irq_desc() [...]" introduced the irq_desc_lock_class variable. But it is used only if CONFIG_SPARSE_IRQ=Y or CONFIG_TRACE_IRQFLAGS=Y. Otherwise, following warnings happen: CC kernel/irq/handle.o kernel/irq/handle.c:26: warning: 'irq_desc_lock_class' defined but not used Actually, current early_init_irq_lock_class has a bit strange and messy ifdef. In addition, it is not valueable. 1. this function is protected by !CONFIG_SPARSE_IRQ, but that is not necessary. if CONFIG_SPARSE_IRQ=Y, desc of all irq number are initialized by NULL at first - then this function calling is safe. 2. this function protected by CONFIG_TRACE_IRQFLAGS too. but it is not necessary either, because lockdep_set_class() doesn't have bad side effect even if CONFIG_TRACE_IRQFLAGS=n. This patch bloat kernel size a bit on CONFIG_TRACE_IRQFLAGS=n and CONFIG_SPARSE_IRQ=Y - but that's ok. early_init_irq_lock_class() is not a fastpatch at all. To avoid messy ifdefs is more important than a few bytes diet. Signed-off-by: KOSAKI Motohiro Signed-off-by: Ingo Molnar --- include/linux/lockdep.h | 2 +- kernel/irq/handle.c | 5 ----- 2 files changed, 1 insertion(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 29aec6e1002..9dba554c802 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -377,7 +377,7 @@ do { \ #endif /* CONFIG_LOCK_STAT */ -#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_GENERIC_HARDIRQS) +#ifdef CONFIG_GENERIC_HARDIRQS extern void early_init_irq_lock_class(void); #else static inline void early_init_irq_lock_class(void) diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index f1a23069c20..6492400cb50 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -422,11 +422,8 @@ out: } #endif - -#ifdef CONFIG_TRACE_IRQFLAGS void early_init_irq_lock_class(void) { -#ifndef CONFIG_SPARSE_IRQ struct irq_desc *desc; int i; @@ -436,9 +433,7 @@ void early_init_irq_lock_class(void) lockdep_set_class(&desc->lock, &irq_desc_lock_class); } -#endif } -#endif #ifdef CONFIG_SPARSE_IRQ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) -- cgit v1.2.3-70-g09d2 From 64db4cfff99c04cd5f550357edcc8780f96b54a2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 18 Dec 2008 21:55:32 +0100 Subject: "Tree RCU": scalable classic RCU implementation This patch fixes a long-standing performance bug in classic RCU that results in massive internal-to-RCU lock contention on systems with more than a few hundred CPUs. Although this patch creates a separate flavor of RCU for ease of review and patch maintenance, it is intended to replace classic RCU. This patch still handles stress better than does mainline, so I am still calling it ready for inclusion. This patch is against the -tip tree. Nevertheless, experience on an actual 1000+ CPU machine would still be most welcome. Most of the changes noted below were found while creating an rcutiny (which should permit ejecting the current rcuclassic) and while doing detailed line-by-line documentation. Updates from v9 (http://lkml.org/lkml/2008/12/2/334): o Fixes from remainder of line-by-line code walkthrough, including comment spelling, initialization, undesirable narrowing due to type conversion, removing redundant memory barriers, removing redundant local-variable initialization, and removing redundant local variables. I do not believe that any of these fixes address the CPU-hotplug issues that Andi Kleen was seeing, but please do give it a whirl in case the machine is smarter than I am. A writeup from the walkthrough may be found at the following URL, in case you are suffering from terminal insomnia or masochism: http://www.kernel.org/pub/linux/kernel/people/paulmck/tmp/rcutree-walkthrough.2008.12.16a.pdf o Made rcutree tracing use seq_file, as suggested some time ago by Lai Jiangshan. o Added a .csv variant of the rcudata debugfs trace file, to allow people having thousands of CPUs to drop the data into a spreadsheet. Tested with oocalc and gnumeric. Updated documentation to suit. Updates from v8 (http://lkml.org/lkml/2008/11/15/139): o Fix a theoretical race between grace-period initialization and force_quiescent_state() that could occur if more than three jiffies were required to carry out the grace-period initialization. Which it might, if you had enough CPUs. o Apply Ingo's printk-standardization patch. o Substitute local variables for repeated accesses to global variables. o Fix comment misspellings and redundant (but harmless) increments of ->n_rcu_pending (this latter after having explicitly added it). o Apply checkpatch fixes. Updates from v7 (http://lkml.org/lkml/2008/10/10/291): o Fixed a number of problems noted by Gautham Shenoy, including the cpu-stall-detection bug that he was having difficulty convincing me was real. ;-) o Changed cpu-stall detection to wait for ten seconds rather than three in order to reduce false positive, as suggested by Ingo Molnar. o Produced a design document (http://lwn.net/Articles/305782/). The act of writing this document uncovered a number of both theoretical and "here and now" bugs as noted below. o Fix dynticks_nesting accounting confusion, simplify WARN_ON() condition, fix kerneldoc comments, and add memory barriers in dynticks interface functions. o Add more data to tracing. o Remove unused "rcu_barrier" field from rcu_data structure. o Count calls to rcu_pending() from scheduling-clock interrupt to use as a surrogate timebase should jiffies stop counting. o Fix a theoretical race between force_quiescent_state() and grace-period initialization. Yes, initialization does have to go on for some jiffies for this race to occur, but given enough CPUs... Updates from v6 (http://lkml.org/lkml/2008/9/23/448): o Fix a number of checkpatch.pl complaints. o Apply review comments from Ingo Molnar and Lai Jiangshan on the stall-detection code. o Fix several bugs in !CONFIG_SMP builds. o Fix a misspelled config-parameter name so that RCU now announces at boot time if stall detection is configured. o Run tests on numerous combinations of configurations parameters, which after the fixes above, now build and run correctly. Updates from v5 (http://lkml.org/lkml/2008/9/15/92, bad subject line): o Fix a compiler error in the !CONFIG_FANOUT_EXACT case (blew a changeset some time ago, and finally got around to retesting this option). o Fix some tracing bugs in rcupreempt that caused incorrect totals to be printed. o I now test with a more brutal random-selection online/offline script (attached). Probably more brutal than it needs to be on the people reading it as well, but so it goes. o A number of optimizations and usability improvements: o Make rcu_pending() ignore the grace-period timeout when there is no grace period in progress. o Make force_quiescent_state() avoid going for a global lock in the case where there is no grace period in progress. o Rearrange struct fields to improve struct layout. o Make call_rcu() initiate a grace period if RCU was idle, rather than waiting for the next scheduling clock interrupt. o Invoke rcu_irq_enter() and rcu_irq_exit() only when idle, as suggested by Andi Kleen. I still don't completely trust this change, and might back it out. o Make CONFIG_RCU_TRACE be the single config variable manipulated for all forms of RCU, instead of the prior confusion. o Document tracing files and formats for both rcupreempt and rcutree. Updates from v4 for those missing v5 given its bad subject line: o Separated dynticks interface so that NMIs and irqs call separate functions, greatly simplifying it. In particular, this code no longer requires a proof of correctness. ;-) o Separated dynticks state out into its own per-CPU structure, avoiding the duplicated accounting. o The case where a dynticks-idle CPU runs an irq handler that invokes call_rcu() is now correctly handled, forcing that CPU out of dynticks-idle mode. o Review comments have been applied (thank you all!!!). For but one example, fixed the dynticks-ordering issue that Manfred pointed out, saving me much debugging. ;-) o Adjusted rcuclassic and rcupreempt to handle dynticks changes. Attached is an updated patch to Classic RCU that applies a hierarchy, greatly reducing the contention on the top-level lock for large machines. This passes 10-hour concurrent rcutorture and online-offline testing on 128-CPU ppc64 without dynticks enabled, and exposes some timekeeping bugs in presence of dynticks (exciting working on a system where "sleep 1" hangs until interrupted...), which were fixed in the 2.6.27 kernel. It is getting more reliable than mainline by some measures, so the next version will be against -tip for inclusion. See also Manfred Spraul's recent patches (or his earlier work from 2004 at http://marc.info/?l=linux-kernel&m=108546384711797&w=2). We will converge onto a common patch in the fullness of time, but are currently exploring different regions of the design space. That said, I have already gratefully stolen quite a few of Manfred's ideas. This patch provides CONFIG_RCU_FANOUT, which controls the bushiness of the RCU hierarchy. Defaults to 32 on 32-bit machines and 64 on 64-bit machines. If CONFIG_NR_CPUS is less than CONFIG_RCU_FANOUT, there is no hierarchy. By default, the RCU initialization code will adjust CONFIG_RCU_FANOUT to balance the hierarchy, so strongly NUMA architectures may choose to set CONFIG_RCU_FANOUT_EXACT to disable this balancing, allowing the hierarchy to be exactly aligned to the underlying hardware. Up to two levels of hierarchy are permitted (in addition to the root node), allowing up to 16,384 CPUs on 32-bit systems and up to 262,144 CPUs on 64-bit systems. I just know that I am going to regret saying this, but this seems more than sufficient for the foreseeable future. (Some architectures might wish to set CONFIG_RCU_FANOUT=4, which would limit such architectures to 64 CPUs. If this becomes a real problem, additional levels can be added, but I doubt that it will make a significant difference on real hardware.) In the common case, a given CPU will manipulate its private rcu_data structure and the rcu_node structure that it shares with its immediate neighbors. This can reduce both lock and memory contention by multiple orders of magnitude, which should eliminate the need for the strange manipulations that are reported to be required when running Linux on very large systems. Some shortcomings: o More bugs will probably surface as a result of an ongoing line-by-line code inspection. Patches will be provided as required. o There are probably hangs, rcutorture failures, &c. Seems quite stable on a 128-CPU machine, but that is kind of small compared to 4096 CPUs. However, seems to do better than mainline. Patches will be provided as required. o The memory footprint of this version is several KB larger than rcuclassic. A separate UP-only rcutiny patch will be provided, which will reduce the memory footprint significantly, even compared to the old rcuclassic. One such patch passes light testing, and has a memory footprint smaller even than rcuclassic. Initial reaction from various embedded guys was "it is not worth it", so am putting it aside. Credits: o Manfred Spraul for ideas, review comments, and bugs spotted, as well as some good friendly competition. ;-) o Josh Triplett, Ingo Molnar, Peter Zijlstra, Mathieu Desnoyers, Lai Jiangshan, Andi Kleen, Andy Whitcroft, and Andrew Morton for reviews and comments. o Thomas Gleixner for much-needed help with some timer issues (see patches below). o Jon M. Tollefson, Tim Pepper, Andrew Theurer, Jose R. Santos, Andy Whitcroft, Darrick Wong, Nishanth Aravamudan, Anton Blanchard, Dave Kleikamp, and Nathan Lynch for keeping machines alive despite my heavy abuse^Wtesting. Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- Documentation/RCU/00-INDEX | 2 + Documentation/RCU/trace.txt | 413 +++++++++ arch/powerpc/platforms/pseries/rtasd.c | 4 + include/linux/hardirq.h | 14 +- include/linux/rcupdate.h | 10 +- include/linux/rcutree.h | 329 +++++++ init/Kconfig | 18 +- kernel/Kconfig.preempt | 62 +- kernel/Makefile | 6 +- kernel/rcupreempt.c | 10 + kernel/rcupreempt_trace.c | 10 +- kernel/rcutree.c | 1535 ++++++++++++++++++++++++++++++++ kernel/rcutree_trace.c | 271 ++++++ kernel/softirq.c | 5 +- lib/Kconfig.debug | 13 + 15 files changed, 2671 insertions(+), 31 deletions(-) create mode 100644 Documentation/RCU/trace.txt create mode 100644 include/linux/rcutree.h create mode 100644 kernel/rcutree.c create mode 100644 kernel/rcutree_trace.c (limited to 'kernel') diff --git a/Documentation/RCU/00-INDEX b/Documentation/RCU/00-INDEX index 461481dfb7c..7dc0695a8f9 100644 --- a/Documentation/RCU/00-INDEX +++ b/Documentation/RCU/00-INDEX @@ -16,6 +16,8 @@ RTFP.txt - List of RCU papers (bibliography) going back to 1980. torture.txt - RCU Torture Test Operation (CONFIG_RCU_TORTURE_TEST) +trace.txt + - CONFIG_RCU_TRACE debugfs files and formats UP.txt - RCU on Uniprocessor Systems whatisRCU.txt diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt new file mode 100644 index 00000000000..068848240a8 --- /dev/null +++ b/Documentation/RCU/trace.txt @@ -0,0 +1,413 @@ +CONFIG_RCU_TRACE debugfs Files and Formats + + +The rcupreempt and rcutree implementations of RCU provide debugfs trace +output that summarizes counters and state. This information is useful for +debugging RCU itself, and can sometimes also help to debug abuses of RCU. +Note that the rcuclassic implementation of RCU does not provide debugfs +trace output. + +The following sections describe the debugfs files and formats for +preemptable RCU (rcupreempt) and hierarchical RCU (rcutree). + + +Preemptable RCU debugfs Files and Formats + +This implementation of RCU provides three debugfs files under the +top-level directory RCU: rcu/rcuctrs (which displays the per-CPU +counters used by preemptable RCU) rcu/rcugp (which displays grace-period +counters), and rcu/rcustats (which internal counters for debugging RCU). + +The output of "cat rcu/rcuctrs" looks as follows: + +CPU last cur F M + 0 5 -5 0 0 + 1 -1 0 0 0 + 2 0 1 0 0 + 3 0 1 0 0 + 4 0 1 0 0 + 5 0 1 0 0 + 6 0 2 0 0 + 7 0 -1 0 0 + 8 0 1 0 0 +ggp = 26226, state = waitzero + +The per-CPU fields are as follows: + +o "CPU" gives the CPU number. Offline CPUs are not displayed. + +o "last" gives the value of the counter that is being decremented + for the current grace period phase. In the example above, + the counters sum to 4, indicating that there are still four + RCU read-side critical sections still running that started + before the last counter flip. + +o "cur" gives the value of the counter that is currently being + both incremented (by rcu_read_lock()) and decremented (by + rcu_read_unlock()). In the example above, the counters sum to + 1, indicating that there is only one RCU read-side critical section + still running that started after the last counter flip. + +o "F" indicates whether RCU is waiting for this CPU to acknowledge + a counter flip. In the above example, RCU is not waiting on any, + which is consistent with the state being "waitzero" rather than + "waitack". + +o "M" indicates whether RCU is waiting for this CPU to execute a + memory barrier. In the above example, RCU is not waiting on any, + which is consistent with the state being "waitzero" rather than + "waitmb". + +o "ggp" is the global grace-period counter. + +o "state" is the RCU state, which can be one of the following: + + o "idle": there is no grace period in progress. + + o "waitack": RCU just incremented the global grace-period + counter, which has the effect of reversing the roles of + the "last" and "cur" counters above, and is waiting for + all the CPUs to acknowledge the flip. Once the flip has + been acknowledged, CPUs will no longer be incrementing + what are now the "last" counters, so that their sum will + decrease monotonically down to zero. + + o "waitzero": RCU is waiting for the sum of the "last" counters + to decrease to zero. + + o "waitmb": RCU is waiting for each CPU to execute a memory + barrier, which ensures that instructions from a given CPU's + last RCU read-side critical section cannot be reordered + with instructions following the memory-barrier instruction. + +The output of "cat rcu/rcugp" looks as follows: + +oldggp=48870 newggp=48873 + +Note that reading from this file provokes a synchronize_rcu(). The +"oldggp" value is that of "ggp" from rcu/rcuctrs above, taken before +executing the synchronize_rcu(), and the "newggp" value is also the +"ggp" value, but taken after the synchronize_rcu() command returns. + + +The output of "cat rcu/rcugp" looks as follows: + +na=1337955 nl=40 wa=1337915 wl=44 da=1337871 dl=0 dr=1337871 di=1337871 +1=50989 e1=6138 i1=49722 ie1=82 g1=49640 a1=315203 ae1=265563 a2=49640 +z1=1401244 ze1=1351605 z2=49639 m1=5661253 me1=5611614 m2=49639 + +These are counters tracking internal preemptable-RCU events, however, +some of them may be useful for debugging algorithms using RCU. In +particular, the "nl", "wl", and "dl" values track the number of RCU +callbacks in various states. The fields are as follows: + +o "na" is the total number of RCU callbacks that have been enqueued + since boot. + +o "nl" is the number of RCU callbacks waiting for the previous + grace period to end so that they can start waiting on the next + grace period. + +o "wa" is the total number of RCU callbacks that have started waiting + for a grace period since boot. "na" should be roughly equal to + "nl" plus "wa". + +o "wl" is the number of RCU callbacks currently waiting for their + grace period to end. + +o "da" is the total number of RCU callbacks whose grace periods + have completed since boot. "wa" should be roughly equal to + "wl" plus "da". + +o "dr" is the total number of RCU callbacks that have been removed + from the list of callbacks ready to invoke. "dr" should be roughly + equal to "da". + +o "di" is the total number of RCU callbacks that have been invoked + since boot. "di" should be roughly equal to "da", though some + early versions of preemptable RCU had a bug so that only the + last CPU's count of invocations was displayed, rather than the + sum of all CPU's counts. + +o "1" is the number of calls to rcu_try_flip(). This should be + roughly equal to the sum of "e1", "i1", "a1", "z1", and "m1" + described below. In other words, the number of times that + the state machine is visited should be equal to the sum of the + number of times that each state is visited plus the number of + times that the state-machine lock acquisition failed. + +o "e1" is the number of times that rcu_try_flip() was unable to + acquire the fliplock. + +o "i1" is the number of calls to rcu_try_flip_idle(). + +o "ie1" is the number of times rcu_try_flip_idle() exited early + due to the calling CPU having no work for RCU. + +o "g1" is the number of times that rcu_try_flip_idle() decided + to start a new grace period. "i1" should be roughly equal to + "ie1" plus "g1". + +o "a1" is the number of calls to rcu_try_flip_waitack(). + +o "ae1" is the number of times that rcu_try_flip_waitack() found + that at least one CPU had not yet acknowledge the new grace period + (AKA "counter flip"). + +o "a2" is the number of time rcu_try_flip_waitack() found that + all CPUs had acknowledged. "a1" should be roughly equal to + "ae1" plus "a2". (This particular output was collected on + a 128-CPU machine, hence the smaller-than-usual fraction of + calls to rcu_try_flip_waitack() finding all CPUs having already + acknowledged.) + +o "z1" is the number of calls to rcu_try_flip_waitzero(). + +o "ze1" is the number of times that rcu_try_flip_waitzero() found + that not all of the old RCU read-side critical sections had + completed. + +o "z2" is the number of times that rcu_try_flip_waitzero() finds + the sum of the counters equal to zero, in other words, that + all of the old RCU read-side critical sections had completed. + The value of "z1" should be roughly equal to "ze1" plus + "z2". + +o "m1" is the number of calls to rcu_try_flip_waitmb(). + +o "me1" is the number of times that rcu_try_flip_waitmb() finds + that at least one CPU has not yet executed a memory barrier. + +o "m2" is the number of times that rcu_try_flip_waitmb() finds that + all CPUs have executed a memory barrier. + + +Hierarchical RCU debugfs Files and Formats + +This implementation of RCU provides three debugfs files under the +top-level directory RCU: rcu/rcudata (which displays fields in struct +rcu_data), rcu/rcugp (which displays grace-period counters), and +rcu/rcuhier (which displays the struct rcu_node hierarchy). + +The output of "cat rcu/rcudata" looks as follows: + +rcu: + 0 c=4011 g=4012 pq=1 pqc=4011 qp=0 rpfq=1 rp=3c2a dt=23301/73 dn=2 df=1882 of=0 ri=2126 ql=2 b=10 + 1 c=4011 g=4012 pq=1 pqc=4011 qp=0 rpfq=3 rp=39a6 dt=78073/1 dn=2 df=1402 of=0 ri=1875 ql=46 b=10 + 2 c=4010 g=4010 pq=1 pqc=4010 qp=0 rpfq=-5 rp=1d12 dt=16646/0 dn=2 df=3140 of=0 ri=2080 ql=0 b=10 + 3 c=4012 g=4013 pq=1 pqc=4012 qp=1 rpfq=3 rp=2b50 dt=21159/1 dn=2 df=2230 of=0 ri=1923 ql=72 b=10 + 4 c=4012 g=4013 pq=1 pqc=4012 qp=1 rpfq=3 rp=1644 dt=5783/1 dn=2 df=3348 of=0 ri=2805 ql=7 b=10 + 5 c=4012 g=4013 pq=0 pqc=4011 qp=1 rpfq=3 rp=1aac dt=5879/1 dn=2 df=3140 of=0 ri=2066 ql=10 b=10 + 6 c=4012 g=4013 pq=1 pqc=4012 qp=1 rpfq=3 rp=ed8 dt=5847/1 dn=2 df=3797 of=0 ri=1266 ql=10 b=10 + 7 c=4012 g=4013 pq=1 pqc=4012 qp=1 rpfq=3 rp=1fa2 dt=6199/1 dn=2 df=2795 of=0 ri=2162 ql=28 b=10 +rcu_bh: + 0 c=-268 g=-268 pq=1 pqc=-268 qp=0 rpfq=-145 rp=21d6 dt=23301/73 dn=2 df=0 of=0 ri=0 ql=0 b=10 + 1 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-170 rp=20ce dt=78073/1 dn=2 df=26 of=0 ri=5 ql=0 b=10 + 2 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-83 rp=fbd dt=16646/0 dn=2 df=28 of=0 ri=4 ql=0 b=10 + 3 c=-268 g=-268 pq=1 pqc=-268 qp=0 rpfq=-105 rp=178c dt=21159/1 dn=2 df=28 of=0 ri=2 ql=0 b=10 + 4 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-30 rp=b54 dt=5783/1 dn=2 df=32 of=0 ri=0 ql=0 b=10 + 5 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-29 rp=df5 dt=5879/1 dn=2 df=30 of=0 ri=3 ql=0 b=10 + 6 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-28 rp=788 dt=5847/1 dn=2 df=32 of=0 ri=0 ql=0 b=10 + 7 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-53 rp=1098 dt=6199/1 dn=2 df=30 of=0 ri=3 ql=0 b=10 + +The first section lists the rcu_data structures for rcu, the second for +rcu_bh. Each section has one line per CPU, or eight for this 8-CPU system. +The fields are as follows: + +o The number at the beginning of each line is the CPU number. + CPUs numbers followed by an exclamation mark are offline, + but have been online at least once since boot. There will be + no output for CPUs that have never been online, which can be + a good thing in the surprisingly common case where NR_CPUS is + substantially larger than the number of actual CPUs. + +o "c" is the count of grace periods that this CPU believes have + completed. CPUs in dynticks idle mode may lag quite a ways + behind, for example, CPU 4 under "rcu" above, which has slept + through the past 25 RCU grace periods. It is not unusual to + see CPUs lagging by thousands of grace periods. + +o "g" is the count of grace periods that this CPU believes have + started. Again, CPUs in dynticks idle mode may lag behind. + If the "c" and "g" values are equal, this CPU has already + reported a quiescent state for the last RCU grace period that + it is aware of, otherwise, the CPU believes that it owes RCU a + quiescent state. + +o "pq" indicates that this CPU has passed through a quiescent state + for the current grace period. It is possible for "pq" to be + "1" and "c" different than "g", which indicates that although + the CPU has passed through a quiescent state, either (1) this + CPU has not yet reported that fact, (2) some other CPU has not + yet reported for this grace period, or (3) both. + +o "pqc" indicates which grace period the last-observed quiescent + state for this CPU corresponds to. This is important for handling + the race between CPU 0 reporting an extended dynticks-idle + quiescent state for CPU 1 and CPU 1 suddenly waking up and + reporting its own quiescent state. If CPU 1 was the last CPU + for the current grace period, then the CPU that loses this race + will attempt to incorrectly mark CPU 1 as having checked in for + the next grace period! + +o "qp" indicates that RCU still expects a quiescent state from + this CPU. + +o "rpfq" is the number of rcu_pending() calls on this CPU required + to induce this CPU to invoke force_quiescent_state(). + +o "rp" is low-order four hex digits of the count of how many times + rcu_pending() has been invoked on this CPU. + +o "dt" is the current value of the dyntick counter that is incremented + when entering or leaving dynticks idle state, either by the + scheduler or by irq. The number after the "/" is the interrupt + nesting depth when in dyntick-idle state, or one greater than + the interrupt-nesting depth otherwise. + + This field is displayed only for CONFIG_NO_HZ kernels. + +o "dn" is the current value of the dyntick counter that is incremented + when entering or leaving dynticks idle state via NMI. If both + the "dt" and "dn" values are even, then this CPU is in dynticks + idle mode and may be ignored by RCU. If either of these two + counters is odd, then RCU must be alert to the possibility of + an RCU read-side critical section running on this CPU. + + This field is displayed only for CONFIG_NO_HZ kernels. + +o "df" is the number of times that some other CPU has forced a + quiescent state on behalf of this CPU due to this CPU being in + dynticks-idle state. + + This field is displayed only for CONFIG_NO_HZ kernels. + +o "of" is the number of times that some other CPU has forced a + quiescent state on behalf of this CPU due to this CPU being + offline. In a perfect world, this might neve happen, but it + turns out that offlining and onlining a CPU can take several grace + periods, and so there is likely to be an extended period of time + when RCU believes that the CPU is online when it really is not. + Please note that erring in the other direction (RCU believing a + CPU is offline when it is really alive and kicking) is a fatal + error, so it makes sense to err conservatively. + +o "ri" is the number of times that RCU has seen fit to send a + reschedule IPI to this CPU in order to get it to report a + quiescent state. + +o "ql" is the number of RCU callbacks currently residing on + this CPU. This is the total number of callbacks, regardless + of what state they are in (new, waiting for grace period to + start, waiting for grace period to end, ready to invoke). + +o "b" is the batch limit for this CPU. If more than this number + of RCU callbacks is ready to invoke, then the remainder will + be deferred. + + +The output of "cat rcu/rcugp" looks as follows: + +rcu: completed=33062 gpnum=33063 +rcu_bh: completed=464 gpnum=464 + +Again, this output is for both "rcu" and "rcu_bh". The fields are +taken from the rcu_state structure, and are as follows: + +o "completed" is the number of grace periods that have completed. + It is comparable to the "c" field from rcu/rcudata in that a + CPU whose "c" field matches the value of "completed" is aware + that the corresponding RCU grace period has completed. + +o "gpnum" is the number of grace periods that have started. It is + comparable to the "g" field from rcu/rcudata in that a CPU + whose "g" field matches the value of "gpnum" is aware that the + corresponding RCU grace period has started. + + If these two fields are equal (as they are for "rcu_bh" above), + then there is no grace period in progress, in other words, RCU + is idle. On the other hand, if the two fields differ (as they + do for "rcu" above), then an RCU grace period is in progress. + + +The output of "cat rcu/rcuhier" looks as follows, with very long lines: + +c=6902 g=6903 s=2 jfq=3 j=72c7 nfqs=13142/nfqsng=0(13142) fqlh=6 +1/1 0:127 ^0 +3/3 0:35 ^0 0/0 36:71 ^1 0/0 72:107 ^2 0/0 108:127 ^3 +3/3f 0:5 ^0 2/3 6:11 ^1 0/0 12:17 ^2 0/0 18:23 ^3 0/0 24:29 ^4 0/0 30:35 ^5 0/0 36:41 ^0 0/0 42:47 ^1 0/0 48:53 ^2 0/0 54:59 ^3 0/0 60:65 ^4 0/0 66:71 ^5 0/0 72:77 ^0 0/0 78:83 ^1 0/0 84:89 ^2 0/0 90:95 ^3 0/0 96:101 ^4 0/0 102:107 ^5 0/0 108:113 ^0 0/0 114:119 ^1 0/0 120:125 ^2 0/0 126:127 ^3 +rcu_bh: +c=-226 g=-226 s=1 jfq=-5701 j=72c7 nfqs=88/nfqsng=0(88) fqlh=0 +0/1 0:127 ^0 +0/3 0:35 ^0 0/0 36:71 ^1 0/0 72:107 ^2 0/0 108:127 ^3 +0/3f 0:5 ^0 0/3 6:11 ^1 0/0 12:17 ^2 0/0 18:23 ^3 0/0 24:29 ^4 0/0 30:35 ^5 0/0 36:41 ^0 0/0 42:47 ^1 0/0 48:53 ^2 0/0 54:59 ^3 0/0 60:65 ^4 0/0 66:71 ^5 0/0 72:77 ^0 0/0 78:83 ^1 0/0 84:89 ^2 0/0 90:95 ^3 0/0 96:101 ^4 0/0 102:107 ^5 0/0 108:113 ^0 0/0 114:119 ^1 0/0 120:125 ^2 0/0 126:127 ^3 + +This is once again split into "rcu" and "rcu_bh" portions. The fields are +as follows: + +o "c" is exactly the same as "completed" under rcu/rcugp. + +o "g" is exactly the same as "gpnum" under rcu/rcugp. + +o "s" is the "signaled" state that drives force_quiescent_state()'s + state machine. + +o "jfq" is the number of jiffies remaining for this grace period + before force_quiescent_state() is invoked to help push things + along. Note that CPUs in dyntick-idle mode thoughout the grace + period will not report on their own, but rather must be check by + some other CPU via force_quiescent_state(). + +o "j" is the low-order four hex digits of the jiffies counter. + Yes, Paul did run into a number of problems that turned out to + be due to the jiffies counter no longer counting. Why do you ask? + +o "nfqs" is the number of calls to force_quiescent_state() since + boot. + +o "nfqsng" is the number of useless calls to force_quiescent_state(), + where there wasn't actually a grace period active. This can + happen due to races. The number in parentheses is the difference + between "nfqs" and "nfqsng", or the number of times that + force_quiescent_state() actually did some real work. + +o "fqlh" is the number of calls to force_quiescent_state() that + exited immediately (without even being counted in nfqs above) + due to contention on ->fqslock. + +o Each element of the form "1/1 0:127 ^0" represents one struct + rcu_node. Each line represents one level of the hierarchy, from + root to leaves. It is best to think of the rcu_data structures + as forming yet another level after the leaves. Note that there + might be either one, two, or three levels of rcu_node structures, + depending on the relationship between CONFIG_RCU_FANOUT and + CONFIG_NR_CPUS. + + o The numbers separated by the "/" are the qsmask followed + by the qsmaskinit. The qsmask will have one bit + set for each entity in the next lower level that + has not yet checked in for the current grace period. + The qsmaskinit will have one bit for each entity that is + currently expected to check in during each grace period. + The value of qsmaskinit is assigned to that of qsmask + at the beginning of each grace period. + + For example, for "rcu", the qsmask of the first entry + of the lowest level is 0x14, meaning that we are still + waiting for CPUs 2 and 4 to check in for the current + grace period. + + o The numbers separated by the ":" are the range of CPUs + served by this struct rcu_node. This can be helpful + in working out how the hierarchy is wired together. + + For example, the first entry at the lowest level shows + "0:5", indicating that it covers CPUs 0 through 5. + + o The number after the "^" indicates the bit in the + next higher level rcu_node structure that this + rcu_node structure corresponds to. + + For example, the first entry at the lowest level shows + "^0", indicating that it corresponds to bit zero in + the first entry at the middle level. diff --git a/arch/powerpc/platforms/pseries/rtasd.c b/arch/powerpc/platforms/pseries/rtasd.c index f4e55be2eea..afad9f5ac0a 100644 --- a/arch/powerpc/platforms/pseries/rtasd.c +++ b/arch/powerpc/platforms/pseries/rtasd.c @@ -208,6 +208,7 @@ void pSeries_log_error(char *buf, unsigned int err_type, int fatal) break; case ERR_TYPE_KERNEL_PANIC: default: + WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */ spin_unlock_irqrestore(&rtasd_log_lock, s); return; } @@ -227,6 +228,7 @@ void pSeries_log_error(char *buf, unsigned int err_type, int fatal) /* Check to see if we need to or have stopped logging */ if (fatal || !logging_enabled) { logging_enabled = 0; + WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */ spin_unlock_irqrestore(&rtasd_log_lock, s); return; } @@ -249,11 +251,13 @@ void pSeries_log_error(char *buf, unsigned int err_type, int fatal) else rtas_log_start += 1; + WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */ spin_unlock_irqrestore(&rtasd_log_lock, s); wake_up_interruptible(&rtas_log_wait); break; case ERR_TYPE_KERNEL_PANIC: default: + WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */ spin_unlock_irqrestore(&rtasd_log_lock, s); return; } diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index 181006cc94a..9b70b923169 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -118,13 +118,17 @@ static inline void account_system_vtime(struct task_struct *tsk) } #endif -#if defined(CONFIG_PREEMPT_RCU) && defined(CONFIG_NO_HZ) +#if defined(CONFIG_NO_HZ) && !defined(CONFIG_CLASSIC_RCU) extern void rcu_irq_enter(void); extern void rcu_irq_exit(void); +extern void rcu_nmi_enter(void); +extern void rcu_nmi_exit(void); #else # define rcu_irq_enter() do { } while (0) # define rcu_irq_exit() do { } while (0) -#endif /* CONFIG_PREEMPT_RCU */ +# define rcu_nmi_enter() do { } while (0) +# define rcu_nmi_exit() do { } while (0) +#endif /* #if defined(CONFIG_NO_HZ) && !defined(CONFIG_CLASSIC_RCU) */ /* * It is safe to do non-atomic ops on ->hardirq_context, @@ -134,7 +138,6 @@ extern void rcu_irq_exit(void); */ #define __irq_enter() \ do { \ - rcu_irq_enter(); \ account_system_vtime(current); \ add_preempt_count(HARDIRQ_OFFSET); \ trace_hardirq_enter(); \ @@ -153,7 +156,6 @@ extern void irq_enter(void); trace_hardirq_exit(); \ account_system_vtime(current); \ sub_preempt_count(HARDIRQ_OFFSET); \ - rcu_irq_exit(); \ } while (0) /* @@ -161,7 +163,7 @@ extern void irq_enter(void); */ extern void irq_exit(void); -#define nmi_enter() do { lockdep_off(); __irq_enter(); } while (0) -#define nmi_exit() do { __irq_exit(); lockdep_on(); } while (0) +#define nmi_enter() do { lockdep_off(); rcu_nmi_enter(); __irq_enter(); } while (0) +#define nmi_exit() do { __irq_exit(); rcu_nmi_exit(); lockdep_on(); } while (0) #endif /* LINUX_HARDIRQ_H */ diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 86f1f5e43e3..bfd289aff57 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -52,11 +52,15 @@ struct rcu_head { void (*func)(struct rcu_head *head); }; -#ifdef CONFIG_CLASSIC_RCU +#if defined(CONFIG_CLASSIC_RCU) #include -#else /* #ifdef CONFIG_CLASSIC_RCU */ +#elif defined(CONFIG_TREE_RCU) +#include +#elif defined(CONFIG_PREEMPT_RCU) #include -#endif /* #else #ifdef CONFIG_CLASSIC_RCU */ +#else +#error "Unknown RCU implementation specified to kernel configuration" +#endif /* #else #if defined(CONFIG_CLASSIC_RCU) */ #define RCU_HEAD_INIT { .next = NULL, .func = NULL } #define RCU_HEAD(head) struct rcu_head head = RCU_HEAD_INIT diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h new file mode 100644 index 00000000000..d4368b7975c --- /dev/null +++ b/include/linux/rcutree.h @@ -0,0 +1,329 @@ +/* + * Read-Copy Update mechanism for mutual exclusion (tree-based version) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright IBM Corporation, 2008 + * + * Author: Dipankar Sarma + * Paul E. McKenney Hierarchical algorithm + * + * Based on the original work by Paul McKenney + * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. + * + * For detailed explanation of Read-Copy Update mechanism see - + * Documentation/RCU + */ + +#ifndef __LINUX_RCUTREE_H +#define __LINUX_RCUTREE_H + +#include +#include +#include +#include +#include +#include + +/* + * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT. + * In theory, it should be possible to add more levels straightforwardly. + * In practice, this has not been tested, so there is probably some + * bug somewhere. + */ +#define MAX_RCU_LVLS 3 +#define RCU_FANOUT (CONFIG_RCU_FANOUT) +#define RCU_FANOUT_SQ (RCU_FANOUT * RCU_FANOUT) +#define RCU_FANOUT_CUBE (RCU_FANOUT_SQ * RCU_FANOUT) + +#if NR_CPUS <= RCU_FANOUT +# define NUM_RCU_LVLS 1 +# define NUM_RCU_LVL_0 1 +# define NUM_RCU_LVL_1 (NR_CPUS) +# define NUM_RCU_LVL_2 0 +# define NUM_RCU_LVL_3 0 +#elif NR_CPUS <= RCU_FANOUT_SQ +# define NUM_RCU_LVLS 2 +# define NUM_RCU_LVL_0 1 +# define NUM_RCU_LVL_1 (((NR_CPUS) + RCU_FANOUT - 1) / RCU_FANOUT) +# define NUM_RCU_LVL_2 (NR_CPUS) +# define NUM_RCU_LVL_3 0 +#elif NR_CPUS <= RCU_FANOUT_CUBE +# define NUM_RCU_LVLS 3 +# define NUM_RCU_LVL_0 1 +# define NUM_RCU_LVL_1 (((NR_CPUS) + RCU_FANOUT_SQ - 1) / RCU_FANOUT_SQ) +# define NUM_RCU_LVL_2 (((NR_CPUS) + (RCU_FANOUT) - 1) / (RCU_FANOUT)) +# define NUM_RCU_LVL_3 NR_CPUS +#else +# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" +#endif /* #if (NR_CPUS) <= RCU_FANOUT */ + +#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3) +#define NUM_RCU_NODES (RCU_SUM - NR_CPUS) + +/* + * Dynticks per-CPU state. + */ +struct rcu_dynticks { + int dynticks_nesting; /* Track nesting level, sort of. */ + int dynticks; /* Even value for dynticks-idle, else odd. */ + int dynticks_nmi; /* Even value for either dynticks-idle or */ + /* not in nmi handler, else odd. So this */ + /* remains even for nmi from irq handler. */ +}; + +/* + * Definition for node within the RCU grace-period-detection hierarchy. + */ +struct rcu_node { + spinlock_t lock; + unsigned long qsmask; /* CPUs or groups that need to switch in */ + /* order for current grace period to proceed.*/ + unsigned long qsmaskinit; + /* Per-GP initialization for qsmask. */ + unsigned long grpmask; /* Mask to apply to parent qsmask. */ + int grplo; /* lowest-numbered CPU or group here. */ + int grphi; /* highest-numbered CPU or group here. */ + u8 grpnum; /* CPU/group number for next level up. */ + u8 level; /* root is at level 0. */ + struct rcu_node *parent; +} ____cacheline_internodealigned_in_smp; + +/* Index values for nxttail array in struct rcu_data. */ +#define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */ +#define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */ +#define RCU_NEXT_READY_TAIL 2 /* Also RCU_NEXT head. */ +#define RCU_NEXT_TAIL 3 +#define RCU_NEXT_SIZE 4 + +/* Per-CPU data for read-copy update. */ +struct rcu_data { + /* 1) quiescent-state and grace-period handling : */ + long completed; /* Track rsp->completed gp number */ + /* in order to detect GP end. */ + long gpnum; /* Highest gp number that this CPU */ + /* is aware of having started. */ + long passed_quiesc_completed; + /* Value of completed at time of qs. */ + bool passed_quiesc; /* User-mode/idle loop etc. */ + bool qs_pending; /* Core waits for quiesc state. */ + bool beenonline; /* CPU online at least once. */ + struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ + unsigned long grpmask; /* Mask to apply to leaf qsmask. */ + + /* 2) batch handling */ + /* + * If nxtlist is not NULL, it is partitioned as follows. + * Any of the partitions might be empty, in which case the + * pointer to that partition will be equal to the pointer for + * the following partition. When the list is empty, all of + * the nxttail elements point to nxtlist, which is NULL. + * + * [*nxttail[RCU_NEXT_READY_TAIL], NULL = *nxttail[RCU_NEXT_TAIL]): + * Entries that might have arrived after current GP ended + * [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]): + * Entries known to have arrived before current GP ended + * [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]): + * Entries that batch # <= ->completed - 1: waiting for current GP + * [nxtlist, *nxttail[RCU_DONE_TAIL]): + * Entries that batch # <= ->completed + * The grace period for these entries has completed, and + * the other grace-period-completed entries may be moved + * here temporarily in rcu_process_callbacks(). + */ + struct rcu_head *nxtlist; + struct rcu_head **nxttail[RCU_NEXT_SIZE]; + long qlen; /* # of queued callbacks */ + long blimit; /* Upper limit on a processed batch */ + +#ifdef CONFIG_NO_HZ + /* 3) dynticks interface. */ + struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */ + int dynticks_snap; /* Per-GP tracking for dynticks. */ + int dynticks_nmi_snap; /* Per-GP tracking for dynticks_nmi. */ +#endif /* #ifdef CONFIG_NO_HZ */ + + /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ +#ifdef CONFIG_NO_HZ + unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */ +#endif /* #ifdef CONFIG_NO_HZ */ + unsigned long offline_fqs; /* Kicked due to being offline. */ + unsigned long resched_ipi; /* Sent a resched IPI. */ + + /* 5) state to allow this CPU to force_quiescent_state on others */ + long n_rcu_pending; /* rcu_pending() calls since boot. */ + long n_rcu_pending_force_qs; /* when to force quiescent states. */ + + int cpu; +}; + +/* Values for signaled field in struct rcu_state. */ +#define RCU_GP_INIT 0 /* Grace period being initialized. */ +#define RCU_SAVE_DYNTICK 1 /* Need to scan dyntick state. */ +#define RCU_FORCE_QS 2 /* Need to force quiescent state. */ +#ifdef CONFIG_NO_HZ +#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK +#else /* #ifdef CONFIG_NO_HZ */ +#define RCU_SIGNAL_INIT RCU_FORCE_QS +#endif /* #else #ifdef CONFIG_NO_HZ */ + +#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR +#define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ) /* for rsp->jiffies_stall */ +#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ) /* for rsp->jiffies_stall */ +#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ + /* to take at least one */ + /* scheduling clock irq */ + /* before ratting on them. */ + +#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ + +/* + * RCU global state, including node hierarchy. This hierarchy is + * represented in "heap" form in a dense array. The root (first level) + * of the hierarchy is in ->node[0] (referenced by ->level[0]), the second + * level in ->node[1] through ->node[m] (->node[1] referenced by ->level[1]), + * and the third level in ->node[m+1] and following (->node[m+1] referenced + * by ->level[2]). The number of levels is determined by the number of + * CPUs and by CONFIG_RCU_FANOUT. Small systems will have a "hierarchy" + * consisting of a single rcu_node. + */ +struct rcu_state { + struct rcu_node node[NUM_RCU_NODES]; /* Hierarchy. */ + struct rcu_node *level[NUM_RCU_LVLS]; /* Hierarchy levels. */ + u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */ + u8 levelspread[NUM_RCU_LVLS]; /* kids/node in each level. */ + struct rcu_data *rda[NR_CPUS]; /* array of rdp pointers. */ + + /* The following fields are guarded by the root rcu_node's lock. */ + + u8 signaled ____cacheline_internodealigned_in_smp; + /* Force QS state. */ + long gpnum; /* Current gp number. */ + long completed; /* # of last completed gp. */ + spinlock_t onofflock; /* exclude on/offline and */ + /* starting new GP. */ + spinlock_t fqslock; /* Only one task forcing */ + /* quiescent states. */ + unsigned long jiffies_force_qs; /* Time at which to invoke */ + /* force_quiescent_state(). */ + unsigned long n_force_qs; /* Number of calls to */ + /* force_quiescent_state(). */ + unsigned long n_force_qs_lh; /* ~Number of calls leaving */ + /* due to lock unavailable. */ + unsigned long n_force_qs_ngp; /* Number of calls leaving */ + /* due to no GP active. */ +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR + unsigned long gp_start; /* Time at which GP started, */ + /* but in jiffies. */ + unsigned long jiffies_stall; /* Time at which to check */ + /* for CPU stalls. */ +#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ +#ifdef CONFIG_NO_HZ + long dynticks_completed; /* Value of completed @ snap. */ +#endif /* #ifdef CONFIG_NO_HZ */ +}; + +extern struct rcu_state rcu_state; +DECLARE_PER_CPU(struct rcu_data, rcu_data); + +extern struct rcu_state rcu_bh_state; +DECLARE_PER_CPU(struct rcu_data, rcu_bh_data); + +/* + * Increment the quiescent state counter. + * The counter is a bit degenerated: We do not need to know + * how many quiescent states passed, just if there was at least + * one since the start of the grace period. Thus just a flag. + */ +static inline void rcu_qsctr_inc(int cpu) +{ + struct rcu_data *rdp = &per_cpu(rcu_data, cpu); + rdp->passed_quiesc = 1; + rdp->passed_quiesc_completed = rdp->completed; +} +static inline void rcu_bh_qsctr_inc(int cpu) +{ + struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); + rdp->passed_quiesc = 1; + rdp->passed_quiesc_completed = rdp->completed; +} + +extern int rcu_pending(int cpu); +extern int rcu_needs_cpu(int cpu); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +extern struct lockdep_map rcu_lock_map; +# define rcu_read_acquire() \ + lock_acquire(&rcu_lock_map, 0, 0, 2, 1, NULL, _THIS_IP_) +# define rcu_read_release() lock_release(&rcu_lock_map, 1, _THIS_IP_) +#else +# define rcu_read_acquire() do { } while (0) +# define rcu_read_release() do { } while (0) +#endif + +static inline void __rcu_read_lock(void) +{ + preempt_disable(); + __acquire(RCU); + rcu_read_acquire(); +} +static inline void __rcu_read_unlock(void) +{ + rcu_read_release(); + __release(RCU); + preempt_enable(); +} +static inline void __rcu_read_lock_bh(void) +{ + local_bh_disable(); + __acquire(RCU_BH); + rcu_read_acquire(); +} +static inline void __rcu_read_unlock_bh(void) +{ + rcu_read_release(); + __release(RCU_BH); + local_bh_enable(); +} + +#define __synchronize_sched() synchronize_rcu() + +#define call_rcu_sched(head, func) call_rcu(head, func) + +static inline void rcu_init_sched(void) +{ +} + +extern void __rcu_init(void); +extern void rcu_check_callbacks(int cpu, int user); +extern void rcu_restart_cpu(int cpu); + +extern long rcu_batches_completed(void); +extern long rcu_batches_completed_bh(void); + +#ifdef CONFIG_NO_HZ +void rcu_enter_nohz(void); +void rcu_exit_nohz(void); +#else /* CONFIG_NO_HZ */ +static inline void rcu_enter_nohz(void) +{ +} +static inline void rcu_exit_nohz(void) +{ +} +#endif /* CONFIG_NO_HZ */ + +#endif /* __LINUX_RCUTREE_H */ diff --git a/init/Kconfig b/init/Kconfig index f763762d544..9dd7958a71f 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -928,10 +928,16 @@ source "block/Kconfig" config PREEMPT_NOTIFIERS bool -config CLASSIC_RCU - def_bool !PREEMPT_RCU +config TREE_RCU_TRACE + def_bool RCU_TRACE && TREE_RCU + select DEBUG_FS help - This option selects the classic RCU implementation that is - designed for best read-side performance on non-realtime - systems. Classic RCU is the default. Note that the - PREEMPT_RCU symbol is used to select/deselect this option. + This option provides tracing for the TREE_RCU implementation, + permitting Makefile to trivially select kernel/rcutree_trace.c. + +config PREEMPT_RCU_TRACE + def_bool RCU_TRACE && PREEMPT_RCU + select DEBUG_FS + help + This option provides tracing for the PREEMPT_RCU implementation, + permitting Makefile to trivially select kernel/rcupreempt_trace.c. diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index 9fdba03dc1f..463f29743ea 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -52,10 +52,29 @@ config PREEMPT endchoice +choice + prompt "RCU Implementation" + default CLASSIC_RCU + +config CLASSIC_RCU + bool "Classic RCU" + help + This option selects the classic RCU implementation that is + designed for best read-side performance on non-realtime + systems. + + Select this option if you are unsure. + +config TREE_RCU + bool "Tree-based hierarchical RCU" + help + This option selects the RCU implementation that is + designed for very large SMP system with hundreds or + thousands of CPUs. + config PREEMPT_RCU bool "Preemptible RCU" depends on PREEMPT - default n help This option reduces the latency of the kernel by making certain RCU sections preemptible. Normally RCU code is non-preemptible, if @@ -64,16 +83,47 @@ config PREEMPT_RCU now-naive assumptions about each RCU read-side critical section remaining on a given CPU through its execution. - Say N if you are unsure. +endchoice config RCU_TRACE - bool "Enable tracing for RCU - currently stats in debugfs" - depends on PREEMPT_RCU - select DEBUG_FS - default y + bool "Enable tracing for RCU" + depends on TREE_RCU || PREEMPT_RCU help This option provides tracing in RCU which presents stats in debugfs for debugging RCU implementation. Say Y here if you want to enable RCU tracing Say N if you are unsure. + +config RCU_FANOUT + int "Tree-based hierarchical RCU fanout value" + range 2 64 if 64BIT + range 2 32 if !64BIT + depends on TREE_RCU + default 64 if 64BIT + default 32 if !64BIT + help + This option controls the fanout of hierarchical implementations + of RCU, allowing RCU to work efficiently on machines with + large numbers of CPUs. This value must be at least the cube + root of NR_CPUS, which allows NR_CPUS up to 32,768 for 32-bit + systems and up to 262,144 for 64-bit systems. + + Select a specific number if testing RCU itself. + Take the default if unsure. + +config RCU_FANOUT_EXACT + bool "Disable tree-based hierarchical RCU auto-balancing" + depends on TREE_RCU + default n + help + This option forces use of the exact RCU_FANOUT value specified, + regardless of imbalances in the hierarchy. This is useful for + testing RCU itself, and might one day be useful on systems with + strong NUMA behavior. + + Without RCU_FANOUT_EXACT, the code will balance the hierarchy. + + Say n if unsure. + + diff --git a/kernel/Makefile b/kernel/Makefile index 19fad003b19..b4fdbbff5ec 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -74,10 +74,10 @@ obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ obj-$(CONFIG_SECCOMP) += seccomp.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_CLASSIC_RCU) += rcuclassic.o +obj-$(CONFIG_TREE_RCU) += rcutree.o obj-$(CONFIG_PREEMPT_RCU) += rcupreempt.o -ifeq ($(CONFIG_PREEMPT_RCU),y) -obj-$(CONFIG_RCU_TRACE) += rcupreempt_trace.o -endif +obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o +obj-$(CONFIG_PREEMPT_RCU_TRACE) += rcupreempt_trace.o obj-$(CONFIG_RELAY) += relay.o obj-$(CONFIG_SYSCTL) += utsname_sysctl.o obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index 59236e8b9da..04982659875 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -551,6 +551,16 @@ void rcu_irq_exit(void) } } +void rcu_nmi_enter(void) +{ + rcu_irq_enter(); +} + +void rcu_nmi_exit(void) +{ + rcu_irq_exit(); +} + static void dyntick_save_progress_counter(int cpu) { struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); diff --git a/kernel/rcupreempt_trace.c b/kernel/rcupreempt_trace.c index 35c2d3360ec..7c2665cac17 100644 --- a/kernel/rcupreempt_trace.c +++ b/kernel/rcupreempt_trace.c @@ -149,12 +149,12 @@ static void rcupreempt_trace_sum(struct rcupreempt_trace *sp) sp->done_length += cp->done_length; sp->done_add += cp->done_add; sp->done_remove += cp->done_remove; - atomic_set(&sp->done_invoked, atomic_read(&cp->done_invoked)); + atomic_add(atomic_read(&cp->done_invoked), &sp->done_invoked); sp->rcu_check_callbacks += cp->rcu_check_callbacks; - atomic_set(&sp->rcu_try_flip_1, - atomic_read(&cp->rcu_try_flip_1)); - atomic_set(&sp->rcu_try_flip_e1, - atomic_read(&cp->rcu_try_flip_e1)); + atomic_add(atomic_read(&cp->rcu_try_flip_1), + &sp->rcu_try_flip_1); + atomic_add(atomic_read(&cp->rcu_try_flip_e1), + &sp->rcu_try_flip_e1); sp->rcu_try_flip_i1 += cp->rcu_try_flip_i1; sp->rcu_try_flip_ie1 += cp->rcu_try_flip_ie1; sp->rcu_try_flip_g1 += cp->rcu_try_flip_g1; diff --git a/kernel/rcutree.c b/kernel/rcutree.c new file mode 100644 index 00000000000..a342b032112 --- /dev/null +++ b/kernel/rcutree.c @@ -0,0 +1,1535 @@ +/* + * Read-Copy Update mechanism for mutual exclusion + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright IBM Corporation, 2008 + * + * Authors: Dipankar Sarma + * Manfred Spraul + * Paul E. McKenney Hierarchical version + * + * Based on the original work by Paul McKenney + * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. + * + * For detailed explanation of Read-Copy Update mechanism see - + * Documentation/RCU + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +static struct lock_class_key rcu_lock_key; +struct lockdep_map rcu_lock_map = + STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key); +EXPORT_SYMBOL_GPL(rcu_lock_map); +#endif + +/* Data structures. */ + +#define RCU_STATE_INITIALIZER(name) { \ + .level = { &name.node[0] }, \ + .levelcnt = { \ + NUM_RCU_LVL_0, /* root of hierarchy. */ \ + NUM_RCU_LVL_1, \ + NUM_RCU_LVL_2, \ + NUM_RCU_LVL_3, /* == MAX_RCU_LVLS */ \ + }, \ + .signaled = RCU_SIGNAL_INIT, \ + .gpnum = -300, \ + .completed = -300, \ + .onofflock = __SPIN_LOCK_UNLOCKED(&name.onofflock), \ + .fqslock = __SPIN_LOCK_UNLOCKED(&name.fqslock), \ + .n_force_qs = 0, \ + .n_force_qs_ngp = 0, \ +} + +struct rcu_state rcu_state = RCU_STATE_INITIALIZER(rcu_state); +DEFINE_PER_CPU(struct rcu_data, rcu_data); + +struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); +DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); + +#ifdef CONFIG_NO_HZ +DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks); +#endif /* #ifdef CONFIG_NO_HZ */ + +static int blimit = 10; /* Maximum callbacks per softirq. */ +static int qhimark = 10000; /* If this many pending, ignore blimit. */ +static int qlowmark = 100; /* Once only this many pending, use blimit. */ + +static void force_quiescent_state(struct rcu_state *rsp, int relaxed); + +/* + * Return the number of RCU batches processed thus far for debug & stats. + */ +long rcu_batches_completed(void) +{ + return rcu_state.completed; +} +EXPORT_SYMBOL_GPL(rcu_batches_completed); + +/* + * Return the number of RCU BH batches processed thus far for debug & stats. + */ +long rcu_batches_completed_bh(void) +{ + return rcu_bh_state.completed; +} +EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); + +/* + * Does the CPU have callbacks ready to be invoked? + */ +static int +cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp) +{ + return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]; +} + +/* + * Does the current CPU require a yet-as-unscheduled grace period? + */ +static int +cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) +{ + /* ACCESS_ONCE() because we are accessing outside of lock. */ + return *rdp->nxttail[RCU_DONE_TAIL] && + ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum); +} + +/* + * Return the root node of the specified rcu_state structure. + */ +static struct rcu_node *rcu_get_root(struct rcu_state *rsp) +{ + return &rsp->node[0]; +} + +#ifdef CONFIG_SMP + +/* + * If the specified CPU is offline, tell the caller that it is in + * a quiescent state. Otherwise, whack it with a reschedule IPI. + * Grace periods can end up waiting on an offline CPU when that + * CPU is in the process of coming online -- it will be added to the + * rcu_node bitmasks before it actually makes it online. The same thing + * can happen while a CPU is in the process of coming online. Because this + * race is quite rare, we check for it after detecting that the grace + * period has been delayed rather than checking each and every CPU + * each and every time we start a new grace period. + */ +static int rcu_implicit_offline_qs(struct rcu_data *rdp) +{ + /* + * If the CPU is offline, it is in a quiescent state. We can + * trust its state not to change because interrupts are disabled. + */ + if (cpu_is_offline(rdp->cpu)) { + rdp->offline_fqs++; + return 1; + } + + /* The CPU is online, so send it a reschedule IPI. */ + if (rdp->cpu != smp_processor_id()) + smp_send_reschedule(rdp->cpu); + else + set_need_resched(); + rdp->resched_ipi++; + return 0; +} + +#endif /* #ifdef CONFIG_SMP */ + +#ifdef CONFIG_NO_HZ +static DEFINE_RATELIMIT_STATE(rcu_rs, 10 * HZ, 5); + +/** + * rcu_enter_nohz - inform RCU that current CPU is entering nohz + * + * Enter nohz mode, in other words, -leave- the mode in which RCU + * read-side critical sections can occur. (Though RCU read-side + * critical sections can occur in irq handlers in nohz mode, a possibility + * handled by rcu_irq_enter() and rcu_irq_exit()). + */ +void rcu_enter_nohz(void) +{ + unsigned long flags; + struct rcu_dynticks *rdtp; + + smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */ + local_irq_save(flags); + rdtp = &__get_cpu_var(rcu_dynticks); + rdtp->dynticks++; + rdtp->dynticks_nesting--; + WARN_ON_RATELIMIT(rdtp->dynticks & 0x1, &rcu_rs); + local_irq_restore(flags); +} + +/* + * rcu_exit_nohz - inform RCU that current CPU is leaving nohz + * + * Exit nohz mode, in other words, -enter- the mode in which RCU + * read-side critical sections normally occur. + */ +void rcu_exit_nohz(void) +{ + unsigned long flags; + struct rcu_dynticks *rdtp; + + local_irq_save(flags); + rdtp = &__get_cpu_var(rcu_dynticks); + rdtp->dynticks++; + rdtp->dynticks_nesting++; + WARN_ON_RATELIMIT(!(rdtp->dynticks & 0x1), &rcu_rs); + local_irq_restore(flags); + smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ +} + +/** + * rcu_nmi_enter - inform RCU of entry to NMI context + * + * If the CPU was idle with dynamic ticks active, and there is no + * irq handler running, this updates rdtp->dynticks_nmi to let the + * RCU grace-period handling know that the CPU is active. + */ +void rcu_nmi_enter(void) +{ + struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); + + if (rdtp->dynticks & 0x1) + return; + rdtp->dynticks_nmi++; + WARN_ON_RATELIMIT(!(rdtp->dynticks_nmi & 0x1), &rcu_rs); + smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ +} + +/** + * rcu_nmi_exit - inform RCU of exit from NMI context + * + * If the CPU was idle with dynamic ticks active, and there is no + * irq handler running, this updates rdtp->dynticks_nmi to let the + * RCU grace-period handling know that the CPU is no longer active. + */ +void rcu_nmi_exit(void) +{ + struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); + + if (rdtp->dynticks & 0x1) + return; + smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */ + rdtp->dynticks_nmi++; + WARN_ON_RATELIMIT(rdtp->dynticks_nmi & 0x1, &rcu_rs); +} + +/** + * rcu_irq_enter - inform RCU of entry to hard irq context + * + * If the CPU was idle with dynamic ticks active, this updates the + * rdtp->dynticks to let the RCU handling know that the CPU is active. + */ +void rcu_irq_enter(void) +{ + struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); + + if (rdtp->dynticks_nesting++) + return; + rdtp->dynticks++; + WARN_ON_RATELIMIT(!(rdtp->dynticks & 0x1), &rcu_rs); + smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ +} + +/** + * rcu_irq_exit - inform RCU of exit from hard irq context + * + * If the CPU was idle with dynamic ticks active, update the rdp->dynticks + * to put let the RCU handling be aware that the CPU is going back to idle + * with no ticks. + */ +void rcu_irq_exit(void) +{ + struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); + + if (--rdtp->dynticks_nesting) + return; + smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */ + rdtp->dynticks++; + WARN_ON_RATELIMIT(rdtp->dynticks & 0x1, &rcu_rs); + + /* If the interrupt queued a callback, get out of dyntick mode. */ + if (__get_cpu_var(rcu_data).nxtlist || + __get_cpu_var(rcu_bh_data).nxtlist) + set_need_resched(); +} + +/* + * Record the specified "completed" value, which is later used to validate + * dynticks counter manipulations. Specify "rsp->completed - 1" to + * unconditionally invalidate any future dynticks manipulations (which is + * useful at the beginning of a grace period). + */ +static void dyntick_record_completed(struct rcu_state *rsp, long comp) +{ + rsp->dynticks_completed = comp; +} + +#ifdef CONFIG_SMP + +/* + * Recall the previously recorded value of the completion for dynticks. + */ +static long dyntick_recall_completed(struct rcu_state *rsp) +{ + return rsp->dynticks_completed; +} + +/* + * Snapshot the specified CPU's dynticks counter so that we can later + * credit them with an implicit quiescent state. Return 1 if this CPU + * is already in a quiescent state courtesy of dynticks idle mode. + */ +static int dyntick_save_progress_counter(struct rcu_data *rdp) +{ + int ret; + int snap; + int snap_nmi; + + snap = rdp->dynticks->dynticks; + snap_nmi = rdp->dynticks->dynticks_nmi; + smp_mb(); /* Order sampling of snap with end of grace period. */ + rdp->dynticks_snap = snap; + rdp->dynticks_nmi_snap = snap_nmi; + ret = ((snap & 0x1) == 0) && ((snap_nmi & 0x1) == 0); + if (ret) + rdp->dynticks_fqs++; + return ret; +} + +/* + * Return true if the specified CPU has passed through a quiescent + * state by virtue of being in or having passed through an dynticks + * idle state since the last call to dyntick_save_progress_counter() + * for this same CPU. + */ +static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) +{ + long curr; + long curr_nmi; + long snap; + long snap_nmi; + + curr = rdp->dynticks->dynticks; + snap = rdp->dynticks_snap; + curr_nmi = rdp->dynticks->dynticks_nmi; + snap_nmi = rdp->dynticks_nmi_snap; + smp_mb(); /* force ordering with cpu entering/leaving dynticks. */ + + /* + * If the CPU passed through or entered a dynticks idle phase with + * no active irq/NMI handlers, then we can safely pretend that the CPU + * already acknowledged the request to pass through a quiescent + * state. Either way, that CPU cannot possibly be in an RCU + * read-side critical section that started before the beginning + * of the current RCU grace period. + */ + if ((curr != snap || (curr & 0x1) == 0) && + (curr_nmi != snap_nmi || (curr_nmi & 0x1) == 0)) { + rdp->dynticks_fqs++; + return 1; + } + + /* Go check for the CPU being offline. */ + return rcu_implicit_offline_qs(rdp); +} + +#endif /* #ifdef CONFIG_SMP */ + +#else /* #ifdef CONFIG_NO_HZ */ + +static void dyntick_record_completed(struct rcu_state *rsp, long comp) +{ +} + +#ifdef CONFIG_SMP + +/* + * If there are no dynticks, then the only way that a CPU can passively + * be in a quiescent state is to be offline. Unlike dynticks idle, which + * is a point in time during the prior (already finished) grace period, + * an offline CPU is always in a quiescent state, and thus can be + * unconditionally applied. So just return the current value of completed. + */ +static long dyntick_recall_completed(struct rcu_state *rsp) +{ + return rsp->completed; +} + +static int dyntick_save_progress_counter(struct rcu_data *rdp) +{ + return 0; +} + +static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) +{ + return rcu_implicit_offline_qs(rdp); +} + +#endif /* #ifdef CONFIG_SMP */ + +#endif /* #else #ifdef CONFIG_NO_HZ */ + +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR + +static void record_gp_stall_check_time(struct rcu_state *rsp) +{ + rsp->gp_start = jiffies; + rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK; +} + +static void print_other_cpu_stall(struct rcu_state *rsp) +{ + int cpu; + long delta; + unsigned long flags; + struct rcu_node *rnp = rcu_get_root(rsp); + struct rcu_node *rnp_cur = rsp->level[NUM_RCU_LVLS - 1]; + struct rcu_node *rnp_end = &rsp->node[NUM_RCU_NODES]; + + /* Only let one CPU complain about others per time interval. */ + + spin_lock_irqsave(&rnp->lock, flags); + delta = jiffies - rsp->jiffies_stall; + if (delta < RCU_STALL_RAT_DELAY || rsp->gpnum == rsp->completed) { + spin_unlock_irqrestore(&rnp->lock, flags); + return; + } + rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK; + spin_unlock_irqrestore(&rnp->lock, flags); + + /* OK, time to rat on our buddy... */ + + printk(KERN_ERR "INFO: RCU detected CPU stalls:"); + for (; rnp_cur < rnp_end; rnp_cur++) { + if (rnp_cur->qsmask == 0) + continue; + for (cpu = 0; cpu <= rnp_cur->grphi - rnp_cur->grplo; cpu++) + if (rnp_cur->qsmask & (1UL << cpu)) + printk(" %d", rnp_cur->grplo + cpu); + } + printk(" (detected by %d, t=%ld jiffies)\n", + smp_processor_id(), (long)(jiffies - rsp->gp_start)); + force_quiescent_state(rsp, 0); /* Kick them all. */ +} + +static void print_cpu_stall(struct rcu_state *rsp) +{ + unsigned long flags; + struct rcu_node *rnp = rcu_get_root(rsp); + + printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu jiffies)\n", + smp_processor_id(), jiffies - rsp->gp_start); + dump_stack(); + spin_lock_irqsave(&rnp->lock, flags); + if ((long)(jiffies - rsp->jiffies_stall) >= 0) + rsp->jiffies_stall = + jiffies + RCU_SECONDS_TILL_STALL_RECHECK; + spin_unlock_irqrestore(&rnp->lock, flags); + set_need_resched(); /* kick ourselves to get things going. */ +} + +static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) +{ + long delta; + struct rcu_node *rnp; + + delta = jiffies - rsp->jiffies_stall; + rnp = rdp->mynode; + if ((rnp->qsmask & rdp->grpmask) && delta >= 0) { + + /* We haven't checked in, so go dump stack. */ + print_cpu_stall(rsp); + + } else if (rsp->gpnum != rsp->completed && + delta >= RCU_STALL_RAT_DELAY) { + + /* They had two time units to dump stack, so complain. */ + print_other_cpu_stall(rsp); + } +} + +#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ + +static void record_gp_stall_check_time(struct rcu_state *rsp) +{ +} + +static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) +{ +} + +#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ + +/* + * Update CPU-local rcu_data state to record the newly noticed grace period. + * This is used both when we started the grace period and when we notice + * that someone else started the grace period. + */ +static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp) +{ + rdp->qs_pending = 1; + rdp->passed_quiesc = 0; + rdp->gpnum = rsp->gpnum; + rdp->n_rcu_pending_force_qs = rdp->n_rcu_pending + + RCU_JIFFIES_TILL_FORCE_QS; +} + +/* + * Did someone else start a new RCU grace period start since we last + * checked? Update local state appropriately if so. Must be called + * on the CPU corresponding to rdp. + */ +static int +check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp) +{ + unsigned long flags; + int ret = 0; + + local_irq_save(flags); + if (rdp->gpnum != rsp->gpnum) { + note_new_gpnum(rsp, rdp); + ret = 1; + } + local_irq_restore(flags); + return ret; +} + +/* + * Start a new RCU grace period if warranted, re-initializing the hierarchy + * in preparation for detecting the next grace period. The caller must hold + * the root node's ->lock, which is released before return. Hard irqs must + * be disabled. + */ +static void +rcu_start_gp(struct rcu_state *rsp, unsigned long flags) + __releases(rcu_get_root(rsp)->lock) +{ + struct rcu_data *rdp = rsp->rda[smp_processor_id()]; + struct rcu_node *rnp = rcu_get_root(rsp); + struct rcu_node *rnp_cur; + struct rcu_node *rnp_end; + + if (!cpu_needs_another_gp(rsp, rdp)) { + spin_unlock_irqrestore(&rnp->lock, flags); + return; + } + + /* Advance to a new grace period and initialize state. */ + rsp->gpnum++; + rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */ + rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; + rdp->n_rcu_pending_force_qs = rdp->n_rcu_pending + + RCU_JIFFIES_TILL_FORCE_QS; + record_gp_stall_check_time(rsp); + dyntick_record_completed(rsp, rsp->completed - 1); + note_new_gpnum(rsp, rdp); + + /* + * Because we are first, we know that all our callbacks will + * be covered by this upcoming grace period, even the ones + * that were registered arbitrarily recently. + */ + rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; + rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; + + /* Special-case the common single-level case. */ + if (NUM_RCU_NODES == 1) { + rnp->qsmask = rnp->qsmaskinit; + spin_unlock_irqrestore(&rnp->lock, flags); + return; + } + + spin_unlock(&rnp->lock); /* leave irqs disabled. */ + + + /* Exclude any concurrent CPU-hotplug operations. */ + spin_lock(&rsp->onofflock); /* irqs already disabled. */ + + /* + * Set the quiescent-state-needed bits in all the non-leaf RCU + * nodes for all currently online CPUs. This operation relies + * on the layout of the hierarchy within the rsp->node[] array. + * Note that other CPUs will access only the leaves of the + * hierarchy, which still indicate that no grace period is in + * progress. In addition, we have excluded CPU-hotplug operations. + * + * We therefore do not need to hold any locks. Any required + * memory barriers will be supplied by the locks guarding the + * leaf rcu_nodes in the hierarchy. + */ + + rnp_end = rsp->level[NUM_RCU_LVLS - 1]; + for (rnp_cur = &rsp->node[0]; rnp_cur < rnp_end; rnp_cur++) + rnp_cur->qsmask = rnp_cur->qsmaskinit; + + /* + * Now set up the leaf nodes. Here we must be careful. First, + * we need to hold the lock in order to exclude other CPUs, which + * might be contending for the leaf nodes' locks. Second, as + * soon as we initialize a given leaf node, its CPUs might run + * up the rest of the hierarchy. We must therefore acquire locks + * for each node that we touch during this stage. (But we still + * are excluding CPU-hotplug operations.) + * + * Note that the grace period cannot complete until we finish + * the initialization process, as there will be at least one + * qsmask bit set in the root node until that time, namely the + * one corresponding to this CPU. + */ + rnp_end = &rsp->node[NUM_RCU_NODES]; + rnp_cur = rsp->level[NUM_RCU_LVLS - 1]; + for (; rnp_cur < rnp_end; rnp_cur++) { + spin_lock(&rnp_cur->lock); /* irqs already disabled. */ + rnp_cur->qsmask = rnp_cur->qsmaskinit; + spin_unlock(&rnp_cur->lock); /* irqs already disabled. */ + } + + rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ + spin_unlock_irqrestore(&rsp->onofflock, flags); +} + +/* + * Advance this CPU's callbacks, but only if the current grace period + * has ended. This may be called only from the CPU to whom the rdp + * belongs. + */ +static void +rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp) +{ + long completed_snap; + unsigned long flags; + + local_irq_save(flags); + completed_snap = ACCESS_ONCE(rsp->completed); /* outside of lock. */ + + /* Did another grace period end? */ + if (rdp->completed != completed_snap) { + + /* Advance callbacks. No harm if list empty. */ + rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL]; + rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL]; + rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; + + /* Remember that we saw this grace-period completion. */ + rdp->completed = completed_snap; + } + local_irq_restore(flags); +} + +/* + * Similar to cpu_quiet(), for which it is a helper function. Allows + * a group of CPUs to be quieted at one go, though all the CPUs in the + * group must be represented by the same leaf rcu_node structure. + * That structure's lock must be held upon entry, and it is released + * before return. + */ +static void +cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp, + unsigned long flags) + __releases(rnp->lock) +{ + /* Walk up the rcu_node hierarchy. */ + for (;;) { + if (!(rnp->qsmask & mask)) { + + /* Our bit has already been cleared, so done. */ + spin_unlock_irqrestore(&rnp->lock, flags); + return; + } + rnp->qsmask &= ~mask; + if (rnp->qsmask != 0) { + + /* Other bits still set at this level, so done. */ + spin_unlock_irqrestore(&rnp->lock, flags); + return; + } + mask = rnp->grpmask; + if (rnp->parent == NULL) { + + /* No more levels. Exit loop holding root lock. */ + + break; + } + spin_unlock_irqrestore(&rnp->lock, flags); + rnp = rnp->parent; + spin_lock_irqsave(&rnp->lock, flags); + } + + /* + * Get here if we are the last CPU to pass through a quiescent + * state for this grace period. Clean up and let rcu_start_gp() + * start up the next grace period if one is needed. Note that + * we still hold rnp->lock, as required by rcu_start_gp(), which + * will release it. + */ + rsp->completed = rsp->gpnum; + rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]); + rcu_start_gp(rsp, flags); /* releases rnp->lock. */ +} + +/* + * Record a quiescent state for the specified CPU, which must either be + * the current CPU or an offline CPU. The lastcomp argument is used to + * make sure we are still in the grace period of interest. We don't want + * to end the current grace period based on quiescent states detected in + * an earlier grace period! + */ +static void +cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp) +{ + unsigned long flags; + unsigned long mask; + struct rcu_node *rnp; + + rnp = rdp->mynode; + spin_lock_irqsave(&rnp->lock, flags); + if (lastcomp != ACCESS_ONCE(rsp->completed)) { + + /* + * Someone beat us to it for this grace period, so leave. + * The race with GP start is resolved by the fact that we + * hold the leaf rcu_node lock, so that the per-CPU bits + * cannot yet be initialized -- so we would simply find our + * CPU's bit already cleared in cpu_quiet_msk() if this race + * occurred. + */ + rdp->passed_quiesc = 0; /* try again later! */ + spin_unlock_irqrestore(&rnp->lock, flags); + return; + } + mask = rdp->grpmask; + if ((rnp->qsmask & mask) == 0) { + spin_unlock_irqrestore(&rnp->lock, flags); + } else { + rdp->qs_pending = 0; + + /* + * This GP can't end until cpu checks in, so all of our + * callbacks can be processed during the next GP. + */ + rdp = rsp->rda[smp_processor_id()]; + rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; + + cpu_quiet_msk(mask, rsp, rnp, flags); /* releases rnp->lock */ + } +} + +/* + * Check to see if there is a new grace period of which this CPU + * is not yet aware, and if so, set up local rcu_data state for it. + * Otherwise, see if this CPU has just passed through its first + * quiescent state for this grace period, and record that fact if so. + */ +static void +rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) +{ + /* If there is now a new grace period, record and return. */ + if (check_for_new_grace_period(rsp, rdp)) + return; + + /* + * Does this CPU still need to do its part for current grace period? + * If no, return and let the other CPUs do their part as well. + */ + if (!rdp->qs_pending) + return; + + /* + * Was there a quiescent state since the beginning of the grace + * period? If no, then exit and wait for the next call. + */ + if (!rdp->passed_quiesc) + return; + + /* Tell RCU we are done (but cpu_quiet() will be the judge of that). */ + cpu_quiet(rdp->cpu, rsp, rdp, rdp->passed_quiesc_completed); +} + +#ifdef CONFIG_HOTPLUG_CPU + +/* + * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy + * and move all callbacks from the outgoing CPU to the current one. + */ +static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) +{ + int i; + unsigned long flags; + long lastcomp; + unsigned long mask; + struct rcu_data *rdp = rsp->rda[cpu]; + struct rcu_data *rdp_me; + struct rcu_node *rnp; + + /* Exclude any attempts to start a new grace period. */ + spin_lock_irqsave(&rsp->onofflock, flags); + + /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ + rnp = rdp->mynode; + mask = rdp->grpmask; /* rnp->grplo is constant. */ + do { + spin_lock(&rnp->lock); /* irqs already disabled. */ + rnp->qsmaskinit &= ~mask; + if (rnp->qsmaskinit != 0) { + spin_unlock(&rnp->lock); /* irqs already disabled. */ + break; + } + mask = rnp->grpmask; + spin_unlock(&rnp->lock); /* irqs already disabled. */ + rnp = rnp->parent; + } while (rnp != NULL); + lastcomp = rsp->completed; + + spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ + + /* Being offline is a quiescent state, so go record it. */ + cpu_quiet(cpu, rsp, rdp, lastcomp); + + /* + * Move callbacks from the outgoing CPU to the running CPU. + * Note that the outgoing CPU is now quiscent, so it is now + * (uncharacteristically) safe to access it rcu_data structure. + * Note also that we must carefully retain the order of the + * outgoing CPU's callbacks in order for rcu_barrier() to work + * correctly. Finally, note that we start all the callbacks + * afresh, even those that have passed through a grace period + * and are therefore ready to invoke. The theory is that hotplug + * events are rare, and that if they are frequent enough to + * indefinitely delay callbacks, you have far worse things to + * be worrying about. + */ + rdp_me = rsp->rda[smp_processor_id()]; + if (rdp->nxtlist != NULL) { + *rdp_me->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist; + rdp_me->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; + rdp->nxtlist = NULL; + for (i = 0; i < RCU_NEXT_SIZE; i++) + rdp->nxttail[i] = &rdp->nxtlist; + rdp_me->qlen += rdp->qlen; + rdp->qlen = 0; + } + local_irq_restore(flags); +} + +/* + * Remove the specified CPU from the RCU hierarchy and move any pending + * callbacks that it might have to the current CPU. This code assumes + * that at least one CPU in the system will remain running at all times. + * Any attempt to offline -all- CPUs is likely to strand RCU callbacks. + */ +static void rcu_offline_cpu(int cpu) +{ + __rcu_offline_cpu(cpu, &rcu_state); + __rcu_offline_cpu(cpu, &rcu_bh_state); +} + +#else /* #ifdef CONFIG_HOTPLUG_CPU */ + +static void rcu_offline_cpu(int cpu) +{ +} + +#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */ + +/* + * Invoke any RCU callbacks that have made it to the end of their grace + * period. Thottle as specified by rdp->blimit. + */ +static void rcu_do_batch(struct rcu_data *rdp) +{ + unsigned long flags; + struct rcu_head *next, *list, **tail; + int count; + + /* If no callbacks are ready, just return.*/ + if (!cpu_has_callbacks_ready_to_invoke(rdp)) + return; + + /* + * Extract the list of ready callbacks, disabling to prevent + * races with call_rcu() from interrupt handlers. + */ + local_irq_save(flags); + list = rdp->nxtlist; + rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; + *rdp->nxttail[RCU_DONE_TAIL] = NULL; + tail = rdp->nxttail[RCU_DONE_TAIL]; + for (count = RCU_NEXT_SIZE - 1; count >= 0; count--) + if (rdp->nxttail[count] == rdp->nxttail[RCU_DONE_TAIL]) + rdp->nxttail[count] = &rdp->nxtlist; + local_irq_restore(flags); + + /* Invoke callbacks. */ + count = 0; + while (list) { + next = list->next; + prefetch(next); + list->func(list); + list = next; + if (++count >= rdp->blimit) + break; + } + + local_irq_save(flags); + + /* Update count, and requeue any remaining callbacks. */ + rdp->qlen -= count; + if (list != NULL) { + *tail = rdp->nxtlist; + rdp->nxtlist = list; + for (count = 0; count < RCU_NEXT_SIZE; count++) + if (&rdp->nxtlist == rdp->nxttail[count]) + rdp->nxttail[count] = tail; + else + break; + } + + /* Reinstate batch limit if we have worked down the excess. */ + if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark) + rdp->blimit = blimit; + + local_irq_restore(flags); + + /* Re-raise the RCU softirq if there are callbacks remaining. */ + if (cpu_has_callbacks_ready_to_invoke(rdp)) + raise_softirq(RCU_SOFTIRQ); +} + +/* + * Check to see if this CPU is in a non-context-switch quiescent state + * (user mode or idle loop for rcu, non-softirq execution for rcu_bh). + * Also schedule the RCU softirq handler. + * + * This function must be called with hardirqs disabled. It is normally + * invoked from the scheduling-clock interrupt. If rcu_pending returns + * false, there is no point in invoking rcu_check_callbacks(). + */ +void rcu_check_callbacks(int cpu, int user) +{ + if (user || + (idle_cpu(cpu) && !in_softirq() && + hardirq_count() <= (1 << HARDIRQ_SHIFT))) { + + /* + * Get here if this CPU took its interrupt from user + * mode or from the idle loop, and if this is not a + * nested interrupt. In this case, the CPU is in + * a quiescent state, so count it. + * + * No memory barrier is required here because both + * rcu_qsctr_inc() and rcu_bh_qsctr_inc() reference + * only CPU-local variables that other CPUs neither + * access nor modify, at least not while the corresponding + * CPU is online. + */ + + rcu_qsctr_inc(cpu); + rcu_bh_qsctr_inc(cpu); + + } else if (!in_softirq()) { + + /* + * Get here if this CPU did not take its interrupt from + * softirq, in other words, if it is not interrupting + * a rcu_bh read-side critical section. This is an _bh + * critical section, so count it. + */ + + rcu_bh_qsctr_inc(cpu); + } + raise_softirq(RCU_SOFTIRQ); +} + +#ifdef CONFIG_SMP + +/* + * Scan the leaf rcu_node structures, processing dyntick state for any that + * have not yet encountered a quiescent state, using the function specified. + * Returns 1 if the current grace period ends while scanning (possibly + * because we made it end). + */ +static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp, + int (*f)(struct rcu_data *)) +{ + unsigned long bit; + int cpu; + unsigned long flags; + unsigned long mask; + struct rcu_node *rnp_cur = rsp->level[NUM_RCU_LVLS - 1]; + struct rcu_node *rnp_end = &rsp->node[NUM_RCU_NODES]; + + for (; rnp_cur < rnp_end; rnp_cur++) { + mask = 0; + spin_lock_irqsave(&rnp_cur->lock, flags); + if (rsp->completed != lastcomp) { + spin_unlock_irqrestore(&rnp_cur->lock, flags); + return 1; + } + if (rnp_cur->qsmask == 0) { + spin_unlock_irqrestore(&rnp_cur->lock, flags); + continue; + } + cpu = rnp_cur->grplo; + bit = 1; + for (; cpu <= rnp_cur->grphi; cpu++, bit <<= 1) { + if ((rnp_cur->qsmask & bit) != 0 && f(rsp->rda[cpu])) + mask |= bit; + } + if (mask != 0 && rsp->completed == lastcomp) { + + /* cpu_quiet_msk() releases rnp_cur->lock. */ + cpu_quiet_msk(mask, rsp, rnp_cur, flags); + continue; + } + spin_unlock_irqrestore(&rnp_cur->lock, flags); + } + return 0; +} + +/* + * Force quiescent states on reluctant CPUs, and also detect which + * CPUs are in dyntick-idle mode. + */ +static void force_quiescent_state(struct rcu_state *rsp, int relaxed) +{ + unsigned long flags; + long lastcomp; + struct rcu_data *rdp = rsp->rda[smp_processor_id()]; + struct rcu_node *rnp = rcu_get_root(rsp); + u8 signaled; + + if (ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum)) + return; /* No grace period in progress, nothing to force. */ + if (!spin_trylock_irqsave(&rsp->fqslock, flags)) { + rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */ + return; /* Someone else is already on the job. */ + } + if (relaxed && + (long)(rsp->jiffies_force_qs - jiffies) >= 0 && + (rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending) >= 0) + goto unlock_ret; /* no emergency and done recently. */ + rsp->n_force_qs++; + spin_lock(&rnp->lock); + lastcomp = rsp->completed; + signaled = rsp->signaled; + rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; + rdp->n_rcu_pending_force_qs = rdp->n_rcu_pending + + RCU_JIFFIES_TILL_FORCE_QS; + if (lastcomp == rsp->gpnum) { + rsp->n_force_qs_ngp++; + spin_unlock(&rnp->lock); + goto unlock_ret; /* no GP in progress, time updated. */ + } + spin_unlock(&rnp->lock); + switch (signaled) { + case RCU_GP_INIT: + + break; /* grace period still initializing, ignore. */ + + case RCU_SAVE_DYNTICK: + + if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK) + break; /* So gcc recognizes the dead code. */ + + /* Record dyntick-idle state. */ + if (rcu_process_dyntick(rsp, lastcomp, + dyntick_save_progress_counter)) + goto unlock_ret; + + /* Update state, record completion counter. */ + spin_lock(&rnp->lock); + if (lastcomp == rsp->completed) { + rsp->signaled = RCU_FORCE_QS; + dyntick_record_completed(rsp, lastcomp); + } + spin_unlock(&rnp->lock); + break; + + case RCU_FORCE_QS: + + /* Check dyntick-idle state, send IPI to laggarts. */ + if (rcu_process_dyntick(rsp, dyntick_recall_completed(rsp), + rcu_implicit_dynticks_qs)) + goto unlock_ret; + + /* Leave state in case more forcing is required. */ + + break; + } +unlock_ret: + spin_unlock_irqrestore(&rsp->fqslock, flags); +} + +#else /* #ifdef CONFIG_SMP */ + +static void force_quiescent_state(struct rcu_state *rsp, int relaxed) +{ + set_need_resched(); +} + +#endif /* #else #ifdef CONFIG_SMP */ + +/* + * This does the RCU processing work from softirq context for the + * specified rcu_state and rcu_data structures. This may be called + * only from the CPU to whom the rdp belongs. + */ +static void +__rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) +{ + unsigned long flags; + + /* + * If an RCU GP has gone long enough, go check for dyntick + * idle CPUs and, if needed, send resched IPIs. + */ + if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0 || + (rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending) < 0) + force_quiescent_state(rsp, 1); + + /* + * Advance callbacks in response to end of earlier grace + * period that some other CPU ended. + */ + rcu_process_gp_end(rsp, rdp); + + /* Update RCU state based on any recent quiescent states. */ + rcu_check_quiescent_state(rsp, rdp); + + /* Does this CPU require a not-yet-started grace period? */ + if (cpu_needs_another_gp(rsp, rdp)) { + spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags); + rcu_start_gp(rsp, flags); /* releases above lock */ + } + + /* If there are callbacks ready, invoke them. */ + rcu_do_batch(rdp); +} + +/* + * Do softirq processing for the current CPU. + */ +static void rcu_process_callbacks(struct softirq_action *unused) +{ + /* + * Memory references from any prior RCU read-side critical sections + * executed by the interrupted code must be seen before any RCU + * grace-period manipulations below. + */ + smp_mb(); /* See above block comment. */ + + __rcu_process_callbacks(&rcu_state, &__get_cpu_var(rcu_data)); + __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); + + /* + * Memory references from any later RCU read-side critical sections + * executed by the interrupted code must be seen after any RCU + * grace-period manipulations above. + */ + smp_mb(); /* See above block comment. */ +} + +static void +__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), + struct rcu_state *rsp) +{ + unsigned long flags; + struct rcu_data *rdp; + + head->func = func; + head->next = NULL; + + smp_mb(); /* Ensure RCU update seen before callback registry. */ + + /* + * Opportunistically note grace-period endings and beginnings. + * Note that we might see a beginning right after we see an + * end, but never vice versa, since this CPU has to pass through + * a quiescent state betweentimes. + */ + local_irq_save(flags); + rdp = rsp->rda[smp_processor_id()]; + rcu_process_gp_end(rsp, rdp); + check_for_new_grace_period(rsp, rdp); + + /* Add the callback to our list. */ + *rdp->nxttail[RCU_NEXT_TAIL] = head; + rdp->nxttail[RCU_NEXT_TAIL] = &head->next; + + /* Start a new grace period if one not already started. */ + if (ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum)) { + unsigned long nestflag; + struct rcu_node *rnp_root = rcu_get_root(rsp); + + spin_lock_irqsave(&rnp_root->lock, nestflag); + rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */ + } + + /* Force the grace period if too many callbacks or too long waiting. */ + if (unlikely(++rdp->qlen > qhimark)) { + rdp->blimit = LONG_MAX; + force_quiescent_state(rsp, 0); + } else if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0 || + (rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending) < 0) + force_quiescent_state(rsp, 1); + local_irq_restore(flags); +} + +/* + * Queue an RCU callback for invocation after a grace period. + */ +void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +{ + __call_rcu(head, func, &rcu_state); +} +EXPORT_SYMBOL_GPL(call_rcu); + +/* + * Queue an RCU for invocation after a quicker grace period. + */ +void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +{ + __call_rcu(head, func, &rcu_bh_state); +} +EXPORT_SYMBOL_GPL(call_rcu_bh); + +/* + * Check to see if there is any immediate RCU-related work to be done + * by the current CPU, for the specified type of RCU, returning 1 if so. + * The checks are in order of increasing expense: checks that can be + * carried out against CPU-local state are performed first. However, + * we must check for CPU stalls first, else we might not get a chance. + */ +static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) +{ + rdp->n_rcu_pending++; + + /* Check for CPU stalls, if enabled. */ + check_cpu_stall(rsp, rdp); + + /* Is the RCU core waiting for a quiescent state from this CPU? */ + if (rdp->qs_pending) + return 1; + + /* Does this CPU have callbacks ready to invoke? */ + if (cpu_has_callbacks_ready_to_invoke(rdp)) + return 1; + + /* Has RCU gone idle with this CPU needing another grace period? */ + if (cpu_needs_another_gp(rsp, rdp)) + return 1; + + /* Has another RCU grace period completed? */ + if (ACCESS_ONCE(rsp->completed) != rdp->completed) /* outside of lock */ + return 1; + + /* Has a new RCU grace period started? */ + if (ACCESS_ONCE(rsp->gpnum) != rdp->gpnum) /* outside of lock */ + return 1; + + /* Has an RCU GP gone long enough to send resched IPIs &c? */ + if (ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum) && + ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0 || + (rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending) < 0)) + return 1; + + /* nothing to do */ + return 0; +} + +/* + * Check to see if there is any immediate RCU-related work to be done + * by the current CPU, returning 1 if so. This function is part of the + * RCU implementation; it is -not- an exported member of the RCU API. + */ +int rcu_pending(int cpu) +{ + return __rcu_pending(&rcu_state, &per_cpu(rcu_data, cpu)) || + __rcu_pending(&rcu_bh_state, &per_cpu(rcu_bh_data, cpu)); +} + +/* + * Check to see if any future RCU-related work will need to be done + * by the current CPU, even if none need be done immediately, returning + * 1 if so. This function is part of the RCU implementation; it is -not- + * an exported member of the RCU API. + */ +int rcu_needs_cpu(int cpu) +{ + /* RCU callbacks either ready or pending? */ + return per_cpu(rcu_data, cpu).nxtlist || + per_cpu(rcu_bh_data, cpu).nxtlist; +} + +/* + * Initialize a CPU's per-CPU RCU data. We take this "scorched earth" + * approach so that we don't have to worry about how long the CPU has + * been gone, or whether it ever was online previously. We do trust the + * ->mynode field, as it is constant for a given struct rcu_data and + * initialized during early boot. + * + * Note that only one online or offline event can be happening at a given + * time. Note also that we can accept some slop in the rsp->completed + * access due to the fact that this CPU cannot possibly have any RCU + * callbacks in flight yet. + */ +static void +rcu_init_percpu_data(int cpu, struct rcu_state *rsp) +{ + unsigned long flags; + int i; + long lastcomp; + unsigned long mask; + struct rcu_data *rdp = rsp->rda[cpu]; + struct rcu_node *rnp = rcu_get_root(rsp); + + /* Set up local state, ensuring consistent view of global state. */ + spin_lock_irqsave(&rnp->lock, flags); + lastcomp = rsp->completed; + rdp->completed = lastcomp; + rdp->gpnum = lastcomp; + rdp->passed_quiesc = 0; /* We could be racing with new GP, */ + rdp->qs_pending = 1; /* so set up to respond to current GP. */ + rdp->beenonline = 1; /* We have now been online. */ + rdp->passed_quiesc_completed = lastcomp - 1; + rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); + rdp->nxtlist = NULL; + for (i = 0; i < RCU_NEXT_SIZE; i++) + rdp->nxttail[i] = &rdp->nxtlist; + rdp->qlen = 0; + rdp->blimit = blimit; +#ifdef CONFIG_NO_HZ + rdp->dynticks = &per_cpu(rcu_dynticks, cpu); +#endif /* #ifdef CONFIG_NO_HZ */ + rdp->cpu = cpu; + spin_unlock(&rnp->lock); /* irqs remain disabled. */ + + /* + * A new grace period might start here. If so, we won't be part + * of it, but that is OK, as we are currently in a quiescent state. + */ + + /* Exclude any attempts to start a new GP on large systems. */ + spin_lock(&rsp->onofflock); /* irqs already disabled. */ + + /* Add CPU to rcu_node bitmasks. */ + rnp = rdp->mynode; + mask = rdp->grpmask; + do { + /* Exclude any attempts to start a new GP on small systems. */ + spin_lock(&rnp->lock); /* irqs already disabled. */ + rnp->qsmaskinit |= mask; + mask = rnp->grpmask; + spin_unlock(&rnp->lock); /* irqs already disabled. */ + rnp = rnp->parent; + } while (rnp != NULL && !(rnp->qsmaskinit & mask)); + + spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ + + /* + * A new grace period might start here. If so, we will be part of + * it, and its gpnum will be greater than ours, so we will + * participate. It is also possible for the gpnum to have been + * incremented before this function was called, and the bitmasks + * to not be filled out until now, in which case we will also + * participate due to our gpnum being behind. + */ + + /* Since it is coming online, the CPU is in a quiescent state. */ + cpu_quiet(cpu, rsp, rdp, lastcomp); + local_irq_restore(flags); +} + +static void __cpuinit rcu_online_cpu(int cpu) +{ +#ifdef CONFIG_NO_HZ + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + + rdtp->dynticks_nesting = 1; + rdtp->dynticks |= 1; /* need consecutive #s even for hotplug. */ + rdtp->dynticks_nmi = (rdtp->dynticks_nmi + 1) & ~0x1; +#endif /* #ifdef CONFIG_NO_HZ */ + rcu_init_percpu_data(cpu, &rcu_state); + rcu_init_percpu_data(cpu, &rcu_bh_state); + open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); +} + +/* + * Handle CPU online/offline notifcation events. + */ +static int __cpuinit rcu_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + long cpu = (long)hcpu; + + switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + rcu_online_cpu(cpu); + break; + case CPU_DEAD: + case CPU_DEAD_FROZEN: + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: + rcu_offline_cpu(cpu); + break; + default: + break; + } + return NOTIFY_OK; +} + +/* + * Compute the per-level fanout, either using the exact fanout specified + * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT. + */ +#ifdef CONFIG_RCU_FANOUT_EXACT +static void __init rcu_init_levelspread(struct rcu_state *rsp) +{ + int i; + + for (i = NUM_RCU_LVLS - 1; i >= 0; i--) + rsp->levelspread[i] = CONFIG_RCU_FANOUT; +} +#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ +static void __init rcu_init_levelspread(struct rcu_state *rsp) +{ + int ccur; + int cprv; + int i; + + cprv = NR_CPUS; + for (i = NUM_RCU_LVLS - 1; i >= 0; i--) { + ccur = rsp->levelcnt[i]; + rsp->levelspread[i] = (cprv + ccur - 1) / ccur; + cprv = ccur; + } +} +#endif /* #else #ifdef CONFIG_RCU_FANOUT_EXACT */ + +/* + * Helper function for rcu_init() that initializes one rcu_state structure. + */ +static void __init rcu_init_one(struct rcu_state *rsp) +{ + int cpustride = 1; + int i; + int j; + struct rcu_node *rnp; + + /* Initialize the level-tracking arrays. */ + + for (i = 1; i < NUM_RCU_LVLS; i++) + rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1]; + rcu_init_levelspread(rsp); + + /* Initialize the elements themselves, starting from the leaves. */ + + for (i = NUM_RCU_LVLS - 1; i >= 0; i--) { + cpustride *= rsp->levelspread[i]; + rnp = rsp->level[i]; + for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { + spin_lock_init(&rnp->lock); + rnp->qsmask = 0; + rnp->qsmaskinit = 0; + rnp->grplo = j * cpustride; + rnp->grphi = (j + 1) * cpustride - 1; + if (rnp->grphi >= NR_CPUS) + rnp->grphi = NR_CPUS - 1; + if (i == 0) { + rnp->grpnum = 0; + rnp->grpmask = 0; + rnp->parent = NULL; + } else { + rnp->grpnum = j % rsp->levelspread[i - 1]; + rnp->grpmask = 1UL << rnp->grpnum; + rnp->parent = rsp->level[i - 1] + + j / rsp->levelspread[i - 1]; + } + rnp->level = i; + } + } +} + +/* + * Helper macro for __rcu_init(). To be used nowhere else! + * Assigns leaf node pointers into each CPU's rcu_data structure. + */ +#define RCU_DATA_PTR_INIT(rsp, rcu_data) \ +do { \ + rnp = (rsp)->level[NUM_RCU_LVLS - 1]; \ + j = 0; \ + for_each_possible_cpu(i) { \ + if (i > rnp[j].grphi) \ + j++; \ + per_cpu(rcu_data, i).mynode = &rnp[j]; \ + (rsp)->rda[i] = &per_cpu(rcu_data, i); \ + } \ +} while (0) + +static struct notifier_block __cpuinitdata rcu_nb = { + .notifier_call = rcu_cpu_notify, +}; + +void __init __rcu_init(void) +{ + int i; /* All used by RCU_DATA_PTR_INIT(). */ + int j; + struct rcu_node *rnp; + + printk(KERN_WARNING "Experimental hierarchical RCU implementation.\n"); +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR + printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n"); +#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ + rcu_init_one(&rcu_state); + RCU_DATA_PTR_INIT(&rcu_state, rcu_data); + rcu_init_one(&rcu_bh_state); + RCU_DATA_PTR_INIT(&rcu_bh_state, rcu_bh_data); + + for_each_online_cpu(i) + rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long)i); + /* Register notifier for non-boot CPUs */ + register_cpu_notifier(&rcu_nb); + printk(KERN_WARNING "Experimental hierarchical RCU init done.\n"); +} + +module_param(blimit, int, 0); +module_param(qhimark, int, 0); +module_param(qlowmark, int, 0); diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c new file mode 100644 index 00000000000..d6db3e83782 --- /dev/null +++ b/kernel/rcutree_trace.c @@ -0,0 +1,271 @@ +/* + * Read-Copy Update tracing for classic implementation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright IBM Corporation, 2008 + * + * Papers: http://www.rdrop.com/users/paulmck/RCU + * + * For detailed explanation of Read-Copy Update mechanism see - + * Documentation/RCU + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) +{ + if (!rdp->beenonline) + return; + seq_printf(m, "%3d%cc=%ld g=%ld pq=%d pqc=%ld qp=%d rpfq=%ld rp=%x", + rdp->cpu, + cpu_is_offline(rdp->cpu) ? '!' : ' ', + rdp->completed, rdp->gpnum, + rdp->passed_quiesc, rdp->passed_quiesc_completed, + rdp->qs_pending, + rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending, + (int)(rdp->n_rcu_pending & 0xffff)); +#ifdef CONFIG_NO_HZ + seq_printf(m, " dt=%d/%d dn=%d df=%lu", + rdp->dynticks->dynticks, + rdp->dynticks->dynticks_nesting, + rdp->dynticks->dynticks_nmi, + rdp->dynticks_fqs); +#endif /* #ifdef CONFIG_NO_HZ */ + seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); + seq_printf(m, " ql=%ld b=%ld\n", rdp->qlen, rdp->blimit); +} + +#define PRINT_RCU_DATA(name, func, m) \ + do { \ + int _p_r_d_i; \ + \ + for_each_possible_cpu(_p_r_d_i) \ + func(m, &per_cpu(name, _p_r_d_i)); \ + } while (0) + +static int show_rcudata(struct seq_file *m, void *unused) +{ + seq_puts(m, "rcu:\n"); + PRINT_RCU_DATA(rcu_data, print_one_rcu_data, m); + seq_puts(m, "rcu_bh:\n"); + PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data, m); + return 0; +} + +static int rcudata_open(struct inode *inode, struct file *file) +{ + return single_open(file, show_rcudata, NULL); +} + +static struct file_operations rcudata_fops = { + .owner = THIS_MODULE, + .open = rcudata_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) +{ + if (!rdp->beenonline) + return; + seq_printf(m, "%d,%s,%ld,%ld,%d,%ld,%d,%ld,%ld", + rdp->cpu, + cpu_is_offline(rdp->cpu) ? "\"Y\"" : "\"N\"", + rdp->completed, rdp->gpnum, + rdp->passed_quiesc, rdp->passed_quiesc_completed, + rdp->qs_pending, + rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending, + rdp->n_rcu_pending); +#ifdef CONFIG_NO_HZ + seq_printf(m, ",%d,%d,%d,%lu", + rdp->dynticks->dynticks, + rdp->dynticks->dynticks_nesting, + rdp->dynticks->dynticks_nmi, + rdp->dynticks_fqs); +#endif /* #ifdef CONFIG_NO_HZ */ + seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); + seq_printf(m, ",%ld,%ld\n", rdp->qlen, rdp->blimit); +} + +static int show_rcudata_csv(struct seq_file *m, void *unused) +{ + seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pqc\",\"pq\",\"rpfq\",\"rp\","); +#ifdef CONFIG_NO_HZ + seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\","); +#endif /* #ifdef CONFIG_NO_HZ */ + seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\"\n"); + seq_puts(m, "\"rcu:\"\n"); + PRINT_RCU_DATA(rcu_data, print_one_rcu_data_csv, m); + seq_puts(m, "\"rcu_bh:\"\n"); + PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data_csv, m); + return 0; +} + +static int rcudata_csv_open(struct inode *inode, struct file *file) +{ + return single_open(file, show_rcudata_csv, NULL); +} + +static struct file_operations rcudata_csv_fops = { + .owner = THIS_MODULE, + .open = rcudata_csv_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) +{ + int level = 0; + struct rcu_node *rnp; + + seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x " + "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n", + rsp->completed, rsp->gpnum, rsp->signaled, + (long)(rsp->jiffies_force_qs - jiffies), + (int)(jiffies & 0xffff), + rsp->n_force_qs, rsp->n_force_qs_ngp, + rsp->n_force_qs - rsp->n_force_qs_ngp, + rsp->n_force_qs_lh); + for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) { + if (rnp->level != level) { + seq_puts(m, "\n"); + level = rnp->level; + } + seq_printf(m, "%lx/%lx %d:%d ^%d ", + rnp->qsmask, rnp->qsmaskinit, + rnp->grplo, rnp->grphi, rnp->grpnum); + } + seq_puts(m, "\n"); +} + +static int show_rcuhier(struct seq_file *m, void *unused) +{ + seq_puts(m, "rcu:\n"); + print_one_rcu_state(m, &rcu_state); + seq_puts(m, "rcu_bh:\n"); + print_one_rcu_state(m, &rcu_bh_state); + return 0; +} + +static int rcuhier_open(struct inode *inode, struct file *file) +{ + return single_open(file, show_rcuhier, NULL); +} + +static struct file_operations rcuhier_fops = { + .owner = THIS_MODULE, + .open = rcuhier_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int show_rcugp(struct seq_file *m, void *unused) +{ + seq_printf(m, "rcu: completed=%ld gpnum=%ld\n", + rcu_state.completed, rcu_state.gpnum); + seq_printf(m, "rcu_bh: completed=%ld gpnum=%ld\n", + rcu_bh_state.completed, rcu_bh_state.gpnum); + return 0; +} + +static int rcugp_open(struct inode *inode, struct file *file) +{ + return single_open(file, show_rcugp, NULL); +} + +static struct file_operations rcugp_fops = { + .owner = THIS_MODULE, + .open = rcugp_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static struct dentry *rcudir, *datadir, *datadir_csv, *hierdir, *gpdir; +static int __init rcuclassic_trace_init(void) +{ + rcudir = debugfs_create_dir("rcu", NULL); + if (!rcudir) + goto out; + + datadir = debugfs_create_file("rcudata", 0444, rcudir, + NULL, &rcudata_fops); + if (!datadir) + goto free_out; + + datadir_csv = debugfs_create_file("rcudata.csv", 0444, rcudir, + NULL, &rcudata_csv_fops); + if (!datadir_csv) + goto free_out; + + gpdir = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops); + if (!gpdir) + goto free_out; + + hierdir = debugfs_create_file("rcuhier", 0444, rcudir, + NULL, &rcuhier_fops); + if (!hierdir) + goto free_out; + return 0; +free_out: + if (datadir) + debugfs_remove(datadir); + if (datadir_csv) + debugfs_remove(datadir_csv); + if (gpdir) + debugfs_remove(gpdir); + debugfs_remove(rcudir); +out: + return 1; +} + +static void __exit rcuclassic_trace_cleanup(void) +{ + debugfs_remove(datadir); + debugfs_remove(datadir_csv); + debugfs_remove(gpdir); + debugfs_remove(hierdir); + debugfs_remove(rcudir); +} + + +module_init(rcuclassic_trace_init); +module_exit(rcuclassic_trace_cleanup); + +MODULE_AUTHOR("Paul E. McKenney"); +MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation"); +MODULE_LICENSE("GPL"); diff --git a/kernel/softirq.c b/kernel/softirq.c index e7c69a720d6..80d323e6f61 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -269,6 +269,7 @@ void irq_enter(void) { int cpu = smp_processor_id(); + rcu_irq_enter(); if (idle_cpu(cpu) && !in_interrupt()) { __irq_enter(); tick_check_idle(cpu); @@ -295,9 +296,9 @@ void irq_exit(void) #ifdef CONFIG_NO_HZ /* Make sure that timer wheel updates are propagated */ - if (!in_interrupt() && idle_cpu(smp_processor_id()) && !need_resched()) - tick_nohz_stop_sched_tick(0); rcu_irq_exit(); + if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched()) + tick_nohz_stop_sched_tick(0); #endif preempt_enable_no_resched(); } diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index b0f239e443b..465d822f3f5 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -619,6 +619,19 @@ config RCU_CPU_STALL_DETECTOR Say N if you are unsure. +config RCU_CPU_STALL_DETECTOR + bool "Check for stalled CPUs delaying RCU grace periods" + depends on CLASSIC_RCU || TREE_RCU + default n + help + This option causes RCU to printk information on which + CPUs are delaying the current grace period, but only when + the grace period extends for excessive time periods. + + Say Y if you want RCU to perform such checks. + + Say N if you are unsure. + config KPROBES_SANITY_TEST bool "Kprobes sanity tests" depends on DEBUG_KERNEL -- cgit v1.2.3-70-g09d2 From b2e3c0adec918ea22b6c9d7c76193dd3aaba9bd4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 19 Dec 2008 00:48:27 +0100 Subject: hrtimers: fix warning in kernel/hrtimer.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit this warning: kernel/hrtimer.c: In function ‘hrtimer_cpu_notify’: kernel/hrtimer.c:1574: warning: unused variable ‘dcpu’ is caused because 'dcpu' is only used in the CONFIG_HOTPLUG_CPU case. Signed-off-by: Ingo Molnar --- kernel/hrtimer.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index b741f850426..bda9cb92427 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1571,7 +1571,7 @@ static void tickle_timers(void *arg) static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { - int dcpu, scpu = (long)hcpu; + int scpu = (long)hcpu; switch (action) { @@ -1583,10 +1583,14 @@ static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self, #ifdef CONFIG_HOTPLUG_CPU case CPU_DEAD: case CPU_DEAD_FROZEN: + { + int dcpu; + clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &scpu); dcpu = migrate_hrtimers(scpu); smp_call_function_single(dcpu, tickle_timers, NULL, 0); break; + } #endif default: -- cgit v1.2.3-70-g09d2 From 3bddb9a3246f6df5cf3b7655cb541ac10203bb71 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 19 Dec 2008 01:03:29 +0100 Subject: tracing: fix warning in kernel/trace/trace.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit this warning: kernel/trace/trace.c: In function ‘print_lat_fmt’: kernel/trace/trace.c:1826: warning: unused variable ‘state’ Triggers because 'state' has become unused - remove it. Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1a3d6b32978..49fc7201295 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1816,7 +1816,6 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) char *comm; int S, T; int i; - unsigned state; if (entry->type == TRACE_CONT) return TRACE_TYPE_HANDLED; -- cgit v1.2.3-70-g09d2 From c71dd42db2c6f1637b92502a214587431c1a6ad2 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 19 Dec 2008 01:09:51 +0100 Subject: tracing: fix warnings in kernel/trace/trace_sched_switch.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit these warnings: kernel/trace/trace_sched_switch.c: In function ‘tracing_sched_register’: kernel/trace/trace_sched_switch.c:96: warning: passing argument 1 of ‘register_trace_sched_wakeup_new’ from incompatible pointer type kernel/trace/trace_sched_switch.c:112: warning: passing argument 1 of ‘unregister_trace_sched_wakeup_new’ from incompatible pointer type kernel/trace/trace_sched_switch.c: In function ‘tracing_sched_unregister’: kernel/trace/trace_sched_switch.c:121: warning: passing argument 1 of ‘unregister_trace_sched_wakeup_new’ from incompatible pointer type Trigger because sched_wakeup_new tracepoints need the same trace signature as sched_wakeup - which was changed recently. Fix it. Signed-off-by: Ingo Molnar --- include/trace/sched.h | 4 ++-- kernel/sched.c | 2 +- kernel/trace/trace_sched_switch.c | 1 + 3 files changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/trace/sched.h b/include/trace/sched.h index f4549d506b1..bc4c9eadc6b 100644 --- a/include/trace/sched.h +++ b/include/trace/sched.h @@ -21,8 +21,8 @@ DECLARE_TRACE(sched_wakeup, TPARGS(rq, p)); DECLARE_TRACE(sched_wakeup_new, - TPPROTO(struct rq *rq, struct task_struct *p), - TPARGS(rq, p)); + TPPROTO(struct rq *rq, struct task_struct *p, int success), + TPARGS(rq, p, success)); DECLARE_TRACE(sched_switch, TPPROTO(struct rq *rq, struct task_struct *prev, diff --git a/kernel/sched.c b/kernel/sched.c index d377097572f..ac5a70a87d1 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2457,7 +2457,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) p->sched_class->task_new(rq, p); inc_nr_running(rq); } - trace_sched_wakeup_new(rq, p); + trace_sched_wakeup_new(rq, p, 1); check_preempt_curr(rq, p, 0); #ifdef CONFIG_SMP if (p->sched_class->task_wake_up) diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 863390557b4..781d72ef873 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -247,3 +247,4 @@ __init static int init_sched_switch_trace(void) return register_tracer(&sched_switch_trace); } device_initcall(init_sched_switch_trace); + -- cgit v1.2.3-70-g09d2 From b56863630ddbdea6e22df8835f78f0b1da037103 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Thu, 18 Dec 2008 15:06:34 -0800 Subject: futex: clean up futex_(un)lock_pi fault handling Impact: cleanup Some apparently left over cruft code was complicating the fault logic: Testing if uval != -EFAULT doesn't have any meaning, get_user() sets ret to either 0 or -EFAULT, there's no need to compare uval, especially not against EFAULT which it will never be. This patch removes the superfluous test and clarifies the comment blocks. Build and boot tested on an 8way x86_64 system. Signed-off-by: Darren Hart Signed-off-by: Ingo Molnar --- kernel/futex.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 99f8acce08b..b4f87bac91c 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1565,12 +1565,11 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, uaddr_faulted: /* - * We have to r/w *(int __user *)uaddr, but we can't modify it - * non-atomically. Therefore, if get_user below is not - * enough, we need to handle the fault ourselves, while - * still holding the mmap_sem. - * - * ... and hb->lock. :-) --ANK + * We have to r/w *(int __user *)uaddr, and we have to modify it + * atomically. Therefore, if we continue to fault after get_user() + * below, we need to handle the fault ourselves, while still holding + * the mmap_sem. This can occur if the uaddr is under contention as + * we have to drop the mmap_sem in order to call get_user(). */ queue_unlock(&q, hb); @@ -1582,7 +1581,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, } ret = get_user(uval, uaddr); - if (!ret && (uval != -EFAULT)) + if (!ret) goto retry; if (to) @@ -1676,12 +1675,11 @@ out: pi_faulted: /* - * We have to r/w *(int __user *)uaddr, but we can't modify it - * non-atomically. Therefore, if get_user below is not - * enough, we need to handle the fault ourselves, while - * still holding the mmap_sem. - * - * ... and hb->lock. --ANK + * We have to r/w *(int __user *)uaddr, and we have to modify it + * atomically. Therefore, if we continue to fault after get_user() + * below, we need to handle the fault ourselves, while still holding + * the mmap_sem. This can occur if the uaddr is under contention as + * we have to drop the mmap_sem in order to call get_user(). */ spin_unlock(&hb->lock); @@ -1694,7 +1692,7 @@ pi_faulted: } ret = get_user(uval, uaddr); - if (!ret && (uval != -EFAULT)) + if (!ret) goto retry; return ret; -- cgit v1.2.3-70-g09d2 From 213cc060797378059a28ebc5c539f3e9a80160bd Mon Sep 17 00:00:00 2001 From: Pekka J Enberg Date: Fri, 19 Dec 2008 12:08:39 +0200 Subject: ftrace: introduce tracing_reset_online_cpus() helper Impact: cleanup This patch factors out common code from multiple tracers into a tracing_reset_online_cpus() function and converts the tracers to use it. Signed-off-by: Pekka Enberg Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 10 ++++++++++ kernel/trace/trace.h | 1 + kernel/trace/trace_boot.c | 12 +----------- kernel/trace/trace_functions.c | 14 ++------------ kernel/trace/trace_hw_branches.c | 14 ++------------ kernel/trace/trace_mmiotrace.c | 6 +----- kernel/trace/trace_sched_switch.c | 14 ++------------ kernel/trace/trace_sysprof.c | 12 +----------- 8 files changed, 20 insertions(+), 63 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0eb6d48347f..79db26e8216 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -679,6 +679,16 @@ void tracing_reset(struct trace_array *tr, int cpu) ftrace_enable_cpu(); } +void tracing_reset_online_cpus(struct trace_array *tr) +{ + int cpu; + + tr->time_start = ftrace_now(tr->cpu); + + for_each_online_cpu(cpu) + tracing_reset(tr, cpu); +} + #define SAVED_CMDLINES 128 static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; static unsigned map_cmdline_to_pid[SAVED_CMDLINES]; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index fc75dce7a66..cc7a4f86403 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -374,6 +374,7 @@ struct trace_iterator { int tracing_is_enabled(void); void trace_wake_up(void); void tracing_reset(struct trace_array *tr, int cpu); +void tracing_reset_online_cpus(struct trace_array *tr); int tracing_open_generic(struct inode *inode, struct file *filp); struct dentry *tracing_init_dentry(void); void init_tracer_sysprof_debugfs(struct dentry *d_tracer); diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index a4fa2c57e34..3ccebde2848 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -37,16 +37,6 @@ void disable_boot_trace(void) tracing_stop_sched_switch_record(); } -static void reset_boot_trace(struct trace_array *tr) -{ - int cpu; - - tr->time_start = ftrace_now(tr->cpu); - - for_each_online_cpu(cpu) - tracing_reset(tr, cpu); -} - static int boot_trace_init(struct trace_array *tr) { int cpu; @@ -130,7 +120,7 @@ struct tracer boot_tracer __read_mostly = { .name = "initcall", .init = boot_trace_init, - .reset = reset_boot_trace, + .reset = tracing_reset_online_cpus, .print_line = initcall_print_line, }; diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index e74f6d0a321..9236d7e25a1 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -16,20 +16,10 @@ #include "trace.h" -static void function_reset(struct trace_array *tr) -{ - int cpu; - - tr->time_start = ftrace_now(tr->cpu); - - for_each_online_cpu(cpu) - tracing_reset(tr, cpu); -} - static void start_function_trace(struct trace_array *tr) { tr->cpu = get_cpu(); - function_reset(tr); + tracing_reset_online_cpus(tr); put_cpu(); tracing_start_cmdline_record(); @@ -55,7 +45,7 @@ static void function_trace_reset(struct trace_array *tr) static void function_trace_start(struct trace_array *tr) { - function_reset(tr); + tracing_reset_online_cpus(tr); } static struct tracer function_trace __read_mostly = diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index ee29e012aa9..b6a3e20a49a 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -25,16 +25,6 @@ static DEFINE_PER_CPU(unsigned char[SIZEOF_BTS], buffer); #define this_buffer per_cpu(buffer, smp_processor_id()) -static void bts_trace_reset(struct trace_array *tr) -{ - int cpu; - - tr->time_start = ftrace_now(tr->cpu); - - for_each_online_cpu(cpu) - tracing_reset(tr, cpu); -} - static void bts_trace_start_cpu(void *arg) { if (this_tracer) @@ -54,7 +44,7 @@ static void bts_trace_start(struct trace_array *tr) { int cpu; - bts_trace_reset(tr); + tracing_reset_online_cpus(tr); for_each_cpu_mask(cpu, cpu_possible_map) smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1); @@ -78,7 +68,7 @@ static void bts_trace_stop(struct trace_array *tr) static int bts_trace_init(struct trace_array *tr) { - bts_trace_reset(tr); + tracing_reset_online_cpus(tr); bts_trace_start(tr); return 0; diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 2fb6da6523b..fffcb069f1d 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -22,14 +22,10 @@ static unsigned long prev_overruns; static void mmio_reset_data(struct trace_array *tr) { - int cpu; - overrun_detected = false; prev_overruns = 0; - tr->time_start = ftrace_now(tr->cpu); - for_each_online_cpu(cpu) - tracing_reset(tr, cpu); + tracing_reset_online_cpus(tr); } static int mmio_trace_init(struct trace_array *tr) diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 781d72ef873..add2c1fdae9 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -72,16 +72,6 @@ probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee) local_irq_restore(flags); } -static void sched_switch_reset(struct trace_array *tr) -{ - int cpu; - - tr->time_start = ftrace_now(tr->cpu); - - for_each_online_cpu(cpu) - tracing_reset(tr, cpu); -} - static int tracing_sched_register(void) { int ret; @@ -197,7 +187,7 @@ void tracing_sched_switch_assign_trace(struct trace_array *tr) static void start_sched_trace(struct trace_array *tr) { - sched_switch_reset(tr); + tracing_reset_online_cpus(tr); tracing_start_sched_switch_record(); } @@ -221,7 +211,7 @@ static void sched_switch_trace_reset(struct trace_array *tr) static void sched_switch_trace_start(struct trace_array *tr) { - sched_switch_reset(tr); + tracing_reset_online_cpus(tr); tracing_start_sched_switch(); } diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index 54960edb96d..01becf1f19f 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -234,20 +234,10 @@ static void stop_stack_timers(void) stop_stack_timer(cpu); } -static void stack_reset(struct trace_array *tr) -{ - int cpu; - - tr->time_start = ftrace_now(tr->cpu); - - for_each_online_cpu(cpu) - tracing_reset(tr, cpu); -} - static void start_stack_trace(struct trace_array *tr) { mutex_lock(&sample_timer_lock); - stack_reset(tr); + tracing_reset_online_cpus(tr); start_stack_timers(); tracer_enabled = 1; mutex_unlock(&sample_timer_lock); -- cgit v1.2.3-70-g09d2 From 9bb482476c6c9d1ae033306440c51ceac93ea80c Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 16 Dec 2008 11:30:08 +0000 Subject: allow stripping of generated symbols under CONFIG_KALLSYMS_ALL Building upon parts of the module stripping patch, this patch introduces similar stripping for vmlinux when CONFIG_KALLSYMS_ALL=y. Using CONFIG_KALLSYMS_STRIP_GENERATED reduces the overhead of CONFIG_KALLSYMS_ALL from 245k/310k to 65k/80k for the (i386/x86-64) kernels I tested with. The patch also does away with the need to special case the kallsyms- internal symbols by making them available even in the first linking stage. While it is a generated file, the patch includes the changes to scripts/genksyms/keywords.c_shipped, as I'm unsure what the procedure here is. Signed-off-by: Jan Beulich Signed-off-by: Sam Ravnborg --- Makefile | 47 ++++++--- arch/x86/scripts/strip-symbols | 1 + init/Kconfig | 7 ++ kernel/kallsyms.c | 16 ++- scripts/genksyms/keywords.c_shipped | 189 ++++++++++++++++++------------------ scripts/genksyms/keywords.gperf | 2 + scripts/kallsyms.c | 21 ++-- 7 files changed, 155 insertions(+), 128 deletions(-) create mode 100644 arch/x86/scripts/strip-symbols (limited to 'kernel') diff --git a/Makefile b/Makefile index 5dd0ed3b12c..b3d1c8f1f4c 100644 --- a/Makefile +++ b/Makefile @@ -604,6 +604,9 @@ export INSTALL_PATH ?= /boot MODLIB = $(INSTALL_MOD_PATH)/lib/modules/$(KERNELRELEASE) export MODLIB +strip-symbols := $(srctree)/scripts/strip-symbols \ + $(wildcard $(srctree)/arch/$(ARCH)/scripts/strip-symbols) + # # INSTALL_MOD_STRIP, if defined, will cause modules to be stripped while # they get installed. If INSTALL_MOD_STRIP is '1', then the default @@ -611,8 +614,10 @@ export MODLIB # be used as the option(s) to the objcopy command. ifdef INSTALL_MOD_STRIP ifeq ($(INSTALL_MOD_STRIP),1) -mod_strip_cmd = $(OBJCOPY) --strip-debug --strip-symbols \ - $(srctree)/scripts/strip-symbols --wildcard +mod_strip_cmd = $(OBJCOPY) --strip-debug +ifeq ($(CONFIG_KALLSYMS_ALL),$(CONFIG_KALLSYMS_STRIP_GENERATED)) +mod_strip_cmd += --wildcard $(addprefix --strip-symbols ,$(strip-symbols)) +endif else mod_strip_cmd = $(OBJCOPY) $(INSTALL_MOD_STRIP) endif # INSTALL_MOD_STRIP=1 @@ -747,6 +752,7 @@ last_kallsyms := 2 endif kallsyms.o := .tmp_kallsyms$(last_kallsyms).o +kallsyms.h := $(wildcard include/config/kallsyms/*.h) $(wildcard include/config/kallsyms/*/*.h) define verify_kallsyms $(Q)$(if $($(quiet)cmd_sysmap), \ @@ -771,24 +777,41 @@ endef # Generate .S file with all kernel symbols quiet_cmd_kallsyms = KSYM $@ - cmd_kallsyms = $(NM) -n $< | $(KALLSYMS) \ - $(if $(CONFIG_KALLSYMS_ALL),--all-symbols) > $@ + cmd_kallsyms = { test $* -eq 0 || $(NM) -n $<; } \ + | $(KALLSYMS) $(if $(CONFIG_KALLSYMS_ALL),--all-symbols) >$@ -.tmp_kallsyms1.o .tmp_kallsyms2.o .tmp_kallsyms3.o: %.o: %.S scripts FORCE +quiet_cmd_kstrip = STRIP $@ + cmd_kstrip = $(OBJCOPY) --wildcard $(addprefix --strip$(if $(CONFIG_RELOCATABLE),-unneeded)-symbols ,$(filter %/scripts/strip-symbols,$^)) $< $@ + +$(foreach n,0 1 2 3,.tmp_kallsyms$(n).o): KBUILD_AFLAGS += -Wa,--strip-local-absolute +$(foreach n,0 1 2 3,.tmp_kallsyms$(n).o): %.o: %.S scripts FORCE $(call if_changed_dep,as_o_S) -.tmp_kallsyms%.S: .tmp_vmlinux% $(KALLSYMS) +ifeq ($(CONFIG_KALLSYMS_STRIP_GENERATED),y) +strip-ext := .stripped +endif + +.tmp_kallsyms%.S: .tmp_vmlinux%$(strip-ext) $(KALLSYMS) $(kallsyms.h) $(call cmd,kallsyms) +# make -jN seems to have problems with intermediate files, see bug #3330. +.SECONDARY: $(foreach n,1 2 3,.tmp_vmlinux$(n).stripped) +.tmp_vmlinux%.stripped: .tmp_vmlinux% $(strip-symbols) $(kallsyms.h) + $(call cmd,kstrip) + +ifneq ($(CONFIG_DEBUG_INFO),y) +.tmp_vmlinux%: LDFLAGS_vmlinux += -S +endif # .tmp_vmlinux1 must be complete except kallsyms, so update vmlinux version -.tmp_vmlinux1: $(vmlinux-lds) $(vmlinux-all) FORCE - $(call if_changed_rule,ksym_ld) +.tmp_vmlinux%: $(vmlinux-lds) $(vmlinux-all) FORCE + $(if $(filter 1,$*),$(call if_changed_rule,ksym_ld),$(call if_changed,vmlinux__)) -.tmp_vmlinux2: $(vmlinux-lds) $(vmlinux-all) .tmp_kallsyms1.o FORCE - $(call if_changed,vmlinux__) +.tmp_vmlinux0$(strip-ext): + $(Q)echo "placeholder" >$@ -.tmp_vmlinux3: $(vmlinux-lds) $(vmlinux-all) .tmp_kallsyms2.o FORCE - $(call if_changed,vmlinux__) +.tmp_vmlinux1: .tmp_kallsyms0.o +.tmp_vmlinux2: .tmp_kallsyms1.o +.tmp_vmlinux3: .tmp_kallsyms2.o # Needs to visit scripts/ before $(KALLSYMS) can be used. $(KALLSYMS): scripts ; diff --git a/arch/x86/scripts/strip-symbols b/arch/x86/scripts/strip-symbols new file mode 100644 index 00000000000..a2f1ccb827c --- /dev/null +++ b/arch/x86/scripts/strip-symbols @@ -0,0 +1 @@ +__cpu_vendor_dev_X86_VENDOR_* diff --git a/init/Kconfig b/init/Kconfig index f763762d544..0f5af409fef 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -588,6 +588,13 @@ config KALLSYMS_ALL Say N. +config KALLSYMS_STRIP_GENERATED + bool "Strip machine generated symbols from kallsyms" + depends on KALLSYMS_ALL + default y + help + Say N if you want kallsyms to retain even machine generated symbols. + config KALLSYMS_EXTRA_PASS bool "Do an extra kallsyms pass" depends on KALLSYMS diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 7b8b0f21a5b..e694afa0eb8 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -30,20 +30,19 @@ #define all_var 0 #endif -/* These will be re-linked against their real values during the second link stage */ -extern const unsigned long kallsyms_addresses[] __attribute__((weak)); -extern const u8 kallsyms_names[] __attribute__((weak)); +extern const unsigned long kallsyms_addresses[]; +extern const u8 kallsyms_names[]; /* tell the compiler that the count isn't in the small data section if the arch * has one (eg: FRV) */ extern const unsigned long kallsyms_num_syms -__attribute__((weak, section(".rodata"))); + __attribute__((__section__(".rodata"))); -extern const u8 kallsyms_token_table[] __attribute__((weak)); -extern const u16 kallsyms_token_index[] __attribute__((weak)); +extern const u8 kallsyms_token_table[]; +extern const u16 kallsyms_token_index[]; -extern const unsigned long kallsyms_markers[] __attribute__((weak)); +extern const unsigned long kallsyms_markers[]; static inline int is_kernel_inittext(unsigned long addr) { @@ -168,9 +167,6 @@ static unsigned long get_symbol_pos(unsigned long addr, unsigned long symbol_start = 0, symbol_end = 0; unsigned long i, low, high, mid; - /* This kernel should never had been booted. */ - BUG_ON(!kallsyms_addresses); - /* do a binary search on the sorted kallsyms_addresses array */ low = 0; high = kallsyms_num_syms; diff --git a/scripts/genksyms/keywords.c_shipped b/scripts/genksyms/keywords.c_shipped index 971e0113ae7..83484fe93ed 100644 --- a/scripts/genksyms/keywords.c_shipped +++ b/scripts/genksyms/keywords.c_shipped @@ -1,4 +1,4 @@ -/* ANSI-C code produced by gperf version 3.0.2 */ +/* ANSI-C code produced by gperf version 3.0.1 */ /* Command-line: gperf -L ANSI-C -a -C -E -g -H is_reserved_hash -k '1,3,$' -N is_reserved_word -p -t scripts/genksyms/keywords.gperf */ #if !((' ' == 32) && ('!' == 33) && ('"' == 34) && ('#' == 35) \ @@ -32,7 +32,7 @@ #line 3 "scripts/genksyms/keywords.gperf" struct resword { const char *name; int token; }; -/* maximum key range = 62, duplicates = 0 */ +/* maximum key range = 64, duplicates = 0 */ #ifdef __GNUC__ __inline @@ -46,32 +46,32 @@ is_reserved_hash (register const char *str, register unsigned int len) { static const unsigned char asso_values[] = { - 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, - 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, - 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, - 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, - 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, - 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, - 65, 65, 65, 65, 65, 65, 65, 65, 65, 5, - 65, 65, 65, 65, 65, 65, 35, 65, 65, 65, - 0, 65, 65, 65, 65, 65, 65, 65, 65, 65, - 65, 65, 65, 65, 65, 0, 65, 0, 65, 5, - 20, 15, 10, 30, 65, 15, 65, 65, 20, 0, - 10, 35, 20, 65, 10, 5, 0, 10, 5, 65, - 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, - 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, - 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, - 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, - 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, - 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, - 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, - 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, - 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, - 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, - 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, - 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, - 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, - 65, 65, 65, 65, 65, 65 + 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, 67, 0, + 67, 67, 67, 67, 67, 67, 15, 67, 67, 67, + 0, 67, 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 0, 67, 0, 67, 5, + 25, 20, 15, 30, 67, 15, 67, 67, 10, 0, + 10, 40, 20, 67, 10, 5, 0, 10, 15, 67, + 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67 }; return len + asso_values[(unsigned char)str[2]] + asso_values[(unsigned char)str[0]] + asso_values[(unsigned char)str[len - 1]]; } @@ -84,116 +84,119 @@ is_reserved_word (register const char *str, register unsigned int len) { enum { - TOTAL_KEYWORDS = 43, + TOTAL_KEYWORDS = 45, MIN_WORD_LENGTH = 3, MAX_WORD_LENGTH = 24, MIN_HASH_VALUE = 3, - MAX_HASH_VALUE = 64 + MAX_HASH_VALUE = 66 }; static const struct resword wordlist[] = { {""}, {""}, {""}, -#line 26 "scripts/genksyms/keywords.gperf" +#line 28 "scripts/genksyms/keywords.gperf" {"asm", ASM_KEYW}, {""}, -#line 8 "scripts/genksyms/keywords.gperf" +#line 10 "scripts/genksyms/keywords.gperf" {"__asm", ASM_KEYW}, {""}, -#line 9 "scripts/genksyms/keywords.gperf" +#line 11 "scripts/genksyms/keywords.gperf" {"__asm__", ASM_KEYW}, {""}, {""}, -#line 52 "scripts/genksyms/keywords.gperf" +#line 54 "scripts/genksyms/keywords.gperf" {"__typeof__", TYPEOF_KEYW}, {""}, -#line 12 "scripts/genksyms/keywords.gperf" +#line 14 "scripts/genksyms/keywords.gperf" {"__const", CONST_KEYW}, -#line 11 "scripts/genksyms/keywords.gperf" - {"__attribute__", ATTRIBUTE_KEYW}, #line 13 "scripts/genksyms/keywords.gperf" + {"__attribute__", ATTRIBUTE_KEYW}, +#line 15 "scripts/genksyms/keywords.gperf" {"__const__", CONST_KEYW}, -#line 18 "scripts/genksyms/keywords.gperf" +#line 20 "scripts/genksyms/keywords.gperf" {"__signed__", SIGNED_KEYW}, -#line 44 "scripts/genksyms/keywords.gperf" +#line 46 "scripts/genksyms/keywords.gperf" {"static", STATIC_KEYW}, -#line 20 "scripts/genksyms/keywords.gperf" - {"__volatile__", VOLATILE_KEYW}, -#line 39 "scripts/genksyms/keywords.gperf" + {""}, +#line 41 "scripts/genksyms/keywords.gperf" {"int", INT_KEYW}, -#line 32 "scripts/genksyms/keywords.gperf" +#line 34 "scripts/genksyms/keywords.gperf" {"char", CHAR_KEYW}, -#line 33 "scripts/genksyms/keywords.gperf" +#line 35 "scripts/genksyms/keywords.gperf" {"const", CONST_KEYW}, -#line 45 "scripts/genksyms/keywords.gperf" +#line 47 "scripts/genksyms/keywords.gperf" {"struct", STRUCT_KEYW}, -#line 24 "scripts/genksyms/keywords.gperf" +#line 26 "scripts/genksyms/keywords.gperf" {"__restrict__", RESTRICT_KEYW}, -#line 25 "scripts/genksyms/keywords.gperf" +#line 27 "scripts/genksyms/keywords.gperf" {"restrict", RESTRICT_KEYW}, -#line 23 "scripts/genksyms/keywords.gperf" - {"_restrict", RESTRICT_KEYW}, -#line 16 "scripts/genksyms/keywords.gperf" +#line 7 "scripts/genksyms/keywords.gperf" + {"EXPORT_SYMBOL_GPL_FUTURE", EXPORT_SYMBOL_KEYW}, +#line 18 "scripts/genksyms/keywords.gperf" {"__inline__", INLINE_KEYW}, -#line 10 "scripts/genksyms/keywords.gperf" - {"__attribute", ATTRIBUTE_KEYW}, {""}, -#line 14 "scripts/genksyms/keywords.gperf" +#line 22 "scripts/genksyms/keywords.gperf" + {"__volatile__", VOLATILE_KEYW}, +#line 5 "scripts/genksyms/keywords.gperf" + {"EXPORT_SYMBOL", EXPORT_SYMBOL_KEYW}, +#line 25 "scripts/genksyms/keywords.gperf" + {"_restrict", RESTRICT_KEYW}, + {""}, +#line 12 "scripts/genksyms/keywords.gperf" + {"__attribute", ATTRIBUTE_KEYW}, +#line 6 "scripts/genksyms/keywords.gperf" + {"EXPORT_SYMBOL_GPL", EXPORT_SYMBOL_KEYW}, +#line 16 "scripts/genksyms/keywords.gperf" {"__extension__", EXTENSION_KEYW}, -#line 35 "scripts/genksyms/keywords.gperf" +#line 37 "scripts/genksyms/keywords.gperf" {"enum", ENUM_KEYW}, -#line 19 "scripts/genksyms/keywords.gperf" - {"__volatile", VOLATILE_KEYW}, -#line 36 "scripts/genksyms/keywords.gperf" +#line 8 "scripts/genksyms/keywords.gperf" + {"EXPORT_UNUSED_SYMBOL", EXPORT_SYMBOL_KEYW}, +#line 38 "scripts/genksyms/keywords.gperf" {"extern", EXTERN_KEYW}, {""}, -#line 17 "scripts/genksyms/keywords.gperf" +#line 19 "scripts/genksyms/keywords.gperf" {"__signed", SIGNED_KEYW}, -#line 7 "scripts/genksyms/keywords.gperf" - {"EXPORT_SYMBOL_GPL_FUTURE", EXPORT_SYMBOL_KEYW}, - {""}, -#line 51 "scripts/genksyms/keywords.gperf" +#line 9 "scripts/genksyms/keywords.gperf" + {"EXPORT_UNUSED_SYMBOL_GPL", EXPORT_SYMBOL_KEYW}, +#line 49 "scripts/genksyms/keywords.gperf" + {"union", UNION_KEYW}, +#line 53 "scripts/genksyms/keywords.gperf" {"typeof", TYPEOF_KEYW}, -#line 46 "scripts/genksyms/keywords.gperf" +#line 48 "scripts/genksyms/keywords.gperf" {"typedef", TYPEDEF_KEYW}, -#line 15 "scripts/genksyms/keywords.gperf" +#line 17 "scripts/genksyms/keywords.gperf" {"__inline", INLINE_KEYW}, -#line 31 "scripts/genksyms/keywords.gperf" +#line 33 "scripts/genksyms/keywords.gperf" {"auto", AUTO_KEYW}, -#line 47 "scripts/genksyms/keywords.gperf" - {"union", UNION_KEYW}, - {""}, {""}, -#line 48 "scripts/genksyms/keywords.gperf" - {"unsigned", UNSIGNED_KEYW}, -#line 49 "scripts/genksyms/keywords.gperf" - {"void", VOID_KEYW}, -#line 42 "scripts/genksyms/keywords.gperf" - {"short", SHORT_KEYW}, +#line 21 "scripts/genksyms/keywords.gperf" + {"__volatile", VOLATILE_KEYW}, {""}, {""}, #line 50 "scripts/genksyms/keywords.gperf" - {"volatile", VOLATILE_KEYW}, - {""}, -#line 37 "scripts/genksyms/keywords.gperf" - {"float", FLOAT_KEYW}, -#line 34 "scripts/genksyms/keywords.gperf" - {"double", DOUBLE_KEYW}, + {"unsigned", UNSIGNED_KEYW}, {""}, -#line 5 "scripts/genksyms/keywords.gperf" - {"EXPORT_SYMBOL", EXPORT_SYMBOL_KEYW}, - {""}, {""}, -#line 38 "scripts/genksyms/keywords.gperf" +#line 44 "scripts/genksyms/keywords.gperf" + {"short", SHORT_KEYW}, +#line 40 "scripts/genksyms/keywords.gperf" {"inline", INLINE_KEYW}, -#line 6 "scripts/genksyms/keywords.gperf" - {"EXPORT_SYMBOL_GPL", EXPORT_SYMBOL_KEYW}, -#line 41 "scripts/genksyms/keywords.gperf" - {"register", REGISTER_KEYW}, {""}, -#line 22 "scripts/genksyms/keywords.gperf" +#line 52 "scripts/genksyms/keywords.gperf" + {"volatile", VOLATILE_KEYW}, +#line 42 "scripts/genksyms/keywords.gperf" + {"long", LONG_KEYW}, +#line 24 "scripts/genksyms/keywords.gperf" {"_Bool", BOOL_KEYW}, -#line 43 "scripts/genksyms/keywords.gperf" - {"signed", SIGNED_KEYW}, {""}, {""}, -#line 40 "scripts/genksyms/keywords.gperf" - {"long", LONG_KEYW} +#line 43 "scripts/genksyms/keywords.gperf" + {"register", REGISTER_KEYW}, +#line 51 "scripts/genksyms/keywords.gperf" + {"void", VOID_KEYW}, +#line 39 "scripts/genksyms/keywords.gperf" + {"float", FLOAT_KEYW}, +#line 36 "scripts/genksyms/keywords.gperf" + {"double", DOUBLE_KEYW}, + {""}, {""}, {""}, {""}, +#line 45 "scripts/genksyms/keywords.gperf" + {"signed", SIGNED_KEYW} }; if (len <= MAX_WORD_LENGTH && len >= MIN_WORD_LENGTH) diff --git a/scripts/genksyms/keywords.gperf b/scripts/genksyms/keywords.gperf index 5ef3733225f..8abe7ab8d88 100644 --- a/scripts/genksyms/keywords.gperf +++ b/scripts/genksyms/keywords.gperf @@ -5,6 +5,8 @@ struct resword { const char *name; int token; } EXPORT_SYMBOL, EXPORT_SYMBOL_KEYW EXPORT_SYMBOL_GPL, EXPORT_SYMBOL_KEYW EXPORT_SYMBOL_GPL_FUTURE, EXPORT_SYMBOL_KEYW +EXPORT_UNUSED_SYMBOL, EXPORT_SYMBOL_KEYW +EXPORT_UNUSED_SYMBOL_GPL, EXPORT_SYMBOL_KEYW __asm, ASM_KEYW __asm__, ASM_KEYW __attribute, ATTRIBUTE_KEYW diff --git a/scripts/kallsyms.c b/scripts/kallsyms.c index ad2434b2697..92758120a76 100644 --- a/scripts/kallsyms.c +++ b/scripts/kallsyms.c @@ -130,18 +130,9 @@ static int read_symbol(FILE *in, struct sym_entry *s) static int symbol_valid(struct sym_entry *s) { /* Symbols which vary between passes. Passes 1 and 2 must have - * identical symbol lists. The kallsyms_* symbols below are only added - * after pass 1, they would be included in pass 2 when --all-symbols is - * specified so exclude them to get a stable symbol list. + * identical symbol lists. */ static char *special_symbols[] = { - "kallsyms_addresses", - "kallsyms_num_syms", - "kallsyms_names", - "kallsyms_markers", - "kallsyms_token_table", - "kallsyms_token_index", - /* Exclude linker generated symbols which vary between passes */ "_SDA_BASE_", /* ppc */ "_SDA2_BASE_", /* ppc */ @@ -173,7 +164,9 @@ static int symbol_valid(struct sym_entry *s) } /* Exclude symbols which vary between passes. */ - if (strstr((char *)s->sym + offset, "_compiled.")) + if (strstr((char *)s->sym + offset, "_compiled.") || + strncmp((char*)s->sym + offset, "__compound_literal.", 19) == 0 || + strncmp((char*)s->sym + offset, "__compound_literal$", 19) == 0) return 0; for (i = 0; special_symbols[i]; i++) @@ -550,8 +543,10 @@ int main(int argc, char **argv) usage(); read_map(stdin); - sort_symbols(); - optimize_token_table(); + if (table_cnt) { + sort_symbols(); + optimize_token_table(); + } write_src(); return 0; -- cgit v1.2.3-70-g09d2 From 26cc271db798cf211d35967cbfbb53e997126b84 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Fri, 19 Dec 2008 10:23:03 -0800 Subject: printk: fix discarding message when recursion_bug Impact: fix truncated recursion bug message printout When recursion_bug is true, kernel discards original message because printk_buf contains recursion_bug_msg with NULL terminator. The sizeof(recursion_bug_msg) makes this, use strlen() to get correct length without NULL terminator. Reported-by: Toshikazu Nakayama Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- kernel/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index f492f1583d7..e651ab05655 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -662,7 +662,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) if (recursion_bug) { recursion_bug = 0; strcpy(printk_buf, recursion_bug_msg); - printed_len = sizeof(recursion_bug_msg); + printed_len = strlen(recursion_bug_msg); } /* Emit the output into the temporary buffer */ printed_len += vscnprintf(printk_buf + printed_len, -- cgit v1.2.3-70-g09d2 From b909895739427874c089bc0e03dc119f99cab2dd Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 19 Dec 2008 13:48:34 -0800 Subject: sparseirq: fix numa_migrate_irq_desc dependency and comments Impact: reduce kconfig variable scope and clean up Bartlomiej pointed out that the config dependencies and comments are not right. update it depend to NUMA, and fix some comments Reported-by: Bartlomiej Zolnierkiewicz Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 2 +- arch/x86/kernel/io_apic.c | 2 +- kernel/irq/numa_migrate.c | 11 +++-------- 3 files changed, 5 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 60a008857a3..5c243826334 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -250,7 +250,7 @@ config SPARSE_IRQ config NUMA_MIGRATE_IRQ_DESC bool "Move irq desc when changing irq smp_affinity" - depends on SPARSE_IRQ && SMP + depends on SPARSE_IRQ && NUMA default n help This enables moving irq_desc to cpu/node that irq will use handled. diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index bfe1245b1a3..a74887b416c 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -2471,7 +2471,7 @@ static void irq_complete_move(struct irq_desc **descp) if (likely(!cfg->move_desc_pending)) return; - /* domain is not change, but affinity is changed */ + /* domain has not changed, but affinity did */ me = smp_processor_id(); if (cpu_isset(me, desc->affinity)) { *descp = desc = move_irq_desc(desc, me); diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c index 0178e229699..089c3746358 100644 --- a/kernel/irq/numa_migrate.c +++ b/kernel/irq/numa_migrate.c @@ -1,13 +1,8 @@ /* - * linux/kernel/irq/handle.c - * - * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar - * Copyright (C) 2005-2006, Thomas Gleixner, Russell King - * - * This file contains the core interrupt handling code. - * - * Detailed information is available in Documentation/DocBook/genericirq + * NUMA irq-desc migration code * + * Migrate IRQ data structures (irq_desc, chip_data, etc.) over to + * the new "home node" of the IRQ. */ #include -- cgit v1.2.3-70-g09d2 From bf53de907dfdaac178c92d774aae7370d7b97d20 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 19 Dec 2008 15:10:24 +0100 Subject: x86, bts: add fork and exit handling Impact: introduce new ptrace facility Add arch_ptrace_untrace() function that is called when the tracer detaches (either voluntarily or when the tracing task dies); ptrace_disable() is only called on a voluntary detach. Add ptrace_fork() and arch_ptrace_fork(). They are called when a traced task is forked. Clear DS and BTS related fields on fork. Release DS resources and reclaim memory in ptrace_untrace(). This releases resources already when the tracing task dies. We used to do that when the traced task dies. Signed-off-by: Markus Metzger Signed-off-by: Ingo Molnar --- arch/x86/include/asm/ds.h | 9 ++++++++ arch/x86/include/asm/ptrace.h | 7 ++++++ arch/x86/kernel/ds.c | 11 ++++++++++ arch/x86/kernel/process_32.c | 20 ++++++++--------- arch/x86/kernel/process_64.c | 20 ++++++++--------- arch/x86/kernel/ptrace.c | 50 ++++++++++++++++++++++++++++++++++--------- include/linux/ptrace.h | 22 +++++++++++++++++++ kernel/fork.c | 2 ++ kernel/ptrace.c | 12 +++++++++++ 9 files changed, 121 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/arch/x86/include/asm/ds.h b/arch/x86/include/asm/ds.h index ee0ea3a96c1..a8f672ba100 100644 --- a/arch/x86/include/asm/ds.h +++ b/arch/x86/include/asm/ds.h @@ -252,12 +252,21 @@ extern void __cpuinit ds_init_intel(struct cpuinfo_x86 *); */ extern void ds_switch_to(struct task_struct *prev, struct task_struct *next); +/* + * Task clone/init and cleanup work + */ +extern void ds_copy_thread(struct task_struct *tsk, struct task_struct *father); +extern void ds_exit_thread(struct task_struct *tsk); + #else /* CONFIG_X86_DS */ struct cpuinfo_x86; static inline void __cpuinit ds_init_intel(struct cpuinfo_x86 *ignored) {} static inline void ds_switch_to(struct task_struct *prev, struct task_struct *next) {} +static inline void ds_copy_thread(struct task_struct *tsk, + struct task_struct *father) {} +static inline void ds_exit_thread(struct task_struct *tsk) {} #endif /* CONFIG_X86_DS */ #endif /* _ASM_X86_DS_H */ diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index fbf74421591..6d34d954c22 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -235,6 +235,13 @@ extern int do_get_thread_area(struct task_struct *p, int idx, extern int do_set_thread_area(struct task_struct *p, int idx, struct user_desc __user *info, int can_allocate); +extern void x86_ptrace_untrace(struct task_struct *); +extern void x86_ptrace_fork(struct task_struct *child, + unsigned long clone_flags); + +#define arch_ptrace_untrace(tsk) x86_ptrace_untrace(tsk) +#define arch_ptrace_fork(child, flags) x86_ptrace_fork(child, flags) + #endif /* __KERNEL__ */ #endif /* !__ASSEMBLY__ */ diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 98d271e60e0..da91701a234 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -1017,3 +1017,14 @@ void ds_switch_to(struct task_struct *prev, struct task_struct *next) update_debugctlmsr(next->thread.debugctlmsr); } + +void ds_copy_thread(struct task_struct *tsk, struct task_struct *father) +{ + clear_tsk_thread_flag(tsk, TIF_DS_AREA_MSR); + tsk->thread.ds_ctx = NULL; +} + +void ds_exit_thread(struct task_struct *tsk) +{ + WARN_ON(tsk->thread.ds_ctx); +} diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 605eff9a8ac..3ba155d2488 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -60,6 +60,7 @@ #include #include #include +#include asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); @@ -251,17 +252,8 @@ void exit_thread(void) tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET; put_cpu(); } -#ifdef CONFIG_X86_DS - /* Free any BTS tracers that have not been properly released. */ - if (unlikely(current->bts)) { - ds_release_bts(current->bts); - current->bts = NULL; - - kfree(current->bts_buffer); - current->bts_buffer = NULL; - current->bts_size = 0; - } -#endif /* CONFIG_X86_DS */ + + ds_exit_thread(current); } void flush_thread(void) @@ -343,6 +335,12 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, kfree(p->thread.io_bitmap_ptr); p->thread.io_bitmap_max = 0; } + + ds_copy_thread(p, current); + + clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR); + p->thread.debugctlmsr = 0; + return err; } diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 1cfd2a4bf85..416fb9282f4 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -53,6 +53,7 @@ #include #include #include +#include asmlinkage extern void ret_from_fork(void); @@ -236,17 +237,8 @@ void exit_thread(void) t->io_bitmap_max = 0; put_cpu(); } -#ifdef CONFIG_X86_DS - /* Free any BTS tracers that have not been properly released. */ - if (unlikely(current->bts)) { - ds_release_bts(current->bts); - current->bts = NULL; - - kfree(current->bts_buffer); - current->bts_buffer = NULL; - current->bts_size = 0; - } -#endif /* CONFIG_X86_DS */ + + ds_exit_thread(current); } void flush_thread(void) @@ -376,6 +368,12 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, if (err) goto out; } + + ds_copy_thread(p, me); + + clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR); + p->thread.debugctlmsr = 0; + err = 0; out: if (err && p->thread.io_bitmap_ptr) { diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 45e9855da2d..6ad2bb60765 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -769,8 +769,47 @@ static int ptrace_bts_size(struct task_struct *child) return (trace->ds.top - trace->ds.begin) / trace->ds.size; } + +static void ptrace_bts_fork(struct task_struct *tsk) +{ + tsk->bts = NULL; + tsk->bts_buffer = NULL; + tsk->bts_size = 0; + tsk->thread.bts_ovfl_signal = 0; +} + +static void ptrace_bts_untrace(struct task_struct *child) +{ + if (unlikely(child->bts)) { + ds_release_bts(child->bts); + child->bts = NULL; + + kfree(child->bts_buffer); + child->bts_buffer = NULL; + child->bts_size = 0; + } +} + +static void ptrace_bts_detach(struct task_struct *child) +{ + ptrace_bts_untrace(child); +} +#else +static inline void ptrace_bts_fork(struct task_struct *tsk) {} +static inline void ptrace_bts_detach(struct task_struct *child) {} +static inline void ptrace_bts_untrace(struct task_struct *child) {} #endif /* CONFIG_X86_PTRACE_BTS */ +void x86_ptrace_fork(struct task_struct *child, unsigned long clone_flags) +{ + ptrace_bts_fork(child); +} + +void x86_ptrace_untrace(struct task_struct *child) +{ + ptrace_bts_untrace(child); +} + /* * Called by kernel/ptrace.c when detaching.. * @@ -782,16 +821,7 @@ void ptrace_disable(struct task_struct *child) #ifdef TIF_SYSCALL_EMU clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); #endif -#ifdef CONFIG_X86_PTRACE_BTS - if (child->bts) { - ds_release_bts(child->bts); - child->bts = NULL; - - kfree(child->bts_buffer); - child->bts_buffer = NULL; - child->bts_size = 0; - } -#endif /* CONFIG_X86_PTRACE_BTS */ + ptrace_bts_detach(child); } #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 22641d5d45d..98b93ca4db0 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -94,6 +94,7 @@ extern void ptrace_notify(int exit_code); extern void __ptrace_link(struct task_struct *child, struct task_struct *new_parent); extern void __ptrace_unlink(struct task_struct *child); +extern void ptrace_fork(struct task_struct *task, unsigned long clone_flags); #define PTRACE_MODE_READ 1 #define PTRACE_MODE_ATTACH 2 /* Returns 0 on success, -errno on denial. */ @@ -313,6 +314,27 @@ static inline void user_enable_block_step(struct task_struct *task) #define arch_ptrace_stop(code, info) do { } while (0) #endif +#ifndef arch_ptrace_untrace +/* + * Do machine-specific work before untracing child. + * + * This is called for a normal detach as well as from ptrace_exit() + * when the tracing task dies. + * + * Called with write_lock(&tasklist_lock) held. + */ +#define arch_ptrace_untrace(task) do { } while (0) +#endif + +#ifndef arch_ptrace_fork +/* + * Do machine-specific work to initialize a new task. + * + * This is called from copy_process(). + */ +#define arch_ptrace_fork(child, clone_flags) do { } while (0) +#endif + extern int task_current_syscall(struct task_struct *target, long *callno, unsigned long args[6], unsigned int maxargs, unsigned long *sp, unsigned long *pc); diff --git a/kernel/fork.c b/kernel/fork.c index 7b93da72d4a..65ce60adc8e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1096,6 +1096,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, #ifdef CONFIG_DEBUG_MUTEXES p->blocked_on = NULL; /* not blocked yet */ #endif + if (unlikely(ptrace_reparented(current))) + ptrace_fork(p, clone_flags); /* Perform scheduler related setup. Assign this task to a CPU. */ sched_fork(p, clone_flags); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 4c8bcd7dd8e..100a71cfdab 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -25,6 +25,17 @@ #include #include + +/* + * Initialize a new task whose father had been ptraced. + * + * Called from copy_process(). + */ +void ptrace_fork(struct task_struct *child, unsigned long clone_flags) +{ + arch_ptrace_fork(child, clone_flags); +} + /* * ptrace a task: make the debugger its new parent and * move it to the ptrace list. @@ -72,6 +83,7 @@ void __ptrace_unlink(struct task_struct *child) child->parent = child->real_parent; list_del_init(&child->ptrace_entry); + arch_ptrace_untrace(child); if (task_is_traced(child)) ptrace_untrace(child); } -- cgit v1.2.3-70-g09d2 From 3d44cc3e01ee1b40317f79ed54324e25c4f848df Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 20 Dec 2008 21:27:34 +0100 Subject: Null pointer deref with hrtimer_try_to_cancel() Impact: Prevent kernel crash with posix timer clockid CLOCK_MONOTONIC_RAW commit 2d42244ae71d6c7b0884b5664cf2eda30fb2ae68 (clocksource: introduce CLOCK_MONOTONIC_RAW) introduced a new clockid, which is only available to read out the raw not NTP adjusted system time. The above commit did not prevent that a posix timer can be created with that clockid. The timer_create() syscall succeeds and initializes the timer to a non existing hrtimer base. When the timer is deleted either by timer_delete() or by the exit() cleanup the kernel crashes. Prevent the creation of timers for CLOCK_MONOTONIC_RAW by setting the posix clock function to no_timer_create which returns an error code. Reported-and-tested-by: Eric Sesterhenn Signed-off-by: Thomas Gleixner Acked-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- kernel/posix-timers.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 5e79c662294..a140e44eebb 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -197,6 +197,11 @@ static int common_timer_create(struct k_itimer *new_timer) return 0; } +static int no_timer_create(struct k_itimer *new_timer) +{ + return -EOPNOTSUPP; +} + /* * Return nonzero if we know a priori this clockid_t value is bogus. */ @@ -248,6 +253,7 @@ static __init int init_posix_timers(void) .clock_getres = hrtimer_get_res, .clock_get = posix_get_monotonic_raw, .clock_set = do_posix_clock_nosettime, + .timer_create = no_timer_create, }; register_posix_clock(CLOCK_REALTIME, &clock_realtime); -- cgit v1.2.3-70-g09d2 From a8ccf1d6f60e3e6ae63122e02378cd4d40dd4aac Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 23 Dec 2008 11:32:24 -0500 Subject: ring-buffer: fix dangling commit race Impact: fix stuck trace-buffers If an interrupt comes in during the rb_set_commit_to_write and pushes the tail page forward just at the right time, the commit updates will miss the adding of the interrupt data. This will cause the commit pointer to cease from moving forward. Thanks to Jiaying Zhang for finding this race. Reported-by: Jiaying Zhang Signed-off-by: Steven Rostedt Cc: Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index bb6922a931b..d03f4f44a82 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -838,6 +838,7 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) * back to us). This allows us to do a simple loop to * assign the commit to the tail. */ + again: while (cpu_buffer->commit_page != cpu_buffer->tail_page) { cpu_buffer->commit_page->page->commit = cpu_buffer->commit_page->write; @@ -853,6 +854,17 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->commit_page->write; barrier(); } + + /* again, keep gcc from optimizing */ + barrier(); + + /* + * If an interrupt came in just after the first while loop + * and pushed the tail page forward, we will be left with + * a dangling commit that will never go forward. + */ + if (unlikely(cpu_buffer->commit_page != cpu_buffer->tail_page)) + goto again; } static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer) -- cgit v1.2.3-70-g09d2 From 98db8df777438e16ad0f44a0fba05ebbdb73db8d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 23 Dec 2008 11:32:25 -0500 Subject: ring-buffer: prevent false positive warning Impact: eliminate false WARN_ON message If an interrupt goes off after the setting of the local variable tail_page and before incrementing the write index of that page, the interrupt could push the commit forward to the next page. Later a check is made to see if interrupts pushed the buffer around the entire ring buffer by comparing the next page to the last commited page. This can produce a false positive if the interrupt had pushed the commit page forward as stated above. Thanks to Jiaying Zhang for finding this race. Reported-by: Jiaying Zhang Signed-off-by: Steven Rostedt Cc: Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index d03f4f44a82..76f34c0ef29 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -962,12 +962,15 @@ static struct ring_buffer_event * __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, unsigned type, unsigned long length, u64 *ts) { - struct buffer_page *tail_page, *head_page, *reader_page; + struct buffer_page *tail_page, *head_page, *reader_page, *commit_page; unsigned long tail, write; struct ring_buffer *buffer = cpu_buffer->buffer; struct ring_buffer_event *event; unsigned long flags; + commit_page = cpu_buffer->commit_page; + /* we just need to protect against interrupts */ + barrier(); tail_page = cpu_buffer->tail_page; write = local_add_return(length, &tail_page->write); tail = write - length; @@ -993,7 +996,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, * it all the way around the buffer, bail, and warn * about it. */ - if (unlikely(next_page == cpu_buffer->commit_page)) { + if (unlikely(next_page == commit_page)) { WARN_ON_ONCE(1); goto out_unlock; } -- cgit v1.2.3-70-g09d2 From e368d3a836797ddf193b1ec18c97407a791d2451 Mon Sep 17 00:00:00 2001 From: Sharyathi Nagesh Date: Tue, 23 Dec 2008 13:57:12 -0800 Subject: cgroups: suppress bogus warning messages Remove spurious warning messages that are thrown onto the console during cgroup operations. Signed-off-by: Alexey Dobriyan Signed-off-by: Sharyathi Nagesh Acked-by: Serge E. Hallyn Cc: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 8185a0f0959..a3415507bd0 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2934,9 +2934,6 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys, again: root = subsys->root; if (root == &rootnode) { - printk(KERN_INFO - "Not cloning cgroup for unused subsystem %s\n", - subsys->name); mutex_unlock(&cgroup_mutex); return 0; } -- cgit v1.2.3-70-g09d2 From 20ca9b3f4c6dfa0af8dd5b18a64df17eb994b54d Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 23 Dec 2008 13:57:14 -0800 Subject: cgroups: avoid accessing uninitialized data in failure path If cgroup_get_rootdir() failed, free_cg_links() will be called in the failure path, but tmp_cg_links hasn't been initialized at that time. I introduced this bug in the 2.6.27 merge window. Signed-off-by: Li Zefan Acked-by: Serge Hallyn Cc: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a3415507bd0..2606d0fb4e5 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1024,7 +1024,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type, if (ret == -EBUSY) { mutex_unlock(&cgroup_mutex); mutex_unlock(&inode->i_mutex); - goto drop_new_super; + goto free_cg_links; } /* EBUSY should be the only error here */ @@ -1073,10 +1073,11 @@ static int cgroup_get_sb(struct file_system_type *fs_type, return simple_set_mnt(mnt, sb); + free_cg_links: + free_cg_links(&tmp_cg_links); drop_new_super: up_write(&sb->s_umount); deactivate_super(sb); - free_cg_links(&tmp_cg_links); return ret; } -- cgit v1.2.3-70-g09d2 From 12d79bafb75639f406a9f71aab94808c414c836e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 25 Dec 2008 09:31:28 +0100 Subject: rcu: provide RCU options on non-preempt architectures too Impact: build fix Some old architectures still do not use kernel/Kconfig.preempt, so the moving of the RCU options there broke their build: In file included from /home/mingo/tip/include/linux/sem.h:81, from /home/mingo/tip/include/linux/sched.h:69, from /home/mingo/tip/arch/alpha/kernel/asm-offsets.c:9: /home/mingo/tip/include/linux/rcupdate.h:62:2: error: #error "Unknown RCU implementation specified to kernel configuration" Move these options back to init/Kconfig, which every architecture includes. Signed-off-by: Ingo Molnar --- init/Kconfig | 74 +++++++++++++++++++++++++++++++++++++++++++++++++ kernel/Kconfig.preempt | 75 -------------------------------------------------- 2 files changed, 74 insertions(+), 75 deletions(-) (limited to 'kernel') diff --git a/init/Kconfig b/init/Kconfig index 9dd7958a71f..6b0fdedf359 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -928,6 +928,80 @@ source "block/Kconfig" config PREEMPT_NOTIFIERS bool +choice + prompt "RCU Implementation" + default CLASSIC_RCU + +config CLASSIC_RCU + bool "Classic RCU" + help + This option selects the classic RCU implementation that is + designed for best read-side performance on non-realtime + systems. + + Select this option if you are unsure. + +config TREE_RCU + bool "Tree-based hierarchical RCU" + help + This option selects the RCU implementation that is + designed for very large SMP system with hundreds or + thousands of CPUs. + +config PREEMPT_RCU + bool "Preemptible RCU" + depends on PREEMPT + help + This option reduces the latency of the kernel by making certain + RCU sections preemptible. Normally RCU code is non-preemptible, if + this option is selected then read-only RCU sections become + preemptible. This helps latency, but may expose bugs due to + now-naive assumptions about each RCU read-side critical section + remaining on a given CPU through its execution. + +endchoice + +config RCU_TRACE + bool "Enable tracing for RCU" + depends on TREE_RCU || PREEMPT_RCU + help + This option provides tracing in RCU which presents stats + in debugfs for debugging RCU implementation. + + Say Y here if you want to enable RCU tracing + Say N if you are unsure. + +config RCU_FANOUT + int "Tree-based hierarchical RCU fanout value" + range 2 64 if 64BIT + range 2 32 if !64BIT + depends on TREE_RCU + default 64 if 64BIT + default 32 if !64BIT + help + This option controls the fanout of hierarchical implementations + of RCU, allowing RCU to work efficiently on machines with + large numbers of CPUs. This value must be at least the cube + root of NR_CPUS, which allows NR_CPUS up to 32,768 for 32-bit + systems and up to 262,144 for 64-bit systems. + + Select a specific number if testing RCU itself. + Take the default if unsure. + +config RCU_FANOUT_EXACT + bool "Disable tree-based hierarchical RCU auto-balancing" + depends on TREE_RCU + default n + help + This option forces use of the exact RCU_FANOUT value specified, + regardless of imbalances in the hierarchy. This is useful for + testing RCU itself, and might one day be useful on systems with + strong NUMA behavior. + + Without RCU_FANOUT_EXACT, the code will balance the hierarchy. + + Say N if unsure. + config TREE_RCU_TRACE def_bool RCU_TRACE && TREE_RCU select DEBUG_FS diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index 463f29743ea..bf987b95b35 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -52,78 +52,3 @@ config PREEMPT endchoice -choice - prompt "RCU Implementation" - default CLASSIC_RCU - -config CLASSIC_RCU - bool "Classic RCU" - help - This option selects the classic RCU implementation that is - designed for best read-side performance on non-realtime - systems. - - Select this option if you are unsure. - -config TREE_RCU - bool "Tree-based hierarchical RCU" - help - This option selects the RCU implementation that is - designed for very large SMP system with hundreds or - thousands of CPUs. - -config PREEMPT_RCU - bool "Preemptible RCU" - depends on PREEMPT - help - This option reduces the latency of the kernel by making certain - RCU sections preemptible. Normally RCU code is non-preemptible, if - this option is selected then read-only RCU sections become - preemptible. This helps latency, but may expose bugs due to - now-naive assumptions about each RCU read-side critical section - remaining on a given CPU through its execution. - -endchoice - -config RCU_TRACE - bool "Enable tracing for RCU" - depends on TREE_RCU || PREEMPT_RCU - help - This option provides tracing in RCU which presents stats - in debugfs for debugging RCU implementation. - - Say Y here if you want to enable RCU tracing - Say N if you are unsure. - -config RCU_FANOUT - int "Tree-based hierarchical RCU fanout value" - range 2 64 if 64BIT - range 2 32 if !64BIT - depends on TREE_RCU - default 64 if 64BIT - default 32 if !64BIT - help - This option controls the fanout of hierarchical implementations - of RCU, allowing RCU to work efficiently on machines with - large numbers of CPUs. This value must be at least the cube - root of NR_CPUS, which allows NR_CPUS up to 32,768 for 32-bit - systems and up to 262,144 for 64-bit systems. - - Select a specific number if testing RCU itself. - Take the default if unsure. - -config RCU_FANOUT_EXACT - bool "Disable tree-based hierarchical RCU auto-balancing" - depends on TREE_RCU - default n - help - This option forces use of the exact RCU_FANOUT value specified, - regardless of imbalances in the hierarchy. This is useful for - testing RCU itself, and might one day be useful on systems with - strong NUMA behavior. - - Without RCU_FANOUT_EXACT, the code will balance the hierarchy. - - Say n if unsure. - - -- cgit v1.2.3-70-g09d2 From 9212ddb5eada64fec5a08b28207401f3cc3d0876 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 25 Dec 2008 11:21:20 +0100 Subject: stacktrace: provide save_stack_trace_tsk() weak alias Impact: build fix Some architectures have not implemented save_stack_trace_tsk() yet: fs/built-in.o: In function `proc_pid_stack': base.c:(.text+0x3f140): undefined reference to `save_stack_trace_tsk' So warn about that if the facility is used. Signed-off-by: Ingo Molnar --- kernel/stacktrace.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index 94b527ef1d1..eb212f8f8bc 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c @@ -6,6 +6,7 @@ * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar */ #include +#include #include #include #include @@ -24,3 +25,13 @@ void print_stack_trace(struct stack_trace *trace, int spaces) } EXPORT_SYMBOL_GPL(print_stack_trace); +/* + * Architectures that do not implement save_stack_trace_tsk get this + * weak alias and a once-per-bootup warning (whenever this facility + * is utilized - for example by procfs): + */ +__weak void +save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) +{ + WARN_ONCE(1, KERN_INFO "save_stack_trace_tsk() not implemented yet.\n"); +} -- cgit v1.2.3-70-g09d2 From 468a15bb4cc61694495cc5ed7ffca29e87c79b69 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 16 Dec 2008 08:07:03 +0100 Subject: sched, trace: update trace_sched_wakeup() Impact: extend the wakeup tracepoint with the info whether the wakeup was real Add the information needed to distinguish 'real' wakeups from 'false' wakeups. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/trace/sched.h | 4 ++-- kernel/sched.c | 2 +- kernel/trace/trace_sched_switch.c | 2 +- kernel/trace/trace_sched_wakeup.c | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/trace/sched.h b/include/trace/sched.h index bc4c9eadc6b..0d81098ee9f 100644 --- a/include/trace/sched.h +++ b/include/trace/sched.h @@ -17,8 +17,8 @@ DECLARE_TRACE(sched_wait_task, TPARGS(rq, p)); DECLARE_TRACE(sched_wakeup, - TPPROTO(struct rq *rq, struct task_struct *p), - TPARGS(rq, p)); + TPPROTO(struct rq *rq, struct task_struct *p, int success), + TPARGS(rq, p, success)); DECLARE_TRACE(sched_wakeup_new, TPPROTO(struct rq *rq, struct task_struct *p, int success), diff --git a/kernel/sched.c b/kernel/sched.c index ceda5799466..dcb39bc88f6 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2324,7 +2324,7 @@ out_activate: success = 1; out_running: - trace_sched_wakeup(rq, p); + trace_sched_wakeup(rq, p, success); check_preempt_curr(rq, p, sync); p->state = TASK_RUNNING; diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index add2c1fdae9..df175cb4564 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -49,7 +49,7 @@ probe_sched_switch(struct rq *__rq, struct task_struct *prev, } static void -probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee) +probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success) { struct trace_array_cpu *data; unsigned long flags; diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 0067b49746c..43586b689e3 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -211,7 +211,7 @@ static void wakeup_reset(struct trace_array *tr) } static void -probe_wakeup(struct rq *rq, struct task_struct *p) +probe_wakeup(struct rq *rq, struct task_struct *p, int success) { int cpu = smp_processor_id(); unsigned long flags; -- cgit v1.2.3-70-g09d2 From be4d638c1597580ed2294d899d9f1a2cd10e462c Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 26 Dec 2008 22:23:43 +1030 Subject: cpumask: Replace cpu_coregroup_map with cpu_coregroup_mask cpu_coregroup_map returned a cpumask_t: it's going away. (Note, the sched part of this patch won't apply meaningfully to the sched tree, but I'm posting it to show the goal). Signed-off-by: Rusty Russell Signed-off-by: Mike Travis Cc: Jens Axboe Cc: Ingo Molnar --- block/blk.h | 4 ++-- kernel/sched.c | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/block/blk.h b/block/blk.h index d2e49af90db..6e1ed40534e 100644 --- a/block/blk.h +++ b/block/blk.h @@ -99,8 +99,8 @@ static inline int queue_congestion_off_threshold(struct request_queue *q) static inline int blk_cpu_to_group(int cpu) { #ifdef CONFIG_SCHED_MC - cpumask_t mask = cpu_coregroup_map(cpu); - return first_cpu(mask); + const struct cpumask *mask = cpu_coregroup_mask(cpu); + return cpumask_first(mask); #elif defined(CONFIG_SCHED_SMT) return first_cpu(per_cpu(cpu_sibling_map, cpu)); #else diff --git a/kernel/sched.c b/kernel/sched.c index d2d16d1273b..42929239830 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7119,7 +7119,7 @@ cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, { int group; #ifdef CONFIG_SCHED_MC - *mask = cpu_coregroup_map(cpu); + *mask = *cpu_coregroup_mask(cpu); cpus_and(*mask, *mask, *cpu_map); group = first_cpu(*mask); #elif defined(CONFIG_SCHED_SMT) @@ -7485,7 +7485,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, sd = &per_cpu(core_domains, i); SD_INIT(sd, MC); set_domain_attribute(sd, attr); - sd->span = cpu_coregroup_map(i); + sd->span = *cpu_coregroup_mask(i); cpus_and(sd->span, sd->span, *cpu_map); sd->parent = p; p->child = sd; @@ -7528,7 +7528,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, SCHED_CPUMASK_VAR(this_core_map, allmasks); SCHED_CPUMASK_VAR(send_covered, allmasks); - *this_core_map = cpu_coregroup_map(i); + *this_core_map = *cpu_coregroup_mask(i); cpus_and(*this_core_map, *this_core_map, *cpu_map); if (i != first_cpu(*this_core_map)) continue; -- cgit v1.2.3-70-g09d2 From 7c0990c7ee988aa193abbb7da3faeb9279146dbf Mon Sep 17 00:00:00 2001 From: Nikanth Karthikesan Date: Wed, 19 Nov 2008 10:20:23 +0100 Subject: Do not free io context when taking recursive faults in do_exit When taking recursive faults in do_exit, if the io_context is not null, exit_io_context() is being called. But it might decrement the refcount more than once. It is better to leave this task alone. Signed-off-by: Nikanth Karthikesan Signed-off-by: Jens Axboe --- kernel/exit.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index c7422ca9203..9a213474f54 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1037,8 +1037,6 @@ NORET_TYPE void do_exit(long code) * task into the wait for ever nirwana as well. */ tsk->flags |= PF_EXITPIDONE; - if (tsk->io_context) - exit_io_context(); set_current_state(TASK_UNINTERRUPTIBLE); schedule(); } -- cgit v1.2.3-70-g09d2 From abf137dd7712132ee56d5b3143c2ff61a72a5faa Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 9 Dec 2008 08:11:22 +0100 Subject: aio: make the lookup_ioctx() lockless The mm->ioctx_list is currently protected by a reader-writer lock, so we always grab that lock on the read side for doing ioctx lookups. As the workload is extremely reader biased, turn this into an rcu hlist so we can make lookup_ioctx() lockless. Get rid of the rwlock and use a spinlock for providing update side exclusion. There's usually only 1 entry on this list, so it doesn't make sense to look into fancier data structures. Reviewed-by: Jeff Moyer Signed-off-by: Jens Axboe --- arch/s390/mm/pgtable.c | 4 +- fs/aio.c | 100 ++++++++++++++++++++++++++--------------------- include/linux/aio.h | 5 ++- include/linux/mm_types.h | 5 ++- kernel/fork.c | 4 +- 5 files changed, 67 insertions(+), 51 deletions(-) (limited to 'kernel') diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index ef3635b52fc..0767827540b 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -263,7 +263,7 @@ int s390_enable_sie(void) /* lets check if we are allowed to replace the mm */ task_lock(tsk); if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 || - tsk->mm != tsk->active_mm || tsk->mm->ioctx_list) { + tsk->mm != tsk->active_mm || !hlist_empty(&tsk->mm->ioctx_list)) { task_unlock(tsk); return -EINVAL; } @@ -279,7 +279,7 @@ int s390_enable_sie(void) /* Now lets check again if something happened */ task_lock(tsk); if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 || - tsk->mm != tsk->active_mm || tsk->mm->ioctx_list) { + tsk->mm != tsk->active_mm || !hlist_empty(&tsk->mm->ioctx_list)) { mmput(mm); task_unlock(tsk); return -EINVAL; diff --git a/fs/aio.c b/fs/aio.c index f658441d566..d6f89d3c15e 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -191,6 +191,20 @@ static int aio_setup_ring(struct kioctx *ctx) kunmap_atomic((void *)((unsigned long)__event & PAGE_MASK), km); \ } while(0) +static void ctx_rcu_free(struct rcu_head *head) +{ + struct kioctx *ctx = container_of(head, struct kioctx, rcu_head); + unsigned nr_events = ctx->max_reqs; + + kmem_cache_free(kioctx_cachep, ctx); + + if (nr_events) { + spin_lock(&aio_nr_lock); + BUG_ON(aio_nr - nr_events > aio_nr); + aio_nr -= nr_events; + spin_unlock(&aio_nr_lock); + } +} /* __put_ioctx * Called when the last user of an aio context has gone away, @@ -198,8 +212,6 @@ static int aio_setup_ring(struct kioctx *ctx) */ static void __put_ioctx(struct kioctx *ctx) { - unsigned nr_events = ctx->max_reqs; - BUG_ON(ctx->reqs_active); cancel_delayed_work(&ctx->wq); @@ -208,14 +220,7 @@ static void __put_ioctx(struct kioctx *ctx) mmdrop(ctx->mm); ctx->mm = NULL; pr_debug("__put_ioctx: freeing %p\n", ctx); - kmem_cache_free(kioctx_cachep, ctx); - - if (nr_events) { - spin_lock(&aio_nr_lock); - BUG_ON(aio_nr - nr_events > aio_nr); - aio_nr -= nr_events; - spin_unlock(&aio_nr_lock); - } + call_rcu(&ctx->rcu_head, ctx_rcu_free); } #define get_ioctx(kioctx) do { \ @@ -235,6 +240,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) { struct mm_struct *mm; struct kioctx *ctx; + int did_sync = 0; /* Prevent overflows */ if ((nr_events > (0x10000000U / sizeof(struct io_event))) || @@ -267,21 +273,30 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) goto out_freectx; /* limit the number of system wide aios */ - spin_lock(&aio_nr_lock); - if (aio_nr + ctx->max_reqs > aio_max_nr || - aio_nr + ctx->max_reqs < aio_nr) - ctx->max_reqs = 0; - else - aio_nr += ctx->max_reqs; - spin_unlock(&aio_nr_lock); + do { + spin_lock_bh(&aio_nr_lock); + if (aio_nr + nr_events > aio_max_nr || + aio_nr + nr_events < aio_nr) + ctx->max_reqs = 0; + else + aio_nr += ctx->max_reqs; + spin_unlock_bh(&aio_nr_lock); + if (ctx->max_reqs || did_sync) + break; + + /* wait for rcu callbacks to have completed before giving up */ + synchronize_rcu(); + did_sync = 1; + ctx->max_reqs = nr_events; + } while (1); + if (ctx->max_reqs == 0) goto out_cleanup; /* now link into global list. */ - write_lock(&mm->ioctx_list_lock); - ctx->next = mm->ioctx_list; - mm->ioctx_list = ctx; - write_unlock(&mm->ioctx_list_lock); + spin_lock(&mm->ioctx_lock); + hlist_add_head_rcu(&ctx->list, &mm->ioctx_list); + spin_unlock(&mm->ioctx_lock); dprintk("aio: allocated ioctx %p[%ld]: mm=%p mask=0x%x\n", ctx, ctx->user_id, current->mm, ctx->ring_info.nr); @@ -375,11 +390,12 @@ ssize_t wait_on_sync_kiocb(struct kiocb *iocb) */ void exit_aio(struct mm_struct *mm) { - struct kioctx *ctx = mm->ioctx_list; - mm->ioctx_list = NULL; - while (ctx) { - struct kioctx *next = ctx->next; - ctx->next = NULL; + struct kioctx *ctx; + + while (!hlist_empty(&mm->ioctx_list)) { + ctx = hlist_entry(mm->ioctx_list.first, struct kioctx, list); + hlist_del_rcu(&ctx->list); + aio_cancel_all(ctx); wait_for_all_aios(ctx); @@ -394,7 +410,6 @@ void exit_aio(struct mm_struct *mm) atomic_read(&ctx->users), ctx->dead, ctx->reqs_active); put_ioctx(ctx); - ctx = next; } } @@ -555,19 +570,21 @@ int aio_put_req(struct kiocb *req) static struct kioctx *lookup_ioctx(unsigned long ctx_id) { - struct kioctx *ioctx; - struct mm_struct *mm; + struct mm_struct *mm = current->mm; + struct kioctx *ctx = NULL; + struct hlist_node *n; - mm = current->mm; - read_lock(&mm->ioctx_list_lock); - for (ioctx = mm->ioctx_list; ioctx; ioctx = ioctx->next) - if (likely(ioctx->user_id == ctx_id && !ioctx->dead)) { - get_ioctx(ioctx); + rcu_read_lock(); + + hlist_for_each_entry_rcu(ctx, n, &mm->ioctx_list, list) { + if (ctx->user_id == ctx_id && !ctx->dead) { + get_ioctx(ctx); break; } - read_unlock(&mm->ioctx_list_lock); + } - return ioctx; + rcu_read_unlock(); + return ctx; } /* @@ -1215,19 +1232,14 @@ out: static void io_destroy(struct kioctx *ioctx) { struct mm_struct *mm = current->mm; - struct kioctx **tmp; int was_dead; /* delete the entry from the list is someone else hasn't already */ - write_lock(&mm->ioctx_list_lock); + spin_lock(&mm->ioctx_lock); was_dead = ioctx->dead; ioctx->dead = 1; - for (tmp = &mm->ioctx_list; *tmp && *tmp != ioctx; - tmp = &(*tmp)->next) - ; - if (*tmp) - *tmp = ioctx->next; - write_unlock(&mm->ioctx_list_lock); + hlist_del_rcu(&ioctx->list); + spin_unlock(&mm->ioctx_lock); dprintk("aio_release(%p)\n", ioctx); if (likely(!was_dead)) diff --git a/include/linux/aio.h b/include/linux/aio.h index f6b8cf99b59..b16a957030f 100644 --- a/include/linux/aio.h +++ b/include/linux/aio.h @@ -5,6 +5,7 @@ #include #include #include +#include #include @@ -183,7 +184,7 @@ struct kioctx { /* This needs improving */ unsigned long user_id; - struct kioctx *next; + struct hlist_node list; wait_queue_head_t wait; @@ -199,6 +200,8 @@ struct kioctx { struct aio_ring_info ring_info; struct delayed_work wq; + + struct rcu_head rcu_head; }; /* prototypes */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index fe825471d5a..9cfc9b627fd 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -232,8 +232,9 @@ struct mm_struct { struct core_state *core_state; /* coredumping support */ /* aio bits */ - rwlock_t ioctx_list_lock; /* aio lock */ - struct kioctx *ioctx_list; + spinlock_t ioctx_lock; + struct hlist_head ioctx_list; + #ifdef CONFIG_MM_OWNER /* * "owner" points to a task that is regarded as the canonical diff --git a/kernel/fork.c b/kernel/fork.c index 6144b36cd89..43cbf30669e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -415,8 +415,8 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) set_mm_counter(mm, file_rss, 0); set_mm_counter(mm, anon_rss, 0); spin_lock_init(&mm->page_table_lock); - rwlock_init(&mm->ioctx_list_lock); - mm->ioctx_list = NULL; + spin_lock_init(&mm->ioctx_lock); + INIT_HLIST_HEAD(&mm->ioctx_list); mm->free_area_cache = TASK_UNMAPPED_BASE; mm->cached_hole_size = ~0UL; mm_init_owner(mm, p); -- cgit v1.2.3-70-g09d2 From b3199c025d1646e25e7d1d640dd605db251dccf8 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 30 Dec 2008 09:05:14 +1030 Subject: cpumask: switch over to cpu_online/possible/active/present_mask: core Impact: cleanup This implements the obsolescent cpu_online_map in terms of cpu_online_mask, rather than the other way around. Same for the other maps. The documentation comments are also updated to refer to _mask rather than _map. Signed-off-by: Rusty Russell Signed-off-by: Mike Travis --- include/linux/cpumask.h | 75 +++++++++++++++++++------------------------------ kernel/cpu.c | 49 +++++++++++++++----------------- 2 files changed, 52 insertions(+), 72 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index b5ad19a6f43..db2341beca4 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -416,65 +416,54 @@ int __next_cpu_nr(int n, const cpumask_t *srcp); /* * The following particular system cpumasks and operations manage - * possible, present, active and online cpus. Each of them is a fixed size - * bitmap of size NR_CPUS. + * possible, present, active and online cpus. * - * #ifdef CONFIG_HOTPLUG_CPU - * cpu_possible_map - has bit 'cpu' set iff cpu is populatable - * cpu_present_map - has bit 'cpu' set iff cpu is populated - * cpu_online_map - has bit 'cpu' set iff cpu available to scheduler - * cpu_active_map - has bit 'cpu' set iff cpu available to migration - * #else - * cpu_possible_map - has bit 'cpu' set iff cpu is populated - * cpu_present_map - copy of cpu_possible_map - * cpu_online_map - has bit 'cpu' set iff cpu available to scheduler - * #endif + * cpu_possible_mask- has bit 'cpu' set iff cpu is populatable + * cpu_present_mask - has bit 'cpu' set iff cpu is populated + * cpu_online_mask - has bit 'cpu' set iff cpu available to scheduler + * cpu_active_mask - has bit 'cpu' set iff cpu available to migration * - * In either case, NR_CPUS is fixed at compile time, as the static - * size of these bitmaps. The cpu_possible_map is fixed at boot - * time, as the set of CPU id's that it is possible might ever - * be plugged in at anytime during the life of that system boot. - * The cpu_present_map is dynamic(*), representing which CPUs - * are currently plugged in. And cpu_online_map is the dynamic - * subset of cpu_present_map, indicating those CPUs available - * for scheduling. + * If !CONFIG_HOTPLUG_CPU, present == possible, and active == online. * - * If HOTPLUG is enabled, then cpu_possible_map is forced to have + * The cpu_possible_mask is fixed at boot time, as the set of CPU id's + * that it is possible might ever be plugged in at anytime during the + * life of that system boot. The cpu_present_mask is dynamic(*), + * representing which CPUs are currently plugged in. And + * cpu_online_mask is the dynamic subset of cpu_present_mask, + * indicating those CPUs available for scheduling. + * + * If HOTPLUG is enabled, then cpu_possible_mask is forced to have * all NR_CPUS bits set, otherwise it is just the set of CPUs that * ACPI reports present at boot. * - * If HOTPLUG is enabled, then cpu_present_map varies dynamically, + * If HOTPLUG is enabled, then cpu_present_mask varies dynamically, * depending on what ACPI reports as currently plugged in, otherwise - * cpu_present_map is just a copy of cpu_possible_map. + * cpu_present_mask is just a copy of cpu_possible_mask. * - * (*) Well, cpu_present_map is dynamic in the hotplug case. If not - * hotplug, it's a copy of cpu_possible_map, hence fixed at boot. + * (*) Well, cpu_present_mask is dynamic in the hotplug case. If not + * hotplug, it's a copy of cpu_possible_mask, hence fixed at boot. * * Subtleties: * 1) UP arch's (NR_CPUS == 1, CONFIG_SMP not defined) hardcode * assumption that their single CPU is online. The UP - * cpu_{online,possible,present}_maps are placebos. Changing them + * cpu_{online,possible,present}_masks are placebos. Changing them * will have no useful affect on the following num_*_cpus() * and cpu_*() macros in the UP case. This ugliness is a UP * optimization - don't waste any instructions or memory references * asking if you're online or how many CPUs there are if there is * only one CPU. - * 2) Most SMP arch's #define some of these maps to be some - * other map specific to that arch. Therefore, the following - * must be #define macros, not inlines. To see why, examine - * the assembly code produced by the following. Note that - * set1() writes phys_x_map, but set2() writes x_map: - * int x_map, phys_x_map; - * #define set1(a) x_map = a - * inline void set2(int a) { x_map = a; } - * #define x_map phys_x_map - * main(){ set1(3); set2(5); } */ -extern cpumask_t cpu_possible_map; -extern cpumask_t cpu_online_map; -extern cpumask_t cpu_present_map; -extern cpumask_t cpu_active_map; +extern const struct cpumask *const cpu_possible_mask; +extern const struct cpumask *const cpu_online_mask; +extern const struct cpumask *const cpu_present_mask; +extern const struct cpumask *const cpu_active_mask; + +/* These strip const, as traditionally they weren't const. */ +#define cpu_possible_map (*(cpumask_t *)cpu_possible_mask) +#define cpu_online_map (*(cpumask_t *)cpu_online_mask) +#define cpu_present_map (*(cpumask_t *)cpu_present_mask) +#define cpu_active_map (*(cpumask_t *)cpu_active_mask) #if NR_CPUS > 1 #define num_online_cpus() cpus_weight_nr(cpu_online_map) @@ -1058,12 +1047,6 @@ static inline void free_bootmem_cpumask_var(cpumask_var_t mask) } #endif /* CONFIG_CPUMASK_OFFSTACK */ -/* The pointer versions of the maps, these will become the primary versions. */ -#define cpu_possible_mask ((const struct cpumask *)&cpu_possible_map) -#define cpu_online_mask ((const struct cpumask *)&cpu_online_map) -#define cpu_present_mask ((const struct cpumask *)&cpu_present_map) -#define cpu_active_mask ((const struct cpumask *)&cpu_active_map) - /* It's common to want to use cpu_all_mask in struct member initializers, * so it has to refer to an address rather than a pointer. */ extern const DECLARE_BITMAP(cpu_all_bits, NR_CPUS); diff --git a/kernel/cpu.c b/kernel/cpu.c index bae131a1211..3ddc509b19c 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -15,30 +15,8 @@ #include #include -/* - * Represents all cpu's present in the system - * In systems capable of hotplug, this map could dynamically grow - * as new cpu's are detected in the system via any platform specific - * method, such as ACPI for e.g. - */ -cpumask_t cpu_present_map __read_mostly; -EXPORT_SYMBOL(cpu_present_map); - -/* - * Represents all cpu's that are currently online. - */ -cpumask_t cpu_online_map __read_mostly; -EXPORT_SYMBOL(cpu_online_map); - -#ifdef CONFIG_INIT_ALL_POSSIBLE -cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL; -#else -cpumask_t cpu_possible_map __read_mostly; -#endif -EXPORT_SYMBOL(cpu_possible_map); - #ifdef CONFIG_SMP -/* Serializes the updates to cpu_online_map, cpu_present_map */ +/* Serializes the updates to cpu_online_mask, cpu_present_mask */ static DEFINE_MUTEX(cpu_add_remove_lock); static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain); @@ -65,8 +43,6 @@ void __init cpu_hotplug_init(void) cpu_hotplug.refcount = 0; } -cpumask_t cpu_active_map; - #ifdef CONFIG_HOTPLUG_CPU void get_online_cpus(void) @@ -97,7 +73,7 @@ EXPORT_SYMBOL_GPL(put_online_cpus); /* * The following two API's must be used when attempting - * to serialize the updates to cpu_online_map, cpu_present_map. + * to serialize the updates to cpu_online_mask, cpu_present_mask. */ void cpu_maps_update_begin(void) { @@ -503,3 +479,24 @@ EXPORT_SYMBOL_GPL(cpu_bit_bitmap); const DECLARE_BITMAP(cpu_all_bits, NR_CPUS) = CPU_BITS_ALL; EXPORT_SYMBOL(cpu_all_bits); + +#ifdef CONFIG_INIT_ALL_POSSIBLE +static DECLARE_BITMAP(cpu_possible_bits, CONFIG_NR_CPUS) __read_mostly + = CPU_BITS_ALL; +#else +static DECLARE_BITMAP(cpu_possible_bits, CONFIG_NR_CPUS) __read_mostly; +#endif +const struct cpumask *const cpu_possible_mask = to_cpumask(cpu_possible_bits); +EXPORT_SYMBOL(cpu_possible_mask); + +static DECLARE_BITMAP(cpu_online_bits, CONFIG_NR_CPUS) __read_mostly; +const struct cpumask *const cpu_online_mask = to_cpumask(cpu_online_bits); +EXPORT_SYMBOL(cpu_online_mask); + +static DECLARE_BITMAP(cpu_present_bits, CONFIG_NR_CPUS) __read_mostly; +const struct cpumask *const cpu_present_mask = to_cpumask(cpu_present_bits); +EXPORT_SYMBOL(cpu_present_mask); + +static DECLARE_BITMAP(cpu_active_bits, CONFIG_NR_CPUS) __read_mostly; +const struct cpumask *const cpu_active_mask = to_cpumask(cpu_active_bits); +EXPORT_SYMBOL(cpu_active_mask); -- cgit v1.2.3-70-g09d2 From 3fa41520696fec2815e2d88fbcccdda77ba4d693 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 30 Dec 2008 09:05:16 +1030 Subject: cpumask: make set_cpu_*/init_cpu_* out-of-line They're only for use in boot/cpu hotplug code anyway, and this avoids the use of deprecated cpu_*_map. Stephen Rothwell points out that gcc 4.2.4 (on powerpc at least) didn't like the cast away of const anyway: include/linux/cpumask.h: In function 'set_cpu_possible': include/linux/cpumask.h:1052: warning: passing argument 2 of 'cpumask_set_cpu' discards qualifiers from pointer target type So this kills two birds with one stone. Signed-off-by: Rusty Russell --- include/linux/cpumask.h | 53 +++++++------------------------------------------ kernel/cpu.c | 47 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+), 46 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index e62a67156c5..7c178a6baae 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -1057,50 +1057,11 @@ extern const DECLARE_BITMAP(cpu_all_bits, NR_CPUS); #define for_each_present_cpu(cpu) for_each_cpu((cpu), cpu_present_mask) /* Wrappers for arch boot code to manipulate normally-constant masks */ -static inline void set_cpu_possible(unsigned int cpu, bool possible) -{ - if (possible) - cpumask_set_cpu(cpu, &cpu_possible_map); - else - cpumask_clear_cpu(cpu, &cpu_possible_map); -} - -static inline void set_cpu_present(unsigned int cpu, bool present) -{ - if (present) - cpumask_set_cpu(cpu, &cpu_present_map); - else - cpumask_clear_cpu(cpu, &cpu_present_map); -} - -static inline void set_cpu_online(unsigned int cpu, bool online) -{ - if (online) - cpumask_set_cpu(cpu, &cpu_online_map); - else - cpumask_clear_cpu(cpu, &cpu_online_map); -} - -static inline void set_cpu_active(unsigned int cpu, bool active) -{ - if (active) - cpumask_set_cpu(cpu, &cpu_active_map); - else - cpumask_clear_cpu(cpu, &cpu_active_map); -} - -static inline void init_cpu_present(const struct cpumask *src) -{ - cpumask_copy(&cpu_present_map, src); -} - -static inline void init_cpu_possible(const struct cpumask *src) -{ - cpumask_copy(&cpu_possible_map, src); -} - -static inline void init_cpu_online(const struct cpumask *src) -{ - cpumask_copy(&cpu_online_map, src); -} +void set_cpu_possible(unsigned int cpu, bool possible); +void set_cpu_present(unsigned int cpu, bool present); +void set_cpu_online(unsigned int cpu, bool online); +void set_cpu_active(unsigned int cpu, bool active); +void init_cpu_present(const struct cpumask *src); +void init_cpu_possible(const struct cpumask *src); +void init_cpu_online(const struct cpumask *src); #endif /* __LINUX_CPUMASK_H */ diff --git a/kernel/cpu.c b/kernel/cpu.c index 3ddc509b19c..2c9f78f3a2f 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -500,3 +500,50 @@ EXPORT_SYMBOL(cpu_present_mask); static DECLARE_BITMAP(cpu_active_bits, CONFIG_NR_CPUS) __read_mostly; const struct cpumask *const cpu_active_mask = to_cpumask(cpu_active_bits); EXPORT_SYMBOL(cpu_active_mask); + +void set_cpu_possible(unsigned int cpu, bool possible) +{ + if (possible) + cpumask_set_cpu(cpu, to_cpumask(cpu_possible_bits)); + else + cpumask_clear_cpu(cpu, to_cpumask(cpu_possible_bits)); +} + +void set_cpu_present(unsigned int cpu, bool present) +{ + if (present) + cpumask_set_cpu(cpu, to_cpumask(cpu_present_bits)); + else + cpumask_clear_cpu(cpu, to_cpumask(cpu_present_bits)); +} + +void set_cpu_online(unsigned int cpu, bool online) +{ + if (online) + cpumask_set_cpu(cpu, to_cpumask(cpu_online_bits)); + else + cpumask_clear_cpu(cpu, to_cpumask(cpu_online_bits)); +} + +void set_cpu_active(unsigned int cpu, bool active) +{ + if (active) + cpumask_set_cpu(cpu, to_cpumask(cpu_active_bits)); + else + cpumask_clear_cpu(cpu, to_cpumask(cpu_active_bits)); +} + +void init_cpu_present(const struct cpumask *src) +{ + cpumask_copy(to_cpumask(cpu_present_bits), src); +} + +void init_cpu_possible(const struct cpumask *src) +{ + cpumask_copy(to_cpumask(cpu_possible_bits), src); +} + +void init_cpu_online(const struct cpumask *src) +{ + cpumask_copy(to_cpumask(cpu_online_bits), src); +} -- cgit v1.2.3-70-g09d2 From 54b11e6d57a10aa9d0009efd93873e17bffd5d30 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 30 Dec 2008 09:05:16 +1030 Subject: cpumask: smp_call_function_many() Impact: Implementation change to remove cpumask_t from stack. Actually change smp_call_function_mask() to smp_call_function_many(). We avoid cpumasks on the stack in this version. (S390 has its own version, but that's going away apparently). We have to do some dancing to figure out if 0 or 1 other cpus are in the mask supplied and the online mask without allocating a tmp cpumask. It's still fairly cheap. We allocate the cpumask at the end of the call_function_data structure: if allocation fails we fallback to smp_call_function_single rather than using the baroque quiescing code (which needs a cpumask on stack). (Thanks to Hiroshi Shimamoto for spotting several bugs in previous versions!) Signed-off-by: Rusty Russell Signed-off-by: Mike Travis Cc: Hiroshi Shimamoto Cc: npiggin@suse.de Cc: axboe@kernel.dk --- include/linux/smp.h | 15 +++--- kernel/smp.c | 139 ++++++++++++++++++---------------------------------- 2 files changed, 57 insertions(+), 97 deletions(-) (limited to 'kernel') diff --git a/include/linux/smp.h b/include/linux/smp.h index 2f85f3b04bc..b8246696810 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -67,15 +67,16 @@ extern void smp_cpus_done(unsigned int max_cpus); * Call a function on all other processors */ int smp_call_function(void(*func)(void *info), void *info, int wait); -/* Deprecated: use smp_call_function_many() which uses a cpumask ptr. */ -int smp_call_function_mask(cpumask_t mask, void(*func)(void *info), void *info, - int wait); +void smp_call_function_many(const struct cpumask *mask, + void (*func)(void *info), void *info, bool wait); -static inline void smp_call_function_many(const struct cpumask *mask, - void (*func)(void *info), void *info, - int wait) +/* Deprecated: Use smp_call_function_many which takes a pointer to the mask. */ +static inline int +smp_call_function_mask(cpumask_t mask, void(*func)(void *info), void *info, + int wait) { - smp_call_function_mask(*mask, func, info, wait); + smp_call_function_many(&mask, func, info, wait); + return 0; } int smp_call_function_single(int cpuid, void (*func) (void *info), void *info, diff --git a/kernel/smp.c b/kernel/smp.c index 75c8dde58c5..9f0eafed139 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -24,8 +24,8 @@ struct call_function_data { struct call_single_data csd; spinlock_t lock; unsigned int refs; - cpumask_t cpumask; struct rcu_head rcu_head; + unsigned long cpumask_bits[]; }; struct call_single_queue { @@ -110,13 +110,13 @@ void generic_smp_call_function_interrupt(void) list_for_each_entry_rcu(data, &call_function_queue, csd.list) { int refs; - if (!cpu_isset(cpu, data->cpumask)) + if (!cpumask_test_cpu(cpu, to_cpumask(data->cpumask_bits))) continue; data->csd.func(data->csd.info); spin_lock(&data->lock); - cpu_clear(cpu, data->cpumask); + cpumask_clear_cpu(cpu, to_cpumask(data->cpumask_bits)); WARN_ON(data->refs == 0); data->refs--; refs = data->refs; @@ -266,51 +266,13 @@ void __smp_call_function_single(int cpu, struct call_single_data *data) generic_exec_single(cpu, data); } -/* Dummy function */ -static void quiesce_dummy(void *unused) -{ -} - -/* - * Ensure stack based data used in call function mask is safe to free. - * - * This is needed by smp_call_function_mask when using on-stack data, because - * a single call function queue is shared by all CPUs, and any CPU may pick up - * the data item on the queue at any time before it is deleted. So we need to - * ensure that all CPUs have transitioned through a quiescent state after - * this call. - * - * This is a very slow function, implemented by sending synchronous IPIs to - * all possible CPUs. For this reason, we have to alloc data rather than use - * stack based data even in the case of synchronous calls. The stack based - * data is then just used for deadlock/oom fallback which will be very rare. - * - * If a faster scheme can be made, we could go back to preferring stack based - * data -- the data allocation/free is non-zero cost. - */ -static void smp_call_function_mask_quiesce_stack(cpumask_t mask) -{ - struct call_single_data data; - int cpu; - - data.func = quiesce_dummy; - data.info = NULL; - - for_each_cpu_mask(cpu, mask) { - data.flags = CSD_FLAG_WAIT; - generic_exec_single(cpu, &data); - } -} - /** - * smp_call_function_mask(): Run a function on a set of other CPUs. - * @mask: The set of cpus to run on. + * smp_call_function_many(): Run a function on a set of other CPUs. + * @mask: The set of cpus to run on (only runs on online subset). * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. * @wait: If true, wait (atomically) until function has completed on other CPUs. * - * Returns 0 on success, else a negative status code. - * * If @wait is true, then returns once @func has returned. Note that @wait * will be implicitly turned on in case of allocation failures, since * we fall back to on-stack allocation. @@ -319,53 +281,57 @@ static void smp_call_function_mask_quiesce_stack(cpumask_t mask) * hardware interrupt handler or from a bottom half handler. Preemption * must be disabled when calling this function. */ -int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info, - int wait) +void smp_call_function_many(const struct cpumask *mask, + void (*func)(void *), void *info, + bool wait) { - struct call_function_data d; - struct call_function_data *data = NULL; - cpumask_t allbutself; + struct call_function_data *data; unsigned long flags; - int cpu, num_cpus; - int slowpath = 0; + int cpu, next_cpu; /* Can deadlock when called with interrupts disabled */ WARN_ON(irqs_disabled()); - cpu = smp_processor_id(); - allbutself = cpu_online_map; - cpu_clear(cpu, allbutself); - cpus_and(mask, mask, allbutself); - num_cpus = cpus_weight(mask); - - /* - * If zero CPUs, return. If just a single CPU, turn this request - * into a targetted single call instead since it's faster. - */ - if (!num_cpus) - return 0; - else if (num_cpus == 1) { - cpu = first_cpu(mask); - return smp_call_function_single(cpu, func, info, wait); + /* So, what's a CPU they want? Ignoring this one. */ + cpu = cpumask_first_and(mask, cpu_online_mask); + if (cpu == smp_processor_id()) + cpu = cpumask_next_and(cpu, mask, cpu_online_mask); + /* No online cpus? We're done. */ + if (cpu >= nr_cpu_ids) + return; + + /* Do we have another CPU which isn't us? */ + next_cpu = cpumask_next_and(cpu, mask, cpu_online_mask); + if (next_cpu == smp_processor_id()) + next_cpu = cpumask_next_and(next_cpu, mask, cpu_online_mask); + + /* Fastpath: do that cpu by itself. */ + if (next_cpu >= nr_cpu_ids) { + smp_call_function_single(cpu, func, info, wait); + return; } - data = kmalloc(sizeof(*data), GFP_ATOMIC); - if (data) { - data->csd.flags = CSD_FLAG_ALLOC; - if (wait) - data->csd.flags |= CSD_FLAG_WAIT; - } else { - data = &d; - data->csd.flags = CSD_FLAG_WAIT; - wait = 1; - slowpath = 1; + data = kmalloc(sizeof(*data) + cpumask_size(), GFP_ATOMIC); + if (unlikely(!data)) { + /* Slow path. */ + for_each_online_cpu(cpu) { + if (cpu == smp_processor_id()) + continue; + if (cpumask_test_cpu(cpu, mask)) + smp_call_function_single(cpu, func, info, wait); + } + return; } spin_lock_init(&data->lock); + data->csd.flags = CSD_FLAG_ALLOC; + if (wait) + data->csd.flags |= CSD_FLAG_WAIT; data->csd.func = func; data->csd.info = info; - data->refs = num_cpus; - data->cpumask = mask; + cpumask_and(to_cpumask(data->cpumask_bits), mask, cpu_online_mask); + cpumask_clear_cpu(smp_processor_id(), to_cpumask(data->cpumask_bits)); + data->refs = cpumask_weight(to_cpumask(data->cpumask_bits)); spin_lock_irqsave(&call_function_lock, flags); list_add_tail_rcu(&data->csd.list, &call_function_queue); @@ -377,18 +343,13 @@ int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info, smp_mb(); /* Send a message to all CPUs in the map */ - arch_send_call_function_ipi(mask); + arch_send_call_function_ipi(*to_cpumask(data->cpumask_bits)); /* optionally wait for the CPUs to complete */ - if (wait) { + if (wait) csd_flag_wait(&data->csd); - if (unlikely(slowpath)) - smp_call_function_mask_quiesce_stack(mask); - } - - return 0; } -EXPORT_SYMBOL(smp_call_function_mask); +EXPORT_SYMBOL(smp_call_function_many); /** * smp_call_function(): Run a function on all other CPUs. @@ -396,7 +357,7 @@ EXPORT_SYMBOL(smp_call_function_mask); * @info: An arbitrary pointer to pass to the function. * @wait: If true, wait (atomically) until function has completed on other CPUs. * - * Returns 0 on success, else a negative status code. + * Returns 0. * * If @wait is true, then returns once @func has returned; otherwise * it returns just before the target cpu calls @func. In case of allocation @@ -407,12 +368,10 @@ EXPORT_SYMBOL(smp_call_function_mask); */ int smp_call_function(void (*func)(void *), void *info, int wait) { - int ret; - preempt_disable(); - ret = smp_call_function_mask(cpu_online_map, func, info, wait); + smp_call_function_many(cpu_online_mask, func, info, wait); preempt_enable(); - return ret; + return 0; } EXPORT_SYMBOL(smp_call_function); -- cgit v1.2.3-70-g09d2 From ce47d974f71af26d00832e83a43ac79bec272d99 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 30 Dec 2008 09:05:17 +1030 Subject: cpumask: arch_send_call_function_ipi_mask: core Impact: new API to reduce stack usage We're weaning the core code off handing cpumask's around on-stack. This introduces arch_send_call_function_ipi_mask(). Signed-off-by: Rusty Russell --- kernel/smp.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index 9f0eafed139..172b1826890 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -266,6 +266,12 @@ void __smp_call_function_single(int cpu, struct call_single_data *data) generic_exec_single(cpu, data); } +/* FIXME: Shim for archs using old arch_send_call_function_ipi API. */ +#ifndef arch_send_call_function_ipi_mask +#define arch_send_call_function_ipi_mask(maskp) \ + arch_send_call_function_ipi(*(maskp)) +#endif + /** * smp_call_function_many(): Run a function on a set of other CPUs. * @mask: The set of cpus to run on (only runs on online subset). @@ -343,7 +349,7 @@ void smp_call_function_many(const struct cpumask *mask, smp_mb(); /* Send a message to all CPUs in the map */ - arch_send_call_function_ipi(*to_cpumask(data->cpumask_bits)); + arch_send_call_function_ipi_mask(to_cpumask(data->cpumask_bits)); /* optionally wait for the CPUs to complete */ if (wait) -- cgit v1.2.3-70-g09d2 From 1af237a099a3b8ff56aa384f605c6a68af7bf288 Mon Sep 17 00:00:00 2001 From: Huang Weiyi Date: Tue, 30 Dec 2008 06:41:44 +0800 Subject: tracing: removed duplicated #include Removed duplicated #include in kernel/trace/trace.c. Signed-off-by: Huang Weiyi Signed-off-by: Linus Torvalds --- kernel/trace/trace.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3608f6cb2f7..4185d522163 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -30,7 +30,6 @@ #include #include #include -#include #include #include -- cgit v1.2.3-70-g09d2 From 4f4b6c1a94a8735bbdc030a2911cf395495645b6 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 1 Jan 2009 10:12:15 +1030 Subject: cpumask: prepare for iterators to only go to nr_cpu_ids/nr_cpumask_bits.: core Impact: cleanup In future, all cpumask ops will only be valid (in general) for bit numbers < nr_cpu_ids. So use that instead of NR_CPUS in iterators and other comparisons. This is always safe: no cpu number can be >= nr_cpu_ids, and nr_cpu_ids is initialized to NR_CPUS at boot. Signed-off-by: Rusty Russell Signed-off-by: Mike Travis Acked-by: Ingo Molnar Acked-by: James Morris Cc: Eric Biederman --- kernel/kexec.c | 2 +- kernel/smp.c | 2 +- security/selinux/selinuxfs.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index ac0fde7b54d..3fb855ad6aa 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1116,7 +1116,7 @@ void crash_save_cpu(struct pt_regs *regs, int cpu) struct elf_prstatus prstatus; u32 *buf; - if ((cpu < 0) || (cpu >= NR_CPUS)) + if ((cpu < 0) || (cpu >= nr_cpu_ids)) return; /* Using ELF notes here is opportunistic. diff --git a/kernel/smp.c b/kernel/smp.c index 172b1826890..5cfa0e5e3e8 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -223,7 +223,7 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, local_irq_save(flags); func(info); local_irq_restore(flags); - } else if ((unsigned)cpu < NR_CPUS && cpu_online(cpu)) { + } else if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) { struct call_single_data *data = NULL; if (!wait) { diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c index c8630363823..e5520996a75 100644 --- a/security/selinux/selinuxfs.c +++ b/security/selinux/selinuxfs.c @@ -1211,7 +1211,7 @@ static struct avc_cache_stats *sel_avc_get_stat_idx(loff_t *idx) { int cpu; - for (cpu = *idx; cpu < NR_CPUS; ++cpu) { + for (cpu = *idx; cpu < nr_cpu_ids; ++cpu) { if (!cpu_possible(cpu)) continue; *idx = cpu + 1; -- cgit v1.2.3-70-g09d2 From 9e01c1b74c9531e301c900edaa92a99fcb7738f2 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 1 Jan 2009 10:12:22 +1030 Subject: cpumask: convert kernel trace functions Impact: Reduce future memory usage, use new cpumask API. (Eventually, cpumask_var_t will be allocated based on nr_cpu_ids, not NR_CPUS). Convert kernel trace functions to use struct cpumask API: 1) Use cpumask_copy/cpumask_test_cpu/for_each_cpu. 2) Use cpumask_var_t and alloc_cpumask_var/free_cpumask_var everywhere. 3) Use on_each_cpu instead of playing with current->cpus_allowed. Signed-off-by: Rusty Russell Signed-off-by: Mike Travis Acked-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 42 ++++++++++++++++++------------- kernel/trace/trace.c | 60 ++++++++++++++++++++++++++------------------ kernel/trace/trace_sysprof.c | 13 +++------- 3 files changed, 64 insertions(+), 51 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 1d601a7c458..a9d9760dc7b 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -195,7 +195,7 @@ void *ring_buffer_event_data(struct ring_buffer_event *event) EXPORT_SYMBOL_GPL(ring_buffer_event_data); #define for_each_buffer_cpu(buffer, cpu) \ - for_each_cpu_mask(cpu, buffer->cpumask) + for_each_cpu(cpu, buffer->cpumask) #define TS_SHIFT 27 #define TS_MASK ((1ULL << TS_SHIFT) - 1) @@ -267,7 +267,7 @@ struct ring_buffer { unsigned pages; unsigned flags; int cpus; - cpumask_t cpumask; + cpumask_var_t cpumask; atomic_t record_disabled; struct mutex mutex; @@ -458,6 +458,9 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags) if (!buffer) return NULL; + if (!alloc_cpumask_var(&buffer->cpumask, GFP_KERNEL)) + goto fail_free_buffer; + buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); buffer->flags = flags; @@ -465,14 +468,14 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags) if (buffer->pages == 1) buffer->pages++; - buffer->cpumask = cpu_possible_map; + cpumask_copy(buffer->cpumask, cpu_possible_mask); buffer->cpus = nr_cpu_ids; bsize = sizeof(void *) * nr_cpu_ids; buffer->buffers = kzalloc(ALIGN(bsize, cache_line_size()), GFP_KERNEL); if (!buffer->buffers) - goto fail_free_buffer; + goto fail_free_cpumask; for_each_buffer_cpu(buffer, cpu) { buffer->buffers[cpu] = @@ -492,6 +495,9 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags) } kfree(buffer->buffers); + fail_free_cpumask: + free_cpumask_var(buffer->cpumask); + fail_free_buffer: kfree(buffer); return NULL; @@ -510,6 +516,8 @@ ring_buffer_free(struct ring_buffer *buffer) for_each_buffer_cpu(buffer, cpu) rb_free_cpu_buffer(buffer->buffers[cpu]); + free_cpumask_var(buffer->cpumask); + kfree(buffer); } EXPORT_SYMBOL_GPL(ring_buffer_free); @@ -1283,7 +1291,7 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, cpu = raw_smp_processor_id(); - if (!cpu_isset(cpu, buffer->cpumask)) + if (!cpumask_test_cpu(cpu, buffer->cpumask)) goto out; cpu_buffer = buffer->buffers[cpu]; @@ -1396,7 +1404,7 @@ int ring_buffer_write(struct ring_buffer *buffer, cpu = raw_smp_processor_id(); - if (!cpu_isset(cpu, buffer->cpumask)) + if (!cpumask_test_cpu(cpu, buffer->cpumask)) goto out; cpu_buffer = buffer->buffers[cpu]; @@ -1478,7 +1486,7 @@ void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; - if (!cpu_isset(cpu, buffer->cpumask)) + if (!cpumask_test_cpu(cpu, buffer->cpumask)) return; cpu_buffer = buffer->buffers[cpu]; @@ -1498,7 +1506,7 @@ void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; - if (!cpu_isset(cpu, buffer->cpumask)) + if (!cpumask_test_cpu(cpu, buffer->cpumask)) return; cpu_buffer = buffer->buffers[cpu]; @@ -1515,7 +1523,7 @@ unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; - if (!cpu_isset(cpu, buffer->cpumask)) + if (!cpumask_test_cpu(cpu, buffer->cpumask)) return 0; cpu_buffer = buffer->buffers[cpu]; @@ -1532,7 +1540,7 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; - if (!cpu_isset(cpu, buffer->cpumask)) + if (!cpumask_test_cpu(cpu, buffer->cpumask)) return 0; cpu_buffer = buffer->buffers[cpu]; @@ -1850,7 +1858,7 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) struct buffer_page *reader; int nr_loops = 0; - if (!cpu_isset(cpu, buffer->cpumask)) + if (!cpumask_test_cpu(cpu, buffer->cpumask)) return NULL; cpu_buffer = buffer->buffers[cpu]; @@ -2025,7 +2033,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) struct ring_buffer_event *event; unsigned long flags; - if (!cpu_isset(cpu, buffer->cpumask)) + if (!cpumask_test_cpu(cpu, buffer->cpumask)) return NULL; spin_lock_irqsave(&cpu_buffer->reader_lock, flags); @@ -2062,7 +2070,7 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu) struct ring_buffer_iter *iter; unsigned long flags; - if (!cpu_isset(cpu, buffer->cpumask)) + if (!cpumask_test_cpu(cpu, buffer->cpumask)) return NULL; iter = kmalloc(sizeof(*iter), GFP_KERNEL); @@ -2172,7 +2180,7 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; unsigned long flags; - if (!cpu_isset(cpu, buffer->cpumask)) + if (!cpumask_test_cpu(cpu, buffer->cpumask)) return; spin_lock_irqsave(&cpu_buffer->reader_lock, flags); @@ -2228,7 +2236,7 @@ int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; - if (!cpu_isset(cpu, buffer->cpumask)) + if (!cpumask_test_cpu(cpu, buffer->cpumask)) return 1; cpu_buffer = buffer->buffers[cpu]; @@ -2252,8 +2260,8 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, struct ring_buffer_per_cpu *cpu_buffer_a; struct ring_buffer_per_cpu *cpu_buffer_b; - if (!cpu_isset(cpu, buffer_a->cpumask) || - !cpu_isset(cpu, buffer_b->cpumask)) + if (!cpumask_test_cpu(cpu, buffer_a->cpumask) || + !cpumask_test_cpu(cpu, buffer_b->cpumask)) return -EINVAL; /* At least make sure the two buffers are somewhat the same */ diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0e91f43b6ba..5d04e27f3b4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -89,10 +89,10 @@ static inline void ftrace_enable_cpu(void) preempt_enable(); } -static cpumask_t __read_mostly tracing_buffer_mask; +static cpumask_var_t __read_mostly tracing_buffer_mask; #define for_each_tracing_cpu(cpu) \ - for_each_cpu_mask(cpu, tracing_buffer_mask) + for_each_cpu(cpu, tracing_buffer_mask) /* * ftrace_dump_on_oops - variable to dump ftrace buffer on oops @@ -2646,13 +2646,7 @@ static struct file_operations show_traces_fops = { /* * Only trace on a CPU if the bitmask is set: */ -static cpumask_t tracing_cpumask = CPU_MASK_ALL; - -/* - * When tracing/tracing_cpu_mask is modified then this holds - * the new bitmask we are about to install: - */ -static cpumask_t tracing_cpumask_new; +static cpumask_var_t tracing_cpumask; /* * The tracer itself will not take this lock, but still we want @@ -2674,7 +2668,7 @@ tracing_cpumask_read(struct file *filp, char __user *ubuf, mutex_lock(&tracing_cpumask_update_lock); - len = cpumask_scnprintf(mask_str, count, &tracing_cpumask); + len = cpumask_scnprintf(mask_str, count, tracing_cpumask); if (count - len < 2) { count = -EINVAL; goto out_err; @@ -2693,9 +2687,13 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, size_t count, loff_t *ppos) { int err, cpu; + cpumask_var_t tracing_cpumask_new; + + if (!alloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL)) + return -ENOMEM; mutex_lock(&tracing_cpumask_update_lock); - err = cpumask_parse_user(ubuf, count, &tracing_cpumask_new); + err = cpumask_parse_user(ubuf, count, tracing_cpumask_new); if (err) goto err_unlock; @@ -2706,26 +2704,28 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, * Increase/decrease the disabled counter if we are * about to flip a bit in the cpumask: */ - if (cpu_isset(cpu, tracing_cpumask) && - !cpu_isset(cpu, tracing_cpumask_new)) { + if (cpumask_test_cpu(cpu, tracing_cpumask) && + !cpumask_test_cpu(cpu, tracing_cpumask_new)) { atomic_inc(&global_trace.data[cpu]->disabled); } - if (!cpu_isset(cpu, tracing_cpumask) && - cpu_isset(cpu, tracing_cpumask_new)) { + if (!cpumask_test_cpu(cpu, tracing_cpumask) && + cpumask_test_cpu(cpu, tracing_cpumask_new)) { atomic_dec(&global_trace.data[cpu]->disabled); } } __raw_spin_unlock(&ftrace_max_lock); local_irq_enable(); - tracing_cpumask = tracing_cpumask_new; + cpumask_copy(tracing_cpumask, tracing_cpumask_new); mutex_unlock(&tracing_cpumask_update_lock); + free_cpumask_var(tracing_cpumask_new); return count; err_unlock: mutex_unlock(&tracing_cpumask_update_lock); + free_cpumask_var(tracing_cpumask); return err; } @@ -3752,7 +3752,6 @@ void ftrace_dump(void) static DEFINE_SPINLOCK(ftrace_dump_lock); /* use static because iter can be a bit big for the stack */ static struct trace_iterator iter; - static cpumask_t mask; static int dump_ran; unsigned long flags; int cnt = 0, cpu; @@ -3786,8 +3785,6 @@ void ftrace_dump(void) * and then release the locks again. */ - cpus_clear(mask); - while (!trace_empty(&iter)) { if (!cnt) @@ -3823,19 +3820,28 @@ __init static int tracer_alloc_buffers(void) { struct trace_array_cpu *data; int i; + int ret = -ENOMEM; - /* TODO: make the number of buffers hot pluggable with CPUS */ - tracing_buffer_mask = cpu_possible_map; + if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL)) + goto out; + + if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL)) + goto out_free_buffer_mask; + cpumask_copy(tracing_buffer_mask, cpu_possible_mask); + cpumask_copy(tracing_cpumask, cpu_all_mask); + + /* TODO: make the number of buffers hot pluggable with CPUS */ global_trace.buffer = ring_buffer_alloc(trace_buf_size, TRACE_BUFFER_FLAGS); if (!global_trace.buffer) { printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); WARN_ON(1); - return 0; + goto out_free_cpumask; } global_trace.entries = ring_buffer_size(global_trace.buffer); + #ifdef CONFIG_TRACER_MAX_TRACE max_tr.buffer = ring_buffer_alloc(trace_buf_size, TRACE_BUFFER_FLAGS); @@ -3843,7 +3849,7 @@ __init static int tracer_alloc_buffers(void) printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); WARN_ON(1); ring_buffer_free(global_trace.buffer); - return 0; + goto out_free_cpumask; } max_tr.entries = ring_buffer_size(max_tr.buffer); WARN_ON(max_tr.entries != global_trace.entries); @@ -3873,8 +3879,14 @@ __init static int tracer_alloc_buffers(void) &trace_panic_notifier); register_die_notifier(&trace_die_notifier); + ret = 0; - return 0; +out_free_cpumask: + free_cpumask_var(tracing_cpumask); +out_free_buffer_mask: + free_cpumask_var(tracing_buffer_mask); +out: + return ret; } early_initcall(tracer_alloc_buffers); fs_initcall(tracer_init_debugfs); diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index a5779bd975d..eaca5ad803f 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -196,9 +196,9 @@ static enum hrtimer_restart stack_trace_timer_fn(struct hrtimer *hrtimer) return HRTIMER_RESTART; } -static void start_stack_timer(int cpu) +static void start_stack_timer(void *unused) { - struct hrtimer *hrtimer = &per_cpu(stack_trace_hrtimer, cpu); + struct hrtimer *hrtimer = &__get_cpu_var(stack_trace_hrtimer); hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hrtimer->function = stack_trace_timer_fn; @@ -208,14 +208,7 @@ static void start_stack_timer(int cpu) static void start_stack_timers(void) { - cpumask_t saved_mask = current->cpus_allowed; - int cpu; - - for_each_online_cpu(cpu) { - set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); - start_stack_timer(cpu); - } - set_cpus_allowed_ptr(current, &saved_mask); + on_each_cpu(start_stack_timer, NULL, 1); } static void stop_stack_timer(int cpu) -- cgit v1.2.3-70-g09d2 From 4462344ee9ea9224d026801b877887f2f39774a3 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 1 Jan 2009 10:12:23 +1030 Subject: cpumask: convert kernel trace functions further Impact: Reduce future memory usage, use new cpumask API. Since the last patch was created and acked, more old cpumask users slipped into kernel/trace. Mostly trivial conversions, except struct trace_iterator's "started" member becomes a cpumask_var_t. Signed-off-by: Rusty Russell --- kernel/trace/trace.c | 12 +++++++++--- kernel/trace/trace.h | 2 +- kernel/trace/trace_boot.c | 2 +- kernel/trace/trace_functions_graph.c | 2 +- kernel/trace/trace_hw_branches.c | 6 +++--- kernel/trace/trace_power.c | 2 +- 6 files changed, 16 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5d04e27f3b4..c580233add9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1811,10 +1811,10 @@ static void test_cpu_buff_start(struct trace_iterator *iter) if (!(iter->iter_flags & TRACE_FILE_ANNOTATE)) return; - if (cpu_isset(iter->cpu, iter->started)) + if (cpumask_test_cpu(iter->cpu, iter->started)) return; - cpu_set(iter->cpu, iter->started); + cpumask_set_cpu(iter->cpu, iter->started); trace_seq_printf(s, "##### CPU %u buffer started ####\n", iter->cpu); } @@ -3114,10 +3114,15 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) if (!iter) return -ENOMEM; + if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) { + kfree(iter); + return -ENOMEM; + } + mutex_lock(&trace_types_lock); /* trace pipe does not show start of buffer */ - cpus_setall(iter->started); + cpumask_setall(iter->started); iter->tr = &global_trace; iter->trace = current_trace; @@ -3134,6 +3139,7 @@ static int tracing_release_pipe(struct inode *inode, struct file *file) { struct trace_iterator *iter = file->private_data; + free_cpumask_var(iter->started); kfree(iter); atomic_dec(&tracing_reader); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index cc7a4f86403..4d3d381bfd9 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -368,7 +368,7 @@ struct trace_iterator { loff_t pos; long idx; - cpumask_t started; + cpumask_var_t started; }; int tracing_is_enabled(void); diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 3ccebde2848..366c8c333e1 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -42,7 +42,7 @@ static int boot_trace_init(struct trace_array *tr) int cpu; boot_trace = tr; - for_each_cpu_mask(cpu, cpu_possible_map) + for_each_cpu(cpu, cpu_possible_mask) tracing_reset(tr, cpu); tracing_sched_switch_assign_trace(tr); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 4bf39fcae97..930c08e5b38 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -79,7 +79,7 @@ print_graph_cpu(struct trace_seq *s, int cpu) int i; int ret; int log10_this = log10_cpu(cpu); - int log10_all = log10_cpu(cpus_weight_nr(cpu_online_map)); + int log10_all = log10_cpu(cpumask_weight(cpu_online_mask)); /* diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index b6a3e20a49a..649df22d435 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -46,7 +46,7 @@ static void bts_trace_start(struct trace_array *tr) tracing_reset_online_cpus(tr); - for_each_cpu_mask(cpu, cpu_possible_map) + for_each_cpu(cpu, cpu_possible_mask) smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1); } @@ -62,7 +62,7 @@ static void bts_trace_stop(struct trace_array *tr) { int cpu; - for_each_cpu_mask(cpu, cpu_possible_map) + for_each_cpu(cpu, cpu_possible_mask) smp_call_function_single(cpu, bts_trace_stop_cpu, NULL, 1); } @@ -172,7 +172,7 @@ static void trace_bts_prepare(struct trace_iterator *iter) { int cpu; - for_each_cpu_mask(cpu, cpu_possible_map) + for_each_cpu(cpu, cpu_possible_mask) smp_call_function_single(cpu, trace_bts_cpu, iter->tr, 1); } diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c index a7172a352f6..7bda248daf5 100644 --- a/kernel/trace/trace_power.c +++ b/kernel/trace/trace_power.c @@ -39,7 +39,7 @@ static int power_trace_init(struct trace_array *tr) trace_power_enabled = 1; - for_each_cpu_mask(cpu, cpu_possible_map) + for_each_cpu(cpu, cpu_possible_mask) tracing_reset(tr, cpu); return 0; } -- cgit v1.2.3-70-g09d2 From f1fc057c79cb2d27602fb3ad08a031f13459ef27 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 1 Jan 2009 10:12:23 +1030 Subject: cpumask: remove any_online_cpu() users: kernel/ Impact: Remove obsolete API usage any_online_cpu() is a good name, but it takes a cpumask_t, not a pointer. There are several places where any_online_cpu() doesn't really want a mask arg at all. Replace all callers with cpumask_any() and cpumask_any_and(). Signed-off-by: Rusty Russell Signed-off-by: Mike Travis --- kernel/softirq.c | 2 +- kernel/softlockup.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 466e75ce271..b7568d7def2 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -733,7 +733,7 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb, break; /* Unbind so it can run. Fall thru. */ kthread_bind(per_cpu(ksoftirqd, hotcpu), - any_online_cpu(cpu_online_map)); + cpumask_any(cpu_online_mask)); case CPU_DEAD: case CPU_DEAD_FROZEN: { struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 1ab790c67b1..492f0c72fec 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -303,7 +303,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) break; case CPU_ONLINE: case CPU_ONLINE_FROZEN: - check_cpu = any_online_cpu(cpu_online_map); + check_cpu = cpumask_any(cpu_online_mask); wake_up_process(per_cpu(watchdog_task, hotcpu)); break; #ifdef CONFIG_HOTPLUG_CPU @@ -313,7 +313,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) cpumask_t temp_cpu_online_map = cpu_online_map; cpu_clear(hotcpu, temp_cpu_online_map); - check_cpu = any_online_cpu(temp_cpu_online_map); + check_cpu = cpumask_any(&temp_cpu_online_map); } break; @@ -323,7 +323,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) break; /* Unbind so it can run. Fall thru. */ kthread_bind(per_cpu(watchdog_task, hotcpu), - any_online_cpu(cpu_online_map)); + cpumask_any(cpu_online_mask)); case CPU_DEAD: case CPU_DEAD_FROZEN: p = per_cpu(watchdog_task, hotcpu); -- cgit v1.2.3-70-g09d2 From a45185d2d7108b01b90b9e0293377be4d6346dde Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 1 Jan 2009 10:12:24 +1030 Subject: cpumask: convert kernel/compat.c Impact: Reduce stack usage, use new cpumask API. Straightforward conversion; cpumasks' size is given by cpumask_size() (now a variable rather than fixed) and on-stack cpu masks use cpumask_var_t. Signed-off-by: Rusty Russell --- kernel/compat.c | 49 ++++++++++++++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 8eafe3eb50d..d52e2ec1deb 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -454,16 +454,16 @@ asmlinkage long compat_sys_waitid(int which, compat_pid_t pid, } static int compat_get_user_cpu_mask(compat_ulong_t __user *user_mask_ptr, - unsigned len, cpumask_t *new_mask) + unsigned len, struct cpumask *new_mask) { unsigned long *k; - if (len < sizeof(cpumask_t)) - memset(new_mask, 0, sizeof(cpumask_t)); - else if (len > sizeof(cpumask_t)) - len = sizeof(cpumask_t); + if (len < cpumask_size()) + memset(new_mask, 0, cpumask_size()); + else if (len > cpumask_size()) + len = cpumask_size(); - k = cpus_addr(*new_mask); + k = cpumask_bits(new_mask); return compat_get_bitmap(k, user_mask_ptr, len * 8); } @@ -471,40 +471,51 @@ asmlinkage long compat_sys_sched_setaffinity(compat_pid_t pid, unsigned int len, compat_ulong_t __user *user_mask_ptr) { - cpumask_t new_mask; + cpumask_var_t new_mask; int retval; - retval = compat_get_user_cpu_mask(user_mask_ptr, len, &new_mask); + if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) + return -ENOMEM; + + retval = compat_get_user_cpu_mask(user_mask_ptr, len, new_mask); if (retval) - return retval; + goto out; - return sched_setaffinity(pid, &new_mask); + retval = sched_setaffinity(pid, new_mask); +out: + free_cpumask_var(new_mask); + return retval; } asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len, compat_ulong_t __user *user_mask_ptr) { int ret; - cpumask_t mask; + cpumask_var_t mask; unsigned long *k; - unsigned int min_length = sizeof(cpumask_t); + unsigned int min_length = cpumask_size(); - if (NR_CPUS <= BITS_PER_COMPAT_LONG) + if (nr_cpu_ids <= BITS_PER_COMPAT_LONG) min_length = sizeof(compat_ulong_t); if (len < min_length) return -EINVAL; - ret = sched_getaffinity(pid, &mask); + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + + ret = sched_getaffinity(pid, mask); if (ret < 0) - return ret; + goto out; - k = cpus_addr(mask); + k = cpumask_bits(mask); ret = compat_put_bitmap(user_mask_ptr, k, min_length * 8); - if (ret) - return ret; + if (ret == 0) + ret = min_length; - return min_length; +out: + free_cpumask_var(mask); + return ret; } int get_compat_itimerspec(struct itimerspec *dst, -- cgit v1.2.3-70-g09d2 From e7577c50f2fb2d1c167e2c04a4b4c2cc042acb82 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 1 Jan 2009 10:12:25 +1030 Subject: cpumask: convert kernel/workqueue.c Impact: Reduce memory usage, use new cpumask API. cpu_populated_map becomes a cpumask_var_t, and cpu_singlethread_map is simply a cpumask pointer: it's simply the cpumask containing the first possible CPU anyway. Signed-off-by: Rusty Russell --- kernel/workqueue.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 4952322cba4..2f445833ae3 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -73,7 +73,7 @@ static DEFINE_SPINLOCK(workqueue_lock); static LIST_HEAD(workqueues); static int singlethread_cpu __read_mostly; -static cpumask_t cpu_singlethread_map __read_mostly; +static const struct cpumask *cpu_singlethread_map __read_mostly; /* * _cpu_down() first removes CPU from cpu_online_map, then CPU_DEAD * flushes cwq->worklist. This means that flush_workqueue/wait_on_work @@ -81,7 +81,7 @@ static cpumask_t cpu_singlethread_map __read_mostly; * use cpu_possible_map, the cpumask below is more a documentation * than optimization. */ -static cpumask_t cpu_populated_map __read_mostly; +static cpumask_var_t cpu_populated_map __read_mostly; /* If it's single threaded, it isn't in the list of workqueues. */ static inline int is_wq_single_threaded(struct workqueue_struct *wq) @@ -89,10 +89,10 @@ static inline int is_wq_single_threaded(struct workqueue_struct *wq) return wq->singlethread; } -static const cpumask_t *wq_cpu_map(struct workqueue_struct *wq) +static const struct cpumask *wq_cpu_map(struct workqueue_struct *wq) { return is_wq_single_threaded(wq) - ? &cpu_singlethread_map : &cpu_populated_map; + ? cpu_singlethread_map : cpu_populated_map; } static @@ -410,7 +410,7 @@ static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) */ void flush_workqueue(struct workqueue_struct *wq) { - const cpumask_t *cpu_map = wq_cpu_map(wq); + const struct cpumask *cpu_map = wq_cpu_map(wq); int cpu; might_sleep(); @@ -532,7 +532,7 @@ static void wait_on_work(struct work_struct *work) { struct cpu_workqueue_struct *cwq; struct workqueue_struct *wq; - const cpumask_t *cpu_map; + const struct cpumask *cpu_map; int cpu; might_sleep(); @@ -903,7 +903,7 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq) */ void destroy_workqueue(struct workqueue_struct *wq) { - const cpumask_t *cpu_map = wq_cpu_map(wq); + const struct cpumask *cpu_map = wq_cpu_map(wq); int cpu; cpu_maps_update_begin(); @@ -933,7 +933,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, switch (action) { case CPU_UP_PREPARE: - cpu_set(cpu, cpu_populated_map); + cpumask_set_cpu(cpu, cpu_populated_map); } undo: list_for_each_entry(wq, &workqueues, list) { @@ -964,7 +964,7 @@ undo: switch (action) { case CPU_UP_CANCELED: case CPU_POST_DEAD: - cpu_clear(cpu, cpu_populated_map); + cpumask_clear_cpu(cpu, cpu_populated_map); } return ret; @@ -1017,9 +1017,11 @@ EXPORT_SYMBOL_GPL(work_on_cpu); void __init init_workqueues(void) { - cpu_populated_map = cpu_online_map; - singlethread_cpu = first_cpu(cpu_possible_map); - cpu_singlethread_map = cpumask_of_cpu(singlethread_cpu); + alloc_cpumask_var(&cpu_populated_map, GFP_KERNEL); + + cpumask_copy(cpu_populated_map, cpu_online_mask); + singlethread_cpu = cpumask_first(cpu_possible_mask); + cpu_singlethread_map = cpumask_of(singlethread_cpu); hotcpu_notifier(workqueue_cpu_callback, 0); keventd_wq = create_workqueue("events"); BUG_ON(!keventd_wq); -- cgit v1.2.3-70-g09d2 From 6b954823c24f04ed026a8517f6bab5abda279db8 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 1 Jan 2009 10:12:25 +1030 Subject: cpumask: convert kernel time functions Impact: Use new APIs Convert kernel/time functions to use struct cpumask *. Note the ugly bitmap declarations in tick-broadcast.c. These should be cpumask_var_t, but there was no obvious initialization function to put the alloc_cpumask_var() calls in. This was safe. (Eventually 'struct cpumask' will be undefined for CONFIG_CPUMASK_OFFSTACK, so we use a bitmap here to show we really mean it). Signed-off-by: Rusty Russell Signed-off-by: Mike Travis --- include/linux/tick.h | 4 +- kernel/time/clocksource.c | 6 +-- kernel/time/tick-broadcast.c | 113 ++++++++++++++++++++++--------------------- kernel/time/tick-common.c | 6 +-- 4 files changed, 66 insertions(+), 63 deletions(-) (limited to 'kernel') diff --git a/include/linux/tick.h b/include/linux/tick.h index b6ec8189ac0..469b82d88b3 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -84,10 +84,10 @@ static inline void tick_cancel_sched_timer(int cpu) { } # ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST extern struct tick_device *tick_get_broadcast_device(void); -extern cpumask_t *tick_get_broadcast_mask(void); +extern struct cpumask *tick_get_broadcast_mask(void); # ifdef CONFIG_TICK_ONESHOT -extern cpumask_t *tick_get_broadcast_oneshot_mask(void); +extern struct cpumask *tick_get_broadcast_oneshot_mask(void); # endif # endif /* BROADCAST */ diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 9ed2eec9752..32141b15d63 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -148,7 +148,7 @@ static void clocksource_watchdog(unsigned long data) int next_cpu = next_cpu_nr(raw_smp_processor_id(), cpu_online_map); if (next_cpu >= nr_cpu_ids) - next_cpu = first_cpu(cpu_online_map); + next_cpu = cpumask_first(cpu_online_mask); watchdog_timer.expires += WATCHDOG_INTERVAL; add_timer_on(&watchdog_timer, next_cpu); } @@ -173,7 +173,7 @@ static void clocksource_check_watchdog(struct clocksource *cs) watchdog_last = watchdog->read(); watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; add_timer_on(&watchdog_timer, - first_cpu(cpu_online_map)); + cpumask_first(cpu_online_mask)); } } else { if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) @@ -195,7 +195,7 @@ static void clocksource_check_watchdog(struct clocksource *cs) watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; add_timer_on(&watchdog_timer, - first_cpu(cpu_online_map)); + cpumask_first(cpu_online_mask)); } } } diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 9590af2327b..356fac57a18 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -28,7 +28,9 @@ */ struct tick_device tick_broadcast_device; -static cpumask_t tick_broadcast_mask; +/* FIXME: Use cpumask_var_t. */ +static DECLARE_BITMAP(tick_broadcast_mask, NR_CPUS); +static DECLARE_BITMAP(tmpmask, NR_CPUS); static DEFINE_SPINLOCK(tick_broadcast_lock); static int tick_broadcast_force; @@ -46,9 +48,9 @@ struct tick_device *tick_get_broadcast_device(void) return &tick_broadcast_device; } -cpumask_t *tick_get_broadcast_mask(void) +struct cpumask *tick_get_broadcast_mask(void) { - return &tick_broadcast_mask; + return to_cpumask(tick_broadcast_mask); } /* @@ -72,7 +74,7 @@ int tick_check_broadcast_device(struct clock_event_device *dev) clockevents_exchange_device(NULL, dev); tick_broadcast_device.evtdev = dev; - if (!cpus_empty(tick_broadcast_mask)) + if (!cpumask_empty(tick_get_broadcast_mask())) tick_broadcast_start_periodic(dev); return 1; } @@ -104,7 +106,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) */ if (!tick_device_is_functional(dev)) { dev->event_handler = tick_handle_periodic; - cpu_set(cpu, tick_broadcast_mask); + cpumask_set_cpu(cpu, tick_get_broadcast_mask()); tick_broadcast_start_periodic(tick_broadcast_device.evtdev); ret = 1; } else { @@ -116,7 +118,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) { int cpu = smp_processor_id(); - cpu_clear(cpu, tick_broadcast_mask); + cpumask_clear_cpu(cpu, tick_get_broadcast_mask()); tick_broadcast_clear_oneshot(cpu); } } @@ -125,9 +127,9 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) } /* - * Broadcast the event to the cpus, which are set in the mask + * Broadcast the event to the cpus, which are set in the mask (mangled). */ -static void tick_do_broadcast(cpumask_t mask) +static void tick_do_broadcast(struct cpumask *mask) { int cpu = smp_processor_id(); struct tick_device *td; @@ -135,22 +137,21 @@ static void tick_do_broadcast(cpumask_t mask) /* * Check, if the current cpu is in the mask */ - if (cpu_isset(cpu, mask)) { - cpu_clear(cpu, mask); + if (cpumask_test_cpu(cpu, mask)) { + cpumask_clear_cpu(cpu, mask); td = &per_cpu(tick_cpu_device, cpu); td->evtdev->event_handler(td->evtdev); } - if (!cpus_empty(mask)) { + if (!cpumask_empty(mask)) { /* * It might be necessary to actually check whether the devices * have different broadcast functions. For now, just use the * one of the first device. This works as long as we have this * misfeature only on x86 (lapic) */ - cpu = first_cpu(mask); - td = &per_cpu(tick_cpu_device, cpu); - td->evtdev->broadcast(&mask); + td = &per_cpu(tick_cpu_device, cpumask_first(mask)); + td->evtdev->broadcast(mask); } } @@ -160,12 +161,11 @@ static void tick_do_broadcast(cpumask_t mask) */ static void tick_do_periodic_broadcast(void) { - cpumask_t mask; - spin_lock(&tick_broadcast_lock); - cpus_and(mask, cpu_online_map, tick_broadcast_mask); - tick_do_broadcast(mask); + cpumask_and(to_cpumask(tmpmask), + cpu_online_mask, tick_get_broadcast_mask()); + tick_do_broadcast(to_cpumask(tmpmask)); spin_unlock(&tick_broadcast_lock); } @@ -228,13 +228,13 @@ static void tick_do_broadcast_on_off(void *why) if (!tick_device_is_functional(dev)) goto out; - bc_stopped = cpus_empty(tick_broadcast_mask); + bc_stopped = cpumask_empty(tick_get_broadcast_mask()); switch (*reason) { case CLOCK_EVT_NOTIFY_BROADCAST_ON: case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: - if (!cpu_isset(cpu, tick_broadcast_mask)) { - cpu_set(cpu, tick_broadcast_mask); + if (!cpumask_test_cpu(cpu, tick_get_broadcast_mask())) { + cpumask_set_cpu(cpu, tick_get_broadcast_mask()); if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) clockevents_shutdown(dev); @@ -244,8 +244,8 @@ static void tick_do_broadcast_on_off(void *why) break; case CLOCK_EVT_NOTIFY_BROADCAST_OFF: if (!tick_broadcast_force && - cpu_isset(cpu, tick_broadcast_mask)) { - cpu_clear(cpu, tick_broadcast_mask); + cpumask_test_cpu(cpu, tick_get_broadcast_mask())) { + cpumask_clear_cpu(cpu, tick_get_broadcast_mask()); if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) tick_setup_periodic(dev, 0); @@ -253,7 +253,7 @@ static void tick_do_broadcast_on_off(void *why) break; } - if (cpus_empty(tick_broadcast_mask)) { + if (cpumask_empty(tick_get_broadcast_mask())) { if (!bc_stopped) clockevents_shutdown(bc); } else if (bc_stopped) { @@ -272,7 +272,7 @@ out: */ void tick_broadcast_on_off(unsigned long reason, int *oncpu) { - if (!cpu_isset(*oncpu, cpu_online_map)) + if (!cpumask_test_cpu(*oncpu, cpu_online_mask)) printk(KERN_ERR "tick-broadcast: ignoring broadcast for " "offline CPU #%d\n", *oncpu); else @@ -303,10 +303,10 @@ void tick_shutdown_broadcast(unsigned int *cpup) spin_lock_irqsave(&tick_broadcast_lock, flags); bc = tick_broadcast_device.evtdev; - cpu_clear(cpu, tick_broadcast_mask); + cpumask_clear_cpu(cpu, tick_get_broadcast_mask()); if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { - if (bc && cpus_empty(tick_broadcast_mask)) + if (bc && cpumask_empty(tick_get_broadcast_mask())) clockevents_shutdown(bc); } @@ -342,10 +342,10 @@ int tick_resume_broadcast(void) switch (tick_broadcast_device.mode) { case TICKDEV_MODE_PERIODIC: - if(!cpus_empty(tick_broadcast_mask)) + if (!cpumask_empty(tick_get_broadcast_mask())) tick_broadcast_start_periodic(bc); - broadcast = cpu_isset(smp_processor_id(), - tick_broadcast_mask); + broadcast = cpumask_test_cpu(smp_processor_id(), + tick_get_broadcast_mask()); break; case TICKDEV_MODE_ONESHOT: broadcast = tick_resume_broadcast_oneshot(bc); @@ -360,14 +360,15 @@ int tick_resume_broadcast(void) #ifdef CONFIG_TICK_ONESHOT -static cpumask_t tick_broadcast_oneshot_mask; +/* FIXME: use cpumask_var_t. */ +static DECLARE_BITMAP(tick_broadcast_oneshot_mask, NR_CPUS); /* - * Debugging: see timer_list.c + * Exposed for debugging: see timer_list.c */ -cpumask_t *tick_get_broadcast_oneshot_mask(void) +struct cpumask *tick_get_broadcast_oneshot_mask(void) { - return &tick_broadcast_oneshot_mask; + return to_cpumask(tick_broadcast_oneshot_mask); } static int tick_broadcast_set_event(ktime_t expires, int force) @@ -389,7 +390,7 @@ int tick_resume_broadcast_oneshot(struct clock_event_device *bc) */ void tick_check_oneshot_broadcast(int cpu) { - if (cpu_isset(cpu, tick_broadcast_oneshot_mask)) { + if (cpumask_test_cpu(cpu, to_cpumask(tick_broadcast_oneshot_mask))) { struct tick_device *td = &per_cpu(tick_cpu_device, cpu); clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_ONESHOT); @@ -402,7 +403,6 @@ void tick_check_oneshot_broadcast(int cpu) static void tick_handle_oneshot_broadcast(struct clock_event_device *dev) { struct tick_device *td; - cpumask_t mask; ktime_t now, next_event; int cpu; @@ -410,13 +410,13 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev) again: dev->next_event.tv64 = KTIME_MAX; next_event.tv64 = KTIME_MAX; - mask = CPU_MASK_NONE; + cpumask_clear(to_cpumask(tmpmask)); now = ktime_get(); /* Find all expired events */ - for_each_cpu_mask_nr(cpu, tick_broadcast_oneshot_mask) { + for_each_cpu(cpu, tick_get_broadcast_oneshot_mask()) { td = &per_cpu(tick_cpu_device, cpu); if (td->evtdev->next_event.tv64 <= now.tv64) - cpu_set(cpu, mask); + cpumask_set_cpu(cpu, to_cpumask(tmpmask)); else if (td->evtdev->next_event.tv64 < next_event.tv64) next_event.tv64 = td->evtdev->next_event.tv64; } @@ -424,7 +424,7 @@ again: /* * Wakeup the cpus which have an expired event. */ - tick_do_broadcast(mask); + tick_do_broadcast(to_cpumask(tmpmask)); /* * Two reasons for reprogram: @@ -476,15 +476,16 @@ void tick_broadcast_oneshot_control(unsigned long reason) goto out; if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) { - if (!cpu_isset(cpu, tick_broadcast_oneshot_mask)) { - cpu_set(cpu, tick_broadcast_oneshot_mask); + if (!cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) { + cpumask_set_cpu(cpu, tick_get_broadcast_oneshot_mask()); clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); if (dev->next_event.tv64 < bc->next_event.tv64) tick_broadcast_set_event(dev->next_event, 1); } } else { - if (cpu_isset(cpu, tick_broadcast_oneshot_mask)) { - cpu_clear(cpu, tick_broadcast_oneshot_mask); + if (cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) { + cpumask_clear_cpu(cpu, + tick_get_broadcast_oneshot_mask()); clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); if (dev->next_event.tv64 != KTIME_MAX) tick_program_event(dev->next_event, 1); @@ -502,10 +503,11 @@ out: */ static void tick_broadcast_clear_oneshot(int cpu) { - cpu_clear(cpu, tick_broadcast_oneshot_mask); + cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask()); } -static void tick_broadcast_init_next_event(cpumask_t *mask, ktime_t expires) +static void tick_broadcast_init_next_event(struct cpumask *mask, + ktime_t expires) { struct tick_device *td; int cpu; @@ -526,7 +528,6 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) if (bc->event_handler != tick_handle_oneshot_broadcast) { int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC; int cpu = smp_processor_id(); - cpumask_t mask; bc->event_handler = tick_handle_oneshot_broadcast; clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); @@ -540,13 +541,15 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) * oneshot_mask bits for those and program the * broadcast device to fire. */ - mask = tick_broadcast_mask; - cpu_clear(cpu, mask); - cpus_or(tick_broadcast_oneshot_mask, - tick_broadcast_oneshot_mask, mask); - - if (was_periodic && !cpus_empty(mask)) { - tick_broadcast_init_next_event(&mask, tick_next_period); + cpumask_copy(to_cpumask(tmpmask), tick_get_broadcast_mask()); + cpumask_clear_cpu(cpu, to_cpumask(tmpmask)); + cpumask_or(tick_get_broadcast_oneshot_mask(), + tick_get_broadcast_oneshot_mask(), + to_cpumask(tmpmask)); + + if (was_periodic && !cpumask_empty(to_cpumask(tmpmask))) { + tick_broadcast_init_next_event(to_cpumask(tmpmask), + tick_next_period); tick_broadcast_set_event(tick_next_period, 1); } else bc->next_event.tv64 = KTIME_MAX; @@ -585,7 +588,7 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup) * Clear the broadcast mask flag for the dead cpu, but do not * stop the broadcast device! */ - cpu_clear(cpu, tick_broadcast_oneshot_mask); + cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask()); spin_unlock_irqrestore(&tick_broadcast_lock, flags); } diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index f8372be7412..63e05d423a0 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -254,7 +254,7 @@ static int tick_check_new_device(struct clock_event_device *newdev) curdev = NULL; } clockevents_exchange_device(curdev, newdev); - tick_setup_device(td, newdev, cpu, &cpumask_of_cpu(cpu)); + tick_setup_device(td, newdev, cpu, cpumask_of(cpu)); if (newdev->features & CLOCK_EVT_FEAT_ONESHOT) tick_oneshot_notify(); @@ -299,9 +299,9 @@ static void tick_shutdown(unsigned int *cpup) } /* Transfer the do_timer job away from this cpu */ if (*cpup == tick_do_timer_cpu) { - int cpu = first_cpu(cpu_online_map); + int cpu = cpumask_first(cpu_online_mask); - tick_do_timer_cpu = (cpu != NR_CPUS) ? cpu : + tick_do_timer_cpu = (cpu < nr_cpu_ids) ? cpu : TICK_DO_TIMER_NONE; } spin_unlock_irqrestore(&tick_device_lock, flags); -- cgit v1.2.3-70-g09d2 From d036e67b40f52bdd95392390108defbac7e53837 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 1 Jan 2009 10:12:26 +1030 Subject: cpumask: convert kernel/irq Impact: Reduce stack usage, use new cpumask API. ALPHA mod! Main change is that irq_default_affinity becomes a cpumask_var_t, so treat it as a pointer (this effects alpha). Signed-off-by: Rusty Russell --- arch/alpha/kernel/irq.c | 3 ++- include/linux/interrupt.h | 2 +- kernel/irq/manage.c | 11 +++++++++-- kernel/irq/proc.c | 32 +++++++++++++++++++++----------- 4 files changed, 33 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/arch/alpha/kernel/irq.c b/arch/alpha/kernel/irq.c index d0f1620007f..703731accda 100644 --- a/arch/alpha/kernel/irq.c +++ b/arch/alpha/kernel/irq.c @@ -50,7 +50,8 @@ int irq_select_affinity(unsigned int irq) if (!irq_desc[irq].chip->set_affinity || irq_user_affinity[irq]) return 1; - while (!cpu_possible(cpu) || !cpu_isset(cpu, irq_default_affinity)) + while (!cpu_possible(cpu) || + !cpumask_test_cpu(cpu, irq_default_affinity)) cpu = (cpu < (NR_CPUS-1) ? cpu + 1 : 0); last_cpu = cpu; diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index dfaee6bd265..91f1ef8e581 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -109,7 +109,7 @@ extern void enable_irq(unsigned int irq); #if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS) -extern cpumask_t irq_default_affinity; +extern cpumask_var_t irq_default_affinity; extern int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask); extern int irq_can_set_affinity(unsigned int irq); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 61c4a9b6216..cd0cd8dcb34 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -16,8 +16,15 @@ #include "internals.h" #ifdef CONFIG_SMP +cpumask_var_t irq_default_affinity; -cpumask_t irq_default_affinity = CPU_MASK_ALL; +static int init_irq_default_affinity(void) +{ + alloc_cpumask_var(&irq_default_affinity, GFP_KERNEL); + cpumask_setall(irq_default_affinity); + return 0; +} +core_initcall(init_irq_default_affinity); /** * synchronize_irq - wait for pending IRQ handlers (on other CPUs) @@ -127,7 +134,7 @@ int do_irq_select_affinity(unsigned int irq, struct irq_desc *desc) desc->status &= ~IRQ_AFFINITY_SET; } - cpumask_and(&desc->affinity, cpu_online_mask, &irq_default_affinity); + cpumask_and(&desc->affinity, cpu_online_mask, irq_default_affinity); set_affinity: desc->chip->set_affinity(irq, &desc->affinity); diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index d2c0e5ee53c..2abd3a7716e 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -20,7 +20,7 @@ static struct proc_dir_entry *root_irq_dir; static int irq_affinity_proc_show(struct seq_file *m, void *v) { struct irq_desc *desc = irq_to_desc((long)m->private); - cpumask_t *mask = &desc->affinity; + const struct cpumask *mask = &desc->affinity; #ifdef CONFIG_GENERIC_PENDING_IRQ if (desc->status & IRQ_MOVE_PENDING) @@ -93,7 +93,7 @@ static const struct file_operations irq_affinity_proc_fops = { static int default_affinity_show(struct seq_file *m, void *v) { - seq_cpumask(m, &irq_default_affinity); + seq_cpumask(m, irq_default_affinity); seq_putc(m, '\n'); return 0; } @@ -101,27 +101,37 @@ static int default_affinity_show(struct seq_file *m, void *v) static ssize_t default_affinity_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { - cpumask_t new_value; + cpumask_var_t new_value; int err; - err = cpumask_parse_user(buffer, count, &new_value); + if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) + return -ENOMEM; + + err = cpumask_parse_user(buffer, count, new_value); if (err) - return err; + goto out; - if (!is_affinity_mask_valid(new_value)) - return -EINVAL; + if (!is_affinity_mask_valid(new_value)) { + err = -EINVAL; + goto out; + } /* * Do not allow disabling IRQs completely - it's a too easy * way to make the system unusable accidentally :-) At least * one online CPU still has to be targeted. */ - if (!cpus_intersects(new_value, cpu_online_map)) - return -EINVAL; + if (!cpumask_intersects(new_value, cpu_online_mask)) { + err = -EINVAL; + goto out; + } - irq_default_affinity = new_value; + cpumask_copy(irq_default_affinity, new_value); + err = count; - return count; +out: + free_cpumask_var(new_value); + return err; } static int default_affinity_open(struct inode *inode, struct file *file) -- cgit v1.2.3-70-g09d2 From bd232f97b30f6bb630efa136a777647545db3039 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 1 Jan 2009 10:12:26 +1030 Subject: cpumask: convert RCU implementations Impact: use new cpumask API. rcu_ctrlblk contains a cpumask, and it's highly optimized so I don't want a cpumask_var_t (ie. a pointer) for the CONFIG_CPUMASK_OFFSTACK case. It could use a dangling bitmap, and be allocated in __rcu_init to save memory, but for the moment we use a bitmap. (Eventually 'struct cpumask' will be undefined for CONFIG_CPUMASK_OFFSTACK, so we use a bitmap here to show we really mean it). We remove on-stack cpumasks, using cpumask_var_t for rcu_torture_shuffle_tasks() and for_each_cpu_and in force_quiescent_state(). Signed-off-by: Rusty Russell --- include/linux/rcuclassic.h | 4 ++-- kernel/rcuclassic.c | 32 +++++++++++++++++--------------- kernel/rcupreempt.c | 19 ++++++++++--------- kernel/rcutorture.c | 27 +++++++++++++++------------ 4 files changed, 44 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcuclassic.h b/include/linux/rcuclassic.h index 301dda829e3..f3f697df1d7 100644 --- a/include/linux/rcuclassic.h +++ b/include/linux/rcuclassic.h @@ -59,8 +59,8 @@ struct rcu_ctrlblk { int signaled; spinlock_t lock ____cacheline_internodealigned_in_smp; - cpumask_t cpumask; /* CPUs that need to switch in order */ - /* for current batch to proceed. */ + DECLARE_BITMAP(cpumask, NR_CPUS); /* CPUs that need to switch for */ + /* current batch to proceed. */ } ____cacheline_internodealigned_in_smp; /* Is batch a before batch b ? */ diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index e503a002f33..0ff9b05706a 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -63,14 +63,14 @@ static struct rcu_ctrlblk rcu_ctrlblk = { .completed = -300, .pending = -300, .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock), - .cpumask = CPU_MASK_NONE, + .cpumask = CPU_BITS_NONE, }; static struct rcu_ctrlblk rcu_bh_ctrlblk = { .cur = -300, .completed = -300, .pending = -300, .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock), - .cpumask = CPU_MASK_NONE, + .cpumask = CPU_BITS_NONE, }; DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L }; @@ -85,7 +85,6 @@ static void force_quiescent_state(struct rcu_data *rdp, struct rcu_ctrlblk *rcp) { int cpu; - cpumask_t cpumask; unsigned long flags; set_need_resched(); @@ -96,10 +95,10 @@ static void force_quiescent_state(struct rcu_data *rdp, * Don't send IPI to itself. With irqs disabled, * rdp->cpu is the current cpu. * - * cpu_online_map is updated by the _cpu_down() + * cpu_online_mask is updated by the _cpu_down() * using __stop_machine(). Since we're in irqs disabled * section, __stop_machine() is not exectuting, hence - * the cpu_online_map is stable. + * the cpu_online_mask is stable. * * However, a cpu might have been offlined _just_ before * we disabled irqs while entering here. @@ -107,13 +106,14 @@ static void force_quiescent_state(struct rcu_data *rdp, * notification, leading to the offlined cpu's bit * being set in the rcp->cpumask. * - * Hence cpumask = (rcp->cpumask & cpu_online_map) to prevent + * Hence cpumask = (rcp->cpumask & cpu_online_mask) to prevent * sending smp_reschedule() to an offlined CPU. */ - cpus_and(cpumask, rcp->cpumask, cpu_online_map); - cpu_clear(rdp->cpu, cpumask); - for_each_cpu_mask_nr(cpu, cpumask) - smp_send_reschedule(cpu); + for_each_cpu_and(cpu, + to_cpumask(rcp->cpumask), cpu_online_mask) { + if (cpu != rdp->cpu) + smp_send_reschedule(cpu); + } } spin_unlock_irqrestore(&rcp->lock, flags); } @@ -193,7 +193,7 @@ static void print_other_cpu_stall(struct rcu_ctrlblk *rcp) printk(KERN_ERR "INFO: RCU detected CPU stalls:"); for_each_possible_cpu(cpu) { - if (cpu_isset(cpu, rcp->cpumask)) + if (cpumask_test_cpu(cpu, to_cpumask(rcp->cpumask))) printk(" %d", cpu); } printk(" (detected by %d, t=%ld jiffies)\n", @@ -221,7 +221,8 @@ static void check_cpu_stall(struct rcu_ctrlblk *rcp) long delta; delta = jiffies - rcp->jiffies_stall; - if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0) { + if (cpumask_test_cpu(smp_processor_id(), to_cpumask(rcp->cpumask)) && + delta >= 0) { /* We haven't checked in, so go dump stack. */ print_cpu_stall(rcp); @@ -393,7 +394,8 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp) * unnecessarily. */ smp_mb(); - cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask); + cpumask_andnot(to_cpumask(rcp->cpumask), + cpu_online_mask, &nohz_cpu_mask); rcp->signaled = 0; } @@ -406,8 +408,8 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp) */ static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp) { - cpu_clear(cpu, rcp->cpumask); - if (cpus_empty(rcp->cpumask)) { + cpumask_clear_cpu(cpu, to_cpumask(rcp->cpumask)); + if (cpumask_empty(to_cpumask(rcp->cpumask))) { /* batch completed ! */ rcp->completed = rcp->cur; rcu_start_batch(rcp); diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index 04982659875..f9dc8f3720f 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -164,7 +164,8 @@ static char *rcu_try_flip_state_names[] = { "idle", "waitack", "waitzero", "waitmb" }; #endif /* #ifdef CONFIG_RCU_TRACE */ -static cpumask_t rcu_cpu_online_map __read_mostly = CPU_MASK_NONE; +static DECLARE_BITMAP(rcu_cpu_online_map, NR_CPUS) __read_mostly + = CPU_BITS_NONE; /* * Enum and per-CPU flag to determine when each CPU has seen @@ -758,7 +759,7 @@ rcu_try_flip_idle(void) /* Now ask each CPU for acknowledgement of the flip. */ - for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) { + for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map)) { per_cpu(rcu_flip_flag, cpu) = rcu_flipped; dyntick_save_progress_counter(cpu); } @@ -776,7 +777,7 @@ rcu_try_flip_waitack(void) int cpu; RCU_TRACE_ME(rcupreempt_trace_try_flip_a1); - for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) + for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map)) if (rcu_try_flip_waitack_needed(cpu) && per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) { RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1); @@ -808,7 +809,7 @@ rcu_try_flip_waitzero(void) /* Check to see if the sum of the "last" counters is zero. */ RCU_TRACE_ME(rcupreempt_trace_try_flip_z1); - for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) + for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map)) sum += RCU_DATA_CPU(cpu)->rcu_flipctr[lastidx]; if (sum != 0) { RCU_TRACE_ME(rcupreempt_trace_try_flip_ze1); @@ -823,7 +824,7 @@ rcu_try_flip_waitzero(void) smp_mb(); /* ^^^^^^^^^^^^ */ /* Call for a memory barrier from each CPU. */ - for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) { + for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map)) { per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed; dyntick_save_progress_counter(cpu); } @@ -843,7 +844,7 @@ rcu_try_flip_waitmb(void) int cpu; RCU_TRACE_ME(rcupreempt_trace_try_flip_m1); - for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) + for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map)) if (rcu_try_flip_waitmb_needed(cpu) && per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) { RCU_TRACE_ME(rcupreempt_trace_try_flip_me1); @@ -1032,7 +1033,7 @@ void rcu_offline_cpu(int cpu) RCU_DATA_CPU(cpu)->rcu_flipctr[0] = 0; RCU_DATA_CPU(cpu)->rcu_flipctr[1] = 0; - cpu_clear(cpu, rcu_cpu_online_map); + cpumask_clear_cpu(cpu, to_cpumask(rcu_cpu_online_map)); spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags); @@ -1072,7 +1073,7 @@ void __cpuinit rcu_online_cpu(int cpu) struct rcu_data *rdp; spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags); - cpu_set(cpu, rcu_cpu_online_map); + cpumask_set_cpu(cpu, to_cpumask(rcu_cpu_online_map)); spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags); /* @@ -1430,7 +1431,7 @@ void __init __rcu_init(void) * We don't need protection against CPU-Hotplug here * since * a) If a CPU comes online while we are iterating over the - * cpu_online_map below, we would only end up making a + * cpu_online_mask below, we would only end up making a * duplicate call to rcu_online_cpu() which sets the corresponding * CPU's mask in the rcu_cpu_online_map. * diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index b3106552210..3245b40952c 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -868,49 +868,52 @@ static int rcu_idle_cpu; /* Force all torture tasks off this CPU */ */ static void rcu_torture_shuffle_tasks(void) { - cpumask_t tmp_mask; + cpumask_var_t tmp_mask; int i; - cpus_setall(tmp_mask); + if (!alloc_cpumask_var(&tmp_mask, GFP_KERNEL)) + BUG(); + + cpumask_setall(tmp_mask); get_online_cpus(); /* No point in shuffling if there is only one online CPU (ex: UP) */ - if (num_online_cpus() == 1) { - put_online_cpus(); - return; - } + if (num_online_cpus() == 1) + goto out; if (rcu_idle_cpu != -1) - cpu_clear(rcu_idle_cpu, tmp_mask); + cpumask_clear_cpu(rcu_idle_cpu, tmp_mask); - set_cpus_allowed_ptr(current, &tmp_mask); + set_cpus_allowed_ptr(current, tmp_mask); if (reader_tasks) { for (i = 0; i < nrealreaders; i++) if (reader_tasks[i]) set_cpus_allowed_ptr(reader_tasks[i], - &tmp_mask); + tmp_mask); } if (fakewriter_tasks) { for (i = 0; i < nfakewriters; i++) if (fakewriter_tasks[i]) set_cpus_allowed_ptr(fakewriter_tasks[i], - &tmp_mask); + tmp_mask); } if (writer_task) - set_cpus_allowed_ptr(writer_task, &tmp_mask); + set_cpus_allowed_ptr(writer_task, tmp_mask); if (stats_task) - set_cpus_allowed_ptr(stats_task, &tmp_mask); + set_cpus_allowed_ptr(stats_task, tmp_mask); if (rcu_idle_cpu == -1) rcu_idle_cpu = num_online_cpus() - 1; else rcu_idle_cpu--; +out: put_online_cpus(); + free_cpumask_var(tmp_mask); } /* Shuffle tasks across CPUs, with the intent of allowing each CPU in the -- cgit v1.2.3-70-g09d2 From c309b917cab55799ea489d7b5f1b77025d9f8462 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 1 Jan 2009 10:12:27 +1030 Subject: cpumask: convert kernel/profile.c Impact: Reduce kernel memory usage, use new cpumask API. Avoid a static cpumask_t for prof_cpu_mask, and an on-stack cpumask_t in prof_cpu_mask_write_proc. Both become cpumask_var_t. prof_cpu_mask is only allocated when profiling is on, but the NULL checks are optimized out by gcc for the !CPUMASK_OFFSTACK case. Also removed some strange and unnecessary casts. Signed-off-by: Rusty Russell --- kernel/profile.c | 38 +++++++++++++++++++++++++------------- 1 file changed, 25 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/profile.c b/kernel/profile.c index 4cb7d68fed8..d18e2d2654f 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -45,7 +45,7 @@ static unsigned long prof_len, prof_shift; int prof_on __read_mostly; EXPORT_SYMBOL_GPL(prof_on); -static cpumask_t prof_cpu_mask = CPU_MASK_ALL; +static cpumask_var_t prof_cpu_mask; #ifdef CONFIG_SMP static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits); static DEFINE_PER_CPU(int, cpu_profile_flip); @@ -113,9 +113,13 @@ int __ref profile_init(void) buffer_bytes = prof_len*sizeof(atomic_t); if (!slab_is_available()) { prof_buffer = alloc_bootmem(buffer_bytes); + alloc_bootmem_cpumask_var(&prof_cpu_mask); return 0; } + if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL)) + return -ENOMEM; + prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL); if (prof_buffer) return 0; @@ -128,6 +132,7 @@ int __ref profile_init(void) if (prof_buffer) return 0; + free_cpumask_var(prof_cpu_mask); return -ENOMEM; } @@ -386,13 +391,15 @@ out_free: return NOTIFY_BAD; case CPU_ONLINE: case CPU_ONLINE_FROZEN: - cpu_set(cpu, prof_cpu_mask); + if (prof_cpu_mask != NULL) + cpumask_set_cpu(cpu, prof_cpu_mask); break; case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: case CPU_DEAD: case CPU_DEAD_FROZEN: - cpu_clear(cpu, prof_cpu_mask); + if (prof_cpu_mask != NULL) + cpumask_clear_cpu(cpu, prof_cpu_mask); if (per_cpu(cpu_profile_hits, cpu)[0]) { page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]); per_cpu(cpu_profile_hits, cpu)[0] = NULL; @@ -430,7 +437,8 @@ void profile_tick(int type) if (type == CPU_PROFILING && timer_hook) timer_hook(regs); - if (!user_mode(regs) && cpu_isset(smp_processor_id(), prof_cpu_mask)) + if (!user_mode(regs) && prof_cpu_mask != NULL && + cpumask_test_cpu(smp_processor_id(), prof_cpu_mask)) profile_hit(type, (void *)profile_pc(regs)); } @@ -442,7 +450,7 @@ void profile_tick(int type) static int prof_cpu_mask_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) { - int len = cpumask_scnprintf(page, count, (cpumask_t *)data); + int len = cpumask_scnprintf(page, count, data); if (count - len < 2) return -EINVAL; len += sprintf(page + len, "\n"); @@ -452,16 +460,20 @@ static int prof_cpu_mask_read_proc(char *page, char **start, off_t off, static int prof_cpu_mask_write_proc(struct file *file, const char __user *buffer, unsigned long count, void *data) { - cpumask_t *mask = (cpumask_t *)data; + struct cpumask *mask = data; unsigned long full_count = count, err; - cpumask_t new_value; + cpumask_var_t new_value; - err = cpumask_parse_user(buffer, count, &new_value); - if (err) - return err; + if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) + return -ENOMEM; - *mask = new_value; - return full_count; + err = cpumask_parse_user(buffer, count, new_value); + if (!err) { + cpumask_copy(mask, new_value); + err = full_count; + } + free_cpumask_var(new_value); + return err; } void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir) @@ -472,7 +484,7 @@ void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir) entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir); if (!entry) return; - entry->data = (void *)&prof_cpu_mask; + entry->data = prof_cpu_mask; entry->read_proc = prof_cpu_mask_read_proc; entry->write_proc = prof_cpu_mask_write_proc; } -- cgit v1.2.3-70-g09d2 From e0b582ec56f1a1d8b30ebf340a7b91fb09f26c8c Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 1 Jan 2009 10:12:28 +1030 Subject: cpumask: convert kernel/cpu.c Impact: Reduce kernel stack and memory usage, use new cpumask API. Use cpumask_var_t for take_cpu_down() stack var, and frozen_cpus. Note that notify_cpu_starting() can be called before core_initcall allocates frozen_cpus, but the NULL check is optimized out by gcc for the CONFIG_CPUMASK_OFFSTACK=n case. Signed-off-by: Rusty Russell --- kernel/cpu.c | 48 +++++++++++++++++++++++++++++------------------- 1 file changed, 29 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 2c9f78f3a2f..47fff3b63cb 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -194,7 +194,7 @@ static int __ref take_cpu_down(void *_param) static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) { int err, nr_calls = 0; - cpumask_t old_allowed, tmp; + cpumask_var_t old_allowed; void *hcpu = (void *)(long)cpu; unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; struct take_cpu_down_param tcd_param = { @@ -208,6 +208,9 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) if (!cpu_online(cpu)) return -EINVAL; + if (!alloc_cpumask_var(&old_allowed, GFP_KERNEL)) + return -ENOMEM; + cpu_hotplug_begin(); err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); @@ -222,13 +225,11 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) } /* Ensure that we are not runnable on dying cpu */ - old_allowed = current->cpus_allowed; - cpus_setall(tmp); - cpu_clear(cpu, tmp); - set_cpus_allowed_ptr(current, &tmp); - tmp = cpumask_of_cpu(cpu); + cpumask_copy(old_allowed, ¤t->cpus_allowed); + set_cpus_allowed_ptr(current, + cpumask_of(cpumask_any_but(cpu_online_mask, cpu))); - err = __stop_machine(take_cpu_down, &tcd_param, &tmp); + err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); if (err) { /* CPU didn't die: tell everyone. Can't complain. */ if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, @@ -254,7 +255,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) check_for_tasks(cpu); out_allowed: - set_cpus_allowed_ptr(current, &old_allowed); + set_cpus_allowed_ptr(current, old_allowed); out_release: cpu_hotplug_done(); if (!err) { @@ -262,6 +263,7 @@ out_release: hcpu) == NOTIFY_BAD) BUG(); } + free_cpumask_var(old_allowed); return err; } @@ -280,7 +282,7 @@ int __ref cpu_down(unsigned int cpu) /* * Make sure the all cpus did the reschedule and are not - * using stale version of the cpu_active_map. + * using stale version of the cpu_active_mask. * This is not strictly necessary becuase stop_machine() * that we run down the line already provides the required * synchronization. But it's really a side effect and we do not @@ -344,7 +346,7 @@ out_notify: int __cpuinit cpu_up(unsigned int cpu) { int err = 0; - if (!cpu_isset(cpu, cpu_possible_map)) { + if (!cpu_possible(cpu)) { printk(KERN_ERR "can't online cpu %d because it is not " "configured as may-hotadd at boot time\n", cpu); #if defined(CONFIG_IA64) || defined(CONFIG_X86_64) @@ -369,25 +371,25 @@ out: } #ifdef CONFIG_PM_SLEEP_SMP -static cpumask_t frozen_cpus; +static cpumask_var_t frozen_cpus; int disable_nonboot_cpus(void) { int cpu, first_cpu, error = 0; cpu_maps_update_begin(); - first_cpu = first_cpu(cpu_online_map); + first_cpu = cpumask_first(cpu_online_mask); /* We take down all of the non-boot CPUs in one shot to avoid races * with the userspace trying to use the CPU hotplug at the same time */ - cpus_clear(frozen_cpus); + cpumask_clear(frozen_cpus); printk("Disabling non-boot CPUs ...\n"); for_each_online_cpu(cpu) { if (cpu == first_cpu) continue; error = _cpu_down(cpu, 1); if (!error) { - cpu_set(cpu, frozen_cpus); + cpumask_set_cpu(cpu, frozen_cpus); printk("CPU%d is down\n", cpu); } else { printk(KERN_ERR "Error taking CPU%d down: %d\n", @@ -413,11 +415,11 @@ void __ref enable_nonboot_cpus(void) /* Allow everyone to use the CPU hotplug again */ cpu_maps_update_begin(); cpu_hotplug_disabled = 0; - if (cpus_empty(frozen_cpus)) + if (cpumask_empty(frozen_cpus)) goto out; printk("Enabling non-boot CPUs ...\n"); - for_each_cpu_mask_nr(cpu, frozen_cpus) { + for_each_cpu(cpu, frozen_cpus) { error = _cpu_up(cpu, 1); if (!error) { printk("CPU%d is up\n", cpu); @@ -425,10 +427,18 @@ void __ref enable_nonboot_cpus(void) } printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error); } - cpus_clear(frozen_cpus); + cpumask_clear(frozen_cpus); out: cpu_maps_update_done(); } + +static int alloc_frozen_cpus(void) +{ + if (!alloc_cpumask_var(&frozen_cpus, GFP_KERNEL|__GFP_ZERO)) + return -ENOMEM; + return 0; +} +core_initcall(alloc_frozen_cpus); #endif /* CONFIG_PM_SLEEP_SMP */ /** @@ -444,7 +454,7 @@ void __cpuinit notify_cpu_starting(unsigned int cpu) unsigned long val = CPU_STARTING; #ifdef CONFIG_PM_SLEEP_SMP - if (cpu_isset(cpu, frozen_cpus)) + if (frozen_cpus != NULL && cpumask_test_cpu(cpu, frozen_cpus)) val = CPU_STARTING_FROZEN; #endif /* CONFIG_PM_SLEEP_SMP */ raw_notifier_call_chain(&cpu_chain, val, (void *)(long)cpu); @@ -456,7 +466,7 @@ void __cpuinit notify_cpu_starting(unsigned int cpu) * cpu_bit_bitmap[] is a special, "compressed" data structure that * represents all NR_CPUS bits binary values of 1< Date: Thu, 1 Jan 2009 10:12:28 +1030 Subject: cpumask: convert rest of files in kernel/ Impact: Reduce stack usage, use new cpumask API. Mainly changing cpumask_t to 'struct cpumask' and similar simple API conversion. Two conversions worth mentioning: 1) we use cpumask_any_but to avoid a temporary in kernel/softlockup.c, 2) Use cpumask_var_t in taskstats_user_cmd(). Signed-off-by: Rusty Russell Signed-off-by: Mike Travis Cc: Balbir Singh Cc: Ingo Molnar --- include/linux/stop_machine.h | 6 +++--- kernel/power/poweroff.c | 2 +- kernel/softlockup.c | 6 ++---- kernel/stop_machine.c | 8 ++++---- kernel/taskstats.c | 39 ++++++++++++++++++++++++--------------- 5 files changed, 34 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h index faf1519b5ad..74d59a64136 100644 --- a/include/linux/stop_machine.h +++ b/include/linux/stop_machine.h @@ -23,7 +23,7 @@ * * This can be thought of as a very heavy write lock, equivalent to * grabbing every spinlock in the kernel. */ -int stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus); +int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus); /** * __stop_machine: freeze the machine on all CPUs and run this function @@ -34,11 +34,11 @@ int stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus); * Description: This is a special version of the above, which assumes cpus * won't come or go while it's being called. Used by hotplug cpu. */ -int __stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus); +int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus); #else static inline int stop_machine(int (*fn)(void *), void *data, - const cpumask_t *cpus) + const struct cpumask *cpus) { int ret; local_irq_disable(); diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c index 72016f05147..97890831e1b 100644 --- a/kernel/power/poweroff.c +++ b/kernel/power/poweroff.c @@ -27,7 +27,7 @@ static DECLARE_WORK(poweroff_work, do_poweroff); static void handle_poweroff(int key, struct tty_struct *tty) { /* run sysrq poweroff on boot cpu */ - schedule_work_on(first_cpu(cpu_online_map), &poweroff_work); + schedule_work_on(cpumask_first(cpu_online_mask), &poweroff_work); } static struct sysrq_key_op sysrq_poweroff_op = { diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 492f0c72fec..d9188c66278 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -310,10 +310,8 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: if (hotcpu == check_cpu) { - cpumask_t temp_cpu_online_map = cpu_online_map; - - cpu_clear(hotcpu, temp_cpu_online_map); - check_cpu = cpumask_any(&temp_cpu_online_map); + /* Pick any other online cpu. */ + check_cpu = cpumask_any_but(cpu_online_mask, hotcpu); } break; diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 24e8ceacc38..286c41722e8 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -69,10 +69,10 @@ static void stop_cpu(struct work_struct *unused) int err; if (!active_cpus) { - if (cpu == first_cpu(cpu_online_map)) + if (cpu == cpumask_first(cpu_online_mask)) smdata = &active; } else { - if (cpu_isset(cpu, *active_cpus)) + if (cpumask_test_cpu(cpu, active_cpus)) smdata = &active; } /* Simple state machine */ @@ -109,7 +109,7 @@ static int chill(void *unused) return 0; } -int __stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus) +int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) { struct work_struct *sm_work; int i, ret; @@ -142,7 +142,7 @@ int __stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus) return ret; } -int stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus) +int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) { int ret; diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 6d7dc4ec4aa..888adbcca30 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -290,18 +290,17 @@ ret: return; } -static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd) +static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd) { struct listener_list *listeners; struct listener *s, *tmp; unsigned int cpu; - cpumask_t mask = *maskp; - if (!cpus_subset(mask, cpu_possible_map)) + if (!cpumask_subset(mask, cpu_possible_mask)) return -EINVAL; if (isadd == REGISTER) { - for_each_cpu_mask_nr(cpu, mask) { + for_each_cpu(cpu, mask) { s = kmalloc_node(sizeof(struct listener), GFP_KERNEL, cpu_to_node(cpu)); if (!s) @@ -320,7 +319,7 @@ static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd) /* Deregister or cleanup */ cleanup: - for_each_cpu_mask_nr(cpu, mask) { + for_each_cpu(cpu, mask) { listeners = &per_cpu(listener_array, cpu); down_write(&listeners->sem); list_for_each_entry_safe(s, tmp, &listeners->list, list) { @@ -335,7 +334,7 @@ cleanup: return 0; } -static int parse(struct nlattr *na, cpumask_t *mask) +static int parse(struct nlattr *na, struct cpumask *mask) { char *data; int len; @@ -428,23 +427,33 @@ err: static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) { - int rc = 0; + int rc; struct sk_buff *rep_skb; struct taskstats *stats; size_t size; - cpumask_t mask; + cpumask_var_t mask; + + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; - rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], &mask); + rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask); if (rc < 0) - return rc; - if (rc == 0) - return add_del_listener(info->snd_pid, &mask, REGISTER); + goto free_return_rc; + if (rc == 0) { + rc = add_del_listener(info->snd_pid, mask, REGISTER); + goto free_return_rc; + } - rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], &mask); + rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask); if (rc < 0) + goto free_return_rc; + if (rc == 0) { + rc = add_del_listener(info->snd_pid, mask, DEREGISTER); +free_return_rc: + free_cpumask_var(mask); return rc; - if (rc == 0) - return add_del_listener(info->snd_pid, &mask, DEREGISTER); + } + free_cpumask_var(mask); /* * Size includes space for nested attributes -- cgit v1.2.3-70-g09d2 From 5db0e1e9e0f30f160b832a0b5cd1131954bf4f6e Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 1 Jan 2009 10:12:29 +1030 Subject: cpumask: replace for_each_cpu_mask_nr with for_each_cpu in kernel/time/ Impact: cleanup Simple replacement, now the _nr is redundant. Signed-off-by: Rusty Russell Signed-off-by: Mike Travis Cc: Ingo Molnar --- kernel/time/clocksource.c | 3 ++- kernel/time/tick-broadcast.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 32141b15d63..ca89e1593f0 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -145,7 +145,8 @@ static void clocksource_watchdog(unsigned long data) * Cycle through CPUs to check if the CPUs stay * synchronized to each other. */ - int next_cpu = next_cpu_nr(raw_smp_processor_id(), cpu_online_map); + int next_cpu = cpumask_next(raw_smp_processor_id(), + cpu_online_mask); if (next_cpu >= nr_cpu_ids) next_cpu = cpumask_first(cpu_online_mask); diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 356fac57a18..118a3b3b3f9 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -512,7 +512,7 @@ static void tick_broadcast_init_next_event(struct cpumask *mask, struct tick_device *td; int cpu; - for_each_cpu_mask_nr(cpu, *mask) { + for_each_cpu(cpu, mask) { td = &per_cpu(tick_cpu_device, cpu); if (td->evtdev) td->evtdev->next_event = expires; -- cgit v1.2.3-70-g09d2