From ae2975bc3476243b45a1e2344236d7920c268f38 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 14 Nov 2011 15:56:38 -0800 Subject: userns: Convert group_info values from gid_t to kgid_t. As a first step to converting struct cred to be all kuid_t and kgid_t values convert the group values stored in group_info to always be kgid_t values. Unless user namespaces are used this change should have no effect. Acked-by: Serge Hallyn Signed-off-by: Eric W. Biederman --- fs/proc/array.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs/proc/array.c') diff --git a/fs/proc/array.c b/fs/proc/array.c index f9bd395b347..36a0a9192ec 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -81,6 +81,7 @@ #include #include #include +#include #include #include @@ -161,6 +162,7 @@ static inline const char *get_task_state(struct task_struct *tsk) static inline void task_state(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *p) { + struct user_namespace *user_ns = current_user_ns(); struct group_info *group_info; int g; struct fdtable *fdt = NULL; @@ -205,7 +207,8 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, task_unlock(p); for (g = 0; g < min(group_info->ngroups, NGROUPS_SMALL); g++) - seq_printf(m, "%d ", GROUP_AT(group_info, g)); + seq_printf(m, "%d ", + from_kgid_munged(user_ns, GROUP_AT(group_info, g))); put_cred(cred); seq_putc(m, '\n'); -- cgit v1.2.3-70-g09d2 From dcb0f22282e680ee5202ab7574ce78beb3803a9f Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 9 Feb 2012 08:48:21 -0800 Subject: userns: Convert proc to use kuid/kgid where appropriate Acked-by: Serge Hallyn Signed-off-by: Eric W. Biederman --- fs/proc/array.c | 10 ++++++++-- fs/proc/base.c | 16 ++++++++-------- fs/proc/inode.c | 4 ++-- fs/proc/root.c | 2 +- include/linux/pid_namespace.h | 2 +- include/linux/proc_fs.h | 4 ++-- init/Kconfig | 1 - 7 files changed, 22 insertions(+), 17 deletions(-) (limited to 'fs/proc/array.c') diff --git a/fs/proc/array.c b/fs/proc/array.c index 36a0a9192ec..dc4c5a7b9ec 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -191,8 +191,14 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, task_tgid_nr_ns(p, ns), pid_nr_ns(pid, ns), ppid, tpid, - cred->uid, cred->euid, cred->suid, cred->fsuid, - cred->gid, cred->egid, cred->sgid, cred->fsgid); + from_kuid_munged(user_ns, cred->uid), + from_kuid_munged(user_ns, cred->euid), + from_kuid_munged(user_ns, cred->suid), + from_kuid_munged(user_ns, cred->fsuid), + from_kgid_munged(user_ns, cred->gid), + from_kgid_munged(user_ns, cred->egid), + from_kgid_munged(user_ns, cred->sgid), + from_kgid_munged(user_ns, cred->fsgid)); task_lock(p); if (p->files) diff --git a/fs/proc/base.c b/fs/proc/base.c index 2ee514c7e64..c47904994b7 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1562,8 +1562,8 @@ int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) generic_fillattr(inode, stat); rcu_read_lock(); - stat->uid = 0; - stat->gid = 0; + stat->uid = GLOBAL_ROOT_UID; + stat->gid = GLOBAL_ROOT_GID; task = pid_task(proc_pid(inode), PIDTYPE_PID); if (task) { if (!has_pid_permissions(pid, task, 2)) { @@ -1623,8 +1623,8 @@ int pid_revalidate(struct dentry *dentry, struct nameidata *nd) inode->i_gid = cred->egid; rcu_read_unlock(); } else { - inode->i_uid = 0; - inode->i_gid = 0; + inode->i_uid = GLOBAL_ROOT_UID; + inode->i_gid = GLOBAL_ROOT_GID; } inode->i_mode &= ~(S_ISUID | S_ISGID); security_task_to_inode(task, inode); @@ -1811,8 +1811,8 @@ static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd) inode->i_gid = cred->egid; rcu_read_unlock(); } else { - inode->i_uid = 0; - inode->i_gid = 0; + inode->i_uid = GLOBAL_ROOT_UID; + inode->i_gid = GLOBAL_ROOT_GID; } inode->i_mode &= ~(S_ISUID | S_ISGID); security_task_to_inode(task, inode); @@ -2061,8 +2061,8 @@ static int map_files_d_revalidate(struct dentry *dentry, struct nameidata *nd) inode->i_gid = cred->egid; rcu_read_unlock(); } else { - inode->i_uid = 0; - inode->i_gid = 0; + inode->i_uid = GLOBAL_ROOT_UID; + inode->i_gid = GLOBAL_ROOT_GID; } security_task_to_inode(task, inode); status = 1; diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 205c9228083..554ecc54799 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -108,8 +108,8 @@ static int proc_show_options(struct seq_file *seq, struct dentry *root) struct super_block *sb = root->d_sb; struct pid_namespace *pid = sb->s_fs_info; - if (pid->pid_gid) - seq_printf(seq, ",gid=%lu", (unsigned long)pid->pid_gid); + if (!gid_eq(pid->pid_gid, GLOBAL_ROOT_GID)) + seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, pid->pid_gid)); if (pid->hide_pid != 0) seq_printf(seq, ",hidepid=%u", pid->hide_pid); diff --git a/fs/proc/root.c b/fs/proc/root.c index 46a15d8a29c..df4e4561dbb 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -67,7 +67,7 @@ static int proc_parse_options(char *options, struct pid_namespace *pid) case Opt_gid: if (match_int(&args[0], &option)) return 0; - pid->pid_gid = option; + pid->pid_gid = make_kgid(current_user_ns(), option); break; case Opt_hidepid: if (match_int(&args[0], &option)) diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index b067bd8c49d..00474b04714 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -31,7 +31,7 @@ struct pid_namespace { #ifdef CONFIG_BSD_PROCESS_ACCT struct bsd_acct_struct *bacct; #endif - gid_t pid_gid; + kgid_t pid_gid; int hide_pid; int reboot; /* group exit code if this pidns was rebooted */ }; diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index 85c50730623..3fd2e871ff1 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -52,8 +52,8 @@ struct proc_dir_entry { unsigned int low_ino; umode_t mode; nlink_t nlink; - uid_t uid; - gid_t gid; + kuid_t uid; + kgid_t gid; loff_t size; const struct inode_operations *proc_iops; /* diff --git a/init/Kconfig b/init/Kconfig index 0e7d30ba8eb..7bebe91367c 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -948,7 +948,6 @@ config UIDGID_CONVERTED depends on NTFS_FS = n depends on OCFS2_FS = n depends on OMFS_FS = n - depends on PROC_FS = n depends on PROC_SYSCTL = n depends on QNX4FS_FS = n depends on QNX6FS_FS = n -- cgit v1.2.3-70-g09d2 From 715be1fce0d964aca15618b24f6f415f3cbd03c8 Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Thu, 31 May 2012 16:26:19 -0700 Subject: procfs: use more apprioriate types when dumping /proc/N/stat - use int fpr priority and nice, since task_nice()/task_prio() return that - field 24: get_mm_rss() returns unsigned long Signed-off-by: Jan Engelhardt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/array.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/proc/array.c') diff --git a/fs/proc/array.c b/fs/proc/array.c index dc4c5a7b9ec..3a26d23420c 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -370,7 +370,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task, int whole) { unsigned long vsize, eip, esp, wchan = ~0UL; - long priority, nice; + int priority, nice; int tty_pgrp = -1, tty_nr = 0; sigset_t sigign, sigcatch; char state; @@ -492,7 +492,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, seq_put_decimal_ull(m, ' ', 0); seq_put_decimal_ull(m, ' ', start_time); seq_put_decimal_ull(m, ' ', vsize); - seq_put_decimal_ll(m, ' ', mm ? get_mm_rss(mm) : 0); + seq_put_decimal_ull(m, ' ', mm ? get_mm_rss(mm) : 0); seq_put_decimal_ull(m, ' ', rsslim); seq_put_decimal_ull(m, ' ', mm ? (permitted ? mm->start_code : 1) : 0); seq_put_decimal_ull(m, ' ', mm ? (permitted ? mm->end_code : 1) : 0); -- cgit v1.2.3-70-g09d2 From 818411616baf46ceba0cff6f05af3a9b294734f7 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 31 May 2012 16:26:43 -0700 Subject: fs, proc: introduce /proc//task//children entry When we do checkpoint of a task we need to know the list of children the task, has but there is no easy and fast way to generate reverse parent->children chain from arbitrary (while a parent pid is provided in "PPid" field of /proc//status). So instead of walking over all pids in the system (creating one big process tree in memory, just to figure out which children a task has) -- we add explicit /proc//task//children entry, because the kernel already has this kind of information but it is not yet exported. This is a first level children, not the whole process tree. Signed-off-by: Cyrill Gorcunov Reviewed-by: Oleg Nesterov Reviewed-by: Kees Cook Cc: Pavel Emelyanov Cc: Serge Hallyn Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 18 ++++++ fs/proc/array.c | 123 +++++++++++++++++++++++++++++++++++++ fs/proc/base.c | 3 + fs/proc/internal.h | 1 + 4 files changed, 145 insertions(+) (limited to 'fs/proc/array.c') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 912af6ce562..d8d3f9a8e5a 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -40,6 +40,7 @@ Table of Contents 3.4 /proc//coredump_filter - Core dump filtering settings 3.5 /proc//mountinfo - Information about mounts 3.6 /proc//comm & /proc//task//comm + 3.7 /proc//task//children - Information about task children 4 Configuring procfs 4.1 Mount options @@ -1578,6 +1579,23 @@ then the kernel's TASK_COMM_LEN (currently 16 chars) will result in a truncated comm value. +3.7 /proc//task//children - Information about task children +------------------------------------------------------------------------- +This file provides a fast way to retrieve first level children pids +of a task pointed by / pair. The format is a space separated +stream of pids. + +Note the "first level" here -- if a child has own children they will +not be listed here, one needs to read /proc//task//children +to obtain the descendants. + +Since this interface is intended to be fast and cheap it doesn't +guarantee to provide precise results and some children might be +skipped, especially if they've exited right after we printed their +pids, so one need to either stop or freeze processes being inspected +if precise results are needed. + + ------------------------------------------------------------------------------ Configuring procfs ------------------------------------------------------------------------------ diff --git a/fs/proc/array.c b/fs/proc/array.c index 3a26d23420c..62887e39a2d 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -565,3 +565,126 @@ int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns, return 0; } + +#ifdef CONFIG_CHECKPOINT_RESTORE +static struct pid * +get_children_pid(struct inode *inode, struct pid *pid_prev, loff_t pos) +{ + struct task_struct *start, *task; + struct pid *pid = NULL; + + read_lock(&tasklist_lock); + + start = pid_task(proc_pid(inode), PIDTYPE_PID); + if (!start) + goto out; + + /* + * Lets try to continue searching first, this gives + * us significant speedup on children-rich processes. + */ + if (pid_prev) { + task = pid_task(pid_prev, PIDTYPE_PID); + if (task && task->real_parent == start && + !(list_empty(&task->sibling))) { + if (list_is_last(&task->sibling, &start->children)) + goto out; + task = list_first_entry(&task->sibling, + struct task_struct, sibling); + pid = get_pid(task_pid(task)); + goto out; + } + } + + /* + * Slow search case. + * + * We might miss some children here if children + * are exited while we were not holding the lock, + * but it was never promised to be accurate that + * much. + * + * "Just suppose that the parent sleeps, but N children + * exit after we printed their tids. Now the slow paths + * skips N extra children, we miss N tasks." (c) + * + * So one need to stop or freeze the leader and all + * its children to get a precise result. + */ + list_for_each_entry(task, &start->children, sibling) { + if (pos-- == 0) { + pid = get_pid(task_pid(task)); + break; + } + } + +out: + read_unlock(&tasklist_lock); + return pid; +} + +static int children_seq_show(struct seq_file *seq, void *v) +{ + struct inode *inode = seq->private; + pid_t pid; + + pid = pid_nr_ns(v, inode->i_sb->s_fs_info); + return seq_printf(seq, "%d ", pid); +} + +static void *children_seq_start(struct seq_file *seq, loff_t *pos) +{ + return get_children_pid(seq->private, NULL, *pos); +} + +static void *children_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct pid *pid; + + pid = get_children_pid(seq->private, v, *pos + 1); + put_pid(v); + + ++*pos; + return pid; +} + +static void children_seq_stop(struct seq_file *seq, void *v) +{ + put_pid(v); +} + +static const struct seq_operations children_seq_ops = { + .start = children_seq_start, + .next = children_seq_next, + .stop = children_seq_stop, + .show = children_seq_show, +}; + +static int children_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *m; + int ret; + + ret = seq_open(file, &children_seq_ops); + if (ret) + return ret; + + m = file->private_data; + m->private = inode; + + return ret; +} + +int children_seq_release(struct inode *inode, struct file *file) +{ + seq_release(inode, file); + return 0; +} + +const struct file_operations proc_tid_children_operations = { + .open = children_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = children_seq_release, +}; +#endif /* CONFIG_CHECKPOINT_RESTORE */ diff --git a/fs/proc/base.c b/fs/proc/base.c index bd8b4ca6a61..616f41a7cde 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3400,6 +3400,9 @@ static const struct pid_entry tid_base_stuff[] = { ONE("stat", S_IRUGO, proc_tid_stat), ONE("statm", S_IRUGO, proc_pid_statm), REG("maps", S_IRUGO, proc_tid_maps_operations), +#ifdef CONFIG_CHECKPOINT_RESTORE + REG("children", S_IRUGO, proc_tid_children_operations), +#endif #ifdef CONFIG_NUMA REG("numa_maps", S_IRUGO, proc_tid_numa_maps_operations), #endif diff --git a/fs/proc/internal.h b/fs/proc/internal.h index a30643784db..eca4aca5b6e 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -54,6 +54,7 @@ extern int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task); extern loff_t mem_lseek(struct file *file, loff_t offset, int orig); +extern const struct file_operations proc_tid_children_operations; extern const struct file_operations proc_pid_maps_operations; extern const struct file_operations proc_tid_maps_operations; extern const struct file_operations proc_pid_numa_maps_operations; -- cgit v1.2.3-70-g09d2 From 5b172087f99189416d5f47fd7ab5e6fb762a9ba3 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 31 May 2012 16:26:44 -0700 Subject: c/r: procfs: add arg_start/end, env_start/end and exit_code members to /proc/$pid/stat We would like to have an ability to restore command line arguments and program environment pointers but first we need to obtain them somehow. Thus we put these values into /proc/$pid/stat. The exit_code is needed to restore zombie tasks. Signed-off-by: Cyrill Gorcunov Acked-by: Kees Cook Cc: Pavel Emelyanov Cc: Serge Hallyn Cc: KAMEZAWA Hiroyuki Cc: Alexey Dobriyan Cc: Tejun Heo Cc: Andrew Vagin Cc: Vasiliy Kulikov Cc: Alexey Dobriyan Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 5 +++++ fs/proc/array.c | 20 +++++++++++++++++--- 2 files changed, 22 insertions(+), 3 deletions(-) (limited to 'fs/proc/array.c') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index d8d3f9a8e5a..fb0a6aeb936 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -311,6 +311,11 @@ Table 1-4: Contents of the stat files (as of 2.6.30-rc7) start_data address above which program data+bss is placed end_data address below which program data+bss is placed start_brk address above which program heap can be expanded with brk() + arg_start address above which program command line is placed + arg_end address below which program command line is placed + env_start address above which program environment is placed + env_end address below which program environment is placed + exit_code the thread's exit_code in the form reported by the waitpid system call .............................................................................. The /proc/PID/maps file containing the currently mapped memory regions and diff --git a/fs/proc/array.c b/fs/proc/array.c index 62887e39a2d..c1c207c36ca 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -517,9 +517,23 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, seq_put_decimal_ull(m, ' ', delayacct_blkio_ticks(task)); seq_put_decimal_ull(m, ' ', cputime_to_clock_t(gtime)); seq_put_decimal_ll(m, ' ', cputime_to_clock_t(cgtime)); - seq_put_decimal_ull(m, ' ', (mm && permitted) ? mm->start_data : 0); - seq_put_decimal_ull(m, ' ', (mm && permitted) ? mm->end_data : 0); - seq_put_decimal_ull(m, ' ', (mm && permitted) ? mm->start_brk : 0); + + if (mm && permitted) { + seq_put_decimal_ull(m, ' ', mm->start_data); + seq_put_decimal_ull(m, ' ', mm->end_data); + seq_put_decimal_ull(m, ' ', mm->start_brk); + seq_put_decimal_ull(m, ' ', mm->arg_start); + seq_put_decimal_ull(m, ' ', mm->arg_end); + seq_put_decimal_ull(m, ' ', mm->env_start); + seq_put_decimal_ull(m, ' ', mm->env_end); + } else + seq_printf(m, " 0 0 0 0 0 0 0"); + + if (permitted) + seq_put_decimal_ll(m, ' ', task->exit_code); + else + seq_put_decimal_ll(m, ' ', 0); + seq_putc(m, '\n'); if (mm) mmput(mm); -- cgit v1.2.3-70-g09d2