From 1dce27c5aa6770e9d195f2bb7db1db3d4dde5591 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 16 Feb 2012 17:49:42 +0000 Subject: Wrap accesses to the fd_sets in struct fdtable Wrap accesses to the fd_sets in struct fdtable (for recording open files and close-on-exec flags) so that we can move away from using fd_sets since we abuse the fd_set structs by not allocating the full-sized structure under normal circumstances and by non-core code looking at the internals of the fd_sets. The first abuse means that use of FD_ZERO() on these fd_sets is not permitted, since that cannot be told about their abnormal lengths. This introduces six wrapper functions for setting, clearing and testing close-on-exec flags and fd-is-open flags: void __set_close_on_exec(int fd, struct fdtable *fdt); void __clear_close_on_exec(int fd, struct fdtable *fdt); bool close_on_exec(int fd, const struct fdtable *fdt); void __set_open_fd(int fd, struct fdtable *fdt); void __clear_open_fd(int fd, struct fdtable *fdt); bool fd_is_open(int fd, const struct fdtable *fdt); Note that I've prepended '__' to the names of the set/clear functions because they require the caller to hold a lock to use them. Note also that I haven't added wrappers for looking behind the scenes at the the array. Possibly that should exist too. Signed-off-by: David Howells Link: http://lkml.kernel.org/r/20120216174942.23314.1364.stgit@warthog.procyon.org.uk Signed-off-by: H. Peter Anvin Cc: Al Viro --- fs/exec.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/exec.c') diff --git a/fs/exec.c b/fs/exec.c index 92ce83a11e9..22cc38d9e79 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -2078,8 +2078,8 @@ static int umh_pipe_setup(struct subprocess_info *info, struct cred *new) fd_install(0, rp); spin_lock(&cf->file_lock); fdt = files_fdtable(cf); - FD_SET(0, fdt->open_fds); - FD_CLR(0, fdt->close_on_exec); + __set_open_fd(0, fdt); + __clear_close_on_exec(0, fdt); spin_unlock(&cf->file_lock); /* and disallow core files too */ -- cgit v1.2.3-70-g09d2 From 1fd36adcd98c14d2fd97f545293c488775cb2823 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 16 Feb 2012 17:49:54 +0000 Subject: Replace the fd_sets in struct fdtable with an array of unsigned longs Replace the fd_sets in struct fdtable with an array of unsigned longs and then use the standard non-atomic bit operations rather than the FD_* macros. This: (1) Removes the abuses of struct fd_set: (a) Since we don't want to allocate a full fd_set the vast majority of the time, we actually, in effect, just allocate a just-big-enough array of unsigned longs and cast it to an fd_set type - so why bother with the fd_set at all? (b) Some places outside of the core fdtable handling code (such as SELinux) want to look inside the array of unsigned longs hidden inside the fd_set struct for more efficient iteration over the entire set. (2) Eliminates the use of FD_*() macros in the kernel completely. (3) Permits the __FD_*() macros to be deleted entirely where not exposed to userspace. Signed-off-by: David Howells Link: http://lkml.kernel.org/r/20120216174954.23314.48147.stgit@warthog.procyon.org.uk Signed-off-by: H. Peter Anvin Cc: Al Viro --- fs/exec.c | 4 ++-- fs/file.c | 46 ++++++++++++++++++++++------------------------ fs/select.c | 2 +- include/linux/fdtable.h | 28 ++++++++++------------------ kernel/exit.c | 2 +- security/selinux/hooks.c | 2 +- 6 files changed, 37 insertions(+), 47 deletions(-) (limited to 'fs/exec.c') diff --git a/fs/exec.c b/fs/exec.c index 22cc38d9e79..cfd5e3047bd 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1026,10 +1026,10 @@ static void flush_old_files(struct files_struct * files) fdt = files_fdtable(files); if (i >= fdt->max_fds) break; - set = fdt->close_on_exec->fds_bits[j]; + set = fdt->close_on_exec[j]; if (!set) continue; - fdt->close_on_exec->fds_bits[j] = 0; + fdt->close_on_exec[j] = 0; spin_unlock(&files->file_lock); for ( ; set ; i++,set >>= 1) { if (set & 1) { diff --git a/fs/file.c b/fs/file.c index 114fea0a2ce..2d479dd8484 100644 --- a/fs/file.c +++ b/fs/file.c @@ -40,7 +40,7 @@ int sysctl_nr_open_max = 1024 * 1024; /* raised later */ */ static DEFINE_PER_CPU(struct fdtable_defer, fdtable_defer_list); -static void *alloc_fdmem(unsigned int size) +static void *alloc_fdmem(size_t size) { /* * Very large allocations can stress page reclaim, so fall back to @@ -142,7 +142,7 @@ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt) static struct fdtable * alloc_fdtable(unsigned int nr) { struct fdtable *fdt; - char *data; + void *data; /* * Figure out how many fds we actually want to support in this fdtable. @@ -172,14 +172,15 @@ static struct fdtable * alloc_fdtable(unsigned int nr) data = alloc_fdmem(nr * sizeof(struct file *)); if (!data) goto out_fdt; - fdt->fd = (struct file **)data; - data = alloc_fdmem(max_t(unsigned int, + fdt->fd = data; + + data = alloc_fdmem(max_t(size_t, 2 * nr / BITS_PER_BYTE, L1_CACHE_BYTES)); if (!data) goto out_arr; - fdt->open_fds = (fd_set *)data; - data += nr / BITS_PER_BYTE; - fdt->close_on_exec = (fd_set *)data; + fdt->open_fds = data; + data += nr / BITS_PER_LONG; + fdt->close_on_exec = data; fdt->next = NULL; return fdt; @@ -275,11 +276,11 @@ static int count_open_files(struct fdtable *fdt) int i; /* Find the last open fd */ - for (i = size/(8*sizeof(long)); i > 0; ) { - if (fdt->open_fds->fds_bits[--i]) + for (i = size / BITS_PER_LONG; i > 0; ) { + if (fdt->open_fds[--i]) break; } - i = (i+1) * 8 * sizeof(long); + i = (i + 1) * BITS_PER_LONG; return i; } @@ -306,8 +307,8 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) newf->next_fd = 0; new_fdt = &newf->fdtab; new_fdt->max_fds = NR_OPEN_DEFAULT; - new_fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init; - new_fdt->open_fds = (fd_set *)&newf->open_fds_init; + new_fdt->close_on_exec = newf->close_on_exec_init; + new_fdt->open_fds = newf->open_fds_init; new_fdt->fd = &newf->fd_array[0]; new_fdt->next = NULL; @@ -350,10 +351,8 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) old_fds = old_fdt->fd; new_fds = new_fdt->fd; - memcpy(new_fdt->open_fds->fds_bits, - old_fdt->open_fds->fds_bits, open_files/8); - memcpy(new_fdt->close_on_exec->fds_bits, - old_fdt->close_on_exec->fds_bits, open_files/8); + memcpy(new_fdt->open_fds, old_fdt->open_fds, open_files / 8); + memcpy(new_fdt->close_on_exec, old_fdt->close_on_exec, open_files / 8); for (i = open_files; i != 0; i--) { struct file *f = *old_fds++; @@ -379,11 +378,11 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) memset(new_fds, 0, size); if (new_fdt->max_fds > open_files) { - int left = (new_fdt->max_fds-open_files)/8; - int start = open_files / (8 * sizeof(unsigned long)); + int left = (new_fdt->max_fds - open_files) / 8; + int start = open_files / BITS_PER_LONG; - memset(&new_fdt->open_fds->fds_bits[start], 0, left); - memset(&new_fdt->close_on_exec->fds_bits[start], 0, left); + memset(&new_fdt->open_fds[start], 0, left); + memset(&new_fdt->close_on_exec[start], 0, left); } rcu_assign_pointer(newf->fdt, new_fdt); @@ -419,8 +418,8 @@ struct files_struct init_files = { .fdtab = { .max_fds = NR_OPEN_DEFAULT, .fd = &init_files.fd_array[0], - .close_on_exec = (fd_set *)&init_files.close_on_exec_init, - .open_fds = (fd_set *)&init_files.open_fds_init, + .close_on_exec = init_files.close_on_exec_init, + .open_fds = init_files.open_fds_init, }, .file_lock = __SPIN_LOCK_UNLOCKED(init_task.file_lock), }; @@ -443,8 +442,7 @@ repeat: fd = files->next_fd; if (fd < fdt->max_fds) - fd = find_next_zero_bit(fdt->open_fds->fds_bits, - fdt->max_fds, fd); + fd = find_next_zero_bit(fdt->open_fds, fdt->max_fds, fd); error = expand_files(files, fd); if (error < 0) diff --git a/fs/select.c b/fs/select.c index d33418fdc85..2e7fbe8a092 100644 --- a/fs/select.c +++ b/fs/select.c @@ -348,7 +348,7 @@ static int max_select_fd(unsigned long n, fd_set_bits *fds) set = ~(~0UL << (n & (__NFDBITS-1))); n /= __NFDBITS; fdt = files_fdtable(current->files); - open_fds = fdt->open_fds->fds_bits+n; + open_fds = fdt->open_fds + n; max = 0; if (set) { set &= BITS(fds, n); diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h index 7675da2c18f..158a41eed31 100644 --- a/include/linux/fdtable.h +++ b/include/linux/fdtable.h @@ -21,51 +21,43 @@ */ #define NR_OPEN_DEFAULT BITS_PER_LONG -/* - * The embedded_fd_set is a small fd_set, - * suitable for most tasks (which open <= BITS_PER_LONG files) - */ -struct embedded_fd_set { - unsigned long fds_bits[1]; -}; - struct fdtable { unsigned int max_fds; struct file __rcu **fd; /* current fd array */ - fd_set *close_on_exec; - fd_set *open_fds; + unsigned long *close_on_exec; + unsigned long *open_fds; struct rcu_head rcu; struct fdtable *next; }; static inline void __set_close_on_exec(int fd, struct fdtable *fdt) { - FD_SET(fd, fdt->close_on_exec); + __set_bit(fd, fdt->close_on_exec); } static inline void __clear_close_on_exec(int fd, struct fdtable *fdt) { - FD_CLR(fd, fdt->close_on_exec); + __clear_bit(fd, fdt->close_on_exec); } static inline bool close_on_exec(int fd, const struct fdtable *fdt) { - return FD_ISSET(fd, fdt->close_on_exec); + return test_bit(fd, fdt->close_on_exec); } static inline void __set_open_fd(int fd, struct fdtable *fdt) { - FD_SET(fd, fdt->open_fds); + __set_bit(fd, fdt->open_fds); } static inline void __clear_open_fd(int fd, struct fdtable *fdt) { - FD_CLR(fd, fdt->open_fds); + __clear_bit(fd, fdt->open_fds); } static inline bool fd_is_open(int fd, const struct fdtable *fdt) { - return FD_ISSET(fd, fdt->open_fds); + return test_bit(fd, fdt->open_fds); } /* @@ -83,8 +75,8 @@ struct files_struct { */ spinlock_t file_lock ____cacheline_aligned_in_smp; int next_fd; - struct embedded_fd_set close_on_exec_init; - struct embedded_fd_set open_fds_init; + unsigned long close_on_exec_init[1]; + unsigned long open_fds_init[1]; struct file __rcu * fd_array[NR_OPEN_DEFAULT]; }; diff --git a/kernel/exit.c b/kernel/exit.c index 4b4042f9bc6..4db020015f1 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -473,7 +473,7 @@ static void close_files(struct files_struct * files) i = j * __NFDBITS; if (i >= fdt->max_fds) break; - set = fdt->open_fds->fds_bits[j++]; + set = fdt->open_fds[j++]; while (set) { if (set & 1) { struct file * file = xchg(&fdt->fd[i], NULL); diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 6a3683e2842..421c990a20b 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -2145,7 +2145,7 @@ static inline void flush_unauthorized_files(const struct cred *cred, fdt = files_fdtable(files); if (i >= fdt->max_fds) break; - set = fdt->open_fds->fds_bits[j]; + set = fdt->open_fds[j]; if (!set) continue; spin_unlock(&files->file_lock); -- cgit v1.2.3-70-g09d2 From 96f951edb1f1bdbbc99b0cd458f9808bb83d58ae Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 28 Mar 2012 18:30:03 +0100 Subject: Add #includes needed to permit the removal of asm/system.h asm/system.h is a cause of circular dependency problems because it contains commonly used primitive stuff like barrier definitions and uncommonly used stuff like switch_to() that might require MMU definitions. asm/system.h has been disintegrated by this point on all arches into the following common segments: (1) asm/barrier.h Moved memory barrier definitions here. (2) asm/cmpxchg.h Moved xchg() and cmpxchg() here. #included in asm/atomic.h. (3) asm/bug.h Moved die() and similar here. (4) asm/exec.h Moved arch_align_stack() here. (5) asm/elf.h Moved AT_VECTOR_SIZE_ARCH here. (6) asm/switch_to.h Moved switch_to() here. Signed-off-by: David Howells --- drivers/misc/sgi-gru/gru_instructions.h | 1 + drivers/staging/crystalhd/bc_dts_defs.h | 2 ++ fs/binfmt_elf.c | 1 + fs/binfmt_elf_fdpic.c | 1 + fs/exec.c | 1 + include/asm-generic/bitops/atomic.h | 2 +- include/linux/llist.h | 3 +-- include/linux/mtd/map.h | 1 + include/linux/spinlock.h | 1 + kernel/sched/core.c | 1 + 10 files changed, 11 insertions(+), 3 deletions(-) (limited to 'fs/exec.c') diff --git a/drivers/misc/sgi-gru/gru_instructions.h b/drivers/misc/sgi-gru/gru_instructions.h index d95587cc794..04d5170ac14 100644 --- a/drivers/misc/sgi-gru/gru_instructions.h +++ b/drivers/misc/sgi-gru/gru_instructions.h @@ -40,6 +40,7 @@ extern void gru_wait_abort_proc(void *cb); *((volatile unsigned long *)(p)) = v; /* force st.rel */ \ } while (0) #elif defined(CONFIG_X86_64) +#include #define __flush_cache(p) clflush(p) #define gru_ordered_store_ulong(p, v) \ do { \ diff --git a/drivers/staging/crystalhd/bc_dts_defs.h b/drivers/staging/crystalhd/bc_dts_defs.h index 8cd51a7aad8..647e116e10d 100644 --- a/drivers/staging/crystalhd/bc_dts_defs.h +++ b/drivers/staging/crystalhd/bc_dts_defs.h @@ -26,6 +26,8 @@ #ifndef _BC_DTS_DEFS_H_ #define _BC_DTS_DEFS_H_ +#include + /* BIT Mask */ #define BC_BIT(_x) (1 << (_x)) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 81878b78c9d..18276531f7c 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -35,6 +35,7 @@ #include #include #include +#include static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs); static int load_elf_library(struct file *); diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index c64bf5ee2df..9bd5612a822 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -39,6 +39,7 @@ #include #include #include +#include typedef char *elf_caddr_t; diff --git a/fs/exec.c b/fs/exec.c index 23559c227d9..c8b63d14da8 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -59,6 +59,7 @@ #include #include #include +#include #include #include "internal.h" diff --git a/include/asm-generic/bitops/atomic.h b/include/asm-generic/bitops/atomic.h index ecc44a8e2b4..9ae6c34dc19 100644 --- a/include/asm-generic/bitops/atomic.h +++ b/include/asm-generic/bitops/atomic.h @@ -2,7 +2,7 @@ #define _ASM_GENERIC_BITOPS_ATOMIC_H_ #include -#include +#include #ifdef CONFIG_SMP #include diff --git a/include/linux/llist.h b/include/linux/llist.h index 801b44b07aa..a5199f6d0e8 100644 --- a/include/linux/llist.h +++ b/include/linux/llist.h @@ -56,8 +56,7 @@ */ #include -#include -#include +#include struct llist_head { struct llist_node *first; diff --git a/include/linux/mtd/map.h b/include/linux/mtd/map.h index 94e924e2ecd..ade5c990f1f 100644 --- a/include/linux/mtd/map.h +++ b/include/linux/mtd/map.h @@ -31,6 +31,7 @@ #include #include #include +#include #ifdef CONFIG_MTD_MAP_BANK_WIDTH_1 #define map_bankwidth(map) 1 diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index 7df6c17b028..fa0f93e4d86 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -55,6 +55,7 @@ #include #include #include +#include #include diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 503d6426126..157fb9b2b18 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -73,6 +73,7 @@ #include #include +#include #include #include #include -- cgit v1.2.3-70-g09d2 From 6308191f6f55d3629c7dbe72dfb856ad9fa560fd Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 30 Mar 2012 18:26:36 +0200 Subject: tracing, sched, vfs: Fix 'old_pid' usage in trace_sched_process_exec() 1. TRACE_EVENT(sched_process_exec) forgets to actually use the old pid argument, it sets ->old_pid = p->pid. 2. search_binary_handler() uses the wrong pid number. tracepoint needs the global pid_t from the root namespace, while old_pid is the virtual pid number as it seen by the tracer/parent. With this patch we have two pid_t's in search_binary_handler(), not really nice. Perhaps we should switch to "struct pid*", but in this case it would be better to cleanup the current code first and move the "depth == 0" code outside. Signed-off-by: Oleg Nesterov Cc: David Smith Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Denys Vlasenko Link: http://lkml.kernel.org/r/20120330162636.GA4857@redhat.com Signed-off-by: Ingo Molnar --- fs/exec.c | 7 ++++--- include/trace/events/sched.h | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'fs/exec.c') diff --git a/fs/exec.c b/fs/exec.c index 23559c227d9..644f6c4eb60 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1370,7 +1370,7 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs) unsigned int depth = bprm->recursion_depth; int try,retval; struct linux_binfmt *fmt; - pid_t old_pid; + pid_t old_pid, old_vpid; retval = security_bprm_check(bprm); if (retval) @@ -1381,8 +1381,9 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs) return retval; /* Need to fetch pid before load_binary changes it */ + old_pid = current->pid; rcu_read_lock(); - old_pid = task_pid_nr_ns(current, task_active_pid_ns(current->parent)); + old_vpid = task_pid_nr_ns(current, task_active_pid_ns(current->parent)); rcu_read_unlock(); retval = -ENOENT; @@ -1405,7 +1406,7 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs) if (retval >= 0) { if (depth == 0) { trace_sched_process_exec(current, old_pid, bprm); - ptrace_event(PTRACE_EVENT_EXEC, old_pid); + ptrace_event(PTRACE_EVENT_EXEC, old_vpid); } put_binfmt(fmt); allow_write_access(bprm->file); diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index fbc7b1ad929..ea7a2035456 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -295,7 +295,7 @@ TRACE_EVENT(sched_process_exec, TP_fast_assign( __assign_str(filename, bprm->filename); __entry->pid = p->pid; - __entry->old_pid = p->pid; + __entry->old_pid = old_pid; ), TP_printk("filename=%s pid=%d old_pid=%d", __get_str(filename), -- cgit v1.2.3-70-g09d2