From e78c3496790ee8a36522a838b59b388e8a709e65 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Sat, 16 Aug 2014 13:40:10 -0400 Subject: time, signal: Protect resource use statistics with seqlock Both times() and clock_gettime(CLOCK_PROCESS_CPUTIME_ID) have scalability issues on large systems, due to both functions being serialized with a lock. The lock protects against reporting a wrong value, due to a thread in the task group exiting, its statistics reporting up to the signal struct, and that exited task's statistics being counted twice (or not at all). Protecting that with a lock results in times() and clock_gettime() being completely serialized on large systems. This can be fixed by using a seqlock around the events that gather and propagate statistics. As an additional benefit, the protection code can be moved into thread_group_cputime(), slightly simplifying the calling functions. In the case of posix_cpu_clock_get_task() things can be simplified a lot, because the calling function already ensures that the task sticks around, and the rest is now taken care of in thread_group_cputime(). This way the statistics reporting code can run lockless. Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra (Intel) Cc: Alex Thorlton Cc: Andrew Morton Cc: Daeseok Youn Cc: David Rientjes Cc: Dongsheng Yang Cc: Geert Uytterhoeven Cc: Guillaume Morin Cc: Ionut Alexa Cc: Kees Cook Cc: Linus Torvalds Cc: Li Zefan Cc: Michal Hocko Cc: Michal Schmidt Cc: Oleg Nesterov Cc: Vladimir Davydov Cc: umgwanakikbuti@gmail.com Cc: fweisbec@gmail.com Cc: srao@redhat.com Cc: lwoodman@redhat.com Cc: atheurer@redhat.com Link: http://lkml.kernel.org/r/20140816134010.26a9b572@annuminas.surriel.com Signed-off-by: Ingo Molnar --- kernel/sys.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel/sys.c') diff --git a/kernel/sys.c b/kernel/sys.c index ce8129192a2..b6636643cbd 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -862,11 +862,9 @@ void do_sys_times(struct tms *tms) { cputime_t tgutime, tgstime, cutime, cstime; - spin_lock_irq(¤t->sighand->siglock); thread_group_cputime_adjusted(current, &tgutime, &tgstime); cutime = current->signal->cutime; cstime = current->signal->cstime; - spin_unlock_irq(¤t->sighand->siglock); tms->tms_utime = cputime_to_clock_t(tgutime); tms->tms_stime = cputime_to_clock_t(tgstime); tms->tms_cutime = cputime_to_clock_t(cutime); -- cgit v1.2.3-70-g09d2 From 8764b338b37524ab1a78aee527318ebee9762487 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 9 Oct 2014 15:27:32 -0700 Subject: mm: use may_adjust_brk helper Signed-off-by: Cyrill Gorcunov Cc: Kees Cook Cc: Tejun Heo Cc: Andrew Vagin Cc: Eric W. Biederman Cc: H. Peter Anvin Acked-by: Serge Hallyn Cc: Pavel Emelyanov Cc: Vasiliy Kulikov Cc: KAMEZAWA Hiroyuki Cc: Michael Kerrisk Cc: Julien Tinnes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 11 ++++------- mm/mmap.c | 7 +++---- 2 files changed, 7 insertions(+), 11 deletions(-) (limited to 'kernel/sys.c') diff --git a/kernel/sys.c b/kernel/sys.c index ce8129192a2..7879729bd3b 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1693,7 +1693,6 @@ exit: static int prctl_set_mm(int opt, unsigned long addr, unsigned long arg4, unsigned long arg5) { - unsigned long rlim = rlimit(RLIMIT_DATA); struct mm_struct *mm = current->mm; struct vm_area_struct *vma; int error; @@ -1733,9 +1732,8 @@ static int prctl_set_mm(int opt, unsigned long addr, if (addr <= mm->end_data) goto out; - if (rlim < RLIM_INFINITY && - (mm->brk - addr) + - (mm->end_data - mm->start_data) > rlim) + if (check_data_rlimit(rlimit(RLIMIT_DATA), mm->brk, addr, + mm->end_data, mm->start_data)) goto out; mm->start_brk = addr; @@ -1745,9 +1743,8 @@ static int prctl_set_mm(int opt, unsigned long addr, if (addr <= mm->end_data) goto out; - if (rlim < RLIM_INFINITY && - (addr - mm->start_brk) + - (mm->end_data - mm->start_data) > rlim) + if (check_data_rlimit(rlimit(RLIMIT_DATA), addr, mm->start_brk, + mm->end_data, mm->start_data)) goto out; mm->brk = addr; diff --git a/mm/mmap.c b/mm/mmap.c index 2814189f501..7ff38f1a66e 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -268,7 +268,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len); SYSCALL_DEFINE1(brk, unsigned long, brk) { - unsigned long rlim, retval; + unsigned long retval; unsigned long newbrk, oldbrk; struct mm_struct *mm = current->mm; unsigned long min_brk; @@ -298,9 +298,8 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) * segment grow beyond its set limit the in case where the limit is * not page aligned -Ram Gupta */ - rlim = rlimit(RLIMIT_DATA); - if (rlim < RLIM_INFINITY && (brk - mm->start_brk) + - (mm->end_data - mm->start_data) > rlim) + if (check_data_rlimit(rlimit(RLIMIT_DATA), brk, mm->start_brk, + mm->end_data, mm->start_data)) goto out; newbrk = PAGE_ALIGN(brk); -- cgit v1.2.3-70-g09d2 From 71fe97e185040c5dac3216cd54e186dfa534efa0 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 9 Oct 2014 15:27:34 -0700 Subject: prctl: PR_SET_MM -- factor out mmap_sem when updating mm::exe_file Instead of taking mm->mmap_sem inside prctl_set_mm_exe_file() move it out and rename the helper to prctl_set_mm_exe_file_locked(). This will allow to reuse this function in a next patch. Signed-off-by: Cyrill Gorcunov Cc: Kees Cook Cc: Tejun Heo Cc: Andrew Vagin Cc: Eric W. Biederman Cc: H. Peter Anvin Acked-by: Serge Hallyn Cc: Pavel Emelyanov Cc: Vasiliy Kulikov Cc: KAMEZAWA Hiroyuki Cc: Michael Kerrisk Cc: Julien Tinnes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'kernel/sys.c') diff --git a/kernel/sys.c b/kernel/sys.c index 7879729bd3b..14222a1699c 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1628,12 +1628,14 @@ SYSCALL_DEFINE1(umask, int, mask) return mask; } -static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) +static int prctl_set_mm_exe_file_locked(struct mm_struct *mm, unsigned int fd) { struct fd exe; struct inode *inode; int err; + VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); + exe = fdget(fd); if (!exe.file) return -EBADF; @@ -1654,8 +1656,6 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) if (err) goto exit; - down_write(&mm->mmap_sem); - /* * Forbid mm->exe_file change if old file still mapped. */ @@ -1667,7 +1667,7 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) if (vma->vm_file && path_equal(&vma->vm_file->f_path, &mm->exe_file->f_path)) - goto exit_unlock; + goto exit; } /* @@ -1678,13 +1678,10 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) */ err = -EPERM; if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags)) - goto exit_unlock; + goto exit; err = 0; set_mm_exe_file(mm, exe.file); /* this grabs a reference to exe.file */ -exit_unlock: - up_write(&mm->mmap_sem); - exit: fdput(exe); return err; @@ -1703,8 +1700,12 @@ static int prctl_set_mm(int opt, unsigned long addr, if (!capable(CAP_SYS_RESOURCE)) return -EPERM; - if (opt == PR_SET_MM_EXE_FILE) - return prctl_set_mm_exe_file(mm, (unsigned int)addr); + if (opt == PR_SET_MM_EXE_FILE) { + down_write(&mm->mmap_sem); + error = prctl_set_mm_exe_file_locked(mm, (unsigned int)addr); + up_write(&mm->mmap_sem); + return error; + } if (addr >= TASK_SIZE || addr < mmap_min_addr) return -EINVAL; -- cgit v1.2.3-70-g09d2 From f606b77f1a9e362451aca8f81d8f36a3a112139e Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 9 Oct 2014 15:27:37 -0700 Subject: prctl: PR_SET_MM -- introduce PR_SET_MM_MAP operation During development of c/r we've noticed that in case if we need to support user namespaces we face a problem with capabilities in prctl(PR_SET_MM, ...) call, in particular once new user namespace is created capable(CAP_SYS_RESOURCE) no longer passes. A approach is to eliminate CAP_SYS_RESOURCE check but pass all new values in one bundle, which would allow the kernel to make more intensive test for sanity of values and same time allow us to support checkpoint/restore of user namespaces. Thus a new command PR_SET_MM_MAP introduced. It takes a pointer of prctl_mm_map structure which carries all the members to be updated. prctl(PR_SET_MM, PR_SET_MM_MAP, struct prctl_mm_map *, size) struct prctl_mm_map { __u64 start_code; __u64 end_code; __u64 start_data; __u64 end_data; __u64 start_brk; __u64 brk; __u64 start_stack; __u64 arg_start; __u64 arg_end; __u64 env_start; __u64 env_end; __u64 *auxv; __u32 auxv_size; __u32 exe_fd; }; All members except @exe_fd correspond ones of struct mm_struct. To figure out which available values these members may take here are meanings of the members. - start_code, end_code: represent bounds of executable code area - start_data, end_data: represent bounds of data area - start_brk, brk: used to calculate bounds for brk() syscall - start_stack: used when accounting space needed for command line arguments, environment and shmat() syscall - arg_start, arg_end, env_start, env_end: represent memory area supplied for command line arguments and environment variables - auxv, auxv_size: carries auxiliary vector, Elf format specifics - exe_fd: file descriptor number for executable link (/proc/self/exe) Thus we apply the following requirements to the values 1) Any member except @auxv, @auxv_size, @exe_fd is rather an address in user space thus it must be laying inside [mmap_min_addr, mmap_max_addr) interval. 2) While @[start|end]_code and @[start|end]_data may point to an nonexisting VMAs (say a program maps own new .text and .data segments during execution) the rest of members should belong to VMA which must exist. 3) Addresses must be ordered, ie @start_ member must not be greater or equal to appropriate @end_ member. 4) As in regular Elf loading procedure we require that @start_brk and @brk be greater than @end_data. 5) If RLIMIT_DATA rlimit is set to non-infinity new values should not exceed existing limit. Same applies to RLIMIT_STACK. 6) Auxiliary vector size must not exceed existing one (which is predefined as AT_VECTOR_SIZE and depends on architecture). 7) File descriptor passed in @exe_file should be pointing to executable file (because we use existing prctl_set_mm_exe_file_locked helper it ensures that the file we are going to use as exe link has all required permission granted). Now about where these members are involved inside kernel code: - @start_code and @end_code are used in /proc/$pid/[stat|statm] output; - @start_data and @end_data are used in /proc/$pid/[stat|statm] output, also they are considered if there enough space for brk() syscall result if RLIMIT_DATA is set; - @start_brk shown in /proc/$pid/stat output and accounted in brk() syscall if RLIMIT_DATA is set; also this member is tested to find a symbolic name of mmap event for perf system (we choose if event is generated for "heap" area); one more aplication is selinux -- we test if a process has PROCESS__EXECHEAP permission if trying to make heap area being executable with mprotect() syscall; - @brk is a current value for brk() syscall which lays inside heap area, it's shown in /proc/$pid/stat. When syscall brk() succesfully provides new memory area to a user space upon brk() completion the mm::brk is updated to carry new value; Both @start_brk and @brk are actively used in /proc/$pid/maps and /proc/$pid/smaps output to find a symbolic name "heap" for VMA being scanned; - @start_stack is printed out in /proc/$pid/stat and used to find a symbolic name "stack" for task and threads in /proc/$pid/maps and /proc/$pid/smaps output, and as the same as with @start_brk -- perf system uses it for event naming. Also kernel treat this member as a start address of where to map vDSO pages and to check if there is enough space for shmat() syscall; - @arg_start, @arg_end, @env_start and @env_end are printed out in /proc/$pid/stat. Another access to the data these members represent is to read /proc/$pid/environ or /proc/$pid/cmdline. Any attempt to read these areas kernel tests with access_process_vm helper so a user must have enough rights for this action; - @auxv and @auxv_size may be read from /proc/$pid/auxv. Strictly speaking kernel doesn't care much about which exactly data is sitting there because it is solely for userspace; - @exe_fd is referred from /proc/$pid/exe and when generating coredump. We uses prctl_set_mm_exe_file_locked helper to update this member, so exe-file link modification remains one-shot action. Still note that updating exe-file link now doesn't require sys-resource capability anymore, after all there is no much profit in preventing setup own file link (there are a number of ways to execute own code -- ptrace, ld-preload, so that the only reliable way to find which exactly code is executed is to inspect running program memory). Still we require the caller to be at least user-namespace root user. I believe the old interface should be deprecated and ripped off in a couple of kernel releases if no one against. To test if new interface is implemented in the kernel one can pass PR_SET_MM_MAP_SIZE opcode and the kernel returns the size of currently supported struct prctl_mm_map. [akpm@linux-foundation.org: fix 80-col wordwrap in macro definitions] Signed-off-by: Cyrill Gorcunov Cc: Kees Cook Cc: Tejun Heo Acked-by: Andrew Vagin Tested-by: Andrew Vagin Cc: Eric W. Biederman Cc: H. Peter Anvin Acked-by: Serge Hallyn Cc: Pavel Emelyanov Cc: Vasiliy Kulikov Cc: KAMEZAWA Hiroyuki Cc: Michael Kerrisk Cc: Julien Tinnes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/prctl.h | 27 +++++++ kernel/sys.c | 190 ++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 216 insertions(+), 1 deletion(-) (limited to 'kernel/sys.c') diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index 58afc04c107..513df75d0fc 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -1,6 +1,8 @@ #ifndef _LINUX_PRCTL_H #define _LINUX_PRCTL_H +#include + /* Values to pass as first argument to prctl() */ #define PR_SET_PDEATHSIG 1 /* Second arg is a signal */ @@ -119,6 +121,31 @@ # define PR_SET_MM_ENV_END 11 # define PR_SET_MM_AUXV 12 # define PR_SET_MM_EXE_FILE 13 +# define PR_SET_MM_MAP 14 +# define PR_SET_MM_MAP_SIZE 15 + +/* + * This structure provides new memory descriptor + * map which mostly modifies /proc/pid/stat[m] + * output for a task. This mostly done in a + * sake of checkpoint/restore functionality. + */ +struct prctl_mm_map { + __u64 start_code; /* code section bounds */ + __u64 end_code; + __u64 start_data; /* data section bounds */ + __u64 end_data; + __u64 start_brk; /* heap for brk() syscall */ + __u64 brk; + __u64 start_stack; /* stack starts at */ + __u64 arg_start; /* command line arguments bounds */ + __u64 arg_end; + __u64 env_start; /* environment variables bounds */ + __u64 env_end; + __u64 *auxv; /* auxiliary vector */ + __u32 auxv_size; /* vector size */ + __u32 exe_fd; /* /proc/$pid/exe link file */ +}; /* * Set specific pid that is allowed to ptrace the current task. diff --git a/kernel/sys.c b/kernel/sys.c index 14222a1699c..f7030b06001 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1687,6 +1687,187 @@ exit: return err; } +#ifdef CONFIG_CHECKPOINT_RESTORE +/* + * WARNING: we don't require any capability here so be very careful + * in what is allowed for modification from userspace. + */ +static int validate_prctl_map(struct prctl_mm_map *prctl_map) +{ + unsigned long mmap_max_addr = TASK_SIZE; + struct mm_struct *mm = current->mm; + int error = -EINVAL, i; + + static const unsigned char offsets[] = { + offsetof(struct prctl_mm_map, start_code), + offsetof(struct prctl_mm_map, end_code), + offsetof(struct prctl_mm_map, start_data), + offsetof(struct prctl_mm_map, end_data), + offsetof(struct prctl_mm_map, start_brk), + offsetof(struct prctl_mm_map, brk), + offsetof(struct prctl_mm_map, start_stack), + offsetof(struct prctl_mm_map, arg_start), + offsetof(struct prctl_mm_map, arg_end), + offsetof(struct prctl_mm_map, env_start), + offsetof(struct prctl_mm_map, env_end), + }; + + /* + * Make sure the members are not somewhere outside + * of allowed address space. + */ + for (i = 0; i < ARRAY_SIZE(offsets); i++) { + u64 val = *(u64 *)((char *)prctl_map + offsets[i]); + + if ((unsigned long)val >= mmap_max_addr || + (unsigned long)val < mmap_min_addr) + goto out; + } + + /* + * Make sure the pairs are ordered. + */ +#define __prctl_check_order(__m1, __op, __m2) \ + ((unsigned long)prctl_map->__m1 __op \ + (unsigned long)prctl_map->__m2) ? 0 : -EINVAL + error = __prctl_check_order(start_code, <, end_code); + error |= __prctl_check_order(start_data, <, end_data); + error |= __prctl_check_order(start_brk, <=, brk); + error |= __prctl_check_order(arg_start, <=, arg_end); + error |= __prctl_check_order(env_start, <=, env_end); + if (error) + goto out; +#undef __prctl_check_order + + error = -EINVAL; + + /* + * @brk should be after @end_data in traditional maps. + */ + if (prctl_map->start_brk <= prctl_map->end_data || + prctl_map->brk <= prctl_map->end_data) + goto out; + + /* + * Neither we should allow to override limits if they set. + */ + if (check_data_rlimit(rlimit(RLIMIT_DATA), prctl_map->brk, + prctl_map->start_brk, prctl_map->end_data, + prctl_map->start_data)) + goto out; + + /* + * Someone is trying to cheat the auxv vector. + */ + if (prctl_map->auxv_size) { + if (!prctl_map->auxv || prctl_map->auxv_size > sizeof(mm->saved_auxv)) + goto out; + } + + /* + * Finally, make sure the caller has the rights to + * change /proc/pid/exe link: only local root should + * be allowed to. + */ + if (prctl_map->exe_fd != (u32)-1) { + struct user_namespace *ns = current_user_ns(); + const struct cred *cred = current_cred(); + + if (!uid_eq(cred->uid, make_kuid(ns, 0)) || + !gid_eq(cred->gid, make_kgid(ns, 0))) + goto out; + } + + error = 0; +out: + return error; +} + +static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data_size) +{ + struct prctl_mm_map prctl_map = { .exe_fd = (u32)-1, }; + unsigned long user_auxv[AT_VECTOR_SIZE]; + struct mm_struct *mm = current->mm; + int error; + + BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv)); + BUILD_BUG_ON(sizeof(struct prctl_mm_map) > 256); + + if (opt == PR_SET_MM_MAP_SIZE) + return put_user((unsigned int)sizeof(prctl_map), + (unsigned int __user *)addr); + + if (data_size != sizeof(prctl_map)) + return -EINVAL; + + if (copy_from_user(&prctl_map, addr, sizeof(prctl_map))) + return -EFAULT; + + error = validate_prctl_map(&prctl_map); + if (error) + return error; + + if (prctl_map.auxv_size) { + memset(user_auxv, 0, sizeof(user_auxv)); + if (copy_from_user(user_auxv, + (const void __user *)prctl_map.auxv, + prctl_map.auxv_size)) + return -EFAULT; + + /* Last entry must be AT_NULL as specification requires */ + user_auxv[AT_VECTOR_SIZE - 2] = AT_NULL; + user_auxv[AT_VECTOR_SIZE - 1] = AT_NULL; + } + + down_write(&mm->mmap_sem); + if (prctl_map.exe_fd != (u32)-1) + error = prctl_set_mm_exe_file_locked(mm, prctl_map.exe_fd); + downgrade_write(&mm->mmap_sem); + if (error) + goto out; + + /* + * We don't validate if these members are pointing to + * real present VMAs because application may have correspond + * VMAs already unmapped and kernel uses these members for statistics + * output in procfs mostly, except + * + * - @start_brk/@brk which are used in do_brk but kernel lookups + * for VMAs when updating these memvers so anything wrong written + * here cause kernel to swear at userspace program but won't lead + * to any problem in kernel itself + */ + + mm->start_code = prctl_map.start_code; + mm->end_code = prctl_map.end_code; + mm->start_data = prctl_map.start_data; + mm->end_data = prctl_map.end_data; + mm->start_brk = prctl_map.start_brk; + mm->brk = prctl_map.brk; + mm->start_stack = prctl_map.start_stack; + mm->arg_start = prctl_map.arg_start; + mm->arg_end = prctl_map.arg_end; + mm->env_start = prctl_map.env_start; + mm->env_end = prctl_map.env_end; + + /* + * Note this update of @saved_auxv is lockless thus + * if someone reads this member in procfs while we're + * updating -- it may get partly updated results. It's + * known and acceptable trade off: we leave it as is to + * not introduce additional locks here making the kernel + * more complex. + */ + if (prctl_map.auxv_size) + memcpy(mm->saved_auxv, user_auxv, sizeof(user_auxv)); + + error = 0; +out: + up_read(&mm->mmap_sem); + return error; +} +#endif /* CONFIG_CHECKPOINT_RESTORE */ + static int prctl_set_mm(int opt, unsigned long addr, unsigned long arg4, unsigned long arg5) { @@ -1694,9 +1875,16 @@ static int prctl_set_mm(int opt, unsigned long addr, struct vm_area_struct *vma; int error; - if (arg5 || (arg4 && opt != PR_SET_MM_AUXV)) + if (arg5 || (arg4 && (opt != PR_SET_MM_AUXV && + opt != PR_SET_MM_MAP && + opt != PR_SET_MM_MAP_SIZE))) return -EINVAL; +#ifdef CONFIG_CHECKPOINT_RESTORE + if (opt == PR_SET_MM_MAP || opt == PR_SET_MM_MAP_SIZE) + return prctl_set_mm_map(opt, (const void __user *)addr, arg4); +#endif + if (!capable(CAP_SYS_RESOURCE)) return -EPERM; -- cgit v1.2.3-70-g09d2 From 96dad67ff244e797c4bc3e4f7f0fdaa0cfdf0a7d Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Thu, 9 Oct 2014 15:28:39 -0700 Subject: mm: use VM_BUG_ON_MM where possible Dump the contents of the relevant struct_mm when we hit the bug condition. Signed-off-by: Sasha Levin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 3 +-- kernel/sys.c | 2 +- mm/huge_memory.c | 2 +- mm/mlock.c | 2 +- mm/mmap.c | 7 ++++--- mm/pagewalk.c | 2 +- 6 files changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel/sys.c') diff --git a/kernel/fork.c b/kernel/fork.c index a91e47d86de..8c162d10274 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -601,9 +601,8 @@ static void check_mm(struct mm_struct *mm) printk(KERN_ALERT "BUG: Bad rss-counter state " "mm:%p idx:%d val:%ld\n", mm, i, x); } - #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS - VM_BUG_ON(mm->pmd_huge_pte); + VM_BUG_ON_MM(mm->pmd_huge_pte, mm); #endif } diff --git a/kernel/sys.c b/kernel/sys.c index f7030b06001..df692fbf1e7 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1634,7 +1634,7 @@ static int prctl_set_mm_exe_file_locked(struct mm_struct *mm, unsigned int fd) struct inode *inode; int err; - VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); + VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm); exe = fdget(fd); if (!exe.file) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index c13148cc745..74c78aa8bc2 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2048,7 +2048,7 @@ int __khugepaged_enter(struct mm_struct *mm) return -ENOMEM; /* __khugepaged_exit() must not run from under us */ - VM_BUG_ON(khugepaged_test_exit(mm)); + VM_BUG_ON_MM(khugepaged_test_exit(mm), mm); if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) { free_mm_slot(mm_slot); return 0; diff --git a/mm/mlock.c b/mm/mlock.c index d5d09d0786e..03aa8512723 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -235,7 +235,7 @@ long __mlock_vma_pages_range(struct vm_area_struct *vma, VM_BUG_ON(end & ~PAGE_MASK); VM_BUG_ON_VMA(start < vma->vm_start, vma); VM_BUG_ON_VMA(end > vma->vm_end, vma); - VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); + VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm); gup_flags = FOLL_TOUCH | FOLL_MLOCK; /* diff --git a/mm/mmap.c b/mm/mmap.c index c9bc285df25..16d19b48e2a 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -410,8 +410,9 @@ static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore) for (nd = rb_first(root); nd; nd = rb_next(nd)) { struct vm_area_struct *vma; vma = rb_entry(nd, struct vm_area_struct, vm_rb); - BUG_ON(vma != ignore && - vma->rb_subtree_gap != vma_compute_subtree_gap(vma)); + VM_BUG_ON_VMA(vma != ignore && + vma->rb_subtree_gap != vma_compute_subtree_gap(vma), + vma); } } @@ -448,7 +449,7 @@ static void validate_mm(struct mm_struct *mm) pr_emerg("map_count %d rb %d\n", mm->map_count, i); bug = 1; } - BUG_ON(bug); + VM_BUG_ON_MM(bug, mm); } #else #define validate_mm_rb(root, ignore) do { } while (0) diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 2beeabf502c..ad83195521f 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -177,7 +177,7 @@ int walk_page_range(unsigned long addr, unsigned long end, if (!walk->mm) return -EINVAL; - VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem)); + VM_BUG_ON_MM(!rwsem_is_locked(&walk->mm->mmap_sem), walk->mm); pgd = pgd_offset(walk->mm, addr); do { -- cgit v1.2.3-70-g09d2 From ec94fc3d59b54561da03a0e433d93217b08c1481 Mon Sep 17 00:00:00 2001 From: "vishnu.ps" Date: Thu, 9 Oct 2014 15:30:23 -0700 Subject: kernel/sys.c: whitespace fixes Fix minor errors and warning messages in kernel/sys.c. These errors were reported by checkpatch while working with some modifications in sys.c file. Fixing this first will help me to improve my further patches. ERROR: trailing whitespace - 9 ERROR: do not use assignment in if condition - 4 ERROR: spaces required around that '?' (ctx:VxO) - 10 ERROR: switch and case should be at the same indent - 3 total 26 errors & 3 warnings fixed. Signed-off-by: vishnu.ps Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 265 ++++++++++++++++++++++++++++++----------------------------- 1 file changed, 137 insertions(+), 128 deletions(-) (limited to 'kernel/sys.c') diff --git a/kernel/sys.c b/kernel/sys.c index df692fbf1e7..037fd76bdc7 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -62,28 +62,28 @@ #include #ifndef SET_UNALIGN_CTL -# define SET_UNALIGN_CTL(a,b) (-EINVAL) +# define SET_UNALIGN_CTL(a, b) (-EINVAL) #endif #ifndef GET_UNALIGN_CTL -# define GET_UNALIGN_CTL(a,b) (-EINVAL) +# define GET_UNALIGN_CTL(a, b) (-EINVAL) #endif #ifndef SET_FPEMU_CTL -# define SET_FPEMU_CTL(a,b) (-EINVAL) +# define SET_FPEMU_CTL(a, b) (-EINVAL) #endif #ifndef GET_FPEMU_CTL -# define GET_FPEMU_CTL(a,b) (-EINVAL) +# define GET_FPEMU_CTL(a, b) (-EINVAL) #endif #ifndef SET_FPEXC_CTL -# define SET_FPEXC_CTL(a,b) (-EINVAL) +# define SET_FPEXC_CTL(a, b) (-EINVAL) #endif #ifndef GET_FPEXC_CTL -# define GET_FPEXC_CTL(a,b) (-EINVAL) +# define GET_FPEXC_CTL(a, b) (-EINVAL) #endif #ifndef GET_ENDIAN -# define GET_ENDIAN(a,b) (-EINVAL) +# define GET_ENDIAN(a, b) (-EINVAL) #endif #ifndef SET_ENDIAN -# define SET_ENDIAN(a,b) (-EINVAL) +# define SET_ENDIAN(a, b) (-EINVAL) #endif #ifndef GET_TSC_CTL # define GET_TSC_CTL(a) (-EINVAL) @@ -182,39 +182,40 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) rcu_read_lock(); read_lock(&tasklist_lock); switch (which) { - case PRIO_PROCESS: - if (who) - p = find_task_by_vpid(who); - else - p = current; - if (p) - error = set_one_prio(p, niceval, error); - break; - case PRIO_PGRP: - if (who) - pgrp = find_vpid(who); - else - pgrp = task_pgrp(current); - do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { - error = set_one_prio(p, niceval, error); - } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); - break; - case PRIO_USER: - uid = make_kuid(cred->user_ns, who); - user = cred->user; - if (!who) - uid = cred->uid; - else if (!uid_eq(uid, cred->uid) && - !(user = find_user(uid))) + case PRIO_PROCESS: + if (who) + p = find_task_by_vpid(who); + else + p = current; + if (p) + error = set_one_prio(p, niceval, error); + break; + case PRIO_PGRP: + if (who) + pgrp = find_vpid(who); + else + pgrp = task_pgrp(current); + do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { + error = set_one_prio(p, niceval, error); + } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); + break; + case PRIO_USER: + uid = make_kuid(cred->user_ns, who); + user = cred->user; + if (!who) + uid = cred->uid; + else if (!uid_eq(uid, cred->uid)) { + user = find_user(uid); + if (!user) goto out_unlock; /* No processes for this user */ - - do_each_thread(g, p) { - if (uid_eq(task_uid(p), uid)) - error = set_one_prio(p, niceval, error); - } while_each_thread(g, p); - if (!uid_eq(uid, cred->uid)) - free_uid(user); /* For find_user() */ - break; + } + do_each_thread(g, p) { + if (uid_eq(task_uid(p), uid)) + error = set_one_prio(p, niceval, error); + } while_each_thread(g, p); + if (!uid_eq(uid, cred->uid)) + free_uid(user); /* For find_user() */ + break; } out_unlock: read_unlock(&tasklist_lock); @@ -244,47 +245,48 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) rcu_read_lock(); read_lock(&tasklist_lock); switch (which) { - case PRIO_PROCESS: - if (who) - p = find_task_by_vpid(who); - else - p = current; - if (p) { + case PRIO_PROCESS: + if (who) + p = find_task_by_vpid(who); + else + p = current; + if (p) { + niceval = nice_to_rlimit(task_nice(p)); + if (niceval > retval) + retval = niceval; + } + break; + case PRIO_PGRP: + if (who) + pgrp = find_vpid(who); + else + pgrp = task_pgrp(current); + do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { + niceval = nice_to_rlimit(task_nice(p)); + if (niceval > retval) + retval = niceval; + } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); + break; + case PRIO_USER: + uid = make_kuid(cred->user_ns, who); + user = cred->user; + if (!who) + uid = cred->uid; + else if (!uid_eq(uid, cred->uid)) { + user = find_user(uid); + if (!user) + goto out_unlock; /* No processes for this user */ + } + do_each_thread(g, p) { + if (uid_eq(task_uid(p), uid)) { niceval = nice_to_rlimit(task_nice(p)); if (niceval > retval) retval = niceval; } - break; - case PRIO_PGRP: - if (who) - pgrp = find_vpid(who); - else - pgrp = task_pgrp(current); - do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { - niceval = nice_to_rlimit(task_nice(p)); - if (niceval > retval) - retval = niceval; - } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); - break; - case PRIO_USER: - uid = make_kuid(cred->user_ns, who); - user = cred->user; - if (!who) - uid = cred->uid; - else if (!uid_eq(uid, cred->uid) && - !(user = find_user(uid))) - goto out_unlock; /* No processes for this user */ - - do_each_thread(g, p) { - if (uid_eq(task_uid(p), uid)) { - niceval = nice_to_rlimit(task_nice(p)); - if (niceval > retval) - retval = niceval; - } - } while_each_thread(g, p); - if (!uid_eq(uid, cred->uid)) - free_uid(user); /* for find_user() */ - break; + } while_each_thread(g, p); + if (!uid_eq(uid, cred->uid)) + free_uid(user); /* for find_user() */ + break; } out_unlock: read_unlock(&tasklist_lock); @@ -306,7 +308,7 @@ out_unlock: * * The general idea is that a program which uses just setregid() will be * 100% compatible with BSD. A program which uses just setgid() will be - * 100% compatible with POSIX with saved IDs. + * 100% compatible with POSIX with saved IDs. * * SMP: There are not races, the GIDs are checked only by filesystem * operations (as far as semantic preservation is concerned). @@ -364,7 +366,7 @@ error: } /* - * setgid() is implemented like SysV w/ SAVED_IDS + * setgid() is implemented like SysV w/ SAVED_IDS * * SMP: Same implicit races as above. */ @@ -442,7 +444,7 @@ static int set_user(struct cred *new) * * The general idea is that a program which uses just setreuid() will be * 100% compatible with BSD. A program which uses just setuid() will be - * 100% compatible with POSIX with saved IDs. + * 100% compatible with POSIX with saved IDs. */ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) { @@ -503,17 +505,17 @@ error: abort_creds(new); return retval; } - + /* - * setuid() is implemented like SysV with SAVED_IDS - * + * setuid() is implemented like SysV with SAVED_IDS + * * Note that SAVED_ID's is deficient in that a setuid root program - * like sendmail, for example, cannot set its uid to be a normal + * like sendmail, for example, cannot set its uid to be a normal * user and then switch back, because if you're root, setuid() sets * the saved uid too. If you don't like this, blame the bright people * in the POSIX committee and/or USG. Note that the BSD-style setreuid() * will allow a root program to temporarily drop privileges and be able to - * regain them by swapping the real and effective uid. + * regain them by swapping the real and effective uid. */ SYSCALL_DEFINE1(setuid, uid_t, uid) { @@ -637,10 +639,12 @@ SYSCALL_DEFINE3(getresuid, uid_t __user *, ruidp, uid_t __user *, euidp, uid_t _ euid = from_kuid_munged(cred->user_ns, cred->euid); suid = from_kuid_munged(cred->user_ns, cred->suid); - if (!(retval = put_user(ruid, ruidp)) && - !(retval = put_user(euid, euidp))) - retval = put_user(suid, suidp); - + retval = put_user(ruid, ruidp); + if (!retval) { + retval = put_user(euid, euidp); + if (!retval) + return put_user(suid, suidp); + } return retval; } @@ -709,9 +713,12 @@ SYSCALL_DEFINE3(getresgid, gid_t __user *, rgidp, gid_t __user *, egidp, gid_t _ egid = from_kgid_munged(cred->user_ns, cred->egid); sgid = from_kgid_munged(cred->user_ns, cred->sgid); - if (!(retval = put_user(rgid, rgidp)) && - !(retval = put_user(egid, egidp))) - retval = put_user(sgid, sgidp); + retval = put_user(rgid, rgidp); + if (!retval) { + retval = put_user(egid, egidp); + if (!retval) + retval = put_user(sgid, sgidp); + } return retval; } @@ -1284,7 +1291,6 @@ SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim) /* * Back compatibility for getrlimit. Needed for some apps. */ - SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, struct rlimit __user *, rlim) { @@ -1299,7 +1305,7 @@ SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, x.rlim_cur = 0x7FFFFFFF; if (x.rlim_max > 0x7FFFFFFF) x.rlim_max = 0x7FFFFFFF; - return copy_to_user(rlim, &x, sizeof(x))?-EFAULT:0; + return copy_to_user(rlim, &x, sizeof(x)) ? -EFAULT : 0; } #endif @@ -1527,7 +1533,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) cputime_t tgutime, tgstime, utime, stime; unsigned long maxrss = 0; - memset((char *) r, 0, sizeof *r); + memset((char *)r, 0, sizeof (*r)); utime = stime = 0; if (who == RUSAGE_THREAD) { @@ -1541,41 +1547,41 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) return; switch (who) { - case RUSAGE_BOTH: - case RUSAGE_CHILDREN: - utime = p->signal->cutime; - stime = p->signal->cstime; - r->ru_nvcsw = p->signal->cnvcsw; - r->ru_nivcsw = p->signal->cnivcsw; - r->ru_minflt = p->signal->cmin_flt; - r->ru_majflt = p->signal->cmaj_flt; - r->ru_inblock = p->signal->cinblock; - r->ru_oublock = p->signal->coublock; - maxrss = p->signal->cmaxrss; - - if (who == RUSAGE_CHILDREN) - break; - - case RUSAGE_SELF: - thread_group_cputime_adjusted(p, &tgutime, &tgstime); - utime += tgutime; - stime += tgstime; - r->ru_nvcsw += p->signal->nvcsw; - r->ru_nivcsw += p->signal->nivcsw; - r->ru_minflt += p->signal->min_flt; - r->ru_majflt += p->signal->maj_flt; - r->ru_inblock += p->signal->inblock; - r->ru_oublock += p->signal->oublock; - if (maxrss < p->signal->maxrss) - maxrss = p->signal->maxrss; - t = p; - do { - accumulate_thread_rusage(t, r); - } while_each_thread(p, t); + case RUSAGE_BOTH: + case RUSAGE_CHILDREN: + utime = p->signal->cutime; + stime = p->signal->cstime; + r->ru_nvcsw = p->signal->cnvcsw; + r->ru_nivcsw = p->signal->cnivcsw; + r->ru_minflt = p->signal->cmin_flt; + r->ru_majflt = p->signal->cmaj_flt; + r->ru_inblock = p->signal->cinblock; + r->ru_oublock = p->signal->coublock; + maxrss = p->signal->cmaxrss; + + if (who == RUSAGE_CHILDREN) break; - default: - BUG(); + case RUSAGE_SELF: + thread_group_cputime_adjusted(p, &tgutime, &tgstime); + utime += tgutime; + stime += tgstime; + r->ru_nvcsw += p->signal->nvcsw; + r->ru_nivcsw += p->signal->nivcsw; + r->ru_minflt += p->signal->min_flt; + r->ru_majflt += p->signal->maj_flt; + r->ru_inblock += p->signal->inblock; + r->ru_oublock += p->signal->oublock; + if (maxrss < p->signal->maxrss) + maxrss = p->signal->maxrss; + t = p; + do { + accumulate_thread_rusage(t, r); + } while_each_thread(p, t); + break; + + default: + BUG(); } unlock_task_sighand(p, &flags); @@ -1585,6 +1591,7 @@ out: if (who != RUSAGE_CHILDREN) { struct mm_struct *mm = get_task_mm(p); + if (mm) { setmax_mm_hiwater_rss(&maxrss, mm); mmput(mm); @@ -1596,6 +1603,7 @@ out: int getrusage(struct task_struct *p, int who, struct rusage __user *ru) { struct rusage r; + k_getrusage(p, who, &r); return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0; } @@ -2209,6 +2217,7 @@ SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep, { int err = 0; int cpu = raw_smp_processor_id(); + if (cpup) err |= put_user(cpu, cpup); if (nodep) -- cgit v1.2.3-70-g09d2 From 0baae41ea8365a7b5a34c6474a77d7eb1126f6b2 Mon Sep 17 00:00:00 2001 From: Scotty Bauer Date: Thu, 9 Oct 2014 15:30:26 -0700 Subject: kernel/sys.c: compat sysinfo syscall: fix undefined behavior Fix undefined behavior and compiler warning by replacing right shift 32 with upper_32_bits macro Signed-off-by: Scotty Bauer Cc: Clemens Ladisch Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sys.c') diff --git a/kernel/sys.c b/kernel/sys.c index 037fd76bdc7..dfce4debd13 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2330,7 +2330,7 @@ COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info) /* Check to see if any memory value is too large for 32-bit and scale * down if needed */ - if ((s.totalram >> 32) || (s.totalswap >> 32)) { + if (upper_32_bits(s.totalram) || upper_32_bits(s.totalswap)) { int bitcount = 0; while (s.mem_unit < PAGE_SIZE) { -- cgit v1.2.3-70-g09d2