From 54a43d54988a3731d644fdeb7a1d6f46b4ac64c7 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 23 Jan 2014 15:53:13 -0800 Subject: numa: add a sysctl for numa_balancing Add a working sysctl to enable/disable automatic numa memory balancing at runtime. This allows us to track down performance problems with this feature and is generally a good idea. This was possible earlier through debugfs, but only with special debugging options set. Also fix the boot message. [akpm@linux-foundation.org: s/sched_numa_balancing/sysctl_numa_balancing/] Signed-off-by: Andi Kleen Acked-by: Mel Gorman Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched/core.c | 24 +++++++++++++++++++++++- kernel/sysctl.c | 9 +++++++++ 2 files changed, 32 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4d6964e4971..7fea865a810 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1770,7 +1770,29 @@ void set_numabalancing_state(bool enabled) numabalancing_enabled = enabled; } #endif /* CONFIG_SCHED_DEBUG */ -#endif /* CONFIG_NUMA_BALANCING */ + +#ifdef CONFIG_PROC_SYSCTL +int sysctl_numa_balancing(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table t; + int err; + int state = numabalancing_enabled; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + t = *table; + t.data = &state; + err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); + if (err < 0) + return err; + if (write) + set_numabalancing_state(state); + return err; +} +#endif +#endif /* * fork()/clone()-time setup: diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 332cefcdb04..693eac39c20 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -389,6 +389,15 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "numa_balancing", + .data = NULL, /* filled in by handler */ + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_numa_balancing, + .extra1 = &zero, + .extra2 = &one, + }, #endif /* CONFIG_NUMA_BALANCING */ #endif /* CONFIG_SCHED_DEBUG */ { -- cgit v1.2.3-70-g09d2 From 1d3fa370346d9d96ab0efb84e3312aed3aeb35ea Mon Sep 17 00:00:00 2001 From: Arun KS Date: Thu, 23 Jan 2014 15:54:19 -0800 Subject: printk: flush conflicting continuation line An earlier newline was missing and current print is from different task. In this scenario flush the continuation line and store this line seperatly. This patch fix the below scenario of timestamp interleaving, [ 28.154370 ] read_word_reg : reg[0x 3], reg[0x 4] data [0x 642] [ 28.155428 ] uart disconnect [ 31.947341 ] dvfs[cpufreq.c<275>]:plug-in cpu<1> done [ 28.155445 ] UART detached : send switch state 201 [ 32.014112 ] read_reg : reg[0x 3] data[0x21] [akpm@linux-foundation.org: simplify and condense the code] Signed-off-by: Arun KS Signed-off-by: Arun KS Cc: Joe Perches Cc: Tejun Heo Cc: Frederic Weisbecker Cc: Paul Gortmaker Cc: Kay Sievers Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index f8b41bddc6d..b1d255f0413 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1595,10 +1595,13 @@ asmlinkage int vprintk_emit(int facility, int level, * either merge it with the current buffer and flush, or if * there was a race with interrupts (prefix == true) then just * flush it out and store this line separately. + * If the preceding printk was from a different task and missed + * a newline, flush and append the newline. */ - if (cont.len && cont.owner == current) { - if (!(lflags & LOG_PREFIX)) - stored = cont_add(facility, level, text, text_len); + if (cont.len) { + if (cont.owner == current && !(lflags & LOG_PREFIX)) + stored = cont_add(facility, level, text, + text_len); cont_flush(LOG_NEWLINE); } -- cgit v1.2.3-70-g09d2 From ff252c1fc537b0c9e40f62da0a9d11bf0737b7db Mon Sep 17 00:00:00 2001 From: DaeSeok Youn Date: Thu, 23 Jan 2014 15:55:46 -0800 Subject: kernel/fork.c: make dup_mm() static dup_mm() is used only in kernel/fork.c Signed-off-by: Daeseok Youn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 2 -- kernel/fork.c | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 33e4e9e1f62..66a17ad55bc 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2295,8 +2295,6 @@ extern struct mm_struct *get_task_mm(struct task_struct *task); extern struct mm_struct *mm_access(struct task_struct *task, unsigned int mode); /* Remove the current tasks stale references to the old mm_struct */ extern void mm_release(struct task_struct *, struct mm_struct *); -/* Allocate a new mm structure and copy contents from tsk->mm */ -extern struct mm_struct *dup_mm(struct task_struct *tsk); extern int copy_thread(unsigned long, unsigned long, unsigned long, struct task_struct *); diff --git a/kernel/fork.c b/kernel/fork.c index 2f11bbe376b..5615ead014e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -800,7 +800,7 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) * Allocate a new mm structure and copy contents from the * mm structure of the passed in task structure. */ -struct mm_struct *dup_mm(struct task_struct *tsk) +static struct mm_struct *dup_mm(struct task_struct *tsk) { struct mm_struct *mm, *oldmm = current->mm; int err; -- cgit v1.2.3-70-g09d2 From 5d59e18270d4769c9160c282b25c00b6fc004ffb Mon Sep 17 00:00:00 2001 From: Daeseok Youn Date: Thu, 23 Jan 2014 15:55:47 -0800 Subject: kernel/fork.c: fix coding style issues Fix errors reported by checkpatch.pl. One error is parentheses, the other is a whitespace issue. Signed-off-by: Daeseok Youn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 5615ead014e..01ccc610991 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1654,7 +1654,7 @@ SYSCALL_DEFINE0(fork) return do_fork(SIGCHLD, 0, 0, NULL, NULL); #else /* can not support in nommu mode */ - return(-EINVAL); + return -EINVAL; #endif } #endif @@ -1662,7 +1662,7 @@ SYSCALL_DEFINE0(fork) #ifdef __ARCH_WANT_SYS_VFORK SYSCALL_DEFINE0(vfork) { - return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0, + return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0, 0, NULL, NULL); } #endif -- cgit v1.2.3-70-g09d2 From 68ce670b6e8edc30551862e7f6a306e45389e189 Mon Sep 17 00:00:00 2001 From: Daeseok Youn Date: Thu, 23 Jan 2014 15:55:48 -0800 Subject: kernel/fork.c: remove redundant NULL check in dup_mm() current->mm doesn't need a NULL check in dup_mm(). Becasue dup_mm() is used only in copy_mm() and current->mm is checked whether it is NULL or not in copy_mm() before calling dup_mm(). Signed-off-by: Daeseok Youn Acked-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 01ccc610991..b6dd0bbf424 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -805,9 +805,6 @@ static struct mm_struct *dup_mm(struct task_struct *tsk) struct mm_struct *mm, *oldmm = current->mm; int err; - if (!oldmm) - return NULL; - mm = allocate_mm(); if (!mm) goto fail_nomem; -- cgit v1.2.3-70-g09d2 From 98611e4e6a2b4a03fd2d4750cce8e4455a995c8d Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 23 Jan 2014 15:55:52 -0800 Subject: exec: kill task_struct->did_exec We can kill either task->did_exec or PF_FORKNOEXEC, they are mutually exclusive. The patch kills ->did_exec because it has a single user. Signed-off-by: Oleg Nesterov Acked-by: KOSAKI Motohiro Cc: Al Viro Cc: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 1 - include/linux/sched.h | 1 - kernel/fork.c | 1 - kernel/sys.c | 5 ++--- 4 files changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/fs/exec.c b/fs/exec.c index f860866e04b..493b102a27c 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1424,7 +1424,6 @@ static int exec_binprm(struct linux_binprm *bprm) audit_bprm(bprm); trace_sched_process_exec(current, old_pid, bprm); ptrace_event(PTRACE_EVENT_EXEC, old_vpid); - current->did_exec = 1; proc_exec_connector(current); } diff --git a/include/linux/sched.h b/include/linux/sched.h index 66a17ad55bc..68a0e84463a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1239,7 +1239,6 @@ struct task_struct { /* Used for emulating ABI behavior of previous Linux versions */ unsigned int personality; - unsigned did_exec:1; unsigned in_execve:1; /* Tell the LSMs that the process is doing an * execve */ unsigned in_iowait:1; diff --git a/kernel/fork.c b/kernel/fork.c index b6dd0bbf424..a17621c6cd4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1226,7 +1226,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (!try_module_get(task_thread_info(p)->exec_domain->module)) goto bad_fork_cleanup_count; - p->did_exec = 0; delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ copy_flags(clone_flags, p); INIT_LIST_HEAD(&p->children); diff --git a/kernel/sys.c b/kernel/sys.c index c72311324ea..ecd3ea12f72 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -895,8 +895,7 @@ SYSCALL_DEFINE1(times, struct tms __user *, tbuf) * only important on a multi-user system anyway, to make sure one user * can't send a signal to a process owned by another. -TYT, 12/12/91 * - * Auch. Had to add the 'did_exec' flag to conform completely to POSIX. - * LBT 04.03.94 + * !PF_FORKNOEXEC check to conform completely to POSIX. */ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid) { @@ -932,7 +931,7 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid) if (task_session(p) != task_session(group_leader)) goto out; err = -EACCES; - if (p->did_exec) + if (!(p->flags & PF_FORKNOEXEC)) goto out; } else { err = -ESRCH; -- cgit v1.2.3-70-g09d2 From 2e1f38358246b8f8e5871026b21d374e9bb1a163 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 23 Jan 2014 15:55:55 -0800 Subject: kernel/sys.c: k_getrusage() can use while_each_thread() Change k_getrusage() to use while_each_thread(), no changes in the compiled code. Signed-off-by: Oleg Nesterov Cc: KOSAKI Motohiro Cc: Al Viro Cc: Kees Cook Reviewed-by: Sameer Nanda Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index ecd3ea12f72..c0a58be780a 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1571,8 +1571,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) t = p; do { accumulate_thread_rusage(t, r); - t = next_thread(t); - } while (t != p); + } while_each_thread(p, t); break; default: -- cgit v1.2.3-70-g09d2 From 8d38f203b46c36626285400b9466b08abecaaa80 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 23 Jan 2014 15:55:56 -0800 Subject: kernel/signal.c: change do_signal_stop/do_sigaction to use while_each_thread() Change do_signal_stop() and do_sigaction() to avoid next_thread() and use while_each_thread() instead. Signed-off-by: Oleg Nesterov Cc: KOSAKI Motohiro Cc: Al Viro Cc: Kees Cook Reviewed-by: Sameer Nanda Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 940b30ee9a3..52f881db1ca 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2047,8 +2047,8 @@ static bool do_signal_stop(int signr) if (task_set_jobctl_pending(current, signr | gstop)) sig->group_stop_count++; - for (t = next_thread(current); t != current; - t = next_thread(t)) { + t = current; + while_each_thread(current, t) { /* * Setting state to TASK_STOPPED for a group * stop is always done with the siglock held, @@ -3125,8 +3125,7 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) rm_from_queue_full(&mask, &t->signal->shared_pending); do { rm_from_queue_full(&mask, &t->pending); - t = next_thread(t); - } while (t != current); + } while_each_thread(current, t); } } -- cgit v1.2.3-70-g09d2 From 7984754b99b6c89054edc405e9d9d35810a91d36 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 23 Jan 2014 15:55:59 -0800 Subject: kexec: add sysctl to disable kexec_load For general-purpose (i.e. distro) kernel builds it makes sense to build with CONFIG_KEXEC to allow end users to choose what kind of things they want to do with kexec. However, in the face of trying to lock down a system with such a kernel, there needs to be a way to disable kexec_load (much like module loading can be disabled). Without this, it is too easy for the root user to modify kernel memory even when CONFIG_STRICT_DEVMEM and modules_disabled are set. With this change, it is still possible to load an image for use later, then disable kexec_load so the image (or lack of image) can't be altered. The intention is for using this in environments where "perfect" enforcement is hard. Without a verified boot, along with verified modules, and along with verified kexec, this is trying to give a system a better chance to defend itself (or at least grow the window of discoverability) against attack in the face of a privilege escalation. In my mind, I consider several boot scenarios: 1) Verified boot of read-only verified root fs loading fd-based verification of kexec images. 2) Secure boot of writable root fs loading signed kexec images. 3) Regular boot loading kexec (e.g. kcrash) image early and locking it. 4) Regular boot with no control of kexec image at all. 1 and 2 don't exist yet, but will soon once the verified kexec series has landed. 4 is the state of things now. The gap between 2 and 4 is too large, so this change creates scenario 3, a middle-ground above 4 when 2 and 1 are not possible for a system. Signed-off-by: Kees Cook Acked-by: Rik van Riel Cc: Vivek Goyal Cc: Eric Biederman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/kernel.txt | 15 ++++++++++++++- include/linux/kexec.h | 1 + kernel/kexec.c | 3 ++- kernel/sysctl.c | 13 +++++++++++++ 4 files changed, 30 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 6d486404200..ee9a2f983b9 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -33,6 +33,7 @@ show up in /proc/sys/kernel: - domainname - hostname - hotplug +- kexec_load_disabled - kptr_restrict - kstack_depth_to_print [ X86 only ] - l2cr [ PPC only ] @@ -287,6 +288,18 @@ Default value is "/sbin/hotplug". ============================================================== +kexec_load_disabled: + +A toggle indicating if the kexec_load syscall has been disabled. This +value defaults to 0 (false: kexec_load enabled), but can be set to 1 +(true: kexec_load disabled). Once true, kexec can no longer be used, and +the toggle cannot be set back to false. This allows a kexec image to be +loaded before disabling the syscall, allowing a system to set up (and +later use) an image without it being altered. Generally used together +with the "modules_disabled" sysctl. + +============================================================== + kptr_restrict: This toggle indicates whether restrictions are placed on @@ -331,7 +344,7 @@ A toggle value indicating if modules are allowed to be loaded in an otherwise modular kernel. This toggle defaults to off (0), but can be set true (1). Once true, modules can be neither loaded nor unloaded, and the toggle cannot be set back -to false. +to false. Generally used with the "kexec_load_disabled" toggle. ============================================================== diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 5fd33dc1fe3..6d4066cdb5b 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -170,6 +170,7 @@ unsigned long paddr_vmcoreinfo_note(void); extern struct kimage *kexec_image; extern struct kimage *kexec_crash_image; +extern int kexec_load_disabled; #ifndef kexec_flush_icache_page #define kexec_flush_icache_page(page) diff --git a/kernel/kexec.c b/kernel/kexec.c index 9c970167e40..ac738781d35 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -932,6 +932,7 @@ static int kimage_load_segment(struct kimage *image, */ struct kimage *kexec_image; struct kimage *kexec_crash_image; +int kexec_load_disabled; static DEFINE_MUTEX(kexec_mutex); @@ -942,7 +943,7 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, int result; /* We only trust the superuser with rebooting the system. */ - if (!capable(CAP_SYS_BOOT)) + if (!capable(CAP_SYS_BOOT) || kexec_load_disabled) return -EPERM; /* diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 693eac39c20..096db7452cb 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -62,6 +62,7 @@ #include #include #include +#include #include #include @@ -614,6 +615,18 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif +#ifdef CONFIG_KEXEC + { + .procname = "kexec_load_disabled", + .data = &kexec_load_disabled, + .maxlen = sizeof(int), + .mode = 0644, + /* only handle a transition from default "0" to "1" */ + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, + .extra2 = &one, + }, +#endif #ifdef CONFIG_MODULES { .procname = "modprobe", -- cgit v1.2.3-70-g09d2 From 77019967f06b5f30c8b619eac0dfdbc68465fa87 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 23 Jan 2014 15:56:00 -0800 Subject: kdump: fix exported size of vmcoreinfo note Right now we seem to be exporting the max data size contained inside vmcoreinfo note. But this does not include the size of meta data around vmcore info data. Like name of the note and starting and ending elf_note. I think user space expects total size and that size is put in PT_NOTE elf header. Things seem to be fine so far because we are not using vmcoreinfo note to the maximum capacity. But as it starts filling up, to capacity, at some point of time, problem will be visible. I don't think user space will be broken with this change. So there is no need to introduce vmcoreinfo2. This change is safe and backward compatible. More explanation on why this change is safe is below. vmcoreinfo contains information about kernel which user space needs to know to do things like filtering. For example, various kernel config options or information about size or offset of some data structures etc. All this information is commmunicated to user space with an ELF note present in ELF /proc/vmcore file. Currently vmcoreinfo data size is 4096. With some elf note meta data around it, actual size is 4132 bytes. But we are using barely 25% of that size. Rest is empty. So even if we tell user space that size of ELf note is 4096 and not 4132, nothing will be broken becase after around 1000 bytes, everything is zero anyway. But once we start filling up the note to the capacity, and not report the full size of note, bad things will start happening. Either some data will be lost or tools will be confused that they did not fine the zero note at the end. So I think this change is safe and should not break existing tools. Signed-off-by: Vivek Goyal Cc: Ken'ichi Ohmichi Cc: Dan Aloni Cc: Greg KH Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/ksysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 9659d38e008..d945a949760 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -126,7 +126,7 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj, { return sprintf(buf, "%lx %x\n", paddr_vmcoreinfo_note(), - (unsigned int)vmcoreinfo_max_size); + (unsigned int)sizeof(vmcoreinfo_note)); } KERNEL_ATTR_RO(vmcoreinfo); -- cgit v1.2.3-70-g09d2