From 54a43d54988a3731d644fdeb7a1d6f46b4ac64c7 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Thu, 23 Jan 2014 15:53:13 -0800
Subject: numa: add a sysctl for numa_balancing

Add a working sysctl to enable/disable automatic numa memory balancing
at runtime.

This allows us to track down performance problems with this feature and
is generally a good idea.

This was possible earlier through debugfs, but only with special
debugging options set.  Also fix the boot message.

[akpm@linux-foundation.org: s/sched_numa_balancing/sysctl_numa_balancing/]
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sched/core.c | 24 +++++++++++++++++++++++-
 kernel/sysctl.c     |  9 +++++++++
 2 files changed, 32 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4d6964e4971..7fea865a810 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1770,7 +1770,29 @@ void set_numabalancing_state(bool enabled)
 	numabalancing_enabled = enabled;
 }
 #endif /* CONFIG_SCHED_DEBUG */
-#endif /* CONFIG_NUMA_BALANCING */
+
+#ifdef CONFIG_PROC_SYSCTL
+int sysctl_numa_balancing(struct ctl_table *table, int write,
+			 void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct ctl_table t;
+	int err;
+	int state = numabalancing_enabled;
+
+	if (write && !capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	t = *table;
+	t.data = &state;
+	err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
+	if (err < 0)
+		return err;
+	if (write)
+		set_numabalancing_state(state);
+	return err;
+}
+#endif
+#endif
 
 /*
  * fork()/clone()-time setup:
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 332cefcdb04..693eac39c20 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -389,6 +389,15 @@ static struct ctl_table kern_table[] = {
 		.mode           = 0644,
 		.proc_handler   = proc_dointvec,
 	},
+	{
+		.procname	= "numa_balancing",
+		.data		= NULL, /* filled in by handler */
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= sysctl_numa_balancing,
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
 #endif /* CONFIG_NUMA_BALANCING */
 #endif /* CONFIG_SCHED_DEBUG */
 	{
-- 
cgit v1.2.3-70-g09d2


From 1d3fa370346d9d96ab0efb84e3312aed3aeb35ea Mon Sep 17 00:00:00 2001
From: Arun KS <arunks.linux@gmail.com>
Date: Thu, 23 Jan 2014 15:54:19 -0800
Subject: printk: flush conflicting continuation line

An earlier newline was missing and current print is from different task.
In this scenario flush the continuation line and store this line
seperatly.

This patch fix the below scenario of timestamp interleaving,
   [   28.154370 ] read_word_reg : reg[0x 3], reg[0x 4]  data [0x 642]
   [   28.155428 ] uart disconnect
   [   31.947341 ] dvfs[cpufreq.c<275>]:plug-in cpu<1> done
   [   28.155445 ] UART detached : send switch state 201
   [   32.014112 ] read_reg : reg[0x 3] data[0x21]

[akpm@linux-foundation.org: simplify and condense the code]
Signed-off-by: Arun KS <getarunks@gmail.com>
Signed-off-by: Arun KS <arun.ks@broadcom.com>
Cc: Joe Perches <joe@perches.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Paul Gortmaker <paul.gortmaker@windriver.com>
Cc: Kay Sievers <kay@vrfy.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/printk/printk.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index f8b41bddc6d..b1d255f0413 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1595,10 +1595,13 @@ asmlinkage int vprintk_emit(int facility, int level,
 		 * either merge it with the current buffer and flush, or if
 		 * there was a race with interrupts (prefix == true) then just
 		 * flush it out and store this line separately.
+		 * If the preceding printk was from a different task and missed
+		 * a newline, flush and append the newline.
 		 */
-		if (cont.len && cont.owner == current) {
-			if (!(lflags & LOG_PREFIX))
-				stored = cont_add(facility, level, text, text_len);
+		if (cont.len) {
+			if (cont.owner == current && !(lflags & LOG_PREFIX))
+				stored = cont_add(facility, level, text,
+						  text_len);
 			cont_flush(LOG_NEWLINE);
 		}
 
-- 
cgit v1.2.3-70-g09d2


From ff252c1fc537b0c9e40f62da0a9d11bf0737b7db Mon Sep 17 00:00:00 2001
From: DaeSeok Youn <daeseok.youn@gmail.com>
Date: Thu, 23 Jan 2014 15:55:46 -0800
Subject: kernel/fork.c: make dup_mm() static

dup_mm() is used only in kernel/fork.c

Signed-off-by: Daeseok Youn <daeseok.youn@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sched.h | 2 --
 kernel/fork.c         | 2 +-
 2 files changed, 1 insertion(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 33e4e9e1f62..66a17ad55bc 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2295,8 +2295,6 @@ extern struct mm_struct *get_task_mm(struct task_struct *task);
 extern struct mm_struct *mm_access(struct task_struct *task, unsigned int mode);
 /* Remove the current tasks stale references to the old mm_struct */
 extern void mm_release(struct task_struct *, struct mm_struct *);
-/* Allocate a new mm structure and copy contents from tsk->mm */
-extern struct mm_struct *dup_mm(struct task_struct *tsk);
 
 extern int copy_thread(unsigned long, unsigned long, unsigned long,
 			struct task_struct *);
diff --git a/kernel/fork.c b/kernel/fork.c
index 2f11bbe376b..5615ead014e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -800,7 +800,7 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
  * Allocate a new mm structure and copy contents from the
  * mm structure of the passed in task structure.
  */
-struct mm_struct *dup_mm(struct task_struct *tsk)
+static struct mm_struct *dup_mm(struct task_struct *tsk)
 {
 	struct mm_struct *mm, *oldmm = current->mm;
 	int err;
-- 
cgit v1.2.3-70-g09d2


From 5d59e18270d4769c9160c282b25c00b6fc004ffb Mon Sep 17 00:00:00 2001
From: Daeseok Youn <daeseok.youn@gmail.com>
Date: Thu, 23 Jan 2014 15:55:47 -0800
Subject: kernel/fork.c: fix coding style issues

Fix errors reported by checkpatch.pl.  One error is parentheses, the other
is a whitespace issue.

Signed-off-by: Daeseok Youn <daeseok.youn@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/fork.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 5615ead014e..01ccc610991 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1654,7 +1654,7 @@ SYSCALL_DEFINE0(fork)
 	return do_fork(SIGCHLD, 0, 0, NULL, NULL);
 #else
 	/* can not support in nommu mode */
-	return(-EINVAL);
+	return -EINVAL;
 #endif
 }
 #endif
@@ -1662,7 +1662,7 @@ SYSCALL_DEFINE0(fork)
 #ifdef __ARCH_WANT_SYS_VFORK
 SYSCALL_DEFINE0(vfork)
 {
-	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0, 
+	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0,
 			0, NULL, NULL);
 }
 #endif
-- 
cgit v1.2.3-70-g09d2


From 68ce670b6e8edc30551862e7f6a306e45389e189 Mon Sep 17 00:00:00 2001
From: Daeseok Youn <daeseok.youn@gmail.com>
Date: Thu, 23 Jan 2014 15:55:48 -0800
Subject: kernel/fork.c: remove redundant NULL check in dup_mm()

current->mm doesn't need a NULL check in dup_mm().  Becasue dup_mm() is
used only in copy_mm() and current->mm is checked whether it is NULL or
not in copy_mm() before calling dup_mm().

Signed-off-by: Daeseok Youn <daeseok.youn@gmail.com>
Acked-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/fork.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 01ccc610991..b6dd0bbf424 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -805,9 +805,6 @@ static struct mm_struct *dup_mm(struct task_struct *tsk)
 	struct mm_struct *mm, *oldmm = current->mm;
 	int err;
 
-	if (!oldmm)
-		return NULL;
-
 	mm = allocate_mm();
 	if (!mm)
 		goto fail_nomem;
-- 
cgit v1.2.3-70-g09d2


From 98611e4e6a2b4a03fd2d4750cce8e4455a995c8d Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 23 Jan 2014 15:55:52 -0800
Subject: exec: kill task_struct->did_exec

We can kill either task->did_exec or PF_FORKNOEXEC, they are mutually
exclusive.  The patch kills ->did_exec because it has a single user.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Kees Cook <keescook@chromium.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/exec.c             | 1 -
 include/linux/sched.h | 1 -
 kernel/fork.c         | 1 -
 kernel/sys.c          | 5 ++---
 4 files changed, 2 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/fs/exec.c b/fs/exec.c
index f860866e04b..493b102a27c 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1424,7 +1424,6 @@ static int exec_binprm(struct linux_binprm *bprm)
 		audit_bprm(bprm);
 		trace_sched_process_exec(current, old_pid, bprm);
 		ptrace_event(PTRACE_EVENT_EXEC, old_vpid);
-		current->did_exec = 1;
 		proc_exec_connector(current);
 	}
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 66a17ad55bc..68a0e84463a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1239,7 +1239,6 @@ struct task_struct {
 	/* Used for emulating ABI behavior of previous Linux versions */
 	unsigned int personality;
 
-	unsigned did_exec:1;
 	unsigned in_execve:1;	/* Tell the LSMs that the process is doing an
 				 * execve */
 	unsigned in_iowait:1;
diff --git a/kernel/fork.c b/kernel/fork.c
index b6dd0bbf424..a17621c6cd4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1226,7 +1226,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	if (!try_module_get(task_thread_info(p)->exec_domain->module))
 		goto bad_fork_cleanup_count;
 
-	p->did_exec = 0;
 	delayacct_tsk_init(p);	/* Must remain after dup_task_struct() */
 	copy_flags(clone_flags, p);
 	INIT_LIST_HEAD(&p->children);
diff --git a/kernel/sys.c b/kernel/sys.c
index c72311324ea..ecd3ea12f72 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -895,8 +895,7 @@ SYSCALL_DEFINE1(times, struct tms __user *, tbuf)
  * only important on a multi-user system anyway, to make sure one user
  * can't send a signal to a process owned by another.  -TYT, 12/12/91
  *
- * Auch. Had to add the 'did_exec' flag to conform completely to POSIX.
- * LBT 04.03.94
+ * !PF_FORKNOEXEC check to conform completely to POSIX.
  */
 SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
 {
@@ -932,7 +931,7 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
 		if (task_session(p) != task_session(group_leader))
 			goto out;
 		err = -EACCES;
-		if (p->did_exec)
+		if (!(p->flags & PF_FORKNOEXEC))
 			goto out;
 	} else {
 		err = -ESRCH;
-- 
cgit v1.2.3-70-g09d2


From 2e1f38358246b8f8e5871026b21d374e9bb1a163 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 23 Jan 2014 15:55:55 -0800
Subject: kernel/sys.c: k_getrusage() can use while_each_thread()

Change k_getrusage() to use while_each_thread(), no changes in the
compiled code.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Kees Cook <keescook@chromium.org>
Reviewed-by: Sameer Nanda <snanda@chromium.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sys.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index ecd3ea12f72..c0a58be780a 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1571,8 +1571,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
 			t = p;
 			do {
 				accumulate_thread_rusage(t, r);
-				t = next_thread(t);
-			} while (t != p);
+			} while_each_thread(p, t);
 			break;
 
 		default:
-- 
cgit v1.2.3-70-g09d2


From 8d38f203b46c36626285400b9466b08abecaaa80 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 23 Jan 2014 15:55:56 -0800
Subject: kernel/signal.c: change do_signal_stop/do_sigaction to use
 while_each_thread()

Change do_signal_stop() and do_sigaction() to avoid next_thread() and use
while_each_thread() instead.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Kees Cook <keescook@chromium.org>
Reviewed-by: Sameer Nanda <snanda@chromium.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 940b30ee9a3..52f881db1ca 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2047,8 +2047,8 @@ static bool do_signal_stop(int signr)
 		if (task_set_jobctl_pending(current, signr | gstop))
 			sig->group_stop_count++;
 
-		for (t = next_thread(current); t != current;
-		     t = next_thread(t)) {
+		t = current;
+		while_each_thread(current, t) {
 			/*
 			 * Setting state to TASK_STOPPED for a group
 			 * stop is always done with the siglock held,
@@ -3125,8 +3125,7 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
 			rm_from_queue_full(&mask, &t->signal->shared_pending);
 			do {
 				rm_from_queue_full(&mask, &t->pending);
-				t = next_thread(t);
-			} while (t != current);
+			} while_each_thread(current, t);
 		}
 	}
 
-- 
cgit v1.2.3-70-g09d2


From 7984754b99b6c89054edc405e9d9d35810a91d36 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Thu, 23 Jan 2014 15:55:59 -0800
Subject: kexec: add sysctl to disable kexec_load

For general-purpose (i.e.  distro) kernel builds it makes sense to build
with CONFIG_KEXEC to allow end users to choose what kind of things they
want to do with kexec.  However, in the face of trying to lock down a
system with such a kernel, there needs to be a way to disable kexec_load
(much like module loading can be disabled).  Without this, it is too easy
for the root user to modify kernel memory even when CONFIG_STRICT_DEVMEM
and modules_disabled are set.  With this change, it is still possible to
load an image for use later, then disable kexec_load so the image (or lack
of image) can't be altered.

The intention is for using this in environments where "perfect"
enforcement is hard.  Without a verified boot, along with verified
modules, and along with verified kexec, this is trying to give a system a
better chance to defend itself (or at least grow the window of
discoverability) against attack in the face of a privilege escalation.

In my mind, I consider several boot scenarios:

1) Verified boot of read-only verified root fs loading fd-based
   verification of kexec images.
2) Secure boot of writable root fs loading signed kexec images.
3) Regular boot loading kexec (e.g. kcrash) image early and locking it.
4) Regular boot with no control of kexec image at all.

1 and 2 don't exist yet, but will soon once the verified kexec series has
landed.  4 is the state of things now.  The gap between 2 and 4 is too
large, so this change creates scenario 3, a middle-ground above 4 when 2
and 1 are not possible for a system.

Signed-off-by: Kees Cook <keescook@chromium.org>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Eric Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/sysctl/kernel.txt | 15 ++++++++++++++-
 include/linux/kexec.h           |  1 +
 kernel/kexec.c                  |  3 ++-
 kernel/sysctl.c                 | 13 +++++++++++++
 4 files changed, 30 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 6d486404200..ee9a2f983b9 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -33,6 +33,7 @@ show up in /proc/sys/kernel:
 - domainname
 - hostname
 - hotplug
+- kexec_load_disabled
 - kptr_restrict
 - kstack_depth_to_print       [ X86 only ]
 - l2cr                        [ PPC only ]
@@ -287,6 +288,18 @@ Default value is "/sbin/hotplug".
 
 ==============================================================
 
+kexec_load_disabled:
+
+A toggle indicating if the kexec_load syscall has been disabled. This
+value defaults to 0 (false: kexec_load enabled), but can be set to 1
+(true: kexec_load disabled). Once true, kexec can no longer be used, and
+the toggle cannot be set back to false. This allows a kexec image to be
+loaded before disabling the syscall, allowing a system to set up (and
+later use) an image without it being altered. Generally used together
+with the "modules_disabled" sysctl.
+
+==============================================================
+
 kptr_restrict:
 
 This toggle indicates whether restrictions are placed on
@@ -331,7 +344,7 @@ A toggle value indicating if modules are allowed to be loaded
 in an otherwise modular kernel.  This toggle defaults to off
 (0), but can be set true (1).  Once true, modules can be
 neither loaded nor unloaded, and the toggle cannot be set back
-to false.
+to false.  Generally used with the "kexec_load_disabled" toggle.
 
 ==============================================================
 
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 5fd33dc1fe3..6d4066cdb5b 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -170,6 +170,7 @@ unsigned long paddr_vmcoreinfo_note(void);
 
 extern struct kimage *kexec_image;
 extern struct kimage *kexec_crash_image;
+extern int kexec_load_disabled;
 
 #ifndef kexec_flush_icache_page
 #define kexec_flush_icache_page(page)
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 9c970167e40..ac738781d35 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -932,6 +932,7 @@ static int kimage_load_segment(struct kimage *image,
  */
 struct kimage *kexec_image;
 struct kimage *kexec_crash_image;
+int kexec_load_disabled;
 
 static DEFINE_MUTEX(kexec_mutex);
 
@@ -942,7 +943,7 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
 	int result;
 
 	/* We only trust the superuser with rebooting the system. */
-	if (!capable(CAP_SYS_BOOT))
+	if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
 		return -EPERM;
 
 	/*
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 693eac39c20..096db7452cb 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -62,6 +62,7 @@
 #include <linux/capability.h>
 #include <linux/binfmts.h>
 #include <linux/sched/sysctl.h>
+#include <linux/kexec.h>
 
 #include <asm/uaccess.h>
 #include <asm/processor.h>
@@ -614,6 +615,18 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= proc_dointvec,
 	},
 #endif
+#ifdef CONFIG_KEXEC
+	{
+		.procname	= "kexec_load_disabled",
+		.data		= &kexec_load_disabled,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		/* only handle a transition from default "0" to "1" */
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &one,
+		.extra2		= &one,
+	},
+#endif
 #ifdef CONFIG_MODULES
 	{
 		.procname	= "modprobe",
-- 
cgit v1.2.3-70-g09d2


From 77019967f06b5f30c8b619eac0dfdbc68465fa87 Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@redhat.com>
Date: Thu, 23 Jan 2014 15:56:00 -0800
Subject: kdump: fix exported size of vmcoreinfo note

Right now we seem to be exporting the max data size contained inside
vmcoreinfo note.  But this does not include the size of meta data around
vmcore info data.  Like name of the note and starting and ending elf_note.

I think user space expects total size and that size is put in PT_NOTE elf
header.  Things seem to be fine so far because we are not using vmcoreinfo
note to the maximum capacity.  But as it starts filling up, to capacity,
at some point of time, problem will be visible.

I don't think user space will be broken with this change.  So there is no
need to introduce vmcoreinfo2.  This change is safe and backward
compatible.  More explanation on why this change is safe is below.

vmcoreinfo contains information about kernel which user space needs to
know to do things like filtering.  For example, various kernel config
options or information about size or offset of some data structures etc.
All this information is commmunicated to user space with an ELF note
present in ELF /proc/vmcore file.

Currently vmcoreinfo data size is 4096.  With some elf note meta data
around it, actual size is 4132 bytes.  But we are using barely 25% of that
size.  Rest is empty.  So even if we tell user space that size of ELf note
is 4096 and not 4132, nothing will be broken becase after around 1000
bytes, everything is zero anyway.

But once we start filling up the note to the capacity, and not report the
full size of note, bad things will start happening.  Either some data will
be lost or tools will be confused that they did not fine the zero note at
the end.

So I think this change is safe and should not break existing tools.

Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Cc: Ken'ichi Ohmichi <oomichi@mxs.nes.nec.co.jp>
Cc: Dan Aloni <da-x@monatomic.org>
Cc: Greg KH <greg@kroah.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/ksysfs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 9659d38e008..d945a949760 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -126,7 +126,7 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj,
 {
 	return sprintf(buf, "%lx %x\n",
 		       paddr_vmcoreinfo_note(),
-		       (unsigned int)vmcoreinfo_max_size);
+		       (unsigned int)sizeof(vmcoreinfo_note));
 }
 KERNEL_ATTR_RO(vmcoreinfo);
 
-- 
cgit v1.2.3-70-g09d2