From 4ecdafc8084fe1d95bb59ed7753b345abcd586fb Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 29 Apr 2013 15:05:01 -0700 Subject: kthread: introduce to_live_kthread() "k->vfork_done != NULL" with a barrier() after to_kthread(k) in task_get_live_kthread(k) looks unclear, and sub-optimal because we load ->vfork_done twice. All we need is to ensure that we do not return to_kthread(NULL). Add a new trivial helper which loads/checks ->vfork_done once, this also looks more understandable. Signed-off-by: Oleg Nesterov Cc: Thomas Gleixner Cc: Namhyung Kim Cc: "Paul E. McKenney" Cc: Peter Zijlstra Cc: Rusty Russell Cc: "Srivatsa S. Bhat" Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kthread.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 9eb7fed0bba..a84dee265f9 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -52,8 +52,21 @@ enum KTHREAD_BITS { KTHREAD_IS_PARKED, }; -#define to_kthread(tsk) \ - container_of((tsk)->vfork_done, struct kthread, exited) +#define __to_kthread(vfork) \ + container_of(vfork, struct kthread, exited) + +static inline struct kthread *to_kthread(struct task_struct *k) +{ + return __to_kthread(k->vfork_done); +} + +static struct kthread *to_live_kthread(struct task_struct *k) +{ + struct completion *vfork = ACCESS_ONCE(k->vfork_done); + if (likely(vfork)) + return __to_kthread(vfork); + return NULL; +} /** * kthread_should_stop - should this kthread return now? @@ -313,15 +326,8 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), static struct kthread *task_get_live_kthread(struct task_struct *k) { - struct kthread *kthread; - get_task_struct(k); - kthread = to_kthread(k); - /* It might have exited */ - barrier(); - if (k->vfork_done != NULL) - return kthread; - return NULL; + return to_live_kthread(k); } static void __kthread_unpark(struct task_struct *k, struct kthread *kthread) -- cgit v1.2.3-70-g09d2 From b5c5442bb6bce0c67701d55124be561043a51faf Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 29 Apr 2013 15:05:12 -0700 Subject: kthread: kill task_get_live_kthread() task_get_live_kthread() looks confusing and unneeded. It does get_task_struct() but only kthread_stop() needs this, it can be called even if the calller doesn't have a reference when we know that this kthread can't exit until we do kthread_stop(). kthread_park() and kthread_unpark() do not need get_task_struct(), the callers already have the reference. And it can not help if we can race with the exiting kthread anyway, kthread_park() can hang forever in this case. Change kthread_park() and kthread_unpark() to use to_live_kthread(), change kthread_stop() to do get_task_struct() by hand and remove task_get_live_kthread(). Signed-off-by: Oleg Nesterov Cc: Thomas Gleixner Cc: Namhyung Kim Cc: "Paul E. McKenney" Cc: Peter Zijlstra Cc: Rusty Russell Cc: "Srivatsa S. 
Bhat" Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kthread.c | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index a84dee265f9..9b12d65186f 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -324,12 +324,6 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), return p; } -static struct kthread *task_get_live_kthread(struct task_struct *k) -{ - get_task_struct(k); - return to_live_kthread(k); -} - static void __kthread_unpark(struct task_struct *k, struct kthread *kthread) { clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); @@ -356,11 +350,10 @@ static void __kthread_unpark(struct task_struct *k, struct kthread *kthread) */ void kthread_unpark(struct task_struct *k) { - struct kthread *kthread = task_get_live_kthread(k); + struct kthread *kthread = to_live_kthread(k); if (kthread) __kthread_unpark(k, kthread); - put_task_struct(k); } /** @@ -377,7 +370,7 @@ void kthread_unpark(struct task_struct *k) */ int kthread_park(struct task_struct *k) { - struct kthread *kthread = task_get_live_kthread(k); + struct kthread *kthread = to_live_kthread(k); int ret = -ENOSYS; if (kthread) { @@ -390,7 +383,6 @@ int kthread_park(struct task_struct *k) } ret = 0; } - put_task_struct(k); return ret; } @@ -411,10 +403,13 @@ int kthread_park(struct task_struct *k) */ int kthread_stop(struct task_struct *k) { - struct kthread *kthread = task_get_live_kthread(k); + struct kthread *kthread; int ret; trace_sched_kthread_stop(k); + + get_task_struct(k); + kthread = to_live_kthread(k); if (kthread) { set_bit(KTHREAD_SHOULD_STOP, &kthread->flags); __kthread_unpark(k, kthread); @@ -422,10 +417,9 @@ int kthread_stop(struct task_struct *k) wait_for_completion(&kthread->exited); } ret = k->exit_code; - put_task_struct(k); - trace_sched_kthread_stop_ret(ret); + trace_sched_kthread_stop_ret(ret); return ret; } EXPORT_SYMBOL(kthread_stop); -- cgit v1.2.3-70-g09d2 From 3f68613f39cdc242fa2e872ac04a802e7cc7b7cb Mon Sep 17 00:00:00 2001 From: Rakib Mullick Date: Mon, 29 Apr 2013 15:05:13 -0700 Subject: kernel/auditsc.c: use kzalloc instead of kmalloc+memset In audit_alloc_context() use kzalloc instead of kmalloc+memset. Also rename audit_zero_context() to audit_set_context(), to represent it's inner workings properly. [akpm@linux-foundation.org: remove audit_set_context() altogether - fold it into its caller] Signed-off-by: Rakib Mullick Cc: Al Viro Cc: Eric Paris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/auditsc.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index a371f857a0a..c68229411a7 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1034,21 +1034,15 @@ static inline void audit_free_aux(struct audit_context *context) } } -static inline void audit_zero_context(struct audit_context *context, - enum audit_state state) -{ - memset(context, 0, sizeof(*context)); - context->state = state; - context->prio = state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0; -} - static inline struct audit_context *audit_alloc_context(enum audit_state state) { struct audit_context *context; - if (!(context = kmalloc(sizeof(*context), GFP_KERNEL))) + context = kzalloc(sizeof(*context), GFP_KERNEL); + if (!context) return NULL; - audit_zero_context(context, state); + context->state = state; + context->prio = state == AUDIT_RECORD_CONTEXT ? 
~0ULL : 0; INIT_LIST_HEAD(&context->killed_trees); INIT_LIST_HEAD(&context->names_list); return context; -- cgit v1.2.3-70-g09d2 From 13f51e1c3fbebeab801f768f433067ff075dea5a Mon Sep 17 00:00:00 2001 From: Gao feng Date: Mon, 29 Apr 2013 15:05:14 -0700 Subject: audit: don't check if kauditd is valid every time We only need to check if kauditd is valid after we start it, if kauditd is invalid, we will set kauditd_task to NULL. So next time, we will start kauditd again. It means if kauditd_task is not NULL,it must be valid. Signed-off-by: Gao feng Cc: Eric Paris Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/audit.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index d596e5355f1..9816a1b96cf 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -660,14 +660,14 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) /* As soon as there's any sign of userspace auditd, * start kauditd to talk to it */ - if (!kauditd_task) + if (!kauditd_task) { kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd"); - if (IS_ERR(kauditd_task)) { - err = PTR_ERR(kauditd_task); - kauditd_task = NULL; - return err; + if (IS_ERR(kauditd_task)) { + err = PTR_ERR(kauditd_task); + kauditd_task = NULL; + return err; + } } - loginuid = audit_get_loginuid(current); sessionid = audit_get_sessionid(current); security_task_getsecid(current, &sid); -- cgit v1.2.3-70-g09d2 From 374c586d95f288296e1e718533401d8ce0eecee3 Mon Sep 17 00:00:00 2001 From: Gao feng Date: Mon, 29 Apr 2013 15:05:15 -0700 Subject: audit: remove duplicate export of audit_enabled audit_enabled has already been exported in include/linux/audit.h. and kernel/audit.h includes include/linux/audit.h, no need to export aduit_enabled again in kernel/audit.h Signed-off-by: Gao feng Cc: Al Viro Cc: Eric Paris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/audit.h | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit.h b/kernel/audit.h index d51cba868e1..d06ffc144f8 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -60,7 +60,6 @@ struct audit_entry { }; #ifdef CONFIG_AUDIT -extern int audit_enabled; extern int audit_ever_enabled; #endif -- cgit v1.2.3-70-g09d2 From dde5b7d6e7be308ce371baa96058c2d40df26c05 Mon Sep 17 00:00:00 2001 From: Gao feng Date: Mon, 29 Apr 2013 15:05:16 -0700 Subject: audit: remove unnecessary #if CONFIG_AUDIT The files which include kernel/audit.h are complied only when CONFIG_AUDIT is set. Just like audit_pid, there is no need to surround audit_ever_enabled with CONFIG_AUDIT. Signed-off-by: Gao feng Cc: Al Viro Cc: Eric Paris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/audit.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.h b/kernel/audit.h index d06ffc144f8..11468d99dad 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -59,9 +59,7 @@ struct audit_entry { struct audit_krule rule; }; -#ifdef CONFIG_AUDIT extern int audit_ever_enabled; -#endif extern int audit_pid; -- cgit v1.2.3-70-g09d2 From 373e0f3408fe671550d69d9a7965d8a49e988525 Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Mon, 29 Apr 2013 15:05:18 -0700 Subject: kernel/auditfilter.c: tree and watch will memory leak when failure occurs In audit_data_to_entry() when a failure occurs we must check and free the tree and watch to avoid a memory leak. 
test: plan: test command: "auditctl -a exit,always -w /etc -F auid=-1" (on fedora17, need modify auditctl to let "-w /etc" has effect) running: under fedora17 x86_64, 2 CPUs 3.20GHz, 2.5GB RAM. let 15 auditctl processes continue running at the same time. monitor command: watch -d -n 1 "cat /proc/meminfo | awk '{print \$2}' \ | head -n 4 | xargs \ | awk '{print \"used \",\$1 - \$2 - \$3 - \$4}'" result: for original version: will use up all memory, within 3 hours. kill all auditctl, the memory still does not free. for new version (apply this patch): after 14 hours later, not find issues. Signed-off-by: Chen Gang Cc: Eric Paris Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/auditfilter.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index f9fc54bbe06..267436826c3 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -594,6 +594,10 @@ exit_nofree: return entry; exit_free: + if (entry->rule.watch) + audit_put_watch(entry->rule.watch); /* matches initial get */ + if (entry->rule.tree) + audit_put_tree(entry->rule.tree); /* that's the temporary one */ audit_free_rule(entry); return ERR_PTR(err); } -- cgit v1.2.3-70-g09d2 From 12b2f117f3bf738c1a00a6f64393f1953a740bd4 Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Mon, 29 Apr 2013 15:05:19 -0700 Subject: kernel/audit_tree.c: tree will leak memory when failure occurs in audit_trim_trees() audit_trim_trees() calls get_tree(). If a failure occurs we must call put_tree(). [akpm@linux-foundation.org: run put_tree() before mutex_lock() for small scalability improvement] Signed-off-by: Chen Gang Cc: Al Viro Cc: Eric Paris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/audit_tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 642a89c4f3d..a291aa23fb3 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -617,9 +617,9 @@ void audit_trim_trees(void) } spin_unlock(&hash_lock); trim_marked(tree); - put_tree(tree); drop_collected_mounts(root_mnt); skip_it: + put_tree(tree); mutex_lock(&audit_filter_mutex); } list_del(&cursor); -- cgit v1.2.3-70-g09d2 From e07cee23e64137f7da2fb35d7b7c0ad26cc0edec Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 29 Apr 2013 15:06:58 -0700 Subject: mm,kexec: use common help functions to free reserved pages Use common help functions to free reserved pages. 
Signed-off-by: Jiang Liu Cc: Eric Biederman Reviewed-by: Zhang Yanfei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index ffd4e111fd6..b19181d4420 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1118,12 +1118,8 @@ void __weak crash_free_reserved_phys_range(unsigned long begin, { unsigned long addr; - for (addr = begin; addr < end; addr += PAGE_SIZE) { - ClearPageReserved(pfn_to_page(addr >> PAGE_SHIFT)); - init_page_count(pfn_to_page(addr >> PAGE_SHIFT)); - free_page((unsigned long)__va(addr)); - totalram_pages++; - } + for (addr = begin; addr < end; addr += PAGE_SIZE) + free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT)); } int crash_shrink_memory(unsigned long new_size) -- cgit v1.2.3-70-g09d2 From 6d2488f64a240191f0733c1f32d73607916b01b7 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 29 Apr 2013 15:07:21 -0700 Subject: cgroup: remove css_get_next Now that we have generic and well ordered cgroup tree walkers there is no need to keep css_get_next in the place. Signed-off-by: Michal Hocko Acked-by: KAMEZAWA Hiroyuki Acked-by: Li Zefan Cc: Johannes Weiner Cc: Ying Han Cc: Tejun Heo Cc: Glauber Costa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup.h | 7 ------- kernel/cgroup.c | 49 ------------------------------------------------- 2 files changed, 56 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 900af5964f5..470073bf93d 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -687,13 +687,6 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css); struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id); -/* - * Get a cgroup whose id is greater than or equal to id under tree of root. - * Returning a cgroup_subsys_state or NULL. - */ -struct cgroup_subsys_state *css_get_next(struct cgroup_subsys *ss, int id, - struct cgroup_subsys_state *root, int *foundid); - /* Returns true if root is ancestor of cg */ bool css_is_ancestor(struct cgroup_subsys_state *cg, const struct cgroup_subsys_state *root); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a32f9432666..dfaf50d4705 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -5416,55 +5416,6 @@ struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id) } EXPORT_SYMBOL_GPL(css_lookup); -/** - * css_get_next - lookup next cgroup under specified hierarchy. - * @ss: pointer to subsystem - * @id: current position of iteration. - * @root: pointer to css. search tree under this. - * @foundid: position of found object. - * - * Search next css under the specified hierarchy of rootid. Calling under - * rcu_read_lock() is necessary. Returns NULL if it reaches the end. - */ -struct cgroup_subsys_state * -css_get_next(struct cgroup_subsys *ss, int id, - struct cgroup_subsys_state *root, int *foundid) -{ - struct cgroup_subsys_state *ret = NULL; - struct css_id *tmp; - int tmpid; - int rootid = css_id(root); - int depth = css_depth(root); - - if (!rootid) - return NULL; - - BUG_ON(!ss->use_id); - WARN_ON_ONCE(!rcu_read_lock_held()); - - /* fill start point for scan */ - tmpid = id; - while (1) { - /* - * scan next entry from bitmap(tree), tmpid is updated after - * idr_get_next(). 
- */ - tmp = idr_get_next(&ss->idr, &tmpid); - if (!tmp) - break; - if (tmp->depth >= depth && tmp->stack[depth] == rootid) { - ret = rcu_dereference(tmp->css); - if (ret) { - *foundid = tmpid; - break; - } - } - /* continue to scan from next id */ - tmpid = tmpid + 1; - } - return ret; -} - /* * get corresponding css from file open on cgroupfs directory */ -- cgit v1.2.3-70-g09d2 From 146732ce104ddfed3d4d82722c0b336074016b92 Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Mon, 29 Apr 2013 15:07:22 -0700 Subject: fs: don't compile in drop_caches.c when CONFIG_SYSCTL=n drop_caches.c provides code only invokable via sysctl, so don't compile it in when CONFIG_SYSCTL=n. Signed-off-by: Josh Triplett Acked-by: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/Makefile | 3 ++- include/linux/mm.h | 4 ++++ kernel/sysctl.c | 1 - 3 files changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/fs/Makefile b/fs/Makefile index 9d53192236f..3b2c76759ec 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -10,7 +10,7 @@ obj-y := open.o read_write.o file_table.o super.o \ ioctl.o readdir.o select.o fifo.o dcache.o inode.o \ attr.o bad_inode.o file.o filesystems.o namespace.o \ seq_file.o xattr.o libfs.o fs-writeback.o \ - pnode.o drop_caches.o splice.o sync.o utimes.o \ + pnode.o splice.o sync.o utimes.o \ stack.o fs_struct.o statfs.o ifeq ($(CONFIG_BLOCK),y) @@ -49,6 +49,7 @@ obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o xattr_acl.o obj-$(CONFIG_NFS_COMMON) += nfs_common/ obj-$(CONFIG_GENERIC_ACL) += generic_acl.o obj-$(CONFIG_COREDUMP) += coredump.o +obj-$(CONFIG_SYSCTL) += drop_caches.o obj-$(CONFIG_FHANDLE) += fhandle.o diff --git a/include/linux/mm.h b/include/linux/mm.h index 43b70d5f820..98a44376e4f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1731,8 +1731,12 @@ int in_gate_area_no_mm(unsigned long addr); #define in_gate_area(mm, addr) ({(void)mm; in_gate_area_no_mm(addr);}) #endif /* __HAVE_ARCH_GATE_AREA */ +#ifdef CONFIG_SYSCTL +extern int sysctl_drop_caches; int drop_caches_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); +#endif + unsigned long shrink_slab(struct shrink_control *shrink, unsigned long nr_pages_scanned, unsigned long lru_pages); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index afc1dc60f3f..3dadde52253 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -106,7 +106,6 @@ extern unsigned int core_pipe_limit; #endif extern int pid_max; extern int pid_max_min, pid_max_max; -extern int sysctl_drop_caches; extern int percpu_pagelist_fraction; extern int compat_log; extern int latencytop_enabled; -- cgit v1.2.3-70-g09d2 From f1c4069e1dc128dc8a851174cba2e273652e9216 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Mon, 29 Apr 2013 15:07:37 -0700 Subject: mm, vmalloc: export vmap_area_list, instead of vmlist Although our intention is to unexport internal structure entirely, but there is one exception for kexec. kexec dumps address of vmlist and makedumpfile uses this information. We are about to remove vmlist, then another way to retrieve information of vmalloc layer is needed for makedumpfile. For this purpose, we export vmap_area_list, instead of vmlist. Signed-off-by: Joonsoo Kim Signed-off-by: Joonsoo Kim Cc: Eric Biederman Cc: Dave Anderson Cc: Vivek Goyal Cc: Atsushi Kumagai Cc: Thomas Gleixner Cc: "H. 
Peter Anvin" Cc: Chris Metcalf Cc: Guan Xuetao Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 3 +-- kernel/kexec.c | 2 +- mm/nommu.c | 3 +-- mm/vmalloc.c | 11 ++++++----- 4 files changed, 9 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 698b1e50d3a..8a25f9081ed 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -130,8 +130,7 @@ extern long vwrite(char *buf, char *addr, unsigned long count); /* * Internals. Dont't use.. */ -extern rwlock_t vmlist_lock; -extern struct vm_struct *vmlist; +extern struct list_head vmap_area_list; extern __init void vm_area_add_early(struct vm_struct *vm); extern __init void vm_area_register_early(struct vm_struct *vm, size_t align); diff --git a/kernel/kexec.c b/kernel/kexec.c index b19181d4420..0b1f7e780d4 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1577,7 +1577,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_SYMBOL(swapper_pg_dir); #endif VMCOREINFO_SYMBOL(_stext); - VMCOREINFO_SYMBOL(vmlist); + VMCOREINFO_SYMBOL(vmap_area_list); #ifndef CONFIG_NEED_MULTIPLE_NODES VMCOREINFO_SYMBOL(mem_map); diff --git a/mm/nommu.c b/mm/nommu.c index e001768b14e..2f1c75ed468 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -228,8 +228,7 @@ int follow_pfn(struct vm_area_struct *vma, unsigned long address, } EXPORT_SYMBOL(follow_pfn); -DEFINE_RWLOCK(vmlist_lock); -struct vm_struct *vmlist; +LIST_HEAD(vmap_area_list); void vfree(const void *addr) { diff --git a/mm/vmalloc.c b/mm/vmalloc.c index bda6cef5b97..7e63984eb58 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -261,7 +261,8 @@ struct vmap_area { }; static DEFINE_SPINLOCK(vmap_area_lock); -static LIST_HEAD(vmap_area_list); +/* Export for kexec only */ +LIST_HEAD(vmap_area_list); static struct rb_root vmap_area_root = RB_ROOT; /* The vmap cache globals are protected by vmap_area_lock */ @@ -272,6 +273,10 @@ static unsigned long cached_align; static unsigned long vmap_area_pcpu_hole; +/*** Old vmalloc interfaces ***/ +static DEFINE_RWLOCK(vmlist_lock); +static struct vm_struct *vmlist; + static struct vmap_area *__find_vmap_area(unsigned long addr) { struct rb_node *n = vmap_area_root.rb_node; @@ -1283,10 +1288,6 @@ int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) } EXPORT_SYMBOL_GPL(map_vm_area); -/*** Old vmalloc interfaces ***/ -DEFINE_RWLOCK(vmlist_lock); -struct vm_struct *vmlist; - static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, unsigned long flags, const void *caller) { -- cgit v1.2.3-70-g09d2 From 13ba3fcbbe31068b1ee7c39a0b58ecbed03c4d72 Mon Sep 17 00:00:00 2001 From: Atsushi Kumagai Date: Mon, 29 Apr 2013 15:07:40 -0700 Subject: kexec, vmalloc: export additional vmalloc layer information Now, vmap_area_list is exported as VMCOREINFO for makedumpfile to get the start address of vmalloc region (vmalloc_start). The address which contains vmalloc_start value is represented as below: vmap_area_list.next - OFFSET(vmap_area.list) + OFFSET(vmap_area.va_start) However, both OFFSET(vmap_area.va_start) and OFFSET(vmap_area.list) aren't exported as VMCOREINFO. So this patch exports them externally with small cleanup. [akpm@linux-foundation.org: vmalloc.h should include list.h for list_head] Signed-off-by: Atsushi Kumagai Cc: Joonsoo Kim Cc: Joonsoo Kim Cc: Thomas Gleixner Cc: "H. 
Peter Anvin" Cc: Atsushi Kumagai Cc: Chris Metcalf Cc: Dave Anderson Cc: Eric Biederman Cc: Guan Xuetao Cc: Ingo Molnar Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 13 +++++++++++++ kernel/kexec.c | 3 ++- mm/vmalloc.c | 11 ----------- 3 files changed, 15 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 8a25f9081ed..7d5773a99f2 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -3,7 +3,9 @@ #include #include +#include #include /* pgprot_t */ +#include struct vm_area_struct; /* vma defining user mapping in mm_types.h */ @@ -35,6 +37,17 @@ struct vm_struct { const void *caller; }; +struct vmap_area { + unsigned long va_start; + unsigned long va_end; + unsigned long flags; + struct rb_node rb_node; /* address sorted rbtree */ + struct list_head list; /* address sorted list */ + struct list_head purge_list; /* "lazy purge" list */ + struct vm_struct *vm; + struct rcu_head rcu_head; +}; + /* * Highlevel APIs for driver use */ diff --git a/kernel/kexec.c b/kernel/kexec.c index 0b1f7e780d4..b574920cbd4 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1615,7 +1615,8 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_OFFSET(free_area, free_list); VMCOREINFO_OFFSET(list_head, next); VMCOREINFO_OFFSET(list_head, prev); - VMCOREINFO_OFFSET(vm_struct, addr); + VMCOREINFO_OFFSET(vmap_area, va_start); + VMCOREINFO_OFFSET(vmap_area, list); VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); log_buf_kexec_setup(); VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 151da8ac53f..72043d6c88c 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -249,17 +249,6 @@ EXPORT_SYMBOL(vmalloc_to_pfn); #define VM_LAZY_FREEING 0x02 #define VM_VM_AREA 0x04 -struct vmap_area { - unsigned long va_start; - unsigned long va_end; - unsigned long flags; - struct rb_node rb_node; /* address sorted rbtree */ - struct list_head list; /* address sorted list */ - struct list_head purge_list; /* "lazy purge" list */ - struct vm_struct *vm; - struct rcu_head rcu_head; -}; - static DEFINE_SPINLOCK(vmap_area_lock); /* Export for kexec only */ LIST_HEAD(vmap_area_list); -- cgit v1.2.3-70-g09d2 From d8f10cb3d375c34ad668f32ca6e4661ad1fc23b2 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 29 Apr 2013 15:08:08 -0700 Subject: kernel/cpuset.c: use register_hotmemory_notifier() Use the new interface, remove one ifdef. No code size changes. We could/should have been using __meminit/__meminitdata here but there's now no point in doing that because all this code is elided at compile time. Cc: Li Zefan Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 4f9dfe43ecb..334d983a36b 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2251,7 +2251,6 @@ void cpuset_update_active_cpus(bool cpu_online) schedule_work(&cpuset_hotplug_work); } -#ifdef CONFIG_MEMORY_HOTPLUG /* * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY]. * Call this routine anytime after node_states[N_MEMORY] changes. 
@@ -2263,20 +2262,23 @@ static int cpuset_track_online_nodes(struct notifier_block *self, schedule_work(&cpuset_hotplug_work); return NOTIFY_OK; } -#endif + +static struct notifier_block cpuset_track_online_nodes_nb = { + .notifier_call = cpuset_track_online_nodes, + .priority = 10, /* ??! */ +}; /** * cpuset_init_smp - initialize cpus_allowed * * Description: Finish top cpuset after cpu, node maps are initialized - **/ - + */ void __init cpuset_init_smp(void) { cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); top_cpuset.mems_allowed = node_states[N_MEMORY]; - hotplug_memory_notifier(cpuset_track_online_nodes, 10); + register_hotmemory_notifier(&cpuset_track_online_nodes_nb); cpuset_propagate_hotplug_wq = alloc_ordered_workqueue("cpuset_hotplug", 0); -- cgit v1.2.3-70-g09d2 From c9b1d0981fcce3d9976d7b7a56e4e0503bc610dd Mon Sep 17 00:00:00 2001 From: Andrew Shewmaker Date: Mon, 29 Apr 2013 15:08:10 -0700 Subject: mm: limit growth of 3% hardcoded other user reserve Add user_reserve_kbytes knob. Limit the growth of the memory reserved for other user processes to min(3% current process size, user_reserve_pages). Only about 8MB is necessary to enable recovery in the default mode, and only a few hundred MB are required even when overcommit is disabled. user_reserve_pages defaults to min(3% free pages, 128MB) I arrived at 128MB by taking the max VSZ of sshd, login, bash, and top ... then adding the RSS of each. This only affects OVERCOMMIT_NEVER mode. Background 1. user reserve __vm_enough_memory reserves a hardcoded 3% of the current process size for other applications when overcommit is disabled. This was done so that a user could recover if they launched a memory hogging process. Without the reserve, a user would easily run into a message such as: bash: fork: Cannot allocate memory 2. admin reserve Additionally, a hardcoded 3% of free memory is reserved for root in both overcommit 'guess' and 'never' modes. This was intended to prevent a scenario where root-cant-log-in and perform recovery operations. Note that this reserve shrinks, and doesn't guarantee a useful reserve. Motivation The two hardcoded memory reserves should be updated to account for current memory sizes. Also, the admin reserve would be more useful if it didn't shrink too much. When the current code was originally written, 1GB was considered "enterprise". Now the 3% reserve can grow to multiple GB on large memory systems, and it only needs to be a few hundred MB at most to enable a user or admin to recover a system with an unwanted memory hogging process. I've found that reducing these reserves is especially beneficial for a specific type of application load: * single application system * one or few processes (e.g. one per core) * allocating all available memory * not initializing every page immediately * long running I've run scientific clusters with this sort of load. A long running job sometimes failed many hours (weeks of CPU time) into a calculation. They weren't initializing all of their memory immediately, and they weren't using calloc, so I put systems into overcommit 'never' mode. These clusters run diskless and have no swap. However, with the current reserves, a user wishing to allocate as much memory as possible to one process may be prevented from using, for example, almost 2GB out of 32GB. The effect is less, but still significant when a user starts a job with one process per core. 
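For a sense of scale: 3% of a 32GB process is roughly 1GB, while the new knob caps the reserve at min(3% of the process size, 128MB) by default. A simplified, hypothetical helper showing the effect on the OVERCOMMIT_NEVER accounting (the names mirror the mm/mmap.c hunk further down; this exact function is not part of the patch):

	/* Sketch only: pages withheld from a process under OVERCOMMIT_NEVER. */
	static unsigned long user_reserve_pages(struct mm_struct *mm)
	{
		/* sysctl_user_reserve_kbytes is in KB; convert to pages */
		unsigned long reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);

		return min(mm->total_vm / 32, reserve);	/* was: mm->total_vm / 32 */
	}

So small processes keep the old 3% behavior, and only very large processes see their reserve capped.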
I have repeatedly seen a set of processes requesting the same amount of
memory fail because one of them could not allocate the amount of memory
a user would expect to be able to allocate. For example, Message Passing
Interface (MPI) processes, one per core. And it is similar for other
parallel programming frameworks.

Changing this reserve code will make the overcommit never mode more useful
by allowing applications to allocate nearly all of the available memory.

Also, the new admin_reserve_kbytes will be safer than the current behavior
since the hardcoded 3% of available memory reserve can shrink to something
useless in the case where applications have grabbed all available memory.

Risks

* "bash: fork: Cannot allocate memory"

  The downside of the first patch-- which creates a tunable user reserve
  that is only used in overcommit 'never' mode--is that an admin can set
  it so low that a user may not be able to kill their process, even if
  they already have a shell prompt. Of course, a user can get in the same
  predicament with the current 3% reserve--they just have to launch
  processes until 3% becomes negligible.

* root-cant-log-in problem

  The second patch, adding the tunable rootuser_reserve_pages, allows the
  admin to shoot themselves in the foot by setting it too small. They can
  easily get the system into a state where root-can't-log-in.

  However, the new admin_reserve_kbytes will be safer than the current
  behavior since the hardcoded 3% of available memory reserve can shrink
  to something useless in the case where applications have grabbed all
  available memory.

Alternatives

* Memory cgroups provide a more flexible way to limit application memory.
  Not everyone wants to set up cgroups or deal with their overhead.

* We could create a fourth overcommit mode which provides smaller reserves.
  The size of useful reserves may be drastically different depending on
  whether the system is embedded or enterprise.

* Force users to initialize all of their memory or use calloc. Some users
  don't want/expect the system to overcommit when they malloc. Overcommit
  'never' mode is for this scenario, and it should work well.

The new user and admin reserve tunables are simple to use, with low
overhead compared to cgroups. The patches preserve current behavior where
3% of memory is less than 128MB, except that the admin reserve doesn't
shrink to an unusable size under pressure. The code allows admins to tune
for embedded and enterprise usage.

FAQ

* How is the root-cant-login problem addressed?
  What happens if admin_reserve_pages is set to 0?

  Root is free to shoot themselves in the foot by setting
  admin_reserve_kbytes too low.

  On x86_64, the minimum useful reserve is:
    8MB for overcommit 'guess'
    128MB for overcommit 'never'

  admin_reserve_pages defaults to min(3% free memory, 8MB)

  So, anyone switching to 'never' mode needs to adjust admin_reserve_pages.

* How do you calculate a minimum useful reserve?

  A user or the admin needs enough memory to login and perform recovery
  operations, which includes, at a minimum:

  sshd or login + bash (or some other shell) + top (or ps, kill, etc.)

  For overcommit 'guess', we can sum resident set sizes (RSS) because we
  only need enough memory to handle what the recovery programs will
  typically use. On x86_64 this is about 8MB.

  For overcommit 'never', we can take the max of their virtual sizes (VSZ)
  and add the sum of their RSS. We use VSZ instead of RSS because this
  mode forces us to ensure we can fulfill all of the requested memory
  allocations-- even if the programs only use a fraction of what they ask
  for. On x86_64 this is about 128MB.

  When swap is enabled, reserves are useful even when they are as small
  as 10MB, regardless of overcommit mode.

  When both swap and overcommit are disabled, then the admin should tune
  the reserves higher to be absolutely safe. Over 230MB each was safest
  in my testing.

* What happens if user_reserve_pages is set to 0?

  Note: this only affects overcommit 'never' mode.

  Then a user will be able to allocate all available memory minus
  admin_reserve_kbytes.

  However, they will easily see a message such as:

  "bash: fork: Cannot allocate memory"

  And they won't be able to recover/kill their application. The admin
  should be able to recover the system if admin_reserve_kbytes is set
  appropriately.

* What's the difference between overcommit 'guess' and 'never'?

  "Guess" allows an allocation if there are enough free + reclaimable
  pages. It has a hardcoded 3% of free pages reserved for root.

  "Never" allows an allocation if there is enough swap + a configurable
  percentage (default is 50) of physical RAM. It has a hardcoded 3% of
  free pages reserved for root, like "Guess" mode. It also has a hardcoded
  3% of the current process size reserved for additional applications.

* Why is overcommit 'guess' not suitable even when an app eventually
  writes to every page? It takes free pages, file pages, available swap
  pages, reclaimable slab pages into consideration. In other words, these
  are all available pages, so why isn't overcommit suitable?

  Because it only looks at the present state of the system. It does not
  take into account the memory that other applications have malloced, but
  haven't initialized yet. It overcommits the system.

Test Summary

There was little change in behavior in the default overcommit 'guess'
mode with swap enabled before and after the patch. This was expected.

Systems run most predictably (i.e. no oom kills) in overcommit 'never'
mode with swap enabled. This also allowed the most memory to be allocated
to a user application.

Overcommit 'guess' mode without swap is a bad idea. It is easy to crash
the system. None of the other tested combinations crashed. This matches
my experience on the Roadrunner supercomputer.

Without the tunable user reserve, a system in overcommit 'never' mode and
without swap does not allow the user to recover, although the admin can.

With the new tunable reserves, a system in overcommit 'never' mode and
without swap can be configured to:

1. maximize user-allocatable memory, running close to the edge of
   recoverability

2. maximize recoverability, sacrificing allocatable memory to ensure
   that a user cannot take down a system

Test Description

Fedora 18 VM - 4 x86_64 cores, 5725MB RAM, 4GB Swap

System is booted into multiuser console mode, with unnecessary services
turned off.

Caches were dropped before each test.
Hogs are user memtester processes that attempt to allocate all free memory
as reported by /proc/meminfo

In overcommit 'never' mode, memory_ratio=100

Test Results

3.9.0-rc1-mm1

Overcommit | Swap | Hogs | MB Got/Wanted | OOMs | User Recovery | Admin Recovery
----------   ----   ----   -------------   ----   -------------   --------------
guess        yes    1      5432/5432       no     yes             yes
guess        yes    4      5444/5444       1      yes             yes
guess        no     1      5302/5449       no     yes             yes
guess        no     4      -               crash  no              no
never        yes    1      5460/5460       1      yes             yes
never        yes    4      5460/5460       1      yes             yes
never        no     1      5218/5432       no     no              yes
never        no     4      5203/5448       no     no              yes

3.9.0-rc1-mm1-tunablereserves

User and Admin Recovery show their respective reserves, if applicable.

Overcommit | Swap | Hogs | MB Got/Wanted | OOMs | User Recovery | Admin Recovery
----------   ----   ----   -------------   ----   -------------   --------------
guess        yes    1      5419/5419       no     -     yes       8MB   yes
guess        yes    4      5436/5436       1      -     yes       8MB   yes
guess        no     1      5440/5440       *      -     yes       8MB   yes
guess        no     4      -               crash  -     no        8MB   no

* process would successfully mlock, then the oom killer would pick it

never        yes    1      5446/5446       no     10MB  yes       20MB  yes
never        yes    4      5456/5456       no     10MB  yes       20MB  yes
never        no     1      5387/5429       no     128MB no        8MB   barely
never        no     1      5323/5428       no     226MB barely    8MB   barely
never        no     1      5323/5428       no     226MB barely    8MB   barely
never        no     1      5359/5448       no     10MB  no        10MB  barely
never        no     1      5323/5428       no     0MB   no        10MB  barely
never        no     1      5332/5428       no     0MB   no        50MB  yes
never        no     1      5293/5429       no     0MB   no        90MB  yes
never        no     1      5001/5427       no     230MB yes       338MB yes
never        no     4*     4998/5424       no     230MB yes       338MB yes

* more memtesters were launched, able to allocate approximately another 100MB

Future Work

- Test larger memory systems.
- Test an embedded image.
- Test other architectures.
- Time malloc microbenchmarks.
- Would it be useful to be able to set overcommit policy for each
  memory cgroup?
- Some lines are slightly above 80 chars.
  Perhaps define a macro to convert between pages and kb?
  Other places in the kernel do this.

[akpm@linux-foundation.org: coding-style fixes]
[akpm@linux-foundation.org: make init_user_reserve() static]
Signed-off-by: Andrew Shewmaker
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 Documentation/sysctl/vm.txt            | 20 ++++++++++++++++++++
 Documentation/vm/overcommit-accounting |  8 +++++++-
 include/linux/mm.h                     |  2 ++
 kernel/sysctl.c                        |  7 +++++++
 mm/mmap.c                              | 35 ++++++++++++++++++++++++++++++-----
 mm/nommu.c                             | 35 ++++++++++++++++++++++++++++++-----
 6 files changed, 96 insertions(+), 11 deletions(-)
(limited to 'kernel')
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 078701fdbd4..f6989573835 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -53,6 +53,7 @@ Currently, these files are in /proc/sys/vm: - percpu_pagelist_fraction - stat_interval - swappiness +- user_reserve_kbytes - vfs_cache_pressure - zone_reclaim_mode @@ -542,6 +543,7 @@ memory until it actually runs out. When this flag is 2, the kernel uses a "never overcommit" policy that attempts to prevent any overcommit of memory. +Note that user_reserve_kbytes affects this policy. This feature can be very useful because there are a lot of programs that malloc() huge amounts of memory "just-in-case" @@ -645,6 +647,24 @@ The default value is 60. ============================================================== +- user_reserve_kbytes + +When overcommit_memory is set to 2, "never overommit" mode, reserve +min(3% of current process size, user_reserve_kbytes) of free memory.
+This is intended to prevent a user from starting a single memory hogging +process, such that they cannot recover (kill the hog). + +user_reserve_kbytes defaults to min(3% of the current process size, 128MB). + +If this is reduced to zero, then the user will be allowed to allocate +all free memory with a single process, minus admin_reserve_kbytes. +Any subsequent attempts to execute a command will result in +"fork: Cannot allocate memory". + +Changing this takes effect whenever an application requests memory. + +============================================================== + vfs_cache_pressure ------------------ diff --git a/Documentation/vm/overcommit-accounting b/Documentation/vm/overcommit-accounting index 706d7ed9d8d..8eaa2fc4b8f 100644 --- a/Documentation/vm/overcommit-accounting +++ b/Documentation/vm/overcommit-accounting @@ -8,7 +8,9 @@ The Linux kernel supports the following overcommit handling modes default. 1 - Always overcommit. Appropriate for some scientific - applications. + applications. Classic example is code using sparse arrays + and just relying on the virtual memory consisting almost + entirely of zero pages. 2 - Don't overcommit. The total address space commit for the system is not permitted to exceed swap + a @@ -18,6 +20,10 @@ The Linux kernel supports the following overcommit handling modes pages but will receive errors on memory allocation as appropriate. + Useful for applications that want to guarantee their + memory allocations will be available in the future + without having to initialize every page. + The overcommit policy is set via the sysctl `vm.overcommit_memory'. The overcommit percentage is set via `vm.overcommit_ratio'. diff --git a/include/linux/mm.h b/include/linux/mm.h index 7aa11a6736e..43cfaabbde4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -44,6 +44,8 @@ extern int sysctl_legacy_va_layout; #include #include +extern unsigned long sysctl_user_reserve_kbytes; + #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) /* to align the pointer to the (next) page boundary */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 3dadde52253..6daabb72bdb 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1429,6 +1429,13 @@ static struct ctl_table vm_table[] = { .extra2 = &one, }, #endif + { + .procname = "user_reserve_kbytes", + .data = &sysctl_user_reserve_kbytes, + .maxlen = sizeof(sysctl_user_reserve_kbytes), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, { } }; diff --git a/mm/mmap.c b/mm/mmap.c index 081e6da8e1a..80a965f3525 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -84,6 +84,7 @@ EXPORT_SYMBOL(vm_get_page_prot); int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */ int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */ int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; +unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ /* * Make sure vm_committed_as in one cacheline and not cacheline shared with * other variables. It can be updated by several CPUs frequently. 
@@ -122,7 +123,7 @@ EXPORT_SYMBOL_GPL(vm_memory_committed); */ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) { - unsigned long free, allowed; + unsigned long free, allowed, reserve; vm_acct_memory(pages); @@ -183,10 +184,13 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) allowed -= allowed / 32; allowed += total_swap_pages; - /* Don't let a single process grow too big: - leave 3% of the size of this process for other processes */ - if (mm) - allowed -= mm->total_vm / 32; + /* + * Don't let a single process grow so big a user can't recover + */ + if (mm) { + reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); + allowed -= min(mm->total_vm / 32, reserve); + } if (percpu_counter_read_positive(&vm_committed_as) < allowed) return 0; @@ -3094,3 +3098,24 @@ void __init mmap_init(void) ret = percpu_counter_init(&vm_committed_as, 0); VM_BUG_ON(ret); } + +/* + * Initialise sysctl_user_reserve_kbytes. + * + * This is intended to prevent a user from starting a single memory hogging + * process, such that they cannot recover (kill the hog) in OVERCOMMIT_NEVER + * mode. + * + * The default value is min(3% of free memory, 128MB) + * 128MB is enough to recover with sshd/login, bash, and top/kill. + */ +static int __meminit init_user_reserve(void) +{ + unsigned long free_kbytes; + + free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); + + sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17); + return 0; +} +module_init(init_user_reserve) diff --git a/mm/nommu.c b/mm/nommu.c index 2f1c75ed468..58e4a0a5125 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -63,6 +63,7 @@ int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ int sysctl_overcommit_ratio = 50; /* default is 50% */ int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; +unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ int heap_stack_gap = 0; atomic_long_t mmap_pages_allocated; @@ -1897,7 +1898,7 @@ EXPORT_SYMBOL(unmap_mapping_range); */ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) { - unsigned long free, allowed; + unsigned long free, allowed, reserve; vm_acct_memory(pages); @@ -1957,10 +1958,13 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) allowed -= allowed / 32; allowed += total_swap_pages; - /* Don't let a single process grow too big: - leave 3% of the size of this process for other processes */ - if (mm) - allowed -= mm->total_vm / 32; + /* + * Don't let a single process grow so big a user can't recover + */ + if (mm) { + reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); + allowed -= min(mm->total_vm / 32, reserve); + } if (percpu_counter_read_positive(&vm_committed_as) < allowed) return 0; @@ -2122,3 +2126,24 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size, up_write(&nommu_region_sem); return 0; } + +/* + * Initialise sysctl_user_reserve_kbytes. + * + * This is intended to prevent a user from starting a single memory hogging + * process, such that they cannot recover (kill the hog) in OVERCOMMIT_NEVER + * mode. + * + * The default value is min(3% of free memory, 128MB) + * 128MB is enough to recover with sshd/login, bash, and top/kill. 
+ */ +static int __meminit init_user_reserve(void) +{ + unsigned long free_kbytes; + + free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); + + sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17); + return 0; +} +module_init(init_user_reserve) -- cgit v1.2.3-70-g09d2 From 4eeab4f5580d11bffedc697684b91b0bca0d5009 Mon Sep 17 00:00:00 2001 From: Andrew Shewmaker Date: Mon, 29 Apr 2013 15:08:11 -0700 Subject: mm: replace hardcoded 3% with admin_reserve_pages knob Add an admin_reserve_kbytes knob to allow admins to change the hardcoded memory reserve to something other than 3%, which may be multiple gigabytes on large memory systems. Only about 8MB is necessary to enable recovery in the default mode, and only a few hundred MB are required even when overcommit is disabled. This affects OVERCOMMIT_GUESS and OVERCOMMIT_NEVER. admin_reserve_kbytes is initialized to min(3% free pages, 8MB) I arrived at 8MB by summing the RSS of sshd or login, bash, and top. Please see first patch in this series for full background, motivation, testing, and full changelog. [akpm@linux-foundation.org: coding-style fixes] [akpm@linux-foundation.org: make init_admin_reserve() static] Signed-off-by: Andrew Shewmaker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/vm.txt | 30 ++++++++++++++++++++++++++++++ include/linux/mm.h | 1 + kernel/sysctl.c | 7 +++++++ mm/mmap.c | 30 ++++++++++++++++++++++++++---- mm/nommu.c | 30 ++++++++++++++++++++++++++---- 5 files changed, 90 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index f6989573835..dcc75a9ed91 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -18,6 +18,7 @@ files can be found in mm/swap.c. Currently, these files are in /proc/sys/vm: +- admin_reserve_kbytes - block_dump - compact_memory - dirty_background_bytes @@ -59,6 +60,35 @@ Currently, these files are in /proc/sys/vm: ============================================================== +admin_reserve_kbytes + +The amount of free memory in the system that should be reserved for users +with the capability cap_sys_admin. + +admin_reserve_kbytes defaults to min(3% of free pages, 8MB) + +That should provide enough for the admin to log in and kill a process, +if necessary, under the default overcommit 'guess' mode. + +Systems running under overcommit 'never' should increase this to account +for the full Virtual Memory Size of programs used to recover. Otherwise, +root may not be able to log in to recover the system. + +How do you calculate a minimum useful reserve? + +sshd or login + bash (or some other shell) + top (or ps, kill, etc.) + +For overcommit 'guess', we can sum resident set sizes (RSS). +On x86_64 this is about 8MB. + +For overcommit 'never', we can take the max of their virtual sizes (VSZ) +and add the sum of their RSS. +On x86_64 this is about 128MB. + +Changing this takes effect whenever an application requests memory. + +============================================================== + block_dump block_dump enables block I/O debugging when set to a nonzero value. 
More diff --git a/include/linux/mm.h b/include/linux/mm.h index 43cfaabbde4..c05d7cfbb6b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -45,6 +45,7 @@ extern int sysctl_legacy_va_layout; #include extern unsigned long sysctl_user_reserve_kbytes; +extern unsigned long sysctl_admin_reserve_kbytes; #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 6daabb72bdb..9edcf456e0f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1436,6 +1436,13 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, + { + .procname = "admin_reserve_kbytes", + .data = &sysctl_admin_reserve_kbytes, + .maxlen = sizeof(sysctl_admin_reserve_kbytes), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, { } }; diff --git a/mm/mmap.c b/mm/mmap.c index 80a965f3525..5485f18e663 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -85,6 +85,7 @@ int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic ove int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */ int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ +unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ /* * Make sure vm_committed_as in one cacheline and not cacheline shared with * other variables. It can be updated by several CPUs frequently. @@ -164,10 +165,10 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) free -= totalreserve_pages; /* - * Leave the last 3% for root + * Reserve some for root */ if (!cap_sys_admin) - free -= free / 32; + free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); if (free > pages) return 0; @@ -178,10 +179,10 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) allowed = (totalram_pages - hugetlb_total_pages()) * sysctl_overcommit_ratio / 100; /* - * Leave the last 3% for root + * Reserve some for root */ if (!cap_sys_admin) - allowed -= allowed / 32; + allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); allowed += total_swap_pages; /* @@ -3119,3 +3120,24 @@ static int __meminit init_user_reserve(void) return 0; } module_init(init_user_reserve) + +/* + * Initialise sysctl_admin_reserve_kbytes. + * + * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin + * to log in and kill a memory hogging process. + * + * Systems with more than 256MB will reserve 8MB, enough to recover + * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will + * only reserve 3% of free pages by default. 
+ */ +static int __meminit init_admin_reserve(void) +{ + unsigned long free_kbytes; + + free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); + + sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13); + return 0; +} +module_init(init_admin_reserve) diff --git a/mm/nommu.c b/mm/nommu.c index 58e4a0a5125..fbe3e2f317e 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -64,6 +64,7 @@ int sysctl_overcommit_ratio = 50; /* default is 50% */ int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ +unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ int heap_stack_gap = 0; atomic_long_t mmap_pages_allocated; @@ -1939,10 +1940,10 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) free -= totalreserve_pages; /* - * Leave the last 3% for root + * Reserve some for root */ if (!cap_sys_admin) - free -= free / 32; + free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); if (free > pages) return 0; @@ -1952,10 +1953,10 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) allowed = totalram_pages * sysctl_overcommit_ratio / 100; /* - * Leave the last 3% for root + * Reserve some 3% for root */ if (!cap_sys_admin) - allowed -= allowed / 32; + allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); allowed += total_swap_pages; /* @@ -2147,3 +2148,24 @@ static int __meminit init_user_reserve(void) return 0; } module_init(init_user_reserve) + +/* + * Initialise sysctl_admin_reserve_kbytes. + * + * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin + * to log in and kill a memory hogging process. + * + * Systems with more than 256MB will reserve 8MB, enough to recover + * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will + * only reserve 3% of free pages by default. + */ +static int __meminit init_admin_reserve(void) +{ + unsigned long free_kbytes; + + free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); + + sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13); + return 0; +} +module_init(init_admin_reserve) -- cgit v1.2.3-70-g09d2 From ae8e3a915aef5af5ace5936c56f05f0b1502ded1 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Mon, 29 Apr 2013 15:08:17 -0700 Subject: resource: add __adjust_resource() for internal use Add __adjust_resource(), which is called by adjust_resource() internally after the resource_lock is held. There is no interface change to adjust_resource(). This change allows other functions to call __adjust_resource() internally while the resource_lock is held. Signed-off-by: Toshi Kani Reviewed-by: Yasuaki Ishimatsu Acked-by: David Rientjes Cc: Ram Pai Cc: T Makphaibulchoke Cc: Wen Congyang Cc: Tang Chen Cc: Jiang Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/resource.c | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 73f35d4b30b..ae246f97c5d 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -706,24 +706,13 @@ void insert_resource_expand_to_fit(struct resource *root, struct resource *new) write_unlock(&resource_lock); } -/** - * adjust_resource - modify a resource's start and size - * @res: resource to modify - * @start: new start value - * @size: new size - * - * Given an existing resource, change its start and size to match the - * arguments. 
Returns 0 on success, -EBUSY if it can't fit. - * Existing children of the resource are assumed to be immutable. - */ -int adjust_resource(struct resource *res, resource_size_t start, resource_size_t size) +static int __adjust_resource(struct resource *res, resource_size_t start, + resource_size_t size) { struct resource *tmp, *parent = res->parent; resource_size_t end = start + size - 1; int result = -EBUSY; - write_lock(&resource_lock); - if (!parent) goto skip; @@ -751,6 +740,26 @@ skip: result = 0; out: + return result; +} + +/** + * adjust_resource - modify a resource's start and size + * @res: resource to modify + * @start: new start value + * @size: new size + * + * Given an existing resource, change its start and size to match the + * arguments. Returns 0 on success, -EBUSY if it can't fit. + * Existing children of the resource are assumed to be immutable. + */ +int adjust_resource(struct resource *res, resource_size_t start, + resource_size_t size) +{ + int result; + + write_lock(&resource_lock); + result = __adjust_resource(res, start, size); write_unlock(&resource_lock); return result; } -- cgit v1.2.3-70-g09d2 From 825f787bb49676083b97c1de1f8f2f8f26b5c908 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Mon, 29 Apr 2013 15:08:19 -0700 Subject: resource: add release_mem_region_adjustable() Add release_mem_region_adjustable(), which releases a requested region from a currently busy memory resource. This interface adjusts the matched memory resource accordingly even if the requested region does not match exactly but still fits into. This new interface is intended for memory hot-delete. During bootup, memory resources are inserted from the boot descriptor table, such as EFI Memory Table and e820. Each memory resource entry usually covers the whole contigous memory range. Memory hot-delete request, on the other hand, may target to a particular range of memory resource, and its size can be much smaller than the whole contiguous memory. Since the existing release interfaces like __release_region() require a requested region to be exactly matched to a resource entry, they do not allow a partial resource to be released. This new interface is restrictive (i.e. release under certain conditions), which is consistent with other release interfaces, __release_region() and __release_resource(). Additional release conditions, such as an overlapping region to a resource entry, can be supported after they are confirmed as valid cases. There is no change to the existing interfaces since their restriction is valid for I/O resources. 
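To make the intended use concrete, a memory hot-delete path would be expected to call the new interface roughly as follows. This is an illustrative sketch only, not part of the patch; the caller name and the use of iomem_resource as the parent resource are assumptions here:

	/* Hypothetical caller on a memory hot-remove path (not from this patch). */
	static void example_release_hotplug_region(resource_size_t start,
						   resource_size_t size)
	{
		int ret;

		/* The range must match, or fit inside, one busy IORESOURCE_MEM entry. */
		ret = release_mem_region_adjustable(&iomem_resource, start, size);
		if (ret)
			pr_warn("unable to release memory region: %d\n", ret);
	}

A return of 0 means the matching busy entry was removed or adjusted in place; -EINVAL means the range did not match or fit a single busy entry, and -ENOMEM means the split case could not allocate the additional resource entry.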
[akpm@linux-foundation.org: use GFP_ATOMIC under write_lock()] [akpm@linux-foundation.org: switch back to GFP_KERNEL, less buggily] [akpm@linux-foundation.org: remove unneeded and wrong kfree(), per Toshi] Signed-off-by: Toshi Kani Reviewed-by : Yasuaki Ishimatsu Cc: David Rientjes Reviewed-by: Ram Pai Cc: T Makphaibulchoke Cc: Wen Congyang Cc: Tang Chen Cc: Jiang Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ioport.h | 4 ++ kernel/resource.c | 103 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 107 insertions(+) (limited to 'kernel') diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 85ac9b9b72a..89b7c24a36e 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -192,6 +192,10 @@ extern struct resource * __request_region(struct resource *, extern int __check_region(struct resource *, resource_size_t, resource_size_t); extern void __release_region(struct resource *, resource_size_t, resource_size_t); +#ifdef CONFIG_MEMORY_HOTREMOVE +extern int release_mem_region_adjustable(struct resource *, resource_size_t, + resource_size_t); +#endif static inline int __deprecated check_region(resource_size_t s, resource_size_t n) diff --git a/kernel/resource.c b/kernel/resource.c index ae246f97c5d..4aef8867fd4 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1021,6 +1021,109 @@ void __release_region(struct resource *parent, resource_size_t start, } EXPORT_SYMBOL(__release_region); +#ifdef CONFIG_MEMORY_HOTREMOVE +/** + * release_mem_region_adjustable - release a previously reserved memory region + * @parent: parent resource descriptor + * @start: resource start address + * @size: resource region size + * + * This interface is intended for memory hot-delete. The requested region + * is released from a currently busy memory resource. The requested region + * must either match exactly or fit into a single busy resource entry. In + * the latter case, the remaining resource is adjusted accordingly. + * Existing children of the busy memory resource must be immutable in the + * request. + * + * Note: + * - Additional release conditions, such as overlapping region, can be + * supported after they are confirmed as valid cases. + * - When a busy memory resource gets split into two entries, the code + * assumes that all children remain in the lower address entry for + * simplicity. Enhance this logic when necessary. 
+ */ +int release_mem_region_adjustable(struct resource *parent, + resource_size_t start, resource_size_t size) +{ + struct resource **p; + struct resource *res; + struct resource *new_res; + resource_size_t end; + int ret = -EINVAL; + + end = start + size - 1; + if ((start < parent->start) || (end > parent->end)) + return ret; + + /* The kzalloc() result gets checked later */ + new_res = kzalloc(sizeof(struct resource), GFP_KERNEL); + + p = &parent->child; + write_lock(&resource_lock); + + while ((res = *p)) { + if (res->start >= end) + break; + + /* look for the next resource if it does not fit into */ + if (res->start > start || res->end < end) { + p = &res->sibling; + continue; + } + + if (!(res->flags & IORESOURCE_MEM)) + break; + + if (!(res->flags & IORESOURCE_BUSY)) { + p = &res->child; + continue; + } + + /* found the target resource; let's adjust accordingly */ + if (res->start == start && res->end == end) { + /* free the whole entry */ + *p = res->sibling; + kfree(res); + ret = 0; + } else if (res->start == start && res->end != end) { + /* adjust the start */ + ret = __adjust_resource(res, end + 1, + res->end - end); + } else if (res->start != start && res->end == end) { + /* adjust the end */ + ret = __adjust_resource(res, res->start, + start - res->start); + } else { + /* split into two entries */ + if (!new_res) { + ret = -ENOMEM; + break; + } + new_res->name = res->name; + new_res->start = end + 1; + new_res->end = res->end; + new_res->flags = res->flags; + new_res->parent = res->parent; + new_res->sibling = res->sibling; + new_res->child = NULL; + + ret = __adjust_resource(res, res->start, + start - res->start); + if (ret) + break; + res->sibling = new_res; + new_res = NULL; + } + + break; + } + + write_unlock(&resource_lock); + kfree(new_res); + return ret; +} +#endif /* CONFIG_MEMORY_HOTREMOVE */ + /* * Managed region resource */ -- cgit v1.2.3-70-g09d2 From ebff7d8f270d045338d9f4796014f4db429a17f9 Mon Sep 17 00:00:00 2001 From: Yasuaki Ishimatsu Date: Mon, 29 Apr 2013 15:08:56 -0700 Subject: mem hotunplug: fix kfree() of bootmem memory When hot removing memory presented at boot time, following messages are shown: kernel BUG at mm/slub.c:3409! 
invalid opcode: 0000 [#1] SMP Modules linked in: ebtable_nat ebtables xt_CHECKSUM iptable_mangle bridge stp llc ipmi_devintf ipmi_msghandler sunrpc ipt_REJECT nf_conntrack_ipv4 nf_defrag_ipv4 iptable_filter ip_tables ip6t_REJECT nf_conntrack_ipv6 nf_defrag_ipv6 xt_state nf_conntrack ip6table_filter ip6_tables binfmt_misc vfat fat dm_mirror dm_region_hash dm_log dm_mod vhost_net macvtap macvlan tun uinput iTCO_wdt iTCO_vendor_support coretemp kvm_intel kvm crc32c_intel ghash_clmulni_intel microcode pcspkr sg i2c_i801 lpc_ich mfd_core igb i2c_algo_bit i2c_core e1000e ptp pps_core tpm_infineon ioatdma dca sr_mod cdrom sd_mod crc_t10dif usb_storage megaraid_sas lpfc scsi_transport_fc scsi_tgt scsi_mod CPU 0 Pid: 5091, comm: kworker/0:2 Tainted: G W 3.9.0-rc6+ #15 RIP: kfree+0x232/0x240 Process kworker/0:2 (pid: 5091, threadinfo ffff88084678c000, task ffff88083928ca80) Call Trace: __release_region+0xd4/0xe0 __remove_pages+0x52/0x110 arch_remove_memory+0x89/0xd0 remove_memory+0xc4/0x100 acpi_memory_device_remove+0x6d/0xb1 acpi_device_remove+0x89/0xab __device_release_driver+0x7c/0xf0 device_release_driver+0x2f/0x50 acpi_bus_device_detach+0x6c/0x70 acpi_ns_walk_namespace+0x11a/0x250 acpi_walk_namespace+0xee/0x137 acpi_bus_trim+0x33/0x7a acpi_bus_hot_remove_device+0xc4/0x1a1 acpi_os_execute_deferred+0x27/0x34 process_one_work+0x1f7/0x590 worker_thread+0x11a/0x370 kthread+0xee/0x100 ret_from_fork+0x7c/0xb0 RIP [] kfree+0x232/0x240 RSP The reason why the messages are shown is to release a resource structure, allocated by bootmem, by kfree(). So when we release a resource structure, we should check whether it is allocated by bootmem or not. But even if we know a resource structure is allocated by bootmem, we cannot release it since SLxB cannot treat it. So for reusing a resource structure, this patch remembers it by using bootmem_resource as follows: When releasing a resource structure by free_resource(), free_resource() checks whether the resource structure is allocated by bootmem or not. If it is allocated by bootmem, free_resource() adds it to bootmem_resource. If it is not allocated by bootmem, free_resource() release it by kfree(). And when getting a new resource structure by get_resource(), get_resource() checks whether bootmem_resource has released resource structures or not. If there is a released resource structure, get_resource() returns it. If there is not a releaed resource structure, get_resource() returns new resource structure allocated by kzalloc(). [akpm@linux-foundation.org: s/get_resource/alloc_resource/] Signed-off-by: Yasuaki Ishimatsu Reviewed-by: Toshi Kani Cc: Johannes Weiner Cc: Ram Pai Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/resource.c | 68 ++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 55 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 4aef8867fd4..d7386986e10 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -21,6 +21,7 @@ #include #include #include +#include #include @@ -50,6 +51,14 @@ struct resource_constraint { static DEFINE_RWLOCK(resource_lock); +/* + * For memory hotplug, there is no way to free resource entries allocated + * by boot mem after the system is up. So for reusing the resource entry + * we need to remember the resource. 
+ */ +static struct resource *bootmem_resource_free; +static DEFINE_SPINLOCK(bootmem_resource_lock); + static void *r_next(struct seq_file *m, void *v, loff_t *pos) { struct resource *p = v; @@ -151,6 +160,40 @@ __initcall(ioresources_init); #endif /* CONFIG_PROC_FS */ +static void free_resource(struct resource *res) +{ + if (!res) + return; + + if (!PageSlab(virt_to_head_page(res))) { + spin_lock(&bootmem_resource_lock); + res->sibling = bootmem_resource_free; + bootmem_resource_free = res; + spin_unlock(&bootmem_resource_lock); + } else { + kfree(res); + } +} + +static struct resource *alloc_resource(gfp_t flags) +{ + struct resource *res = NULL; + + spin_lock(&bootmem_resource_lock); + if (bootmem_resource_free) { + res = bootmem_resource_free; + bootmem_resource_free = res->sibling; + } + spin_unlock(&bootmem_resource_lock); + + if (res) + memset(res, 0, sizeof(struct resource)); + else + res = kzalloc(sizeof(struct resource), flags); + + return res; +} + /* Return the conflict entry if you can't request it */ static struct resource * __request_resource(struct resource *root, struct resource *new) { @@ -771,7 +814,7 @@ static void __init __reserve_region_with_split(struct resource *root, { struct resource *parent = root; struct resource *conflict; - struct resource *res = kzalloc(sizeof(*res), GFP_ATOMIC); + struct resource *res = alloc_resource(GFP_ATOMIC); struct resource *next_res = NULL; if (!res) @@ -796,7 +839,7 @@ static void __init __reserve_region_with_split(struct resource *root, /* conflict covered whole area */ if (conflict->start <= res->start && conflict->end >= res->end) { - kfree(res); + free_resource(res); WARN_ON(next_res); break; } @@ -806,10 +849,9 @@ static void __init __reserve_region_with_split(struct resource *root, end = res->end; res->end = conflict->start - 1; if (conflict->end < end) { - next_res = kzalloc(sizeof(*next_res), - GFP_ATOMIC); + next_res = alloc_resource(GFP_ATOMIC); if (!next_res) { - kfree(res); + free_resource(res); break; } next_res->name = name; @@ -899,7 +941,7 @@ struct resource * __request_region(struct resource *parent, const char *name, int flags) { DECLARE_WAITQUEUE(wait, current); - struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); + struct resource *res = alloc_resource(GFP_KERNEL); if (!res) return NULL; @@ -933,7 +975,7 @@ struct resource * __request_region(struct resource *parent, continue; } /* Uhhuh, that didn't work out.. 
*/ - kfree(res); + free_resource(res); res = NULL; break; } @@ -967,7 +1009,7 @@ int __check_region(struct resource *parent, resource_size_t start, return -EBUSY; release_resource(res); - kfree(res); + free_resource(res); return 0; } EXPORT_SYMBOL(__check_region); @@ -1007,7 +1049,7 @@ void __release_region(struct resource *parent, resource_size_t start, write_unlock(&resource_lock); if (res->flags & IORESOURCE_MUXED) wake_up(&muxed_resource_wait); - kfree(res); + free_resource(res); return; } p = &res->sibling; @@ -1055,8 +1097,8 @@ int release_mem_region_adjustable(struct resource *parent, if ((start < parent->start) || (end > parent->end)) return ret; - /* The kzalloc() result gets checked later */ - new_res = kzalloc(sizeof(struct resource), GFP_KERNEL); + /* The alloc_resource() result gets checked later */ + new_res = alloc_resource(GFP_KERNEL); p = &parent->child; write_lock(&resource_lock); @@ -1083,7 +1125,7 @@ int release_mem_region_adjustable(struct resource *parent, if (res->start == start && res->end == end) { /* free the whole entry */ *p = res->sibling; - kfree(res); + free_resource(res); ret = 0; } else if (res->start == start && res->end != end) { /* adjust the start */ @@ -1119,7 +1161,7 @@ int release_mem_region_adjustable(struct resource *parent, } write_unlock(&resource_lock); - kfree(new_res); + free_resource(new_res); return ret; } #endif /* CONFIG_MEMORY_HOTREMOVE */ -- cgit v1.2.3-70-g09d2
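
The free_resource()/alloc_resource() pair above boils down to a small recycling scheme for objects that kfree() must never see. Below is a minimal userspace model of that scheme, under the assumption that slab-versus-bootmem origin is tracked in a flag; the kernel code instead infers it from PageSlab(virt_to_head_page(res)) and serializes the list with bootmem_resource_lock.

#include <stdlib.h>
#include <stdbool.h>

/* Minimal model of the recycling scheme; names and the flag are illustrative. */
struct res {
        struct res *sibling;
        bool from_bootmem;      /* the kernel infers this from the backing page */
};

static struct res *recycled;    /* models bootmem_resource_free */

static void res_free(struct res *r)
{
        if (!r)
                return;
        if (r->from_bootmem) {
                /* the slab allocator never owned this memory: park it for reuse */
                r->sibling = recycled;
                recycled = r;
        } else {
                free(r);
        }
}

static struct res *res_alloc(void)
{
        struct res *r = recycled;

        if (r) {
                recycled = r->sibling;
                r->sibling = NULL;              /* clear the reused field */
        } else {
                r = calloc(1, sizeof(*r));      /* models kzalloc() */
        }
        return r;
}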
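
The middle of release_mem_region_adjustable() above reduces to four outcomes for a busy entry [rs, re] and a requested range [s, e]: free the whole entry, trim its front, trim its back, or split it in two. The small standalone program below walks those branches; the numeric ranges are arbitrary examples.

#include <stdio.h>

static void classify(unsigned long rs, unsigned long re,
                     unsigned long s, unsigned long e)
{
        if (s < rs || e > re)
                printf("[%lu,%lu] does not fit in [%lu,%lu]: keep searching (eventually -EINVAL)\n",
                       s, e, rs, re);
        else if (s == rs && e == re)
                printf("exact match: unlink and free the entry\n");
        else if (s == rs)
                printf("trim front: entry becomes [%lu,%lu]\n", e + 1, re);
        else if (e == re)
                printf("trim back: entry becomes [%lu,%lu]\n", rs, s - 1);
        else
                printf("split: entries become [%lu,%lu] and [%lu,%lu]\n",
                       rs, s - 1, e + 1, re);
}

int main(void)
{
        classify(0, 1023, 0, 1023);     /* release the whole entry */
        classify(0, 1023, 0, 255);      /* trim the front */
        classify(0, 1023, 768, 1023);   /* trim the back */
        classify(0, 1023, 256, 511);    /* split into two entries */
        return 0;
}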
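
Returning to the admin reserve patch at the top of this section: init_admin_reserve() sets sysctl_admin_reserve_kbytes to min(free_kbytes / 32, 1UL << 13), i.e. roughly 3% of free memory capped at 8 MiB, and __vm_enough_memory() converts that back to pages with >> (PAGE_SHIFT - 10), so 8192 kB becomes 2048 pages of 4 KiB. The claim that systems with more than 256MB free settle at the 8MB cap can be checked with a few sample figures; the free-memory values below are arbitrary.

#include <stdio.h>

int main(void)
{
        /* mirrors init_admin_reserve(): min(free_kbytes / 32, 1UL << 13) */
        unsigned long samples[] = { 32768, 131072, 262144, 1048576 };   /* kB free */

        for (int i = 0; i < 4; i++) {
                unsigned long reserve = samples[i] / 32;

                if (reserve > (1UL << 13))
                        reserve = 1UL << 13;
                printf("%8lu kB free -> admin reserve %4lu kB (%lu pages of 4 KiB)\n",
                       samples[i], reserve, reserve >> 2);
        }
        return 0;
}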