From 4ecdafc8084fe1d95bb59ed7753b345abcd586fb Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 29 Apr 2013 15:05:01 -0700 Subject: kthread: introduce to_live_kthread() "k->vfork_done != NULL" with a barrier() after to_kthread(k) in task_get_live_kthread(k) looks unclear, and sub-optimal because we load ->vfork_done twice. All we need is to ensure that we do not return to_kthread(NULL). Add a new trivial helper which loads/checks ->vfork_done once, this also looks more understandable. Signed-off-by: Oleg Nesterov Cc: Thomas Gleixner Cc: Namhyung Kim Cc: "Paul E. McKenney" Cc: Peter Zijlstra Cc: Rusty Russell Cc: "Srivatsa S. Bhat" Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kthread.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 9eb7fed0bba..a84dee265f9 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -52,8 +52,21 @@ enum KTHREAD_BITS { KTHREAD_IS_PARKED, }; -#define to_kthread(tsk) \ - container_of((tsk)->vfork_done, struct kthread, exited) +#define __to_kthread(vfork) \ + container_of(vfork, struct kthread, exited) + +static inline struct kthread *to_kthread(struct task_struct *k) +{ + return __to_kthread(k->vfork_done); +} + +static struct kthread *to_live_kthread(struct task_struct *k) +{ + struct completion *vfork = ACCESS_ONCE(k->vfork_done); + if (likely(vfork)) + return __to_kthread(vfork); + return NULL; +} /** * kthread_should_stop - should this kthread return now? @@ -313,15 +326,8 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), static struct kthread *task_get_live_kthread(struct task_struct *k) { - struct kthread *kthread; - get_task_struct(k); - kthread = to_kthread(k); - /* It might have exited */ - barrier(); - if (k->vfork_done != NULL) - return kthread; - return NULL; + return to_live_kthread(k); } static void __kthread_unpark(struct task_struct *k, struct kthread *kthread) -- cgit v1.2.3-70-g09d2 From b5c5442bb6bce0c67701d55124be561043a51faf Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 29 Apr 2013 15:05:12 -0700 Subject: kthread: kill task_get_live_kthread() task_get_live_kthread() looks confusing and unneeded. It does get_task_struct() but only kthread_stop() needs this, it can be called even if the calller doesn't have a reference when we know that this kthread can't exit until we do kthread_stop(). kthread_park() and kthread_unpark() do not need get_task_struct(), the callers already have the reference. And it can not help if we can race with the exiting kthread anyway, kthread_park() can hang forever in this case. Change kthread_park() and kthread_unpark() to use to_live_kthread(), change kthread_stop() to do get_task_struct() by hand and remove task_get_live_kthread(). Signed-off-by: Oleg Nesterov Cc: Thomas Gleixner Cc: Namhyung Kim Cc: "Paul E. McKenney" Cc: Peter Zijlstra Cc: Rusty Russell Cc: "Srivatsa S. 
Bhat" Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kthread.c | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index a84dee265f9..9b12d65186f 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -324,12 +324,6 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), return p; } -static struct kthread *task_get_live_kthread(struct task_struct *k) -{ - get_task_struct(k); - return to_live_kthread(k); -} - static void __kthread_unpark(struct task_struct *k, struct kthread *kthread) { clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); @@ -356,11 +350,10 @@ static void __kthread_unpark(struct task_struct *k, struct kthread *kthread) */ void kthread_unpark(struct task_struct *k) { - struct kthread *kthread = task_get_live_kthread(k); + struct kthread *kthread = to_live_kthread(k); if (kthread) __kthread_unpark(k, kthread); - put_task_struct(k); } /** @@ -377,7 +370,7 @@ void kthread_unpark(struct task_struct *k) */ int kthread_park(struct task_struct *k) { - struct kthread *kthread = task_get_live_kthread(k); + struct kthread *kthread = to_live_kthread(k); int ret = -ENOSYS; if (kthread) { @@ -390,7 +383,6 @@ int kthread_park(struct task_struct *k) } ret = 0; } - put_task_struct(k); return ret; } @@ -411,10 +403,13 @@ int kthread_park(struct task_struct *k) */ int kthread_stop(struct task_struct *k) { - struct kthread *kthread = task_get_live_kthread(k); + struct kthread *kthread; int ret; trace_sched_kthread_stop(k); + + get_task_struct(k); + kthread = to_live_kthread(k); if (kthread) { set_bit(KTHREAD_SHOULD_STOP, &kthread->flags); __kthread_unpark(k, kthread); @@ -422,10 +417,9 @@ int kthread_stop(struct task_struct *k) wait_for_completion(&kthread->exited); } ret = k->exit_code; - put_task_struct(k); - trace_sched_kthread_stop_ret(ret); + trace_sched_kthread_stop_ret(ret); return ret; } EXPORT_SYMBOL(kthread_stop); -- cgit v1.2.3-70-g09d2 From 3f68613f39cdc242fa2e872ac04a802e7cc7b7cb Mon Sep 17 00:00:00 2001 From: Rakib Mullick Date: Mon, 29 Apr 2013 15:05:13 -0700 Subject: kernel/auditsc.c: use kzalloc instead of kmalloc+memset In audit_alloc_context() use kzalloc instead of kmalloc+memset. Also rename audit_zero_context() to audit_set_context(), to represent it's inner workings properly. [akpm@linux-foundation.org: remove audit_set_context() altogether - fold it into its caller] Signed-off-by: Rakib Mullick Cc: Al Viro Cc: Eric Paris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/auditsc.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index a371f857a0a..c68229411a7 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1034,21 +1034,15 @@ static inline void audit_free_aux(struct audit_context *context) } } -static inline void audit_zero_context(struct audit_context *context, - enum audit_state state) -{ - memset(context, 0, sizeof(*context)); - context->state = state; - context->prio = state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0; -} - static inline struct audit_context *audit_alloc_context(enum audit_state state) { struct audit_context *context; - if (!(context = kmalloc(sizeof(*context), GFP_KERNEL))) + context = kzalloc(sizeof(*context), GFP_KERNEL); + if (!context) return NULL; - audit_zero_context(context, state); + context->state = state; + context->prio = state == AUDIT_RECORD_CONTEXT ? 
~0ULL : 0; INIT_LIST_HEAD(&context->killed_trees); INIT_LIST_HEAD(&context->names_list); return context; -- cgit v1.2.3-70-g09d2 From 13f51e1c3fbebeab801f768f433067ff075dea5a Mon Sep 17 00:00:00 2001 From: Gao feng Date: Mon, 29 Apr 2013 15:05:14 -0700 Subject: audit: don't check if kauditd is valid every time We only need to check if kauditd is valid after we start it, if kauditd is invalid, we will set kauditd_task to NULL. So next time, we will start kauditd again. It means if kauditd_task is not NULL,it must be valid. Signed-off-by: Gao feng Cc: Eric Paris Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/audit.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index d596e5355f1..9816a1b96cf 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -660,14 +660,14 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) /* As soon as there's any sign of userspace auditd, * start kauditd to talk to it */ - if (!kauditd_task) + if (!kauditd_task) { kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd"); - if (IS_ERR(kauditd_task)) { - err = PTR_ERR(kauditd_task); - kauditd_task = NULL; - return err; + if (IS_ERR(kauditd_task)) { + err = PTR_ERR(kauditd_task); + kauditd_task = NULL; + return err; + } } - loginuid = audit_get_loginuid(current); sessionid = audit_get_sessionid(current); security_task_getsecid(current, &sid); -- cgit v1.2.3-70-g09d2 From 374c586d95f288296e1e718533401d8ce0eecee3 Mon Sep 17 00:00:00 2001 From: Gao feng Date: Mon, 29 Apr 2013 15:05:15 -0700 Subject: audit: remove duplicate export of audit_enabled audit_enabled has already been exported in include/linux/audit.h. and kernel/audit.h includes include/linux/audit.h, no need to export aduit_enabled again in kernel/audit.h Signed-off-by: Gao feng Cc: Al Viro Cc: Eric Paris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/audit.h | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit.h b/kernel/audit.h index d51cba868e1..d06ffc144f8 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -60,7 +60,6 @@ struct audit_entry { }; #ifdef CONFIG_AUDIT -extern int audit_enabled; extern int audit_ever_enabled; #endif -- cgit v1.2.3-70-g09d2 From dde5b7d6e7be308ce371baa96058c2d40df26c05 Mon Sep 17 00:00:00 2001 From: Gao feng Date: Mon, 29 Apr 2013 15:05:16 -0700 Subject: audit: remove unnecessary #if CONFIG_AUDIT The files which include kernel/audit.h are complied only when CONFIG_AUDIT is set. Just like audit_pid, there is no need to surround audit_ever_enabled with CONFIG_AUDIT. Signed-off-by: Gao feng Cc: Al Viro Cc: Eric Paris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/audit.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.h b/kernel/audit.h index d06ffc144f8..11468d99dad 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -59,9 +59,7 @@ struct audit_entry { struct audit_krule rule; }; -#ifdef CONFIG_AUDIT extern int audit_ever_enabled; -#endif extern int audit_pid; -- cgit v1.2.3-70-g09d2 From 373e0f3408fe671550d69d9a7965d8a49e988525 Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Mon, 29 Apr 2013 15:05:18 -0700 Subject: kernel/auditfilter.c: tree and watch will memory leak when failure occurs In audit_data_to_entry() when a failure occurs we must check and free the tree and watch to avoid a memory leak. 
test: plan: test command: "auditctl -a exit,always -w /etc -F auid=-1" (on fedora17, need modify auditctl to let "-w /etc" has effect) running: under fedora17 x86_64, 2 CPUs 3.20GHz, 2.5GB RAM. let 15 auditctl processes continue running at the same time. monitor command: watch -d -n 1 "cat /proc/meminfo | awk '{print \$2}' \ | head -n 4 | xargs \ | awk '{print \"used \",\$1 - \$2 - \$3 - \$4}'" result: for original version: will use up all memory, within 3 hours. kill all auditctl, the memory still does not free. for new version (apply this patch): after 14 hours later, not find issues. Signed-off-by: Chen Gang Cc: Eric Paris Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/auditfilter.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index f9fc54bbe06..267436826c3 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -594,6 +594,10 @@ exit_nofree: return entry; exit_free: + if (entry->rule.watch) + audit_put_watch(entry->rule.watch); /* matches initial get */ + if (entry->rule.tree) + audit_put_tree(entry->rule.tree); /* that's the temporary one */ audit_free_rule(entry); return ERR_PTR(err); } -- cgit v1.2.3-70-g09d2 From 12b2f117f3bf738c1a00a6f64393f1953a740bd4 Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Mon, 29 Apr 2013 15:05:19 -0700 Subject: kernel/audit_tree.c: tree will leak memory when failure occurs in audit_trim_trees() audit_trim_trees() calls get_tree(). If a failure occurs we must call put_tree(). [akpm@linux-foundation.org: run put_tree() before mutex_lock() for small scalability improvement] Signed-off-by: Chen Gang Cc: Al Viro Cc: Eric Paris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/audit_tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 642a89c4f3d..a291aa23fb3 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -617,9 +617,9 @@ void audit_trim_trees(void) } spin_unlock(&hash_lock); trim_marked(tree); - put_tree(tree); drop_collected_mounts(root_mnt); skip_it: + put_tree(tree); mutex_lock(&audit_filter_mutex); } list_del(&cursor); -- cgit v1.2.3-70-g09d2 From e07cee23e64137f7da2fb35d7b7c0ad26cc0edec Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 29 Apr 2013 15:06:58 -0700 Subject: mm,kexec: use common help functions to free reserved pages Use common help functions to free reserved pages. 
Signed-off-by: Jiang Liu Cc: Eric Biederman Reviewed-by: Zhang Yanfei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index ffd4e111fd6..b19181d4420 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1118,12 +1118,8 @@ void __weak crash_free_reserved_phys_range(unsigned long begin, { unsigned long addr; - for (addr = begin; addr < end; addr += PAGE_SIZE) { - ClearPageReserved(pfn_to_page(addr >> PAGE_SHIFT)); - init_page_count(pfn_to_page(addr >> PAGE_SHIFT)); - free_page((unsigned long)__va(addr)); - totalram_pages++; - } + for (addr = begin; addr < end; addr += PAGE_SIZE) + free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT)); } int crash_shrink_memory(unsigned long new_size) -- cgit v1.2.3-70-g09d2 From 6d2488f64a240191f0733c1f32d73607916b01b7 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 29 Apr 2013 15:07:21 -0700 Subject: cgroup: remove css_get_next Now that we have generic and well ordered cgroup tree walkers there is no need to keep css_get_next in the place. Signed-off-by: Michal Hocko Acked-by: KAMEZAWA Hiroyuki Acked-by: Li Zefan Cc: Johannes Weiner Cc: Ying Han Cc: Tejun Heo Cc: Glauber Costa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup.h | 7 ------- kernel/cgroup.c | 49 ------------------------------------------------- 2 files changed, 56 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 900af5964f5..470073bf93d 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -687,13 +687,6 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css); struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id); -/* - * Get a cgroup whose id is greater than or equal to id under tree of root. - * Returning a cgroup_subsys_state or NULL. - */ -struct cgroup_subsys_state *css_get_next(struct cgroup_subsys *ss, int id, - struct cgroup_subsys_state *root, int *foundid); - /* Returns true if root is ancestor of cg */ bool css_is_ancestor(struct cgroup_subsys_state *cg, const struct cgroup_subsys_state *root); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a32f9432666..dfaf50d4705 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -5416,55 +5416,6 @@ struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id) } EXPORT_SYMBOL_GPL(css_lookup); -/** - * css_get_next - lookup next cgroup under specified hierarchy. - * @ss: pointer to subsystem - * @id: current position of iteration. - * @root: pointer to css. search tree under this. - * @foundid: position of found object. - * - * Search next css under the specified hierarchy of rootid. Calling under - * rcu_read_lock() is necessary. Returns NULL if it reaches the end. - */ -struct cgroup_subsys_state * -css_get_next(struct cgroup_subsys *ss, int id, - struct cgroup_subsys_state *root, int *foundid) -{ - struct cgroup_subsys_state *ret = NULL; - struct css_id *tmp; - int tmpid; - int rootid = css_id(root); - int depth = css_depth(root); - - if (!rootid) - return NULL; - - BUG_ON(!ss->use_id); - WARN_ON_ONCE(!rcu_read_lock_held()); - - /* fill start point for scan */ - tmpid = id; - while (1) { - /* - * scan next entry from bitmap(tree), tmpid is updated after - * idr_get_next(). 
- */ - tmp = idr_get_next(&ss->idr, &tmpid); - if (!tmp) - break; - if (tmp->depth >= depth && tmp->stack[depth] == rootid) { - ret = rcu_dereference(tmp->css); - if (ret) { - *foundid = tmpid; - break; - } - } - /* continue to scan from next id */ - tmpid = tmpid + 1; - } - return ret; -} - /* * get corresponding css from file open on cgroupfs directory */ -- cgit v1.2.3-70-g09d2 From 146732ce104ddfed3d4d82722c0b336074016b92 Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Mon, 29 Apr 2013 15:07:22 -0700 Subject: fs: don't compile in drop_caches.c when CONFIG_SYSCTL=n drop_caches.c provides code only invokable via sysctl, so don't compile it in when CONFIG_SYSCTL=n. Signed-off-by: Josh Triplett Acked-by: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/Makefile | 3 ++- include/linux/mm.h | 4 ++++ kernel/sysctl.c | 1 - 3 files changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/fs/Makefile b/fs/Makefile index 9d53192236f..3b2c76759ec 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -10,7 +10,7 @@ obj-y := open.o read_write.o file_table.o super.o \ ioctl.o readdir.o select.o fifo.o dcache.o inode.o \ attr.o bad_inode.o file.o filesystems.o namespace.o \ seq_file.o xattr.o libfs.o fs-writeback.o \ - pnode.o drop_caches.o splice.o sync.o utimes.o \ + pnode.o splice.o sync.o utimes.o \ stack.o fs_struct.o statfs.o ifeq ($(CONFIG_BLOCK),y) @@ -49,6 +49,7 @@ obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o xattr_acl.o obj-$(CONFIG_NFS_COMMON) += nfs_common/ obj-$(CONFIG_GENERIC_ACL) += generic_acl.o obj-$(CONFIG_COREDUMP) += coredump.o +obj-$(CONFIG_SYSCTL) += drop_caches.o obj-$(CONFIG_FHANDLE) += fhandle.o diff --git a/include/linux/mm.h b/include/linux/mm.h index 43b70d5f820..98a44376e4f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1731,8 +1731,12 @@ int in_gate_area_no_mm(unsigned long addr); #define in_gate_area(mm, addr) ({(void)mm; in_gate_area_no_mm(addr);}) #endif /* __HAVE_ARCH_GATE_AREA */ +#ifdef CONFIG_SYSCTL +extern int sysctl_drop_caches; int drop_caches_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); +#endif + unsigned long shrink_slab(struct shrink_control *shrink, unsigned long nr_pages_scanned, unsigned long lru_pages); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index afc1dc60f3f..3dadde52253 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -106,7 +106,6 @@ extern unsigned int core_pipe_limit; #endif extern int pid_max; extern int pid_max_min, pid_max_max; -extern int sysctl_drop_caches; extern int percpu_pagelist_fraction; extern int compat_log; extern int latencytop_enabled; -- cgit v1.2.3-70-g09d2 From f1c4069e1dc128dc8a851174cba2e273652e9216 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Mon, 29 Apr 2013 15:07:37 -0700 Subject: mm, vmalloc: export vmap_area_list, instead of vmlist Although our intention is to unexport internal structure entirely, but there is one exception for kexec. kexec dumps address of vmlist and makedumpfile uses this information. We are about to remove vmlist, then another way to retrieve information of vmalloc layer is needed for makedumpfile. For this purpose, we export vmap_area_list, instead of vmlist. Signed-off-by: Joonsoo Kim Signed-off-by: Joonsoo Kim Cc: Eric Biederman Cc: Dave Anderson Cc: Vivek Goyal Cc: Atsushi Kumagai Cc: Thomas Gleixner Cc: "H. 
Peter Anvin" Cc: Chris Metcalf Cc: Guan Xuetao Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 3 +-- kernel/kexec.c | 2 +- mm/nommu.c | 3 +-- mm/vmalloc.c | 11 ++++++----- 4 files changed, 9 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 698b1e50d3a..8a25f9081ed 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -130,8 +130,7 @@ extern long vwrite(char *buf, char *addr, unsigned long count); /* * Internals. Dont't use.. */ -extern rwlock_t vmlist_lock; -extern struct vm_struct *vmlist; +extern struct list_head vmap_area_list; extern __init void vm_area_add_early(struct vm_struct *vm); extern __init void vm_area_register_early(struct vm_struct *vm, size_t align); diff --git a/kernel/kexec.c b/kernel/kexec.c index b19181d4420..0b1f7e780d4 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1577,7 +1577,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_SYMBOL(swapper_pg_dir); #endif VMCOREINFO_SYMBOL(_stext); - VMCOREINFO_SYMBOL(vmlist); + VMCOREINFO_SYMBOL(vmap_area_list); #ifndef CONFIG_NEED_MULTIPLE_NODES VMCOREINFO_SYMBOL(mem_map); diff --git a/mm/nommu.c b/mm/nommu.c index e001768b14e..2f1c75ed468 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -228,8 +228,7 @@ int follow_pfn(struct vm_area_struct *vma, unsigned long address, } EXPORT_SYMBOL(follow_pfn); -DEFINE_RWLOCK(vmlist_lock); -struct vm_struct *vmlist; +LIST_HEAD(vmap_area_list); void vfree(const void *addr) { diff --git a/mm/vmalloc.c b/mm/vmalloc.c index bda6cef5b97..7e63984eb58 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -261,7 +261,8 @@ struct vmap_area { }; static DEFINE_SPINLOCK(vmap_area_lock); -static LIST_HEAD(vmap_area_list); +/* Export for kexec only */ +LIST_HEAD(vmap_area_list); static struct rb_root vmap_area_root = RB_ROOT; /* The vmap cache globals are protected by vmap_area_lock */ @@ -272,6 +273,10 @@ static unsigned long cached_align; static unsigned long vmap_area_pcpu_hole; +/*** Old vmalloc interfaces ***/ +static DEFINE_RWLOCK(vmlist_lock); +static struct vm_struct *vmlist; + static struct vmap_area *__find_vmap_area(unsigned long addr) { struct rb_node *n = vmap_area_root.rb_node; @@ -1283,10 +1288,6 @@ int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) } EXPORT_SYMBOL_GPL(map_vm_area); -/*** Old vmalloc interfaces ***/ -DEFINE_RWLOCK(vmlist_lock); -struct vm_struct *vmlist; - static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, unsigned long flags, const void *caller) { -- cgit v1.2.3-70-g09d2 From 13ba3fcbbe31068b1ee7c39a0b58ecbed03c4d72 Mon Sep 17 00:00:00 2001 From: Atsushi Kumagai Date: Mon, 29 Apr 2013 15:07:40 -0700 Subject: kexec, vmalloc: export additional vmalloc layer information Now, vmap_area_list is exported as VMCOREINFO for makedumpfile to get the start address of vmalloc region (vmalloc_start). The address which contains vmalloc_start value is represented as below: vmap_area_list.next - OFFSET(vmap_area.list) + OFFSET(vmap_area.va_start) However, both OFFSET(vmap_area.va_start) and OFFSET(vmap_area.list) aren't exported as VMCOREINFO. So this patch exports them externally with small cleanup. [akpm@linux-foundation.org: vmalloc.h should include list.h for list_head] Signed-off-by: Atsushi Kumagai Cc: Joonsoo Kim Cc: Joonsoo Kim Cc: Thomas Gleixner Cc: "H. 
Peter Anvin" Cc: Atsushi Kumagai Cc: Chris Metcalf Cc: Dave Anderson Cc: Eric Biederman Cc: Guan Xuetao Cc: Ingo Molnar Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 13 +++++++++++++ kernel/kexec.c | 3 ++- mm/vmalloc.c | 11 ----------- 3 files changed, 15 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 8a25f9081ed..7d5773a99f2 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -3,7 +3,9 @@ #include #include +#include #include /* pgprot_t */ +#include struct vm_area_struct; /* vma defining user mapping in mm_types.h */ @@ -35,6 +37,17 @@ struct vm_struct { const void *caller; }; +struct vmap_area { + unsigned long va_start; + unsigned long va_end; + unsigned long flags; + struct rb_node rb_node; /* address sorted rbtree */ + struct list_head list; /* address sorted list */ + struct list_head purge_list; /* "lazy purge" list */ + struct vm_struct *vm; + struct rcu_head rcu_head; +}; + /* * Highlevel APIs for driver use */ diff --git a/kernel/kexec.c b/kernel/kexec.c index 0b1f7e780d4..b574920cbd4 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1615,7 +1615,8 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_OFFSET(free_area, free_list); VMCOREINFO_OFFSET(list_head, next); VMCOREINFO_OFFSET(list_head, prev); - VMCOREINFO_OFFSET(vm_struct, addr); + VMCOREINFO_OFFSET(vmap_area, va_start); + VMCOREINFO_OFFSET(vmap_area, list); VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); log_buf_kexec_setup(); VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 151da8ac53f..72043d6c88c 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -249,17 +249,6 @@ EXPORT_SYMBOL(vmalloc_to_pfn); #define VM_LAZY_FREEING 0x02 #define VM_VM_AREA 0x04 -struct vmap_area { - unsigned long va_start; - unsigned long va_end; - unsigned long flags; - struct rb_node rb_node; /* address sorted rbtree */ - struct list_head list; /* address sorted list */ - struct list_head purge_list; /* "lazy purge" list */ - struct vm_struct *vm; - struct rcu_head rcu_head; -}; - static DEFINE_SPINLOCK(vmap_area_lock); /* Export for kexec only */ LIST_HEAD(vmap_area_list); -- cgit v1.2.3-70-g09d2 From d8f10cb3d375c34ad668f32ca6e4661ad1fc23b2 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 29 Apr 2013 15:08:08 -0700 Subject: kernel/cpuset.c: use register_hotmemory_notifier() Use the new interface, remove one ifdef. No code size changes. We could/should have been using __meminit/__meminitdata here but there's now no point in doing that because all this code is elided at compile time. Cc: Li Zefan Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 4f9dfe43ecb..334d983a36b 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2251,7 +2251,6 @@ void cpuset_update_active_cpus(bool cpu_online) schedule_work(&cpuset_hotplug_work); } -#ifdef CONFIG_MEMORY_HOTPLUG /* * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY]. * Call this routine anytime after node_states[N_MEMORY] changes. 
@@ -2263,20 +2262,23 @@ static int cpuset_track_online_nodes(struct notifier_block *self, schedule_work(&cpuset_hotplug_work); return NOTIFY_OK; } -#endif + +static struct notifier_block cpuset_track_online_nodes_nb = { + .notifier_call = cpuset_track_online_nodes, + .priority = 10, /* ??! */ +}; /** * cpuset_init_smp - initialize cpus_allowed * * Description: Finish top cpuset after cpu, node maps are initialized - **/ - + */ void __init cpuset_init_smp(void) { cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); top_cpuset.mems_allowed = node_states[N_MEMORY]; - hotplug_memory_notifier(cpuset_track_online_nodes, 10); + register_hotmemory_notifier(&cpuset_track_online_nodes_nb); cpuset_propagate_hotplug_wq = alloc_ordered_workqueue("cpuset_hotplug", 0); -- cgit v1.2.3-70-g09d2 From c9b1d0981fcce3d9976d7b7a56e4e0503bc610dd Mon Sep 17 00:00:00 2001 From: Andrew Shewmaker Date: Mon, 29 Apr 2013 15:08:10 -0700 Subject: mm: limit growth of 3% hardcoded other user reserve Add user_reserve_kbytes knob. Limit the growth of the memory reserved for other user processes to min(3% current process size, user_reserve_pages). Only about 8MB is necessary to enable recovery in the default mode, and only a few hundred MB are required even when overcommit is disabled. user_reserve_pages defaults to min(3% free pages, 128MB) I arrived at 128MB by taking the max VSZ of sshd, login, bash, and top ... then adding the RSS of each. This only affects OVERCOMMIT_NEVER mode. Background 1. user reserve __vm_enough_memory reserves a hardcoded 3% of the current process size for other applications when overcommit is disabled. This was done so that a user could recover if they launched a memory hogging process. Without the reserve, a user would easily run into a message such as: bash: fork: Cannot allocate memory 2. admin reserve Additionally, a hardcoded 3% of free memory is reserved for root in both overcommit 'guess' and 'never' modes. This was intended to prevent a scenario where root-cant-log-in and perform recovery operations. Note that this reserve shrinks, and doesn't guarantee a useful reserve. Motivation The two hardcoded memory reserves should be updated to account for current memory sizes. Also, the admin reserve would be more useful if it didn't shrink too much. When the current code was originally written, 1GB was considered "enterprise". Now the 3% reserve can grow to multiple GB on large memory systems, and it only needs to be a few hundred MB at most to enable a user or admin to recover a system with an unwanted memory hogging process. I've found that reducing these reserves is especially beneficial for a specific type of application load: * single application system * one or few processes (e.g. one per core) * allocating all available memory * not initializing every page immediately * long running I've run scientific clusters with this sort of load. A long running job sometimes failed many hours (weeks of CPU time) into a calculation. They weren't initializing all of their memory immediately, and they weren't using calloc, so I put systems into overcommit 'never' mode. These clusters run diskless and have no swap. However, with the current reserves, a user wishing to allocate as much memory as possible to one process may be prevented from using, for example, almost 2GB out of 32GB. The effect is less, but still significant when a user starts a job with one process per core. 
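For a sense of scale: 3% of a 32GB process is roughly 1GB, while the new knob caps the reserve at min(3% of the process size, 128MB) by default. A simplified, hypothetical helper showing the effect on the OVERCOMMIT_NEVER accounting (the names mirror the mm/mmap.c hunk further down; this exact function is not part of the patch):

	/* Sketch only: pages withheld from a process under OVERCOMMIT_NEVER. */
	static unsigned long user_reserve_pages(struct mm_struct *mm)
	{
		/* sysctl_user_reserve_kbytes is in KB; convert to pages */
		unsigned long reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);

		return min(mm->total_vm / 32, reserve);	/* was: mm->total_vm / 32 */
	}

So small processes keep the old 3% behavior, and only very large processes see their reserve capped.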
I have repeatedly seen a set of processes requesting the same amount of
memory fail because one of them could not allocate the amount of memory
a user would expect to be able to allocate. For example, Message Passing
Interface (MPI) processes, one per core. And it is similar for other
parallel programming frameworks.

Changing this reserve code will make the overcommit never mode more useful
by allowing applications to allocate nearly all of the available memory.

Also, the new admin_reserve_kbytes will be safer than the current behavior
since the hardcoded 3% of available memory reserve can shrink to something
useless in the case where applications have grabbed all available memory.

Risks

* "bash: fork: Cannot allocate memory"

  The downside of the first patch-- which creates a tunable user reserve
  that is only used in overcommit 'never' mode--is that an admin can set
  it so low that a user may not be able to kill their process, even if
  they already have a shell prompt. Of course, a user can get in the same
  predicament with the current 3% reserve--they just have to launch
  processes until 3% becomes negligible.

* root-cant-log-in problem

  The second patch, adding the tunable rootuser_reserve_pages, allows the
  admin to shoot themselves in the foot by setting it too small. They can
  easily get the system into a state where root-can't-log-in.

  However, the new admin_reserve_kbytes will be safer than the current
  behavior since the hardcoded 3% of available memory reserve can shrink
  to something useless in the case where applications have grabbed all
  available memory.

Alternatives

* Memory cgroups provide a more flexible way to limit application memory.
  Not everyone wants to set up cgroups or deal with their overhead.

* We could create a fourth overcommit mode which provides smaller reserves.
  The size of useful reserves may be drastically different depending on
  whether the system is embedded or enterprise.

* Force users to initialize all of their memory or use calloc. Some users
  don't want/expect the system to overcommit when they malloc. Overcommit
  'never' mode is for this scenario, and it should work well.

The new user and admin reserve tunables are simple to use, with low
overhead compared to cgroups. The patches preserve current behavior where
3% of memory is less than 128MB, except that the admin reserve doesn't
shrink to an unusable size under pressure. The code allows admins to tune
for embedded and enterprise usage.

FAQ

* How is the root-cant-login problem addressed?
  What happens if admin_reserve_pages is set to 0?

  Root is free to shoot themselves in the foot by setting
  admin_reserve_kbytes too low.

  On x86_64, the minimum useful reserve is:
    8MB for overcommit 'guess'
    128MB for overcommit 'never'

  admin_reserve_pages defaults to min(3% free memory, 8MB)

  So, anyone switching to 'never' mode needs to adjust admin_reserve_pages.

* How do you calculate a minimum useful reserve?

  A user or the admin needs enough memory to login and perform recovery
  operations, which includes, at a minimum:

  sshd or login + bash (or some other shell) + top (or ps, kill, etc.)

  For overcommit 'guess', we can sum resident set sizes (RSS) because we
  only need enough memory to handle what the recovery programs will
  typically use. On x86_64 this is about 8MB.

  For overcommit 'never', we can take the max of their virtual sizes (VSZ)
  and add the sum of their RSS. We use VSZ instead of RSS because this
  mode forces us to ensure we can fulfill all of the requested memory
  allocations-- even if the programs only use a fraction of what they ask
  for. On x86_64 this is about 128MB.

  When swap is enabled, reserves are useful even when they are as small
  as 10MB, regardless of overcommit mode.

  When both swap and overcommit are disabled, then the admin should tune
  the reserves higher to be absolutely safe. Over 230MB each was safest
  in my testing.

* What happens if user_reserve_pages is set to 0?

  Note: this only affects overcommit 'never' mode.

  Then a user will be able to allocate all available memory minus
  admin_reserve_kbytes.

  However, they will easily see a message such as:

  "bash: fork: Cannot allocate memory"

  And they won't be able to recover/kill their application. The admin
  should be able to recover the system if admin_reserve_kbytes is set
  appropriately.

* What's the difference between overcommit 'guess' and 'never'?

  "Guess" allows an allocation if there are enough free + reclaimable
  pages. It has a hardcoded 3% of free pages reserved for root.

  "Never" allows an allocation if there is enough swap + a configurable
  percentage (default is 50) of physical RAM. It has a hardcoded 3% of
  free pages reserved for root, like "Guess" mode. It also has a hardcoded
  3% of the current process size reserved for additional applications.

* Why is overcommit 'guess' not suitable even when an app eventually
  writes to every page? It takes free pages, file pages, available swap
  pages, reclaimable slab pages into consideration. In other words, these
  are all available pages, so why isn't overcommit suitable?

  Because it only looks at the present state of the system. It does not
  take into account the memory that other applications have malloced, but
  haven't initialized yet. It overcommits the system.

Test Summary

There was little change in behavior in the default overcommit 'guess'
mode with swap enabled before and after the patch. This was expected.

Systems run most predictably (i.e. no oom kills) in overcommit 'never'
mode with swap enabled. This also allowed the most memory to be allocated
to a user application.

Overcommit 'guess' mode without swap is a bad idea. It is easy to crash
the system. None of the other tested combinations crashed. This matches
my experience on the Roadrunner supercomputer.

Without the tunable user reserve, a system in overcommit 'never' mode and
without swap does not allow the user to recover, although the admin can.

With the new tunable reserves, a system in overcommit 'never' mode and
without swap can be configured to:

1. maximize user-allocatable memory, running close to the edge of
   recoverability

2. maximize recoverability, sacrificing allocatable memory to ensure
   that a user cannot take down a system

Test Description

Fedora 18 VM - 4 x86_64 cores, 5725MB RAM, 4GB Swap

System is booted into multiuser console mode, with unnecessary services
turned off.

Caches were dropped before each test.
Hogs are user memtester processes that attempt to allocate all free memory
as reported by /proc/meminfo

In overcommit 'never' mode, memory_ratio=100

Test Results

3.9.0-rc1-mm1

Overcommit | Swap | Hogs | MB Got/Wanted | OOMs | User Recovery | Admin Recovery
----------   ----   ----   -------------   ----   -------------   --------------
guess        yes    1      5432/5432       no     yes             yes
guess        yes    4      5444/5444       1      yes             yes
guess        no     1      5302/5449       no     yes             yes
guess        no     4      -               crash  no              no
never        yes    1      5460/5460       1      yes             yes
never        yes    4      5460/5460       1      yes             yes
never        no     1      5218/5432       no     no              yes
never        no     4      5203/5448       no     no              yes

3.9.0-rc1-mm1-tunablereserves

User and Admin Recovery show their respective reserves, if applicable.

Overcommit | Swap | Hogs | MB Got/Wanted | OOMs | User Recovery | Admin Recovery
----------   ----   ----   -------------   ----   -------------   --------------
guess        yes    1      5419/5419       no     -     yes       8MB   yes
guess        yes    4      5436/5436       1      -     yes       8MB   yes
guess        no     1      5440/5440       *      -     yes       8MB   yes
guess        no     4      -               crash  -     no        8MB   no

* process would successfully mlock, then the oom killer would pick it

never        yes    1      5446/5446       no     10MB  yes       20MB  yes
never        yes    4      5456/5456       no     10MB  yes       20MB  yes
never        no     1      5387/5429       no     128MB no        8MB   barely
never        no     1      5323/5428       no     226MB barely    8MB   barely
never        no     1      5323/5428       no     226MB barely    8MB   barely
never        no     1      5359/5448       no     10MB  no        10MB  barely
never        no     1      5323/5428       no     0MB   no        10MB  barely
never        no     1      5332/5428       no     0MB   no        50MB  yes
never        no     1      5293/5429       no     0MB   no        90MB  yes
never        no     1      5001/5427       no     230MB yes       338MB yes
never        no     4*     4998/5424       no     230MB yes       338MB yes

* more memtesters were launched, able to allocate approximately another 100MB

Future Work

- Test larger memory systems.
- Test an embedded image.
- Test other architectures.
- Time malloc microbenchmarks.
- Would it be useful to be able to set overcommit policy for each
  memory cgroup?
- Some lines are slightly above 80 chars.
  Perhaps define a macro to convert between pages and kb?
  Other places in the kernel do this.

[akpm@linux-foundation.org: coding-style fixes]
[akpm@linux-foundation.org: make init_user_reserve() static]
Signed-off-by: Andrew Shewmaker
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 Documentation/sysctl/vm.txt            | 20 ++++++++++++++++++++
 Documentation/vm/overcommit-accounting |  8 +++++++-
 include/linux/mm.h                     |  2 ++
 kernel/sysctl.c                        |  7 +++++++
 mm/mmap.c                              | 35 ++++++++++++++++++++++++++++++-----
 mm/nommu.c                             | 35 ++++++++++++++++++++++++++++++-----
 6 files changed, 96 insertions(+), 11 deletions(-)
(limited to 'kernel')
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 078701fdbd4..f6989573835 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -53,6 +53,7 @@ Currently, these files are in /proc/sys/vm: - percpu_pagelist_fraction - stat_interval - swappiness +- user_reserve_kbytes - vfs_cache_pressure - zone_reclaim_mode @@ -542,6 +543,7 @@ memory until it actually runs out. When this flag is 2, the kernel uses a "never overcommit" policy that attempts to prevent any overcommit of memory. +Note that user_reserve_kbytes affects this policy. This feature can be very useful because there are a lot of programs that malloc() huge amounts of memory "just-in-case" @@ -645,6 +647,24 @@ The default value is 60. ============================================================== +- user_reserve_kbytes + +When overcommit_memory is set to 2, "never overommit" mode, reserve +min(3% of current process size, user_reserve_kbytes) of free memory.
+This is intended to prevent a user from starting a single memory hogging +process, such that they cannot recover (kill the hog). + +user_reserve_kbytes defaults to min(3% of the current process size, 128MB). + +If this is reduced to zero, then the user will be allowed to allocate +all free memory with a single process, minus admin_reserve_kbytes. +Any subsequent attempts to execute a command will result in +"fork: Cannot allocate memory". + +Changing this takes effect whenever an application requests memory. + +============================================================== + vfs_cache_pressure ------------------ diff --git a/Documentation/vm/overcommit-accounting b/Documentation/vm/overcommit-accounting index 706d7ed9d8d..8eaa2fc4b8f 100644 --- a/Documentation/vm/overcommit-accounting +++ b/Documentation/vm/overcommit-accounting @@ -8,7 +8,9 @@ The Linux kernel supports the following overcommit handling modes default. 1 - Always overcommit. Appropriate for some scientific - applications. + applications. Classic example is code using sparse arrays + and just relying on the virtual memory consisting almost + entirely of zero pages. 2 - Don't overcommit. The total address space commit for the system is not permitted to exceed swap + a @@ -18,6 +20,10 @@ The Linux kernel supports the following overcommit handling modes pages but will receive errors on memory allocation as appropriate. + Useful for applications that want to guarantee their + memory allocations will be available in the future + without having to initialize every page. + The overcommit policy is set via the sysctl `vm.overcommit_memory'. The overcommit percentage is set via `vm.overcommit_ratio'. diff --git a/include/linux/mm.h b/include/linux/mm.h index 7aa11a6736e..43cfaabbde4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -44,6 +44,8 @@ extern int sysctl_legacy_va_layout; #include #include +extern unsigned long sysctl_user_reserve_kbytes; + #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) /* to align the pointer to the (next) page boundary */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 3dadde52253..6daabb72bdb 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1429,6 +1429,13 @@ static struct ctl_table vm_table[] = { .extra2 = &one, }, #endif + { + .procname = "user_reserve_kbytes", + .data = &sysctl_user_reserve_kbytes, + .maxlen = sizeof(sysctl_user_reserve_kbytes), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, { } }; diff --git a/mm/mmap.c b/mm/mmap.c index 081e6da8e1a..80a965f3525 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -84,6 +84,7 @@ EXPORT_SYMBOL(vm_get_page_prot); int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */ int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */ int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; +unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ /* * Make sure vm_committed_as in one cacheline and not cacheline shared with * other variables. It can be updated by several CPUs frequently. 
@@ -122,7 +123,7 @@ EXPORT_SYMBOL_GPL(vm_memory_committed); */ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) { - unsigned long free, allowed; + unsigned long free, allowed, reserve; vm_acct_memory(pages); @@ -183,10 +184,13 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) allowed -= allowed / 32; allowed += total_swap_pages; - /* Don't let a single process grow too big: - leave 3% of the size of this process for other processes */ - if (mm) - allowed -= mm->total_vm / 32; + /* + * Don't let a single process grow so big a user can't recover + */ + if (mm) { + reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); + allowed -= min(mm->total_vm / 32, reserve); + } if (percpu_counter_read_positive(&vm_committed_as) < allowed) return 0; @@ -3094,3 +3098,24 @@ void __init mmap_init(void) ret = percpu_counter_init(&vm_committed_as, 0); VM_BUG_ON(ret); } + +/* + * Initialise sysctl_user_reserve_kbytes. + * + * This is intended to prevent a user from starting a single memory hogging + * process, such that they cannot recover (kill the hog) in OVERCOMMIT_NEVER + * mode. + * + * The default value is min(3% of free memory, 128MB) + * 128MB is enough to recover with sshd/login, bash, and top/kill. + */ +static int __meminit init_user_reserve(void) +{ + unsigned long free_kbytes; + + free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); + + sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17); + return 0; +} +module_init(init_user_reserve) diff --git a/mm/nommu.c b/mm/nommu.c index 2f1c75ed468..58e4a0a5125 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -63,6 +63,7 @@ int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ int sysctl_overcommit_ratio = 50; /* default is 50% */ int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; +unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ int heap_stack_gap = 0; atomic_long_t mmap_pages_allocated; @@ -1897,7 +1898,7 @@ EXPORT_SYMBOL(unmap_mapping_range); */ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) { - unsigned long free, allowed; + unsigned long free, allowed, reserve; vm_acct_memory(pages); @@ -1957,10 +1958,13 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) allowed -= allowed / 32; allowed += total_swap_pages; - /* Don't let a single process grow too big: - leave 3% of the size of this process for other processes */ - if (mm) - allowed -= mm->total_vm / 32; + /* + * Don't let a single process grow so big a user can't recover + */ + if (mm) { + reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); + allowed -= min(mm->total_vm / 32, reserve); + } if (percpu_counter_read_positive(&vm_committed_as) < allowed) return 0; @@ -2122,3 +2126,24 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size, up_write(&nommu_region_sem); return 0; } + +/* + * Initialise sysctl_user_reserve_kbytes. + * + * This is intended to prevent a user from starting a single memory hogging + * process, such that they cannot recover (kill the hog) in OVERCOMMIT_NEVER + * mode. + * + * The default value is min(3% of free memory, 128MB) + * 128MB is enough to recover with sshd/login, bash, and top/kill. 
+ */ +static int __meminit init_user_reserve(void) +{ + unsigned long free_kbytes; + + free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); + + sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17); + return 0; +} +module_init(init_user_reserve) -- cgit v1.2.3-70-g09d2 From 4eeab4f5580d11bffedc697684b91b0bca0d5009 Mon Sep 17 00:00:00 2001 From: Andrew Shewmaker Date: Mon, 29 Apr 2013 15:08:11 -0700 Subject: mm: replace hardcoded 3% with admin_reserve_pages knob Add an admin_reserve_kbytes knob to allow admins to change the hardcoded memory reserve to something other than 3%, which may be multiple gigabytes on large memory systems. Only about 8MB is necessary to enable recovery in the default mode, and only a few hundred MB are required even when overcommit is disabled. This affects OVERCOMMIT_GUESS and OVERCOMMIT_NEVER. admin_reserve_kbytes is initialized to min(3% free pages, 8MB) I arrived at 8MB by summing the RSS of sshd or login, bash, and top. Please see first patch in this series for full background, motivation, testing, and full changelog. [akpm@linux-foundation.org: coding-style fixes] [akpm@linux-foundation.org: make init_admin_reserve() static] Signed-off-by: Andrew Shewmaker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/vm.txt | 30 ++++++++++++++++++++++++++++++ include/linux/mm.h | 1 + kernel/sysctl.c | 7 +++++++ mm/mmap.c | 30 ++++++++++++++++++++++++++---- mm/nommu.c | 30 ++++++++++++++++++++++++++---- 5 files changed, 90 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index f6989573835..dcc75a9ed91 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -18,6 +18,7 @@ files can be found in mm/swap.c. Currently, these files are in /proc/sys/vm: +- admin_reserve_kbytes - block_dump - compact_memory - dirty_background_bytes @@ -59,6 +60,35 @@ Currently, these files are in /proc/sys/vm: ============================================================== +admin_reserve_kbytes + +The amount of free memory in the system that should be reserved for users +with the capability cap_sys_admin. + +admin_reserve_kbytes defaults to min(3% of free pages, 8MB) + +That should provide enough for the admin to log in and kill a process, +if necessary, under the default overcommit 'guess' mode. + +Systems running under overcommit 'never' should increase this to account +for the full Virtual Memory Size of programs used to recover. Otherwise, +root may not be able to log in to recover the system. + +How do you calculate a minimum useful reserve? + +sshd or login + bash (or some other shell) + top (or ps, kill, etc.) + +For overcommit 'guess', we can sum resident set sizes (RSS). +On x86_64 this is about 8MB. + +For overcommit 'never', we can take the max of their virtual sizes (VSZ) +and add the sum of their RSS. +On x86_64 this is about 128MB. + +Changing this takes effect whenever an application requests memory. + +============================================================== + block_dump block_dump enables block I/O debugging when set to a nonzero value. 
More diff --git a/include/linux/mm.h b/include/linux/mm.h index 43cfaabbde4..c05d7cfbb6b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -45,6 +45,7 @@ extern int sysctl_legacy_va_layout; #include extern unsigned long sysctl_user_reserve_kbytes; +extern unsigned long sysctl_admin_reserve_kbytes; #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 6daabb72bdb..9edcf456e0f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1436,6 +1436,13 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, + { + .procname = "admin_reserve_kbytes", + .data = &sysctl_admin_reserve_kbytes, + .maxlen = sizeof(sysctl_admin_reserve_kbytes), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, { } }; diff --git a/mm/mmap.c b/mm/mmap.c index 80a965f3525..5485f18e663 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -85,6 +85,7 @@ int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic ove int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */ int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ +unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ /* * Make sure vm_committed_as in one cacheline and not cacheline shared with * other variables. It can be updated by several CPUs frequently. @@ -164,10 +165,10 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) free -= totalreserve_pages; /* - * Leave the last 3% for root + * Reserve some for root */ if (!cap_sys_admin) - free -= free / 32; + free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); if (free > pages) return 0; @@ -178,10 +179,10 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) allowed = (totalram_pages - hugetlb_total_pages()) * sysctl_overcommit_ratio / 100; /* - * Leave the last 3% for root + * Reserve some for root */ if (!cap_sys_admin) - allowed -= allowed / 32; + allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); allowed += total_swap_pages; /* @@ -3119,3 +3120,24 @@ static int __meminit init_user_reserve(void) return 0; } module_init(init_user_reserve) + +/* + * Initialise sysctl_admin_reserve_kbytes. + * + * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin + * to log in and kill a memory hogging process. + * + * Systems with more than 256MB will reserve 8MB, enough to recover + * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will + * only reserve 3% of free pages by default. 
+ */ +static int __meminit init_admin_reserve(void) +{ + unsigned long free_kbytes; + + free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); + + sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13); + return 0; +} +module_init(init_admin_reserve) diff --git a/mm/nommu.c b/mm/nommu.c index 58e4a0a5125..fbe3e2f317e 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -64,6 +64,7 @@ int sysctl_overcommit_ratio = 50; /* default is 50% */ int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ +unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ int heap_stack_gap = 0; atomic_long_t mmap_pages_allocated; @@ -1939,10 +1940,10 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) free -= totalreserve_pages; /* - * Leave the last 3% for root + * Reserve some for root */ if (!cap_sys_admin) - free -= free / 32; + free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); if (free > pages) return 0; @@ -1952,10 +1953,10 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) allowed = totalram_pages * sysctl_overcommit_ratio / 100; /* - * Leave the last 3% for root + * Reserve some 3% for root */ if (!cap_sys_admin) - allowed -= allowed / 32; + allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); allowed += total_swap_pages; /* @@ -2147,3 +2148,24 @@ static int __meminit init_user_reserve(void) return 0; } module_init(init_user_reserve) + +/* + * Initialise sysctl_admin_reserve_kbytes. + * + * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin + * to log in and kill a memory hogging process. + * + * Systems with more than 256MB will reserve 8MB, enough to recover + * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will + * only reserve 3% of free pages by default. + */ +static int __meminit init_admin_reserve(void) +{ + unsigned long free_kbytes; + + free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); + + sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13); + return 0; +} +module_init(init_admin_reserve) -- cgit v1.2.3-70-g09d2 From ae8e3a915aef5af5ace5936c56f05f0b1502ded1 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Mon, 29 Apr 2013 15:08:17 -0700 Subject: resource: add __adjust_resource() for internal use Add __adjust_resource(), which is called by adjust_resource() internally after the resource_lock is held. There is no interface change to adjust_resource(). This change allows other functions to call __adjust_resource() internally while the resource_lock is held. Signed-off-by: Toshi Kani Reviewed-by: Yasuaki Ishimatsu Acked-by: David Rientjes Cc: Ram Pai Cc: T Makphaibulchoke Cc: Wen Congyang Cc: Tang Chen Cc: Jiang Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/resource.c | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 73f35d4b30b..ae246f97c5d 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -706,24 +706,13 @@ void insert_resource_expand_to_fit(struct resource *root, struct resource *new) write_unlock(&resource_lock); } -/** - * adjust_resource - modify a resource's start and size - * @res: resource to modify - * @start: new start value - * @size: new size - * - * Given an existing resource, change its start and size to match the - * arguments. 
Returns 0 on success, -EBUSY if it can't fit. - * Existing children of the resource are assumed to be immutable. - */ -int adjust_resource(struct resource *res, resource_size_t start, resource_size_t size) +static int __adjust_resource(struct resource *res, resource_size_t start, + resource_size_t size) { struct resource *tmp, *parent = res->parent; resource_size_t end = start + size - 1; int result = -EBUSY; - write_lock(&resource_lock); - if (!parent) goto skip; @@ -751,6 +740,26 @@ skip: result = 0; out: + return result; +} + +/** + * adjust_resource - modify a resource's start and size + * @res: resource to modify + * @start: new start value + * @size: new size + * + * Given an existing resource, change its start and size to match the + * arguments. Returns 0 on success, -EBUSY if it can't fit. + * Existing children of the resource are assumed to be immutable. + */ +int adjust_resource(struct resource *res, resource_size_t start, + resource_size_t size) +{ + int result; + + write_lock(&resource_lock); + result = __adjust_resource(res, start, size); write_unlock(&resource_lock); return result; } -- cgit v1.2.3-70-g09d2 From 825f787bb49676083b97c1de1f8f2f8f26b5c908 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Mon, 29 Apr 2013 15:08:19 -0700 Subject: resource: add release_mem_region_adjustable() Add release_mem_region_adjustable(), which releases a requested region from a currently busy memory resource. This interface adjusts the matched memory resource accordingly even if the requested region does not match exactly but still fits into. This new interface is intended for memory hot-delete. During bootup, memory resources are inserted from the boot descriptor table, such as EFI Memory Table and e820. Each memory resource entry usually covers the whole contigous memory range. Memory hot-delete request, on the other hand, may target to a particular range of memory resource, and its size can be much smaller than the whole contiguous memory. Since the existing release interfaces like __release_region() require a requested region to be exactly matched to a resource entry, they do not allow a partial resource to be released. This new interface is restrictive (i.e. release under certain conditions), which is consistent with other release interfaces, __release_region() and __release_resource(). Additional release conditions, such as an overlapping region to a resource entry, can be supported after they are confirmed as valid cases. There is no change to the existing interfaces since their restriction is valid for I/O resources. 
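To make the intended use concrete, a memory hot-delete path would be expected to call the new interface roughly as follows. This is an illustrative sketch only, not part of the patch; the caller name and the use of iomem_resource as the parent resource are assumptions here:

	/* Hypothetical caller on a memory hot-remove path (not from this patch). */
	static void example_release_hotplug_region(resource_size_t start,
						   resource_size_t size)
	{
		int ret;

		/* The range must match, or fit inside, one busy IORESOURCE_MEM entry. */
		ret = release_mem_region_adjustable(&iomem_resource, start, size);
		if (ret)
			pr_warn("unable to release memory region: %d\n", ret);
	}

A return of 0 means the matching busy entry was removed or adjusted in place; -EINVAL means the range did not match or fit a single busy entry, and -ENOMEM means the split case could not allocate the additional resource entry.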
[akpm@linux-foundation.org: use GFP_ATOMIC under write_lock()] [akpm@linux-foundation.org: switch back to GFP_KERNEL, less buggily] [akpm@linux-foundation.org: remove unneeded and wrong kfree(), per Toshi] Signed-off-by: Toshi Kani Reviewed-by : Yasuaki Ishimatsu Cc: David Rientjes Reviewed-by: Ram Pai Cc: T Makphaibulchoke Cc: Wen Congyang Cc: Tang Chen Cc: Jiang Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ioport.h | 4 ++ kernel/resource.c | 103 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 107 insertions(+) (limited to 'kernel') diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 85ac9b9b72a..89b7c24a36e 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -192,6 +192,10 @@ extern struct resource * __request_region(struct resource *, extern int __check_region(struct resource *, resource_size_t, resource_size_t); extern void __release_region(struct resource *, resource_size_t, resource_size_t); +#ifdef CONFIG_MEMORY_HOTREMOVE +extern int release_mem_region_adjustable(struct resource *, resource_size_t, + resource_size_t); +#endif static inline int __deprecated check_region(resource_size_t s, resource_size_t n) diff --git a/kernel/resource.c b/kernel/resource.c index ae246f97c5d..4aef8867fd4 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1021,6 +1021,109 @@ void __release_region(struct resource *parent, resource_size_t start, } EXPORT_SYMBOL(__release_region); +#ifdef CONFIG_MEMORY_HOTREMOVE +/** + * release_mem_region_adjustable - release a previously reserved memory region + * @parent: parent resource descriptor + * @start: resource start address + * @size: resource region size + * + * This interface is intended for memory hot-delete. The requested region + * is released from a currently busy memory resource. The requested region + * must either match exactly or fit into a single busy resource entry. In + * the latter case, the remaining resource is adjusted accordingly. + * Existing children of the busy memory resource must be immutable in the + * request. + * + * Note: + * - Additional release conditions, such as overlapping region, can be + * supported after they are confirmed as valid cases. + * - When a busy memory resource gets split into two entries, the code + * assumes that all children remain in the lower address entry for + * simplicity. Enhance this logic when necessary. 
+ */ +int release_mem_region_adjustable(struct resource *parent, + resource_size_t start, resource_size_t size) +{ + struct resource **p; + struct resource *res; + struct resource *new_res; + resource_size_t end; + int ret = -EINVAL; + + end = start + size - 1; + if ((start < parent->start) || (end > parent->end)) + return ret; + + /* The kzalloc() result gets checked later */ + new_res = kzalloc(sizeof(struct resource), GFP_KERNEL); + + p = &parent->child; + write_lock(&resource_lock); + + while ((res = *p)) { + if (res->start >= end) + break; + + /* look for the next resource if it does not fit into */ + if (res->start > start || res->end < end) { + p = &res->sibling; + continue; + } + + if (!(res->flags & IORESOURCE_MEM)) + break; + + if (!(res->flags & IORESOURCE_BUSY)) { + p = &res->child; + continue; + } + + /* found the target resource; let's adjust accordingly */ + if (res->start == start && res->end == end) { + /* free the whole entry */ + *p = res->sibling; + kfree(res); + ret = 0; + } else if (res->start == start && res->end != end) { + /* adjust the start */ + ret = __adjust_resource(res, end + 1, + res->end - end); + } else if (res->start != start && res->end == end) { + /* adjust the end */ + ret = __adjust_resource(res, res->start, + start - res->start); + } else { + /* split into two entries */ + if (!new_res) { + ret = -ENOMEM; + break; + } + new_res->name = res->name; + new_res->start = end + 1; + new_res->end = res->end; + new_res->flags = res->flags; + new_res->parent = res->parent; + new_res->sibling = res->sibling; + new_res->child = NULL; + + ret = __adjust_resource(res, res->start, + start - res->start); + if (ret) + break; + res->sibling = new_res; + new_res = NULL; + } + + break; + } + + write_unlock(&resource_lock); + kfree(new_res); + return ret; +} +#endif /* CONFIG_MEMORY_HOTREMOVE */ + /* * Managed region resource */ -- cgit v1.2.3-70-g09d2 From ebff7d8f270d045338d9f4796014f4db429a17f9 Mon Sep 17 00:00:00 2001 From: Yasuaki Ishimatsu Date: Mon, 29 Apr 2013 15:08:56 -0700 Subject: mem hotunplug: fix kfree() of bootmem memory When hot removing memory presented at boot time, following messages are shown: kernel BUG at mm/slub.c:3409! 
invalid opcode: 0000 [#1] SMP Modules linked in: ebtable_nat ebtables xt_CHECKSUM iptable_mangle bridge stp llc ipmi_devintf ipmi_msghandler sunrpc ipt_REJECT nf_conntrack_ipv4 nf_defrag_ipv4 iptable_filter ip_tables ip6t_REJECT nf_conntrack_ipv6 nf_defrag_ipv6 xt_state nf_conntrack ip6table_filter ip6_tables binfmt_misc vfat fat dm_mirror dm_region_hash dm_log dm_mod vhost_net macvtap macvlan tun uinput iTCO_wdt iTCO_vendor_support coretemp kvm_intel kvm crc32c_intel ghash_clmulni_intel microcode pcspkr sg i2c_i801 lpc_ich mfd_core igb i2c_algo_bit i2c_core e1000e ptp pps_core tpm_infineon ioatdma dca sr_mod cdrom sd_mod crc_t10dif usb_storage megaraid_sas lpfc scsi_transport_fc scsi_tgt scsi_mod CPU 0 Pid: 5091, comm: kworker/0:2 Tainted: G W 3.9.0-rc6+ #15 RIP: kfree+0x232/0x240 Process kworker/0:2 (pid: 5091, threadinfo ffff88084678c000, task ffff88083928ca80) Call Trace: __release_region+0xd4/0xe0 __remove_pages+0x52/0x110 arch_remove_memory+0x89/0xd0 remove_memory+0xc4/0x100 acpi_memory_device_remove+0x6d/0xb1 acpi_device_remove+0x89/0xab __device_release_driver+0x7c/0xf0 device_release_driver+0x2f/0x50 acpi_bus_device_detach+0x6c/0x70 acpi_ns_walk_namespace+0x11a/0x250 acpi_walk_namespace+0xee/0x137 acpi_bus_trim+0x33/0x7a acpi_bus_hot_remove_device+0xc4/0x1a1 acpi_os_execute_deferred+0x27/0x34 process_one_work+0x1f7/0x590 worker_thread+0x11a/0x370 kthread+0xee/0x100 ret_from_fork+0x7c/0xb0 RIP [] kfree+0x232/0x240 RSP The reason why the messages are shown is to release a resource structure, allocated by bootmem, by kfree(). So when we release a resource structure, we should check whether it is allocated by bootmem or not. But even if we know a resource structure is allocated by bootmem, we cannot release it since SLxB cannot treat it. So for reusing a resource structure, this patch remembers it by using bootmem_resource as follows: When releasing a resource structure by free_resource(), free_resource() checks whether the resource structure is allocated by bootmem or not. If it is allocated by bootmem, free_resource() adds it to bootmem_resource. If it is not allocated by bootmem, free_resource() release it by kfree(). And when getting a new resource structure by get_resource(), get_resource() checks whether bootmem_resource has released resource structures or not. If there is a released resource structure, get_resource() returns it. If there is not a releaed resource structure, get_resource() returns new resource structure allocated by kzalloc(). [akpm@linux-foundation.org: s/get_resource/alloc_resource/] Signed-off-by: Yasuaki Ishimatsu Reviewed-by: Toshi Kani Cc: Johannes Weiner Cc: Ram Pai Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/resource.c | 68 ++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 55 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 4aef8867fd4..d7386986e10 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -21,6 +21,7 @@ #include #include #include +#include #include @@ -50,6 +51,14 @@ struct resource_constraint { static DEFINE_RWLOCK(resource_lock); +/* + * For memory hotplug, there is no way to free resource entries allocated + * by boot mem after the system is up. So for reusing the resource entry + * we need to remember the resource. 
+ */ +static struct resource *bootmem_resource_free; +static DEFINE_SPINLOCK(bootmem_resource_lock); + static void *r_next(struct seq_file *m, void *v, loff_t *pos) { struct resource *p = v; @@ -151,6 +160,40 @@ __initcall(ioresources_init); #endif /* CONFIG_PROC_FS */ +static void free_resource(struct resource *res) +{ + if (!res) + return; + + if (!PageSlab(virt_to_head_page(res))) { + spin_lock(&bootmem_resource_lock); + res->sibling = bootmem_resource_free; + bootmem_resource_free = res; + spin_unlock(&bootmem_resource_lock); + } else { + kfree(res); + } +} + +static struct resource *alloc_resource(gfp_t flags) +{ + struct resource *res = NULL; + + spin_lock(&bootmem_resource_lock); + if (bootmem_resource_free) { + res = bootmem_resource_free; + bootmem_resource_free = res->sibling; + } + spin_unlock(&bootmem_resource_lock); + + if (res) + memset(res, 0, sizeof(struct resource)); + else + res = kzalloc(sizeof(struct resource), flags); + + return res; +} + /* Return the conflict entry if you can't request it */ static struct resource * __request_resource(struct resource *root, struct resource *new) { @@ -771,7 +814,7 @@ static void __init __reserve_region_with_split(struct resource *root, { struct resource *parent = root; struct resource *conflict; - struct resource *res = kzalloc(sizeof(*res), GFP_ATOMIC); + struct resource *res = alloc_resource(GFP_ATOMIC); struct resource *next_res = NULL; if (!res) @@ -796,7 +839,7 @@ static void __init __reserve_region_with_split(struct resource *root, /* conflict covered whole area */ if (conflict->start <= res->start && conflict->end >= res->end) { - kfree(res); + free_resource(res); WARN_ON(next_res); break; } @@ -806,10 +849,9 @@ static void __init __reserve_region_with_split(struct resource *root, end = res->end; res->end = conflict->start - 1; if (conflict->end < end) { - next_res = kzalloc(sizeof(*next_res), - GFP_ATOMIC); + next_res = alloc_resource(GFP_ATOMIC); if (!next_res) { - kfree(res); + free_resource(res); break; } next_res->name = name; @@ -899,7 +941,7 @@ struct resource * __request_region(struct resource *parent, const char *name, int flags) { DECLARE_WAITQUEUE(wait, current); - struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); + struct resource *res = alloc_resource(GFP_KERNEL); if (!res) return NULL; @@ -933,7 +975,7 @@ struct resource * __request_region(struct resource *parent, continue; } /* Uhhuh, that didn't work out.. 
*/ - kfree(res); + free_resource(res); res = NULL; break; } @@ -967,7 +1009,7 @@ int __check_region(struct resource *parent, resource_size_t start, return -EBUSY; release_resource(res); - kfree(res); + free_resource(res); return 0; } EXPORT_SYMBOL(__check_region); @@ -1007,7 +1049,7 @@ void __release_region(struct resource *parent, resource_size_t start, write_unlock(&resource_lock); if (res->flags & IORESOURCE_MUXED) wake_up(&muxed_resource_wait); - kfree(res); + free_resource(res); return; } p = &res->sibling; @@ -1055,8 +1097,8 @@ int release_mem_region_adjustable(struct resource *parent, if ((start < parent->start) || (end > parent->end)) return ret; - /* The kzalloc() result gets checked later */ - new_res = kzalloc(sizeof(struct resource), GFP_KERNEL); + /* The alloc_resource() result gets checked later */ + new_res = alloc_resource(GFP_KERNEL); p = &parent->child; write_lock(&resource_lock); @@ -1083,7 +1125,7 @@ int release_mem_region_adjustable(struct resource *parent, if (res->start == start && res->end == end) { /* free the whole entry */ *p = res->sibling; - kfree(res); + free_resource(res); ret = 0; } else if (res->start == start && res->end != end) { /* adjust the start */ @@ -1119,7 +1161,7 @@ int release_mem_region_adjustable(struct resource *parent, } write_unlock(&resource_lock); - kfree(new_res); + free_resource(new_res); return ret; } #endif /* CONFIG_MEMORY_HOTREMOVE */ -- cgit v1.2.3-70-g09d2
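
The free_resource()/alloc_resource() pair above boils down to a small recycling scheme for objects that kfree() must never see. Below is a minimal userspace model of that scheme, under the assumption that slab-versus-bootmem origin is tracked in a flag; the kernel code instead infers it from PageSlab(virt_to_head_page(res)) and serializes the list with bootmem_resource_lock.

#include <stdlib.h>
#include <stdbool.h>

/* Minimal model of the recycling scheme; names and the flag are illustrative. */
struct res {
        struct res *sibling;
        bool from_bootmem;      /* the kernel infers this from the backing page */
};

static struct res *recycled;    /* models bootmem_resource_free */

static void res_free(struct res *r)
{
        if (!r)
                return;
        if (r->from_bootmem) {
                /* the slab allocator never owned this memory: park it for reuse */
                r->sibling = recycled;
                recycled = r;
        } else {
                free(r);
        }
}

static struct res *res_alloc(void)
{
        struct res *r = recycled;

        if (r) {
                recycled = r->sibling;
                r->sibling = NULL;              /* clear the reused field */
        } else {
                r = calloc(1, sizeof(*r));      /* models kzalloc() */
        }
        return r;
}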
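
The middle of release_mem_region_adjustable() above reduces to four outcomes for a busy entry [rs, re] and a requested range [s, e]: free the whole entry, trim its front, trim its back, or split it in two. The small standalone program below walks those branches; the numeric ranges are arbitrary examples.

#include <stdio.h>

static void classify(unsigned long rs, unsigned long re,
                     unsigned long s, unsigned long e)
{
        if (s < rs || e > re)
                printf("[%lu,%lu] does not fit in [%lu,%lu]: keep searching (eventually -EINVAL)\n",
                       s, e, rs, re);
        else if (s == rs && e == re)
                printf("exact match: unlink and free the entry\n");
        else if (s == rs)
                printf("trim front: entry becomes [%lu,%lu]\n", e + 1, re);
        else if (e == re)
                printf("trim back: entry becomes [%lu,%lu]\n", rs, s - 1);
        else
                printf("split: entries become [%lu,%lu] and [%lu,%lu]\n",
                       rs, s - 1, e + 1, re);
}

int main(void)
{
        classify(0, 1023, 0, 1023);     /* release the whole entry */
        classify(0, 1023, 0, 255);      /* trim the front */
        classify(0, 1023, 768, 1023);   /* trim the back */
        classify(0, 1023, 256, 511);    /* split into two entries */
        return 0;
}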
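
Returning to the admin reserve patch at the top of this section: init_admin_reserve() sets sysctl_admin_reserve_kbytes to min(free_kbytes / 32, 1UL << 13), i.e. roughly 3% of free memory capped at 8 MiB, and __vm_enough_memory() converts that back to pages with >> (PAGE_SHIFT - 10), so 8192 kB becomes 2048 pages of 4 KiB. The claim that systems with more than 256MB free settle at the 8MB cap can be checked with a few sample figures; the free-memory values below are arbitrary.

#include <stdio.h>

int main(void)
{
        /* mirrors init_admin_reserve(): min(free_kbytes / 32, 1UL << 13) */
        unsigned long samples[] = { 32768, 131072, 262144, 1048576 };   /* kB free */

        for (int i = 0; i < 4; i++) {
                unsigned long reserve = samples[i] / 32;

                if (reserve > (1UL << 13))
                        reserve = 1UL << 13;
                printf("%8lu kB free -> admin reserve %4lu kB (%lu pages of 4 KiB)\n",
                       samples[i], reserve, reserve >> 2);
        }
        return 0;
}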