From af8e3354b4bbd1ee5a3a55d11a5e1fe37e77f0ba Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 14 Dec 2009 17:58:59 -0800 Subject: mm: CONFIG_MMU for PG_mlocked Remove three degrees of obfuscation, left over from when we had CONFIG_UNEVICTABLE_LRU. MLOCK_PAGES is CONFIG_HAVE_MLOCKED_PAGE_BIT is CONFIG_HAVE_MLOCK is CONFIG_MMU. rmap.o (and memory-failure.o) are only built when CONFIG_MMU, so don't need such conditions at all. Somehow, I feel no compulsion to remove the CONFIG_HAVE_MLOCK* lines from 169 defconfigs: leave those to evolve in due course. Signed-off-by: Hugh Dickins Cc: Izik Eidus Cc: Andrea Arcangeli Cc: Nick Piggin Reviewed-by: KOSAKI Motohiro Cc: Rik van Riel Cc: Lee Schermerhorn Cc: Andi Kleen Cc: KAMEZAWA Hiroyuki Cc: Wu Fengguang Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 8 -------- 1 file changed, 8 deletions(-) (limited to 'mm/Kconfig') diff --git a/mm/Kconfig b/mm/Kconfig index 44cf6f0a3a6..77b4980d614 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -200,14 +200,6 @@ config VIRT_TO_BUS def_bool y depends on !ARCH_NO_VIRT_TO_BUS -config HAVE_MLOCK - bool - default y if MMU=y - -config HAVE_MLOCKED_PAGE_BIT - bool - default y if HAVE_MLOCK=y - config MMU_NOTIFIER bool -- cgit v1.2.3-70-g09d2 From a70caa8ba48f21f46d3b4e71b6b8d14080bbd57a Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 14 Dec 2009 17:59:02 -0800 Subject: mm: stop ptlock enlarging struct page CONFIG_DEBUG_SPINLOCK adds 12 or 16 bytes to a 32- or 64-bit spinlock_t, and CONFIG_DEBUG_LOCK_ALLOC adds another 12 or 24 bytes to it: lockdep enables both of those, and CONFIG_LOCK_STAT adds 8 or 16 bytes to that. When 2.6.15 placed the split page table lock inside struct page (usually sized 32 or 56 bytes), only CONFIG_DEBUG_SPINLOCK was a possibility, and we ignored the enlargement (but fitted in CONFIG_GENERIC_LOCKBREAK's 4 by letting the spinlock_t occupy both page->private and page->mapping). Should these debugging options be allowed to double the size of a struct page, when only one minority use of the page (as a page table) needs to fit a spinlock in there? Perhaps not. Take the easy way out: switch off SPLIT_PTLOCK_CPUS when DEBUG_SPINLOCK or DEBUG_LOCK_ALLOC is in force. I've sometimes tried to be cleverer, kmallocing a cacheline for the spinlock when it doesn't fit, but given up each time. Falling back to mm->page_table_lock (as we do when ptlock is not split) lets lockdep check out the strictest path anyway. And now that some arches allow 8192 cpus, use 999999 for infinity. (What has this got to do with KSM swapping? It doesn't care about the size of struct page, but may care about random junk in page->mapping - to be explained separately later.) Signed-off-by: Hugh Dickins Cc: Izik Eidus Cc: Andrea Arcangeli Cc: Nick Piggin Cc: KOSAKI Motohiro Cc: Rik van Riel Cc: Lee Schermerhorn Cc: Andi Kleen Cc: KAMEZAWA Hiroyuki Cc: Wu Fengguang Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'mm/Kconfig') diff --git a/mm/Kconfig b/mm/Kconfig index 77b4980d614..d4b5fff6ea0 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -158,11 +158,13 @@ config PAGEFLAGS_EXTENDED # Default to 4 for wider testing, though 8 might be more appropriate. # ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock. # PA-RISC 7xxx's spinlock_t would enlarge struct page from 32 to 44 bytes. +# DEBUG_SPINLOCK and DEBUG_LOCK_ALLOC spinlock_t also enlarge struct page. # config SPLIT_PTLOCK_CPUS int - default "4096" if ARM && !CPU_CACHE_VIPT - default "4096" if PARISC && !PA20 + default "999999" if ARM && !CPU_CACHE_VIPT + default "999999" if PARISC && !PA20 + default "999999" if DEBUG_SPINLOCK || DEBUG_LOCK_ALLOC default "4" # -- cgit v1.2.3-70-g09d2 From d0f209f68f80f9a152799760c230019e7f270b2a Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 14 Dec 2009 17:59:34 -0800 Subject: ksm: remove unswappable max_kernel_pages Now that ksm pages are swappable, and the known holes plugged, remove mention of unswappable kernel pages from KSM documentation and comments. Remove the totalram_pages/4 initialization of max_kernel_pages. In fact, remove max_kernel_pages altogether - we can reinstate it if removal turns out to break someone's script; but if we later want to limit KSM's memory usage, limiting the stable nodes would not be an effective approach. Signed-off-by: Hugh Dickins Cc: Izik Eidus Cc: Andrea Arcangeli Cc: Chris Wright Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/ksm.txt | 22 +++++++--------------- mm/Kconfig | 2 +- mm/ksm.c | 41 ++--------------------------------------- 3 files changed, 10 insertions(+), 55 deletions(-) (limited to 'mm/Kconfig') diff --git a/Documentation/vm/ksm.txt b/Documentation/vm/ksm.txt index 262d8e6793a..b392e496f81 100644 --- a/Documentation/vm/ksm.txt +++ b/Documentation/vm/ksm.txt @@ -16,9 +16,9 @@ by sharing the data common between them. But it can be useful to any application which generates many instances of the same data. KSM only merges anonymous (private) pages, never pagecache (file) pages. -KSM's merged pages are at present locked into kernel memory for as long -as they are shared: so cannot be swapped out like the user pages they -replace (but swapping KSM pages should follow soon in a later release). +KSM's merged pages were originally locked into kernel memory, but can now +be swapped out just like other user pages (but sharing is broken when they +are swapped back in: ksmd must rediscover their identity and merge again). KSM only operates on those areas of address space which an application has advised to be likely candidates for merging, by using the madvise(2) @@ -44,20 +44,12 @@ includes unmapped gaps (though working on the intervening mapped areas), and might fail with EAGAIN if not enough memory for internal structures. Applications should be considerate in their use of MADV_MERGEABLE, -restricting its use to areas likely to benefit. KSM's scans may use -a lot of processing power, and its kernel-resident pages are a limited -resource. Some installations will disable KSM for these reasons. +restricting its use to areas likely to benefit. KSM's scans may use a lot +of processing power: some installations will disable KSM for that reason. The KSM daemon is controlled by sysfs files in /sys/kernel/mm/ksm/, readable by all but writable only by root: -max_kernel_pages - set to maximum number of kernel pages that KSM may use - e.g. "echo 100000 > /sys/kernel/mm/ksm/max_kernel_pages" - Value 0 imposes no limit on the kernel pages KSM may use; - but note that any process using MADV_MERGEABLE can cause - KSM to allocate these pages, unswappable until it exits. - Default: quarter of memory (chosen to not pin too much) - pages_to_scan - how many present pages to scan before ksmd goes to sleep e.g. "echo 100 > /sys/kernel/mm/ksm/pages_to_scan" Default: 100 (chosen for demonstration purposes) @@ -75,7 +67,7 @@ run - set 0 to stop ksmd from running but keep merged pages, The effectiveness of KSM and MADV_MERGEABLE is shown in /sys/kernel/mm/ksm/: -pages_shared - how many shared unswappable kernel pages KSM is using +pages_shared - how many shared pages are being used pages_sharing - how many more sites are sharing them i.e. how much saved pages_unshared - how many pages unique but repeatedly checked for merging pages_volatile - how many pages changing too fast to be placed in a tree @@ -87,4 +79,4 @@ pages_volatile embraces several different kinds of activity, but a high proportion there would also indicate poor use of madvise MADV_MERGEABLE. Izik Eidus, -Hugh Dickins, 24 Sept 2009 +Hugh Dickins, 17 Nov 2009 diff --git a/mm/Kconfig b/mm/Kconfig index d4b5fff6ea0..2310984591e 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -212,7 +212,7 @@ config KSM Enable Kernel Samepage Merging: KSM periodically scans those areas of an application's address space that an app has advised may be mergeable. When it finds pages of identical content, it replaces - the many instances by a single resident page with that content, so + the many instances by a single page with that content, so saving memory until one or another app needs to modify the content. Recommended for use with KVM, or with other duplicative applications. See Documentation/vm/ksm.txt for more information: KSM is inactive diff --git a/mm/ksm.c b/mm/ksm.c index d4c228a9d27..56a0da1f997 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -179,9 +179,6 @@ static unsigned long ksm_pages_unshared; /* The number of rmap_items in use: to calculate pages_volatile */ static unsigned long ksm_rmap_items; -/* Limit on the number of unswappable pages used */ -static unsigned long ksm_max_kernel_pages; - /* Number of pages ksmd should scan in one batch */ static unsigned int ksm_thread_pages_to_scan = 100; @@ -943,14 +940,6 @@ static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item, { int err; - /* - * The number of nodes in the stable tree - * is the number of kernel pages that we hold. - */ - if (ksm_max_kernel_pages && - ksm_max_kernel_pages <= ksm_pages_shared) - return NULL; - err = try_to_merge_with_ksm_page(rmap_item, page, NULL); if (!err) { err = try_to_merge_with_ksm_page(tree_rmap_item, @@ -1850,8 +1839,8 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, /* * KSM_RUN_MERGE sets ksmd running, and 0 stops it running. * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items, - * breaking COW to free the unswappable pages_shared (but leaves - * mm_slots on the list for when ksmd may be set running again). + * breaking COW to free the pages_shared (but leaves mm_slots + * on the list for when ksmd may be set running again). */ mutex_lock(&ksm_thread_mutex); @@ -1876,29 +1865,6 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, } KSM_ATTR(run); -static ssize_t max_kernel_pages_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) -{ - int err; - unsigned long nr_pages; - - err = strict_strtoul(buf, 10, &nr_pages); - if (err) - return -EINVAL; - - ksm_max_kernel_pages = nr_pages; - - return count; -} - -static ssize_t max_kernel_pages_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - return sprintf(buf, "%lu\n", ksm_max_kernel_pages); -} -KSM_ATTR(max_kernel_pages); - static ssize_t pages_shared_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -1948,7 +1914,6 @@ static struct attribute *ksm_attrs[] = { &sleep_millisecs_attr.attr, &pages_to_scan_attr.attr, &run_attr.attr, - &max_kernel_pages_attr.attr, &pages_shared_attr.attr, &pages_sharing_attr.attr, &pages_unshared_attr.attr, @@ -1968,8 +1933,6 @@ static int __init ksm_init(void) struct task_struct *ksm_thread; int err; - ksm_max_kernel_pages = totalram_pages / 4; - err = ksm_slab_init(); if (err) goto out; -- cgit v1.2.3-70-g09d2 From 478c5ffc0b50527bd2390f2daa46cc16276b8413 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 16 Dec 2009 12:19:59 +0100 Subject: HWPOISON: add page flags filter When specified, only poison pages if ((page_flags & mask) == value). - corrupt-filter-flags-mask - corrupt-filter-flags-value This allows stress testing of many kinds of pages. Strictly speaking, the buddy pages requires taking zone lock, to avoid setting PG_hwpoison on a "was buddy but now allocated to someone" page. However we can just do nothing because we set PG_locked in the beginning, this prevents the page allocator from allocating it to someone. (It will BUG() on the unexpected PG_locked, which is fine for hwpoison testing.) [AK: Add select PROC_PAGE_MONITOR to satisfy dependency] CC: Nick Piggin Signed-off-by: Wu Fengguang Signed-off-by: Andi Kleen --- Documentation/vm/hwpoison.txt | 10 ++++++++++ mm/Kconfig | 1 + mm/hwpoison-inject.c | 10 ++++++++++ mm/internal.h | 2 ++ mm/memory-failure.c | 20 ++++++++++++++++++++ 5 files changed, 43 insertions(+) (limited to 'mm/Kconfig') diff --git a/Documentation/vm/hwpoison.txt b/Documentation/vm/hwpoison.txt index 4ef7bb30d15..f454d3cd4d6 100644 --- a/Documentation/vm/hwpoison.txt +++ b/Documentation/vm/hwpoison.txt @@ -123,6 +123,16 @@ Only handle memory failures to pages associated with the file system defined by block device major/minor. -1U is the wildcard value. This should be only used for testing with artificial injection. + +corrupt-filter-flags-mask +corrupt-filter-flags-value + +When specified, only poison pages if ((page_flags & mask) == value). +This allows stress testing of many kinds of pages. The page_flags +are the same as in /proc/kpageflags. The flag bits are defined in +include/linux/kernel-page-flags.h and documented in +Documentation/vm/pagemap.txt + Architecture specific MCE injector x86 has mce-inject, mce-test diff --git a/mm/Kconfig b/mm/Kconfig index 2310984591e..8cea7fde06e 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -253,6 +253,7 @@ config MEMORY_FAILURE config HWPOISON_INJECT tristate "Poison pages injector" depends on MEMORY_FAILURE && DEBUG_KERNEL + select PROC_PAGE_MONITOR config NOMMU_INITIAL_TRIM_EXCESS int "Turn on mmap() excess space trimming before booting" diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index 2b6b3200fa6..c4dfd89f654 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c @@ -102,6 +102,16 @@ static int pfn_inject_init(void) if (!dentry) goto fail; + dentry = debugfs_create_u64("corrupt-filter-flags-mask", 0600, + hwpoison_dir, &hwpoison_filter_flags_mask); + if (!dentry) + goto fail; + + dentry = debugfs_create_u64("corrupt-filter-flags-value", 0600, + hwpoison_dir, &hwpoison_filter_flags_value); + if (!dentry) + goto fail; + return 0; fail: pfn_inject_exit(); diff --git a/mm/internal.h b/mm/internal.h index 04bbce8b8ba..b2027c73119 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -255,3 +255,5 @@ extern int hwpoison_filter(struct page *p); extern u32 hwpoison_filter_dev_major; extern u32 hwpoison_filter_dev_minor; +extern u64 hwpoison_filter_flags_mask; +extern u64 hwpoison_filter_flags_value; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 82ac73436d0..22d2b2028e5 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -50,8 +51,12 @@ atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0); u32 hwpoison_filter_dev_major = ~0U; u32 hwpoison_filter_dev_minor = ~0U; +u64 hwpoison_filter_flags_mask; +u64 hwpoison_filter_flags_value; EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major); EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor); +EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask); +EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value); static int hwpoison_filter_dev(struct page *p) { @@ -83,11 +88,26 @@ static int hwpoison_filter_dev(struct page *p) return 0; } +static int hwpoison_filter_flags(struct page *p) +{ + if (!hwpoison_filter_flags_mask) + return 0; + + if ((stable_page_flags(p) & hwpoison_filter_flags_mask) == + hwpoison_filter_flags_value) + return 0; + else + return -EINVAL; +} + int hwpoison_filter(struct page *p) { if (hwpoison_filter_dev(p)) return -EINVAL; + if (hwpoison_filter_flags(p)) + return -EINVAL; + return 0; } EXPORT_SYMBOL_GPL(hwpoison_filter); -- cgit v1.2.3-70-g09d2 From 413f9efbc513d330f00352bb7cba060a729999d3 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 16 Dec 2009 12:20:00 +0100 Subject: HWPOISON: mention HWPoison in Kconfig entry Signed-off-by: Andi Kleen --- mm/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/Kconfig') diff --git a/mm/Kconfig b/mm/Kconfig index 8cea7fde06e..43ea8c3a2bb 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -251,7 +251,7 @@ config MEMORY_FAILURE special hardware support and typically ECC memory. config HWPOISON_INJECT - tristate "Poison pages injector" + tristate "HWPoison pages injector" depends on MEMORY_FAILURE && DEBUG_KERNEL select PROC_PAGE_MONITOR -- cgit v1.2.3-70-g09d2 From 6e1415467614e854fee660ff6648bd10fa976e95 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 15 Dec 2009 19:27:45 +0000 Subject: NOMMU: Optimise away the {dac_,}mmap_min_addr tests In NOMMU mode clamp dac_mmap_min_addr to zero to cause the tests on it to be skipped by the compiler. We do this as the minimum mmap address doesn't make any sense in NOMMU mode. mmap_min_addr and round_hint_to_min() can be discarded entirely in NOMMU mode. Signed-off-by: David Howells Acked-by: Eric Paris Signed-off-by: James Morris --- include/linux/security.h | 7 +++++++ kernel/sysctl.c | 2 ++ mm/Kconfig | 1 + security/Makefile | 3 ++- 4 files changed, 12 insertions(+), 1 deletion(-) (limited to 'mm/Kconfig') diff --git a/include/linux/security.h b/include/linux/security.h index 466cbadbd1e..2c627d361c0 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -95,8 +95,13 @@ struct seq_file; extern int cap_netlink_send(struct sock *sk, struct sk_buff *skb); extern int cap_netlink_recv(struct sk_buff *skb, int cap); +#ifdef CONFIG_MMU extern unsigned long mmap_min_addr; extern unsigned long dac_mmap_min_addr; +#else +#define dac_mmap_min_addr 0UL +#endif + /* * Values used in the task_security_ops calls */ @@ -121,6 +126,7 @@ struct request_sock; #define LSM_UNSAFE_PTRACE 2 #define LSM_UNSAFE_PTRACE_CAP 4 +#ifdef CONFIG_MMU /* * If a hint addr is less than mmap_min_addr change hint to be as * low as possible but still greater than mmap_min_addr @@ -135,6 +141,7 @@ static inline unsigned long round_hint_to_min(unsigned long hint) } extern int mmap_min_addr_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); +#endif #ifdef CONFIG_SECURITY diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 45e4bef0012..856a24eadf7 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1214,6 +1214,7 @@ static struct ctl_table vm_table[] = { .proc_handler = proc_dointvec_jiffies, }, #endif +#ifdef CONFIG_MMU { .procname = "mmap_min_addr", .data = &dac_mmap_min_addr, @@ -1221,6 +1222,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = mmap_min_addr_handler, }, +#endif #ifdef CONFIG_NUMA { .procname = "numa_zonelist_order", diff --git a/mm/Kconfig b/mm/Kconfig index 43ea8c3a2bb..ee9f3e0f2b6 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -221,6 +221,7 @@ config KSM config DEFAULT_MMAP_MIN_ADDR int "Low address space to protect from user allocation" + depends on MMU default 4096 help This is the portion of low virtual memory which should be protected diff --git a/security/Makefile b/security/Makefile index bb44e350c61..da20a193c8d 100644 --- a/security/Makefile +++ b/security/Makefile @@ -8,7 +8,8 @@ subdir-$(CONFIG_SECURITY_SMACK) += smack subdir-$(CONFIG_SECURITY_TOMOYO) += tomoyo # always enable default capabilities -obj-y += commoncap.o min_addr.o +obj-y += commoncap.o +obj-$(CONFIG_MMU) += min_addr.o # Object file lists obj-$(CONFIG_SECURITY) += security.o capability.o -- cgit v1.2.3-70-g09d2 From 27df5068e24f2f88de98e95eb6e8dbc9800bf80e Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 21 Dec 2009 19:56:42 +0100 Subject: HWPOISON: Add PROC_FS dependency to hwpoison injector v2 The injector filter requires stable_page_flags() which is supplied by procfs. So make it dependent on that. Also add ifdefs around the filter code in memory-failure.c so that when the filter is disabled due to missing dependencies the whole code still builds. Reported-by: Ingo Molnar Signed-off-by: Andi Kleen --- mm/Kconfig | 2 +- mm/memory-failure.c | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'mm/Kconfig') diff --git a/mm/Kconfig b/mm/Kconfig index ee9f3e0f2b6..17b8947aa7d 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -253,7 +253,7 @@ config MEMORY_FAILURE config HWPOISON_INJECT tristate "HWPoison pages injector" - depends on MEMORY_FAILURE && DEBUG_KERNEL + depends on MEMORY_FAILURE && DEBUG_KERNEL && PROC_FS select PROC_PAGE_MONITOR config NOMMU_INITIAL_TRIM_EXCESS diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 6a0466ed5bf..17299fd4577 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -52,6 +52,8 @@ int sysctl_memory_failure_recovery __read_mostly = 1; atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0); +#if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE) + u32 hwpoison_filter_enable = 0; u32 hwpoison_filter_dev_major = ~0U; u32 hwpoison_filter_dev_minor = ~0U; @@ -164,6 +166,13 @@ int hwpoison_filter(struct page *p) return 0; } +#else +int hwpoison_filter(struct page *p) +{ + return 0; +} +#endif + EXPORT_SYMBOL_GPL(hwpoison_filter); /* -- cgit v1.2.3-70-g09d2