From 7e85ee0c1d15ca5f8bff0f514f158eba1742dd87 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Fri, 12 Jun 2009 14:03:06 +0300 Subject: slab,slub: don't enable interrupts during early boot As explained by Benjamin Herrenschmidt: Oh and btw, your patch alone doesn't fix powerpc, because it's missing a whole bunch of GFP_KERNEL's in the arch code... You would have to grep the entire kernel for things that check slab_is_available() and even then you'll be missing some. For example, slab_is_available() didn't always exist, and so in the early days on powerpc, we used a mem_init_done global that is set form mem_init() (not perfect but works in practice). And we still have code using that to do the test. Therefore, mask out __GFP_WAIT, __GFP_IO, and __GFP_FS in the slab allocators in early boot code to avoid enabling interrupts. Signed-off-by: Pekka Enberg --- include/linux/gfp.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux/gfp.h') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 0bbc15f5453..3760e7c5de0 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -85,6 +85,9 @@ struct vm_area_struct; __GFP_NOWARN|__GFP_REPEAT|__GFP_NOFAIL|\ __GFP_NORETRY|__GFP_NOMEMALLOC) +/* Control slab gfp mask during early boot */ +#define SLAB_GFP_BOOT_MASK __GFP_BITS_MASK & ~(__GFP_WAIT|__GFP_IO|__GFP_FS) + /* Control allocation constraints */ #define GFP_CONSTRAINT_MASK (__GFP_HARDWALL|__GFP_THISNODE) -- cgit v1.2.3-70-g09d2 From 2dff440525f8faba8836e9f05297b76f23b4af30 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Sat, 31 May 2008 15:56:17 +0200 Subject: kmemcheck: add mm functions With kmemcheck enabled, the slab allocator needs to do this: 1. Tell kmemcheck to allocate the shadow memory which stores the status of each byte in the allocation proper, e.g. whether it is initialized or uninitialized. 2. Tell kmemcheck which parts of memory that should be marked uninitialized. There are actually a few more states, such as "not yet allocated" and "recently freed". If a slab cache is set up using the SLAB_NOTRACK flag, it will never return memory that can take page faults because of kmemcheck. If a slab cache is NOT set up using the SLAB_NOTRACK flag, callers can still request memory with the __GFP_NOTRACK flag. This does not prevent the page faults from occuring, however, but marks the object in question as being initialized so that no warnings will ever be produced for this object. In addition to (and in contrast to) __GFP_NOTRACK, the __GFP_NOTRACK_FALSE_POSITIVE flag indicates that the allocation should not be tracked _because_ it would produce a false positive. Their values are identical, but need not be so in the future (for example, we could now enable/disable false positives with a config option). Parts of this patch were contributed by Pekka Enberg but merged for atomicity. Signed-off-by: Vegard Nossum Signed-off-by: Pekka Enberg Signed-off-by: Ingo Molnar [rebased for mainline inclusion] Signed-off-by: Vegard Nossum --- arch/x86/kernel/process.c | 2 +- include/linux/gfp.h | 9 +++- include/linux/kmemcheck.h | 47 +++++++++++++++++++++ include/linux/slab.h | 7 ++++ kernel/fork.c | 14 +++---- mm/Makefile | 1 + mm/kmemcheck.c | 103 ++++++++++++++++++++++++++++++++++++++++++++++ 7 files changed, 174 insertions(+), 9 deletions(-) create mode 100644 mm/kmemcheck.c (limited to 'include/linux/gfp.h') diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 3bb2be1649b..994dd6a4a2a 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -63,7 +63,7 @@ void arch_task_cache_init(void) task_xstate_cachep = kmem_cache_create("task_xstate", xstate_size, __alignof__(union thread_xstate), - SLAB_PANIC, NULL); + SLAB_PANIC | SLAB_NOTRACK, NULL); } /* diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 0bbc15f5453..daeaa8fe1bb 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -51,8 +51,15 @@ struct vm_area_struct; #define __GFP_THISNODE ((__force gfp_t)0x40000u)/* No fallback, no policies */ #define __GFP_RECLAIMABLE ((__force gfp_t)0x80000u) /* Page is reclaimable */ #define __GFP_MOVABLE ((__force gfp_t)0x100000u) /* Page is movable */ +#define __GFP_NOTRACK ((__force gfp_t)0x200000u) /* Don't track with kmemcheck */ -#define __GFP_BITS_SHIFT 21 /* Room for 21 __GFP_FOO bits */ +/* + * This may seem redundant, but it's a way of annotating false positives vs. + * allocations that simply cannot be supported (e.g. page tables). + */ +#define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK) + +#define __GFP_BITS_SHIFT 22 /* Room for 22 __GFP_FOO bits */ #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) /* This equals 0, but use constants in case they ever change */ diff --git a/include/linux/kmemcheck.h b/include/linux/kmemcheck.h index 39480c91b2f..5b65f4ebead 100644 --- a/include/linux/kmemcheck.h +++ b/include/linux/kmemcheck.h @@ -7,11 +7,58 @@ #ifdef CONFIG_KMEMCHECK extern int kmemcheck_enabled; +/* The slab-related functions. */ +void kmemcheck_alloc_shadow(struct kmem_cache *s, gfp_t flags, int node, + struct page *page, int order); +void kmemcheck_free_shadow(struct kmem_cache *s, struct page *page, int order); +void kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object, + size_t size); +void kmemcheck_slab_free(struct kmem_cache *s, void *object, size_t size); + +void kmemcheck_show_pages(struct page *p, unsigned int n); +void kmemcheck_hide_pages(struct page *p, unsigned int n); + +bool kmemcheck_page_is_tracked(struct page *p); + +void kmemcheck_mark_unallocated(void *address, unsigned int n); +void kmemcheck_mark_uninitialized(void *address, unsigned int n); +void kmemcheck_mark_initialized(void *address, unsigned int n); +void kmemcheck_mark_freed(void *address, unsigned int n); + +void kmemcheck_mark_unallocated_pages(struct page *p, unsigned int n); +void kmemcheck_mark_uninitialized_pages(struct page *p, unsigned int n); + int kmemcheck_show_addr(unsigned long address); int kmemcheck_hide_addr(unsigned long address); #else #define kmemcheck_enabled 0 +static inline void +kmemcheck_alloc_shadow(struct kmem_cache *s, gfp_t flags, int node, + struct page *page, int order) +{ +} + +static inline void +kmemcheck_free_shadow(struct kmem_cache *s, struct page *page, int order) +{ +} + +static inline void +kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object, + size_t size) +{ +} + +static inline void kmemcheck_slab_free(struct kmem_cache *s, void *object, + size_t size) +{ +} + +static inline bool kmemcheck_page_is_tracked(struct page *p) +{ + return false; +} #endif /* CONFIG_KMEMCHECK */ #endif /* LINUX_KMEMCHECK_H */ diff --git a/include/linux/slab.h b/include/linux/slab.h index 48803064ced..e339fcf17cd 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -64,6 +64,13 @@ #define SLAB_NOLEAKTRACE 0x00800000UL /* Avoid kmemleak tracing */ +/* Don't track use of uninitialized memory */ +#ifdef CONFIG_KMEMCHECK +# define SLAB_NOTRACK 0x01000000UL +#else +# define SLAB_NOTRACK 0x00000000UL +#endif + /* The following flags affect the page allocator grouping pages by mobility */ #define SLAB_RECLAIM_ACCOUNT 0x00020000UL /* Objects are reclaimable */ #define SLAB_TEMPORARY SLAB_RECLAIM_ACCOUNT /* Objects are short-lived */ diff --git a/kernel/fork.c b/kernel/fork.c index 4430eb1376f..be022c200da 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -178,7 +178,7 @@ void __init fork_init(unsigned long mempages) /* create a slab on which task_structs can be allocated */ task_struct_cachep = kmem_cache_create("task_struct", sizeof(struct task_struct), - ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL); + ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL); #endif /* do the arch specific task caches init */ @@ -1470,20 +1470,20 @@ void __init proc_caches_init(void) { sighand_cachep = kmem_cache_create("sighand_cache", sizeof(struct sighand_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU, - sighand_ctor); + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU| + SLAB_NOTRACK, sighand_ctor); signal_cachep = kmem_cache_create("signal_cache", sizeof(struct signal_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); files_cachep = kmem_cache_create("files_cache", sizeof(struct files_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); fs_cachep = kmem_cache_create("fs_cache", sizeof(struct fs_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); mm_cachep = kmem_cache_create("mm_struct", sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC); mmap_init(); } diff --git a/mm/Makefile b/mm/Makefile index e89acb090b4..c379ce08354 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -27,6 +27,7 @@ obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o obj-$(CONFIG_SLAB) += slab.o obj-$(CONFIG_SLUB) += slub.o +obj-$(CONFIG_KMEMCHECK) += kmemcheck.o obj-$(CONFIG_FAILSLAB) += failslab.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o obj-$(CONFIG_FS_XIP) += filemap_xip.o diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c new file mode 100644 index 00000000000..eaa41b80261 --- /dev/null +++ b/mm/kmemcheck.c @@ -0,0 +1,103 @@ +#include +#include +#include +#include + +void kmemcheck_alloc_shadow(struct kmem_cache *s, gfp_t flags, int node, + struct page *page, int order) +{ + struct page *shadow; + int pages; + int i; + + pages = 1 << order; + + /* + * With kmemcheck enabled, we need to allocate a memory area for the + * shadow bits as well. + */ + shadow = alloc_pages_node(node, flags, order); + if (!shadow) { + if (printk_ratelimit()) + printk(KERN_ERR "kmemcheck: failed to allocate " + "shadow bitmap\n"); + return; + } + + for(i = 0; i < pages; ++i) + page[i].shadow = page_address(&shadow[i]); + + /* + * Mark it as non-present for the MMU so that our accesses to + * this memory will trigger a page fault and let us analyze + * the memory accesses. + */ + kmemcheck_hide_pages(page, pages); + + /* + * Objects from caches that have a constructor don't get + * cleared when they're allocated, so we need to do it here. + */ + if (s->ctor) + kmemcheck_mark_uninitialized_pages(page, pages); + else + kmemcheck_mark_unallocated_pages(page, pages); +} + +void kmemcheck_free_shadow(struct kmem_cache *s, struct page *page, int order) +{ + struct page *shadow; + int pages; + int i; + + pages = 1 << order; + + kmemcheck_show_pages(page, pages); + + shadow = virt_to_page(page[0].shadow); + + for(i = 0; i < pages; ++i) + page[i].shadow = NULL; + + __free_pages(shadow, order); +} + +void kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object, + size_t size) +{ + /* + * Has already been memset(), which initializes the shadow for us + * as well. + */ + if (gfpflags & __GFP_ZERO) + return; + + /* No need to initialize the shadow of a non-tracked slab. */ + if (s->flags & SLAB_NOTRACK) + return; + + if (!kmemcheck_enabled || gfpflags & __GFP_NOTRACK) { + /* + * Allow notracked objects to be allocated from + * tracked caches. Note however that these objects + * will still get page faults on access, they just + * won't ever be flagged as uninitialized. If page + * faults are not acceptable, the slab cache itself + * should be marked NOTRACK. + */ + kmemcheck_mark_initialized(object, size); + } else if (!s->ctor) { + /* + * New objects should be marked uninitialized before + * they're returned to the called. + */ + kmemcheck_mark_uninitialized(object, size); + } +} + +void kmemcheck_slab_free(struct kmem_cache *s, void *object, size_t size) +{ + /* TODO: RCU freeing is unsupported for now; hide false positives. */ + if (!s->ctor && !(s->flags & SLAB_DESTROY_BY_RCU)) + kmemcheck_mark_freed(object, size); +} -- cgit v1.2.3-70-g09d2 From b1eeab67682a5e397aecf172046b3a8bd4808ae4 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Tue, 25 Nov 2008 16:55:53 +0100 Subject: kmemcheck: add hooks for the page allocator This adds support for tracking the initializedness of memory that was allocated with the page allocator. Highmem requests are not tracked. Cc: Dave Hansen Acked-by: Pekka Enberg [build fix for !CONFIG_KMEMCHECK] Signed-off-by: Ingo Molnar [rebased for mainline inclusion] Signed-off-by: Vegard Nossum --- arch/x86/include/asm/thread_info.h | 4 ++-- arch/x86/mm/kmemcheck/shadow.c | 8 +++++++ include/linux/gfp.h | 5 +++++ include/linux/kmemcheck.h | 35 ++++++++++++++++++++++++----- mm/kmemcheck.c | 45 +++++++++++++++++++++++++++----------- mm/page_alloc.c | 18 +++++++++++++++ mm/slab.c | 15 ++++++++----- mm/slub.c | 23 ++++++++++++++----- 8 files changed, 122 insertions(+), 31 deletions(-) (limited to 'include/linux/gfp.h') diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 602c769fc98..b0783520988 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -154,9 +154,9 @@ struct thread_info { /* thread information allocation */ #ifdef CONFIG_DEBUG_STACK_USAGE -#define THREAD_FLAGS (GFP_KERNEL | __GFP_ZERO) +#define THREAD_FLAGS (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO) #else -#define THREAD_FLAGS GFP_KERNEL +#define THREAD_FLAGS (GFP_KERNEL | __GFP_NOTRACK) #endif #define __HAVE_ARCH_THREAD_INFO_ALLOCATOR diff --git a/arch/x86/mm/kmemcheck/shadow.c b/arch/x86/mm/kmemcheck/shadow.c index e7346d3873b..e773b6bd007 100644 --- a/arch/x86/mm/kmemcheck/shadow.c +++ b/arch/x86/mm/kmemcheck/shadow.c @@ -116,6 +116,14 @@ void kmemcheck_mark_uninitialized_pages(struct page *p, unsigned int n) kmemcheck_mark_uninitialized(page_address(&p[i]), PAGE_SIZE); } +void kmemcheck_mark_initialized_pages(struct page *p, unsigned int n) +{ + unsigned int i; + + for (i = 0; i < n; ++i) + kmemcheck_mark_initialized(page_address(&p[i]), PAGE_SIZE); +} + enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size) { uint8_t *x; diff --git a/include/linux/gfp.h b/include/linux/gfp.h index daeaa8fe1bb..3885e7f7556 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -51,7 +51,12 @@ struct vm_area_struct; #define __GFP_THISNODE ((__force gfp_t)0x40000u)/* No fallback, no policies */ #define __GFP_RECLAIMABLE ((__force gfp_t)0x80000u) /* Page is reclaimable */ #define __GFP_MOVABLE ((__force gfp_t)0x100000u) /* Page is movable */ + +#ifdef CONFIG_KMEMCHECK #define __GFP_NOTRACK ((__force gfp_t)0x200000u) /* Don't track with kmemcheck */ +#else +#define __GFP_NOTRACK ((__force gfp_t)0) +#endif /* * This may seem redundant, but it's a way of annotating false positives vs. diff --git a/include/linux/kmemcheck.h b/include/linux/kmemcheck.h index 71f21ae33d1..093d23969b1 100644 --- a/include/linux/kmemcheck.h +++ b/include/linux/kmemcheck.h @@ -8,13 +8,15 @@ extern int kmemcheck_enabled; /* The slab-related functions. */ -void kmemcheck_alloc_shadow(struct kmem_cache *s, gfp_t flags, int node, - struct page *page, int order); -void kmemcheck_free_shadow(struct kmem_cache *s, struct page *page, int order); +void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node); +void kmemcheck_free_shadow(struct page *page, int order); void kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object, size_t size); void kmemcheck_slab_free(struct kmem_cache *s, void *object, size_t size); +void kmemcheck_pagealloc_alloc(struct page *p, unsigned int order, + gfp_t gfpflags); + void kmemcheck_show_pages(struct page *p, unsigned int n); void kmemcheck_hide_pages(struct page *p, unsigned int n); @@ -27,6 +29,7 @@ void kmemcheck_mark_freed(void *address, unsigned int n); void kmemcheck_mark_unallocated_pages(struct page *p, unsigned int n); void kmemcheck_mark_uninitialized_pages(struct page *p, unsigned int n); +void kmemcheck_mark_initialized_pages(struct page *p, unsigned int n); int kmemcheck_show_addr(unsigned long address); int kmemcheck_hide_addr(unsigned long address); @@ -34,13 +37,12 @@ int kmemcheck_hide_addr(unsigned long address); #define kmemcheck_enabled 0 static inline void -kmemcheck_alloc_shadow(struct kmem_cache *s, gfp_t flags, int node, - struct page *page, int order) +kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node) { } static inline void -kmemcheck_free_shadow(struct kmem_cache *s, struct page *page, int order) +kmemcheck_free_shadow(struct page *page, int order) { } @@ -55,6 +57,11 @@ static inline void kmemcheck_slab_free(struct kmem_cache *s, void *object, { } +static inline void kmemcheck_pagealloc_alloc(struct page *p, + unsigned int order, gfp_t gfpflags) +{ +} + static inline bool kmemcheck_page_is_tracked(struct page *p) { return false; @@ -75,6 +82,22 @@ static inline void kmemcheck_mark_initialized(void *address, unsigned int n) static inline void kmemcheck_mark_freed(void *address, unsigned int n) { } + +static inline void kmemcheck_mark_unallocated_pages(struct page *p, + unsigned int n) +{ +} + +static inline void kmemcheck_mark_uninitialized_pages(struct page *p, + unsigned int n) +{ +} + +static inline void kmemcheck_mark_initialized_pages(struct page *p, + unsigned int n) +{ +} + #endif /* CONFIG_KMEMCHECK */ #endif /* LINUX_KMEMCHECK_H */ diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c index eaa41b80261..fd814fd6131 100644 --- a/mm/kmemcheck.c +++ b/mm/kmemcheck.c @@ -1,10 +1,10 @@ +#include #include #include #include #include -void kmemcheck_alloc_shadow(struct kmem_cache *s, gfp_t flags, int node, - struct page *page, int order) +void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node) { struct page *shadow; int pages; @@ -16,7 +16,7 @@ void kmemcheck_alloc_shadow(struct kmem_cache *s, gfp_t flags, int node, * With kmemcheck enabled, we need to allocate a memory area for the * shadow bits as well. */ - shadow = alloc_pages_node(node, flags, order); + shadow = alloc_pages_node(node, flags | __GFP_NOTRACK, order); if (!shadow) { if (printk_ratelimit()) printk(KERN_ERR "kmemcheck: failed to allocate " @@ -33,23 +33,17 @@ void kmemcheck_alloc_shadow(struct kmem_cache *s, gfp_t flags, int node, * the memory accesses. */ kmemcheck_hide_pages(page, pages); - - /* - * Objects from caches that have a constructor don't get - * cleared when they're allocated, so we need to do it here. - */ - if (s->ctor) - kmemcheck_mark_uninitialized_pages(page, pages); - else - kmemcheck_mark_unallocated_pages(page, pages); } -void kmemcheck_free_shadow(struct kmem_cache *s, struct page *page, int order) +void kmemcheck_free_shadow(struct page *page, int order) { struct page *shadow; int pages; int i; + if (!kmemcheck_page_is_tracked(page)) + return; + pages = 1 << order; kmemcheck_show_pages(page, pages); @@ -101,3 +95,28 @@ void kmemcheck_slab_free(struct kmem_cache *s, void *object, size_t size) if (!s->ctor && !(s->flags & SLAB_DESTROY_BY_RCU)) kmemcheck_mark_freed(object, size); } + +void kmemcheck_pagealloc_alloc(struct page *page, unsigned int order, + gfp_t gfpflags) +{ + int pages; + + if (gfpflags & (__GFP_HIGHMEM | __GFP_NOTRACK)) + return; + + pages = 1 << order; + + /* + * NOTE: We choose to track GFP_ZERO pages too; in fact, they + * can become uninitialized by copying uninitialized memory + * into them. + */ + + /* XXX: Can use zone->node for node? */ + kmemcheck_alloc_shadow(page, order, gfpflags, -1); + + if (gfpflags & __GFP_ZERO) + kmemcheck_mark_initialized_pages(page, pages); + else + kmemcheck_mark_uninitialized_pages(page, pages); +} diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 17d5f539a9a..0727896a88a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -546,6 +547,8 @@ static void __free_pages_ok(struct page *page, unsigned int order) int i; int bad = 0; + kmemcheck_free_shadow(page, order); + for (i = 0 ; i < (1 << order) ; ++i) bad += free_pages_check(page + i); if (bad) @@ -994,6 +997,8 @@ static void free_hot_cold_page(struct page *page, int cold) struct per_cpu_pages *pcp; unsigned long flags; + kmemcheck_free_shadow(page, 0); + if (PageAnon(page)) page->mapping = NULL; if (free_pages_check(page)) @@ -1047,6 +1052,16 @@ void split_page(struct page *page, unsigned int order) VM_BUG_ON(PageCompound(page)); VM_BUG_ON(!page_count(page)); + +#ifdef CONFIG_KMEMCHECK + /* + * Split shadow pages too, because free(page[0]) would + * otherwise free the whole shadow. + */ + if (kmemcheck_page_is_tracked(page)) + split_page(virt_to_page(page[0].shadow), order); +#endif + for (i = 1; i < (1 << order); i++) set_page_refcounted(page + i); } @@ -1667,7 +1682,10 @@ nopage: dump_stack(); show_mem(); } + return page; got_pg: + if (kmemcheck_enabled) + kmemcheck_pagealloc_alloc(page, order, gfp_mask); return page; } EXPORT_SYMBOL(__alloc_pages_internal); diff --git a/mm/slab.c b/mm/slab.c index 95b6c5eb40b..6a1ad0b9a94 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1612,7 +1612,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) if (cachep->flags & SLAB_RECLAIM_ACCOUNT) flags |= __GFP_RECLAIMABLE; - page = alloc_pages_node(nodeid, flags, cachep->gfporder); + page = alloc_pages_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder); if (!page) return NULL; @@ -1626,8 +1626,14 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) for (i = 0; i < nr_pages; i++) __SetPageSlab(page + i); - if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) - kmemcheck_alloc_shadow(cachep, flags, nodeid, page, cachep->gfporder); + if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { + kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid); + + if (cachep->ctor) + kmemcheck_mark_uninitialized_pages(page, nr_pages); + else + kmemcheck_mark_unallocated_pages(page, nr_pages); + } return page_address(page); } @@ -1641,8 +1647,7 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr) struct page *page = virt_to_page(addr); const unsigned long nr_freed = i; - if (kmemcheck_page_is_tracked(page)) - kmemcheck_free_shadow(cachep, page, cachep->gfporder); + kmemcheck_free_shadow(page, cachep->gfporder); if (cachep->flags & SLAB_RECLAIM_ACCOUNT) sub_zone_page_state(page_zone(page), diff --git a/mm/slub.c b/mm/slub.c index 1cebaa747ad..898fb5047dc 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1066,6 +1066,8 @@ static inline struct page *alloc_slab_page(gfp_t flags, int node, { int order = oo_order(oo); + flags |= __GFP_NOTRACK; + if (node == -1) return alloc_pages(flags, order); else @@ -1097,7 +1099,18 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) if (kmemcheck_enabled && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { - kmemcheck_alloc_shadow(s, flags, node, page, compound_order(page)); + int pages = 1 << oo_order(oo); + + kmemcheck_alloc_shadow(page, oo_order(oo), flags, node); + + /* + * Objects from caches that have a constructor don't get + * cleared when they're allocated, so we need to do it here. + */ + if (s->ctor) + kmemcheck_mark_uninitialized_pages(page, pages); + else + kmemcheck_mark_unallocated_pages(page, pages); } page->objects = oo_objects(oo); @@ -1173,8 +1186,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page) __ClearPageSlubDebug(page); } - if (kmemcheck_page_is_tracked(page)) - kmemcheck_free_shadow(s, page, compound_order(page)); + kmemcheck_free_shadow(page, compound_order(page)); mod_zone_page_state(page_zone(page), (s->flags & SLAB_RECLAIM_ACCOUNT) ? @@ -2734,9 +2746,10 @@ EXPORT_SYMBOL(__kmalloc); static void *kmalloc_large_node(size_t size, gfp_t flags, int node) { - struct page *page = alloc_pages_node(node, flags | __GFP_COMP, - get_order(size)); + struct page *page; + flags |= __GFP_COMP | __GFP_NOTRACK; + page = alloc_pages_node(node, flags, get_order(size)); if (page) return page_address(page); else -- cgit v1.2.3-70-g09d2 From d239171e4f6efd58d7e423853056b1b6a74f1446 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:31:52 -0700 Subject: page allocator: replace __alloc_pages_internal() with __alloc_pages_nodemask() The start of a large patch series to clean up and optimise the page allocator. The performance improvements are in a wide range depending on the exact machine but the results I've seen so fair are approximately; kernbench: 0 to 0.12% (elapsed time) 0.49% to 3.20% (sys time) aim9: -4% to 30% (for page_test and brk_test) tbench: -1% to 4% hackbench: -2.5% to 3.45% (mostly within the noise though) netperf-udp -1.34% to 4.06% (varies between machines a bit) netperf-tcp -0.44% to 5.22% (varies between machines a bit) I haven't sysbench figures at hand, but previously they were within the -0.5% to 2% range. On netperf, the client and server were bound to opposite number CPUs to maximise the problems with cache line bouncing of the struct pages so I expect different people to report different results for netperf depending on their exact machine and how they ran the test (different machines, same cpus client/server, shared cache but two threads client/server, different socket client/server etc). I also measured the vmlinux sizes for a single x86-based config with CONFIG_DEBUG_INFO enabled but not CONFIG_DEBUG_VM. The core of the .config is based on the Debian Lenny kernel config so I expect it to be reasonably typical. This patch: __alloc_pages_internal is the core page allocator function but essentially it is an alias of __alloc_pages_nodemask. Naming a publicly available and exported function "internal" is also a big ugly. This patch renames __alloc_pages_internal() to __alloc_pages_nodemask() and deletes the old nodemask function. Warning - This patch renames an exported symbol. No kernel driver is affected by external drivers calling __alloc_pages_internal() should change the call to __alloc_pages_nodemask() without any alteration of parameters. Signed-off-by: Mel Gorman Reviewed-by: Christoph Lameter Reviewed-by: KOSAKI Motohiro Reviewed-by: Pekka Enberg Cc: Peter Zijlstra Cc: Nick Piggin Cc: Dave Hansen Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 12 ++---------- mm/page_alloc.c | 4 ++-- 2 files changed, 4 insertions(+), 12 deletions(-) (limited to 'include/linux/gfp.h') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 3760e7c5de0..549ec558310 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -172,24 +172,16 @@ static inline void arch_alloc_page(struct page *page, int order) { } #endif struct page * -__alloc_pages_internal(gfp_t gfp_mask, unsigned int order, +__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, nodemask_t *nodemask); static inline struct page * __alloc_pages(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist) { - return __alloc_pages_internal(gfp_mask, order, zonelist, NULL); + return __alloc_pages_nodemask(gfp_mask, order, zonelist, NULL); } -static inline struct page * -__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, - struct zonelist *zonelist, nodemask_t *nodemask) -{ - return __alloc_pages_internal(gfp_mask, order, zonelist, nodemask); -} - - static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cbed869fd83..d58df903150 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1458,7 +1458,7 @@ try_next_zone: * This is the 'heart' of the zoned buddy allocator. */ struct page * -__alloc_pages_internal(gfp_t gfp_mask, unsigned int order, +__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, nodemask_t *nodemask) { const gfp_t wait = gfp_mask & __GFP_WAIT; @@ -1667,7 +1667,7 @@ nopage: got_pg: return page; } -EXPORT_SYMBOL(__alloc_pages_internal); +EXPORT_SYMBOL(__alloc_pages_nodemask); /* * Common helper functions. -- cgit v1.2.3-70-g09d2 From b3c466ce512923298ae8c0121d3e9f397a3f1210 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:31:53 -0700 Subject: page allocator: do not sanity check order in the fast path No user of the allocator API should be passing in an order >= MAX_ORDER but we check for it on each and every allocation. Delete this check and make it a VM_BUG_ON check further down the call path. [akpm@linux-foundation.org: s/VM_BUG_ON/WARN_ON_ONCE/] Signed-off-by: Mel Gorman Reviewed-by: Christoph Lameter Reviewed-by: KOSAKI Motohiro Reviewed-by: Pekka Enberg Cc: Peter Zijlstra Cc: Nick Piggin Cc: Dave Hansen Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 6 ------ mm/page_alloc.c | 3 +++ 2 files changed, 3 insertions(+), 6 deletions(-) (limited to 'include/linux/gfp.h') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 549ec558310..c2d3fe03b5d 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -185,9 +185,6 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order, static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order) { - if (unlikely(order >= MAX_ORDER)) - return NULL; - /* Unknown node is current node */ if (nid < 0) nid = numa_node_id(); @@ -201,9 +198,6 @@ extern struct page *alloc_pages_current(gfp_t gfp_mask, unsigned order); static inline struct page * alloc_pages(gfp_t gfp_mask, unsigned int order) { - if (unlikely(order >= MAX_ORDER)) - return NULL; - return alloc_pages_current(gfp_mask, order); } extern struct page *alloc_page_vma(gfp_t gfp_mask, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d58df903150..bfbd95c0610 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1401,6 +1401,9 @@ get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, classzone_idx = zone_idx(preferred_zone); + if (WARN_ON_ONCE(order >= MAX_ORDER)) + return NULL; + zonelist_scan: /* * Scan zonelist, looking for a zone with enough free. -- cgit v1.2.3-70-g09d2 From 6484eb3e2a81807722c5f28efef94d8338b7b996 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:31:54 -0700 Subject: page allocator: do not check NUMA node ID when the caller knows the node is valid Callers of alloc_pages_node() can optionally specify -1 as a node to mean "allocate from the current node". However, a number of the callers in fast paths know for a fact their node is valid. To avoid a comparison and branch, this patch adds alloc_pages_exact_node() that only checks the nid with VM_BUG_ON(). Callers that know their node is valid are then converted. Signed-off-by: Mel Gorman Reviewed-by: Christoph Lameter Reviewed-by: KOSAKI Motohiro Reviewed-by: Pekka Enberg Acked-by: Paul Mundt [for the SLOB NUMA bits] Cc: Peter Zijlstra Cc: Nick Piggin Cc: Dave Hansen Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/hp/common/sba_iommu.c | 2 +- arch/ia64/kernel/mca.c | 3 +-- arch/ia64/kernel/uncached.c | 3 ++- arch/ia64/sn/pci/pci_dma.c | 3 ++- arch/powerpc/platforms/cell/ras.c | 4 ++-- arch/x86/kvm/vmx.c | 2 +- drivers/misc/sgi-gru/grufile.c | 2 +- drivers/misc/sgi-xp/xpc_uv.c | 2 +- include/linux/gfp.h | 9 +++++++++ include/linux/mm.h | 1 - kernel/profile.c | 8 ++++---- mm/filemap.c | 2 +- mm/hugetlb.c | 4 ++-- mm/mempolicy.c | 2 +- mm/migrate.c | 2 +- mm/slab.c | 4 ++-- mm/slob.c | 4 ++-- 17 files changed, 33 insertions(+), 24 deletions(-) (limited to 'include/linux/gfp.h') diff --git a/arch/ia64/hp/common/sba_iommu.c b/arch/ia64/hp/common/sba_iommu.c index 56ceb68eb99..fe63b2dc9d0 100644 --- a/arch/ia64/hp/common/sba_iommu.c +++ b/arch/ia64/hp/common/sba_iommu.c @@ -1131,7 +1131,7 @@ sba_alloc_coherent (struct device *dev, size_t size, dma_addr_t *dma_handle, gfp #ifdef CONFIG_NUMA { struct page *page; - page = alloc_pages_node(ioc->node == MAX_NUMNODES ? + page = alloc_pages_exact_node(ioc->node == MAX_NUMNODES ? numa_node_id() : ioc->node, flags, get_order(size)); diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c index 8f33a884042..5b17bd40227 100644 --- a/arch/ia64/kernel/mca.c +++ b/arch/ia64/kernel/mca.c @@ -1829,8 +1829,7 @@ ia64_mca_cpu_init(void *cpu_data) data = mca_bootmem(); first_time = 0; } else - data = page_address(alloc_pages_node(numa_node_id(), - GFP_KERNEL, get_order(sz))); + data = __get_free_pages(GFP_KERNEL, get_order(sz)); if (!data) panic("Could not allocate MCA memory for cpu %d\n", cpu); diff --git a/arch/ia64/kernel/uncached.c b/arch/ia64/kernel/uncached.c index 8eff8c1d40a..6ba72ab42fc 100644 --- a/arch/ia64/kernel/uncached.c +++ b/arch/ia64/kernel/uncached.c @@ -98,7 +98,8 @@ static int uncached_add_chunk(struct uncached_pool *uc_pool, int nid) /* attempt to allocate a granule's worth of cached memory pages */ - page = alloc_pages_node(nid, GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, + page = alloc_pages_exact_node(nid, + GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, IA64_GRANULE_SHIFT-PAGE_SHIFT); if (!page) { mutex_unlock(&uc_pool->add_chunk_mutex); diff --git a/arch/ia64/sn/pci/pci_dma.c b/arch/ia64/sn/pci/pci_dma.c index d876423e4e7..98b684928e1 100644 --- a/arch/ia64/sn/pci/pci_dma.c +++ b/arch/ia64/sn/pci/pci_dma.c @@ -90,7 +90,8 @@ static void *sn_dma_alloc_coherent(struct device *dev, size_t size, */ node = pcibus_to_node(pdev->bus); if (likely(node >=0)) { - struct page *p = alloc_pages_node(node, flags, get_order(size)); + struct page *p = alloc_pages_exact_node(node, + flags, get_order(size)); if (likely(p)) cpuaddr = page_address(p); diff --git a/arch/powerpc/platforms/cell/ras.c b/arch/powerpc/platforms/cell/ras.c index 296b5268754..5e0a191764f 100644 --- a/arch/powerpc/platforms/cell/ras.c +++ b/arch/powerpc/platforms/cell/ras.c @@ -122,8 +122,8 @@ static int __init cbe_ptcal_enable_on_node(int nid, int order) area->nid = nid; area->order = order; - area->pages = alloc_pages_node(area->nid, GFP_KERNEL | GFP_THISNODE, - area->order); + area->pages = alloc_pages_exact_node(area->nid, GFP_KERNEL|GFP_THISNODE, + area->order); if (!area->pages) { printk(KERN_WARNING "%s: no page on node %d\n", diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 32d6ae8fb60..e770bf349ec 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1277,7 +1277,7 @@ static struct vmcs *alloc_vmcs_cpu(int cpu) struct page *pages; struct vmcs *vmcs; - pages = alloc_pages_node(node, GFP_KERNEL, vmcs_config.order); + pages = alloc_pages_exact_node(node, GFP_KERNEL, vmcs_config.order); if (!pages) return NULL; vmcs = page_address(pages); diff --git a/drivers/misc/sgi-gru/grufile.c b/drivers/misc/sgi-gru/grufile.c index bbefe77c67a..3ce2920e2bf 100644 --- a/drivers/misc/sgi-gru/grufile.c +++ b/drivers/misc/sgi-gru/grufile.c @@ -302,7 +302,7 @@ static int gru_init_tables(unsigned long gru_base_paddr, void *gru_base_vaddr) pnode = uv_node_to_pnode(nid); if (bid < 0 || gru_base[bid]) continue; - page = alloc_pages_node(nid, GFP_KERNEL, order); + page = alloc_pages_exact_node(nid, GFP_KERNEL, order); if (!page) goto fail; gru_base[bid] = page_address(page); diff --git a/drivers/misc/sgi-xp/xpc_uv.c b/drivers/misc/sgi-xp/xpc_uv.c index 9172fcdee4e..c76677afda1 100644 --- a/drivers/misc/sgi-xp/xpc_uv.c +++ b/drivers/misc/sgi-xp/xpc_uv.c @@ -232,7 +232,7 @@ xpc_create_gru_mq_uv(unsigned int mq_size, int cpu, char *irq_name, mq->mmr_blade = uv_cpu_to_blade_id(cpu); nid = cpu_to_node(cpu); - page = alloc_pages_node(nid, GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, + page = alloc_pages_exact_node(nid, GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, pg_order); if (page == NULL) { dev_err(xpc_part, "xpc_create_gru_mq_uv() failed to alloc %d " diff --git a/include/linux/gfp.h b/include/linux/gfp.h index c2d3fe03b5d..4efa33088a8 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -5,6 +5,7 @@ #include #include #include +#include struct vm_area_struct; @@ -192,6 +193,14 @@ static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask, return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask)); } +static inline struct page *alloc_pages_exact_node(int nid, gfp_t gfp_mask, + unsigned int order) +{ + VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES); + + return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask)); +} + #ifdef CONFIG_NUMA extern struct page *alloc_pages_current(gfp_t gfp_mask, unsigned order); diff --git a/include/linux/mm.h b/include/linux/mm.h index a880161a385..7b548e7cfbd 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -7,7 +7,6 @@ #include #include -#include #include #include #include diff --git a/kernel/profile.c b/kernel/profile.c index 28cf26ad2d2..69911b5745e 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -365,7 +365,7 @@ static int __cpuinit profile_cpu_callback(struct notifier_block *info, node = cpu_to_node(cpu); per_cpu(cpu_profile_flip, cpu) = 0; if (!per_cpu(cpu_profile_hits, cpu)[1]) { - page = alloc_pages_node(node, + page = alloc_pages_exact_node(node, GFP_KERNEL | __GFP_ZERO, 0); if (!page) @@ -373,7 +373,7 @@ static int __cpuinit profile_cpu_callback(struct notifier_block *info, per_cpu(cpu_profile_hits, cpu)[1] = page_address(page); } if (!per_cpu(cpu_profile_hits, cpu)[0]) { - page = alloc_pages_node(node, + page = alloc_pages_exact_node(node, GFP_KERNEL | __GFP_ZERO, 0); if (!page) @@ -564,14 +564,14 @@ static int create_hash_tables(void) int node = cpu_to_node(cpu); struct page *page; - page = alloc_pages_node(node, + page = alloc_pages_exact_node(node, GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, 0); if (!page) goto out_cleanup; per_cpu(cpu_profile_hits, cpu)[1] = (struct profile_hit *)page_address(page); - page = alloc_pages_node(node, + page = alloc_pages_exact_node(node, GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, 0); if (!page) diff --git a/mm/filemap.c b/mm/filemap.c index 6846a902f5c..22396713feb 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -521,7 +521,7 @@ struct page *__page_cache_alloc(gfp_t gfp) { if (cpuset_do_page_mem_spread()) { int n = cpuset_mem_spread_node(); - return alloc_pages_node(n, gfp, 0); + return alloc_pages_exact_node(n, gfp, 0); } return alloc_pages(gfp, 0); } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e83ad2c9228..2f8241f300f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -630,7 +630,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) if (h->order >= MAX_ORDER) return NULL; - page = alloc_pages_node(nid, + page = alloc_pages_exact_node(nid, htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); @@ -649,7 +649,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) * Use a helper variable to find the next node and then * copy it back to hugetlb_next_nid afterwards: * otherwise there's a window in which a racer might - * pass invalid nid MAX_NUMNODES to alloc_pages_node. + * pass invalid nid MAX_NUMNODES to alloc_pages_exact_node. * But we don't need to use a spin_lock here: it really * doesn't matter if occasionally a racer chooses the * same nid as we do. Move nid forward in the mask even diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 46bdf9ddf2b..e08e2c4da63 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -803,7 +803,7 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist, static struct page *new_node_page(struct page *page, unsigned long node, int **x) { - return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0); + return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0); } /* diff --git a/mm/migrate.c b/mm/migrate.c index 068655d8f88..5a24923e7fd 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -802,7 +802,7 @@ static struct page *new_page_node(struct page *p, unsigned long private, *result = &pm->status; - return alloc_pages_node(pm->node, + return alloc_pages_exact_node(pm->node, GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0); } diff --git a/mm/slab.c b/mm/slab.c index 18e3164de09..bb3254c95cd 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1707,7 +1707,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) if (cachep->flags & SLAB_RECLAIM_ACCOUNT) flags |= __GFP_RECLAIMABLE; - page = alloc_pages_node(nodeid, flags, cachep->gfporder); + page = alloc_pages_exact_node(nodeid, flags, cachep->gfporder); if (!page) return NULL; @@ -3261,7 +3261,7 @@ retry: if (local_flags & __GFP_WAIT) local_irq_enable(); kmem_flagcheck(cache, flags); - obj = kmem_getpages(cache, local_flags, -1); + obj = kmem_getpages(cache, local_flags, numa_node_id()); if (local_flags & __GFP_WAIT) local_irq_disable(); if (obj) { diff --git a/mm/slob.c b/mm/slob.c index 12f26149992..64f6db1943b 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -46,7 +46,7 @@ * NUMA support in SLOB is fairly simplistic, pushing most of the real * logic down to the page allocator, and simply doing the node accounting * on the upper levels. In the event that a node id is explicitly - * provided, alloc_pages_node() with the specified node id is used + * provided, alloc_pages_exact_node() with the specified node id is used * instead. The common case (or when the node id isn't explicitly provided) * will default to the current node, as per numa_node_id(). * @@ -244,7 +244,7 @@ static void *slob_new_pages(gfp_t gfp, int order, int node) #ifdef CONFIG_NUMA if (node != -1) - page = alloc_pages_node(node, gfp, order); + page = alloc_pages_exact_node(node, gfp, order); else #endif page = alloc_pages(gfp, order); -- cgit v1.2.3-70-g09d2 From 7f33d49a2ed546e01f7b1d0607661810f2421859 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 16 Jun 2009 15:32:41 -0700 Subject: mm, PM/Freezer: Disable OOM killer when tasks are frozen Currently, the following scenario appears to be possible in theory: * Tasks are frozen for hibernation or suspend. * Free pages are almost exhausted. * Certain piece of code in the suspend code path attempts to allocate some memory using GFP_KERNEL and allocation order less than or equal to PAGE_ALLOC_COSTLY_ORDER. * __alloc_pages_internal() cannot find a free page so it invokes the OOM killer. * The OOM killer attempts to kill a task, but the task is frozen, so it doesn't die immediately. * __alloc_pages_internal() jumps to 'restart', unsuccessfully tries to find a free page and invokes the OOM killer. * No progress can be made. Although it is now hard to trigger during hibernation due to the memory shrinking carried out by the hibernation code, it is theoretically possible to trigger during suspend after the memory shrinking has been removed from that code path. Moreover, since memory allocations are going to be used for the hibernation memory shrinking, it will be even more likely to happen during hibernation. To prevent it from happening, introduce the oom_killer_disabled switch that will cause __alloc_pages_internal() to fail in the situations in which the OOM killer would have been called and make the freezer set this switch after tasks have been successfully frozen. [akpm@linux-foundation.org: be nicer to the namespace] Signed-off-by: Rafael J. Wysocki Cc: Fengguang Wu Cc: David Rientjes Acked-by: Pavel Machek Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 12 ++++++++++++ kernel/power/process.c | 5 +++++ mm/page_alloc.c | 4 ++++ 3 files changed, 21 insertions(+) (limited to 'include/linux/gfp.h') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 4efa33088a8..06b7e8cc80a 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -243,4 +243,16 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp); void drain_all_pages(void); void drain_local_pages(void *dummy); +extern bool oom_killer_disabled; + +static inline void oom_killer_disable(void) +{ + oom_killer_disabled = true; +} + +static inline void oom_killer_enable(void) +{ + oom_killer_disabled = false; +} + #endif /* __LINUX_GFP_H */ diff --git a/kernel/power/process.c b/kernel/power/process.c index ca634019497..da2072d7381 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -117,9 +117,12 @@ int freeze_processes(void) if (error) goto Exit; printk("done."); + + oom_killer_disable(); Exit: BUG_ON(in_atomic()); printk("\n"); + return error; } @@ -145,6 +148,8 @@ static void thaw_tasks(bool nosig_only) void thaw_processes(void) { + oom_killer_enable(); + printk("Restarting tasks ... "); thaw_tasks(true); thaw_tasks(false); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 61290ea721c..5b09488d0f5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -178,6 +178,8 @@ static void set_pageblock_migratetype(struct page *page, int migratetype) PB_migrate, PB_migrate_end); } +bool oom_killer_disabled __read_mostly; + #ifdef CONFIG_DEBUG_VM static int page_outside_zone_boundaries(struct zone *zone, struct page *page) { @@ -1769,6 +1771,8 @@ rebalance: */ if (!did_some_progress) { if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { + if (oom_killer_disabled) + goto nopage; page = __alloc_pages_may_oom(gfp_mask, order, zonelist, high_zoneidx, nodemask, preferred_zone, -- cgit v1.2.3-70-g09d2 From b70d94ee438b3fd9c15c7691d7a932a135c18101 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 16 Jun 2009 15:32:46 -0700 Subject: page-allocator: use integer fields lookup for gfp_zone and check for errors in flags passed to the page allocator This simplifies the code in gfp_zone() and also keeps the ability of the compiler to use constant folding to get rid of gfp_zone processing. The lookup of the zone is done using a bitfield stored in an integer. So the code in gfp_zone is a simple extraction of bits from a constant bitfield. The compiler is generating a load of a constant into a register and then performs a shift and mask operation to get the zone from a gfp_t. No cachelines are touched and no branches have to be predicted by the compiler. We are doing some macro tricks here to convince the compiler to always do the constant folding if possible. Signed-off-by: Christoph Lameter Cc: KAMEZAWA Hiroyuki Reviewed-by: Mel Gorman Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 111 +++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 96 insertions(+), 15 deletions(-) (limited to 'include/linux/gfp.h') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 06b7e8cc80a..412178afd42 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -21,7 +21,8 @@ struct vm_area_struct; #define __GFP_DMA ((__force gfp_t)0x01u) #define __GFP_HIGHMEM ((__force gfp_t)0x02u) #define __GFP_DMA32 ((__force gfp_t)0x04u) - +#define __GFP_MOVABLE ((__force gfp_t)0x08u) /* Page is movable */ +#define GFP_ZONEMASK (__GFP_DMA|__GFP_HIGHMEM|__GFP_DMA32|__GFP_MOVABLE) /* * Action modifiers - doesn't change the zoning * @@ -51,7 +52,6 @@ struct vm_area_struct; #define __GFP_HARDWALL ((__force gfp_t)0x20000u) /* Enforce hardwall cpuset memory allocs */ #define __GFP_THISNODE ((__force gfp_t)0x40000u)/* No fallback, no policies */ #define __GFP_RECLAIMABLE ((__force gfp_t)0x80000u) /* Page is reclaimable */ -#define __GFP_MOVABLE ((__force gfp_t)0x100000u) /* Page is movable */ #define __GFP_BITS_SHIFT 21 /* Room for 21 __GFP_FOO bits */ #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) @@ -116,24 +116,105 @@ static inline int allocflags_to_migratetype(gfp_t gfp_flags) ((gfp_flags & __GFP_RECLAIMABLE) != 0); } -static inline enum zone_type gfp_zone(gfp_t flags) -{ +#ifdef CONFIG_HIGHMEM +#define OPT_ZONE_HIGHMEM ZONE_HIGHMEM +#else +#define OPT_ZONE_HIGHMEM ZONE_NORMAL +#endif + #ifdef CONFIG_ZONE_DMA - if (flags & __GFP_DMA) - return ZONE_DMA; +#define OPT_ZONE_DMA ZONE_DMA +#else +#define OPT_ZONE_DMA ZONE_NORMAL #endif + #ifdef CONFIG_ZONE_DMA32 - if (flags & __GFP_DMA32) - return ZONE_DMA32; +#define OPT_ZONE_DMA32 ZONE_DMA32 +#else +#define OPT_ZONE_DMA32 ZONE_NORMAL #endif - if ((flags & (__GFP_HIGHMEM | __GFP_MOVABLE)) == - (__GFP_HIGHMEM | __GFP_MOVABLE)) - return ZONE_MOVABLE; -#ifdef CONFIG_HIGHMEM - if (flags & __GFP_HIGHMEM) - return ZONE_HIGHMEM; + +/* + * GFP_ZONE_TABLE is a word size bitstring that is used for looking up the + * zone to use given the lowest 4 bits of gfp_t. Entries are ZONE_SHIFT long + * and there are 16 of them to cover all possible combinations of + * __GFP_DMA, __GFP_DMA32, __GFP_MOVABLE and __GFP_HIGHMEM + * + * The zone fallback order is MOVABLE=>HIGHMEM=>NORMAL=>DMA32=>DMA. + * But GFP_MOVABLE is not only a zone specifier but also an allocation + * policy. Therefore __GFP_MOVABLE plus another zone selector is valid. + * Only 1bit of the lowest 3 bit (DMA,DMA32,HIGHMEM) can be set to "1". + * + * bit result + * ================= + * 0x0 => NORMAL + * 0x1 => DMA or NORMAL + * 0x2 => HIGHMEM or NORMAL + * 0x3 => BAD (DMA+HIGHMEM) + * 0x4 => DMA32 or DMA or NORMAL + * 0x5 => BAD (DMA+DMA32) + * 0x6 => BAD (HIGHMEM+DMA32) + * 0x7 => BAD (HIGHMEM+DMA32+DMA) + * 0x8 => NORMAL (MOVABLE+0) + * 0x9 => DMA or NORMAL (MOVABLE+DMA) + * 0xa => MOVABLE (Movable is valid only if HIGHMEM is set too) + * 0xb => BAD (MOVABLE+HIGHMEM+DMA) + * 0xc => DMA32 (MOVABLE+HIGHMEM+DMA32) + * 0xd => BAD (MOVABLE+DMA32+DMA) + * 0xe => BAD (MOVABLE+DMA32+HIGHMEM) + * 0xf => BAD (MOVABLE+DMA32+HIGHMEM+DMA) + * + * ZONES_SHIFT must be <= 2 on 32 bit platforms. + */ + +#if 16 * ZONES_SHIFT > BITS_PER_LONG +#error ZONES_SHIFT too large to create GFP_ZONE_TABLE integer +#endif + +#define GFP_ZONE_TABLE ( \ + (ZONE_NORMAL << 0 * ZONES_SHIFT) \ + | (OPT_ZONE_DMA << __GFP_DMA * ZONES_SHIFT) \ + | (OPT_ZONE_HIGHMEM << __GFP_HIGHMEM * ZONES_SHIFT) \ + | (OPT_ZONE_DMA32 << __GFP_DMA32 * ZONES_SHIFT) \ + | (ZONE_NORMAL << __GFP_MOVABLE * ZONES_SHIFT) \ + | (OPT_ZONE_DMA << (__GFP_MOVABLE | __GFP_DMA) * ZONES_SHIFT) \ + | (ZONE_MOVABLE << (__GFP_MOVABLE | __GFP_HIGHMEM) * ZONES_SHIFT)\ + | (OPT_ZONE_DMA32 << (__GFP_MOVABLE | __GFP_DMA32) * ZONES_SHIFT)\ +) + +/* + * GFP_ZONE_BAD is a bitmap for all combination of __GFP_DMA, __GFP_DMA32 + * __GFP_HIGHMEM and __GFP_MOVABLE that are not permitted. One flag per + * entry starting with bit 0. Bit is set if the combination is not + * allowed. + */ +#define GFP_ZONE_BAD ( \ + 1 << (__GFP_DMA | __GFP_HIGHMEM) \ + | 1 << (__GFP_DMA | __GFP_DMA32) \ + | 1 << (__GFP_DMA32 | __GFP_HIGHMEM) \ + | 1 << (__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM) \ + | 1 << (__GFP_MOVABLE | __GFP_HIGHMEM | __GFP_DMA) \ + | 1 << (__GFP_MOVABLE | __GFP_DMA32 | __GFP_DMA) \ + | 1 << (__GFP_MOVABLE | __GFP_DMA32 | __GFP_HIGHMEM) \ + | 1 << (__GFP_MOVABLE | __GFP_DMA32 | __GFP_DMA | __GFP_HIGHMEM)\ +) + +static inline enum zone_type gfp_zone(gfp_t flags) +{ + enum zone_type z; + int bit = flags & GFP_ZONEMASK; + + z = (GFP_ZONE_TABLE >> (bit * ZONES_SHIFT)) & + ((1 << ZONES_SHIFT) - 1); + + if (__builtin_constant_p(bit)) + BUILD_BUG_ON((GFP_ZONE_BAD >> bit) & 1); + else { +#ifdef CONFIG_DEBUG_VM + BUG_ON((GFP_ZONE_BAD >> bit) & 1); #endif - return ZONE_NORMAL; + } + return z; } /* -- cgit v1.2.3-70-g09d2 From dcce284a259373f9e5570f2e33f79eca84fcf565 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 18 Jun 2009 13:24:12 +1000 Subject: mm: Extend gfp masking to the page allocator The page allocator also needs the masking of gfp flags during boot, so this moves it out of slab/slub and uses it with the page allocator as well. Signed-off-by: Benjamin Herrenschmidt Acked-by: Pekka Enberg Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 9 ++++++++- init/main.c | 4 ++++ mm/page_alloc.c | 3 +++ mm/slab.c | 15 ++------------- mm/slub.c | 12 +----------- 5 files changed, 18 insertions(+), 25 deletions(-) (limited to 'include/linux/gfp.h') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index cfdb35d71bc..7c777a0da17 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -99,7 +99,7 @@ struct vm_area_struct; __GFP_NORETRY|__GFP_NOMEMALLOC) /* Control slab gfp mask during early boot */ -#define SLAB_GFP_BOOT_MASK __GFP_BITS_MASK & ~(__GFP_WAIT|__GFP_IO|__GFP_FS) +#define GFP_BOOT_MASK __GFP_BITS_MASK & ~(__GFP_WAIT|__GFP_IO|__GFP_FS) /* Control allocation constraints */ #define GFP_CONSTRAINT_MASK (__GFP_HARDWALL|__GFP_THISNODE) @@ -348,4 +348,11 @@ static inline void oom_killer_enable(void) oom_killer_disabled = false; } +extern gfp_t gfp_allowed_mask; + +static inline void set_gfp_allowed_mask(gfp_t mask) +{ + gfp_allowed_mask = mask; +} + #endif /* __LINUX_GFP_H */ diff --git a/init/main.c b/init/main.c index 1a65fdd0631..09131ec090c 100644 --- a/init/main.c +++ b/init/main.c @@ -642,6 +642,10 @@ asmlinkage void __init start_kernel(void) "enabled early\n"); early_boot_irqs_on(); local_irq_enable(); + + /* Interrupts are enabled now so all GFP allocations are safe. */ + set_gfp_allowed_mask(__GFP_BITS_MASK); + kmem_cache_init_late(); /* diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a5f3c278c57..6f0753fe694 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -73,6 +73,7 @@ unsigned long totalram_pages __read_mostly; unsigned long totalreserve_pages __read_mostly; unsigned long highest_memmap_pfn __read_mostly; int percpu_pagelist_fraction; +gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE int pageblock_order __read_mostly; @@ -1863,6 +1864,8 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, struct page *page; int migratetype = allocflags_to_migratetype(gfp_mask); + gfp_mask &= gfp_allowed_mask; + lockdep_trace_alloc(gfp_mask); might_sleep_if(gfp_mask & __GFP_WAIT); diff --git a/mm/slab.c b/mm/slab.c index d08692303f6..e74a16e4ced 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -304,12 +304,6 @@ struct kmem_list3 { int free_touched; /* updated without locking */ }; -/* - * The slab allocator is initialized with interrupts disabled. Therefore, make - * sure early boot allocations don't accidentally enable interrupts. - */ -static gfp_t slab_gfp_mask __read_mostly = SLAB_GFP_BOOT_MASK; - /* * Need this for bootstrapping a per node allocator. */ @@ -1559,11 +1553,6 @@ void __init kmem_cache_init_late(void) { struct kmem_cache *cachep; - /* - * Interrupts are enabled now so all GFP allocations are safe. - */ - slab_gfp_mask = __GFP_BITS_MASK; - /* 6) resize the head arrays to their final sizes */ mutex_lock(&cache_chain_mutex); list_for_each_entry(cachep, &cache_chain, next) @@ -3307,7 +3296,7 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, unsigned long save_flags; void *ptr; - flags &= slab_gfp_mask; + flags &= gfp_allowed_mask; lockdep_trace_alloc(flags); @@ -3392,7 +3381,7 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) unsigned long save_flags; void *objp; - flags &= slab_gfp_mask; + flags &= gfp_allowed_mask; lockdep_trace_alloc(flags); diff --git a/mm/slub.c b/mm/slub.c index 4c6449310a0..ce62b770e2f 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -179,12 +179,6 @@ static enum { SYSFS /* Sysfs up */ } slab_state = DOWN; -/* - * The slab allocator is initialized with interrupts disabled. Therefore, make - * sure early boot allocations don't accidentally enable interrupts. - */ -static gfp_t slab_gfp_mask __read_mostly = SLAB_GFP_BOOT_MASK; - /* A list of all slab caches on the system */ static DECLARE_RWSEM(slub_lock); static LIST_HEAD(slab_caches); @@ -1692,7 +1686,7 @@ static __always_inline void *slab_alloc(struct kmem_cache *s, unsigned long flags; unsigned int objsize; - gfpflags &= slab_gfp_mask; + gfpflags &= gfp_allowed_mask; lockdep_trace_alloc(gfpflags); might_sleep_if(gfpflags & __GFP_WAIT); @@ -3220,10 +3214,6 @@ void __init kmem_cache_init(void) void __init kmem_cache_init_late(void) { - /* - * Interrupts are enabled now so all GFP allocations are safe. - */ - slab_gfp_mask = __GFP_BITS_MASK; } /* -- cgit v1.2.3-70-g09d2