From 5594c8c813d9e907ff55da7080d42653478b73e8 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 6 Jan 2009 14:39:14 -0800 Subject: mm: print out memmap number only if it is not zero Don't print the size of the zone's memmap array if it does not have one. Impact: cleanup Signed-off-by: Yinghai Lu Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d8ac0147456..2f644c3e3da 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3469,9 +3469,10 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT; if (realsize >= memmap_pages) { realsize -= memmap_pages; - printk(KERN_DEBUG - " %s zone: %lu pages used for memmap\n", - zone_names[j], memmap_pages); + if (memmap_pages) + printk(KERN_DEBUG + " %s zone: %lu pages used for memmap\n", + zone_names[j], memmap_pages); } else printk(KERN_WARNING " %s zone: %lu pages exceeds realsize %lu\n", -- cgit v1.2.3-70-g09d2 From 58a01a45721bf7bd3a41a86248c3cb02a6b0c501 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Tue, 6 Jan 2009 14:39:28 -0800 Subject: mm/page_alloc.c: eliminate NULL test and memset after alloc_bootmem As noted by Akinobu Mita in patch b1fceac2b9e04d278316b2faddf276015fc06e3b, alloc_bootmem and related functions never return NULL and always return a zeroed region of memory. Thus a NULL test or memset after calls to these functions is unnecessary. This was fixed using the following semantic patch. (http://www.emn.fr/x-info/coccinelle/) // @@ expression E; statement S; @@ E = \(alloc_bootmem\|alloc_bootmem_low\|alloc_bootmem_pages\|alloc_bootmem_low_pages\|alloc_bootmem_node\|alloc_bootmem_low_pages_node\|alloc_bootmem_pages_node\)(...) ... when != E ( - BUG_ON (E == NULL); | - if (E == NULL) S ) @@ expression E,E1; @@ E = \(alloc_bootmem\|alloc_bootmem_low\|alloc_bootmem_pages\|alloc_bootmem_low_pages\|alloc_bootmem_node\|alloc_bootmem_low_pages_node\|alloc_bootmem_pages_node\)(...) ... when != E - memset(E,0,E1); // Signed-off-by: Julia Lawall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2f644c3e3da..ce2a026219b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3381,10 +3381,8 @@ static void __init setup_usemap(struct pglist_data *pgdat, { unsigned long usemapsize = usemap_size(zonesize); zone->pageblock_flags = NULL; - if (usemapsize) { + if (usemapsize) zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize); - memset(zone->pageblock_flags, 0, usemapsize); - } } #else static void inline setup_usemap(struct pglist_data *pgdat, -- cgit v1.2.3-70-g09d2 From b962716b459505a8d83aea313fea0abe76749f42 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 6 Jan 2009 14:39:41 -0800 Subject: mm: optimize get_scan_ratio for no swap Rik suggests a simplified get_scan_ratio() for !CONFIG_SWAP. Yes, the gcc optimizer gives us that, when nr_swap_pages is #defined as 0L. Move usual declaration to swapfile.c: it never belonged in page_alloc.c. Signed-off-by: Hugh Dickins Cc: Lee Schermerhorn Acked-by: Rik van Riel Cc: Nick Piggin Cc: KAMEZAWA Hiroyuki Cc: Robin Holt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 5 +++-- mm/page_alloc.c | 1 - mm/swapfile.c | 1 + mm/vmscan.c | 12 ++++++------ 4 files changed, 10 insertions(+), 9 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/include/linux/swap.h b/include/linux/swap.h index c0d23ac710d..3a31cc25bd2 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -163,7 +163,6 @@ struct swap_list_t { /* linux/mm/page_alloc.c */ extern unsigned long totalram_pages; extern unsigned long totalreserve_pages; -extern long nr_swap_pages; extern unsigned int nr_free_buffer_pages(void); extern unsigned int nr_free_pagecache_pages(void); @@ -291,6 +290,7 @@ extern struct page *swapin_readahead(swp_entry_t, gfp_t, struct vm_area_struct *vma, unsigned long addr); /* linux/mm/swapfile.c */ +extern long nr_swap_pages; extern long total_swap_pages; extern void si_swapinfo(struct sysinfo *); extern swp_entry_t get_swap_page(void); @@ -331,7 +331,8 @@ static inline void disable_swap_token(void) #else /* CONFIG_SWAP */ -#define total_swap_pages 0 +#define nr_swap_pages 0L +#define total_swap_pages 0L #define total_swapcache_pages 0UL #define si_swapinfo(val) \ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ce2a026219b..65133436fe3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -69,7 +69,6 @@ EXPORT_SYMBOL(node_states); unsigned long totalram_pages __read_mostly; unsigned long totalreserve_pages __read_mostly; -long nr_swap_pages; int percpu_pagelist_fraction; #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE diff --git a/mm/swapfile.c b/mm/swapfile.c index 9ce7f81c8ab..725e56c362d 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -35,6 +35,7 @@ static DEFINE_SPINLOCK(swap_lock); static unsigned int nr_swapfiles; +long nr_swap_pages; long total_swap_pages; static int swap_overflow; static int least_priority; diff --git a/mm/vmscan.c b/mm/vmscan.c index f350523a8ee..d500b906746 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1327,12 +1327,6 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, unsigned long anon_prio, file_prio; unsigned long ap, fp; - anon = zone_page_state(zone, NR_ACTIVE_ANON) + - zone_page_state(zone, NR_INACTIVE_ANON); - file = zone_page_state(zone, NR_ACTIVE_FILE) + - zone_page_state(zone, NR_INACTIVE_FILE); - free = zone_page_state(zone, NR_FREE_PAGES); - /* If we have no swap space, do not bother scanning anon pages. */ if (nr_swap_pages <= 0) { percent[0] = 0; @@ -1340,6 +1334,12 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, return; } + anon = zone_page_state(zone, NR_ACTIVE_ANON) + + zone_page_state(zone, NR_INACTIVE_ANON); + file = zone_page_state(zone, NR_ACTIVE_FILE) + + zone_page_state(zone, NR_INACTIVE_FILE); + free = zone_page_state(zone, NR_FREE_PAGES); + /* If we have very few page cache pages, force-scan anon pages. */ if (unlikely(file + free <= zone->pages_high)) { percent[0] = 100; -- cgit v1.2.3-70-g09d2 From efab81864161f8c546d4403873e7ae7831ed5b26 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 6 Jan 2009 14:39:46 -0800 Subject: mm: make setup_per_zone_inactive_ratio() static Sparse output following warning. mm/page_alloc.c:4301:6: warning: symbol 'setup_per_zone_inactive_ratio' was not declared. Should it be static? cleanup here. Signed-off-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 65133436fe3..31c512410a9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4314,7 +4314,7 @@ void setup_per_zone_pages_min(void) * 1TB 101 10GB * 10TB 320 32GB */ -void setup_per_zone_inactive_ratio(void) +static void setup_per_zone_inactive_ratio(void) { struct zone *zone; -- cgit v1.2.3-70-g09d2 From 79f4b7bf393e67bbffec807cc68caaefc72b82ee Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 6 Jan 2009 14:40:05 -0800 Subject: badpage: simplify page_alloc flag check+clear Simplify the PAGE_FLAGS checking and clearing when freeing and allocating a page: check the same flags as before when freeing, clear ALL the flags (unless PageReserved) when freeing, check ALL flags off when allocating. Signed-off-by: Hugh Dickins Cc: Nick Piggin Cc: Christoph Lameter Cc: Mel Gorman Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 25 ++++++++----------------- mm/page_alloc.c | 19 ++++++------------- 2 files changed, 14 insertions(+), 30 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 628ec080249..219a523ecdb 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -373,31 +373,22 @@ static inline void __ClearPageTail(struct page *page) #define __PG_MLOCKED 0 #endif -#define PAGE_FLAGS (1 << PG_lru | 1 << PG_private | 1 << PG_locked | \ - 1 << PG_buddy | 1 << PG_writeback | \ - 1 << PG_slab | 1 << PG_swapcache | 1 << PG_active | \ - __PG_UNEVICTABLE | __PG_MLOCKED) - -/* - * Flags checked in bad_page(). Pages on the free list should not have - * these flags set. It they are, there is a problem. - */ -#define PAGE_FLAGS_CLEAR_WHEN_BAD (PAGE_FLAGS | \ - 1 << PG_reclaim | 1 << PG_dirty | 1 << PG_swapbacked) - /* * Flags checked when a page is freed. Pages being freed should not have * these flags set. It they are, there is a problem. */ -#define PAGE_FLAGS_CHECK_AT_FREE (PAGE_FLAGS | 1 << PG_reserved) +#define PAGE_FLAGS_CHECK_AT_FREE \ + (1 << PG_lru | 1 << PG_private | 1 << PG_locked | \ + 1 << PG_buddy | 1 << PG_writeback | 1 << PG_reserved | \ + 1 << PG_slab | 1 << PG_swapcache | 1 << PG_active | \ + __PG_UNEVICTABLE | __PG_MLOCKED) /* * Flags checked when a page is prepped for return by the page allocator. - * Pages being prepped should not have these flags set. It they are, there - * is a problem. + * Pages being prepped should not have any flags set. It they are set, + * there has been a kernel bug or struct page corruption. */ -#define PAGE_FLAGS_CHECK_AT_PREP (PAGE_FLAGS | \ - 1 << PG_reserved | 1 << PG_dirty | 1 << PG_swapbacked) +#define PAGE_FLAGS_CHECK_AT_PREP ((1 << NR_PAGEFLAGS) - 1) #endif /* !__GENERATING_BOUNDS_H */ #endif /* PAGE_FLAGS_H */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 31c512410a9..b90a74d2848 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -231,7 +231,6 @@ static void bad_page(struct page *page) printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n" KERN_EMERG "Backtrace:\n"); dump_stack(); - page->flags &= ~PAGE_FLAGS_CLEAR_WHEN_BAD; set_page_count(page, 0); reset_page_mapcount(page); page->mapping = NULL; @@ -468,16 +467,16 @@ static inline int free_pages_check(struct page *page) (page_count(page) != 0) | (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) bad_page(page); - if (PageDirty(page)) - __ClearPageDirty(page); - if (PageSwapBacked(page)) - __ClearPageSwapBacked(page); /* * For now, we report if PG_reserved was found set, but do not * clear it, and do not free the page. But we shall soon need * to do more, for when the ZERO_PAGE count wraps negative. */ - return PageReserved(page); + if (PageReserved(page)) + return 1; + if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) + page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; + return 0; } /* @@ -621,13 +620,7 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) if (PageReserved(page)) return 1; - page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_reclaim | - 1 << PG_referenced | 1 << PG_arch_1 | - 1 << PG_owner_priv_1 | 1 << PG_mappedtodisk -#ifdef CONFIG_UNEVICTABLE_LRU - | 1 << PG_mlocked -#endif - ); + page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; set_page_private(page, 0); set_page_refcounted(page); -- cgit v1.2.3-70-g09d2 From 8cc3b39221b0ecbd83a338948a8396df097fc656 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 6 Jan 2009 14:40:06 -0800 Subject: badpage: keep any bad page out of circulation Until now the bad_page() checkers have special-cased PageReserved, keeping those pages out of circulation thereafter. Now extend the special case to all: we want to keep ANY page with bad state out of circulation - the "free" page may well be in use by something. Leave the bad state of those pages untouched, for examination by debuggers; except for PageBuddy - leaving that set would risk bringing the page back. Signed-off-by: Hugh Dickins Cc: Nick Piggin Cc: Christoph Lameter Cc: Mel Gorman Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 52 ++++++++++++++++++++++++---------------------------- 1 file changed, 24 insertions(+), 28 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b90a74d2848..bd330252fc7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -231,9 +231,9 @@ static void bad_page(struct page *page) printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n" KERN_EMERG "Backtrace:\n"); dump_stack(); - set_page_count(page, 0); - reset_page_mapcount(page); - page->mapping = NULL; + + /* Leave bad fields for debug, except PageBuddy could make trouble */ + __ClearPageBuddy(page); add_taint(TAINT_BAD_PAGE); } @@ -290,25 +290,31 @@ void prep_compound_gigantic_page(struct page *page, unsigned long order) } #endif -static void destroy_compound_page(struct page *page, unsigned long order) +static int destroy_compound_page(struct page *page, unsigned long order) { int i; int nr_pages = 1 << order; + int bad = 0; - if (unlikely(compound_order(page) != order)) + if (unlikely(compound_order(page) != order) || + unlikely(!PageHead(page))) { bad_page(page); + bad++; + } - if (unlikely(!PageHead(page))) - bad_page(page); __ClearPageHead(page); + for (i = 1; i < nr_pages; i++) { struct page *p = page + i; - if (unlikely(!PageTail(p) | - (p->first_page != page))) + if (unlikely(!PageTail(p) | (p->first_page != page))) { bad_page(page); + bad++; + } __ClearPageTail(p); } + + return bad; } static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) @@ -428,7 +434,8 @@ static inline void __free_one_page(struct page *page, int migratetype = get_pageblock_migratetype(page); if (unlikely(PageCompound(page))) - destroy_compound_page(page, order); + if (unlikely(destroy_compound_page(page, order))) + return; page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); @@ -465,15 +472,10 @@ static inline int free_pages_check(struct page *page) if (unlikely(page_mapcount(page) | (page->mapping != NULL) | (page_count(page) != 0) | - (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) + (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) { bad_page(page); - /* - * For now, we report if PG_reserved was found set, but do not - * clear it, and do not free the page. But we shall soon need - * to do more, for when the ZERO_PAGE count wraps negative. - */ - if (PageReserved(page)) return 1; + } if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; return 0; @@ -521,11 +523,11 @@ static void __free_pages_ok(struct page *page, unsigned int order) { unsigned long flags; int i; - int reserved = 0; + int bad = 0; for (i = 0 ; i < (1 << order) ; ++i) - reserved += free_pages_check(page + i); - if (reserved) + bad += free_pages_check(page + i); + if (bad) return; if (!PageHighMem(page)) { @@ -610,17 +612,11 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) if (unlikely(page_mapcount(page) | (page->mapping != NULL) | (page_count(page) != 0) | - (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) + (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) { bad_page(page); - - /* - * For now, we report if PG_reserved was found set, but do not - * clear it, and do not allocate the page: as a safety net. - */ - if (PageReserved(page)) return 1; + } - page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; set_page_private(page, 0); set_page_refcounted(page); -- cgit v1.2.3-70-g09d2 From 3dc147414ccad81dc33edb80774b1fed12a38c08 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 6 Jan 2009 14:40:08 -0800 Subject: badpage: replace page_remove_rmap Eeek and BUG Now that bad pages are kept out of circulation, there is no need for the infamous page_remove_rmap() BUG() - once that page is freed, its negative mapcount will issue a "Bad page state" message and the page won't be freed. Removing the BUG() allows more info, on subsequent pages, to be gathered. We do have more info about the page at this point than bad_page() can know - notably, what the pmd is, which might pinpoint something like low 64kB corruption - but page_remove_rmap() isn't given the address to find that. In practice, there is only one call to page_remove_rmap() which has ever reported anything, that from zap_pte_range() (usually on exit, sometimes on munmap). It has all the info, so remove page_remove_rmap()'s "Eeek" message and leave it all to zap_pte_range(). mm/memory.c already has a hardly used print_bad_pte() function, showing some of the appropriate info: extend it to show what we want for the rmap case: pte info, page info (when there is a page) and vma info to compare. zap_pte_range() already knows the pmd, but print_bad_pte() is easier to use if it works that out for itself. Some of this info is also shown in bad_page()'s "Bad page state" message. Keep them separate, but adjust them to match each other as far as possible. Say "Bad page map" in print_bad_pte(), and add a TAINT_BAD_PAGE there too. print_bad_pte() show current->comm unconditionally (though it should get repeated in the usually irrelevant stack trace): sorry, I misled Nick Piggin to make it conditional on vm_mm == current->mm, but current->mm is already NULL in the exit case. Usually current->comm is good, though exceptionally it may not be that of the mm (when "swapoff" for example). Signed-off-by: Hugh Dickins Cc: Nick Piggin Cc: Christoph Lameter Cc: Mel Gorman Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 52 ++++++++++++++++++++++++++++++++++++++++------------ mm/page_alloc.c | 16 ++++++++-------- mm/rmap.c | 16 ---------------- 3 files changed, 48 insertions(+), 36 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/memory.c b/mm/memory.c index 89339c61f8e..cda04b19f73 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -52,6 +52,9 @@ #include #include #include +#include +#include +#include #include #include @@ -59,9 +62,6 @@ #include #include -#include -#include - #include "internal.h" #ifndef CONFIG_NEED_MULTIPLE_NODES @@ -375,15 +375,41 @@ static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss) * * The calling function must still handle the error. */ -static void print_bad_pte(struct vm_area_struct *vma, pte_t pte, - unsigned long vaddr) -{ - printk(KERN_ERR "Bad pte = %08llx, process = %s, " - "vm_flags = %lx, vaddr = %lx\n", - (long long)pte_val(pte), - (vma->vm_mm == current->mm ? current->comm : "???"), - vma->vm_flags, vaddr); +static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, + pte_t pte, struct page *page) +{ + pgd_t *pgd = pgd_offset(vma->vm_mm, addr); + pud_t *pud = pud_offset(pgd, addr); + pmd_t *pmd = pmd_offset(pud, addr); + struct address_space *mapping; + pgoff_t index; + + mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL; + index = linear_page_index(vma, addr); + + printk(KERN_EMERG "Bad page map in process %s pte:%08llx pmd:%08llx\n", + current->comm, + (long long)pte_val(pte), (long long)pmd_val(*pmd)); + if (page) { + printk(KERN_EMERG + "page:%p flags:%p count:%d mapcount:%d mapping:%p index:%lx\n", + page, (void *)page->flags, page_count(page), + page_mapcount(page), page->mapping, page->index); + } + printk(KERN_EMERG + "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n", + (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); + /* + * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y + */ + if (vma->vm_ops) + print_symbol(KERN_EMERG "vma->vm_ops->fault: %s\n", + (unsigned long)vma->vm_ops->fault); + if (vma->vm_file && vma->vm_file->f_op) + print_symbol(KERN_EMERG "vma->vm_file->f_op->mmap: %s\n", + (unsigned long)vma->vm_file->f_op->mmap); dump_stack(); + add_taint(TAINT_BAD_PAGE); } static inline int is_cow_mapping(unsigned int flags) @@ -773,6 +799,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, file_rss--; } page_remove_rmap(page, vma); + if (unlikely(page_mapcount(page) < 0)) + print_bad_pte(vma, addr, ptent, page); tlb_remove_page(tlb, page); continue; } @@ -2684,7 +2712,7 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, /* * Page table corrupted: show pte and kill process. */ - print_bad_pte(vma, orig_pte, address); + print_bad_pte(vma, address, orig_pte, NULL); return VM_FAULT_OOM; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index bd330252fc7..3acb216e9a7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -222,14 +222,14 @@ static inline int bad_range(struct zone *zone, struct page *page) static void bad_page(struct page *page) { - printk(KERN_EMERG "Bad page state in process '%s'\n" KERN_EMERG - "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n", - current->comm, page, (int)(2*sizeof(unsigned long)), - (unsigned long)page->flags, page->mapping, - page_mapcount(page), page_count(page)); - - printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n" - KERN_EMERG "Backtrace:\n"); + printk(KERN_EMERG "Bad page state in process %s pfn:%05lx\n", + current->comm, page_to_pfn(page)); + printk(KERN_EMERG + "page:%p flags:%p count:%d mapcount:%d mapping:%p index:%lx\n", + page, (void *)page->flags, page_count(page), + page_mapcount(page), page->mapping, page->index); + printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n"); + dump_stack(); /* Leave bad fields for debug, except PageBuddy could make trouble */ diff --git a/mm/rmap.c b/mm/rmap.c index b1770b11a57..32098255082 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -47,7 +47,6 @@ #include #include #include -#include #include #include #include @@ -725,21 +724,6 @@ void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long void page_remove_rmap(struct page *page, struct vm_area_struct *vma) { if (atomic_add_negative(-1, &page->_mapcount)) { - if (unlikely(page_mapcount(page) < 0)) { - printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page)); - printk (KERN_EMERG " page pfn = %lx\n", page_to_pfn(page)); - printk (KERN_EMERG " page->flags = %lx\n", page->flags); - printk (KERN_EMERG " page->count = %x\n", page_count(page)); - printk (KERN_EMERG " page->mapping = %p\n", page->mapping); - print_symbol (KERN_EMERG " vma->vm_ops = %s\n", (unsigned long)vma->vm_ops); - if (vma->vm_ops) { - print_symbol (KERN_EMERG " vma->vm_ops->fault = %s\n", (unsigned long)vma->vm_ops->fault); - } - if (vma->vm_file && vma->vm_file->f_op) - print_symbol (KERN_EMERG " vma->vm_file->f_op->mmap = %s\n", (unsigned long)vma->vm_file->f_op->mmap); - BUG(); - } - /* * Now that the last pte has gone, s390 must transfer dirty * flag from storage key to struct page. We can usually skip -- cgit v1.2.3-70-g09d2 From 22b31eec63e5f2e219a3ee15f456897272bc73e8 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 6 Jan 2009 14:40:09 -0800 Subject: badpage: vm_normal_page use print_bad_pte print_bad_pte() is so far being called only when zap_pte_range() finds negative page_mapcount, or there's a fault on a pte_file where it does not belong. That's weak coverage when we suspect pagetable corruption. Originally, it was called when vm_normal_page() found an invalid pfn: but pfn_valid is expensive on some architectures and configurations, so 2.6.24 put that under CONFIG_DEBUG_VM (which doesn't help in the field), then 2.6.26 replaced it by a VM_BUG_ON (likewise). Reinstate the print_bad_pte() in vm_normal_page(), but use a cheaper test than pfn_valid(): memmap_init_zone() (used in bootup and hotplug) keep a __read_mostly note of the highest_memmap_pfn, vm_normal_page() then check pfn against that. We could call this pfn_plausible() or pfn_sane(), but I doubt we'll need it elsewhere: of course it's not reliable, but gives much stronger pagetable validation on many boxes. Also use print_bad_pte() when the pte_special bit is found outside a VM_PFNMAP or VM_MIXEDMAP area, instead of VM_BUG_ON. Signed-off-by: Hugh Dickins Cc: Nick Piggin Cc: Christoph Lameter Cc: Mel Gorman Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 1 + mm/memory.c | 20 ++++++++++---------- mm/page_alloc.c | 4 ++++ 3 files changed, 15 insertions(+), 10 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/internal.h b/mm/internal.h index 13333bc2eb6..1981bc9454f 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -49,6 +49,7 @@ extern void putback_lru_page(struct page *page); /* * in mm/page_alloc.c */ +extern unsigned long highest_memmap_pfn; extern void __free_pages_bootmem(struct page *page, unsigned int order); /* diff --git a/mm/memory.c b/mm/memory.c index cda04b19f73..890095f5f36 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -467,21 +467,18 @@ static inline int is_cow_mapping(unsigned int flags) struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte) { - unsigned long pfn; + unsigned long pfn = pte_pfn(pte); if (HAVE_PTE_SPECIAL) { - if (likely(!pte_special(pte))) { - VM_BUG_ON(!pfn_valid(pte_pfn(pte))); - return pte_page(pte); - } - VM_BUG_ON(!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))); + if (likely(!pte_special(pte))) + goto check_pfn; + if (!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))) + print_bad_pte(vma, addr, pte, NULL); return NULL; } /* !HAVE_PTE_SPECIAL case follows: */ - pfn = pte_pfn(pte); - if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { if (vma->vm_flags & VM_MIXEDMAP) { if (!pfn_valid(pfn)) @@ -497,11 +494,14 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, } } - VM_BUG_ON(!pfn_valid(pfn)); +check_pfn: + if (unlikely(pfn > highest_memmap_pfn)) { + print_bad_pte(vma, addr, pte, NULL); + return NULL; + } /* * NOTE! We still have PageReserved() pages in the page tables. - * * eg. VDSO mappings can cause them to exist. */ out: diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3acb216e9a7..755c99a0ac7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -69,6 +69,7 @@ EXPORT_SYMBOL(node_states); unsigned long totalram_pages __read_mostly; unsigned long totalreserve_pages __read_mostly; +unsigned long highest_memmap_pfn __read_mostly; int percpu_pagelist_fraction; #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE @@ -2597,6 +2598,9 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, unsigned long pfn; struct zone *z; + if (highest_memmap_pfn < end_pfn - 1) + highest_memmap_pfn = end_pfn - 1; + z = &NODE_DATA(nid)->node_zones[zone]; for (pfn = start_pfn; pfn < end_pfn; pfn++) { /* -- cgit v1.2.3-70-g09d2 From d936cf9b39b06c8d2e0d7fb5e7b4f176e18dec69 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 6 Jan 2009 14:40:12 -0800 Subject: badpage: ratelimit print_bad_pte and bad_page print_bad_pte() and bad_page() might each need ratelimiting - especially for their dump_stacks, almost never of interest, yet not quite dispensible. Correlating corruption across neighbouring entries can be very helpful, so allow a burst of 60 reports before keeping quiet for the remainder of that minute (or allow a steady drip of one report per second). Signed-off-by: Hugh Dickins Cc: Nick Piggin Cc: Christoph Lameter Cc: Mel Gorman Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 23 +++++++++++++++++++++++ mm/page_alloc.c | 26 +++++++++++++++++++++++++- 2 files changed, 48 insertions(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/memory.c b/mm/memory.c index 0f9abbaf618..b12888c1b4e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -383,6 +383,29 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd = pmd_offset(pud, addr); struct address_space *mapping; pgoff_t index; + static unsigned long resume; + static unsigned long nr_shown; + static unsigned long nr_unshown; + + /* + * Allow a burst of 60 reports, then keep quiet for that minute; + * or allow a steady drip of one report per second. + */ + if (nr_shown == 60) { + if (time_before(jiffies, resume)) { + nr_unshown++; + return; + } + if (nr_unshown) { + printk(KERN_EMERG + "Bad page map: %lu messages suppressed\n", + nr_unshown); + nr_unshown = 0; + } + nr_shown = 0; + } + if (nr_shown++ == 0) + resume = jiffies + 60 * HZ; mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL; index = linear_page_index(vma, addr); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 755c99a0ac7..d1a80f652c6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -223,6 +223,30 @@ static inline int bad_range(struct zone *zone, struct page *page) static void bad_page(struct page *page) { + static unsigned long resume; + static unsigned long nr_shown; + static unsigned long nr_unshown; + + /* + * Allow a burst of 60 reports, then keep quiet for that minute; + * or allow a steady drip of one report per second. + */ + if (nr_shown == 60) { + if (time_before(jiffies, resume)) { + nr_unshown++; + goto out; + } + if (nr_unshown) { + printk(KERN_EMERG + "Bad page state: %lu messages suppressed\n", + nr_unshown); + nr_unshown = 0; + } + nr_shown = 0; + } + if (nr_shown++ == 0) + resume = jiffies + 60 * HZ; + printk(KERN_EMERG "Bad page state in process %s pfn:%05lx\n", current->comm, page_to_pfn(page)); printk(KERN_EMERG @@ -232,7 +256,7 @@ static void bad_page(struct page *page) printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n"); dump_stack(); - +out: /* Leave bad fields for debug, except PageBuddy could make trouble */ __ClearPageBuddy(page); add_taint(TAINT_BAD_PAGE); -- cgit v1.2.3-70-g09d2 From 1e9e63650d6cb88e6d6d2ca6cc3ee276c26de4a3 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 6 Jan 2009 14:40:13 -0800 Subject: badpage: KERN_ALERT BUG instead of KERN_EMERG bad_page() and rmap Eeek messages have said KERN_EMERG for a few years, which I've followed in print_bad_pte(). These are serious system errors, on a par with BUGs, but they're not quite emergencies, and we do our best to carry on: say KERN_ALERT "BUG: " like the x86 oops does. And remove the "Trying to fix it up, but a reboot is needed" line: it's not untrue, but I hope the KERN_ALERT "BUG: " conveys as much. Signed-off-by: Hugh Dickins Cc: Nick Piggin Cc: Christoph Lameter Cc: Mel Gorman Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 15 ++++++++------- mm/page_alloc.c | 9 ++++----- 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/memory.c b/mm/memory.c index b12888c1b4e..db68af8e0bc 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -397,8 +397,8 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, return; } if (nr_unshown) { - printk(KERN_EMERG - "Bad page map: %lu messages suppressed\n", + printk(KERN_ALERT + "BUG: Bad page map: %lu messages suppressed\n", nr_unshown); nr_unshown = 0; } @@ -410,26 +410,27 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL; index = linear_page_index(vma, addr); - printk(KERN_EMERG "Bad page map in process %s pte:%08llx pmd:%08llx\n", + printk(KERN_ALERT + "BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n", current->comm, (long long)pte_val(pte), (long long)pmd_val(*pmd)); if (page) { - printk(KERN_EMERG + printk(KERN_ALERT "page:%p flags:%p count:%d mapcount:%d mapping:%p index:%lx\n", page, (void *)page->flags, page_count(page), page_mapcount(page), page->mapping, page->index); } - printk(KERN_EMERG + printk(KERN_ALERT "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n", (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); /* * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y */ if (vma->vm_ops) - print_symbol(KERN_EMERG "vma->vm_ops->fault: %s\n", + print_symbol(KERN_ALERT "vma->vm_ops->fault: %s\n", (unsigned long)vma->vm_ops->fault); if (vma->vm_file && vma->vm_file->f_op) - print_symbol(KERN_EMERG "vma->vm_file->f_op->mmap: %s\n", + print_symbol(KERN_ALERT "vma->vm_file->f_op->mmap: %s\n", (unsigned long)vma->vm_file->f_op->mmap); dump_stack(); add_taint(TAINT_BAD_PAGE); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d1a80f652c6..d531e8ef998 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -237,8 +237,8 @@ static void bad_page(struct page *page) goto out; } if (nr_unshown) { - printk(KERN_EMERG - "Bad page state: %lu messages suppressed\n", + printk(KERN_ALERT + "BUG: Bad page state: %lu messages suppressed\n", nr_unshown); nr_unshown = 0; } @@ -247,13 +247,12 @@ static void bad_page(struct page *page) if (nr_shown++ == 0) resume = jiffies + 60 * HZ; - printk(KERN_EMERG "Bad page state in process %s pfn:%05lx\n", + printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n", current->comm, page_to_pfn(page)); - printk(KERN_EMERG + printk(KERN_ALERT "page:%p flags:%p count:%d mapcount:%d mapping:%p index:%lx\n", page, (void *)page->flags, page_count(page), page_mapcount(page), page->mapping, page->index); - printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n"); dump_stack(); out: -- cgit v1.2.3-70-g09d2 From 9f572e3f96b8a2ef70dcb881e64c7b9c10057d98 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 6 Jan 2009 14:40:29 -0800 Subject: mm: remove CONFIG_OUT_OF_LINE_PFN_TO_PAGE No architectures use CONFIG_OUT_OF_LINE_PFN_TO_PAGE - it can be removed. Signed-off-by: KOSAKI Motohiro Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/memory_model.h | 7 ------- mm/page_alloc.c | 13 ------------- 2 files changed, 20 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/include/asm-generic/memory_model.h b/include/asm-generic/memory_model.h index 36fa286adad..4c8d0afae71 100644 --- a/include/asm-generic/memory_model.h +++ b/include/asm-generic/memory_model.h @@ -69,15 +69,8 @@ }) #endif /* CONFIG_FLATMEM/DISCONTIGMEM/SPARSEMEM */ -#ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE -struct page; -/* this is useful when inlined pfn_to_page is too big */ -extern struct page *pfn_to_page(unsigned long pfn); -extern unsigned long page_to_pfn(struct page *page); -#else #define page_to_pfn __page_to_pfn #define pfn_to_page __pfn_to_page -#endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */ #endif /* __ASSEMBLY__ */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d531e8ef998..7bf22e04531 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4587,19 +4587,6 @@ void *__init alloc_large_system_hash(const char *tablename, return table; } -#ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE -struct page *pfn_to_page(unsigned long pfn) -{ - return __pfn_to_page(pfn); -} -unsigned long page_to_pfn(struct page *page) -{ - return __page_to_pfn(page); -} -EXPORT_SYMBOL(pfn_to_page); -EXPORT_SYMBOL(page_to_pfn); -#endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */ - /* Return a pointer to the bitmap storing bits affecting a block of pages */ static inline unsigned long *get_pageblock_bitmap(struct zone *zone, unsigned long pfn) -- cgit v1.2.3-70-g09d2