Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig          |   4
-rw-r--r-- | mm/Makefile         |   4
-rw-r--r-- | mm/allocpercpu.c    |   9
-rw-r--r-- | mm/filemap.c        |  72
-rw-r--r-- | mm/highmem.c        |   7
-rw-r--r-- | mm/hugetlb.c        |  25
-rw-r--r-- | mm/memory.c         |   9
-rw-r--r-- | mm/mempolicy.c      |  10
-rw-r--r-- | mm/mempool.c        |   3
-rw-r--r-- | mm/migrate.c        |   3
-rw-r--r-- | mm/page-writeback.c |   1
-rw-r--r-- | mm/page_alloc.c     | 292
-rw-r--r-- | mm/pdflush.c        |   1
-rw-r--r-- | mm/shmem.c          |   8
-rw-r--r-- | mm/slab.c           |  86
-rw-r--r-- | mm/slob.c           |  56
-rw-r--r-- | mm/slub.c           | 668
-rw-r--r-- | mm/swap_state.c     |   3
-rw-r--r-- | mm/truncate.c       |   2
-rw-r--r-- | mm/util.c           |  48
-rw-r--r-- | mm/vmalloc.c        |  19
-rw-r--r-- | mm/vmscan.c         | 212
-rw-r--r-- | mm/vmstat.c         |   2
23 files changed, 970 insertions(+), 574 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig index 086af703da4..86187221e78 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -163,6 +163,10 @@ config ZONE_DMA_FLAG default "0" if !ZONE_DMA default "1" +config BOUNCE + def_bool y + depends on BLOCK && MMU && (ZONE_DMA || HIGHMEM) + config NR_QUICK int depends on QUICKLIST diff --git a/mm/Makefile b/mm/Makefile index a9148ea329a..245e33ab00c 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -13,9 +13,7 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ $(mmu-y) -ifeq ($(CONFIG_MMU)$(CONFIG_BLOCK),yy) -obj-y += bounce.o -endif +obj-$(CONFIG_BOUNCE) += bounce.o obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o obj-$(CONFIG_HUGETLBFS) += hugetlb.o obj-$(CONFIG_NUMA) += mempolicy.o diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c index b2486cf887a..00b02623f00 100644 --- a/mm/allocpercpu.c +++ b/mm/allocpercpu.c @@ -53,12 +53,9 @@ void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu) int node = cpu_to_node(cpu); BUG_ON(pdata->ptrs[cpu]); - if (node_online(node)) { - /* FIXME: kzalloc_node(size, gfp, node) */ - pdata->ptrs[cpu] = kmalloc_node(size, gfp, node); - if (pdata->ptrs[cpu]) - memset(pdata->ptrs[cpu], 0, size); - } else + if (node_online(node)) + pdata->ptrs[cpu] = kmalloc_node(size, gfp|__GFP_ZERO, node); + else pdata->ptrs[cpu] = kzalloc(size, gfp); return pdata->ptrs[cpu]; } diff --git a/mm/filemap.c b/mm/filemap.c index 100b99c2d50..5d5449f3d41 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -867,13 +867,11 @@ void do_generic_mapping_read(struct address_space *mapping, { struct inode *inode = mapping->host; unsigned long index; - unsigned long end_index; unsigned long offset; unsigned long last_index; unsigned long next_index; unsigned long prev_index; unsigned int prev_offset; - loff_t isize; struct page *cached_page; int error; struct file_ra_state ra = *_ra; @@ -886,27 +884,12 @@ void do_generic_mapping_read(struct address_space *mapping, last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; offset = *ppos & ~PAGE_CACHE_MASK; - isize = i_size_read(inode); - if (!isize) - goto out; - - end_index = (isize - 1) >> PAGE_CACHE_SHIFT; for (;;) { struct page *page; + unsigned long end_index; + loff_t isize; unsigned long nr, ret; - /* nr is the maximum number of bytes to copy from this page */ - nr = PAGE_CACHE_SIZE; - if (index >= end_index) { - if (index > end_index) - goto out; - nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1; - if (nr <= offset) { - goto out; - } - } - nr = nr - offset; - cond_resched(); if (index == next_index) next_index = page_cache_readahead(mapping, &ra, filp, @@ -921,6 +904,32 @@ find_page: if (!PageUptodate(page)) goto page_not_up_to_date; page_ok: + /* + * i_size must be checked after we know the page is Uptodate. + * + * Checking i_size after the check allows us to calculate + * the correct value for "nr", which means the zero-filled + * part of the page is not copied back to userspace (unless + * another truncate extends the file - this is desired though). 
+ */ + + isize = i_size_read(inode); + end_index = (isize - 1) >> PAGE_CACHE_SHIFT; + if (unlikely(!isize || index > end_index)) { + page_cache_release(page); + goto out; + } + + /* nr is the maximum number of bytes to copy from this page */ + nr = PAGE_CACHE_SIZE; + if (index == end_index) { + nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1; + if (nr <= offset) { + page_cache_release(page); + goto out; + } + } + nr = nr - offset; /* If users can be writing to this page using arbitrary * virtual addresses, take care about potential aliasing @@ -1007,31 +1016,6 @@ readpage: unlock_page(page); } - /* - * i_size must be checked after we have done ->readpage. - * - * Checking i_size after the readpage allows us to calculate - * the correct value for "nr", which means the zero-filled - * part of the page is not copied back to userspace (unless - * another truncate extends the file - this is desired though). - */ - isize = i_size_read(inode); - end_index = (isize - 1) >> PAGE_CACHE_SHIFT; - if (unlikely(!isize || index > end_index)) { - page_cache_release(page); - goto out; - } - - /* nr is the maximum number of bytes to copy from this page */ - nr = PAGE_CACHE_SIZE; - if (index == end_index) { - nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1; - if (nr <= offset) { - page_cache_release(page); - goto out; - } - } - nr = nr - offset; goto page_ok; readpage_error: diff --git a/mm/highmem.c b/mm/highmem.c index be8f8d36a8b..7a967bc3515 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -46,9 +46,14 @@ unsigned int nr_free_highpages (void) pg_data_t *pgdat; unsigned int pages = 0; - for_each_online_pgdat(pgdat) + for_each_online_pgdat(pgdat) { pages += zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM], NR_FREE_PAGES); + if (zone_movable_is_highmem()) + pages += zone_page_state( + &pgdat->node_zones[ZONE_MOVABLE], + NR_FREE_PAGES); + } return pages; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index acc0fb3cf06..6912bbf33fa 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -27,6 +27,9 @@ unsigned long max_huge_pages; static struct list_head hugepage_freelists[MAX_NUMNODES]; static unsigned int nr_huge_pages_node[MAX_NUMNODES]; static unsigned int free_huge_pages_node[MAX_NUMNODES]; +static gfp_t htlb_alloc_mask = GFP_HIGHUSER; +unsigned long hugepages_treat_as_movable; + /* * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages */ @@ -68,12 +71,13 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma, { int nid; struct page *page = NULL; - struct zonelist *zonelist = huge_zonelist(vma, address); + struct zonelist *zonelist = huge_zonelist(vma, address, + htlb_alloc_mask); struct zone **z; for (z = zonelist->zones; *z; z++) { nid = zone_to_nid(*z); - if (cpuset_zone_allowed_softwall(*z, GFP_HIGHUSER) && + if (cpuset_zone_allowed_softwall(*z, htlb_alloc_mask) && !list_empty(&hugepage_freelists[nid])) break; } @@ -113,7 +117,7 @@ static int alloc_fresh_huge_page(void) prev_nid = nid; spin_unlock(&nid_lock); - page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN, + page = alloc_pages_node(nid, htlb_alloc_mask|__GFP_COMP|__GFP_NOWARN, HUGETLB_PAGE_ORDER); if (page) { set_compound_page_dtor(page, free_huge_page); @@ -263,6 +267,19 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write, max_huge_pages = set_max_huge_pages(max_huge_pages); return 0; } + +int hugetlb_treat_movable_handler(struct ctl_table *table, int write, + struct file *file, void __user *buffer, + size_t *length, loff_t *ppos) +{ + proc_dointvec(table, write, file, buffer, length, ppos); + if 
(hugepages_treat_as_movable) + htlb_alloc_mask = GFP_HIGHUSER_MOVABLE; + else + htlb_alloc_mask = GFP_HIGHUSER; + return 0; +} + #endif /* CONFIG_SYSCTL */ int hugetlb_report_meminfo(char *buf) @@ -481,7 +498,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, return VM_FAULT_MINOR; } -int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, +static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *ptep, int write_access) { int ret = VM_FAULT_SIGBUS; diff --git a/mm/memory.c b/mm/memory.c index b3d73bb1f68..9c6ff7fffdc 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1715,11 +1715,11 @@ gotten: if (unlikely(anon_vma_prepare(vma))) goto oom; if (old_page == ZERO_PAGE(address)) { - new_page = alloc_zeroed_user_highpage(vma, address); + new_page = alloc_zeroed_user_highpage_movable(vma, address); if (!new_page) goto oom; } else { - new_page = alloc_page_vma(GFP_HIGHUSER, vma, address); + new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); if (!new_page) goto oom; cow_user_page(new_page, old_page, address, vma); @@ -2237,7 +2237,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, if (unlikely(anon_vma_prepare(vma))) goto oom; - page = alloc_zeroed_user_highpage(vma, address); + page = alloc_zeroed_user_highpage_movable(vma, address); if (!page) goto oom; @@ -2340,7 +2340,8 @@ retry: if (unlikely(anon_vma_prepare(vma))) goto oom; - page = alloc_page_vma(GFP_HIGHUSER, vma, address); + page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, + vma, address); if (!page) goto oom; copy_user_highpage(page, new_page, address, vma); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 188f8d9c4ae..9f4e9b95e8f 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -594,7 +594,7 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist, static struct page *new_node_page(struct page *page, unsigned long node, int **x) { - return alloc_pages_node(node, GFP_HIGHUSER, 0); + return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0); } /* @@ -710,7 +710,8 @@ static struct page *new_vma_page(struct page *page, unsigned long private, int * { struct vm_area_struct *vma = (struct vm_area_struct *)private; - return alloc_page_vma(GFP_HIGHUSER, vma, page_address_in_vma(page, vma)); + return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, + page_address_in_vma(page, vma)); } #else @@ -1202,7 +1203,8 @@ static inline unsigned interleave_nid(struct mempolicy *pol, #ifdef CONFIG_HUGETLBFS /* Return a zonelist suitable for a huge page allocation. 
*/ -struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr) +struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, + gfp_t gfp_flags) { struct mempolicy *pol = get_vma_policy(current, vma, addr); @@ -1210,7 +1212,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr) unsigned nid; nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT); - return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER); + return NODE_DATA(nid)->node_zonelists + gfp_zone(gfp_flags); } return zonelist_policy(GFP_HIGHUSER, pol); } diff --git a/mm/mempool.c b/mm/mempool.c index 3e8f1fed0e1..02d5ec3feab 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -62,10 +62,9 @@ mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, void *pool_data, int node_id) { mempool_t *pool; - pool = kmalloc_node(sizeof(*pool), GFP_KERNEL, node_id); + pool = kmalloc_node(sizeof(*pool), GFP_KERNEL | __GFP_ZERO, node_id); if (!pool) return NULL; - memset(pool, 0, sizeof(*pool)); pool->elements = kmalloc_node(min_nr * sizeof(void *), GFP_KERNEL, node_id); if (!pool->elements) { diff --git a/mm/migrate.c b/mm/migrate.c index a91ca00abeb..34d8ada053e 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -761,7 +761,8 @@ static struct page *new_page_node(struct page *p, unsigned long private, *result = &pm->status; - return alloc_pages_node(pm->node, GFP_HIGHUSER | GFP_THISNODE, 0); + return alloc_pages_node(pm->node, + GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0); } /* diff --git a/mm/page-writeback.c b/mm/page-writeback.c index ea9da3bed3e..886ea0d5a13 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -824,6 +824,7 @@ int __set_page_dirty_nobuffers(struct page *page) mapping2 = page_mapping(page); if (mapping2) { /* Race with truncate? 
*/ BUG_ON(mapping2 != mapping); + WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); if (mapping_cap_account_dirty(mapping)) { __inc_zone_page_state(page, NR_FILE_DIRTY); task_io_account_write(PAGE_CACHE_SIZE); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f9e4e647d7e..e2a10b957f2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -80,8 +80,9 @@ int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, #endif #ifdef CONFIG_HIGHMEM - 32 + 32, #endif + 32, }; EXPORT_SYMBOL(totalram_pages); @@ -95,8 +96,9 @@ static char * const zone_names[MAX_NR_ZONES] = { #endif "Normal", #ifdef CONFIG_HIGHMEM - "HighMem" + "HighMem", #endif + "Movable", }; int min_free_kbytes = 1024; @@ -134,6 +136,13 @@ static unsigned long __meminitdata dma_reserve; static unsigned long __meminitdata node_boundary_start_pfn[MAX_NUMNODES]; static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES]; #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ + unsigned long __initdata required_kernelcore; + unsigned long __initdata required_movablecore; + unsigned long __initdata zone_movable_pfn[MAX_NUMNODES]; + + /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ + int movable_zone; + EXPORT_SYMBOL(movable_zone); #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ #if MAX_NUMNODES > 1 @@ -1324,7 +1333,7 @@ nofail_alloc: reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; - did_some_progress = try_to_free_pages(zonelist->zones, gfp_mask); + did_some_progress = try_to_free_pages(zonelist->zones, order, gfp_mask); p->reclaim_state = NULL; p->flags &= ~PF_MEMALLOC; @@ -1361,7 +1370,8 @@ nofail_alloc: */ do_retry = 0; if (!(gfp_mask & __GFP_NORETRY)) { - if ((order <= 3) || (gfp_mask & __GFP_REPEAT)) + if ((order <= PAGE_ALLOC_COSTLY_ORDER) || + (gfp_mask & __GFP_REPEAT)) do_retry = 1; if (gfp_mask & __GFP_NOFAIL) do_retry = 1; @@ -1474,13 +1484,14 @@ unsigned int nr_free_buffer_pages(void) { return nr_free_zone_pages(gfp_zone(GFP_USER)); } +EXPORT_SYMBOL_GPL(nr_free_buffer_pages); /* * Amount of free RAM allocatable within all zones */ unsigned int nr_free_pagecache_pages(void) { - return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER)); + return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE)); } static inline void show_node(struct zone *zone) @@ -2667,6 +2678,63 @@ void __meminit get_pfn_range_for_nid(unsigned int nid, } /* + * This finds a zone that can be used for ZONE_MOVABLE pages. The + * assumption is made that zones within a node are ordered in monotonic + * increasing memory addresses so that the "highest" populated zone is used + */ +void __init find_usable_zone_for_movable(void) +{ + int zone_index; + for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) { + if (zone_index == ZONE_MOVABLE) + continue; + + if (arch_zone_highest_possible_pfn[zone_index] > + arch_zone_lowest_possible_pfn[zone_index]) + break; + } + + VM_BUG_ON(zone_index == -1); + movable_zone = zone_index; +} + +/* + * The zone ranges provided by the architecture do not include ZONE_MOVABLE + * because it is sized independant of architecture. Unlike the other zones, + * the starting point for ZONE_MOVABLE is not fixed. It may be different + * in each node depending on the size of each node and how evenly kernelcore + * is distributed. This helper function adjusts the zone ranges + * provided by the architecture for a given node by using the end of the + * highest usable zone for ZONE_MOVABLE. 
This preserves the assumption that + * zones within a node are in order of monotonic increases memory addresses + */ +void __meminit adjust_zone_range_for_zone_movable(int nid, + unsigned long zone_type, + unsigned long node_start_pfn, + unsigned long node_end_pfn, + unsigned long *zone_start_pfn, + unsigned long *zone_end_pfn) +{ + /* Only adjust if ZONE_MOVABLE is on this node */ + if (zone_movable_pfn[nid]) { + /* Size ZONE_MOVABLE */ + if (zone_type == ZONE_MOVABLE) { + *zone_start_pfn = zone_movable_pfn[nid]; + *zone_end_pfn = min(node_end_pfn, + arch_zone_highest_possible_pfn[movable_zone]); + + /* Adjust for ZONE_MOVABLE starting within this range */ + } else if (*zone_start_pfn < zone_movable_pfn[nid] && + *zone_end_pfn > zone_movable_pfn[nid]) { + *zone_end_pfn = zone_movable_pfn[nid]; + + /* Check if this whole range is within ZONE_MOVABLE */ + } else if (*zone_start_pfn >= zone_movable_pfn[nid]) + *zone_start_pfn = *zone_end_pfn; + } +} + +/* * Return the number of pages a zone spans in a node, including holes * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() */ @@ -2681,6 +2749,9 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid, get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; + adjust_zone_range_for_zone_movable(nid, zone_type, + node_start_pfn, node_end_pfn, + &zone_start_pfn, &zone_end_pfn); /* Check that this node has pages within the zone's required range */ if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn) @@ -2771,6 +2842,9 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid, zone_end_pfn = min(arch_zone_highest_possible_pfn[zone_type], node_end_pfn); + adjust_zone_range_for_zone_movable(nid, zone_type, + node_start_pfn, node_end_pfn, + &zone_start_pfn, &zone_end_pfn); return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); } @@ -3148,6 +3222,157 @@ unsigned long __init find_max_pfn_with_active_regions(void) return max_pfn; } +unsigned long __init early_calculate_totalpages(void) +{ + int i; + unsigned long totalpages = 0; + + for (i = 0; i < nr_nodemap_entries; i++) + totalpages += early_node_map[i].end_pfn - + early_node_map[i].start_pfn; + + return totalpages; +} + +/* + * Find the PFN the Movable zone begins in each node. Kernel memory + * is spread evenly between nodes as long as the nodes have enough + * memory. When they don't, some nodes will have more kernelcore than + * others + */ +void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) +{ + int i, nid; + unsigned long usable_startpfn; + unsigned long kernelcore_node, kernelcore_remaining; + int usable_nodes = num_online_nodes(); + + /* + * If movablecore was specified, calculate what size of + * kernelcore that corresponds so that memory usable for + * any allocation type is evenly spread. If both kernelcore + * and movablecore are specified, then the value of kernelcore + * will be used for required_kernelcore if it's greater than + * what movablecore would have allowed. 
+ */ + if (required_movablecore) { + unsigned long totalpages = early_calculate_totalpages(); + unsigned long corepages; + + /* + * Round-up so that ZONE_MOVABLE is at least as large as what + * was requested by the user + */ + required_movablecore = + roundup(required_movablecore, MAX_ORDER_NR_PAGES); + corepages = totalpages - required_movablecore; + + required_kernelcore = max(required_kernelcore, corepages); + } + + /* If kernelcore was not specified, there is no ZONE_MOVABLE */ + if (!required_kernelcore) + return; + + /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ + find_usable_zone_for_movable(); + usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; + +restart: + /* Spread kernelcore memory as evenly as possible throughout nodes */ + kernelcore_node = required_kernelcore / usable_nodes; + for_each_online_node(nid) { + /* + * Recalculate kernelcore_node if the division per node + * now exceeds what is necessary to satisfy the requested + * amount of memory for the kernel + */ + if (required_kernelcore < kernelcore_node) + kernelcore_node = required_kernelcore / usable_nodes; + + /* + * As the map is walked, we track how much memory is usable + * by the kernel using kernelcore_remaining. When it is + * 0, the rest of the node is usable by ZONE_MOVABLE + */ + kernelcore_remaining = kernelcore_node; + + /* Go through each range of PFNs within this node */ + for_each_active_range_index_in_nid(i, nid) { + unsigned long start_pfn, end_pfn; + unsigned long size_pages; + + start_pfn = max(early_node_map[i].start_pfn, + zone_movable_pfn[nid]); + end_pfn = early_node_map[i].end_pfn; + if (start_pfn >= end_pfn) + continue; + + /* Account for what is only usable for kernelcore */ + if (start_pfn < usable_startpfn) { + unsigned long kernel_pages; + kernel_pages = min(end_pfn, usable_startpfn) + - start_pfn; + + kernelcore_remaining -= min(kernel_pages, + kernelcore_remaining); + required_kernelcore -= min(kernel_pages, + required_kernelcore); + + /* Continue if range is now fully accounted */ + if (end_pfn <= usable_startpfn) { + + /* + * Push zone_movable_pfn to the end so + * that if we have to rebalance + * kernelcore across nodes, we will + * not double account here + */ + zone_movable_pfn[nid] = end_pfn; + continue; + } + start_pfn = usable_startpfn; + } + + /* + * The usable PFN range for ZONE_MOVABLE is from + * start_pfn->end_pfn. Calculate size_pages as the + * number of pages used as kernelcore + */ + size_pages = end_pfn - start_pfn; + if (size_pages > kernelcore_remaining) + size_pages = kernelcore_remaining; + zone_movable_pfn[nid] = start_pfn + size_pages; + + /* + * Some kernelcore has been met, update counts and + * break if the kernelcore for this node has been + * satisified + */ + required_kernelcore -= min(required_kernelcore, + size_pages); + kernelcore_remaining -= size_pages; + if (!kernelcore_remaining) + break; + } + } + + /* + * If there is still required_kernelcore, we do another pass with one + * less node in the count. 
This will push zone_movable_pfn[nid] further + * along on the nodes that still have memory until kernelcore is + * satisified + */ + usable_nodes--; + if (usable_nodes && required_kernelcore > usable_nodes) + goto restart; + + /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ + for (nid = 0; nid < MAX_NUMNODES; nid++) + zone_movable_pfn[nid] = + roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); +} + /** * free_area_init_nodes - Initialise all pg_data_t and zone data * @max_zone_pfn: an array of max PFNs for each zone @@ -3177,19 +3402,37 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions(); arch_zone_highest_possible_pfn[0] = max_zone_pfn[0]; for (i = 1; i < MAX_NR_ZONES; i++) { + if (i == ZONE_MOVABLE) + continue; arch_zone_lowest_possible_pfn[i] = arch_zone_highest_possible_pfn[i-1]; arch_zone_highest_possible_pfn[i] = max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]); } + arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0; + arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0; + + /* Find the PFNs that ZONE_MOVABLE begins at in each node */ + memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); + find_zone_movable_pfns_for_nodes(zone_movable_pfn); /* Print out the zone ranges */ printk("Zone PFN ranges:\n"); - for (i = 0; i < MAX_NR_ZONES; i++) + for (i = 0; i < MAX_NR_ZONES; i++) { + if (i == ZONE_MOVABLE) + continue; printk(" %-8s %8lu -> %8lu\n", zone_names[i], arch_zone_lowest_possible_pfn[i], arch_zone_highest_possible_pfn[i]); + } + + /* Print out the PFNs ZONE_MOVABLE begins at in each node */ + printk("Movable zone start PFN for each node\n"); + for (i = 0; i < MAX_NUMNODES; i++) { + if (zone_movable_pfn[i]) + printk(" Node %d: %lu\n", i, zone_movable_pfn[i]); + } /* Print out the early_node_map[] */ printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries); @@ -3206,6 +3449,43 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) find_min_pfn_for_node(nid), NULL); } } + +static int __init cmdline_parse_core(char *p, unsigned long *core) +{ + unsigned long long coremem; + if (!p) + return -EINVAL; + + coremem = memparse(p, &p); + *core = coremem >> PAGE_SHIFT; + + /* Paranoid check that UL is enough for the coremem value */ + WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX); + + return 0; +} + +/* + * kernelcore=size sets the amount of memory for use for allocations that + * cannot be reclaimed or migrated. + */ +static int __init cmdline_parse_kernelcore(char *p) +{ + return cmdline_parse_core(p, &required_kernelcore); +} + +/* + * movablecore=size sets the amount of memory for use for allocations that + * can be reclaimed or migrated. 
+ */ +static int __init cmdline_parse_movablecore(char *p) +{ + return cmdline_parse_core(p, &required_movablecore); +} + +early_param("kernelcore", cmdline_parse_kernelcore); +early_param("movablecore", cmdline_parse_movablecore); + #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ /** diff --git a/mm/pdflush.c b/mm/pdflush.c index 8ce0900dc95..8f6ee073c0e 100644 --- a/mm/pdflush.c +++ b/mm/pdflush.c @@ -92,6 +92,7 @@ struct pdflush_work { static int __pdflush(struct pdflush_work *my_work) { current->flags |= PF_FLUSHER | PF_SWAPWRITE; + set_freezable(); my_work->fn = NULL; my_work->who = current; INIT_LIST_HEAD(&my_work->list); diff --git a/mm/shmem.c b/mm/shmem.c index 0493e4d0bca..96fa79fb6ad 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -27,6 +27,7 @@ #include <linux/init.h> #include <linux/fs.h> #include <linux/xattr.h> +#include <linux/exportfs.h> #include <linux/generic_acl.h> #include <linux/mm.h> #include <linux/mman.h> @@ -93,8 +94,11 @@ static inline struct page *shmem_dir_alloc(gfp_t gfp_mask) * The above definition of ENTRIES_PER_PAGE, and the use of * BLOCKS_PER_PAGE on indirect pages, assume PAGE_CACHE_SIZE: * might be reconsidered if it ever diverges from PAGE_SIZE. + * + * __GFP_MOVABLE is masked out as swap vectors cannot move */ - return alloc_pages(gfp_mask, PAGE_CACHE_SHIFT-PAGE_SHIFT); + return alloc_pages((gfp_mask & ~__GFP_MOVABLE) | __GFP_ZERO, + PAGE_CACHE_SHIFT-PAGE_SHIFT); } static inline void shmem_dir_free(struct page *page) @@ -372,7 +376,7 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long } spin_unlock(&info->lock); - page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping) | __GFP_ZERO); + page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping)); if (page) set_page_private(page, 0); spin_lock(&info->lock); diff --git a/mm/slab.c b/mm/slab.c index a453383333f..96d30ee256e 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -775,6 +775,9 @@ static inline struct kmem_cache *__find_general_cachep(size_t size, */ BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL); #endif + if (!size) + return ZERO_SIZE_PTR; + while (size > csizep->cs_size) csizep++; @@ -2351,7 +2354,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, * this should not happen at all. * But leave a BUG_ON for some lucky dude. */ - BUG_ON(!cachep->slabp_cache); + BUG_ON(ZERO_OR_NULL_PTR(cachep->slabp_cache)); } cachep->ctor = ctor; cachep->name = name; @@ -2743,7 +2746,7 @@ static int cache_grow(struct kmem_cache *cachep, * Be lazy and only check for valid flags here, keeping it out of the * critical path in kmem_cache_alloc(). */ - BUG_ON(flags & ~(GFP_DMA | GFP_LEVEL_MASK)); + BUG_ON(flags & ~(GFP_DMA | __GFP_ZERO | GFP_LEVEL_MASK)); local_flags = (flags & GFP_LEVEL_MASK); /* Take the l3 list lock to change the colour_next on this node */ @@ -3389,6 +3392,9 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, local_irq_restore(save_flags); ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); + if (unlikely((flags & __GFP_ZERO) && ptr)) + memset(ptr, 0, obj_size(cachep)); + return ptr; } @@ -3440,6 +3446,9 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); prefetchw(objp); + if (unlikely((flags & __GFP_ZERO) && objp)) + memset(objp, 0, obj_size(cachep)); + return objp; } @@ -3581,23 +3590,6 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) EXPORT_SYMBOL(kmem_cache_alloc); /** - * kmem_cache_zalloc - Allocate an object. 
The memory is set to zero. - * @cache: The cache to allocate from. - * @flags: See kmalloc(). - * - * Allocate an object from this cache and set the allocated memory to zero. - * The flags are only relevant if the cache has no available objects. - */ -void *kmem_cache_zalloc(struct kmem_cache *cache, gfp_t flags) -{ - void *ret = __cache_alloc(cache, flags, __builtin_return_address(0)); - if (ret) - memset(ret, 0, obj_size(cache)); - return ret; -} -EXPORT_SYMBOL(kmem_cache_zalloc); - -/** * kmem_ptr_validate - check if an untrusted pointer might * be a slab entry. * @cachep: the cache we're checking against @@ -3653,8 +3645,8 @@ __do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller) struct kmem_cache *cachep; cachep = kmem_find_general_cachep(size, flags); - if (unlikely(cachep == NULL)) - return NULL; + if (unlikely(ZERO_OR_NULL_PTR(cachep))) + return cachep; return kmem_cache_alloc_node(cachep, flags, node); } @@ -3726,52 +3718,6 @@ EXPORT_SYMBOL(__kmalloc); #endif /** - * krealloc - reallocate memory. The contents will remain unchanged. - * @p: object to reallocate memory for. - * @new_size: how many bytes of memory are required. - * @flags: the type of memory to allocate. - * - * The contents of the object pointed to are preserved up to the - * lesser of the new and old sizes. If @p is %NULL, krealloc() - * behaves exactly like kmalloc(). If @size is 0 and @p is not a - * %NULL pointer, the object pointed to is freed. - */ -void *krealloc(const void *p, size_t new_size, gfp_t flags) -{ - struct kmem_cache *cache, *new_cache; - void *ret; - - if (unlikely(!p)) - return kmalloc_track_caller(new_size, flags); - - if (unlikely(!new_size)) { - kfree(p); - return NULL; - } - - cache = virt_to_cache(p); - new_cache = __find_general_cachep(new_size, flags); - - /* - * If new size fits in the current cache, bail out. - */ - if (likely(cache == new_cache)) - return (void *)p; - - /* - * We are on the slow-path here so do not use __cache_alloc - * because it bloats kernel text. - */ - ret = kmalloc_track_caller(new_size, flags); - if (ret) { - memcpy(ret, p, min(new_size, ksize(p))); - kfree(p); - } - return ret; -} -EXPORT_SYMBOL(krealloc); - -/** * kmem_cache_free - Deallocate an object * @cachep: The cache the allocation was from. * @objp: The previously allocated object. 
@@ -3806,7 +3752,7 @@ void kfree(const void *objp) struct kmem_cache *c; unsigned long flags; - if (unlikely(!objp)) + if (unlikely(ZERO_OR_NULL_PTR(objp))) return; local_irq_save(flags); kfree_debugcheck(objp); @@ -4398,7 +4344,7 @@ static void show_symbol(struct seq_file *m, unsigned long address) { #ifdef CONFIG_KALLSYMS unsigned long offset, size; - char modname[MODULE_NAME_LEN + 1], name[KSYM_NAME_LEN + 1]; + char modname[MODULE_NAME_LEN], name[KSYM_NAME_LEN]; if (lookup_symbol_attrs(address, &size, &offset, modname, name) == 0) { seq_printf(m, "%s+%#lx/%#lx", name, offset, size); @@ -4493,7 +4439,7 @@ const struct seq_operations slabstats_op = { */ size_t ksize(const void *objp) { - if (unlikely(objp == NULL)) + if (unlikely(ZERO_OR_NULL_PTR(objp))) return 0; return obj_size(virt_to_cache(objp)); diff --git a/mm/slob.c b/mm/slob.c index b4899079d8b..c89ef116d7a 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -334,6 +334,8 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) BUG_ON(!b); spin_unlock_irqrestore(&slob_lock, flags); } + if (unlikely((gfp & __GFP_ZERO) && b)) + memset(b, 0, size); return b; } @@ -347,7 +349,7 @@ static void slob_free(void *block, int size) slobidx_t units; unsigned long flags; - if (!block) + if (ZERO_OR_NULL_PTR(block)) return; BUG_ON(!size); @@ -424,10 +426,13 @@ out: void *__kmalloc_node(size_t size, gfp_t gfp, int node) { + unsigned int *m; int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); if (size < PAGE_SIZE - align) { - unsigned int *m; + if (!size) + return ZERO_SIZE_PTR; + m = slob_alloc(size + align, gfp, align, node); if (m) *m = size; @@ -446,44 +451,11 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node) } EXPORT_SYMBOL(__kmalloc_node); -/** - * krealloc - reallocate memory. The contents will remain unchanged. - * - * @p: object to reallocate memory for. - * @new_size: how many bytes of memory are required. - * @flags: the type of memory to allocate. - * - * The contents of the object pointed to are preserved up to the - * lesser of the new and old sizes. If @p is %NULL, krealloc() - * behaves exactly like kmalloc(). If @size is 0 and @p is not a - * %NULL pointer, the object pointed to is freed. 
- */ -void *krealloc(const void *p, size_t new_size, gfp_t flags) -{ - void *ret; - - if (unlikely(!p)) - return kmalloc_track_caller(new_size, flags); - - if (unlikely(!new_size)) { - kfree(p); - return NULL; - } - - ret = kmalloc_track_caller(new_size, flags); - if (ret) { - memcpy(ret, p, min(new_size, ksize(p))); - kfree(p); - } - return ret; -} -EXPORT_SYMBOL(krealloc); - void kfree(const void *block) { struct slob_page *sp; - if (!block) + if (ZERO_OR_NULL_PTR(block)) return; sp = (struct slob_page *)virt_to_page(block); @@ -501,7 +473,7 @@ size_t ksize(const void *block) { struct slob_page *sp; - if (!block) + if (ZERO_OR_NULL_PTR(block)) return 0; sp = (struct slob_page *)virt_to_page(block); @@ -571,16 +543,6 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node) } EXPORT_SYMBOL(kmem_cache_alloc_node); -void *kmem_cache_zalloc(struct kmem_cache *c, gfp_t flags) -{ - void *ret = kmem_cache_alloc(c, flags); - if (ret) - memset(ret, 0, c->size); - - return ret; -} -EXPORT_SYMBOL(kmem_cache_zalloc); - static void __kmem_cache_free(void *b, int size) { if (size < PAGE_SIZE) diff --git a/mm/slub.c b/mm/slub.c index 6aea48942c2..52a4f44be39 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -205,6 +205,11 @@ static inline void ClearSlabDebug(struct page *page) #define ARCH_SLAB_MINALIGN __alignof__(unsigned long long) #endif +/* + * The page->inuse field is 16 bit thus we have this limitation + */ +#define MAX_OBJECTS_PER_SLAB 65535 + /* Internal SLUB flags */ #define __OBJECT_POISON 0x80000000 /* Poison object */ @@ -228,7 +233,7 @@ static enum { /* A list of all slab caches on the system */ static DECLARE_RWSEM(slub_lock); -LIST_HEAD(slab_caches); +static LIST_HEAD(slab_caches); /* * Tracking user of a slab. @@ -247,9 +252,10 @@ static int sysfs_slab_add(struct kmem_cache *); static int sysfs_slab_alias(struct kmem_cache *, const char *); static void sysfs_slab_remove(struct kmem_cache *); #else -static int sysfs_slab_add(struct kmem_cache *s) { return 0; } -static int sysfs_slab_alias(struct kmem_cache *s, const char *p) { return 0; } -static void sysfs_slab_remove(struct kmem_cache *s) {} +static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; } +static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p) + { return 0; } +static inline void sysfs_slab_remove(struct kmem_cache *s) {} #endif /******************************************************************** @@ -344,7 +350,7 @@ static void print_section(char *text, u8 *addr, unsigned int length) for (i = 0; i < length; i++) { if (newline) { - printk(KERN_ERR "%10s 0x%p: ", text, addr + i); + printk(KERN_ERR "%8s 0x%p: ", text, addr + i); newline = 0; } printk(" %02x", addr[i]); @@ -401,10 +407,11 @@ static void set_track(struct kmem_cache *s, void *object, static void init_tracking(struct kmem_cache *s, void *object) { - if (s->flags & SLAB_STORE_USER) { - set_track(s, object, TRACK_FREE, NULL); - set_track(s, object, TRACK_ALLOC, NULL); - } + if (!(s->flags & SLAB_STORE_USER)) + return; + + set_track(s, object, TRACK_FREE, NULL); + set_track(s, object, TRACK_ALLOC, NULL); } static void print_track(const char *s, struct track *t) @@ -412,65 +419,106 @@ static void print_track(const char *s, struct track *t) if (!t->addr) return; - printk(KERN_ERR "%s: ", s); + printk(KERN_ERR "INFO: %s in ", s); __print_symbol("%s", (unsigned long)t->addr); - printk(" jiffies_ago=%lu cpu=%u pid=%d\n", jiffies - t->when, t->cpu, t->pid); + printk(" age=%lu cpu=%u pid=%d\n", jiffies - t->when, t->cpu, t->pid); +} + 
+static void print_tracking(struct kmem_cache *s, void *object) +{ + if (!(s->flags & SLAB_STORE_USER)) + return; + + print_track("Allocated", get_track(s, object, TRACK_ALLOC)); + print_track("Freed", get_track(s, object, TRACK_FREE)); +} + +static void print_page_info(struct page *page) +{ + printk(KERN_ERR "INFO: Slab 0x%p used=%u fp=0x%p flags=0x%04lx\n", + page, page->inuse, page->freelist, page->flags); + +} + +static void slab_bug(struct kmem_cache *s, char *fmt, ...) +{ + va_list args; + char buf[100]; + + va_start(args, fmt); + vsnprintf(buf, sizeof(buf), fmt, args); + va_end(args); + printk(KERN_ERR "========================================" + "=====================================\n"); + printk(KERN_ERR "BUG %s: %s\n", s->name, buf); + printk(KERN_ERR "----------------------------------------" + "-------------------------------------\n\n"); } -static void print_trailer(struct kmem_cache *s, u8 *p) +static void slab_fix(struct kmem_cache *s, char *fmt, ...) +{ + va_list args; + char buf[100]; + + va_start(args, fmt); + vsnprintf(buf, sizeof(buf), fmt, args); + va_end(args); + printk(KERN_ERR "FIX %s: %s\n", s->name, buf); +} + +static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) { unsigned int off; /* Offset of last byte */ + u8 *addr = page_address(page); + + print_tracking(s, p); + + print_page_info(page); + + printk(KERN_ERR "INFO: Object 0x%p @offset=%tu fp=0x%p\n\n", + p, p - addr, get_freepointer(s, p)); + + if (p > addr + 16) + print_section("Bytes b4", p - 16, 16); + + print_section("Object", p, min(s->objsize, 128)); if (s->flags & SLAB_RED_ZONE) print_section("Redzone", p + s->objsize, s->inuse - s->objsize); - printk(KERN_ERR "FreePointer 0x%p -> 0x%p\n", - p + s->offset, - get_freepointer(s, p)); - if (s->offset) off = s->offset + sizeof(void *); else off = s->inuse; - if (s->flags & SLAB_STORE_USER) { - print_track("Last alloc", get_track(s, p, TRACK_ALLOC)); - print_track("Last free ", get_track(s, p, TRACK_FREE)); + if (s->flags & SLAB_STORE_USER) off += 2 * sizeof(struct track); - } if (off != s->size) /* Beginning of the filler is the free pointer */ - print_section("Filler", p + off, s->size - off); + print_section("Padding", p + off, s->size - off); + + dump_stack(); } static void object_err(struct kmem_cache *s, struct page *page, u8 *object, char *reason) { - u8 *addr = page_address(page); - - printk(KERN_ERR "*** SLUB %s: %s@0x%p slab 0x%p\n", - s->name, reason, object, page); - printk(KERN_ERR " offset=%tu flags=0x%04lx inuse=%u freelist=0x%p\n", - object - addr, page->flags, page->inuse, page->freelist); - if (object > addr + 16) - print_section("Bytes b4", object - 16, 16); - print_section("Object", object, min(s->objsize, 128)); - print_trailer(s, object); - dump_stack(); + slab_bug(s, reason); + print_trailer(s, page, object); } -static void slab_err(struct kmem_cache *s, struct page *page, char *reason, ...) +static void slab_err(struct kmem_cache *s, struct page *page, char *fmt, ...) 
{ va_list args; char buf[100]; - va_start(args, reason); - vsnprintf(buf, sizeof(buf), reason, args); + va_start(args, fmt); + vsnprintf(buf, sizeof(buf), fmt, args); va_end(args); - printk(KERN_ERR "*** SLUB %s: %s in slab @0x%p\n", s->name, buf, - page); + slab_bug(s, fmt); + print_page_info(page); dump_stack(); } @@ -489,15 +537,46 @@ static void init_object(struct kmem_cache *s, void *object, int active) s->inuse - s->objsize); } -static int check_bytes(u8 *start, unsigned int value, unsigned int bytes) +static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes) { while (bytes) { if (*start != (u8)value) - return 0; + return start; start++; bytes--; } - return 1; + return NULL; +} + +static void restore_bytes(struct kmem_cache *s, char *message, u8 data, + void *from, void *to) +{ + slab_fix(s, "Restoring 0x%p-0x%p=0x%x\n", from, to - 1, data); + memset(from, data, to - from); +} + +static int check_bytes_and_report(struct kmem_cache *s, struct page *page, + u8 *object, char *what, + u8* start, unsigned int value, unsigned int bytes) +{ + u8 *fault; + u8 *end; + + fault = check_bytes(start, value, bytes); + if (!fault) + return 1; + + end = start + bytes; + while (end > fault && end[-1] == value) + end--; + + slab_bug(s, "%s overwritten", what); + printk(KERN_ERR "INFO: 0x%p-0x%p. First byte 0x%x instead of 0x%x\n", + fault, end - 1, fault[0], value); + print_trailer(s, page, object); + + restore_bytes(s, what, value, fault, end); + return 0; } /* @@ -538,14 +617,6 @@ static int check_bytes(u8 *start, unsigned int value, unsigned int bytes) * may be used with merged slabcaches. */ -static void restore_bytes(struct kmem_cache *s, char *message, u8 data, - void *from, void *to) -{ - printk(KERN_ERR "@@@ SLUB %s: Restoring %s (0x%x) from 0x%p-0x%p\n", - s->name, message, data, from, to - 1); - memset(from, data, to - from); -} - static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p) { unsigned long off = s->inuse; /* The end of info */ @@ -561,39 +632,39 @@ static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p) if (s->size == off) return 1; - if (check_bytes(p + off, POISON_INUSE, s->size - off)) - return 1; - - object_err(s, page, p, "Object padding check fails"); - - /* - * Restore padding - */ - restore_bytes(s, "object padding", POISON_INUSE, p + off, p + s->size); - return 0; + return check_bytes_and_report(s, page, p, "Object padding", + p + off, POISON_INUSE, s->size - off); } static int slab_pad_check(struct kmem_cache *s, struct page *page) { - u8 *p; - int length, remainder; + u8 *start; + u8 *fault; + u8 *end; + int length; + int remainder; if (!(s->flags & SLAB_POISON)) return 1; - p = page_address(page); + start = page_address(page); + end = start + (PAGE_SIZE << s->order); length = s->objects * s->size; - remainder = (PAGE_SIZE << s->order) - length; + remainder = end - (start + length); if (!remainder) return 1; - if (!check_bytes(p + length, POISON_INUSE, remainder)) { - slab_err(s, page, "Padding check failed"); - restore_bytes(s, "slab padding", POISON_INUSE, p + length, - p + length + remainder); - return 0; - } - return 1; + fault = check_bytes(start + length, POISON_INUSE, remainder); + if (!fault) + return 1; + while (end > fault && end[-1] == POISON_INUSE) + end--; + + slab_err(s, page, "Padding overwritten. 
0x%p-0x%p", fault, end - 1); + print_section("Padding", start, length); + + restore_bytes(s, "slab padding", POISON_INUSE, start, end); + return 0; } static int check_object(struct kmem_cache *s, struct page *page, @@ -606,41 +677,22 @@ static int check_object(struct kmem_cache *s, struct page *page, unsigned int red = active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE; - if (!check_bytes(endobject, red, s->inuse - s->objsize)) { - object_err(s, page, object, - active ? "Redzone Active" : "Redzone Inactive"); - restore_bytes(s, "redzone", red, - endobject, object + s->inuse); + if (!check_bytes_and_report(s, page, object, "Redzone", + endobject, red, s->inuse - s->objsize)) return 0; - } } else { - if ((s->flags & SLAB_POISON) && s->objsize < s->inuse && - !check_bytes(endobject, POISON_INUSE, - s->inuse - s->objsize)) { - object_err(s, page, p, "Alignment padding check fails"); - /* - * Fix it so that there will not be another report. - * - * Hmmm... We may be corrupting an object that now expects - * to be longer than allowed. - */ - restore_bytes(s, "alignment padding", POISON_INUSE, - endobject, object + s->inuse); - } + if ((s->flags & SLAB_POISON) && s->objsize < s->inuse) + check_bytes_and_report(s, page, p, "Alignment padding", endobject, + POISON_INUSE, s->inuse - s->objsize); } if (s->flags & SLAB_POISON) { if (!active && (s->flags & __OBJECT_POISON) && - (!check_bytes(p, POISON_FREE, s->objsize - 1) || - p[s->objsize - 1] != POISON_END)) { - - object_err(s, page, p, "Poison check failed"); - restore_bytes(s, "Poison", POISON_FREE, - p, p + s->objsize -1); - restore_bytes(s, "Poison", POISON_END, - p + s->objsize - 1, p + s->objsize); + (!check_bytes_and_report(s, page, p, "Poison", p, + POISON_FREE, s->objsize - 1) || + !check_bytes_and_report(s, page, p, "Poison", + p + s->objsize -1, POISON_END, 1))) return 0; - } /* * check_pad_bytes cleans up on its own. */ @@ -673,25 +725,17 @@ static int check_slab(struct kmem_cache *s, struct page *page) VM_BUG_ON(!irqs_disabled()); if (!PageSlab(page)) { - slab_err(s, page, "Not a valid slab page flags=%lx " - "mapping=0x%p count=%d", page->flags, page->mapping, - page_count(page)); + slab_err(s, page, "Not a valid slab page"); return 0; } if (page->offset * sizeof(void *) != s->offset) { - slab_err(s, page, "Corrupted offset %lu flags=0x%lx " - "mapping=0x%p count=%d", - (unsigned long)(page->offset * sizeof(void *)), - page->flags, - page->mapping, - page_count(page)); + slab_err(s, page, "Corrupted offset %lu", + (unsigned long)(page->offset * sizeof(void *))); return 0; } if (page->inuse > s->objects) { - slab_err(s, page, "inuse %u > max %u @0x%p flags=%lx " - "mapping=0x%p count=%d", - s->name, page->inuse, s->objects, page->flags, - page->mapping, page_count(page)); + slab_err(s, page, "inuse %u > max %u", + s->name, page->inuse, s->objects); return 0; } /* Slab_pad_check fixes things up after itself */ @@ -719,13 +763,10 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search) set_freepointer(s, object, NULL); break; } else { - slab_err(s, page, "Freepointer 0x%p corrupt", - fp); + slab_err(s, page, "Freepointer corrupt"); page->freelist = NULL; page->inuse = s->objects; - printk(KERN_ERR "@@@ SLUB %s: Freelist " - "cleared. Slab 0x%p\n", - s->name, page); + slab_fix(s, "Freelist cleared"); return 0; } break; @@ -737,11 +778,9 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search) if (page->inuse != s->objects - nr) { slab_err(s, page, "Wrong object count. 
Counter is %d but " - "counted were %d", s, page, page->inuse, - s->objects - nr); + "counted were %d", page->inuse, s->objects - nr); page->inuse = s->objects - nr; - printk(KERN_ERR "@@@ SLUB %s: Object count adjusted. " - "Slab @0x%p\n", s->name, page); + slab_fix(s, "Object count adjusted."); } return search == NULL; } @@ -803,7 +842,7 @@ static int alloc_debug_processing(struct kmem_cache *s, struct page *page, goto bad; if (object && !on_freelist(s, page, object)) { - slab_err(s, page, "Object 0x%p already allocated", object); + object_err(s, page, object, "Object already allocated"); goto bad; } @@ -829,8 +868,7 @@ bad: * to avoid issues in the future. Marking all objects * as used avoids touching the remaining objects. */ - printk(KERN_ERR "@@@ SLUB: %s slab 0x%p. Marking all objects used.\n", - s->name, page); + slab_fix(s, "Marking all objects used"); page->inuse = s->objects; page->freelist = NULL; /* Fix up fields that may be corrupted */ @@ -851,7 +889,7 @@ static int free_debug_processing(struct kmem_cache *s, struct page *page, } if (on_freelist(s, page, object)) { - slab_err(s, page, "Object 0x%p already free", object); + object_err(s, page, object, "Object already free"); goto fail; } @@ -870,8 +908,8 @@ static int free_debug_processing(struct kmem_cache *s, struct page *page, dump_stack(); } else - slab_err(s, page, "object at 0x%p belongs " - "to slab %s", object, page->slab->name); + object_err(s, page, object, + "page slab pointer corrupt."); goto fail; } @@ -885,8 +923,7 @@ static int free_debug_processing(struct kmem_cache *s, struct page *page, return 1; fail: - printk(KERN_ERR "@@@ SLUB: %s slab 0x%p object at 0x%p not freed.\n", - s->name, page, object); + slab_fix(s, "Object at 0x%p not freed", object); return 0; } @@ -1041,7 +1078,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) void *last; void *p; - BUG_ON(flags & ~(GFP_DMA | GFP_LEVEL_MASK)); + BUG_ON(flags & ~(GFP_DMA | __GFP_ZERO | GFP_LEVEL_MASK)); if (flags & __GFP_WAIT) local_irq_enable(); @@ -1359,7 +1396,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page, int cpu) unfreeze_slab(s, page); } -static void flush_slab(struct kmem_cache *s, struct page *page, int cpu) +static inline void flush_slab(struct kmem_cache *s, struct page *page, int cpu) { slab_lock(page); deactivate_slab(s, page, cpu); @@ -1369,7 +1406,7 @@ static void flush_slab(struct kmem_cache *s, struct page *page, int cpu) * Flush cpu slab. * Called from IPI handler with interrupts disabled. */ -static void __flush_cpu_slab(struct kmem_cache *s, int cpu) +static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) { struct page *page = s->cpu_slab[cpu]; @@ -1504,7 +1541,7 @@ debug: * Otherwise we can simply pick the next object from the lockless free list. */ static void __always_inline *slab_alloc(struct kmem_cache *s, - gfp_t gfpflags, int node, void *addr) + gfp_t gfpflags, int node, void *addr) { struct page *page; void **object; @@ -1522,6 +1559,10 @@ static void __always_inline *slab_alloc(struct kmem_cache *s, page->lockless_freelist = object[page->offset]; } local_irq_restore(flags); + + if (unlikely((gfpflags & __GFP_ZERO) && object)) + memset(object, 0, s->objsize); + return object; } @@ -1705,8 +1746,17 @@ static inline int slab_order(int size, int min_objects, { int order; int rem; + int min_order = slub_min_order; - for (order = max(slub_min_order, + /* + * If we would create too many object per slab then reduce + * the slab order even if it goes below slub_min_order. 
+ */ + while (min_order > 0 && + (PAGE_SIZE << min_order) >= MAX_OBJECTS_PER_SLAB * size) + min_order--; + + for (order = max(min_order, fls(min_objects * size - 1) - PAGE_SHIFT); order <= max_order; order++) { @@ -1720,6 +1770,9 @@ static inline int slab_order(int size, int min_objects, if (rem <= slab_size / fract_leftover) break; + /* If the next size is too high then exit now */ + if (slab_size * 2 >= MAX_OBJECTS_PER_SLAB * size) + break; } return order; @@ -1800,7 +1853,9 @@ static void init_kmem_cache_node(struct kmem_cache_node *n) atomic_long_set(&n->nr_slabs, 0); spin_lock_init(&n->list_lock); INIT_LIST_HEAD(&n->partial); +#ifdef CONFIG_SLUB_DEBUG INIT_LIST_HEAD(&n->full); +#endif } #ifdef CONFIG_NUMA @@ -1828,7 +1883,10 @@ static struct kmem_cache_node * __init early_kmem_cache_node_alloc(gfp_t gfpflag page->freelist = get_freepointer(kmalloc_caches, n); page->inuse++; kmalloc_caches->node[node] = n; - setup_object_debug(kmalloc_caches, page, n); +#ifdef CONFIG_SLUB_DEBUG + init_object(kmalloc_caches, n, 1); + init_tracking(kmalloc_caches, n); +#endif init_kmem_cache_node(n); atomic_long_inc(&n->nr_slabs); add_partial(n, page); @@ -2006,7 +2064,7 @@ static int calculate_sizes(struct kmem_cache *s) * The page->inuse field is only 16 bit wide! So we cannot have * more than 64k objects per slab. */ - if (!s->objects || s->objects > 65535) + if (!s->objects || s->objects > MAX_OBJECTS_PER_SLAB) return 0; return 1; @@ -2110,7 +2168,7 @@ static int free_list(struct kmem_cache *s, struct kmem_cache_node *n, /* * Release all resources used by a slab cache. */ -static int kmem_cache_close(struct kmem_cache *s) +static inline int kmem_cache_close(struct kmem_cache *s) { int node; @@ -2138,12 +2196,13 @@ void kmem_cache_destroy(struct kmem_cache *s) s->refcount--; if (!s->refcount) { list_del(&s->list); + up_write(&slub_lock); if (kmem_cache_close(s)) WARN_ON(1); sysfs_slab_remove(s); kfree(s); - } - up_write(&slub_lock); + } else + up_write(&slub_lock); } EXPORT_SYMBOL(kmem_cache_destroy); @@ -2216,47 +2275,92 @@ panic: panic("Creation of kmalloc slab %s size=%d failed.\n", name, size); } -static struct kmem_cache *get_slab(size_t size, gfp_t flags) +#ifdef CONFIG_ZONE_DMA +static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags) { - int index = kmalloc_index(size); + struct kmem_cache *s; + struct kmem_cache *x; + char *text; + size_t realsize; - if (!index) - return NULL; + s = kmalloc_caches_dma[index]; + if (s) + return s; - /* Allocation too large? */ - BUG_ON(index < 0); + /* Dynamically create dma cache */ + x = kmalloc(kmem_size, flags & ~SLUB_DMA); + if (!x) + panic("Unable to allocate memory for dma cache\n"); -#ifdef CONFIG_ZONE_DMA - if ((flags & SLUB_DMA)) { - struct kmem_cache *s; - struct kmem_cache *x; - char *text; - size_t realsize; - - s = kmalloc_caches_dma[index]; - if (s) - return s; + realsize = kmalloc_caches[index].objsize; + text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d", + (unsigned int)realsize); + s = create_kmalloc_cache(x, text, realsize, flags); + down_write(&slub_lock); + if (!kmalloc_caches_dma[index]) { + kmalloc_caches_dma[index] = s; + up_write(&slub_lock); + return s; + } + up_write(&slub_lock); + kmem_cache_destroy(s); + return kmalloc_caches_dma[index]; +} +#endif + +/* + * Conversion table for small slabs sizes / 8 to the index in the + * kmalloc array. This is necessary for slabs < 192 since we have non power + * of two cache sizes there. The size of larger slabs can be determined using + * fls. 
+ */ +static s8 size_index[24] = { + 3, /* 8 */ + 4, /* 16 */ + 5, /* 24 */ + 5, /* 32 */ + 6, /* 40 */ + 6, /* 48 */ + 6, /* 56 */ + 6, /* 64 */ + 1, /* 72 */ + 1, /* 80 */ + 1, /* 88 */ + 1, /* 96 */ + 7, /* 104 */ + 7, /* 112 */ + 7, /* 120 */ + 7, /* 128 */ + 2, /* 136 */ + 2, /* 144 */ + 2, /* 152 */ + 2, /* 160 */ + 2, /* 168 */ + 2, /* 176 */ + 2, /* 184 */ + 2 /* 192 */ +}; - /* Dynamically create dma cache */ - x = kmalloc(kmem_size, flags & ~SLUB_DMA); - if (!x) - panic("Unable to allocate memory for dma cache\n"); +static struct kmem_cache *get_slab(size_t size, gfp_t flags) +{ + int index; - if (index <= KMALLOC_SHIFT_HIGH) - realsize = 1 << index; - else { - if (index == 1) - realsize = 96; - else - realsize = 192; - } + if (size <= 192) { + if (!size) + return ZERO_SIZE_PTR; - text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d", - (unsigned int)realsize); - s = create_kmalloc_cache(x, text, realsize, flags); - kmalloc_caches_dma[index] = s; - return s; + index = size_index[(size - 1) / 8]; + } else { + if (size > KMALLOC_MAX_SIZE) + return NULL; + + index = fls(size - 1); } + +#ifdef CONFIG_ZONE_DMA + if (unlikely((flags & SLUB_DMA))) + return dma_kmalloc_cache(index, flags); + #endif return &kmalloc_caches[index]; } @@ -2265,9 +2369,10 @@ void *__kmalloc(size_t size, gfp_t flags) { struct kmem_cache *s = get_slab(size, flags); - if (s) - return slab_alloc(s, flags, -1, __builtin_return_address(0)); - return ZERO_SIZE_PTR; + if (ZERO_OR_NULL_PTR(s)) + return s; + + return slab_alloc(s, flags, -1, __builtin_return_address(0)); } EXPORT_SYMBOL(__kmalloc); @@ -2276,9 +2381,10 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) { struct kmem_cache *s = get_slab(size, flags); - if (s) - return slab_alloc(s, flags, node, __builtin_return_address(0)); - return ZERO_SIZE_PTR; + if (ZERO_OR_NULL_PTR(s)) + return s; + + return slab_alloc(s, flags, node, __builtin_return_address(0)); } EXPORT_SYMBOL(__kmalloc_node); #endif @@ -2329,7 +2435,7 @@ void kfree(const void *x) * this comparison would be true for all "negative" pointers * (which would cover the whole upper half of the address space). */ - if ((unsigned long)x <= (unsigned long)ZERO_SIZE_PTR) + if (ZERO_OR_NULL_PTR(x)) return; page = virt_to_head_page(x); @@ -2418,43 +2524,6 @@ int kmem_cache_shrink(struct kmem_cache *s) } EXPORT_SYMBOL(kmem_cache_shrink); -/** - * krealloc - reallocate memory. The contents will remain unchanged. - * @p: object to reallocate memory for. - * @new_size: how many bytes of memory are required. - * @flags: the type of memory to allocate. - * - * The contents of the object pointed to are preserved up to the - * lesser of the new and old sizes. If @p is %NULL, krealloc() - * behaves exactly like kmalloc(). If @size is 0 and @p is not a - * %NULL pointer, the object pointed to is freed. 
- */ -void *krealloc(const void *p, size_t new_size, gfp_t flags) -{ - void *ret; - size_t ks; - - if (unlikely(!p || p == ZERO_SIZE_PTR)) - return kmalloc(new_size, flags); - - if (unlikely(!new_size)) { - kfree(p); - return ZERO_SIZE_PTR; - } - - ks = ksize(p); - if (ks >= new_size) - return (void *)p; - - ret = kmalloc(new_size, flags); - if (ret) { - memcpy(ret, p, min(new_size, ks)); - kfree(p); - } - return ret; -} -EXPORT_SYMBOL(krealloc); - /******************************************************************** * Basic setup of slabs *******************************************************************/ @@ -2497,6 +2566,24 @@ void __init kmem_cache_init(void) caches++; } + + /* + * Patch up the size_index table if we have strange large alignment + * requirements for the kmalloc array. This is only the case for + * mips it seems. The standard arches will not generate any code here. + * + * Largest permitted alignment is 256 bytes due to the way we + * handle the index determination for the smaller caches. + * + * Make sure that nothing crazy happens if someone starts tinkering + * around with ARCH_KMALLOC_MINALIGN + */ + BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 || + (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1))); + + for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) + size_index[(i - 1) / 8] = KMALLOC_SHIFT_LOW; + slab_state = UP; /* Provide the correct kmalloc names now that the caches are up */ @@ -2542,7 +2629,7 @@ static struct kmem_cache *find_mergeable(size_t size, size_t align, unsigned long flags, void (*ctor)(void *, struct kmem_cache *, unsigned long)) { - struct list_head *h; + struct kmem_cache *s; if (slub_nomerge || (flags & SLUB_NEVER_MERGE)) return NULL; @@ -2554,10 +2641,7 @@ static struct kmem_cache *find_mergeable(size_t size, align = calculate_alignment(flags, align, size); size = ALIGN(size, align); - list_for_each(h, &slab_caches) { - struct kmem_cache *s = - container_of(h, struct kmem_cache, list); - + list_for_each_entry(s, &slab_caches, list) { if (slab_unmergeable(s)) continue; @@ -2600,25 +2684,26 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, */ s->objsize = max(s->objsize, (int)size); s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); + up_write(&slub_lock); if (sysfs_slab_alias(s, name)) goto err; - } else { - s = kmalloc(kmem_size, GFP_KERNEL); - if (s && kmem_cache_open(s, GFP_KERNEL, name, + return s; + } + s = kmalloc(kmem_size, GFP_KERNEL); + if (s) { + if (kmem_cache_open(s, GFP_KERNEL, name, size, align, flags, ctor)) { - if (sysfs_slab_add(s)) { - kfree(s); - goto err; - } list_add(&s->list, &slab_caches); - } else - kfree(s); + up_write(&slub_lock); + if (sysfs_slab_add(s)) + goto err; + return s; + } + kfree(s); } up_write(&slub_lock); - return s; err: - up_write(&slub_lock); if (flags & SLAB_PANIC) panic("Cannot create slabcache %s\n", name); else @@ -2627,45 +2712,7 @@ err: } EXPORT_SYMBOL(kmem_cache_create); -void *kmem_cache_zalloc(struct kmem_cache *s, gfp_t flags) -{ - void *x; - - x = slab_alloc(s, flags, -1, __builtin_return_address(0)); - if (x) - memset(x, 0, s->objsize); - return x; -} -EXPORT_SYMBOL(kmem_cache_zalloc); - #ifdef CONFIG_SMP -static void for_all_slabs(void (*func)(struct kmem_cache *, int), int cpu) -{ - struct list_head *h; - - down_read(&slub_lock); - list_for_each(h, &slab_caches) { - struct kmem_cache *s = - container_of(h, struct kmem_cache, list); - - func(s, cpu); - } - up_read(&slub_lock); -} - -/* - * Version of __flush_cpu_slab for the case that interrupts - * are enabled. 
- */ -static void cpu_slab_flush(struct kmem_cache *s, int cpu) -{ - unsigned long flags; - - local_irq_save(flags); - __flush_cpu_slab(s, cpu); - local_irq_restore(flags); -} - /* * Use the cpu notifier to insure that the cpu slabs are flushed when * necessary. @@ -2674,13 +2721,21 @@ static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { long cpu = (long)hcpu; + struct kmem_cache *s; + unsigned long flags; switch (action) { case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: case CPU_DEAD: case CPU_DEAD_FROZEN: - for_all_slabs(cpu_slab_flush, cpu); + down_read(&slub_lock); + list_for_each_entry(s, &slab_caches, list) { + local_irq_save(flags); + __flush_cpu_slab(s, cpu); + local_irq_restore(flags); + } + up_read(&slub_lock); break; default: break; @@ -2697,8 +2752,8 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller) { struct kmem_cache *s = get_slab(size, gfpflags); - if (!s) - return ZERO_SIZE_PTR; + if (ZERO_OR_NULL_PTR(s)) + return s; return slab_alloc(s, gfpflags, -1, caller); } @@ -2708,18 +2763,18 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, { struct kmem_cache *s = get_slab(size, gfpflags); - if (!s) - return ZERO_SIZE_PTR; + if (ZERO_OR_NULL_PTR(s)) + return s; return slab_alloc(s, gfpflags, node, caller); } #if defined(CONFIG_SYSFS) && defined(CONFIG_SLUB_DEBUG) -static int validate_slab(struct kmem_cache *s, struct page *page) +static int validate_slab(struct kmem_cache *s, struct page *page, + unsigned long *map) { void *p; void *addr = page_address(page); - DECLARE_BITMAP(map, s->objects); if (!check_slab(s, page) || !on_freelist(s, page, NULL)) @@ -2741,10 +2796,11 @@ static int validate_slab(struct kmem_cache *s, struct page *page) return 1; } -static void validate_slab_slab(struct kmem_cache *s, struct page *page) +static void validate_slab_slab(struct kmem_cache *s, struct page *page, + unsigned long *map) { if (slab_trylock(page)) { - validate_slab(s, page); + validate_slab(s, page, map); slab_unlock(page); } else printk(KERN_INFO "SLUB %s: Skipped busy slab 0x%p\n", @@ -2761,7 +2817,8 @@ static void validate_slab_slab(struct kmem_cache *s, struct page *page) } } -static int validate_slab_node(struct kmem_cache *s, struct kmem_cache_node *n) +static int validate_slab_node(struct kmem_cache *s, + struct kmem_cache_node *n, unsigned long *map) { unsigned long count = 0; struct page *page; @@ -2770,7 +2827,7 @@ static int validate_slab_node(struct kmem_cache *s, struct kmem_cache_node *n) spin_lock_irqsave(&n->list_lock, flags); list_for_each_entry(page, &n->partial, lru) { - validate_slab_slab(s, page); + validate_slab_slab(s, page, map); count++; } if (count != n->nr_partial) @@ -2781,7 +2838,7 @@ static int validate_slab_node(struct kmem_cache *s, struct kmem_cache_node *n) goto out; list_for_each_entry(page, &n->full, lru) { - validate_slab_slab(s, page); + validate_slab_slab(s, page, map); count++; } if (count != atomic_long_read(&n->nr_slabs)) @@ -2794,17 +2851,23 @@ out: return count; } -static unsigned long validate_slab_cache(struct kmem_cache *s) +static long validate_slab_cache(struct kmem_cache *s) { int node; unsigned long count = 0; + unsigned long *map = kmalloc(BITS_TO_LONGS(s->objects) * + sizeof(unsigned long), GFP_KERNEL); + + if (!map) + return -ENOMEM; flush_all(s); for_each_online_node(node) { struct kmem_cache_node *n = get_node(s, node); - count += validate_slab_node(s, n); + count += validate_slab_node(s, n, map); } + kfree(map); return count; } @@ -2893,18 
+2956,14 @@ static void free_loc_track(struct loc_track *t) get_order(sizeof(struct location) * t->max)); } -static int alloc_loc_track(struct loc_track *t, unsigned long max) +static int alloc_loc_track(struct loc_track *t, unsigned long max, gfp_t flags) { struct location *l; int order; - if (!max) - max = PAGE_SIZE / sizeof(struct location); - order = get_order(sizeof(struct location) * max); - l = (void *)__get_free_pages(GFP_ATOMIC, order); - + l = (void *)__get_free_pages(flags, order); if (!l) return 0; @@ -2970,7 +3029,7 @@ static int add_location(struct loc_track *t, struct kmem_cache *s, /* * Not found. Insert new tracking element. */ - if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max)) + if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max, GFP_ATOMIC)) return 0; l = t->loc + pos; @@ -3013,11 +3072,12 @@ static int list_locations(struct kmem_cache *s, char *buf, { int n = 0; unsigned long i; - struct loc_track t; + struct loc_track t = { 0, 0, NULL }; int node; - t.count = 0; - t.max = 0; + if (!alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location), + GFP_KERNEL)) + return sprintf(buf, "Out of memory\n"); /* Push back cpu slabs */ flush_all(s); @@ -3421,11 +3481,14 @@ static ssize_t validate_show(struct kmem_cache *s, char *buf) static ssize_t validate_store(struct kmem_cache *s, const char *buf, size_t length) { - if (buf[0] == '1') - validate_slab_cache(s); - else - return -EINVAL; - return length; + int ret = -EINVAL; + + if (buf[0] == '1') { + ret = validate_slab_cache(s); + if (ret >= 0) + ret = length; + } + return ret; } SLAB_ATTR(validate); @@ -3579,7 +3642,7 @@ static struct kset_uevent_ops slab_uevent_ops = { .filter = uevent_filter, }; -decl_subsys(slab, &slab_ktype, &slab_uevent_ops); +static decl_subsys(slab, &slab_ktype, &slab_uevent_ops); #define ID_STR_LENGTH 64 @@ -3677,7 +3740,7 @@ struct saved_alias { struct saved_alias *next; }; -struct saved_alias *alias_list; +static struct saved_alias *alias_list; static int sysfs_slab_alias(struct kmem_cache *s, const char *name) { @@ -3705,7 +3768,7 @@ static int sysfs_slab_alias(struct kmem_cache *s, const char *name) static int __init slab_sysfs_init(void) { - struct list_head *h; + struct kmem_cache *s; int err; err = subsystem_register(&slab_subsys); @@ -3716,10 +3779,7 @@ static int __init slab_sysfs_init(void) slab_state = SYSFS; - list_for_each(h, &slab_caches) { - struct kmem_cache *s = - container_of(h, struct kmem_cache, list); - + list_for_each_entry(s, &slab_caches, list) { err = sysfs_slab_add(s); BUG_ON(err); } diff --git a/mm/swap_state.c b/mm/swap_state.c index 925d5c50f18..67daecb6031 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -334,7 +334,8 @@ struct page *read_swap_cache_async(swp_entry_t entry, * Get a new page to read into from swap. 
*/ if (!new_page) { - new_page = alloc_page_vma(GFP_HIGHUSER, vma, addr); + new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, + vma, addr); if (!new_page) break; /* Out of memory */ } diff --git a/mm/truncate.c b/mm/truncate.c index 7c994f2d614..f47e46d1be3 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -100,9 +100,9 @@ truncate_complete_page(struct address_space *mapping, struct page *page) if (PagePrivate(page)) do_invalidatepage(page, 0); + remove_from_page_cache(page); ClearPageUptodate(page); ClearPageMappedToDisk(page); - remove_from_page_cache(page); page_cache_release(page); /* pagecache ref */ } diff --git a/mm/util.c b/mm/util.c index ace2aea69f1..78f3783bdcc 100644 --- a/mm/util.c +++ b/mm/util.c @@ -5,20 +5,6 @@ #include <asm/uaccess.h> /** - * __kzalloc - allocate memory. The memory is set to zero. - * @size: how many bytes of memory are required. - * @flags: the type of memory to allocate. - */ -void *__kzalloc(size_t size, gfp_t flags) -{ - void *ret = kmalloc_track_caller(size, flags); - if (ret) - memset(ret, 0, size); - return ret; -} -EXPORT_SYMBOL(__kzalloc); - -/* * kstrdup - allocate space for and copy an existing string * * @s: the string to duplicate @@ -58,6 +44,40 @@ void *kmemdup(const void *src, size_t len, gfp_t gfp) } EXPORT_SYMBOL(kmemdup); +/** + * krealloc - reallocate memory. The contents will remain unchanged. + * @p: object to reallocate memory for. + * @new_size: how many bytes of memory are required. + * @flags: the type of memory to allocate. + * + * The contents of the object pointed to are preserved up to the + * lesser of the new and old sizes. If @p is %NULL, krealloc() + * behaves exactly like kmalloc(). If @size is 0 and @p is not a + * %NULL pointer, the object pointed to is freed. + */ +void *krealloc(const void *p, size_t new_size, gfp_t flags) +{ + void *ret; + size_t ks; + + if (unlikely(!new_size)) { + kfree(p); + return ZERO_SIZE_PTR; + } + + ks = ksize(p); + if (ks >= new_size) + return (void *)p; + + ret = kmalloc_track_caller(new_size, flags); + if (ret) { + memcpy(ret, p, min(new_size, ks)); + kfree(p); + } + return ret; +} +EXPORT_SYMBOL(krealloc); + /* * strndup_user - duplicate an existing string from user space * diff --git a/mm/vmalloc.c b/mm/vmalloc.c index d3a9c536825..8e05a11155c 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -68,12 +68,12 @@ static inline void vunmap_pud_range(pgd_t *pgd, unsigned long addr, } while (pud++, addr = next, addr != end); } -void unmap_vm_area(struct vm_struct *area) +void unmap_kernel_range(unsigned long addr, unsigned long size) { pgd_t *pgd; unsigned long next; - unsigned long addr = (unsigned long) area->addr; - unsigned long end = addr + area->size; + unsigned long start = addr; + unsigned long end = addr + size; BUG_ON(addr >= end); pgd = pgd_offset_k(addr); @@ -84,7 +84,12 @@ void unmap_vm_area(struct vm_struct *area) continue; vunmap_pud_range(pgd, addr, next); } while (pgd++, addr = next, addr != end); - flush_tlb_kernel_range((unsigned long) area->addr, end); + flush_tlb_kernel_range(start, end); +} + +static void unmap_vm_area(struct vm_struct *area) +{ + unmap_kernel_range((unsigned long)area->addr, area->size); } static int vmap_pte_range(pmd_t *pmd, unsigned long addr, @@ -427,11 +432,12 @@ void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, area->nr_pages = nr_pages; /* Please note that the recursion is strictly bounded. 
*/ if (array_size > PAGE_SIZE) { - pages = __vmalloc_node(array_size, gfp_mask, PAGE_KERNEL, node); + pages = __vmalloc_node(array_size, gfp_mask | __GFP_ZERO, + PAGE_KERNEL, node); area->flags |= VM_VPAGES; } else { pages = kmalloc_node(array_size, - (gfp_mask & GFP_LEVEL_MASK), + (gfp_mask & GFP_LEVEL_MASK) | __GFP_ZERO, node); } area->pages = pages; @@ -440,7 +446,6 @@ void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, kfree(area); return NULL; } - memset(area->pages, 0, array_size); for (i = 0; i < area->nr_pages; i++) { if (node < 0) diff --git a/mm/vmscan.c b/mm/vmscan.c index 1be5a6376ef..d419e10e3da 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -66,17 +66,8 @@ struct scan_control { int swappiness; int all_unreclaimable; -}; -/* - * The list of shrinker callbacks used by to apply pressure to - * ageable caches. - */ -struct shrinker { - shrinker_t shrinker; - struct list_head list; - int seeks; /* seeks to recreate an obj */ - long nr; /* objs pending delete */ + int order; }; #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) @@ -121,34 +112,25 @@ static DECLARE_RWSEM(shrinker_rwsem); /* * Add a shrinker callback to be called from the vm */ -struct shrinker *set_shrinker(int seeks, shrinker_t theshrinker) +void register_shrinker(struct shrinker *shrinker) { - struct shrinker *shrinker; - - shrinker = kmalloc(sizeof(*shrinker), GFP_KERNEL); - if (shrinker) { - shrinker->shrinker = theshrinker; - shrinker->seeks = seeks; - shrinker->nr = 0; - down_write(&shrinker_rwsem); - list_add_tail(&shrinker->list, &shrinker_list); - up_write(&shrinker_rwsem); - } - return shrinker; + shrinker->nr = 0; + down_write(&shrinker_rwsem); + list_add_tail(&shrinker->list, &shrinker_list); + up_write(&shrinker_rwsem); } -EXPORT_SYMBOL(set_shrinker); +EXPORT_SYMBOL(register_shrinker); /* * Remove one */ -void remove_shrinker(struct shrinker *shrinker) +void unregister_shrinker(struct shrinker *shrinker) { down_write(&shrinker_rwsem); list_del(&shrinker->list); up_write(&shrinker_rwsem); - kfree(shrinker); } -EXPORT_SYMBOL(remove_shrinker); +EXPORT_SYMBOL(unregister_shrinker); #define SHRINK_BATCH 128 /* @@ -185,7 +167,7 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, list_for_each_entry(shrinker, &shrinker_list, list) { unsigned long long delta; unsigned long total_scan; - unsigned long max_pass = (*shrinker->shrinker)(0, gfp_mask); + unsigned long max_pass = (*shrinker->shrink)(0, gfp_mask); delta = (4 * scanned) / shrinker->seeks; delta *= max_pass; @@ -213,8 +195,8 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, int shrink_ret; int nr_before; - nr_before = (*shrinker->shrinker)(0, gfp_mask); - shrink_ret = (*shrinker->shrinker)(this_scan, gfp_mask); + nr_before = (*shrinker->shrink)(0, gfp_mask); + shrink_ret = (*shrinker->shrink)(this_scan, gfp_mask); if (shrink_ret == -1) break; if (shrink_ret < nr_before) @@ -481,7 +463,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, referenced = page_referenced(page, 1); /* In active use or really unfreeable? Activate it. 
*/ - if (referenced && page_mapping_inuse(page)) + if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && + referenced && page_mapping_inuse(page)) goto activate_locked; #ifdef CONFIG_SWAP @@ -514,7 +497,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, } if (PageDirty(page)) { - if (referenced) + if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && referenced) goto keep_locked; if (!may_enter_fs) goto keep_locked; @@ -598,6 +581,51 @@ keep: return nr_reclaimed; } +/* LRU Isolation modes. */ +#define ISOLATE_INACTIVE 0 /* Isolate inactive pages. */ +#define ISOLATE_ACTIVE 1 /* Isolate active pages. */ +#define ISOLATE_BOTH 2 /* Isolate both active and inactive pages. */ + +/* + * Attempt to remove the specified page from its LRU. Only take this page + * if it is of the appropriate PageActive status. Pages which are being + * freed elsewhere are also ignored. + * + * page: page to consider + * mode: one of the LRU isolation modes defined above + * + * returns 0 on success, -ve errno on failure. + */ +static int __isolate_lru_page(struct page *page, int mode) +{ + int ret = -EINVAL; + + /* Only take pages on the LRU. */ + if (!PageLRU(page)) + return ret; + + /* + * When checking the active state, we need to be sure we are + * dealing with comparible boolean values. Take the logical not + * of each. + */ + if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode)) + return ret; + + ret = -EBUSY; + if (likely(get_page_unless_zero(page))) { + /* + * Be careful not to clear PageLRU until after we're + * sure the page is not being freed elsewhere -- the + * page release code relies on it. + */ + ClearPageLRU(page); + ret = 0; + } + + return ret; +} + /* * zone->lru_lock is heavily contended. Some of the functions that * shrink the lists perform better by taking out a batch of pages @@ -612,38 +640,90 @@ keep: * @src: The LRU list to pull pages off. * @dst: The temp list to put pages on to. * @scanned: The number of pages that were scanned. + * @order: The caller's attempted allocation order + * @mode: One of the LRU isolation modes * * returns how many pages were moved onto *@dst. */ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, struct list_head *src, struct list_head *dst, - unsigned long *scanned) + unsigned long *scanned, int order, int mode) { unsigned long nr_taken = 0; - struct page *page; unsigned long scan; for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) { - struct list_head *target; + struct page *page; + unsigned long pfn; + unsigned long end_pfn; + unsigned long page_pfn; + int zone_id; + page = lru_to_page(src); prefetchw_prev_lru_page(page, src, flags); VM_BUG_ON(!PageLRU(page)); - list_del(&page->lru); - target = src; - if (likely(get_page_unless_zero(page))) { - /* - * Be careful not to clear PageLRU until after we're - * sure the page is not being freed elsewhere -- the - * page release code relies on it. - */ - ClearPageLRU(page); - target = dst; + switch (__isolate_lru_page(page, mode)) { + case 0: + list_move(&page->lru, dst); nr_taken++; - } /* else it is being freed elsewhere */ + break; + + case -EBUSY: + /* else it is being freed elsewhere */ + list_move(&page->lru, src); + continue; + + default: + BUG(); + } + + if (!order) + continue; - list_add(&page->lru, target); + /* + * Attempt to take all pages in the order aligned region + * surrounding the tag page. Only take those pages of + * the same active state as that tag page. 
We may safely + * round the target page pfn down to the requested order + * as the mem_map is guarenteed valid out to MAX_ORDER, + * where that page is in a different zone we will detect + * it from its zone id and abort this block scan. + */ + zone_id = page_zone_id(page); + page_pfn = page_to_pfn(page); + pfn = page_pfn & ~((1 << order) - 1); + end_pfn = pfn + (1 << order); + for (; pfn < end_pfn; pfn++) { + struct page *cursor_page; + + /* The target page is in the block, ignore it. */ + if (unlikely(pfn == page_pfn)) + continue; + + /* Avoid holes within the zone. */ + if (unlikely(!pfn_valid_within(pfn))) + break; + + cursor_page = pfn_to_page(pfn); + /* Check that we have not crossed a zone boundary. */ + if (unlikely(page_zone_id(cursor_page) != zone_id)) + continue; + switch (__isolate_lru_page(cursor_page, mode)) { + case 0: + list_move(&cursor_page->lru, dst); + nr_taken++; + scan++; + break; + + case -EBUSY: + /* else it is being freed elsewhere */ + list_move(&cursor_page->lru, src); + default: + break; + } + } } *scanned = scan; @@ -651,6 +731,24 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, } /* + * clear_active_flags() is a helper for shrink_active_list(), clearing + * any active bits from the pages in the list. + */ +static unsigned long clear_active_flags(struct list_head *page_list) +{ + int nr_active = 0; + struct page *page; + + list_for_each_entry(page, page_list, lru) + if (PageActive(page)) { + ClearPageActive(page); + nr_active++; + } + + return nr_active; +} + +/* * shrink_inactive_list() is a helper for shrink_zone(). It returns the number * of reclaimed pages */ @@ -671,11 +769,18 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, unsigned long nr_taken; unsigned long nr_scan; unsigned long nr_freed; + unsigned long nr_active; nr_taken = isolate_lru_pages(sc->swap_cluster_max, - &zone->inactive_list, - &page_list, &nr_scan); - __mod_zone_page_state(zone, NR_INACTIVE, -nr_taken); + &zone->inactive_list, + &page_list, &nr_scan, sc->order, + (sc->order > PAGE_ALLOC_COSTLY_ORDER)? + ISOLATE_BOTH : ISOLATE_INACTIVE); + nr_active = clear_active_flags(&page_list); + + __mod_zone_page_state(zone, NR_ACTIVE, -nr_active); + __mod_zone_page_state(zone, NR_INACTIVE, + -(nr_taken - nr_active)); zone->pages_scanned += nr_scan; spin_unlock_irq(&zone->lru_lock); @@ -820,7 +925,7 @@ force_reclaim_mapped: lru_add_drain(); spin_lock_irq(&zone->lru_lock); pgmoved = isolate_lru_pages(nr_pages, &zone->active_list, - &l_hold, &pgscanned); + &l_hold, &pgscanned, sc->order, ISOLATE_ACTIVE); zone->pages_scanned += pgscanned; __mod_zone_page_state(zone, NR_ACTIVE, -pgmoved); spin_unlock_irq(&zone->lru_lock); @@ -1011,7 +1116,7 @@ static unsigned long shrink_zones(int priority, struct zone **zones, * holds filesystem locks which prevent writeout this might not work, and the * allocation attempt will fail. 
*/ -unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask) +unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask) { int priority; int ret = 0; @@ -1026,6 +1131,7 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask) .swap_cluster_max = SWAP_CLUSTER_MAX, .may_swap = 1, .swappiness = vm_swappiness, + .order = order, }; count_vm_event(ALLOCSTALL); @@ -1131,6 +1237,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order) .may_swap = 1, .swap_cluster_max = SWAP_CLUSTER_MAX, .swappiness = vm_swappiness, + .order = order, }; /* * temp_priority is used to remember the scanning priority at which @@ -1314,6 +1421,7 @@ static int kswapd(void *p) * trying to free the first piece of memory in the first place). */ tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; + set_freezable(); order = 0; for ( ; ; ) { diff --git a/mm/vmstat.c b/mm/vmstat.c index eceaf496210..fadf791cd7e 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -472,7 +472,7 @@ const struct seq_operations fragmentation_op = { #endif #define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \ - TEXT_FOR_HIGHMEM(xx) + TEXT_FOR_HIGHMEM(xx) xx "_movable", static const char * const vmstat_text[] = { /* Zoned VM counters */ |
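
For illustration only (not part of the patch): a minimal sketch of how a subsystem might use the register_shrinker()/unregister_shrinker() interface that the mm/vmscan.c hunks above introduce in place of set_shrinker()/remove_shrinker(). The names my_cache_shrink(), my_object_count(), my_reclaim() and the __GFP_FS policy are hypothetical; the callback contract (probed with nr_to_scan == 0 to report the object count, returning -1 to make shrink_slab() back off) is taken from the shrink_slab() hunks above.

	/*
	 * Hypothetical user of the new shrinker API (sketch, not from this diff).
	 */
	#include <linux/init.h>
	#include <linux/module.h>
	#include <linux/mm.h>	/* struct shrinker, register_shrinker(), DEFAULT_SEEKS */

	static int my_objects;	/* hypothetical count of reclaimable objects */

	static int my_object_count(void)
	{
		return my_objects;
	}

	static int my_reclaim(int nr)
	{
		/* free up to nr objects from the hypothetical cache */
		if (nr > my_objects)
			nr = my_objects;
		my_objects -= nr;
		return nr;
	}

	static int my_cache_shrink(int nr_to_scan, gfp_t gfp_mask)
	{
		if (!nr_to_scan)
			return my_object_count();	/* shrink_slab() probes with 0 */

		if (!(gfp_mask & __GFP_FS))
			return -1;			/* -1 tells shrink_slab() to stop */

		my_reclaim(nr_to_scan);
		return my_object_count();
	}

	/* The caller owns the struct; register_shrinker() only links it in. */
	static struct shrinker my_cache_shrinker = {
		.shrink	= my_cache_shrink,
		.seeks	= DEFAULT_SEEKS,
	};

	static int __init my_cache_init(void)
	{
		register_shrinker(&my_cache_shrinker);
		return 0;
	}

	static void __exit my_cache_exit(void)
	{
		unregister_shrinker(&my_cache_shrinker);
	}

	module_init(my_cache_init);
	module_exit(my_cache_exit);

Note the design change visible in the vmscan.c hunks: unlike set_shrinker(), registration can no longer fail, because the shrinker structure is embedded in the caller rather than kmalloc()ed (and later kfree()d) by the VM.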