diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/filemap.c | 22 | ||||
-rw-r--r-- | mm/hugetlb.c | 2 | ||||
-rw-r--r-- | mm/mempolicy.c | 2 | ||||
-rw-r--r-- | mm/mmap.c | 6 | ||||
-rw-r--r-- | mm/nommu.c | 2 | ||||
-rw-r--r-- | mm/page_alloc.c | 13 | ||||
-rw-r--r-- | mm/slab.c | 14 | ||||
-rw-r--r-- | mm/slub.c | 83 | ||||
-rw-r--r-- | mm/sparse.c | 14 | ||||
-rw-r--r-- | mm/vmscan.c | 69 |
10 files changed, 161 insertions, 66 deletions
diff --git a/mm/filemap.c b/mm/filemap.c index 6cf700d4184..90b657b50f8 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -843,7 +843,7 @@ static void shrink_readahead_size_eio(struct file *filp, /** * do_generic_mapping_read - generic file read routine * @mapping: address_space to be read - * @ra: file's readahead state + * @_ra: file's readahead state * @filp: the file to read * @ppos: current file position * @desc: read_descriptor @@ -1218,26 +1218,6 @@ out: } EXPORT_SYMBOL(generic_file_aio_read); -int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size) -{ - ssize_t written; - unsigned long count = desc->count; - struct file *file = desc->arg.data; - - if (size > count) - size = count; - - written = file->f_op->sendpage(file, page, offset, - size, &file->f_pos, size<count); - if (written < 0) { - desc->error = written; - written = 0; - } - desc->count = count - written; - desc->written += written; - return written; -} - static ssize_t do_readahead(struct address_space *mapping, struct file *filp, unsigned long index, unsigned long nr) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index d7ca59d66c5..de4cf458d6e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -643,7 +643,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, spin_unlock(&mm->page_table_lock); ret = hugetlb_fault(mm, vma, vaddr, 0); spin_lock(&mm->page_table_lock); - if (!(ret & VM_FAULT_MAJOR)) + if (!(ret & VM_FAULT_ERROR)) continue; remainder = 0; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 71b84b45154..172abffeb2e 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -149,7 +149,7 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes) lower zones etc. Avoid empty zones because the memory allocator doesn't like them. If you implement node hot removal you have to fix that. */ - k = policy_zone; + k = MAX_NR_ZONES - 1; while (1) { for_each_node_mask(nd, *nodes) { struct zone *z = &NODE_DATA(nd)->node_zones[k]; diff --git a/mm/mmap.c b/mm/mmap.c index b6537211b9c..0d40e66c841 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -93,7 +93,7 @@ atomic_t vm_committed_space = ATOMIC_INIT(0); * Note this is a helper function intended to be used by LSMs which * wish to use this logic. */ -int __vm_enough_memory(long pages, int cap_sys_admin) +int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) { unsigned long free, allowed; @@ -166,7 +166,7 @@ int __vm_enough_memory(long pages, int cap_sys_admin) /* Don't let a single process grow too big: leave 3% of the size of this process for other processes */ - allowed -= current->mm->total_vm / 32; + allowed -= mm->total_vm / 32; /* * cast `allowed' as a signed long because vm_committed_space @@ -2077,7 +2077,7 @@ int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) if (__vma && __vma->vm_start < vma->vm_end) return -ENOMEM; if ((vma->vm_flags & VM_ACCOUNT) && - security_vm_enough_memory(vma_pages(vma))) + security_vm_enough_memory_mm(mm, vma_pages(vma))) return -ENOMEM; vma_link(mm, vma, prev, rb_link, rb_parent); return 0; diff --git a/mm/nommu.c b/mm/nommu.c index 9eef6a39855..8ed0cb43118 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1270,7 +1270,7 @@ EXPORT_SYMBOL(get_unmapped_area); * Note this is a helper function intended to be used by LSMs which * wish to use this logic. */ -int __vm_enough_memory(long pages, int cap_sys_admin) +int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) { unsigned long free, allowed; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3da85b81dab..6427653023a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1157,6 +1157,7 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ int zlc_active = 0; /* set if using zonelist_cache */ int did_zlc_setup = 0; /* just call zlc_setup() one time */ + enum zone_type highest_zoneidx = -1; /* Gets set for policy zonelists */ zonelist_scan: /* @@ -1166,6 +1167,18 @@ zonelist_scan: z = zonelist->zones; do { + /* + * In NUMA, this could be a policy zonelist which contains + * zones that may not be allowed by the current gfp_mask. + * Check the zone is allowed by the current flags + */ + if (unlikely(alloc_should_filter_zonelist(zonelist))) { + if (highest_zoneidx == -1) + highest_zoneidx = gfp_zone(gfp_mask); + if (zone_idx(*z) > highest_zoneidx) + continue; + } + if (NUMA_BUILD && zlc_active && !zlc_zone_worth_trying(zonelist, z, allowednodes)) continue; diff --git a/mm/slab.c b/mm/slab.c index a684778b2b4..6f6abef83a1 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -883,6 +883,7 @@ static void __slab_error(const char *function, struct kmem_cache *cachep, */ static int use_alien_caches __read_mostly = 1; +static int numa_platform __read_mostly = 1; static int __init noaliencache_setup(char *s) { use_alien_caches = 0; @@ -1399,8 +1400,10 @@ void __init kmem_cache_init(void) int order; int node; - if (num_possible_nodes() == 1) + if (num_possible_nodes() == 1) { use_alien_caches = 0; + numa_platform = 0; + } for (i = 0; i < NUM_INIT_LISTS; i++) { kmem_list3_init(&initkmem_list3[i]); @@ -3558,7 +3561,14 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp) check_irq_off(); objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); - if (cache_free_alien(cachep, objp)) + /* + * Skip calling cache_free_alien() when the platform is not numa. + * This will avoid cache misses that happen while accessing slabp (which + * is per page memory reference) to get nodeid. Instead use a global + * variable to skip the call, which is mostly likely to be present in + * the cache. + */ + if (numa_platform && cache_free_alien(cachep, objp)) return; if (likely(ac->avail < ac->limit)) { diff --git a/mm/slub.c b/mm/slub.c index 6c6d74ff069..04151da399c 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -211,7 +211,8 @@ static inline void ClearSlabDebug(struct page *page) #define MAX_OBJECTS_PER_SLAB 65535 /* Internal SLUB flags */ -#define __OBJECT_POISON 0x80000000 /* Poison object */ +#define __OBJECT_POISON 0x80000000 /* Poison object */ +#define __SYSFS_ADD_DEFERRED 0x40000000 /* Not yet visible via sysfs */ /* Not all arches define cache_line_size */ #ifndef cache_line_size @@ -1876,9 +1877,16 @@ static struct kmem_cache_node * __init early_kmem_cache_node_alloc(gfp_t gfpflag BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node)); - page = new_slab(kmalloc_caches, gfpflags | GFP_THISNODE, node); + page = new_slab(kmalloc_caches, gfpflags, node); BUG_ON(!page); + if (page_to_nid(page) != node) { + printk(KERN_ERR "SLUB: Unable to allocate memory from " + "node %d\n", node); + printk(KERN_ERR "SLUB: Allocating a useless per node structure " + "in order to be able to continue\n"); + } + n = page->freelist; BUG_ON(!n); page->freelist = get_freepointer(kmalloc_caches, n); @@ -2277,10 +2285,26 @@ panic: } #ifdef CONFIG_ZONE_DMA + +static void sysfs_add_func(struct work_struct *w) +{ + struct kmem_cache *s; + + down_write(&slub_lock); + list_for_each_entry(s, &slab_caches, list) { + if (s->flags & __SYSFS_ADD_DEFERRED) { + s->flags &= ~__SYSFS_ADD_DEFERRED; + sysfs_slab_add(s); + } + } + up_write(&slub_lock); +} + +static DECLARE_WORK(sysfs_add_work, sysfs_add_func); + static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags) { struct kmem_cache *s; - struct kmem_cache *x; char *text; size_t realsize; @@ -2289,22 +2313,36 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags) return s; /* Dynamically create dma cache */ - x = kmalloc(kmem_size, flags & ~SLUB_DMA); - if (!x) - panic("Unable to allocate memory for dma cache\n"); + if (flags & __GFP_WAIT) + down_write(&slub_lock); + else { + if (!down_write_trylock(&slub_lock)) + goto out; + } + + if (kmalloc_caches_dma[index]) + goto unlock_out; realsize = kmalloc_caches[index].objsize; - text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d", - (unsigned int)realsize); - s = create_kmalloc_cache(x, text, realsize, flags); - down_write(&slub_lock); - if (!kmalloc_caches_dma[index]) { - kmalloc_caches_dma[index] = s; - up_write(&slub_lock); - return s; + text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d", (unsigned int)realsize), + s = kmalloc(kmem_size, flags & ~SLUB_DMA); + + if (!s || !text || !kmem_cache_open(s, flags, text, + realsize, ARCH_KMALLOC_MINALIGN, + SLAB_CACHE_DMA|__SYSFS_ADD_DEFERRED, NULL)) { + kfree(s); + kfree(text); + goto unlock_out; } + + list_add(&s->list, &slab_caches); + kmalloc_caches_dma[index] = s; + + schedule_work(&sysfs_add_work); + +unlock_out: up_write(&slub_lock); - kmem_cache_destroy(s); +out: return kmalloc_caches_dma[index]; } #endif @@ -2500,15 +2538,11 @@ int kmem_cache_shrink(struct kmem_cache *s) slab_unlock(page); discard_slab(s, page); } else { - if (n->nr_partial > MAX_PARTIAL) - list_move(&page->lru, - slabs_by_inuse + page->inuse); + list_move(&page->lru, + slabs_by_inuse + page->inuse); } } - if (n->nr_partial <= MAX_PARTIAL) - goto out; - /* * Rebuild the partial list with the slabs filled up most * first and the least used slabs at the end. @@ -2516,7 +2550,6 @@ int kmem_cache_shrink(struct kmem_cache *s) for (i = s->objects - 1; i >= 0; i--) list_splice(slabs_by_inuse + i, n->partial.prev); - out: spin_unlock_irqrestore(&n->list_lock, flags); } @@ -3086,7 +3119,7 @@ static int list_locations(struct kmem_cache *s, char *buf, unsigned long flags; struct page *page; - if (!atomic_read(&n->nr_slabs)) + if (!atomic_long_read(&n->nr_slabs)) continue; spin_lock_irqsave(&n->list_lock, flags); @@ -3221,7 +3254,7 @@ static unsigned long slab_objects(struct kmem_cache *s, } if (flags & SO_FULL) { - int full_slabs = atomic_read(&n->nr_slabs) + int full_slabs = atomic_long_read(&n->nr_slabs) - per_cpu[node] - n->nr_partial; @@ -3257,7 +3290,7 @@ static int any_slab_objects(struct kmem_cache *s) for_each_node(node) { struct kmem_cache_node *n = get_node(s, node); - if (n->nr_partial || atomic_read(&n->nr_slabs)) + if (n->nr_partial || atomic_long_read(&n->nr_slabs)) return 1; } return 0; diff --git a/mm/sparse.c b/mm/sparse.c index 3047bf06c1f..239f5a720d3 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -41,6 +41,15 @@ int page_to_nid(struct page *page) return section_to_node_table[page_to_section(page)]; } EXPORT_SYMBOL(page_to_nid); + +static void set_section_nid(unsigned long section_nr, int nid) +{ + section_to_node_table[section_nr] = nid; +} +#else /* !NODE_NOT_IN_PAGE_FLAGS */ +static inline void set_section_nid(unsigned long section_nr, int nid) +{ +} #endif #ifdef CONFIG_SPARSEMEM_EXTREME @@ -68,10 +77,6 @@ static int __meminit sparse_index_init(unsigned long section_nr, int nid) struct mem_section *section; int ret = 0; -#ifdef NODE_NOT_IN_PAGE_FLAGS - section_to_node_table[section_nr] = nid; -#endif - if (mem_section[root]) return -EEXIST; @@ -148,6 +153,7 @@ void __init memory_present(int nid, unsigned long start, unsigned long end) struct mem_section *ms; sparse_index_init(section, nid); + set_section_nid(section, nid); ms = __nr_to_section(section); if (!ms->section_mem_map) diff --git a/mm/vmscan.c b/mm/vmscan.c index d419e10e3da..a6e65d02499 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -271,6 +271,12 @@ static void handle_write_error(struct address_space *mapping, unlock_page(page); } +/* Request for sync pageout. */ +enum pageout_io { + PAGEOUT_IO_ASYNC, + PAGEOUT_IO_SYNC, +}; + /* possible outcome of pageout() */ typedef enum { /* failed to write page out, page is locked */ @@ -287,7 +293,8 @@ typedef enum { * pageout is called by shrink_page_list() for each dirty page. * Calls ->writepage(). */ -static pageout_t pageout(struct page *page, struct address_space *mapping) +static pageout_t pageout(struct page *page, struct address_space *mapping, + enum pageout_io sync_writeback) { /* * If the page is dirty, only perform writeback if that write @@ -346,6 +353,15 @@ static pageout_t pageout(struct page *page, struct address_space *mapping) ClearPageReclaim(page); return PAGE_ACTIVATE; } + + /* + * Wait on writeback if requested to. This happens when + * direct reclaiming a large contiguous area and the + * first attempt to free a range of pages fails. + */ + if (PageWriteback(page) && sync_writeback == PAGEOUT_IO_SYNC) + wait_on_page_writeback(page); + if (!PageWriteback(page)) { /* synchronous write or broken a_ops? */ ClearPageReclaim(page); @@ -423,7 +439,8 @@ cannot_free: * shrink_page_list() returns the number of reclaimed pages */ static unsigned long shrink_page_list(struct list_head *page_list, - struct scan_control *sc) + struct scan_control *sc, + enum pageout_io sync_writeback) { LIST_HEAD(ret_pages); struct pagevec freed_pvec; @@ -458,8 +475,23 @@ static unsigned long shrink_page_list(struct list_head *page_list, if (page_mapped(page) || PageSwapCache(page)) sc->nr_scanned++; - if (PageWriteback(page)) - goto keep_locked; + may_enter_fs = (sc->gfp_mask & __GFP_FS) || + (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); + + if (PageWriteback(page)) { + /* + * Synchronous reclaim is performed in two passes, + * first an asynchronous pass over the list to + * start parallel writeback, and a second synchronous + * pass to wait for the IO to complete. Wait here + * for any page for which writeback has already + * started. + */ + if (sync_writeback == PAGEOUT_IO_SYNC && may_enter_fs) + wait_on_page_writeback(page); + else + goto keep_locked; + } referenced = page_referenced(page, 1); /* In active use or really unfreeable? Activate it. */ @@ -478,8 +510,6 @@ static unsigned long shrink_page_list(struct list_head *page_list, #endif /* CONFIG_SWAP */ mapping = page_mapping(page); - may_enter_fs = (sc->gfp_mask & __GFP_FS) || - (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); /* * The page is mapped into the page tables of one or more @@ -505,7 +535,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, goto keep_locked; /* Page is dirty, try to write it out here */ - switch(pageout(page, mapping)) { + switch (pageout(page, mapping, sync_writeback)) { case PAGE_KEEP: goto keep_locked; case PAGE_ACTIVATE: @@ -777,6 +807,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, (sc->order > PAGE_ALLOC_COSTLY_ORDER)? ISOLATE_BOTH : ISOLATE_INACTIVE); nr_active = clear_active_flags(&page_list); + __count_vm_events(PGDEACTIVATE, nr_active); __mod_zone_page_state(zone, NR_ACTIVE, -nr_active); __mod_zone_page_state(zone, NR_INACTIVE, @@ -785,7 +816,29 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, spin_unlock_irq(&zone->lru_lock); nr_scanned += nr_scan; - nr_freed = shrink_page_list(&page_list, sc); + nr_freed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC); + + /* + * If we are direct reclaiming for contiguous pages and we do + * not reclaim everything in the list, try again and wait + * for IO to complete. This will stall high-order allocations + * but that should be acceptable to the caller + */ + if (nr_freed < nr_taken && !current_is_kswapd() && + sc->order > PAGE_ALLOC_COSTLY_ORDER) { + congestion_wait(WRITE, HZ/10); + + /* + * The attempt at page out may have made some + * of the pages active, mark them inactive again. + */ + nr_active = clear_active_flags(&page_list); + count_vm_events(PGDEACTIVATE, nr_active); + + nr_freed += shrink_page_list(&page_list, sc, + PAGEOUT_IO_SYNC); + } + nr_reclaimed += nr_freed; local_irq_disable(); if (current_is_kswapd()) { |