diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 18 | ||||
-rw-r--r-- | mm/Makefile | 3 | ||||
-rw-r--r-- | mm/backing-dev.c | 47 | ||||
-rw-r--r-- | mm/bounce.c | 6 | ||||
-rw-r--r-- | mm/filemap.c | 781 | ||||
-rw-r--r-- | mm/filemap.h | 103 | ||||
-rw-r--r-- | mm/filemap_xip.c | 17 | ||||
-rw-r--r-- | mm/fremap.c | 26 | ||||
-rw-r--r-- | mm/hugetlb.c | 398 | ||||
-rw-r--r-- | mm/internal.h | 10 | ||||
-rw-r--r-- | mm/memory.c | 161 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 312 | ||||
-rw-r--r-- | mm/mempolicy.c | 60 | ||||
-rw-r--r-- | mm/migrate.c | 6 | ||||
-rw-r--r-- | mm/mmap.c | 3 | ||||
-rw-r--r-- | mm/mprotect.c | 1 | ||||
-rw-r--r-- | mm/nommu.c | 1 | ||||
-rw-r--r-- | mm/oom_kill.c | 116 | ||||
-rw-r--r-- | mm/page-writeback.c | 310 | ||||
-rw-r--r-- | mm/page_alloc.c | 754 | ||||
-rw-r--r-- | mm/page_isolation.c | 138 | ||||
-rw-r--r-- | mm/readahead.c | 94 | ||||
-rw-r--r-- | mm/rmap.c | 5 | ||||
-rw-r--r-- | mm/shmem.c | 82 | ||||
-rw-r--r-- | mm/slab.c | 35 | ||||
-rw-r--r-- | mm/slob.c | 13 | ||||
-rw-r--r-- | mm/slub.c | 520 | ||||
-rw-r--r-- | mm/sparse-vmemmap.c | 148 | ||||
-rw-r--r-- | mm/sparse.c | 105 | ||||
-rw-r--r-- | mm/swap.c | 111 | ||||
-rw-r--r-- | mm/swap_state.c | 5 | ||||
-rw-r--r-- | mm/tiny-shmem.c | 19 | ||||
-rw-r--r-- | mm/truncate.c | 3 | ||||
-rw-r--r-- | mm/util.c | 6 | ||||
-rw-r--r-- | mm/vmalloc.c | 5 | ||||
-rw-r--r-- | mm/vmscan.c | 99 | ||||
-rw-r--r-- | mm/vmstat.c | 305 |
37 files changed, 3561 insertions, 1265 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index e24d348083c..b1f03b0eb7f 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -112,6 +112,19 @@ config SPARSEMEM_EXTREME def_bool y depends on SPARSEMEM && !SPARSEMEM_STATIC +# +# SPARSEMEM_VMEMMAP uses a virtually mapped mem_map to optimise pfn_to_page +# and page_to_pfn. The most efficient option where kernel virtual space is +# not under pressure. +# +config SPARSEMEM_VMEMMAP_ENABLE + def_bool n + +config SPARSEMEM_VMEMMAP + bool + depends on SPARSEMEM + default y if (SPARSEMEM_VMEMMAP_ENABLE) + # eventually, we can have this option just 'select SPARSEMEM' config MEMORY_HOTPLUG bool "Allow for memory hot-add" @@ -126,6 +139,11 @@ config MEMORY_HOTPLUG_SPARSE def_bool y depends on SPARSEMEM && MEMORY_HOTPLUG +config MEMORY_HOTREMOVE + bool "Allow for memory hot remove" + depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE + depends on MIGRATION + # Heavily threaded applications may benefit from splitting the mm-wide # page_table_lock, so that faults on different parts of the user address # space can be handled with less contention: split it at this NR_CPUS. 
diff --git a/mm/Makefile b/mm/Makefile index 245e33ab00c..5c0b0ea7572 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -11,13 +11,14 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ page_alloc.o page-writeback.o pdflush.o \ readahead.o swap.o truncate.o vmscan.o \ prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ - $(mmu-y) + page_isolation.o $(mmu-y) obj-$(CONFIG_BOUNCE) += bounce.o obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o obj-$(CONFIG_HUGETLBFS) += hugetlb.o obj-$(CONFIG_NUMA) += mempolicy.o obj-$(CONFIG_SPARSEMEM) += sparse.o +obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o obj-$(CONFIG_SHMEM) += shmem.o obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o diff --git a/mm/backing-dev.c b/mm/backing-dev.c index f50a2811f9d..b0ceb29da4c 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -5,6 +5,41 @@ #include <linux/sched.h> #include <linux/module.h> +int bdi_init(struct backing_dev_info *bdi) +{ + int i, j; + int err; + + for (i = 0; i < NR_BDI_STAT_ITEMS; i++) { + err = percpu_counter_init_irq(&bdi->bdi_stat[i], 0); + if (err) + goto err; + } + + bdi->dirty_exceeded = 0; + err = prop_local_init_percpu(&bdi->completions); + + if (err) { +err: + for (j = 0; j < i; j++) + percpu_counter_destroy(&bdi->bdi_stat[i]); + } + + return err; +} +EXPORT_SYMBOL(bdi_init); + +void bdi_destroy(struct backing_dev_info *bdi) +{ + int i; + + for (i = 0; i < NR_BDI_STAT_ITEMS; i++) + percpu_counter_destroy(&bdi->bdi_stat[i]); + + prop_local_destroy_percpu(&bdi->completions); +} +EXPORT_SYMBOL(bdi_destroy); + static wait_queue_head_t congestion_wqh[2] = { __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]), __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1]) @@ -55,15 +90,3 @@ long congestion_wait(int rw, long timeout) } EXPORT_SYMBOL(congestion_wait); -/** - * congestion_end - wake up sleepers on a congested backing_dev_info - * @rw: READ or WRITE - */ -void congestion_end(int rw) -{ - wait_queue_head_t 
*wqh = &congestion_wqh[rw]; - - if (waitqueue_active(wqh)) - wake_up(wqh); -} -EXPORT_SYMBOL(congestion_end); diff --git a/mm/bounce.c b/mm/bounce.c index 3b549bf31f7..b6d2d0f1019 100644 --- a/mm/bounce.c +++ b/mm/bounce.c @@ -265,6 +265,12 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) mempool_t *pool; /* + * Data-less bio, nothing to bounce + */ + if (bio_empty_barrier(*bio_orig)) + return; + + /* * for non-isa bounce case, just check if the bounce pfn is equal * to or bigger than the highest pfn in the system -- in that case, * don't waste time iterating over bio segments diff --git a/mm/filemap.c b/mm/filemap.c index 15c8413ee92..79f24a969cb 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -30,7 +30,7 @@ #include <linux/security.h> #include <linux/syscalls.h> #include <linux/cpuset.h> -#include "filemap.h" +#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ #include "internal.h" /* @@ -63,6 +63,7 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, * ->private_lock (__free_pte->__set_page_dirty_buffers) * ->swap_lock (exclusive_swap_page, others) * ->mapping->tree_lock + * ->zone.lock * * ->i_mutex * ->i_mmap_lock (truncate->unmap_mapping_range) @@ -593,7 +594,7 @@ void fastcall __lock_page_nosync(struct page *page) * Is there a pagecache struct page at the given (mapping, offset) tuple? * If yes, increment its refcount and return it; if no, return NULL. */ -struct page * find_get_page(struct address_space *mapping, unsigned long offset) +struct page * find_get_page(struct address_space *mapping, pgoff_t offset) { struct page *page; @@ -617,30 +618,31 @@ EXPORT_SYMBOL(find_get_page); * Returns zero if the page was not present. find_lock_page() may sleep. 
*/ struct page *find_lock_page(struct address_space *mapping, - unsigned long offset) + pgoff_t offset) { struct page *page; - read_lock_irq(&mapping->tree_lock); repeat: + read_lock_irq(&mapping->tree_lock); page = radix_tree_lookup(&mapping->page_tree, offset); if (page) { page_cache_get(page); if (TestSetPageLocked(page)) { read_unlock_irq(&mapping->tree_lock); __lock_page(page); - read_lock_irq(&mapping->tree_lock); /* Has the page been truncated while we slept? */ - if (unlikely(page->mapping != mapping || - page->index != offset)) { + if (unlikely(page->mapping != mapping)) { unlock_page(page); page_cache_release(page); goto repeat; } + VM_BUG_ON(page->index != offset); + goto out; } } read_unlock_irq(&mapping->tree_lock); +out: return page; } EXPORT_SYMBOL(find_lock_page); @@ -663,29 +665,24 @@ EXPORT_SYMBOL(find_lock_page); * memory exhaustion. */ struct page *find_or_create_page(struct address_space *mapping, - unsigned long index, gfp_t gfp_mask) + pgoff_t index, gfp_t gfp_mask) { - struct page *page, *cached_page = NULL; + struct page *page; int err; repeat: page = find_lock_page(mapping, index); if (!page) { - if (!cached_page) { - cached_page = - __page_cache_alloc(gfp_mask); - if (!cached_page) - return NULL; + page = __page_cache_alloc(gfp_mask); + if (!page) + return NULL; + err = add_to_page_cache_lru(page, mapping, index, gfp_mask); + if (unlikely(err)) { + page_cache_release(page); + page = NULL; + if (err == -EEXIST) + goto repeat; } - err = add_to_page_cache_lru(cached_page, mapping, - index, gfp_mask); - if (!err) { - page = cached_page; - cached_page = NULL; - } else if (err == -EEXIST) - goto repeat; } - if (cached_page) - page_cache_release(cached_page); return page; } EXPORT_SYMBOL(find_or_create_page); @@ -797,7 +794,7 @@ EXPORT_SYMBOL(find_get_pages_tag); * and deadlock against the caller's locked page. 
*/ struct page * -grab_cache_page_nowait(struct address_space *mapping, unsigned long index) +grab_cache_page_nowait(struct address_space *mapping, pgoff_t index) { struct page *page = find_get_page(mapping, index); @@ -859,34 +856,29 @@ static void shrink_readahead_size_eio(struct file *filp, * It may be NULL. */ void do_generic_mapping_read(struct address_space *mapping, - struct file_ra_state *_ra, + struct file_ra_state *ra, struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor) { struct inode *inode = mapping->host; - unsigned long index; - unsigned long offset; - unsigned long last_index; - unsigned long next_index; - unsigned long prev_index; + pgoff_t index; + pgoff_t last_index; + pgoff_t prev_index; + unsigned long offset; /* offset into pagecache page */ unsigned int prev_offset; - struct page *cached_page; int error; - struct file_ra_state ra = *_ra; - cached_page = NULL; index = *ppos >> PAGE_CACHE_SHIFT; - next_index = index; - prev_index = ra.prev_index; - prev_offset = ra.prev_offset; + prev_index = ra->prev_pos >> PAGE_CACHE_SHIFT; + prev_offset = ra->prev_pos & (PAGE_CACHE_SIZE-1); last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; offset = *ppos & ~PAGE_CACHE_MASK; for (;;) { struct page *page; - unsigned long end_index; + pgoff_t end_index; loff_t isize; unsigned long nr, ret; @@ -895,7 +887,7 @@ find_page: page = find_get_page(mapping, index); if (!page) { page_cache_sync_readahead(mapping, - &ra, filp, + ra, filp, index, last_index - index); page = find_get_page(mapping, index); if (unlikely(page == NULL)) @@ -903,7 +895,7 @@ find_page: } if (PageReadahead(page)) { page_cache_async_readahead(mapping, - &ra, filp, page, + ra, filp, page, index, last_index - index); } if (!PageUptodate(page)) @@ -966,7 +958,6 @@ page_ok: index += offset >> PAGE_CACHE_SHIFT; offset &= ~PAGE_CACHE_MASK; prev_offset = offset; - ra.prev_offset = offset; page_cache_release(page); if (ret == nr && desc->count) @@ -1015,7 
+1006,7 @@ readpage: } unlock_page(page); error = -EIO; - shrink_readahead_size_eio(filp, &ra); + shrink_readahead_size_eio(filp, ra); goto readpage_error; } unlock_page(page); @@ -1034,33 +1025,29 @@ no_cached_page: * Ok, it wasn't cached, so we need to create a new * page.. */ - if (!cached_page) { - cached_page = page_cache_alloc_cold(mapping); - if (!cached_page) { - desc->error = -ENOMEM; - goto out; - } + page = page_cache_alloc_cold(mapping); + if (!page) { + desc->error = -ENOMEM; + goto out; } - error = add_to_page_cache_lru(cached_page, mapping, + error = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL); if (error) { + page_cache_release(page); if (error == -EEXIST) goto find_page; desc->error = error; goto out; } - page = cached_page; - cached_page = NULL; goto readpage; } out: - *_ra = ra; - _ra->prev_index = prev_index; + ra->prev_pos = prev_index; + ra->prev_pos <<= PAGE_CACHE_SHIFT; + ra->prev_pos |= prev_offset; - *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset; - if (cached_page) - page_cache_release(cached_page); + *ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset; if (filp) file_accessed(filp); } @@ -1220,7 +1207,7 @@ EXPORT_SYMBOL(generic_file_aio_read); static ssize_t do_readahead(struct address_space *mapping, struct file *filp, - unsigned long index, unsigned long nr) + pgoff_t index, unsigned long nr) { if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage) return -EINVAL; @@ -1240,8 +1227,8 @@ asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count) if (file) { if (file->f_mode & FMODE_READ) { struct address_space *mapping = file->f_mapping; - unsigned long start = offset >> PAGE_CACHE_SHIFT; - unsigned long end = (offset + count - 1) >> PAGE_CACHE_SHIFT; + pgoff_t start = offset >> PAGE_CACHE_SHIFT; + pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT; unsigned long len = end - start + 1; ret = do_readahead(mapping, file, start, len); } @@ -1251,7 +1238,6 @@ asmlinkage ssize_t sys_readahead(int fd, 
loff_t offset, size_t count) } #ifdef CONFIG_MMU -static int FASTCALL(page_cache_read(struct file * file, unsigned long offset)); /** * page_cache_read - adds requested page to the page cache if not already there * @file: file to read @@ -1260,7 +1246,7 @@ static int FASTCALL(page_cache_read(struct file * file, unsigned long offset)); * This adds the requested page to the page cache if it isn't already there, * and schedules an I/O to read in its contents from disk. */ -static int fastcall page_cache_read(struct file * file, unsigned long offset) +static int fastcall page_cache_read(struct file * file, pgoff_t offset) { struct address_space *mapping = file->f_mapping; struct page *page; @@ -1349,7 +1335,7 @@ retry_find: * Do we miss much more than hit in this file? If so, * stop bothering with read-ahead. It will only hurt. */ - if (ra->mmap_miss > ra->mmap_hit + MMAP_LOTSAMISS) + if (ra->mmap_miss > MMAP_LOTSAMISS) goto no_cached_page; /* @@ -1375,7 +1361,7 @@ retry_find: } if (!did_readaround) - ra->mmap_hit++; + ra->mmap_miss--; /* * We have a locked page in the page cache, now we need to check @@ -1396,7 +1382,7 @@ retry_find: * Found the page and have a reference on it. 
*/ mark_page_accessed(page); - ra->prev_index = page->index; + ra->prev_pos = (loff_t)page->index << PAGE_CACHE_SHIFT; vmf->page = page; return ret | VM_FAULT_LOCKED; @@ -1501,39 +1487,32 @@ EXPORT_SYMBOL(generic_file_mmap); EXPORT_SYMBOL(generic_file_readonly_mmap); static struct page *__read_cache_page(struct address_space *mapping, - unsigned long index, + pgoff_t index, int (*filler)(void *,struct page*), void *data) { - struct page *page, *cached_page = NULL; + struct page *page; int err; repeat: page = find_get_page(mapping, index); if (!page) { - if (!cached_page) { - cached_page = page_cache_alloc_cold(mapping); - if (!cached_page) - return ERR_PTR(-ENOMEM); - } - err = add_to_page_cache_lru(cached_page, mapping, - index, GFP_KERNEL); - if (err == -EEXIST) - goto repeat; - if (err < 0) { + page = page_cache_alloc_cold(mapping); + if (!page) + return ERR_PTR(-ENOMEM); + err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL); + if (unlikely(err)) { + page_cache_release(page); + if (err == -EEXIST) + goto repeat; /* Presumably ENOMEM for radix tree node */ - page_cache_release(cached_page); return ERR_PTR(err); } - page = cached_page; - cached_page = NULL; err = filler(data, page); if (err < 0) { page_cache_release(page); page = ERR_PTR(err); } } - if (cached_page) - page_cache_release(cached_page); return page; } @@ -1542,7 +1521,7 @@ repeat: * after submitting it to the filler. */ struct page *read_cache_page_async(struct address_space *mapping, - unsigned long index, + pgoff_t index, int (*filler)(void *,struct page*), void *data) { @@ -1590,7 +1569,7 @@ EXPORT_SYMBOL(read_cache_page_async); * If the page does not get brought uptodate, return -EIO. 
*/ struct page *read_cache_page(struct address_space *mapping, - unsigned long index, + pgoff_t index, int (*filler)(void *,struct page*), void *data) { @@ -1610,40 +1589,6 @@ struct page *read_cache_page(struct address_space *mapping, EXPORT_SYMBOL(read_cache_page); /* - * If the page was newly created, increment its refcount and add it to the - * caller's lru-buffering pagevec. This function is specifically for - * generic_file_write(). - */ -static inline struct page * -__grab_cache_page(struct address_space *mapping, unsigned long index, - struct page **cached_page, struct pagevec *lru_pvec) -{ - int err; - struct page *page; -repeat: - page = find_lock_page(mapping, index); - if (!page) { - if (!*cached_page) { - *cached_page = page_cache_alloc(mapping); - if (!*cached_page) - return NULL; - } - err = add_to_page_cache(*cached_page, mapping, - index, GFP_KERNEL); - if (err == -EEXIST) - goto repeat; - if (err == 0) { - page = *cached_page; - page_cache_get(page); - if (!pagevec_add(lru_pvec, page)) - __pagevec_lru_add(lru_pvec); - *cached_page = NULL; - } - } - return page; -} - -/* * The logic we want is * * if suid or (sgid and xgrp) @@ -1682,17 +1627,22 @@ int __remove_suid(struct dentry *dentry, int kill) int remove_suid(struct dentry *dentry) { - int kill = should_remove_suid(dentry); + int killsuid = should_remove_suid(dentry); + int killpriv = security_inode_need_killpriv(dentry); + int error = 0; - if (unlikely(kill)) - return __remove_suid(dentry, kill); + if (killpriv < 0) + return killpriv; + if (killpriv) + error = security_inode_killpriv(dentry); + if (!error && killsuid) + error = __remove_suid(dentry, killsuid); - return 0; + return error; } EXPORT_SYMBOL(remove_suid); -size_t -__filemap_copy_from_user_iovec_inatomic(char *vaddr, +static size_t __iovec_copy_from_user_inatomic(char *vaddr, const struct iovec *iov, size_t base, size_t bytes) { size_t copied = 0, left = 0; @@ -1715,6 +1665,124 @@ __filemap_copy_from_user_iovec_inatomic(char *vaddr, 
} /* + * Copy as much as we can into the page and return the number of bytes which + * were sucessfully copied. If a fault is encountered then return the number of + * bytes which were copied. + */ +size_t iov_iter_copy_from_user_atomic(struct page *page, + struct iov_iter *i, unsigned long offset, size_t bytes) +{ + char *kaddr; + size_t copied; + + BUG_ON(!in_atomic()); + kaddr = kmap_atomic(page, KM_USER0); + if (likely(i->nr_segs == 1)) { + int left; + char __user *buf = i->iov->iov_base + i->iov_offset; + left = __copy_from_user_inatomic_nocache(kaddr + offset, + buf, bytes); + copied = bytes - left; + } else { + copied = __iovec_copy_from_user_inatomic(kaddr + offset, + i->iov, i->iov_offset, bytes); + } + kunmap_atomic(kaddr, KM_USER0); + + return copied; +} +EXPORT_SYMBOL(iov_iter_copy_from_user_atomic); + +/* + * This has the same sideeffects and return value as + * iov_iter_copy_from_user_atomic(). + * The difference is that it attempts to resolve faults. + * Page must not be locked. 
+ */ +size_t iov_iter_copy_from_user(struct page *page, + struct iov_iter *i, unsigned long offset, size_t bytes) +{ + char *kaddr; + size_t copied; + + kaddr = kmap(page); + if (likely(i->nr_segs == 1)) { + int left; + char __user *buf = i->iov->iov_base + i->iov_offset; + left = __copy_from_user_nocache(kaddr + offset, buf, bytes); + copied = bytes - left; + } else { + copied = __iovec_copy_from_user_inatomic(kaddr + offset, + i->iov, i->iov_offset, bytes); + } + kunmap(page); + return copied; +} +EXPORT_SYMBOL(iov_iter_copy_from_user); + +static void __iov_iter_advance_iov(struct iov_iter *i, size_t bytes) +{ + if (likely(i->nr_segs == 1)) { + i->iov_offset += bytes; + } else { + const struct iovec *iov = i->iov; + size_t base = i->iov_offset; + + while (bytes) { + int copy = min(bytes, iov->iov_len - base); + + bytes -= copy; + base += copy; + if (iov->iov_len == base) { + iov++; + base = 0; + } + } + i->iov = iov; + i->iov_offset = base; + } +} + +void iov_iter_advance(struct iov_iter *i, size_t bytes) +{ + BUG_ON(i->count < bytes); + + __iov_iter_advance_iov(i, bytes); + i->count -= bytes; +} +EXPORT_SYMBOL(iov_iter_advance); + +/* + * Fault in the first iovec of the given iov_iter, to a maximum length + * of bytes. Returns 0 on success, or non-zero if the memory could not be + * accessed (ie. because it is an invalid address). + * + * writev-intensive code may want this to prefault several iovecs -- that + * would be possible (callers must not rely on the fact that _only_ the + * first iovec will be faulted with the current implementation). + */ +int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes) +{ + char __user *buf = i->iov->iov_base + i->iov_offset; + bytes = min(bytes, i->iov->iov_len - i->iov_offset); + return fault_in_pages_readable(buf, bytes); +} +EXPORT_SYMBOL(iov_iter_fault_in_readable); + +/* + * Return the count of just the current iov_iter segment. 
+ */ +size_t iov_iter_single_seg_count(struct iov_iter *i) +{ + const struct iovec *iov = i->iov; + if (i->nr_segs == 1) + return i->count; + else + return min(i->count, iov->iov_len - i->iov_offset); +} +EXPORT_SYMBOL(iov_iter_single_seg_count); + +/* * Performs necessary checks before doing a write * * Can adjust writing position or amount of bytes to write. @@ -1796,6 +1864,91 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i } EXPORT_SYMBOL(generic_write_checks); +int pagecache_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + const struct address_space_operations *aops = mapping->a_ops; + + if (aops->write_begin) { + return aops->write_begin(file, mapping, pos, len, flags, + pagep, fsdata); + } else { + int ret; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + unsigned offset = pos & (PAGE_CACHE_SIZE - 1); + struct inode *inode = mapping->host; + struct page *page; +again: + page = __grab_cache_page(mapping, index); + *pagep = page; + if (!page) + return -ENOMEM; + + if (flags & AOP_FLAG_UNINTERRUPTIBLE && !PageUptodate(page)) { + /* + * There is no way to resolve a short write situation + * for a !Uptodate page (except by double copying in + * the caller done by generic_perform_write_2copy). + * + * Instead, we have to bring it uptodate here. 
+ */ + ret = aops->readpage(file, page); + page_cache_release(page); + if (ret) { + if (ret == AOP_TRUNCATED_PAGE) + goto again; + return ret; + } + goto again; + } + + ret = aops->prepare_write(file, page, offset, offset+len); + if (ret) { + unlock_page(page); + page_cache_release(page); + if (pos + len > inode->i_size) + vmtruncate(inode, inode->i_size); + } + return ret; + } +} +EXPORT_SYMBOL(pagecache_write_begin); + +int pagecache_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + const struct address_space_operations *aops = mapping->a_ops; + int ret; + + if (aops->write_end) { + mark_page_accessed(page); + ret = aops->write_end(file, mapping, pos, len, copied, + page, fsdata); + } else { + unsigned offset = pos & (PAGE_CACHE_SIZE - 1); + struct inode *inode = mapping->host; + + flush_dcache_page(page); + ret = aops->commit_write(file, page, offset, offset+len); + unlock_page(page); + mark_page_accessed(page); + page_cache_release(page); + + if (ret < 0) { + if (pos + len > inode->i_size) + vmtruncate(inode, inode->i_size); + } else if (ret > 0) + ret = min_t(size_t, copied, ret); + else + ret = copied; + } + + return ret; +} +EXPORT_SYMBOL(pagecache_write_end); + ssize_t generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, unsigned long *nr_segs, loff_t pos, loff_t *ppos, @@ -1835,151 +1988,314 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, } EXPORT_SYMBOL(generic_file_direct_write); -ssize_t -generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos, loff_t *ppos, - size_t count, ssize_t written) +/* + * Find or create a page at the given pagecache position. Return the locked + * page. This function is specifically for buffered writes. 
+ */ +struct page *__grab_cache_page(struct address_space *mapping, pgoff_t index) { - struct file *file = iocb->ki_filp; - struct address_space * mapping = file->f_mapping; - const struct address_space_operations *a_ops = mapping->a_ops; - struct inode *inode = mapping->host; - long status = 0; - struct page *page; - struct page *cached_page = NULL; - size_t bytes; - struct pagevec lru_pvec; - const struct iovec *cur_iov = iov; /* current iovec */ - size_t iov_base = 0; /* offset in the current iovec */ - char __user *buf; - - pagevec_init(&lru_pvec, 0); + int status; + struct page *page; +repeat: + page = find_lock_page(mapping, index); + if (likely(page)) + return page; - /* - * handle partial DIO write. Adjust cur_iov if needed. - */ - if (likely(nr_segs == 1)) - buf = iov->iov_base + written; - else { - filemap_set_next_iovec(&cur_iov, &iov_base, written); - buf = cur_iov->iov_base + iov_base; + page = page_cache_alloc(mapping); + if (!page) + return NULL; + status = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL); + if (unlikely(status)) { + page_cache_release(page); + if (status == -EEXIST) + goto repeat; + return NULL; } + return page; +} +EXPORT_SYMBOL(__grab_cache_page); + +static ssize_t generic_perform_write_2copy(struct file *file, + struct iov_iter *i, loff_t pos) +{ + struct address_space *mapping = file->f_mapping; + const struct address_space_operations *a_ops = mapping->a_ops; + struct inode *inode = mapping->host; + long status = 0; + ssize_t written = 0; do { - unsigned long index; - unsigned long offset; - size_t copied; + struct page *src_page; + struct page *page; + pgoff_t index; /* Pagecache index for current page */ + unsigned long offset; /* Offset into pagecache page */ + unsigned long bytes; /* Bytes to write to page */ + size_t copied; /* Bytes copied from user */ - offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ + offset = (pos & (PAGE_CACHE_SIZE - 1)); index = pos >> PAGE_CACHE_SHIFT; - bytes = PAGE_CACHE_SIZE - 
offset; + bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, + iov_iter_count(i)); - /* Limit the size of the copy to the caller's write size */ - bytes = min(bytes, count); - - /* We only need to worry about prefaulting when writes are from - * user-space. NFSd uses vfs_writev with several non-aligned - * segments in the vector, and limiting to one segment a time is - * a noticeable performance for re-write + /* + * a non-NULL src_page indicates that we're doing the + * copy via get_user_pages and kmap. */ - if (!segment_eq(get_fs(), KERNEL_DS)) { - /* - * Limit the size of the copy to that of the current - * segment, because fault_in_pages_readable() doesn't - * know how to walk segments. - */ - bytes = min(bytes, cur_iov->iov_len - iov_base); + src_page = NULL; - /* - * Bring in the user page that we will copy from - * _first_. Otherwise there's a nasty deadlock on - * copying from the same page as we're writing to, - * without it being marked up-to-date. - */ - fault_in_pages_readable(buf, bytes); + /* + * Bring in the user page that we will copy from _first_. + * Otherwise there's a nasty deadlock on copying from the + * same page as we're writing to, without it being marked + * up-to-date. + * + * Not only is this an optimisation, but it is also required + * to check that the address is actually valid, when atomic + * usercopies are used, below. + */ + if (unlikely(iov_iter_fault_in_readable(i, bytes))) { + status = -EFAULT; + break; } - page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec); + + page = __grab_cache_page(mapping, index); if (!page) { status = -ENOMEM; break; } - if (unlikely(bytes == 0)) { - status = 0; - copied = 0; - goto zero_length_segment; - } + /* + * non-uptodate pages cannot cope with short copies, and we + * cannot take a pagefault with the destination page locked. + * So pin the source page to copy it. 
+ */ + if (!PageUptodate(page) && !segment_eq(get_fs(), KERNEL_DS)) { + unlock_page(page); - status = a_ops->prepare_write(file, page, offset, offset+bytes); - if (unlikely(status)) { - loff_t isize = i_size_read(inode); + src_page = alloc_page(GFP_KERNEL); + if (!src_page) { + page_cache_release(page); + status = -ENOMEM; + break; + } - if (status != AOP_TRUNCATED_PAGE) + /* + * Cannot get_user_pages with a page locked for the + * same reason as we can't take a page fault with a + * page locked (as explained below). + */ + copied = iov_iter_copy_from_user(src_page, i, + offset, bytes); + if (unlikely(copied == 0)) { + status = -EFAULT; + page_cache_release(page); + page_cache_release(src_page); + break; + } + bytes = copied; + + lock_page(page); + /* + * Can't handle the page going uptodate here, because + * that means we would use non-atomic usercopies, which + * zero out the tail of the page, which can cause + * zeroes to become transiently visible. We could just + * use a non-zeroing copy, but the APIs aren't too + * consistent. + */ + if (unlikely(!page->mapping || PageUptodate(page))) { unlock_page(page); - page_cache_release(page); - if (status == AOP_TRUNCATED_PAGE) + page_cache_release(page); + page_cache_release(src_page); continue; + } + } + + status = a_ops->prepare_write(file, page, offset, offset+bytes); + if (unlikely(status)) + goto fs_write_aop_error; + + if (!src_page) { /* - * prepare_write() may have instantiated a few blocks - * outside i_size. Trim these off again. + * Must not enter the pagefault handler here, because + * we hold the page lock, so we might recursively + * deadlock on the same lock, or get an ABBA deadlock + * against a different lock, or against the mmap_sem + * (which nests outside the page lock). So increment + * preempt count, and use _atomic usercopies. 
+ * + * The page is uptodate so we are OK to encounter a + * short copy: if unmodified parts of the page are + * marked dirty and written out to disk, it doesn't + * really matter. */ - if (pos + bytes > isize) - vmtruncate(inode, isize); - break; + pagefault_disable(); + copied = iov_iter_copy_from_user_atomic(page, i, + offset, bytes); + pagefault_enable(); + } else { + void *src, *dst; + src = kmap_atomic(src_page, KM_USER0); + dst = kmap_atomic(page, KM_USER1); + memcpy(dst + offset, src + offset, bytes); + kunmap_atomic(dst, KM_USER1); + kunmap_atomic(src, KM_USER0); + copied = bytes; } - if (likely(nr_segs == 1)) - copied = filemap_copy_from_user(page, offset, - buf, bytes); - else - copied = filemap_copy_from_user_iovec(page, offset, - cur_iov, iov_base, bytes); flush_dcache_page(page); + status = a_ops->commit_write(file, page, offset, offset+bytes); - if (status == AOP_TRUNCATED_PAGE) { - page_cache_release(page); - continue; - } -zero_length_segment: - if (likely(copied >= 0)) { - if (!status) - status = copied; - - if (status >= 0) { - written += status; - count -= status; - pos += status; - buf += status; - if (unlikely(nr_segs > 1)) { - filemap_set_next_iovec(&cur_iov, - &iov_base, status); - if (count) - buf = cur_iov->iov_base + - iov_base; - } else { - iov_base += status; - } - } - } - if (unlikely(copied != bytes)) - if (status >= 0) - status = -EFAULT; + if (unlikely(status < 0)) + goto fs_write_aop_error; + if (unlikely(status > 0)) /* filesystem did partial write */ + copied = min_t(size_t, copied, status); + unlock_page(page); mark_page_accessed(page); page_cache_release(page); - if (status < 0) - break; + if (src_page) + page_cache_release(src_page); + + iov_iter_advance(i, copied); + pos += copied; + written += copied; + balance_dirty_pages_ratelimited(mapping); cond_resched(); - } while (count); - *ppos = pos; + continue; - if (cached_page) - page_cache_release(cached_page); +fs_write_aop_error: + unlock_page(page); + 
page_cache_release(page); + if (src_page) + page_cache_release(src_page); + + /* + * prepare_write() may have instantiated a few blocks + * outside i_size. Trim these off again. Don't need + * i_size_read because we hold i_mutex. + */ + if (pos + bytes > inode->i_size) + vmtruncate(inode, inode->i_size); + break; + } while (iov_iter_count(i)); + + return written ? written : status; +} + +static ssize_t generic_perform_write(struct file *file, + struct iov_iter *i, loff_t pos) +{ + struct address_space *mapping = file->f_mapping; + const struct address_space_operations *a_ops = mapping->a_ops; + long status = 0; + ssize_t written = 0; + unsigned int flags = 0; /* - * For now, when the user asks for O_SYNC, we'll actually give O_DSYNC + * Copies from kernel address space cannot fail (NFSD is a big user). */ + if (segment_eq(get_fs(), KERNEL_DS)) + flags |= AOP_FLAG_UNINTERRUPTIBLE; + + do { + struct page *page; + pgoff_t index; /* Pagecache index for current page */ + unsigned long offset; /* Offset into pagecache page */ + unsigned long bytes; /* Bytes to write to page */ + size_t copied; /* Bytes copied from user */ + void *fsdata; + + offset = (pos & (PAGE_CACHE_SIZE - 1)); + index = pos >> PAGE_CACHE_SHIFT; + bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, + iov_iter_count(i)); + +again: + + /* + * Bring in the user page that we will copy from _first_. + * Otherwise there's a nasty deadlock on copying from the + * same page as we're writing to, without it being marked + * up-to-date. + * + * Not only is this an optimisation, but it is also required + * to check that the address is actually valid, when atomic + * usercopies are used, below. 
+ */ + if (unlikely(iov_iter_fault_in_readable(i, bytes))) { + status = -EFAULT; + break; + } + + status = a_ops->write_begin(file, mapping, pos, bytes, flags, + &page, &fsdata); + if (unlikely(status)) + break; + + pagefault_disable(); + copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); + pagefault_enable(); + flush_dcache_page(page); + + status = a_ops->write_end(file, mapping, pos, bytes, copied, + page, fsdata); + if (unlikely(status < 0)) + break; + copied = status; + + cond_resched(); + + if (unlikely(copied == 0)) { + /* + * If we were unable to copy any data at all, we must + * fall back to a single segment length write. + * + * If we didn't fallback here, we could livelock + * because not all segments in the iov can be copied at + * once without a pagefault. + */ + bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, + iov_iter_single_seg_count(i)); + goto again; + } + iov_iter_advance(i, copied); + pos += copied; + written += copied; + + balance_dirty_pages_ratelimited(mapping); + + } while (iov_iter_count(i)); + + return written ? 
written : status; +} + +ssize_t +generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos, loff_t *ppos, + size_t count, ssize_t written) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + const struct address_space_operations *a_ops = mapping->a_ops; + struct inode *inode = mapping->host; + ssize_t status; + struct iov_iter i; + + iov_iter_init(&i, iov, nr_segs, count, written); + if (a_ops->write_begin) + status = generic_perform_write(file, &i, pos); + else + status = generic_perform_write_2copy(file, &i, pos); + if (likely(status >= 0)) { + written += status; + *ppos = pos + status; + + /* + * For now, when the user asks for O_SYNC, we'll actually give + * O_DSYNC + */ if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) { if (!a_ops->writepage || !is_sync_kiocb(iocb)) status = generic_osync_inode(inode, mapping, @@ -1995,7 +2311,6 @@ zero_length_segment: if (unlikely(file->f_flags & O_DIRECT) && written) status = filemap_write_and_wait(mapping); - pagevec_lru_add(&lru_pvec); return written ? written : status; } EXPORT_SYMBOL(generic_file_buffered_write); diff --git a/mm/filemap.h b/mm/filemap.h deleted file mode 100644 index c2bff04c84e..00000000000 --- a/mm/filemap.h +++ /dev/null @@ -1,103 +0,0 @@ -/* - * linux/mm/filemap.h - * - * Copyright (C) 1994-1999 Linus Torvalds - */ - -#ifndef __FILEMAP_H -#define __FILEMAP_H - -#include <linux/types.h> -#include <linux/fs.h> -#include <linux/mm.h> -#include <linux/highmem.h> -#include <linux/uio.h> -#include <linux/uaccess.h> - -size_t -__filemap_copy_from_user_iovec_inatomic(char *vaddr, - const struct iovec *iov, - size_t base, - size_t bytes); - -/* - * Copy as much as we can into the page and return the number of bytes which - * were sucessfully copied. If a fault is encountered then clear the page - * out to (offset+bytes) and return the number of bytes which were copied. 
- * - * NOTE: For this to work reliably we really want copy_from_user_inatomic_nocache - * to *NOT* zero any tail of the buffer that it failed to copy. If it does, - * and if the following non-atomic copy succeeds, then there is a small window - * where the target page contains neither the data before the write, nor the - * data after the write (it contains zero). A read at this time will see - * data that is inconsistent with any ordering of the read and the write. - * (This has been detected in practice). - */ -static inline size_t -filemap_copy_from_user(struct page *page, unsigned long offset, - const char __user *buf, unsigned bytes) -{ - char *kaddr; - int left; - - kaddr = kmap_atomic(page, KM_USER0); - left = __copy_from_user_inatomic_nocache(kaddr + offset, buf, bytes); - kunmap_atomic(kaddr, KM_USER0); - - if (left != 0) { - /* Do it the slow way */ - kaddr = kmap(page); - left = __copy_from_user_nocache(kaddr + offset, buf, bytes); - kunmap(page); - } - return bytes - left; -} - -/* - * This has the same sideeffects and return value as filemap_copy_from_user(). - * The difference is that on a fault we need to memset the remainder of the - * page (out to offset+bytes), to emulate filemap_copy_from_user()'s - * single-segment behaviour. 
- */ -static inline size_t -filemap_copy_from_user_iovec(struct page *page, unsigned long offset, - const struct iovec *iov, size_t base, size_t bytes) -{ - char *kaddr; - size_t copied; - - kaddr = kmap_atomic(page, KM_USER0); - copied = __filemap_copy_from_user_iovec_inatomic(kaddr + offset, iov, - base, bytes); - kunmap_atomic(kaddr, KM_USER0); - if (copied != bytes) { - kaddr = kmap(page); - copied = __filemap_copy_from_user_iovec_inatomic(kaddr + offset, iov, - base, bytes); - if (bytes - copied) - memset(kaddr + offset + copied, 0, bytes - copied); - kunmap(page); - } - return copied; -} - -static inline void -filemap_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes) -{ - const struct iovec *iov = *iovp; - size_t base = *basep; - - do { - int copy = min(bytes, iov->iov_len - base); - - bytes -= copy; - base += copy; - if (iov->iov_len == base) { - iov++; - base = 0; - } - } while (bytes); - *iovp = iov; - *basep = base; -} -#endif diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 53ee6a29963..32132f3cd64 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -15,7 +15,6 @@ #include <linux/rmap.h> #include <linux/sched.h> #include <asm/tlbflush.h> -#include "filemap.h" /* * We do use our own empty page to avoid interference with other users @@ -288,6 +287,7 @@ __xip_file_write(struct file *filp, const char __user *buf, unsigned long index; unsigned long offset; size_t copied; + char *kaddr; offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ index = pos >> PAGE_CACHE_SHIFT; @@ -295,14 +295,6 @@ __xip_file_write(struct file *filp, const char __user *buf, if (bytes > count) bytes = count; - /* - * Bring in the user page that we will copy from _first_. - * Otherwise there's a nasty deadlock on copying from the - * same page as we're writing to, without it being marked - * up-to-date. 
- */ - fault_in_pages_readable(buf, bytes); - page = a_ops->get_xip_page(mapping, index*(PAGE_SIZE/512), 0); if (IS_ERR(page) && (PTR_ERR(page) == -ENODATA)) { @@ -319,8 +311,13 @@ __xip_file_write(struct file *filp, const char __user *buf, break; } - copied = filemap_copy_from_user(page, offset, buf, bytes); + fault_in_pages_readable(buf, bytes); + kaddr = kmap_atomic(page, KM_USER0); + copied = bytes - + __copy_from_user_inatomic_nocache(kaddr, buf, bytes); + kunmap_atomic(kaddr, KM_USER0); flush_dcache_page(page); + if (likely(copied > 0)) { status = copied; diff --git a/mm/fremap.c b/mm/fremap.c index 95bcb5641c7..14bd3bf7826 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -5,7 +5,7 @@ * * started by Ingo Molnar, Copyright (C) 2002, 2003 */ - +#include <linux/backing-dev.h> #include <linux/mm.h> #include <linux/swap.h> #include <linux/file.h> @@ -97,26 +97,28 @@ static int populate_range(struct mm_struct *mm, struct vm_area_struct *vma, } -/*** - * sys_remap_file_pages - remap arbitrary pages of a shared backing store - * file within an existing vma. +/** + * sys_remap_file_pages - remap arbitrary pages of an existing VM_SHARED vma * @start: start of the remapped virtual memory range * @size: size of the remapped virtual memory range - * @prot: new protection bits of the range - * @pgoff: to be mapped page of the backing store file + * @prot: new protection bits of the range (see NOTE) + * @pgoff: to-be-mapped page of the backing store file * @flags: 0 or MAP_NONBLOCKED - the later will cause no IO. * - * this syscall works purely via pagetables, so it's the most efficient + * sys_remap_file_pages remaps arbitrary pages of an existing VM_SHARED vma + * (shared backing store file). + * + * This syscall works purely via pagetables, so it's the most efficient * way to map the same (large) file into a given virtual window. Unlike * mmap()/mremap() it does not create any new vmas. The new mappings are * also safe across swapout. 
* - * NOTE: the 'prot' parameter right now is ignored, and the vma's default - * protection is used. Arbitrary protections might be implemented in the - * future. + * NOTE: the 'prot' parameter right now is ignored (but must be zero), + * and the vma's default protection is used. Arbitrary protections + * might be implemented in the future. */ asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size, - unsigned long __prot, unsigned long pgoff, unsigned long flags) + unsigned long prot, unsigned long pgoff, unsigned long flags) { struct mm_struct *mm = current->mm; struct address_space *mapping; @@ -125,7 +127,7 @@ asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size, int err = -EINVAL; int has_write_lock = 0; - if (__prot) + if (prot) return err; /* * Sanitize the syscall parameters: diff --git a/mm/hugetlb.c b/mm/hugetlb.c index eab8c428cc9..ae2959bb59c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -23,12 +23,16 @@ const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages; +static unsigned long surplus_huge_pages; unsigned long max_huge_pages; static struct list_head hugepage_freelists[MAX_NUMNODES]; static unsigned int nr_huge_pages_node[MAX_NUMNODES]; static unsigned int free_huge_pages_node[MAX_NUMNODES]; +static unsigned int surplus_huge_pages_node[MAX_NUMNODES]; static gfp_t htlb_alloc_mask = GFP_HIGHUSER; unsigned long hugepages_treat_as_movable; +int hugetlb_dynamic_pool; +static int hugetlb_next_nid; /* * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages @@ -85,6 +89,8 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma, list_del(&page->lru); free_huge_pages--; free_huge_pages_node[nid]--; + if (vma && vma->vm_flags & VM_MAYSHARE) + resv_huge_pages--; break; } } @@ -92,58 +98,269 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma, return page; } +static void 
update_and_free_page(struct page *page) +{ + int i; + nr_huge_pages--; + nr_huge_pages_node[page_to_nid(page)]--; + for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) { + page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | + 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | + 1 << PG_private | 1<< PG_writeback); + } + set_compound_page_dtor(page, NULL); + set_page_refcounted(page); + __free_pages(page, HUGETLB_PAGE_ORDER); +} + static void free_huge_page(struct page *page) { - BUG_ON(page_count(page)); + int nid = page_to_nid(page); + BUG_ON(page_count(page)); INIT_LIST_HEAD(&page->lru); spin_lock(&hugetlb_lock); - enqueue_huge_page(page); + if (surplus_huge_pages_node[nid]) { + update_and_free_page(page); + surplus_huge_pages--; + surplus_huge_pages_node[nid]--; + } else { + enqueue_huge_page(page); + } spin_unlock(&hugetlb_lock); } -static int alloc_fresh_huge_page(void) +/* + * Increment or decrement surplus_huge_pages. Keep node-specific counters + * balanced by operating on them in a round-robin fashion. + * Returns 1 if an adjustment was made. + */ +static int adjust_pool_surplus(int delta) { static int prev_nid; - struct page *page; - int nid; + int nid = prev_nid; + int ret = 0; + + VM_BUG_ON(delta != -1 && delta != 1); + do { + nid = next_node(nid, node_online_map); + if (nid == MAX_NUMNODES) + nid = first_node(node_online_map); + + /* To shrink on this node, there must be a surplus page */ + if (delta < 0 && !surplus_huge_pages_node[nid]) + continue; + /* Surplus cannot exceed the total number of pages */ + if (delta > 0 && surplus_huge_pages_node[nid] >= + nr_huge_pages_node[nid]) + continue; + + surplus_huge_pages += delta; + surplus_huge_pages_node[nid] += delta; + ret = 1; + break; + } while (nid != prev_nid); - /* - * Copy static prev_nid to local nid, work on that, then copy it - * back to prev_nid afterwards: otherwise there's a window in which - * a racer might pass invalid nid MAX_NUMNODES to alloc_pages_node. 
- * But we don't need to use a spin_lock here: it really doesn't - * matter if occasionally a racer chooses the same nid as we do. - */ - nid = next_node(prev_nid, node_online_map); - if (nid == MAX_NUMNODES) - nid = first_node(node_online_map); prev_nid = nid; + return ret; +} + +static struct page *alloc_fresh_huge_page_node(int nid) +{ + struct page *page; - page = alloc_pages_node(nid, htlb_alloc_mask|__GFP_COMP|__GFP_NOWARN, + page = alloc_pages_node(nid, + htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|__GFP_NOWARN, + HUGETLB_PAGE_ORDER); + if (page) { + set_compound_page_dtor(page, free_huge_page); + spin_lock(&hugetlb_lock); + nr_huge_pages++; + nr_huge_pages_node[nid]++; + spin_unlock(&hugetlb_lock); + put_page(page); /* free it into the hugepage allocator */ + } + + return page; +} + +static int alloc_fresh_huge_page(void) +{ + struct page *page; + int start_nid; + int next_nid; + int ret = 0; + + start_nid = hugetlb_next_nid; + + do { + page = alloc_fresh_huge_page_node(hugetlb_next_nid); + if (page) + ret = 1; + /* + * Use a helper variable to find the next node and then + * copy it back to hugetlb_next_nid afterwards: + * otherwise there's a window in which a racer might + * pass invalid nid MAX_NUMNODES to alloc_pages_node. + * But we don't need to use a spin_lock here: it really + * doesn't matter if occasionally a racer chooses the + * same nid as we do. Move nid forward in the mask even + * if we just successfully allocated a hugepage so that + * the next caller gets hugepages on the next node. 
+ */ + next_nid = next_node(hugetlb_next_nid, node_online_map); + if (next_nid == MAX_NUMNODES) + next_nid = first_node(node_online_map); + hugetlb_next_nid = next_nid; + } while (!page && hugetlb_next_nid != start_nid); + + return ret; +} + +static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma, + unsigned long address) +{ + struct page *page; + + /* Check if the dynamic pool is enabled */ + if (!hugetlb_dynamic_pool) + return NULL; + + page = alloc_pages(htlb_alloc_mask|__GFP_COMP|__GFP_NOWARN, HUGETLB_PAGE_ORDER); if (page) { set_compound_page_dtor(page, free_huge_page); spin_lock(&hugetlb_lock); nr_huge_pages++; nr_huge_pages_node[page_to_nid(page)]++; + surplus_huge_pages++; + surplus_huge_pages_node[page_to_nid(page)]++; spin_unlock(&hugetlb_lock); - put_page(page); /* free it into the hugepage allocator */ - return 1; } - return 0; + + return page; +} + +/* + * Increase the hugetlb pool such that it can accomodate a reservation + * of size 'delta'. + */ +static int gather_surplus_pages(int delta) +{ + struct list_head surplus_list; + struct page *page, *tmp; + int ret, i; + int needed, allocated; + + needed = (resv_huge_pages + delta) - free_huge_pages; + if (needed <= 0) + return 0; + + allocated = 0; + INIT_LIST_HEAD(&surplus_list); + + ret = -ENOMEM; +retry: + spin_unlock(&hugetlb_lock); + for (i = 0; i < needed; i++) { + page = alloc_buddy_huge_page(NULL, 0); + if (!page) { + /* + * We were not able to allocate enough pages to + * satisfy the entire reservation so we free what + * we've allocated so far. + */ + spin_lock(&hugetlb_lock); + needed = 0; + goto free; + } + + list_add(&page->lru, &surplus_list); + } + allocated += needed; + + /* + * After retaking hugetlb_lock, we need to recalculate 'needed' + * because either resv_huge_pages or free_huge_pages may have changed. 
+ */ + spin_lock(&hugetlb_lock); + needed = (resv_huge_pages + delta) - (free_huge_pages + allocated); + if (needed > 0) + goto retry; + + /* + * The surplus_list now contains _at_least_ the number of extra pages + * needed to accomodate the reservation. Add the appropriate number + * of pages to the hugetlb pool and free the extras back to the buddy + * allocator. + */ + needed += allocated; + ret = 0; +free: + list_for_each_entry_safe(page, tmp, &surplus_list, lru) { + list_del(&page->lru); + if ((--needed) >= 0) + enqueue_huge_page(page); + else { + /* + * Decrement the refcount and free the page using its + * destructor. This must be done with hugetlb_lock + * unlocked which is safe because free_huge_page takes + * hugetlb_lock before deciding how to free the page. + */ + spin_unlock(&hugetlb_lock); + put_page(page); + spin_lock(&hugetlb_lock); + } + } + + return ret; +} + +/* + * When releasing a hugetlb pool reservation, any surplus pages that were + * allocated to satisfy the reservation must be explicitly freed if they were + * never used. 
+ */ +void return_unused_surplus_pages(unsigned long unused_resv_pages) +{ + static int nid = -1; + struct page *page; + unsigned long nr_pages; + + nr_pages = min(unused_resv_pages, surplus_huge_pages); + + while (nr_pages) { + nid = next_node(nid, node_online_map); + if (nid == MAX_NUMNODES) + nid = first_node(node_online_map); + + if (!surplus_huge_pages_node[nid]) + continue; + + if (!list_empty(&hugepage_freelists[nid])) { + page = list_entry(hugepage_freelists[nid].next, + struct page, lru); + list_del(&page->lru); + update_and_free_page(page); + free_huge_pages--; + free_huge_pages_node[nid]--; + surplus_huge_pages--; + surplus_huge_pages_node[nid]--; + nr_pages--; + } + } } static struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr) { - struct page *page; + struct page *page = NULL; + int use_reserved_page = vma->vm_flags & VM_MAYSHARE; spin_lock(&hugetlb_lock); - if (vma->vm_flags & VM_MAYSHARE) - resv_huge_pages--; - else if (free_huge_pages <= resv_huge_pages) + if (!use_reserved_page && (free_huge_pages <= resv_huge_pages)) goto fail; page = dequeue_huge_page(vma, addr); @@ -155,10 +372,17 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, return page; fail: - if (vma->vm_flags & VM_MAYSHARE) - resv_huge_pages++; spin_unlock(&hugetlb_lock); - return NULL; + + /* + * Private mappings do not use reserved huge pages so the allocation + * may have failed due to an undersized hugetlb pool. Try to grab a + * surplus huge page from the buddy allocator. 
+ */ + if (!use_reserved_page) + page = alloc_buddy_huge_page(vma, addr); + + return page; } static int __init hugetlb_init(void) @@ -171,6 +395,8 @@ static int __init hugetlb_init(void) for (i = 0; i < MAX_NUMNODES; ++i) INIT_LIST_HEAD(&hugepage_freelists[i]); + hugetlb_next_nid = first_node(node_online_map); + for (i = 0; i < max_huge_pages; ++i) { if (!alloc_fresh_huge_page()) break; @@ -201,21 +427,6 @@ static unsigned int cpuset_mems_nr(unsigned int *array) } #ifdef CONFIG_SYSCTL -static void update_and_free_page(struct page *page) -{ - int i; - nr_huge_pages--; - nr_huge_pages_node[page_to_nid(page)]--; - for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) { - page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | - 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | - 1 << PG_private | 1<< PG_writeback); - } - set_compound_page_dtor(page, NULL); - set_page_refcounted(page); - __free_pages(page, HUGETLB_PAGE_ORDER); -} - #ifdef CONFIG_HIGHMEM static void try_to_free_low(unsigned long count) { @@ -224,14 +435,14 @@ static void try_to_free_low(unsigned long count) for (i = 0; i < MAX_NUMNODES; ++i) { struct page *page, *next; list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) { + if (count >= nr_huge_pages) + return; if (PageHighMem(page)) continue; list_del(&page->lru); update_and_free_page(page); free_huge_pages--; free_huge_pages_node[page_to_nid(page)]--; - if (count >= nr_huge_pages) - return; } } } @@ -241,26 +452,61 @@ static inline void try_to_free_low(unsigned long count) } #endif +#define persistent_huge_pages (nr_huge_pages - surplus_huge_pages) static unsigned long set_max_huge_pages(unsigned long count) { - while (count > nr_huge_pages) { - if (!alloc_fresh_huge_page()) - return nr_huge_pages; - } - if (count >= nr_huge_pages) - return nr_huge_pages; + unsigned long min_count, ret; + /* + * Increase the pool size + * First take pages out of surplus state. 
Then make up the + * remaining difference by allocating fresh huge pages. + */ spin_lock(&hugetlb_lock); - count = max(count, resv_huge_pages); - try_to_free_low(count); - while (count < nr_huge_pages) { + while (surplus_huge_pages && count > persistent_huge_pages) { + if (!adjust_pool_surplus(-1)) + break; + } + + while (count > persistent_huge_pages) { + int ret; + /* + * If this allocation races such that we no longer need the + * page, free_huge_page will handle it by freeing the page + * and reducing the surplus. + */ + spin_unlock(&hugetlb_lock); + ret = alloc_fresh_huge_page(); + spin_lock(&hugetlb_lock); + if (!ret) + goto out; + + } + + /* + * Decrease the pool size + * First return free pages to the buddy allocator (being careful + * to keep enough around to satisfy reservations). Then place + * pages into surplus state as needed so the pool will shrink + * to the desired size as pages become free. + */ + min_count = resv_huge_pages + nr_huge_pages - free_huge_pages; + min_count = max(count, min_count); + try_to_free_low(min_count); + while (min_count < persistent_huge_pages) { struct page *page = dequeue_huge_page(NULL, 0); if (!page) break; update_and_free_page(page); } + while (count < persistent_huge_pages) { + if (!adjust_pool_surplus(1)) + break; + } +out: + ret = persistent_huge_pages; spin_unlock(&hugetlb_lock); - return nr_huge_pages; + return ret; } int hugetlb_sysctl_handler(struct ctl_table *table, int write, @@ -292,10 +538,12 @@ int hugetlb_report_meminfo(char *buf) "HugePages_Total: %5lu\n" "HugePages_Free: %5lu\n" "HugePages_Rsvd: %5lu\n" + "HugePages_Surp: %5lu\n" "Hugepagesize: %5lu kB\n", nr_huge_pages, free_huge_pages, resv_huge_pages, + surplus_huge_pages, HPAGE_SIZE/1024); } @@ -355,7 +603,6 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma, entry = pte_mkwrite(pte_mkdirty(*ptep)); if (ptep_set_access_flags(vma, address, ptep, entry, 1)) { update_mmu_cache(vma, address, entry); - lazy_mmu_prot_update(entry); } } @@ 
-708,7 +955,6 @@ void hugetlb_change_protection(struct vm_area_struct *vma, pte = huge_ptep_get_and_clear(mm, address, ptep); pte = pte_mkhuge(pte_modify(pte, newprot)); set_huge_pte_at(mm, address, ptep, pte); - lazy_mmu_prot_update(pte); } } spin_unlock(&mm->page_table_lock); @@ -843,21 +1089,6 @@ static int hugetlb_acct_memory(long delta) int ret = -ENOMEM; spin_lock(&hugetlb_lock); - if ((delta + resv_huge_pages) <= free_huge_pages) { - resv_huge_pages += delta; - ret = 0; - } - spin_unlock(&hugetlb_lock); - return ret; -} - -int hugetlb_reserve_pages(struct inode *inode, long from, long to) -{ - long ret, chg; - - chg = region_chg(&inode->i_mapping->private_list, from, to); - if (chg < 0) - return chg; /* * When cpuset is configured, it breaks the strict hugetlb page * reservation as the accounting is done on a global variable. Such @@ -875,8 +1106,31 @@ int hugetlb_reserve_pages(struct inode *inode, long from, long to) * a best attempt and hopefully to minimize the impact of changing * semantics that cpuset has. */ - if (chg > cpuset_mems_nr(free_huge_pages_node)) - return -ENOMEM; + if (delta > 0) { + if (gather_surplus_pages(delta) < 0) + goto out; + + if (delta > cpuset_mems_nr(free_huge_pages_node)) + goto out; + } + + ret = 0; + resv_huge_pages += delta; + if (delta < 0) + return_unused_surplus_pages((unsigned long) -delta); + +out: + spin_unlock(&hugetlb_lock); + return ret; +} + +int hugetlb_reserve_pages(struct inode *inode, long from, long to) +{ + long ret, chg; + + chg = region_chg(&inode->i_mapping->private_list, from, to); + if (chg < 0) + return chg; ret = hugetlb_acct_memory(chg); if (ret < 0) diff --git a/mm/internal.h b/mm/internal.h index a3110c02aea..953f941ea86 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -37,4 +37,14 @@ static inline void __put_page(struct page *page) extern void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order); +/* + * function for dealing with page's order in buddy system. 
+ * zone->lock is already acquired when we use these. + * So, we don't need atomic page->flags operations here. + */ +static inline unsigned long page_order(struct page *page) +{ + VM_BUG_ON(!PageBuddy(page)); + return page_private(page); +} #endif diff --git a/mm/memory.c b/mm/memory.c index f82b359b274..bd16dcaeefb 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -966,7 +966,7 @@ no_page_table: * has touched so far, we don't want to allocate page tables. */ if (flags & FOLL_ANON) { - page = ZERO_PAGE(address); + page = ZERO_PAGE(0); if (flags & FOLL_GET) get_page(page); BUG_ON(flags & FOLL_WRITE); @@ -1111,95 +1111,6 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, } EXPORT_SYMBOL(get_user_pages); -static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd, - unsigned long addr, unsigned long end, pgprot_t prot) -{ - pte_t *pte; - spinlock_t *ptl; - int err = 0; - - pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); - if (!pte) - return -EAGAIN; - arch_enter_lazy_mmu_mode(); - do { - struct page *page = ZERO_PAGE(addr); - pte_t zero_pte = pte_wrprotect(mk_pte(page, prot)); - - if (unlikely(!pte_none(*pte))) { - err = -EEXIST; - pte++; - break; - } - page_cache_get(page); - page_add_file_rmap(page); - inc_mm_counter(mm, file_rss); - set_pte_at(mm, addr, pte, zero_pte); - } while (pte++, addr += PAGE_SIZE, addr != end); - arch_leave_lazy_mmu_mode(); - pte_unmap_unlock(pte - 1, ptl); - return err; -} - -static inline int zeromap_pmd_range(struct mm_struct *mm, pud_t *pud, - unsigned long addr, unsigned long end, pgprot_t prot) -{ - pmd_t *pmd; - unsigned long next; - int err; - - pmd = pmd_alloc(mm, pud, addr); - if (!pmd) - return -EAGAIN; - do { - next = pmd_addr_end(addr, end); - err = zeromap_pte_range(mm, pmd, addr, next, prot); - if (err) - break; - } while (pmd++, addr = next, addr != end); - return err; -} - -static inline int zeromap_pud_range(struct mm_struct *mm, pgd_t *pgd, - unsigned long addr, unsigned long end, pgprot_t prot) -{ - 
pud_t *pud; - unsigned long next; - int err; - - pud = pud_alloc(mm, pgd, addr); - if (!pud) - return -EAGAIN; - do { - next = pud_addr_end(addr, end); - err = zeromap_pmd_range(mm, pud, addr, next, prot); - if (err) - break; - } while (pud++, addr = next, addr != end); - return err; -} - -int zeromap_page_range(struct vm_area_struct *vma, - unsigned long addr, unsigned long size, pgprot_t prot) -{ - pgd_t *pgd; - unsigned long next; - unsigned long end = addr + size; - struct mm_struct *mm = vma->vm_mm; - int err; - - BUG_ON(addr >= end); - pgd = pgd_offset(mm, addr); - flush_cache_range(vma, addr, end); - do { - next = pgd_addr_end(addr, end); - err = zeromap_pud_range(mm, pgd, addr, next, prot); - if (err) - break; - } while (pgd++, addr = next, addr != end); - return err; -} - pte_t * fastcall get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl) { pgd_t * pgd = pgd_offset(mm, addr); @@ -1700,10 +1611,8 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, flush_cache_page(vma, address, pte_pfn(orig_pte)); entry = pte_mkyoung(orig_pte); entry = maybe_mkwrite(pte_mkdirty(entry), vma); - if (ptep_set_access_flags(vma, address, page_table, entry,1)) { + if (ptep_set_access_flags(vma, address, page_table, entry,1)) update_mmu_cache(vma, address, entry); - lazy_mmu_prot_update(entry); - } ret |= VM_FAULT_WRITE; goto unlock; } @@ -1717,16 +1626,11 @@ gotten: if (unlikely(anon_vma_prepare(vma))) goto oom; - if (old_page == ZERO_PAGE(address)) { - new_page = alloc_zeroed_user_highpage_movable(vma, address); - if (!new_page) - goto oom; - } else { - new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); - if (!new_page) - goto oom; - cow_user_page(new_page, old_page, address, vma); - } + VM_BUG_ON(old_page == ZERO_PAGE(0)); + new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); + if (!new_page) + goto oom; + cow_user_page(new_page, old_page, address, vma); /* * Re-check the pte - we dropped the lock @@ -1744,7 
+1648,6 @@ gotten: flush_cache_page(vma, address, pte_pfn(orig_pte)); entry = mk_pte(new_page, vma->vm_page_prot); entry = maybe_mkwrite(pte_mkdirty(entry), vma); - lazy_mmu_prot_update(entry); /* * Clear the pte entry and flush it first, before updating the * pte with the new entry. This will avoid a race condition @@ -2252,44 +2155,28 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, spinlock_t *ptl; pte_t entry; - if (write_access) { - /* Allocate our own private page. */ - pte_unmap(page_table); - - if (unlikely(anon_vma_prepare(vma))) - goto oom; - page = alloc_zeroed_user_highpage_movable(vma, address); - if (!page) - goto oom; - - entry = mk_pte(page, vma->vm_page_prot); - entry = maybe_mkwrite(pte_mkdirty(entry), vma); + /* Allocate our own private page. */ + pte_unmap(page_table); - page_table = pte_offset_map_lock(mm, pmd, address, &ptl); - if (!pte_none(*page_table)) - goto release; - inc_mm_counter(mm, anon_rss); - lru_cache_add_active(page); - page_add_new_anon_rmap(page, vma, address); - } else { - /* Map the ZERO_PAGE - vm_page_prot is readonly */ - page = ZERO_PAGE(address); - page_cache_get(page); - entry = mk_pte(page, vma->vm_page_prot); + if (unlikely(anon_vma_prepare(vma))) + goto oom; + page = alloc_zeroed_user_highpage_movable(vma, address); + if (!page) + goto oom; - ptl = pte_lockptr(mm, pmd); - spin_lock(ptl); - if (!pte_none(*page_table)) - goto release; - inc_mm_counter(mm, file_rss); - page_add_file_rmap(page); - } + entry = mk_pte(page, vma->vm_page_prot); + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + page_table = pte_offset_map_lock(mm, pmd, address, &ptl); + if (!pte_none(*page_table)) + goto release; + inc_mm_counter(mm, anon_rss); + lru_cache_add_active(page); + page_add_new_anon_rmap(page, vma, address); set_pte_at(mm, address, page_table, entry); /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, address, entry); - lazy_mmu_prot_update(entry); unlock: 
pte_unmap_unlock(page_table, ptl); return 0; @@ -2442,7 +2329,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, /* no need to invalidate: a not-present page won't be cached */ update_mmu_cache(vma, address, entry); - lazy_mmu_prot_update(entry); } else { if (anon) page_cache_release(page); @@ -2470,7 +2356,7 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, int write_access, pte_t orig_pte) { pgoff_t pgoff = (((address & PAGE_MASK) - - vma->vm_start) >> PAGE_CACHE_SHIFT) + vma->vm_pgoff; + - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0); pte_unmap(page_table); @@ -2614,7 +2500,6 @@ static inline int handle_pte_fault(struct mm_struct *mm, entry = pte_mkyoung(entry); if (ptep_set_access_flags(vma, address, pte, entry, write_access)) { update_mmu_cache(vma, address, entry); - lazy_mmu_prot_update(entry); } else { /* * This is needed only for protection faults but the arch code diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index df9d554bea3..091b9c6c252 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -23,6 +23,9 @@ #include <linux/vmalloc.h> #include <linux/ioport.h> #include <linux/cpuset.h> +#include <linux/delay.h> +#include <linux/migrate.h> +#include <linux/page-isolation.h> #include <asm/tlbflush.h> @@ -161,14 +164,27 @@ static void grow_pgdat_span(struct pglist_data *pgdat, pgdat->node_start_pfn; } -int online_pages(unsigned long pfn, unsigned long nr_pages) +static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, + void *arg) { unsigned long i; + unsigned long onlined_pages = *(unsigned long *)arg; + struct page *page; + if (PageReserved(pfn_to_page(start_pfn))) + for (i = 0; i < nr_pages; i++) { + page = pfn_to_page(start_pfn + i); + online_page(page); + onlined_pages++; + } + *(unsigned long *)arg = onlined_pages; + return 0; +} + + +int online_pages(unsigned long pfn, unsigned long nr_pages) +{ 
unsigned long flags; unsigned long onlined_pages = 0; - struct resource res; - u64 section_end; - unsigned long start_pfn; struct zone *zone; int need_zonelists_rebuild = 0; @@ -191,32 +207,16 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) if (!populated_zone(zone)) need_zonelists_rebuild = 1; - res.start = (u64)pfn << PAGE_SHIFT; - res.end = res.start + ((u64)nr_pages << PAGE_SHIFT) - 1; - res.flags = IORESOURCE_MEM; /* we just need system ram */ - section_end = res.end; - - while ((res.start < res.end) && (find_next_system_ram(&res) >= 0)) { - start_pfn = (unsigned long)(res.start >> PAGE_SHIFT); - nr_pages = (unsigned long) - ((res.end + 1 - res.start) >> PAGE_SHIFT); - - if (PageReserved(pfn_to_page(start_pfn))) { - /* this region's page is not onlined now */ - for (i = 0; i < nr_pages; i++) { - struct page *page = pfn_to_page(start_pfn + i); - online_page(page); - onlined_pages++; - } - } - - res.start = res.end + 1; - res.end = section_end; - } + walk_memory_resource(pfn, nr_pages, &onlined_pages, + online_pages_range); zone->present_pages += onlined_pages; zone->zone_pgdat->node_present_pages += onlined_pages; setup_per_zone_pages_min(); + if (onlined_pages) { + kswapd_run(zone_to_nid(zone)); + node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); + } if (need_zonelists_rebuild) build_all_zonelists(); @@ -271,9 +271,6 @@ int add_memory(int nid, u64 start, u64 size) if (!pgdat) return -ENOMEM; new_pgdat = 1; - ret = kswapd_run(nid); - if (ret) - goto error; } /* call arch's memory hotadd */ @@ -308,3 +305,260 @@ error: return ret; } EXPORT_SYMBOL_GPL(add_memory); + +#ifdef CONFIG_MEMORY_HOTREMOVE +/* + * Confirm all pages in a range [start, end) is belongs to the same zone. 
+ */ +static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn) +{ + unsigned long pfn; + struct zone *zone = NULL; + struct page *page; + int i; + for (pfn = start_pfn; + pfn < end_pfn; + pfn += MAX_ORDER_NR_PAGES) { + i = 0; + /* This is just a CONFIG_HOLES_IN_ZONE check.*/ + while ((i < MAX_ORDER_NR_PAGES) && !pfn_valid_within(pfn + i)) + i++; + if (i == MAX_ORDER_NR_PAGES) + continue; + page = pfn_to_page(pfn + i); + if (zone && page_zone(page) != zone) + return 0; + zone = page_zone(page); + } + return 1; +} + +/* + * Scanning pfn is much easier than scanning lru list. + * Scan pfn from start to end and Find LRU page. + */ +int scan_lru_pages(unsigned long start, unsigned long end) +{ + unsigned long pfn; + struct page *page; + for (pfn = start; pfn < end; pfn++) { + if (pfn_valid(pfn)) { + page = pfn_to_page(pfn); + if (PageLRU(page)) + return pfn; + } + } + return 0; +} + +static struct page * +hotremove_migrate_alloc(struct page *page, + unsigned long private, + int **x) +{ + /* This should be improoooooved!! */ + return alloc_page(GFP_HIGHUSER_PAGECACHE); +} + + +#define NR_OFFLINE_AT_ONCE_PAGES (256) +static int +do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) +{ + unsigned long pfn; + struct page *page; + int move_pages = NR_OFFLINE_AT_ONCE_PAGES; + int not_managed = 0; + int ret = 0; + LIST_HEAD(source); + + for (pfn = start_pfn; pfn < end_pfn && move_pages > 0; pfn++) { + if (!pfn_valid(pfn)) + continue; + page = pfn_to_page(pfn); + if (!page_count(page)) + continue; + /* + * We can skip free pages. And we can only deal with pages on + * LRU. + */ + ret = isolate_lru_page(page, &source); + if (!ret) { /* Success */ + move_pages--; + } else { + /* Becasue we don't have big zone->lock. we should + check this again here. 
*/ + if (page_count(page)) + not_managed++; +#ifdef CONFIG_DEBUG_VM + printk(KERN_INFO "removing from LRU failed" + " %lx/%d/%lx\n", + pfn, page_count(page), page->flags); +#endif + } + } + ret = -EBUSY; + if (not_managed) { + if (!list_empty(&source)) + putback_lru_pages(&source); + goto out; + } + ret = 0; + if (list_empty(&source)) + goto out; + /* this function returns # of failed pages */ + ret = migrate_pages(&source, hotremove_migrate_alloc, 0); + +out: + return ret; +} + +/* + * remove from free_area[] and mark all as Reserved. + */ +static int +offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages, + void *data) +{ + __offline_isolated_pages(start, start + nr_pages); + return 0; +} + +static void +offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) +{ + walk_memory_resource(start_pfn, end_pfn - start_pfn, NULL, + offline_isolated_pages_cb); +} + +/* + * Check all pages in range, recoreded as memory resource, are isolated. + */ +static int +check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages, + void *data) +{ + int ret; + long offlined = *(long *)data; + ret = test_pages_isolated(start_pfn, start_pfn + nr_pages); + offlined = nr_pages; + if (!ret) + *(long *)data += offlined; + return ret; +} + +static long +check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) +{ + long offlined = 0; + int ret; + + ret = walk_memory_resource(start_pfn, end_pfn - start_pfn, &offlined, + check_pages_isolated_cb); + if (ret < 0) + offlined = (long)ret; + return offlined; +} + +extern void drain_all_local_pages(void); + +int offline_pages(unsigned long start_pfn, + unsigned long end_pfn, unsigned long timeout) +{ + unsigned long pfn, nr_pages, expire; + long offlined_pages; + int ret, drain, retry_max; + struct zone *zone; + + BUG_ON(start_pfn >= end_pfn); + /* at least, alignment against pageblock is necessary */ + if (!IS_ALIGNED(start_pfn, pageblock_nr_pages)) + return -EINVAL; + if (!IS_ALIGNED(end_pfn, 
pageblock_nr_pages)) + return -EINVAL; + /* This makes hotplug much easier...and readable. + we assume this for now. .*/ + if (!test_pages_in_a_zone(start_pfn, end_pfn)) + return -EINVAL; + /* set above range as isolated */ + ret = start_isolate_page_range(start_pfn, end_pfn); + if (ret) + return ret; + nr_pages = end_pfn - start_pfn; + pfn = start_pfn; + expire = jiffies + timeout; + drain = 0; + retry_max = 5; +repeat: + /* start memory hot removal */ + ret = -EAGAIN; + if (time_after(jiffies, expire)) + goto failed_removal; + ret = -EINTR; + if (signal_pending(current)) + goto failed_removal; + ret = 0; + if (drain) { + lru_add_drain_all(); + flush_scheduled_work(); + cond_resched(); + drain_all_local_pages(); + } + + pfn = scan_lru_pages(start_pfn, end_pfn); + if (pfn) { /* We have page on LRU */ + ret = do_migrate_range(pfn, end_pfn); + if (!ret) { + drain = 1; + goto repeat; + } else { + if (ret < 0) + if (--retry_max == 0) + goto failed_removal; + yield(); + drain = 1; + goto repeat; + } + } + /* drain all zone's lru pagevec, this is asyncronous... */ + lru_add_drain_all(); + flush_scheduled_work(); + yield(); + /* drain pcp pages , this is synchrouns. */ + drain_all_local_pages(); + /* check again */ + offlined_pages = check_pages_isolated(start_pfn, end_pfn); + if (offlined_pages < 0) { + ret = -EBUSY; + goto failed_removal; + } + printk(KERN_INFO "Offlined Pages %ld\n", offlined_pages); + /* Ok, all of our target is islaoted. + We cannot do rollback at this point. 
*/ + offline_isolated_pages(start_pfn, end_pfn); + /* reset pagetype flags */ + start_isolate_page_range(start_pfn, end_pfn); + /* removal success */ + zone = page_zone(pfn_to_page(start_pfn)); + zone->present_pages -= offlined_pages; + zone->zone_pgdat->node_present_pages -= offlined_pages; + totalram_pages -= offlined_pages; + num_physpages -= offlined_pages; + vm_total_pages = nr_free_pagecache_pages(); + writeback_set_ratelimit(); + return 0; + +failed_removal: + printk(KERN_INFO "memory offlining %lx to %lx failed\n", + start_pfn, end_pfn); + /* pushback to free area */ + undo_isolate_page_range(start_pfn, end_pfn); + return ret; +} +#else +int remove_memory(u64 start, u64 size) +{ + return -EINVAL; +} +EXPORT_SYMBOL_GPL(remove_memory); +#endif /* CONFIG_MEMORY_HOTREMOVE */ diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 3d6ac9505d0..568152ae6ca 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -72,7 +72,6 @@ #include <linux/hugetlb.h> #include <linux/kernel.h> #include <linux/sched.h> -#include <linux/mm.h> #include <linux/nodemask.h> #include <linux/cpuset.h> #include <linux/gfp.h> @@ -82,13 +81,13 @@ #include <linux/interrupt.h> #include <linux/init.h> #include <linux/compat.h> -#include <linux/mempolicy.h> #include <linux/swap.h> #include <linux/seq_file.h> #include <linux/proc_fs.h> #include <linux/migrate.h> #include <linux/rmap.h> #include <linux/security.h> +#include <linux/syscalls.h> #include <asm/tlbflush.h> #include <asm/uaccess.h> @@ -110,6 +109,9 @@ struct mempolicy default_policy = { .policy = MPOL_DEFAULT, }; +static void mpol_rebind_policy(struct mempolicy *pol, + const nodemask_t *newmask); + /* Do sanity checking on a policy */ static int mpol_check_policy(int mode, nodemask_t *nodes) { @@ -128,7 +130,7 @@ static int mpol_check_policy(int mode, nodemask_t *nodes) return -EINVAL; break; } - return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL; + return nodes_subset(*nodes, node_states[N_HIGH_MEMORY]) ? 
0 : -EINVAL; } /* Generate a custom zonelist for the BIND policy. */ @@ -185,7 +187,9 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes) switch (mode) { case MPOL_INTERLEAVE: policy->v.nodes = *nodes; - if (nodes_weight(*nodes) == 0) { + nodes_and(policy->v.nodes, policy->v.nodes, + node_states[N_HIGH_MEMORY]); + if (nodes_weight(policy->v.nodes) == 0) { kmem_cache_free(policy_cache, policy); return ERR_PTR(-EINVAL); } @@ -459,7 +463,7 @@ static void mpol_set_task_struct_flag(void) } /* Set the process memory policy */ -long do_set_mempolicy(int mode, nodemask_t *nodes) +static long do_set_mempolicy(int mode, nodemask_t *nodes) { struct mempolicy *new; @@ -494,9 +498,9 @@ static void get_zonemask(struct mempolicy *p, nodemask_t *nodes) *nodes = p->v.nodes; break; case MPOL_PREFERRED: - /* or use current node instead of online map? */ + /* or use current node instead of memory_map? */ if (p->v.preferred_node < 0) - *nodes = node_online_map; + *nodes = node_states[N_HIGH_MEMORY]; else node_set(p->v.preferred_node, *nodes); break; @@ -519,8 +523,8 @@ static int lookup_node(struct mm_struct *mm, unsigned long addr) } /* Retrieve NUMA policy */ -long do_get_mempolicy(int *policy, nodemask_t *nmask, - unsigned long addr, unsigned long flags) +static long do_get_mempolicy(int *policy, nodemask_t *nmask, + unsigned long addr, unsigned long flags) { int err; struct mm_struct *mm = current->mm; @@ -528,8 +532,18 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask, struct mempolicy *pol = current->mempolicy; cpuset_update_task_memory_state(); - if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR)) + if (flags & + ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED)) return -EINVAL; + + if (flags & MPOL_F_MEMS_ALLOWED) { + if (flags & (MPOL_F_NODE|MPOL_F_ADDR)) + return -EINVAL; + *policy = 0; /* just so it's initialized */ + *nmask = cpuset_current_mems_allowed; + return 0; + } + if (flags & MPOL_F_ADDR) { down_read(&mm->mmap_sem); vma = 
find_vma_intersection(mm, addr, addr+1); @@ -601,7 +615,8 @@ static struct page *new_node_page(struct page *page, unsigned long node, int **x * Migrate pages from one node to a target node. * Returns error or the number of pages not migrated. */ -int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags) +static int migrate_to_node(struct mm_struct *mm, int source, int dest, + int flags) { nodemask_t nmask; LIST_HEAD(pagelist); @@ -732,8 +747,9 @@ static struct page *new_vma_page(struct page *page, unsigned long private, int * } #endif -long do_mbind(unsigned long start, unsigned long len, - unsigned long mode, nodemask_t *nmask, unsigned long flags) +static long do_mbind(unsigned long start, unsigned long len, + unsigned long mode, nodemask_t *nmask, + unsigned long flags) { struct vm_area_struct *vma; struct mm_struct *mm = current->mm; @@ -955,7 +971,7 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, goto out; } - if (!nodes_subset(new, node_online_map)) { + if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) { err = -EINVAL; goto out; } @@ -978,7 +994,8 @@ asmlinkage long sys_get_mempolicy(int __user *policy, unsigned long maxnode, unsigned long addr, unsigned long flags) { - int err, pval; + int err; + int uninitialized_var(pval); nodemask_t nodes; if (nmask != NULL && maxnode < MAX_NUMNODES) @@ -1527,8 +1544,8 @@ static void sp_delete(struct shared_policy *sp, struct sp_node *n) kmem_cache_free(sn_cache, n); } -struct sp_node * -sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol) +static struct sp_node *sp_alloc(unsigned long start, unsigned long end, + struct mempolicy *pol) { struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL); @@ -1677,7 +1694,7 @@ void __init numa_policy_init(void) * fall back to the largest node if they're all smaller. 
*/ nodes_clear(interleave_nodes); - for_each_online_node(nid) { + for_each_node_state(nid, N_HIGH_MEMORY) { unsigned long total_pages = node_present_pages(nid); /* Preserve the largest node */ @@ -1706,7 +1723,8 @@ void numa_default_policy(void) } /* Migrate a policy to a different set of nodes */ -void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask) +static void mpol_rebind_policy(struct mempolicy *pol, + const nodemask_t *newmask) { nodemask_t *mpolmask; nodemask_t tmp; @@ -1963,7 +1981,7 @@ int show_numa_map(struct seq_file *m, void *v) seq_printf(m, " huge"); } else { check_pgd_range(vma, vma->vm_start, vma->vm_end, - &node_online_map, MPOL_MF_STATS, md); + &node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md); } if (!md->pages) @@ -1990,7 +2008,7 @@ int show_numa_map(struct seq_file *m, void *v) if (md->writeback) seq_printf(m," writeback=%lu", md->writeback); - for_each_online_node(n) + for_each_node_state(n, N_HIGH_MEMORY) if (md->node[n]) seq_printf(m, " N%d=%lu", n, md->node[n]); out: diff --git a/mm/migrate.c b/mm/migrate.c index e2fdbce1874..06d0877a66e 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -171,6 +171,7 @@ static void remove_migration_pte(struct vm_area_struct *vma, pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); if (is_write_migration_entry(entry)) pte = pte_mkwrite(pte); + flush_cache_page(vma, addr, pte_pfn(pte)); set_pte_at(mm, addr, ptep, pte); if (PageAnon(new)) @@ -180,7 +181,6 @@ static void remove_migration_pte(struct vm_area_struct *vma, /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, addr, pte); - lazy_mmu_prot_update(pte); out: pte_unmap_unlock(ptep, ptl); @@ -972,7 +972,7 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, * array. Return various errors if the user did something wrong. 
*/ for (i = 0; i < nr_pages; i++) { - const void *p; + const void __user *p; err = -EFAULT; if (get_user(p, pages + i)) @@ -986,7 +986,7 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, goto out; err = -ENODEV; - if (!node_online(node)) + if (!node_state(node, N_HIGH_MEMORY)) goto out; err = -EACCES; diff --git a/mm/mmap.c b/mm/mmap.c index 0d40e66c841..4275e81e25b 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -7,6 +7,7 @@ */ #include <linux/slab.h> +#include <linux/backing-dev.h> #include <linux/mm.h> #include <linux/shm.h> #include <linux/mman.h> @@ -180,8 +181,6 @@ error: return -ENOMEM; } -EXPORT_SYMBOL(__vm_enough_memory); - /* * Requires inode->i_mapping->i_mmap_lock */ diff --git a/mm/mprotect.c b/mm/mprotect.c index e8346c30abe..1d4d69790e5 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -53,7 +53,6 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, if (dirty_accountable && pte_dirty(ptent)) ptent = pte_mkwrite(ptent); set_pte_at(mm, addr, pte, ptent); - lazy_mmu_prot_update(ptent); #ifdef CONFIG_MIGRATION } else if (!pte_file(oldpte)) { swp_entry_t entry = pte_to_swp_entry(oldpte); diff --git a/mm/nommu.c b/mm/nommu.c index 8ed0cb43118..42fb84e9e81 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -44,7 +44,6 @@ int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; int heap_stack_gap = 0; EXPORT_SYMBOL(mem_map); -EXPORT_SYMBOL(__vm_enough_memory); EXPORT_SYMBOL(num_physpages); /* list of shareable VMAs */ diff --git a/mm/oom_kill.c b/mm/oom_kill.c index f9b82ad5047..a64decb5b13 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -27,6 +27,8 @@ #include <linux/notifier.h> int sysctl_panic_on_oom; +int sysctl_oom_kill_allocating_task; +static DEFINE_SPINLOCK(zone_scan_mutex); /* #define DEBUG */ /** @@ -141,7 +143,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) * because p may have allocated or otherwise mapped memory on * this node before. However it will be less likely. 
*/ - if (!cpuset_excl_nodes_overlap(p)) + if (!cpuset_mems_allowed_intersects(current, p)) points /= 8; /* @@ -164,27 +166,14 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) } /* - * Types of limitations to the nodes from which allocations may occur - */ -#define CONSTRAINT_NONE 1 -#define CONSTRAINT_MEMORY_POLICY 2 -#define CONSTRAINT_CPUSET 3 - -/* * Determine the type of allocation constraint. */ -static inline int constrained_alloc(struct zonelist *zonelist, gfp_t gfp_mask) +static inline enum oom_constraint constrained_alloc(struct zonelist *zonelist, + gfp_t gfp_mask) { #ifdef CONFIG_NUMA struct zone **z; - nodemask_t nodes; - int node; - - nodes_clear(nodes); - /* node has memory ? */ - for_each_online_node(node) - if (NODE_DATA(node)->node_present_pages) - node_set(node, nodes); + nodemask_t nodes = node_states[N_HIGH_MEMORY]; for (z = zonelist->zones; *z; z++) if (cpuset_zone_allowed_softwall(*z, gfp_mask)) @@ -344,12 +333,20 @@ static int oom_kill_task(struct task_struct *p) return 0; } -static int oom_kill_process(struct task_struct *p, unsigned long points, - const char *message) +static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, + unsigned long points, const char *message) { struct task_struct *c; struct list_head *tsk; + if (printk_ratelimit()) { + printk(KERN_WARNING "%s invoked oom-killer: " + "gfp_mask=0x%x, order=%d, oomkilladj=%d\n", + current->comm, gfp_mask, order, current->oomkilladj); + dump_stack(); + show_mem(); + } + /* * If the task is already exiting, don't alarm the sysadmin or kill * its children or threads, just set TIF_MEMDIE so it can die quickly @@ -387,6 +384,57 @@ int unregister_oom_notifier(struct notifier_block *nb) } EXPORT_SYMBOL_GPL(unregister_oom_notifier); +/* + * Try to acquire the OOM killer lock for the zones in zonelist. Returns zero + * if a parallel OOM killing is already taking place that includes a zone in + * the zonelist. 
Otherwise, locks all zones in the zonelist and returns 1. + */ +int try_set_zone_oom(struct zonelist *zonelist) +{ + struct zone **z; + int ret = 1; + + z = zonelist->zones; + + spin_lock(&zone_scan_mutex); + do { + if (zone_is_oom_locked(*z)) { + ret = 0; + goto out; + } + } while (*(++z) != NULL); + + /* + * Lock each zone in the zonelist under zone_scan_mutex so a parallel + * invocation of try_set_zone_oom() doesn't succeed when it shouldn't. + */ + z = zonelist->zones; + do { + zone_set_flag(*z, ZONE_OOM_LOCKED); + } while (*(++z) != NULL); +out: + spin_unlock(&zone_scan_mutex); + return ret; +} + +/* + * Clears the ZONE_OOM_LOCKED flag for all zones in the zonelist so that failed + * allocation attempts with zonelists containing them may now recall the OOM + * killer, if necessary. + */ +void clear_zonelist_oom(struct zonelist *zonelist) +{ + struct zone **z; + + z = zonelist->zones; + + spin_lock(&zone_scan_mutex); + do { + zone_clear_flag(*z, ZONE_OOM_LOCKED); + } while (*(++z) != NULL); + spin_unlock(&zone_scan_mutex); +} + /** * out_of_memory - kill the "best" process when we run out of memory * @@ -400,21 +448,13 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) struct task_struct *p; unsigned long points = 0; unsigned long freed = 0; - int constraint; + enum oom_constraint constraint; blocking_notifier_call_chain(&oom_notify_list, 0, &freed); if (freed > 0) /* Got some memory back in the last second. */ return; - if (printk_ratelimit()) { - printk(KERN_WARNING "%s invoked oom-killer: " - "gfp_mask=0x%x, order=%d, oomkilladj=%d\n", - current->comm, gfp_mask, order, current->oomkilladj); - dump_stack(); - show_mem(); - } - if (sysctl_panic_on_oom == 2) panic("out of memory. Compulsory panic_on_oom is selected.\n"); @@ -423,23 +463,24 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) * NUMA) that may require different handling. 
*/ constraint = constrained_alloc(zonelist, gfp_mask); - cpuset_lock(); read_lock(&tasklist_lock); switch (constraint) { case CONSTRAINT_MEMORY_POLICY: - oom_kill_process(current, points, + oom_kill_process(current, gfp_mask, order, points, "No available memory (MPOL_BIND)"); break; - case CONSTRAINT_CPUSET: - oom_kill_process(current, points, - "No available memory in cpuset"); - break; - case CONSTRAINT_NONE: if (sysctl_panic_on_oom) panic("out of memory. panic_on_oom is selected\n"); + /* Fall-through */ + case CONSTRAINT_CPUSET: + if (sysctl_oom_kill_allocating_task) { + oom_kill_process(current, gfp_mask, order, points, + "Out of memory (oom_kill_allocating_task)"); + break; + } retry: /* * Rambo mode: Shoot down a process and hope it solves whatever @@ -453,11 +494,11 @@ retry: /* Found nothing?!?! Either we hang forever, or we panic. */ if (!p) { read_unlock(&tasklist_lock); - cpuset_unlock(); panic("Out of memory and no killable processes...\n"); } - if (oom_kill_process(p, points, "Out of memory")) + if (oom_kill_process(p, points, gfp_mask, order, + "Out of memory")) goto retry; break; @@ -465,7 +506,6 @@ retry: out: read_unlock(&tasklist_lock); - cpuset_unlock(); /* * Give "p" a good chance of killing itself before we diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 44720363374..7845462064f 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2,6 +2,7 @@ * mm/page-writeback.c * * Copyright (C) 2002, Linus Torvalds. + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> * * Contains functions related to writing back dirty pages at the * address_space level. @@ -36,7 +37,7 @@ /* * The maximum number of pages to writeout in a single bdflush/kupdate - * operation. We do this so we don't hold I_LOCK against an inode for + * operation. We do this so we don't hold I_SYNC against an inode for * enormous amounts of time, which would block a userspace task which has * been forced to throttle against that inode. 
Also, the code reevaluates * the dirty each time it has written this many pages. @@ -49,8 +50,6 @@ */ static long ratelimit_pages = 32; -static int dirty_exceeded __cacheline_aligned_in_smp; /* Dirty mem may be over limit */ - /* * When balance_dirty_pages decides that the caller needs to perform some * non-background writeback, this is how many pages it will attempt to write. @@ -103,6 +102,141 @@ EXPORT_SYMBOL(laptop_mode); static void background_writeout(unsigned long _min_pages); /* + * Scale the writeback cache size proportional to the relative writeout speeds. + * + * We do this by keeping a floating proportion between BDIs, based on page + * writeback completions [end_page_writeback()]. Those devices that write out + * pages fastest will get the larger share, while the slower will get a smaller + * share. + * + * We use page writeout completions because we are interested in getting rid of + * dirty pages. Having them written out is the primary goal. + * + * We introduce a concept of time, a period over which we measure these events, + * because demand can/will vary over time. The length of this period itself is + * measured in page writeback completions. + * + */ +static struct prop_descriptor vm_completions; +static struct prop_descriptor vm_dirties; + +static unsigned long determine_dirtyable_memory(void); + +/* + * couple the period to the dirty_ratio: + * + * period/2 ~ roundup_pow_of_two(dirty limit) + */ +static int calc_period_shift(void) +{ + unsigned long dirty_total; + + dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) / 100; + return 2 + ilog2(dirty_total - 1); +} + +/* + * update the period when the dirty ratio changes. 
+ */ +int dirty_ratio_handler(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int old_ratio = vm_dirty_ratio; + int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); + if (ret == 0 && write && vm_dirty_ratio != old_ratio) { + int shift = calc_period_shift(); + prop_change_shift(&vm_completions, shift); + prop_change_shift(&vm_dirties, shift); + } + return ret; +} + +/* + * Increment the BDI's writeout completion count and the global writeout + * completion count. Called from test_clear_page_writeback(). + */ +static inline void __bdi_writeout_inc(struct backing_dev_info *bdi) +{ + __prop_inc_percpu(&vm_completions, &bdi->completions); +} + +static inline void task_dirty_inc(struct task_struct *tsk) +{ + prop_inc_single(&vm_dirties, &tsk->dirties); +} + +/* + * Obtain an accurate fraction of the BDI's portion. + */ +static void bdi_writeout_fraction(struct backing_dev_info *bdi, + long *numerator, long *denominator) +{ + if (bdi_cap_writeback_dirty(bdi)) { + prop_fraction_percpu(&vm_completions, &bdi->completions, + numerator, denominator); + } else { + *numerator = 0; + *denominator = 1; + } +} + +/* + * Clip the earned share of dirty pages to that which is actually available. + * This avoids exceeding the total dirty_limit when the floating averages + * fluctuate too quickly. 
+ */ +static void +clip_bdi_dirty_limit(struct backing_dev_info *bdi, long dirty, long *pbdi_dirty) +{ + long avail_dirty; + + avail_dirty = dirty - + (global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_WRITEBACK) + + global_page_state(NR_UNSTABLE_NFS)); + + if (avail_dirty < 0) + avail_dirty = 0; + + avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) + + bdi_stat(bdi, BDI_WRITEBACK); + + *pbdi_dirty = min(*pbdi_dirty, avail_dirty); +} + +static inline void task_dirties_fraction(struct task_struct *tsk, + long *numerator, long *denominator) +{ + prop_fraction_single(&vm_dirties, &tsk->dirties, + numerator, denominator); +} + +/* + * scale the dirty limit + * + * task specific dirty limit: + * + * dirty -= (dirty/8) * p_{t} + */ +void task_dirty_limit(struct task_struct *tsk, long *pdirty) +{ + long numerator, denominator; + long dirty = *pdirty; + u64 inv = dirty >> 3; + + task_dirties_fraction(tsk, &numerator, &denominator); + inv *= numerator; + do_div(inv, denominator); + + dirty -= inv; + if (dirty < *pdirty/2) + dirty = *pdirty/2; + + *pdirty = dirty; +} + +/* * Work out the current dirty-memory clamping and background writeout * thresholds. 
* @@ -126,7 +260,7 @@ static unsigned long highmem_dirtyable_memory(unsigned long total) int node; unsigned long x = 0; - for_each_online_node(node) { + for_each_node_state(node, N_HIGH_MEMORY) { struct zone *z = &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; @@ -158,8 +292,8 @@ static unsigned long determine_dirtyable_memory(void) } static void -get_dirty_limits(long *pbackground, long *pdirty, - struct address_space *mapping) +get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty, + struct backing_dev_info *bdi) { int background_ratio; /* Percentages */ int dirty_ratio; @@ -193,6 +327,23 @@ get_dirty_limits(long *pbackground, long *pdirty, } *pbackground = background; *pdirty = dirty; + + if (bdi) { + u64 bdi_dirty = dirty; + long numerator, denominator; + + /* + * Calculate this BDI's share of the dirty ratio. + */ + bdi_writeout_fraction(bdi, &numerator, &denominator); + + bdi_dirty *= numerator; + do_div(bdi_dirty, denominator); + + *pbdi_dirty = bdi_dirty; + clip_bdi_dirty_limit(bdi, dirty, pbdi_dirty); + task_dirty_limit(current, pbdi_dirty); + } } /* @@ -204,9 +355,11 @@ get_dirty_limits(long *pbackground, long *pdirty, */ static void balance_dirty_pages(struct address_space *mapping) { - long nr_reclaimable; + long bdi_nr_reclaimable; + long bdi_nr_writeback; long background_thresh; long dirty_thresh; + long bdi_thresh; unsigned long pages_written = 0; unsigned long write_chunk = sync_writeback_pages(); @@ -221,15 +374,15 @@ static void balance_dirty_pages(struct address_space *mapping) .range_cyclic = 1, }; - get_dirty_limits(&background_thresh, &dirty_thresh, mapping); - nr_reclaimable = global_page_state(NR_FILE_DIRTY) + - global_page_state(NR_UNSTABLE_NFS); - if (nr_reclaimable + global_page_state(NR_WRITEBACK) <= - dirty_thresh) - break; + get_dirty_limits(&background_thresh, &dirty_thresh, + &bdi_thresh, bdi); + bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); + bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); + if (bdi_nr_reclaimable + 
bdi_nr_writeback <= bdi_thresh) + break; - if (!dirty_exceeded) - dirty_exceeded = 1; + if (!bdi->dirty_exceeded) + bdi->dirty_exceeded = 1; /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. * Unstable writes are a feature of certain networked @@ -237,26 +390,42 @@ static void balance_dirty_pages(struct address_space *mapping) * written to the server's write cache, but has not yet * been flushed to permanent storage. */ - if (nr_reclaimable) { + if (bdi_nr_reclaimable) { writeback_inodes(&wbc); - get_dirty_limits(&background_thresh, - &dirty_thresh, mapping); - nr_reclaimable = global_page_state(NR_FILE_DIRTY) + - global_page_state(NR_UNSTABLE_NFS); - if (nr_reclaimable + - global_page_state(NR_WRITEBACK) - <= dirty_thresh) - break; pages_written += write_chunk - wbc.nr_to_write; - if (pages_written >= write_chunk) - break; /* We've done our duty */ + get_dirty_limits(&background_thresh, &dirty_thresh, + &bdi_thresh, bdi); } + + /* + * In order to avoid the stacked BDI deadlock we need + * to ensure we accurately count the 'dirty' pages when + * the threshold is low. + * + * Otherwise it would be possible to get thresh+n pages + * reported dirty, even though there are thresh-m pages + * actually dirty; with m+n sitting in the percpu + * deltas. 
+ */ + if (bdi_thresh < 2*bdi_stat_error(bdi)) { + bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); + bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK); + } else if (bdi_nr_reclaimable) { + bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); + bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); + } + + if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh) + break; + if (pages_written >= write_chunk) + break; /* We've done our duty */ + congestion_wait(WRITE, HZ/10); } - if (nr_reclaimable + global_page_state(NR_WRITEBACK) - <= dirty_thresh && dirty_exceeded) - dirty_exceeded = 0; + if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh && + bdi->dirty_exceeded) + bdi->dirty_exceeded = 0; if (writeback_in_progress(bdi)) return; /* pdflush is already working this queue */ @@ -270,7 +439,9 @@ static void balance_dirty_pages(struct address_space *mapping) * background_thresh, to keep the amount of dirty memory low. */ if ((laptop_mode && pages_written) || - (!laptop_mode && (nr_reclaimable > background_thresh))) + (!laptop_mode && (global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS) + > background_thresh))) pdflush_operation(background_writeout, 0); } @@ -306,7 +477,7 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, unsigned long *p; ratelimit = ratelimit_pages; - if (dirty_exceeded) + if (mapping->backing_dev_info->dirty_exceeded) ratelimit = 8; /* @@ -331,18 +502,8 @@ void throttle_vm_writeout(gfp_t gfp_mask) long background_thresh; long dirty_thresh; - if ((gfp_mask & (__GFP_FS|__GFP_IO)) != (__GFP_FS|__GFP_IO)) { - /* - * The caller might hold locks which can prevent IO completion - * or progress in the filesystem. So we cannot just sit here - * waiting for IO to complete. 
- */ - congestion_wait(WRITE, HZ/10); - return; - } - for ( ; ; ) { - get_dirty_limits(&background_thresh, &dirty_thresh, NULL); + get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); /* * Boost the allowable dirty threshold a bit for page @@ -354,6 +515,14 @@ void throttle_vm_writeout(gfp_t gfp_mask) global_page_state(NR_WRITEBACK) <= dirty_thresh) break; congestion_wait(WRITE, HZ/10); + + /* + * The caller might hold locks which can prevent IO completion + * or progress in the filesystem. So we cannot just sit here + * waiting for IO to complete. + */ + if ((gfp_mask & (__GFP_FS|__GFP_IO)) != (__GFP_FS|__GFP_IO)) + break; } } @@ -377,11 +546,12 @@ static void background_writeout(unsigned long _min_pages) long background_thresh; long dirty_thresh; - get_dirty_limits(&background_thresh, &dirty_thresh, NULL); + get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); if (global_page_state(NR_FILE_DIRTY) + global_page_state(NR_UNSTABLE_NFS) < background_thresh && min_pages <= 0) break; + wbc.more_io = 0; wbc.encountered_congestion = 0; wbc.nr_to_write = MAX_WRITEBACK_PAGES; wbc.pages_skipped = 0; @@ -389,8 +559,9 @@ static void background_writeout(unsigned long _min_pages) min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) { /* Wrote less than expected */ - congestion_wait(WRITE, HZ/10); - if (!wbc.encountered_congestion) + if (wbc.encountered_congestion || wbc.more_io) + congestion_wait(WRITE, HZ/10); + else break; } } @@ -455,11 +626,12 @@ static void wb_kupdate(unsigned long arg) global_page_state(NR_UNSTABLE_NFS) + (inodes_stat.nr_inodes - inodes_stat.nr_unused); while (nr_to_write > 0) { + wbc.more_io = 0; wbc.encountered_congestion = 0; wbc.nr_to_write = MAX_WRITEBACK_PAGES; writeback_inodes(&wbc); if (wbc.nr_to_write > 0) { - if (wbc.encountered_congestion) + if (wbc.encountered_congestion || wbc.more_io) congestion_wait(WRITE, HZ/10); else break; /* All the old data is written */ @@ -580,9 
+752,15 @@ static struct notifier_block __cpuinitdata ratelimit_nb = { */ void __init page_writeback_init(void) { + int shift; + mod_timer(&wb_timer, jiffies + dirty_writeback_interval); writeback_set_ratelimit(); register_cpu_notifier(&ratelimit_nb); + + shift = calc_period_shift(); + prop_descriptor_init(&vm_completions, shift); + prop_descriptor_init(&vm_dirties, shift); } /** @@ -672,8 +850,10 @@ retry: ret = (*writepage)(page, wbc, data); - if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) + if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) { unlock_page(page); + ret = 0; + } if (ret || (--(wbc->nr_to_write) <= 0)) done = 1; if (wbc->nonblocking && bdi_write_congested(bdi)) { @@ -827,6 +1007,8 @@ int __set_page_dirty_nobuffers(struct page *page) WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); if (mapping_cap_account_dirty(mapping)) { __inc_zone_page_state(page, NR_FILE_DIRTY); + __inc_bdi_stat(mapping->backing_dev_info, + BDI_RECLAIMABLE); task_io_account_write(PAGE_CACHE_SIZE); } radix_tree_tag_set(&mapping->page_tree, @@ -859,7 +1041,7 @@ EXPORT_SYMBOL(redirty_page_for_writepage); * If the mapping doesn't provide a set_page_dirty a_op, then * just fall through and assume that it wants buffer_heads. 
*/ -int fastcall set_page_dirty(struct page *page) +static int __set_page_dirty(struct page *page) { struct address_space *mapping = page_mapping(page); @@ -877,6 +1059,14 @@ int fastcall set_page_dirty(struct page *page) } return 0; } + +int fastcall set_page_dirty(struct page *page) +{ + int ret = __set_page_dirty(page); + if (ret) + task_dirty_inc(current); + return ret; +} EXPORT_SYMBOL(set_page_dirty); /* @@ -961,6 +1151,8 @@ int clear_page_dirty_for_io(struct page *page) */ if (TestClearPageDirty(page)) { dec_zone_page_state(page, NR_FILE_DIRTY); + dec_bdi_stat(mapping->backing_dev_info, + BDI_RECLAIMABLE); return 1; } return 0; @@ -975,14 +1167,20 @@ int test_clear_page_writeback(struct page *page) int ret; if (mapping) { + struct backing_dev_info *bdi = mapping->backing_dev_info; unsigned long flags; write_lock_irqsave(&mapping->tree_lock, flags); ret = TestClearPageWriteback(page); - if (ret) + if (ret) { radix_tree_tag_clear(&mapping->page_tree, page_index(page), PAGECACHE_TAG_WRITEBACK); + if (bdi_cap_writeback_dirty(bdi)) { + __dec_bdi_stat(bdi, BDI_WRITEBACK); + __bdi_writeout_inc(bdi); + } + } write_unlock_irqrestore(&mapping->tree_lock, flags); } else { ret = TestClearPageWriteback(page); @@ -998,14 +1196,18 @@ int test_set_page_writeback(struct page *page) int ret; if (mapping) { + struct backing_dev_info *bdi = mapping->backing_dev_info; unsigned long flags; write_lock_irqsave(&mapping->tree_lock, flags); ret = TestSetPageWriteback(page); - if (!ret) + if (!ret) { radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_WRITEBACK); + if (bdi_cap_writeback_dirty(bdi)) + __inc_bdi_stat(bdi, BDI_WRITEBACK); + } if (!PageDirty(page)) radix_tree_tag_clear(&mapping->page_tree, page_index(page), @@ -1022,17 +1224,15 @@ int test_set_page_writeback(struct page *page) EXPORT_SYMBOL(test_set_page_writeback); /* - * Return true if any of the pages in the mapping are marged with the + * Return true if any of the pages in the mapping are marked 
with the * passed tag. */ int mapping_tagged(struct address_space *mapping, int tag) { - unsigned long flags; int ret; - - read_lock_irqsave(&mapping->tree_lock, flags); + rcu_read_lock(); ret = radix_tree_tagged(&mapping->page_tree, tag); - read_unlock_irqrestore(&mapping->tree_lock, flags); + rcu_read_unlock(); return ret; } EXPORT_SYMBOL(mapping_tagged); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1a8c59571cb..43f757fcf30 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -27,6 +27,7 @@ #include <linux/pagevec.h> #include <linux/blkdev.h> #include <linux/slab.h> +#include <linux/oom.h> #include <linux/notifier.h> #include <linux/topology.h> #include <linux/sysctl.h> @@ -41,24 +42,37 @@ #include <linux/pfn.h> #include <linux/backing-dev.h> #include <linux/fault-inject.h> +#include <linux/page-isolation.h> #include <asm/tlbflush.h> #include <asm/div64.h> #include "internal.h" /* - * MCD - HACK: Find somewhere to initialize this EARLY, or make this - * initializer cleaner + * Array of node states. 
*/ -nodemask_t node_online_map __read_mostly = { { [0] = 1UL } }; -EXPORT_SYMBOL(node_online_map); -nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL; -EXPORT_SYMBOL(node_possible_map); +nodemask_t node_states[NR_NODE_STATES] __read_mostly = { + [N_POSSIBLE] = NODE_MASK_ALL, + [N_ONLINE] = { { [0] = 1UL } }, +#ifndef CONFIG_NUMA + [N_NORMAL_MEMORY] = { { [0] = 1UL } }, +#ifdef CONFIG_HIGHMEM + [N_HIGH_MEMORY] = { { [0] = 1UL } }, +#endif + [N_CPU] = { { [0] = 1UL } }, +#endif /* NUMA */ +}; +EXPORT_SYMBOL(node_states); + unsigned long totalram_pages __read_mostly; unsigned long totalreserve_pages __read_mostly; long nr_swap_pages; int percpu_pagelist_fraction; +#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE +int pageblock_order __read_mostly; +#endif + static void __free_pages_ok(struct page *page, unsigned int order); /* @@ -137,7 +151,7 @@ static unsigned long __meminitdata dma_reserve; static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES]; #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ unsigned long __initdata required_kernelcore; - unsigned long __initdata required_movablecore; + static unsigned long __initdata required_movablecore; unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ @@ -150,6 +164,14 @@ int nr_node_ids __read_mostly = MAX_NUMNODES; EXPORT_SYMBOL(nr_node_ids); #endif +int page_group_by_mobility_disabled __read_mostly; + +static void set_pageblock_migratetype(struct page *page, int migratetype) +{ + set_pageblock_flags_group(page, (unsigned long)migratetype, + PB_migrate, PB_migrate_end); +} + #ifdef CONFIG_DEBUG_VM static int page_outside_zone_boundaries(struct zone *zone, struct page *page) { @@ -293,16 +315,6 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) clear_highpage(page + i); } -/* - * function for dealing with page's order in buddy system. - * zone->lock is already acquired when we use these. 
- * So, we don't need atomic page->flags operations here. - */ -static inline unsigned long page_order(struct page *page) -{ - return page_private(page); -} - static inline void set_page_order(struct page *page, int order) { set_page_private(page, order); @@ -404,6 +416,7 @@ static inline void __free_one_page(struct page *page, { unsigned long page_idx; int order_size = 1 << order; + int migratetype = get_pageblock_migratetype(page); if (unlikely(PageCompound(page))) destroy_compound_page(page, order); @@ -416,7 +429,6 @@ static inline void __free_one_page(struct page *page, __mod_zone_page_state(zone, NR_FREE_PAGES, order_size); while (order < MAX_ORDER-1) { unsigned long combined_idx; - struct free_area *area; struct page *buddy; buddy = __page_find_buddy(page, page_idx, order); @@ -424,8 +436,7 @@ static inline void __free_one_page(struct page *page, break; /* Move the buddy up one level. */ list_del(&buddy->lru); - area = zone->free_area + order; - area->nr_free--; + zone->free_area[order].nr_free--; rmv_page_order(buddy); combined_idx = __find_combined_index(page_idx, order); page = page + (combined_idx - page_idx); @@ -433,7 +444,8 @@ static inline void __free_one_page(struct page *page, order++; } set_page_order(page, order); - list_add(&page->lru, &zone->free_area[order].free_list); + list_add(&page->lru, + &zone->free_area[order].free_list[migratetype]); zone->free_area[order].nr_free++; } @@ -478,7 +490,7 @@ static void free_pages_bulk(struct zone *zone, int count, struct list_head *list, int order) { spin_lock(&zone->lock); - zone->all_unreclaimable = 0; + zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); zone->pages_scanned = 0; while (count--) { struct page *page; @@ -495,7 +507,7 @@ static void free_pages_bulk(struct zone *zone, int count, static void free_one_page(struct zone *zone, struct page *page, int order) { spin_lock(&zone->lock); - zone->all_unreclaimable = 0; + zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); zone->pages_scanned = 0; 
__free_one_page(page, zone, order); spin_unlock(&zone->lock); @@ -567,7 +579,8 @@ void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order) * -- wli */ static inline void expand(struct zone *zone, struct page *page, - int low, int high, struct free_area *area) + int low, int high, struct free_area *area, + int migratetype) { unsigned long size = 1 << high; @@ -576,7 +589,7 @@ static inline void expand(struct zone *zone, struct page *page, high--; size >>= 1; VM_BUG_ON(bad_range(zone, &page[size])); - list_add(&page[size].lru, &area->free_list); + list_add(&page[size].lru, &area->free_list[migratetype]); area->nr_free++; set_page_order(&page[size], high); } @@ -628,49 +641,235 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) return 0; } -/* - * Do the hard work of removing an element from the buddy allocator. - * Call me with the zone->lock already held. +/* + * Go through the free lists for the given migratetype and remove + * the smallest available page from the freelists */ -static struct page *__rmqueue(struct zone *zone, unsigned int order) +static struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, + int migratetype) { - struct free_area * area; unsigned int current_order; + struct free_area * area; struct page *page; + /* Find a page of the appropriate size in the preferred list */ for (current_order = order; current_order < MAX_ORDER; ++current_order) { - area = zone->free_area + current_order; - if (list_empty(&area->free_list)) + area = &(zone->free_area[current_order]); + if (list_empty(&area->free_list[migratetype])) continue; - page = list_entry(area->free_list.next, struct page, lru); + page = list_entry(area->free_list[migratetype].next, + struct page, lru); list_del(&page->lru); rmv_page_order(page); area->nr_free--; __mod_zone_page_state(zone, NR_FREE_PAGES, - (1UL << order)); - expand(zone, page, order, current_order, area); + expand(zone, page, order, current_order, area, 
migratetype); return page; } return NULL; } + +/* + * This array describes the order lists are fallen back to when + * the free lists for the desirable migrate type are depleted + */ +static int fallbacks[MIGRATE_TYPES][MIGRATE_TYPES-1] = { + [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, + [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, + [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, + [MIGRATE_RESERVE] = { MIGRATE_RESERVE, MIGRATE_RESERVE, MIGRATE_RESERVE }, /* Never used */ +}; + +/* + * Move the free pages in a range to the free lists of the requested type. + * Note that start_page and end_pages are not aligned on a pageblock + * boundary. If alignment is required, use move_freepages_block() + */ +int move_freepages(struct zone *zone, + struct page *start_page, struct page *end_page, + int migratetype) +{ + struct page *page; + unsigned long order; + int pages_moved = 0; + +#ifndef CONFIG_HOLES_IN_ZONE + /* + * page_zone is not safe to call in this context when + * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant + * anyway as we check zone boundaries in move_freepages_block(). 
+ * Remove at a later date when no bug reports exist related to + * grouping pages by mobility + */ + BUG_ON(page_zone(start_page) != page_zone(end_page)); +#endif + + for (page = start_page; page <= end_page;) { + if (!pfn_valid_within(page_to_pfn(page))) { + page++; + continue; + } + + if (!PageBuddy(page)) { + page++; + continue; + } + + order = page_order(page); + list_del(&page->lru); + list_add(&page->lru, + &zone->free_area[order].free_list[migratetype]); + page += 1 << order; + pages_moved += 1 << order; + } + + return pages_moved; +} + +int move_freepages_block(struct zone *zone, struct page *page, int migratetype) +{ + unsigned long start_pfn, end_pfn; + struct page *start_page, *end_page; + + start_pfn = page_to_pfn(page); + start_pfn = start_pfn & ~(pageblock_nr_pages-1); + start_page = pfn_to_page(start_pfn); + end_page = start_page + pageblock_nr_pages - 1; + end_pfn = start_pfn + pageblock_nr_pages - 1; + + /* Do not cross zone boundaries */ + if (start_pfn < zone->zone_start_pfn) + start_page = page; + if (end_pfn >= zone->zone_start_pfn + zone->spanned_pages) + return 0; + + return move_freepages(zone, start_page, end_page, migratetype); +} + +/* Return the page with the lowest PFN in the list */ +static struct page *min_page(struct list_head *list) +{ + unsigned long min_pfn = -1UL; + struct page *min_page = NULL, *page;; + + list_for_each_entry(page, list, lru) { + unsigned long pfn = page_to_pfn(page); + if (pfn < min_pfn) { + min_pfn = pfn; + min_page = page; + } + } + + return min_page; +} + +/* Remove an element from the buddy allocator from the fallback list */ +static struct page *__rmqueue_fallback(struct zone *zone, int order, + int start_migratetype) +{ + struct free_area * area; + int current_order; + struct page *page; + int migratetype, i; + + /* Find the largest possible block of pages in the other list */ + for (current_order = MAX_ORDER-1; current_order >= order; + --current_order) { + for (i = 0; i < MIGRATE_TYPES - 1; i++) { + 
migratetype = fallbacks[start_migratetype][i]; + + /* MIGRATE_RESERVE handled later if necessary */ + if (migratetype == MIGRATE_RESERVE) + continue; + + area = &(zone->free_area[current_order]); + if (list_empty(&area->free_list[migratetype])) + continue; + + /* Bias kernel allocations towards low pfns */ + page = list_entry(area->free_list[migratetype].next, + struct page, lru); + if (unlikely(start_migratetype != MIGRATE_MOVABLE)) + page = min_page(&area->free_list[migratetype]); + area->nr_free--; + + /* + * If breaking a large block of pages, move all free + * pages to the preferred allocation list. If falling + * back for a reclaimable kernel allocation, be more + * agressive about taking ownership of free pages + */ + if (unlikely(current_order >= (pageblock_order >> 1)) || + start_migratetype == MIGRATE_RECLAIMABLE) { + unsigned long pages; + pages = move_freepages_block(zone, page, + start_migratetype); + + /* Claim the whole block if over half of it is free */ + if (pages >= (1 << (pageblock_order-1))) + set_pageblock_migratetype(page, + start_migratetype); + + migratetype = start_migratetype; + } + + /* Remove the page from the freelists */ + list_del(&page->lru); + rmv_page_order(page); + __mod_zone_page_state(zone, NR_FREE_PAGES, + -(1UL << order)); + + if (current_order == pageblock_order) + set_pageblock_migratetype(page, + start_migratetype); + + expand(zone, page, order, current_order, area, migratetype); + return page; + } + } + + /* Use MIGRATE_RESERVE rather than fail an allocation */ + return __rmqueue_smallest(zone, order, MIGRATE_RESERVE); +} + +/* + * Do the hard work of removing an element from the buddy allocator. + * Call me with the zone->lock already held. 
+ */ +static struct page *__rmqueue(struct zone *zone, unsigned int order, + int migratetype) +{ + struct page *page; + + page = __rmqueue_smallest(zone, order, migratetype); + + if (unlikely(!page)) + page = __rmqueue_fallback(zone, order, migratetype); + + return page; +} + /* * Obtain a specified number of elements from the buddy allocator, all under * a single hold of the lock, for efficiency. Add them to the supplied list. * Returns the number of new pages which were placed at *list. */ static int rmqueue_bulk(struct zone *zone, unsigned int order, - unsigned long count, struct list_head *list) + unsigned long count, struct list_head *list, + int migratetype) { int i; spin_lock(&zone->lock); for (i = 0; i < count; ++i) { - struct page *page = __rmqueue(zone, order); + struct page *page = __rmqueue(zone, order, migratetype); if (unlikely(page == NULL)) break; - list_add_tail(&page->lru, list); + list_add(&page->lru, list); + set_page_private(page, migratetype); } spin_unlock(&zone->lock); return i; @@ -732,7 +931,7 @@ void mark_free_pages(struct zone *zone) { unsigned long pfn, max_zone_pfn; unsigned long flags; - int order; + int order, t; struct list_head *curr; if (!zone->spanned_pages) @@ -749,17 +948,18 @@ void mark_free_pages(struct zone *zone) swsusp_unset_page_free(page); } - for (order = MAX_ORDER - 1; order >= 0; --order) - list_for_each(curr, &zone->free_area[order].free_list) { + for_each_migratetype_order(order, t) { + list_for_each(curr, &zone->free_area[order].free_list[t]) { unsigned long i; pfn = page_to_pfn(list_entry(curr, struct page, lru)); for (i = 0; i < (1UL << order); i++) swsusp_set_page_free(pfn_to_page(pfn + i)); } - + } spin_unlock_irqrestore(&zone->lock, flags); } +#endif /* CONFIG_PM */ /* * Spill all of this CPU's per-cpu pages back into the buddy allocator. 
@@ -772,7 +972,25 @@ void drain_local_pages(void) __drain_pages(smp_processor_id()); local_irq_restore(flags); } -#endif /* CONFIG_HIBERNATION */ + +void smp_drain_local_pages(void *arg) +{ + drain_local_pages(); +} + +/* + * Spill all the per-cpu pages from all CPUs back into the buddy allocator + */ +void drain_all_local_pages(void) +{ + unsigned long flags; + + local_irq_save(flags); + __drain_pages(smp_processor_id()); + local_irq_restore(flags); + + smp_call_function(smp_drain_local_pages, NULL, 0, 1); +} /* * Free a 0-order page @@ -797,6 +1015,7 @@ static void fastcall free_hot_cold_page(struct page *page, int cold) local_irq_save(flags); __count_vm_event(PGFREE); list_add(&page->lru, &pcp->list); + set_page_private(page, get_pageblock_migratetype(page)); pcp->count++; if (pcp->count >= pcp->high) { free_pages_bulk(zone, pcp->batch, &pcp->list, 0); @@ -846,6 +1065,7 @@ static struct page *buffered_rmqueue(struct zonelist *zonelist, struct page *page; int cold = !!(gfp_flags & __GFP_COLD); int cpu; + int migratetype = allocflags_to_migratetype(gfp_flags); again: cpu = get_cpu(); @@ -856,16 +1076,28 @@ again: local_irq_save(flags); if (!pcp->count) { pcp->count = rmqueue_bulk(zone, 0, - pcp->batch, &pcp->list); + pcp->batch, &pcp->list, migratetype); if (unlikely(!pcp->count)) goto failed; } - page = list_entry(pcp->list.next, struct page, lru); + + /* Find a page of the appropriate migrate type */ + list_for_each_entry(page, &pcp->list, lru) + if (page_private(page) == migratetype) + break; + + /* Allocate more to the pcp list if necessary */ + if (unlikely(&page->lru == &pcp->list)) { + pcp->count += rmqueue_bulk(zone, 0, + pcp->batch, &pcp->list, migratetype); + page = list_entry(pcp->list.next, struct page, lru); + } + list_del(&page->lru); pcp->count--; } else { spin_lock_irqsave(&zone->lock, flags); - page = __rmqueue(zone, order); + page = __rmqueue(zone, order, migratetype); spin_unlock(&zone->lock); if (!page) goto failed; @@ -1032,7 +1264,7 @@ int 
zone_watermark_ok(struct zone *z, int order, unsigned long mark, * * If the zonelist cache is present in the passed in zonelist, then * returns a pointer to the allowed node mask (either the current - * tasks mems_allowed, or node_online_map.) + * tasks mems_allowed, or node_states[N_HIGH_MEMORY].) * * If the zonelist cache is not available for this zonelist, does * nothing and returns NULL. @@ -1061,7 +1293,7 @@ static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? &cpuset_current_mems_allowed : - &node_online_map; + &node_states[N_HIGH_MEMORY]; return allowednodes; } @@ -1183,9 +1415,6 @@ zonelist_scan: !zlc_zone_worth_trying(zonelist, z, allowednodes)) continue; zone = *z; - if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) && - zone->zone_pgdat != zonelist->zones[0]->zone_pgdat)) - break; if ((alloc_flags & ALLOC_CPUSET) && !cpuset_zone_allowed_softwall(zone, gfp_mask)) goto try_next_zone; @@ -1254,7 +1483,10 @@ restart: z = zonelist->zones; /* the list of zones suitable for gfp_mask */ if (unlikely(*z == NULL)) { - /* Should this ever happen?? 
*/ + /* + * Happens if we have an empty zonelist as a result of + * GFP_THISNODE being used on a memoryless node + */ return NULL; } @@ -1346,12 +1578,20 @@ nofail_alloc: cond_resched(); + if (order != 0) + drain_all_local_pages(); + if (likely(did_some_progress)) { page = get_page_from_freelist(gfp_mask, order, zonelist, alloc_flags); if (page) goto got_pg; } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { + if (!try_set_zone_oom(zonelist)) { + schedule_timeout_uninterruptible(1); + goto restart; + } + /* * Go through the zonelist yet one more time, keep * very high watermark here, this is only to catch @@ -1360,14 +1600,19 @@ nofail_alloc: */ page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, zonelist, ALLOC_WMARK_HIGH|ALLOC_CPUSET); - if (page) + if (page) { + clear_zonelist_oom(zonelist); goto got_pg; + } /* The OOM killer will not help higher order allocs so fail */ - if (order > PAGE_ALLOC_COSTLY_ORDER) + if (order > PAGE_ALLOC_COSTLY_ORDER) { + clear_zonelist_oom(zonelist); goto nopage; + } out_of_memory(zonelist, gfp_mask, order); + clear_zonelist_oom(zonelist); goto restart; } @@ -1616,7 +1861,7 @@ void show_free_areas(void) K(zone_page_state(zone, NR_INACTIVE)), K(zone->present_pages), zone->pages_scanned, - (zone->all_unreclaimable ? "yes" : "no") + (zone_is_all_unreclaimable(zone) ? 
"yes" : "no") ); printk("lowmem_reserve[]:"); for (i = 0; i < MAX_NR_ZONES; i++) @@ -1794,7 +2039,7 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask) return node; } - for_each_online_node(n) { + for_each_node_state(n, N_HIGH_MEMORY) { cpumask_t tmp; /* Don't want a node to appear more than once */ @@ -1850,6 +2095,22 @@ static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) } /* + * Build gfp_thisnode zonelists + */ +static void build_thisnode_zonelists(pg_data_t *pgdat) +{ + enum zone_type i; + int j; + struct zonelist *zonelist; + + for (i = 0; i < MAX_NR_ZONES; i++) { + zonelist = pgdat->node_zonelists + MAX_NR_ZONES + i; + j = build_zonelists_node(pgdat, zonelist, 0, i); + zonelist->zones[j] = NULL; + } +} + +/* * Build zonelists ordered by zone and nodes within zones. * This results in conserving DMA zone[s] until all Normal memory is * exhausted, but results in overflowing to remote node while memory @@ -1915,7 +2176,8 @@ static int default_zonelist_order(void) * If there is a node whose DMA/DMA32 memory is very big area on * local memory, NODE_ORDER may be suitable. */ - average_size = total_size / (num_online_nodes() + 1); + average_size = total_size / + (nodes_weight(node_states[N_HIGH_MEMORY]) + 1); for_each_online_node(nid) { low_kmem_size = 0; total_size = 0; @@ -1953,7 +2215,7 @@ static void build_zonelists(pg_data_t *pgdat) int order = current_zonelist_order; /* initialize zonelists */ - for (i = 0; i < MAX_NR_ZONES; i++) { + for (i = 0; i < MAX_ZONELISTS; i++) { zonelist = pgdat->node_zonelists + i; zonelist->zones[0] = NULL; } @@ -1998,6 +2260,8 @@ static void build_zonelists(pg_data_t *pgdat) /* calculate node order -- i.e., DMA last! 
*/ build_zonelists_in_zone_order(pgdat, j); } + + build_thisnode_zonelists(pgdat); } /* Construct the zonelist performance cache - see further mmzone.h */ @@ -2078,8 +2342,10 @@ static int __build_all_zonelists(void *dummy) int nid; for_each_online_node(nid) { - build_zonelists(NODE_DATA(nid)); - build_zonelist_cache(NODE_DATA(nid)); + pg_data_t *pgdat = NODE_DATA(nid); + + build_zonelists(pgdat); + build_zonelist_cache(pgdat); } return 0; } @@ -2098,9 +2364,23 @@ void build_all_zonelists(void) /* cpuset refresh routine should be here */ } vm_total_pages = nr_free_pagecache_pages(); - printk("Built %i zonelists in %s order. Total pages: %ld\n", + /* + * Disable grouping by mobility if the number of pages in the + * system is too low to allow the mechanism to work. It would be + * more accurate, but expensive to check per-zone. This check is + * made on memory-hotadd so a system can start with mobility + * disabled and enable it later + */ + if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES)) + page_group_by_mobility_disabled = 1; + else + page_group_by_mobility_disabled = 0; + + printk("Built %i zonelists in %s order, mobility grouping %s. " + "Total pages: %ld\n", num_online_nodes(), zonelist_order_name[current_zonelist_order], + page_group_by_mobility_disabled ? "off" : "on", vm_total_pages); #ifdef CONFIG_NUMA printk("Policy zone: %s\n", zone_names[policy_zone]); @@ -2176,6 +2456,61 @@ static inline unsigned long wait_table_bits(unsigned long size) #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) /* + * Mark a number of pageblocks as MIGRATE_RESERVE. The number + * of blocks reserved is based on zone->pages_min. The memory within the + * reserve will tend to store contiguous free pages. 
Setting min_free_kbytes + * higher will lead to a bigger reserve which will get freed as contiguous + * blocks as reclaim kicks in + */ +static void setup_zone_migrate_reserve(struct zone *zone) +{ + unsigned long start_pfn, pfn, end_pfn; + struct page *page; + unsigned long reserve, block_migratetype; + + /* Get the start pfn, end pfn and the number of blocks to reserve */ + start_pfn = zone->zone_start_pfn; + end_pfn = start_pfn + zone->spanned_pages; + reserve = roundup(zone->pages_min, pageblock_nr_pages) >> + pageblock_order; + + for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { + if (!pfn_valid(pfn)) + continue; + page = pfn_to_page(pfn); + + /* Blocks with reserved pages will never free, skip them. */ + if (PageReserved(page)) + continue; + + block_migratetype = get_pageblock_migratetype(page); + + /* If this block is reserved, account for it */ + if (reserve > 0 && block_migratetype == MIGRATE_RESERVE) { + reserve--; + continue; + } + + /* Suitable for reserving if this block is movable */ + if (reserve > 0 && block_migratetype == MIGRATE_MOVABLE) { + set_pageblock_migratetype(page, MIGRATE_RESERVE); + move_freepages_block(zone, page, MIGRATE_RESERVE); + reserve--; + continue; + } + + /* + * If the reserve is met and this is a previous reserved block, + * take it back + */ + if (block_migratetype == MIGRATE_RESERVE) { + set_pageblock_migratetype(page, MIGRATE_MOVABLE); + move_freepages_block(zone, page, MIGRATE_MOVABLE); + } + } +} + +/* * Initially all pages are reserved - free ones are freed * up by free_all_bootmem() once the early boot process is * done. Non-atomic initialization, single-pass. @@ -2204,6 +2539,19 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, init_page_count(page); reset_page_mapcount(page); SetPageReserved(page); + + /* + * Mark the block movable so that blocks are reserved for + * movable at startup. 
This will force kernel allocations + * to reserve their blocks rather than leaking throughout + * the address space during boot when many long-lived + * kernel allocations are made. Later some blocks near + * the start are marked MIGRATE_RESERVE by + * setup_zone_migrate_reserve() + */ + if ((pfn & (pageblock_nr_pages-1))) + set_pageblock_migratetype(page, MIGRATE_MOVABLE); + INIT_LIST_HEAD(&page->lru); #ifdef WANT_PAGE_VIRTUAL /* The shift won't overflow because ZONE_NORMAL is below 4G. */ @@ -2216,9 +2564,9 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, static void __meminit zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone, unsigned long size) { - int order; - for (order = 0; order < MAX_ORDER ; order++) { - INIT_LIST_HEAD(&zone->free_area[order].free_list); + int order, t; + for_each_migratetype_order(order, t) { + INIT_LIST_HEAD(&zone->free_area[order].free_list[t]); zone->free_area[order].nr_free = 0; } } @@ -2324,6 +2672,9 @@ static struct per_cpu_pageset boot_pageset[NR_CPUS]; static int __cpuinit process_zones(int cpu) { struct zone *zone, *dzone; + int node = cpu_to_node(cpu); + + node_set_state(node, N_CPU); /* this node has a cpu */ for_each_zone(zone) { @@ -2331,7 +2682,7 @@ static int __cpuinit process_zones(int cpu) continue; zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), - GFP_KERNEL, cpu_to_node(cpu)); + GFP_KERNEL, node); if (!zone_pcp(zone, cpu)) goto bad; @@ -2444,7 +2795,7 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) * To use this new node's memory, further consideration will be * necessary. 
*/ - zone->wait_table = (wait_queue_head_t *)vmalloc(alloc_size); + zone->wait_table = vmalloc(alloc_size); } if (!zone->wait_table) return -ENOMEM; @@ -2680,10 +3031,8 @@ void __meminit get_pfn_range_for_nid(unsigned int nid, *end_pfn = max(*end_pfn, early_node_map[i].end_pfn); } - if (*start_pfn == -1UL) { - printk(KERN_WARNING "Node %u active with no memory\n", nid); + if (*start_pfn == -1UL) *start_pfn = 0; - } /* Push the node boundaries out if requested */ account_node_boundary(nid, start_pfn, end_pfn); @@ -2901,6 +3250,62 @@ static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, realtotalpages); } +#ifndef CONFIG_SPARSEMEM +/* + * Calculate the size of the zone->blockflags rounded to an unsigned long + * Start by making sure zonesize is a multiple of pageblock_order by rounding + * up. Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, finally + * round what is now in bits to nearest long in bits, then return it in + * bytes. + */ +static unsigned long __init usemap_size(unsigned long zonesize) +{ + unsigned long usemapsize; + + usemapsize = roundup(zonesize, pageblock_nr_pages); + usemapsize = usemapsize >> pageblock_order; + usemapsize *= NR_PAGEBLOCK_BITS; + usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long)); + + return usemapsize / 8; +} + +static void __init setup_usemap(struct pglist_data *pgdat, + struct zone *zone, unsigned long zonesize) +{ + unsigned long usemapsize = usemap_size(zonesize); + zone->pageblock_flags = NULL; + if (usemapsize) { + zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize); + memset(zone->pageblock_flags, 0, usemapsize); + } +} +#else +static void inline setup_usemap(struct pglist_data *pgdat, + struct zone *zone, unsigned long zonesize) {} +#endif /* CONFIG_SPARSEMEM */ + +#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE +/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ +static inline void __init set_pageblock_order(unsigned int order) +{ + /* Check that 
pageblock_nr_pages has not already been setup */ + if (pageblock_order) + return; + + /* + * Assume the largest contiguous order of interest is a huge page. + * This value may be variable depending on boot parameters on IA64 + */ + pageblock_order = order; +} +#else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ + +/* Defined this way to avoid accidently referencing HUGETLB_PAGE_ORDER */ +#define set_pageblock_order(x) do {} while (0) + +#endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ + /* * Set up the zone data structures: * - mark all pages reserved @@ -2977,10 +3382,12 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat, zone->nr_scan_active = 0; zone->nr_scan_inactive = 0; zap_zone_vm_stats(zone); - atomic_set(&zone->reclaim_in_progress, 0); + zone->flags = 0; if (!size) continue; + set_pageblock_order(HUGETLB_PAGE_ORDER); + setup_usemap(pgdat, zone, size); ret = init_currently_empty_zone(zone, zone_start_pfn, size, MEMMAP_EARLY); BUG_ON(ret); @@ -3234,16 +3641,24 @@ unsigned long __init find_max_pfn_with_active_regions(void) return max_pfn; } -unsigned long __init early_calculate_totalpages(void) +/* + * early_calculate_totalpages() + * Sum pages in active regions for movable zone. + * Populate N_HIGH_MEMORY for calculating usable_nodes. 
+ */ +static unsigned long __init early_calculate_totalpages(void) { int i; unsigned long totalpages = 0; - for (i = 0; i < nr_nodemap_entries; i++) - totalpages += early_node_map[i].end_pfn - + for (i = 0; i < nr_nodemap_entries; i++) { + unsigned long pages = early_node_map[i].end_pfn - early_node_map[i].start_pfn; - - return totalpages; + totalpages += pages; + if (pages) + node_set_state(early_node_map[i].nid, N_HIGH_MEMORY); + } + return totalpages; } /* @@ -3257,7 +3672,8 @@ void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) int i, nid; unsigned long usable_startpfn; unsigned long kernelcore_node, kernelcore_remaining; - int usable_nodes = num_online_nodes(); + unsigned long totalpages = early_calculate_totalpages(); + int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); /* * If movablecore was specified, calculate what size of @@ -3268,7 +3684,6 @@ void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) * what movablecore would have allowed. */ if (required_movablecore) { - unsigned long totalpages = early_calculate_totalpages(); unsigned long corepages; /* @@ -3293,7 +3708,7 @@ void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) restart: /* Spread kernelcore memory as evenly as possible throughout nodes */ kernelcore_node = required_kernelcore / usable_nodes; - for_each_online_node(nid) { + for_each_node_state(nid, N_HIGH_MEMORY) { /* * Recalculate kernelcore_node if the division per node * now exceeds what is necessary to satisfy the requested @@ -3385,6 +3800,20 @@ restart: roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); } +/* Any regular memory on that node ? 
*/ +static void check_for_regular_memory(pg_data_t *pgdat) +{ +#ifdef CONFIG_HIGHMEM + enum zone_type zone_type; + + for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) { + struct zone *zone = &pgdat->node_zones[zone_type]; + if (zone->present_pages) + node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY); + } +#endif +} + /** * free_area_init_nodes - Initialise all pg_data_t and zone data * @max_zone_pfn: an array of max PFNs for each zone @@ -3459,6 +3888,11 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) pg_data_t *pgdat = NODE_DATA(nid); free_area_init_node(nid, pgdat, NULL, find_min_pfn_for_node(nid), NULL); + + /* Any memory on that node */ + if (pgdat->node_present_pages) + node_set_state(nid, N_HIGH_MEMORY); + check_for_regular_memory(pgdat); } } @@ -3673,6 +4107,7 @@ void setup_per_zone_pages_min(void) zone->pages_low = zone->pages_min + (tmp >> 2); zone->pages_high = zone->pages_min + (tmp >> 1); + setup_zone_migrate_reserve(zone); spin_unlock_irqrestore(&zone->lru_lock, flags); } @@ -3934,4 +4369,169 @@ EXPORT_SYMBOL(pfn_to_page); EXPORT_SYMBOL(page_to_pfn); #endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */ +/* Return a pointer to the bitmap storing bits affecting a block of pages */ +static inline unsigned long *get_pageblock_bitmap(struct zone *zone, + unsigned long pfn) +{ +#ifdef CONFIG_SPARSEMEM + return __pfn_to_section(pfn)->pageblock_flags; +#else + return zone->pageblock_flags; +#endif /* CONFIG_SPARSEMEM */ +} + +static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn) +{ +#ifdef CONFIG_SPARSEMEM + pfn &= (PAGES_PER_SECTION-1); + return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; +#else + pfn = pfn - zone->zone_start_pfn; + return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; +#endif /* CONFIG_SPARSEMEM */ +} + +/** + * get_pageblock_flags_group - Return the requested group of flags for the pageblock_nr_pages block of pages + * @page: The page within the block of interest + * @start_bitidx: The first bit of 
interest to retrieve + * @end_bitidx: The last bit of interest + * returns pageblock_bits flags + */ +unsigned long get_pageblock_flags_group(struct page *page, + int start_bitidx, int end_bitidx) +{ + struct zone *zone; + unsigned long *bitmap; + unsigned long pfn, bitidx; + unsigned long flags = 0; + unsigned long value = 1; + + zone = page_zone(page); + pfn = page_to_pfn(page); + bitmap = get_pageblock_bitmap(zone, pfn); + bitidx = pfn_to_bitidx(zone, pfn); + + for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) + if (test_bit(bitidx + start_bitidx, bitmap)) + flags |= value; + + return flags; +} + +/** + * set_pageblock_flags_group - Set the requested group of flags for a pageblock_nr_pages block of pages + * @page: The page within the block of interest + * @start_bitidx: The first bit of interest + * @end_bitidx: The last bit of interest + * @flags: The flags to set + */ +void set_pageblock_flags_group(struct page *page, unsigned long flags, + int start_bitidx, int end_bitidx) +{ + struct zone *zone; + unsigned long *bitmap; + unsigned long pfn, bitidx; + unsigned long value = 1; + + zone = page_zone(page); + pfn = page_to_pfn(page); + bitmap = get_pageblock_bitmap(zone, pfn); + bitidx = pfn_to_bitidx(zone, pfn); + + for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) + if (flags & value) + __set_bit(bitidx + start_bitidx, bitmap); + else + __clear_bit(bitidx + start_bitidx, bitmap); +} + +/* + * This is designed as sub function...plz see page_isolation.c also. + * set/clear page block's type to be ISOLATE. + * page allocater never alloc memory from ISOLATE block. + */ + +int set_migratetype_isolate(struct page *page) +{ + struct zone *zone; + unsigned long flags; + int ret = -EBUSY; + + zone = page_zone(page); + spin_lock_irqsave(&zone->lock, flags); + /* + * In future, more migrate types will be able to be isolation target. 
+ */ + if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE) + goto out; + set_pageblock_migratetype(page, MIGRATE_ISOLATE); + move_freepages_block(zone, page, MIGRATE_ISOLATE); + ret = 0; +out: + spin_unlock_irqrestore(&zone->lock, flags); + if (!ret) + drain_all_local_pages(); + return ret; +} +void unset_migratetype_isolate(struct page *page) +{ + struct zone *zone; + unsigned long flags; + zone = page_zone(page); + spin_lock_irqsave(&zone->lock, flags); + if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) + goto out; + set_pageblock_migratetype(page, MIGRATE_MOVABLE); + move_freepages_block(zone, page, MIGRATE_MOVABLE); +out: + spin_unlock_irqrestore(&zone->lock, flags); +} + +#ifdef CONFIG_MEMORY_HOTREMOVE +/* + * All pages in the range must be isolated before calling this. + */ +void +__offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) +{ + struct page *page; + struct zone *zone; + int order, i; + unsigned long pfn; + unsigned long flags; + /* find the first valid pfn */ + for (pfn = start_pfn; pfn < end_pfn; pfn++) + if (pfn_valid(pfn)) + break; + if (pfn == end_pfn) + return; + zone = page_zone(pfn_to_page(pfn)); + spin_lock_irqsave(&zone->lock, flags); + pfn = start_pfn; + while (pfn < end_pfn) { + if (!pfn_valid(pfn)) { + pfn++; + continue; + } + page = pfn_to_page(pfn); + BUG_ON(page_count(page)); + BUG_ON(!PageBuddy(page)); + order = page_order(page); +#ifdef CONFIG_DEBUG_VM + printk(KERN_INFO "remove from free list %lx %d %lx\n", + pfn, 1 << order, end_pfn); +#endif + list_del(&page->lru); + rmv_page_order(page); + zone->free_area[order].nr_free--; + __mod_zone_page_state(zone, NR_FREE_PAGES, + - (1UL << order)); + for (i = 0; i < (1 << order); i++) + SetPageReserved((page+i)); + pfn += (1 << order); + } + spin_unlock_irqrestore(&zone->lock, flags); +} +#endif diff --git a/mm/page_isolation.c b/mm/page_isolation.c new file mode 100644 index 00000000000..8f92a29695c --- /dev/null +++ b/mm/page_isolation.c @@ -0,0 +1,138 @@ 
+/* + * linux/mm/page_isolation.c + */ + +#include <stddef.h> +#include <linux/mm.h> +#include <linux/page-isolation.h> +#include <linux/pageblock-flags.h> +#include "internal.h" + +static inline struct page * +__first_valid_page(unsigned long pfn, unsigned long nr_pages) +{ + int i; + for (i = 0; i < nr_pages; i++) + if (pfn_valid_within(pfn + i)) + break; + if (unlikely(i == nr_pages)) + return NULL; + return pfn_to_page(pfn + i); +} + +/* + * start_isolate_page_range() -- make page-allocation-type of range of pages + * to be MIGRATE_ISOLATE. + * @start_pfn: The lower PFN of the range to be isolated. + * @end_pfn: The upper PFN of the range to be isolated. + * + * Making page-allocation-type to be MIGRATE_ISOLATE means free pages in + * the range will never be allocated. Any free pages and pages freed in the + * future will not be allocated again. + * + * start_pfn/end_pfn must be aligned to pageblock_order. + * Returns 0 on success and -EBUSY if any part of range cannot be isolated. + */ +int +start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn) +{ + unsigned long pfn; + unsigned long undo_pfn; + struct page *page; + + BUG_ON((start_pfn) & (pageblock_nr_pages - 1)); + BUG_ON((end_pfn) & (pageblock_nr_pages - 1)); + + for (pfn = start_pfn; + pfn < end_pfn; + pfn += pageblock_nr_pages) { + page = __first_valid_page(pfn, pageblock_nr_pages); + if (page && set_migratetype_isolate(page)) { + undo_pfn = pfn; + goto undo; + } + } + return 0; +undo: + for (pfn = start_pfn; + pfn <= undo_pfn; + pfn += pageblock_nr_pages) + unset_migratetype_isolate(pfn_to_page(pfn)); + + return -EBUSY; +} + +/* + * Make isolated pages available again. 
+ */ +int +undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn) +{ + unsigned long pfn; + struct page *page; + BUG_ON((start_pfn) & (pageblock_nr_pages - 1)); + BUG_ON((end_pfn) & (pageblock_nr_pages - 1)); + for (pfn = start_pfn; + pfn < end_pfn; + pfn += pageblock_nr_pages) { + page = __first_valid_page(pfn, pageblock_nr_pages); + if (!page || get_pageblock_flags(page) != MIGRATE_ISOLATE) + continue; + unset_migratetype_isolate(page); + } + return 0; +} +/* + * Test all pages in the range is free(means isolated) or not. + * all pages in [start_pfn...end_pfn) must be in the same zone. + * zone->lock must be held before call this. + * + * Returns 0 if all pages in the range is isolated. + */ +static int +__test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn) +{ + struct page *page; + + while (pfn < end_pfn) { + if (!pfn_valid_within(pfn)) { + pfn++; + continue; + } + page = pfn_to_page(pfn); + if (PageBuddy(page)) + pfn += 1 << page_order(page); + else if (page_count(page) == 0 && + page_private(page) == MIGRATE_ISOLATE) + pfn += 1; + else + break; + } + if (pfn < end_pfn) + return 0; + return 1; +} + +int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) +{ + unsigned long pfn; + struct page *page; + + pfn = start_pfn; + /* + * Note: pageblock_nr_page != MAX_ORDER. Then, chunks of free page + * is not aligned to pageblock_nr_pages. + * Then we just check pagetype fist. 
+ */ + for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { + page = __first_valid_page(pfn, pageblock_nr_pages); + if (page && get_pageblock_flags(page) != MIGRATE_ISOLATE) + break; + } + if (pfn < end_pfn) + return -EBUSY; + /* Check all pages are free or Marked as ISOLATED */ + if (__test_page_isolated_in_pageblock(start_pfn, end_pfn)) + return 0; + return -EBUSY; +} diff --git a/mm/readahead.c b/mm/readahead.c index be20c9d699d..c9c50ca1ec3 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -22,16 +22,8 @@ void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) } EXPORT_SYMBOL(default_unplug_io_fn); -/* - * Convienent macros for min/max read-ahead pages. - * Note that MAX_RA_PAGES is rounded down, while MIN_RA_PAGES is rounded up. - * The latter is necessary for systems with large page size(i.e. 64k). - */ -#define MAX_RA_PAGES (VM_MAX_READAHEAD*1024 / PAGE_CACHE_SIZE) -#define MIN_RA_PAGES DIV_ROUND_UP(VM_MIN_READAHEAD*1024, PAGE_CACHE_SIZE) - struct backing_dev_info default_backing_dev_info = { - .ra_pages = MAX_RA_PAGES, + .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE, .state = 0, .capabilities = BDI_CAP_MAP_COPY, .unplug_io_fn = default_unplug_io_fn, @@ -46,7 +38,7 @@ void file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping) { ra->ra_pages = mapping->backing_dev_info->ra_pages; - ra->prev_index = -1; + ra->prev_pos = -1; } EXPORT_SYMBOL_GPL(file_ra_state_init); @@ -66,28 +58,25 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages, int (*filler)(void *, struct page *), void *data) { struct page *page; - struct pagevec lru_pvec; int ret = 0; - pagevec_init(&lru_pvec, 0); - while (!list_empty(pages)) { page = list_to_page(pages); list_del(&page->lru); - if (add_to_page_cache(page, mapping, page->index, GFP_KERNEL)) { + if (add_to_page_cache_lru(page, mapping, + page->index, GFP_KERNEL)) { page_cache_release(page); continue; } + page_cache_release(page); + ret = 
filler(data, page); - if (!pagevec_add(&lru_pvec, page)) - __pagevec_lru_add(&lru_pvec); - if (ret) { + if (unlikely(ret)) { put_pages_list(pages); break; } task_io_account_read(PAGE_CACHE_SIZE); } - pagevec_lru_add(&lru_pvec); return ret; } @@ -97,7 +86,6 @@ static int read_pages(struct address_space *mapping, struct file *filp, struct list_head *pages, unsigned nr_pages) { unsigned page_idx; - struct pagevec lru_pvec; int ret; if (mapping->a_ops->readpages) { @@ -107,19 +95,15 @@ static int read_pages(struct address_space *mapping, struct file *filp, goto out; } - pagevec_init(&lru_pvec, 0); for (page_idx = 0; page_idx < nr_pages; page_idx++) { struct page *page = list_to_page(pages); list_del(&page->lru); - if (!add_to_page_cache(page, mapping, + if (!add_to_page_cache_lru(page, mapping, page->index, GFP_KERNEL)) { mapping->a_ops->readpage(filp, page); - if (!pagevec_add(&lru_pvec, page)) - __pagevec_lru_add(&lru_pvec); - } else - page_cache_release(page); + } + page_cache_release(page); } - pagevec_lru_add(&lru_pvec); ret = 0; out: return ret; @@ -157,20 +141,19 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp, /* * Preallocate as many pages as we will need. */ - read_lock_irq(&mapping->tree_lock); for (page_idx = 0; page_idx < nr_to_read; page_idx++) { pgoff_t page_offset = offset + page_idx; if (page_offset > end_index) break; + rcu_read_lock(); page = radix_tree_lookup(&mapping->page_tree, page_offset); + rcu_read_unlock(); if (page) continue; - read_unlock_irq(&mapping->tree_lock); page = page_cache_alloc_cold(mapping); - read_lock_irq(&mapping->tree_lock); if (!page) break; page->index = page_offset; @@ -179,7 +162,6 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp, SetPageReadahead(page); ret++; } - read_unlock_irq(&mapping->tree_lock); /* * Now start the IO. 
We ignore I/O errors - if the page is not @@ -251,6 +233,12 @@ unsigned long max_sane_readahead(unsigned long nr) + node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2); } +static int __init readahead_init(void) +{ + return bdi_init(&default_backing_dev_info); +} +subsys_initcall(readahead_init); + /* * Submit IO for the read-ahead request in file_ra_state. */ @@ -327,7 +315,7 @@ static unsigned long get_next_ra_size(struct file_ra_state *ra, * indicator. The flag won't be set on already cached pages, to avoid the * readahead-for-nothing fuss, saving pointless page cache lookups. * - * prev_index tracks the last visited page in the _previous_ read request. + * prev_pos tracks the last visited byte in the _previous_ read request. * It should be maintained by the caller, and will be used for detecting * small random reads. Note that the readahead algorithm checks loosely * for sequential patterns. Hence interleaved reads might be served as @@ -351,11 +339,9 @@ ondemand_readahead(struct address_space *mapping, bool hit_readahead_marker, pgoff_t offset, unsigned long req_size) { - unsigned long max; /* max readahead pages */ - int sequential; - - max = ra->ra_pages; - sequential = (offset - ra->prev_index <= 1UL) || (req_size > max); + int max = ra->ra_pages; /* max readahead pages */ + pgoff_t prev_offset; + int sequential; /* * It's the expected callback offset, assume sequential access. @@ -369,6 +355,9 @@ ondemand_readahead(struct address_space *mapping, goto readit; } + prev_offset = ra->prev_pos >> PAGE_CACHE_SHIFT; + sequential = offset - prev_offset <= 1UL || req_size > max; + /* * Standalone, small read. * Read as is, and do not pollute the readahead state. @@ -379,6 +368,29 @@ ondemand_readahead(struct address_space *mapping, } /* + * Hit a marked page without valid readahead state. + * E.g. interleaved reads. + * Query the pagecache for async_size, which normally equals to + * readahead size. Ramp it up and use it as the new readahead size. 
+ */ + if (hit_readahead_marker) { + pgoff_t start; + + read_lock_irq(&mapping->tree_lock); + start = radix_tree_next_hole(&mapping->page_tree, offset, max+1); + read_unlock_irq(&mapping->tree_lock); + + if (!start || start - offset > max) + return 0; + + ra->start = start; + ra->size = start - offset; /* old async_size */ + ra->size = get_next_ra_size(ra, max); + ra->async_size = ra->size; + goto readit; + } + + /* * It may be one of * - first read on start of file * - sequential cache miss @@ -389,16 +401,6 @@ ondemand_readahead(struct address_space *mapping, ra->size = get_init_ra_size(req_size, max); ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size; - /* - * Hit on a marked page without valid readahead state. - * E.g. interleaved reads. - * Not knowing its readahead pos/size, bet on the minimal possible one. - */ - if (hit_readahead_marker) { - ra->start++; - ra->size = get_next_ra_size(ra, max); - } - readit: return ra_submit(ra, mapping, filp); } diff --git a/mm/rmap.c b/mm/rmap.c index 41ac39749ef..8990f909492 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -36,6 +36,7 @@ * mapping->tree_lock (widely used, in set_page_dirty, * in arch-dependent flush_dcache_mmap_lock, * within inode_lock in __sync_single_inode) + * zone->lock (within radix tree node alloc) */ #include <linux/mm.h> @@ -137,8 +138,7 @@ void anon_vma_unlink(struct vm_area_struct *vma) anon_vma_free(anon_vma); } -static void anon_vma_ctor(void *data, struct kmem_cache *cachep, - unsigned long flags) +static void anon_vma_ctor(struct kmem_cache *cachep, void *data) { struct anon_vma *anon_vma = data; @@ -436,7 +436,6 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma) entry = pte_wrprotect(entry); entry = pte_mkclean(entry); set_pte_at(mm, address, pte, entry); - lazy_mmu_prot_update(entry); ret = 1; } diff --git a/mm/shmem.c b/mm/shmem.c index fcd19d323f9..289dbb0a6fd 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -49,7 +49,6 @@ #include <linux/ctype.h> 
#include <linux/migrate.h> #include <linux/highmem.h> -#include <linux/backing-dev.h> #include <asm/uaccess.h> #include <asm/div64.h> @@ -96,9 +95,9 @@ static inline struct page *shmem_dir_alloc(gfp_t gfp_mask) * BLOCKS_PER_PAGE on indirect pages, assume PAGE_CACHE_SIZE: * might be reconsidered if it ever diverges from PAGE_SIZE. * - * __GFP_MOVABLE is masked out as swap vectors cannot move + * Mobility flags are masked out as swap vectors cannot move */ - return alloc_pages((gfp_mask & ~__GFP_MOVABLE) | __GFP_ZERO, + return alloc_pages((gfp_mask & ~GFP_MOVABLE_MASK) | __GFP_ZERO, PAGE_CACHE_SHIFT-PAGE_SHIFT); } @@ -972,7 +971,7 @@ static inline int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_ *nodelist++ = '\0'; if (nodelist_parse(nodelist, *policy_nodes)) goto out; - if (!nodes_subset(*policy_nodes, node_online_map)) + if (!nodes_subset(*policy_nodes, node_states[N_HIGH_MEMORY])) goto out; } if (!strcmp(value, "default")) { @@ -997,9 +996,11 @@ static inline int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_ err = 0; } else if (!strcmp(value, "interleave")) { *policy = MPOL_INTERLEAVE; - /* Default to nodes online if no nodelist */ + /* + * Default to online nodes with memory if no nodelist + */ if (!nodelist) - *policy_nodes = node_online_map; + *policy_nodes = node_states[N_HIGH_MEMORY]; err = 0; } out: @@ -1025,8 +1026,8 @@ static struct page *shmem_swapin_async(struct shared_policy *p, return page; } -struct page *shmem_swapin(struct shmem_inode_info *info, swp_entry_t entry, - unsigned long idx) +static struct page *shmem_swapin(struct shmem_inode_info *info, + swp_entry_t entry, unsigned long idx) { struct shared_policy *p = &info->policy; int i, num; @@ -1061,7 +1062,8 @@ shmem_alloc_page(gfp_t gfp, struct shmem_inode_info *info, return page; } #else -static inline int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes) +static inline int shmem_parse_mpol(char *value, int *policy, + nodemask_t 
*policy_nodes) { return 1; } @@ -1109,7 +1111,7 @@ static int shmem_getpage(struct inode *inode, unsigned long idx, * Normally, filepage is NULL on entry, and either found * uptodate immediately, or allocated and zeroed, or read * in under swappage, which is then assigned to filepage. - * But shmem_readpage and shmem_prepare_write pass in a locked + * But shmem_readpage and shmem_write_begin pass in a locked * filepage, which may be found not uptodate by other callers * too, and may need to be copied from the swappage read in. */ @@ -1327,14 +1329,14 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } #ifdef CONFIG_NUMA -int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new) +static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new) { struct inode *i = vma->vm_file->f_path.dentry->d_inode; return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new); } -struct mempolicy * -shmem_get_policy(struct vm_area_struct *vma, unsigned long addr) +static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, + unsigned long addr) { struct inode *i = vma->vm_file->f_path.dentry->d_inode; unsigned long idx; @@ -1446,7 +1448,7 @@ static const struct inode_operations shmem_symlink_inode_operations; static const struct inode_operations shmem_symlink_inline_operations; /* - * Normally tmpfs avoids the use of shmem_readpage and shmem_prepare_write; + * Normally tmpfs avoids the use of shmem_readpage and shmem_write_begin; * but providing them allows a tmpfs file to be used for splice, sendfile, and * below the loop driver, in the generic fashion that many filesystems support. 
*/ @@ -1459,10 +1461,30 @@ static int shmem_readpage(struct file *file, struct page *page) } static int -shmem_prepare_write(struct file *file, struct page *page, unsigned offset, unsigned to) +shmem_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) { - struct inode *inode = page->mapping->host; - return shmem_getpage(inode, page->index, &page, SGP_WRITE, NULL); + struct inode *inode = mapping->host; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + *pagep = NULL; + return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL); +} + +static int +shmem_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = mapping->host; + + set_page_dirty(page); + page_cache_release(page); + + if (pos+copied > inode->i_size) + i_size_write(inode, pos+copied); + + return copied; } static ssize_t @@ -2219,7 +2241,7 @@ static int shmem_fill_super(struct super_block *sb, unsigned long blocks = 0; unsigned long inodes = 0; int policy = MPOL_DEFAULT; - nodemask_t policy_nodes = node_online_map; + nodemask_t policy_nodes = node_states[N_HIGH_MEMORY]; #ifdef CONFIG_TMPFS /* @@ -2306,8 +2328,7 @@ static void shmem_destroy_inode(struct inode *inode) kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); } -static void init_once(void *foo, struct kmem_cache *cachep, - unsigned long flags) +static void init_once(struct kmem_cache *cachep, void *foo) { struct shmem_inode_info *p = (struct shmem_inode_info *) foo; @@ -2322,9 +2343,7 @@ static int init_inodecache(void) { shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", sizeof(struct shmem_inode_info), - 0, 0, init_once); - if (shmem_inode_cachep == NULL) - return -ENOMEM; + 0, SLAB_PANIC, init_once); return 0; } @@ -2338,8 +2357,8 @@ static const struct address_space_operations shmem_aops = { .set_page_dirty = __set_page_dirty_no_writeback, 
#ifdef CONFIG_TMPFS .readpage = shmem_readpage, - .prepare_write = shmem_prepare_write, - .commit_write = simple_commit_write, + .write_begin = shmem_write_begin, + .write_end = shmem_write_end, #endif .migratepage = migrate_page, }; @@ -2442,6 +2461,10 @@ static int __init init_tmpfs(void) { int error; + error = bdi_init(&shmem_backing_dev_info); + if (error) + goto out4; + error = init_inodecache(); if (error) goto out3; @@ -2466,6 +2489,8 @@ out1: out2: destroy_inodecache(); out3: + bdi_destroy(&shmem_backing_dev_info); +out4: shm_mnt = ERR_PTR(error); return error; } @@ -2518,11 +2543,8 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) d_instantiate(dentry, inode); inode->i_size = size; inode->i_nlink = 0; /* It is unlinked */ - file->f_path.mnt = mntget(shm_mnt); - file->f_path.dentry = dentry; - file->f_mapping = inode->i_mapping; - file->f_op = &shmem_file_operations; - file->f_mode = FMODE_WRITE | FMODE_READ; + init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ, + &shmem_file_operations); return file; close_file: diff --git a/mm/slab.c b/mm/slab.c index 6f6abef83a1..3ce9bc024d6 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -267,11 +267,10 @@ struct array_cache { unsigned int batchcount; unsigned int touched; spinlock_t lock; - void *entry[0]; /* + void *entry[]; /* * Must have this definition in here for the proper * alignment of array_cache. Also simplifies accessing * the entries. - * [0] is for gcc 2.95. It should really be []. 
*/ }; @@ -408,7 +407,7 @@ struct kmem_cache { unsigned int dflags; /* dynamic flags */ /* constructor func */ - void (*ctor) (void *, struct kmem_cache *, unsigned long); + void (*ctor)(struct kmem_cache *, void *); /* 5) cache creation/removal */ const char *name; @@ -1568,7 +1567,7 @@ void __init kmem_cache_init(void) /* Replace the static kmem_list3 structures for the boot cpu */ init_list(&cache_cache, &initkmem_list3[CACHE_CACHE], node); - for_each_online_node(nid) { + for_each_node_state(nid, N_NORMAL_MEMORY) { init_list(malloc_sizes[INDEX_AC].cs_cachep, &initkmem_list3[SIZE_AC + nid], nid); @@ -1643,6 +1642,8 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) #endif flags |= cachep->gfpflags; + if (cachep->flags & SLAB_RECLAIM_ACCOUNT) + flags |= __GFP_RECLAIMABLE; page = alloc_pages_node(nodeid, flags, cachep->gfporder); if (!page) @@ -1944,7 +1945,7 @@ static void __init set_up_list3s(struct kmem_cache *cachep, int index) { int node; - for_each_online_node(node) { + for_each_node_state(node, N_NORMAL_MEMORY) { cachep->nodelists[node] = &initkmem_list3[index + node]; cachep->nodelists[node]->next_reap = jiffies + REAPTIMEOUT_LIST3 + @@ -2075,7 +2076,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep) g_cpucache_up = PARTIAL_L3; } else { int node; - for_each_online_node(node) { + for_each_node_state(node, N_NORMAL_MEMORY) { cachep->nodelists[node] = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, node); @@ -2127,7 +2128,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep) struct kmem_cache * kmem_cache_create (const char *name, size_t size, size_t align, unsigned long flags, - void (*ctor)(void*, struct kmem_cache *, unsigned long)) + void (*ctor)(struct kmem_cache *, void *)) { size_t left_over, slab_size, ralign; struct kmem_cache *cachep = NULL, *pc; @@ -2634,8 +2635,7 @@ static void cache_init_objs(struct kmem_cache *cachep, * They must also be threaded. 
*/ if (cachep->ctor && !(cachep->flags & SLAB_POISON)) - cachep->ctor(objp + obj_offset(cachep), cachep, - 0); + cachep->ctor(cachep, objp + obj_offset(cachep)); if (cachep->flags & SLAB_RED_ZONE) { if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) @@ -2651,7 +2651,7 @@ static void cache_init_objs(struct kmem_cache *cachep, cachep->buffer_size / PAGE_SIZE, 0); #else if (cachep->ctor) - cachep->ctor(objp, cachep, 0); + cachep->ctor(cachep, objp); #endif slab_bufctl(slabp)[i] = i + 1; } @@ -2746,9 +2746,9 @@ static int cache_grow(struct kmem_cache *cachep, * Be lazy and only check for valid flags here, keeping it out of the * critical path in kmem_cache_alloc(). */ - BUG_ON(flags & ~(GFP_DMA | __GFP_ZERO | GFP_LEVEL_MASK)); + BUG_ON(flags & GFP_SLAB_BUG_MASK); + local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); - local_flags = (flags & GFP_LEVEL_MASK); /* Take the l3 list lock to change the colour_next on this node */ check_irq_off(); l3 = cachep->nodelists[nodeid]; @@ -2785,7 +2785,7 @@ static int cache_grow(struct kmem_cache *cachep, /* Get slab management. 
*/ slabp = alloc_slabmgmt(cachep, objp, offset, - local_flags & ~GFP_THISNODE, nodeid); + local_flags & ~GFP_CONSTRAINT_MASK, nodeid); if (!slabp) goto opps1; @@ -3076,7 +3076,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, #endif objp += obj_offset(cachep); if (cachep->ctor && cachep->flags & SLAB_POISON) - cachep->ctor(objp, cachep, 0); + cachep->ctor(cachep, objp); #if ARCH_SLAB_MINALIGN if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) { printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", @@ -3225,7 +3225,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) zonelist = &NODE_DATA(slab_node(current->mempolicy)) ->node_zonelists[gfp_zone(flags)]; - local_flags = (flags & GFP_LEVEL_MASK); + local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); retry: /* @@ -3792,7 +3792,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep) struct array_cache *new_shared; struct array_cache **new_alien = NULL; - for_each_online_node(node) { + for_each_node_state(node, N_NORMAL_MEMORY) { if (use_alien_caches) { new_alien = alloc_alien_cache(node, cachep->limit); @@ -4446,7 +4446,8 @@ const struct seq_operations slabstats_op = { */ size_t ksize(const void *objp) { - if (unlikely(ZERO_OR_NULL_PTR(objp))) + BUG_ON(!objp); + if (unlikely(objp == ZERO_SIZE_PTR)) return 0; return obj_size(virt_to_cache(objp)); diff --git a/mm/slob.c b/mm/slob.c index ec33fcdc852..5bc2ceb692e 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -360,7 +360,7 @@ static void slob_free(void *block, int size) slobidx_t units; unsigned long flags; - if (ZERO_OR_NULL_PTR(block)) + if (unlikely(ZERO_OR_NULL_PTR(block))) return; BUG_ON(!size); @@ -466,7 +466,7 @@ void kfree(const void *block) { struct slob_page *sp; - if (ZERO_OR_NULL_PTR(block)) + if (unlikely(ZERO_OR_NULL_PTR(block))) return; sp = (struct slob_page *)virt_to_page(block); @@ -484,7 +484,8 @@ size_t ksize(const void *block) { struct slob_page *sp; - if (ZERO_OR_NULL_PTR(block)) + BUG_ON(!block); + if 
(unlikely(block == ZERO_SIZE_PTR)) return 0; sp = (struct slob_page *)virt_to_page(block); @@ -498,12 +499,12 @@ struct kmem_cache { unsigned int size, align; unsigned long flags; const char *name; - void (*ctor)(void *, struct kmem_cache *, unsigned long); + void (*ctor)(struct kmem_cache *, void *); }; struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align, unsigned long flags, - void (*ctor)(void*, struct kmem_cache *, unsigned long)) + void (*ctor)(struct kmem_cache *, void *)) { struct kmem_cache *c; @@ -547,7 +548,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node) b = slob_new_page(flags, get_order(c->size), node); if (c->ctor) - c->ctor(b, c, 0); + c->ctor(c, b); return b; } diff --git a/mm/slub.c b/mm/slub.c index addb20a6d67..e29a42988c7 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -90,7 +90,7 @@ * One use of this flag is to mark slabs that are * used for allocations. Then such a slab becomes a cpu * slab. The cpu slab may be equipped with an additional - * lockless_freelist that allows lockless access to + * freelist that allows lockless access to * free objects in addition to the regular freelist * that requires the slab lock. * @@ -140,11 +140,6 @@ static inline void ClearSlabDebug(struct page *page) /* * Issues still to be resolved: * - * - The per cpu array is updated for each new slab and and is a remote - * cacheline for most nodes. This could become a bouncing cacheline given - * enough frequent updates. There are 16 pointers in a cacheline, so at - * max 16 cpus could compete for the cacheline which may be okay. - * * - Support PAGE_ALLOC_DEBUG. Should be easy to do. 
* * - Variable sizing of the per node arrays @@ -205,11 +200,6 @@ static inline void ClearSlabDebug(struct page *page) #define ARCH_SLAB_MINALIGN __alignof__(unsigned long long) #endif -/* - * The page->inuse field is 16 bit thus we have this limitation - */ -#define MAX_OBJECTS_PER_SLAB 65535 - /* Internal SLUB flags */ #define __OBJECT_POISON 0x80000000 /* Poison object */ #define __SYSFS_ADD_DEFERRED 0x40000000 /* Not yet visible via sysfs */ @@ -277,6 +267,15 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) #endif } +static inline struct kmem_cache_cpu *get_cpu_slab(struct kmem_cache *s, int cpu) +{ +#ifdef CONFIG_SMP + return s->cpu_slab[cpu]; +#else + return &s->cpu_slab; +#endif +} + static inline int check_valid_pointer(struct kmem_cache *s, struct page *page, const void *object) { @@ -729,11 +728,6 @@ static int check_slab(struct kmem_cache *s, struct page *page) slab_err(s, page, "Not a valid slab page"); return 0; } - if (page->offset * sizeof(void *) != s->offset) { - slab_err(s, page, "Corrupted offset %lu", - (unsigned long)(page->offset * sizeof(void *))); - return 0; - } if (page->inuse > s->objects) { slab_err(s, page, "inuse %u > max %u", s->name, page->inuse, s->objects); @@ -872,8 +866,6 @@ bad: slab_fix(s, "Marking all objects used"); page->inuse = s->objects; page->freelist = NULL; - /* Fix up fields that may be corrupted */ - page->offset = s->offset / sizeof(void *); } return 0; } @@ -988,7 +980,7 @@ __setup("slub_debug", setup_slub_debug); static unsigned long kmem_cache_flags(unsigned long objsize, unsigned long flags, const char *name, - void (*ctor)(void *, struct kmem_cache *, unsigned long)) + void (*ctor)(struct kmem_cache *, void *)) { /* * The page->offset field is only 16 bit wide. 
This is an offset @@ -1035,7 +1027,7 @@ static inline int check_object(struct kmem_cache *s, struct page *page, static inline void add_full(struct kmem_cache_node *n, struct page *page) {} static inline unsigned long kmem_cache_flags(unsigned long objsize, unsigned long flags, const char *name, - void (*ctor)(void *, struct kmem_cache *, unsigned long)) + void (*ctor)(struct kmem_cache *, void *)) { return flags; } @@ -1055,6 +1047,9 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) if (s->flags & SLAB_CACHE_DMA) flags |= SLUB_DMA; + if (s->flags & SLAB_RECLAIM_ACCOUNT) + flags |= __GFP_RECLAIMABLE; + if (node == -1) page = alloc_pages(flags, s->order); else @@ -1076,7 +1071,7 @@ static void setup_object(struct kmem_cache *s, struct page *page, { setup_object_debug(s, page, object); if (unlikely(s->ctor)) - s->ctor(object, s, 0); + s->ctor(s, object); } static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) @@ -1088,19 +1083,16 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) void *last; void *p; - BUG_ON(flags & ~(GFP_DMA | __GFP_ZERO | GFP_LEVEL_MASK)); - - if (flags & __GFP_WAIT) - local_irq_enable(); + BUG_ON(flags & GFP_SLAB_BUG_MASK); - page = allocate_slab(s, flags & GFP_LEVEL_MASK, node); + page = allocate_slab(s, + flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node); if (!page) goto out; n = get_node(s, page_to_nid(page)); if (n) atomic_long_inc(&n->nr_slabs); - page->offset = s->offset / sizeof(void *); page->slab = s; page->flags |= 1 << PG_slab; if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON | @@ -1123,11 +1115,8 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) set_freepointer(s, last, NULL); page->freelist = start; - page->lockless_freelist = NULL; page->inuse = 0; out: - if (flags & __GFP_WAIT) - local_irq_disable(); return page; } @@ -1149,7 +1138,6 @@ static void __free_slab(struct kmem_cache *s, struct page *page) 
NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, - pages); - page->mapping = NULL; __free_pages(page, s->order); } @@ -1383,33 +1371,34 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page) /* * Remove the cpu slab */ -static void deactivate_slab(struct kmem_cache *s, struct page *page, int cpu) +static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) { + struct page *page = c->page; /* * Merge cpu freelist into freelist. Typically we get here * because both freelists are empty. So this is unlikely * to occur. */ - while (unlikely(page->lockless_freelist)) { + while (unlikely(c->freelist)) { void **object; /* Retrieve object from cpu_freelist */ - object = page->lockless_freelist; - page->lockless_freelist = page->lockless_freelist[page->offset]; + object = c->freelist; + c->freelist = c->freelist[c->offset]; /* And put onto the regular freelist */ - object[page->offset] = page->freelist; + object[c->offset] = page->freelist; page->freelist = object; page->inuse--; } - s->cpu_slab[cpu] = NULL; + c->page = NULL; unfreeze_slab(s, page); } -static inline void flush_slab(struct kmem_cache *s, struct page *page, int cpu) +static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) { - slab_lock(page); - deactivate_slab(s, page, cpu); + slab_lock(c->page); + deactivate_slab(s, c); } /* @@ -1418,18 +1407,17 @@ static inline void flush_slab(struct kmem_cache *s, struct page *page, int cpu) */ static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) { - struct page *page = s->cpu_slab[cpu]; + struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); - if (likely(page)) - flush_slab(s, page, cpu); + if (likely(c && c->page)) + flush_slab(s, c); } static void flush_cpu_slab(void *d) { struct kmem_cache *s = d; - int cpu = smp_processor_id(); - __flush_cpu_slab(s, cpu); + __flush_cpu_slab(s, smp_processor_id()); } static void flush_all(struct kmem_cache *s) @@ -1446,6 +1434,19 @@ static void flush_all(struct kmem_cache *s) } 
/* + * Check if the objects in a per cpu structure fit numa + * locality expectations. + */ +static inline int node_match(struct kmem_cache_cpu *c, int node) +{ +#ifdef CONFIG_NUMA + if (node != -1 && c->node != node) + return 0; +#endif + return 1; +} + +/* * Slow path. The lockless freelist is empty or we need to perform * debugging duties. * @@ -1463,45 +1464,53 @@ static void flush_all(struct kmem_cache *s) * we need to allocate a new slab. This is slowest path since we may sleep. */ static void *__slab_alloc(struct kmem_cache *s, - gfp_t gfpflags, int node, void *addr, struct page *page) + gfp_t gfpflags, int node, void *addr, struct kmem_cache_cpu *c) { void **object; - int cpu = smp_processor_id(); + struct page *new; - if (!page) + if (!c->page) goto new_slab; - slab_lock(page); - if (unlikely(node != -1 && page_to_nid(page) != node)) + slab_lock(c->page); + if (unlikely(!node_match(c, node))) goto another_slab; load_freelist: - object = page->freelist; + object = c->page->freelist; if (unlikely(!object)) goto another_slab; - if (unlikely(SlabDebug(page))) + if (unlikely(SlabDebug(c->page))) goto debug; - object = page->freelist; - page->lockless_freelist = object[page->offset]; - page->inuse = s->objects; - page->freelist = NULL; - slab_unlock(page); + object = c->page->freelist; + c->freelist = object[c->offset]; + c->page->inuse = s->objects; + c->page->freelist = NULL; + c->node = page_to_nid(c->page); + slab_unlock(c->page); return object; another_slab: - deactivate_slab(s, page, cpu); + deactivate_slab(s, c); new_slab: - page = get_partial(s, gfpflags, node); - if (page) { - s->cpu_slab[cpu] = page; + new = get_partial(s, gfpflags, node); + if (new) { + c->page = new; goto load_freelist; } - page = new_slab(s, gfpflags, node); - if (page) { - cpu = smp_processor_id(); - if (s->cpu_slab[cpu]) { + if (gfpflags & __GFP_WAIT) + local_irq_enable(); + + new = new_slab(s, gfpflags, node); + + if (gfpflags & __GFP_WAIT) + local_irq_disable(); + + if (new) { + 
c = get_cpu_slab(s, smp_processor_id()); + if (c->page) { /* * Someone else populated the cpu_slab while we * enabled interrupts, or we have gotten scheduled @@ -1509,34 +1518,33 @@ new_slab: * requested node even if __GFP_THISNODE was * specified. So we need to recheck. */ - if (node == -1 || - page_to_nid(s->cpu_slab[cpu]) == node) { + if (node_match(c, node)) { /* * Current cpuslab is acceptable and we * want the current one since its cache hot */ - discard_slab(s, page); - page = s->cpu_slab[cpu]; - slab_lock(page); + discard_slab(s, new); + slab_lock(c->page); goto load_freelist; } /* New slab does not fit our expectations */ - flush_slab(s, s->cpu_slab[cpu], cpu); + flush_slab(s, c); } - slab_lock(page); - SetSlabFrozen(page); - s->cpu_slab[cpu] = page; + slab_lock(new); + SetSlabFrozen(new); + c->page = new; goto load_freelist; } return NULL; debug: - object = page->freelist; - if (!alloc_debug_processing(s, page, object, addr)) + object = c->page->freelist; + if (!alloc_debug_processing(s, c->page, object, addr)) goto another_slab; - page->inuse++; - page->freelist = object[page->offset]; - slab_unlock(page); + c->page->inuse++; + c->page->freelist = object[c->offset]; + c->node = -1; + slab_unlock(c->page); return object; } @@ -1553,25 +1561,24 @@ debug: static void __always_inline *slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, void *addr) { - struct page *page; void **object; unsigned long flags; + struct kmem_cache_cpu *c; local_irq_save(flags); - page = s->cpu_slab[smp_processor_id()]; - if (unlikely(!page || !page->lockless_freelist || - (node != -1 && page_to_nid(page) != node))) + c = get_cpu_slab(s, smp_processor_id()); + if (unlikely(!c->freelist || !node_match(c, node))) - object = __slab_alloc(s, gfpflags, node, addr, page); + object = __slab_alloc(s, gfpflags, node, addr, c); else { - object = page->lockless_freelist; - page->lockless_freelist = object[page->offset]; + object = c->freelist; + c->freelist = object[c->offset]; } 
local_irq_restore(flags); if (unlikely((gfpflags & __GFP_ZERO) && object)) - memset(object, 0, s->objsize); + memset(object, 0, c->objsize); return object; } @@ -1599,7 +1606,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_node); * handling required then we can return immediately. */ static void __slab_free(struct kmem_cache *s, struct page *page, - void *x, void *addr) + void *x, void *addr, unsigned int offset) { void *prior; void **object = (void *)x; @@ -1609,7 +1616,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, if (unlikely(SlabDebug(page))) goto debug; checks_ok: - prior = object[page->offset] = page->freelist; + prior = object[offset] = page->freelist; page->freelist = object; page->inuse--; @@ -1664,15 +1671,16 @@ static void __always_inline slab_free(struct kmem_cache *s, { void **object = (void *)x; unsigned long flags; + struct kmem_cache_cpu *c; local_irq_save(flags); debug_check_no_locks_freed(object, s->objsize); - if (likely(page == s->cpu_slab[smp_processor_id()] && - !SlabDebug(page))) { - object[page->offset] = page->lockless_freelist; - page->lockless_freelist = object; + c = get_cpu_slab(s, smp_processor_id()); + if (likely(page == c->page && c->node >= 0)) { + object[c->offset] = c->freelist; + c->freelist = object; } else - __slab_free(s, page, x, addr); + __slab_free(s, page, x, addr, c->offset); local_irq_restore(flags); } @@ -1759,14 +1767,6 @@ static inline int slab_order(int size, int min_objects, int rem; int min_order = slub_min_order; - /* - * If we would create too many object per slab then reduce - * the slab order even if it goes below slub_min_order. 
- */ - while (min_order > 0 && - (PAGE_SIZE << min_order) >= MAX_OBJECTS_PER_SLAB * size) - min_order--; - for (order = max(min_order, fls(min_objects * size - 1) - PAGE_SHIFT); order <= max_order; order++) { @@ -1781,9 +1781,6 @@ static inline int slab_order(int size, int min_objects, if (rem <= slab_size / fract_leftover) break; - /* If the next size is too high then exit now */ - if (slab_size * 2 >= MAX_OBJECTS_PER_SLAB * size) - break; } return order; @@ -1858,6 +1855,16 @@ static unsigned long calculate_alignment(unsigned long flags, return ALIGN(align, sizeof(void *)); } +static void init_kmem_cache_cpu(struct kmem_cache *s, + struct kmem_cache_cpu *c) +{ + c->page = NULL; + c->freelist = NULL; + c->node = 0; + c->offset = s->offset / sizeof(void *); + c->objsize = s->objsize; +} + static void init_kmem_cache_node(struct kmem_cache_node *n) { n->nr_partial = 0; @@ -1869,6 +1876,131 @@ static void init_kmem_cache_node(struct kmem_cache_node *n) #endif } +#ifdef CONFIG_SMP +/* + * Per cpu array for per cpu structures. + * + * The per cpu array places all kmem_cache_cpu structures from one processor + * close together meaning that it becomes possible that multiple per cpu + * structures are contained in one cacheline. This may be particularly + * beneficial for the kmalloc caches. + * + * A desktop system typically has around 60-80 slabs. With 100 here we are + * likely able to get per cpu structures for all caches from the array defined + * here. We must be able to cover all kmalloc caches during bootstrap. + * + * If the per cpu array is exhausted then fall back to kmalloc + * of individual cachelines. No sharing is possible then. 
+ */ +#define NR_KMEM_CACHE_CPU 100 + +static DEFINE_PER_CPU(struct kmem_cache_cpu, + kmem_cache_cpu)[NR_KMEM_CACHE_CPU]; + +static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free); +static cpumask_t kmem_cach_cpu_free_init_once = CPU_MASK_NONE; + +static struct kmem_cache_cpu *alloc_kmem_cache_cpu(struct kmem_cache *s, + int cpu, gfp_t flags) +{ + struct kmem_cache_cpu *c = per_cpu(kmem_cache_cpu_free, cpu); + + if (c) + per_cpu(kmem_cache_cpu_free, cpu) = + (void *)c->freelist; + else { + /* Table overflow: So allocate ourselves */ + c = kmalloc_node( + ALIGN(sizeof(struct kmem_cache_cpu), cache_line_size()), + flags, cpu_to_node(cpu)); + if (!c) + return NULL; + } + + init_kmem_cache_cpu(s, c); + return c; +} + +static void free_kmem_cache_cpu(struct kmem_cache_cpu *c, int cpu) +{ + if (c < per_cpu(kmem_cache_cpu, cpu) || + c > per_cpu(kmem_cache_cpu, cpu) + NR_KMEM_CACHE_CPU) { + kfree(c); + return; + } + c->freelist = (void *)per_cpu(kmem_cache_cpu_free, cpu); + per_cpu(kmem_cache_cpu_free, cpu) = c; +} + +static void free_kmem_cache_cpus(struct kmem_cache *s) +{ + int cpu; + + for_each_online_cpu(cpu) { + struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); + + if (c) { + s->cpu_slab[cpu] = NULL; + free_kmem_cache_cpu(c, cpu); + } + } +} + +static int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags) +{ + int cpu; + + for_each_online_cpu(cpu) { + struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); + + if (c) + continue; + + c = alloc_kmem_cache_cpu(s, cpu, flags); + if (!c) { + free_kmem_cache_cpus(s); + return 0; + } + s->cpu_slab[cpu] = c; + } + return 1; +} + +/* + * Initialize the per cpu array. 
+ */ +static void init_alloc_cpu_cpu(int cpu) +{ + int i; + + if (cpu_isset(cpu, kmem_cach_cpu_free_init_once)) + return; + + for (i = NR_KMEM_CACHE_CPU - 1; i >= 0; i--) + free_kmem_cache_cpu(&per_cpu(kmem_cache_cpu, cpu)[i], cpu); + + cpu_set(cpu, kmem_cach_cpu_free_init_once); +} + +static void __init init_alloc_cpu(void) +{ + int cpu; + + for_each_online_cpu(cpu) + init_alloc_cpu_cpu(cpu); + } + +#else +static inline void free_kmem_cache_cpus(struct kmem_cache *s) {} +static inline void init_alloc_cpu(void) {} + +static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags) +{ + init_kmem_cache_cpu(s, &s->cpu_slab); + return 1; +} +#endif + #ifdef CONFIG_NUMA /* * No kmalloc_node yet so do it by hand. We know that this is the first @@ -1876,10 +2008,11 @@ static void init_kmem_cache_node(struct kmem_cache_node *n) * possible. * * Note that this function only works on the kmalloc_node_cache - * when allocating for the kmalloc_node_cache. + * when allocating for the kmalloc_node_cache. This is used for bootstrapping + * memory on a fresh node that has no slab structures yet. */ -static struct kmem_cache_node * __init early_kmem_cache_node_alloc(gfp_t gfpflags, - int node) +static struct kmem_cache_node *early_kmem_cache_node_alloc(gfp_t gfpflags, + int node) { struct page *page; struct kmem_cache_node *n; @@ -1908,12 +2041,6 @@ static struct kmem_cache_node * __init early_kmem_cache_node_alloc(gfp_t gfpflag init_kmem_cache_node(n); atomic_long_inc(&n->nr_slabs); add_partial(n, page); - - /* - * new_slab() disables interupts. If we do not reenable interrupts here - * then bootup would continue with interrupts disabled. 
- */ - local_irq_enable(); return n; } @@ -1921,7 +2048,7 @@ static void free_kmem_cache_nodes(struct kmem_cache *s) { int node; - for_each_online_node(node) { + for_each_node_state(node, N_NORMAL_MEMORY) { struct kmem_cache_node *n = s->node[node]; if (n && n != &s->local_node) kmem_cache_free(kmalloc_caches, n); @@ -1939,7 +2066,7 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) else local_node = 0; - for_each_online_node(node) { + for_each_node_state(node, N_NORMAL_MEMORY) { struct kmem_cache_node *n; if (local_node == node) @@ -2077,21 +2204,14 @@ static int calculate_sizes(struct kmem_cache *s) */ s->objects = (PAGE_SIZE << s->order) / size; - /* - * Verify that the number of objects is within permitted limits. - * The page->inuse field is only 16 bit wide! So we cannot have - * more than 64k objects per slab. - */ - if (!s->objects || s->objects > MAX_OBJECTS_PER_SLAB) - return 0; - return 1; + return !!s->objects; } static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, const char *name, size_t size, size_t align, unsigned long flags, - void (*ctor)(void *, struct kmem_cache *, unsigned long)) + void (*ctor)(struct kmem_cache *, void *)) { memset(s, 0, kmem_size); s->name = name; @@ -2107,9 +2227,12 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, #ifdef CONFIG_NUMA s->defrag_ratio = 100; #endif + if (!init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA)) + goto error; - if (init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA)) + if (alloc_kmem_cache_cpus(s, gfpflags & ~SLUB_DMA)) return 1; + free_kmem_cache_nodes(s); error: if (flags & SLAB_PANIC) panic("Cannot create slab %s size=%lu realsize=%u " @@ -2192,7 +2315,8 @@ static inline int kmem_cache_close(struct kmem_cache *s) flush_all(s); /* Attempt to free all objects */ - for_each_online_node(node) { + free_kmem_cache_cpus(s); + for_each_node_state(node, N_NORMAL_MEMORY) { struct kmem_cache_node *n = get_node(s, node); n->nr_partial -= free_list(s, n, 
&n->partial); @@ -2227,11 +2351,11 @@ EXPORT_SYMBOL(kmem_cache_destroy); * Kmalloc subsystem *******************************************************************/ -struct kmem_cache kmalloc_caches[KMALLOC_SHIFT_HIGH + 1] __cacheline_aligned; +struct kmem_cache kmalloc_caches[PAGE_SHIFT] __cacheline_aligned; EXPORT_SYMBOL(kmalloc_caches); #ifdef CONFIG_ZONE_DMA -static struct kmem_cache *kmalloc_caches_dma[KMALLOC_SHIFT_HIGH + 1]; +static struct kmem_cache *kmalloc_caches_dma[PAGE_SHIFT]; #endif static int __init setup_slub_min_order(char *str) @@ -2397,12 +2521,8 @@ static struct kmem_cache *get_slab(size_t size, gfp_t flags) return ZERO_SIZE_PTR; index = size_index[(size - 1) / 8]; - } else { - if (size > KMALLOC_MAX_SIZE) - return NULL; - + } else index = fls(size - 1); - } #ifdef CONFIG_ZONE_DMA if (unlikely((flags & SLUB_DMA))) @@ -2414,9 +2534,15 @@ static struct kmem_cache *get_slab(size_t size, gfp_t flags) void *__kmalloc(size_t size, gfp_t flags) { - struct kmem_cache *s = get_slab(size, flags); + struct kmem_cache *s; - if (ZERO_OR_NULL_PTR(s)) + if (unlikely(size > PAGE_SIZE / 2)) + return (void *)__get_free_pages(flags | __GFP_COMP, + get_order(size)); + + s = get_slab(size, flags); + + if (unlikely(ZERO_OR_NULL_PTR(s))) return s; return slab_alloc(s, flags, -1, __builtin_return_address(0)); @@ -2426,9 +2552,15 @@ EXPORT_SYMBOL(__kmalloc); #ifdef CONFIG_NUMA void *__kmalloc_node(size_t size, gfp_t flags, int node) { - struct kmem_cache *s = get_slab(size, flags); + struct kmem_cache *s; - if (ZERO_OR_NULL_PTR(s)) + if (unlikely(size > PAGE_SIZE / 2)) + return (void *)__get_free_pages(flags | __GFP_COMP, + get_order(size)); + + s = get_slab(size, flags); + + if (unlikely(ZERO_OR_NULL_PTR(s))) return s; return slab_alloc(s, flags, node, __builtin_return_address(0)); @@ -2441,7 +2573,8 @@ size_t ksize(const void *object) struct page *page; struct kmem_cache *s; - if (ZERO_OR_NULL_PTR(object)) + BUG_ON(!object); + if (unlikely(object == ZERO_SIZE_PTR)) 
return 0; page = get_object_page(object); @@ -2473,22 +2606,17 @@ EXPORT_SYMBOL(ksize); void kfree(const void *x) { - struct kmem_cache *s; struct page *page; - /* - * This has to be an unsigned comparison. According to Linus - * some gcc version treat a pointer as a signed entity. Then - * this comparison would be true for all "negative" pointers - * (which would cover the whole upper half of the address space). - */ - if (ZERO_OR_NULL_PTR(x)) + if (unlikely(ZERO_OR_NULL_PTR(x))) return; page = virt_to_head_page(x); - s = page->slab; - - slab_free(s, page, (void *)x, __builtin_return_address(0)); + if (unlikely(!PageSlab(page))) { + put_page(page); + return; + } + slab_free(page->slab, page, (void *)x, __builtin_return_address(0)); } EXPORT_SYMBOL(kfree); @@ -2517,7 +2645,7 @@ int kmem_cache_shrink(struct kmem_cache *s) return -ENOMEM; flush_all(s); - for_each_online_node(node) { + for_each_node_state(node, N_NORMAL_MEMORY) { n = get_node(s, node); if (!n->nr_partial) @@ -2575,6 +2703,8 @@ void __init kmem_cache_init(void) int i; int caches = 0; + init_alloc_cpu(); + #ifdef CONFIG_NUMA /* * Must first have the slab cache available for the allocations of the @@ -2602,7 +2732,7 @@ void __init kmem_cache_init(void) caches++; } - for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) { + for (i = KMALLOC_SHIFT_LOW; i < PAGE_SHIFT; i++) { create_kmalloc_cache(&kmalloc_caches[i], "kmalloc", 1 << i, GFP_KERNEL); caches++; @@ -2629,16 +2759,18 @@ void __init kmem_cache_init(void) slab_state = UP; /* Provide the correct kmalloc names now that the caches are up */ - for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) + for (i = KMALLOC_SHIFT_LOW; i < PAGE_SHIFT; i++) kmalloc_caches[i]. 
name = kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i); #ifdef CONFIG_SMP register_cpu_notifier(&slab_notifier); + kmem_size = offsetof(struct kmem_cache, cpu_slab) + + nr_cpu_ids * sizeof(struct kmem_cache_cpu *); +#else + kmem_size = sizeof(struct kmem_cache); #endif - kmem_size = offsetof(struct kmem_cache, cpu_slab) + - nr_cpu_ids * sizeof(struct page *); printk(KERN_INFO "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d," " CPUs=%d, Nodes=%d\n", @@ -2669,7 +2801,7 @@ static int slab_unmergeable(struct kmem_cache *s) static struct kmem_cache *find_mergeable(size_t size, size_t align, unsigned long flags, const char *name, - void (*ctor)(void *, struct kmem_cache *, unsigned long)) + void (*ctor)(struct kmem_cache *, void *)) { struct kmem_cache *s; @@ -2710,19 +2842,28 @@ static struct kmem_cache *find_mergeable(size_t size, struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align, unsigned long flags, - void (*ctor)(void *, struct kmem_cache *, unsigned long)) + void (*ctor)(struct kmem_cache *, void *)) { struct kmem_cache *s; down_write(&slub_lock); s = find_mergeable(size, align, flags, name, ctor); if (s) { + int cpu; + s->refcount++; /* * Adjust the object sizes so that we clear * the complete object on kzalloc. 
*/ s->objsize = max(s->objsize, (int)size); + + /* + * And then we need to update the object size in the + * per cpu structures + */ + for_each_online_cpu(cpu) + get_cpu_slab(s, cpu)->objsize = s->objsize; s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); up_write(&slub_lock); if (sysfs_slab_alias(s, name)) @@ -2765,15 +2906,29 @@ static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb, unsigned long flags; switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + init_alloc_cpu_cpu(cpu); + down_read(&slub_lock); + list_for_each_entry(s, &slab_caches, list) + s->cpu_slab[cpu] = alloc_kmem_cache_cpu(s, cpu, + GFP_KERNEL); + up_read(&slub_lock); + break; + case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: case CPU_DEAD: case CPU_DEAD_FROZEN: down_read(&slub_lock); list_for_each_entry(s, &slab_caches, list) { + struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); + local_irq_save(flags); __flush_cpu_slab(s, cpu); local_irq_restore(flags); + free_kmem_cache_cpu(c, cpu); + s->cpu_slab[cpu] = NULL; } up_read(&slub_lock); break; @@ -2790,9 +2945,14 @@ static struct notifier_block __cpuinitdata slab_notifier = void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller) { - struct kmem_cache *s = get_slab(size, gfpflags); + struct kmem_cache *s; + + if (unlikely(size > PAGE_SIZE / 2)) + return (void *)__get_free_pages(gfpflags | __GFP_COMP, + get_order(size)); + s = get_slab(size, gfpflags); - if (ZERO_OR_NULL_PTR(s)) + if (unlikely(ZERO_OR_NULL_PTR(s))) return s; return slab_alloc(s, gfpflags, -1, caller); @@ -2801,9 +2961,14 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller) void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, int node, void *caller) { - struct kmem_cache *s = get_slab(size, gfpflags); + struct kmem_cache *s; + + if (unlikely(size > PAGE_SIZE / 2)) + return (void *)__get_free_pages(gfpflags | __GFP_COMP, + get_order(size)); + s = get_slab(size, gfpflags); - if 
(ZERO_OR_NULL_PTR(s)) + if (unlikely(ZERO_OR_NULL_PTR(s))) return s; return slab_alloc(s, gfpflags, node, caller); @@ -2902,7 +3067,7 @@ static long validate_slab_cache(struct kmem_cache *s) return -ENOMEM; flush_all(s); - for_each_online_node(node) { + for_each_node_state(node, N_NORMAL_MEMORY) { struct kmem_cache_node *n = get_node(s, node); count += validate_slab_node(s, n, map); @@ -3116,13 +3281,13 @@ static int list_locations(struct kmem_cache *s, char *buf, int node; if (!alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location), - GFP_KERNEL)) + GFP_TEMPORARY)) return sprintf(buf, "Out of memory\n"); /* Push back cpu slabs */ flush_all(s); - for_each_online_node(node) { + for_each_node_state(node, N_NORMAL_MEMORY) { struct kmem_cache_node *n = get_node(s, node); unsigned long flags; struct page *page; @@ -3230,11 +3395,18 @@ static unsigned long slab_objects(struct kmem_cache *s, per_cpu = nodes + nr_node_ids; for_each_possible_cpu(cpu) { - struct page *page = s->cpu_slab[cpu]; + struct page *page; int node; + struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); + if (!c) + continue; + + page = c->page; + node = c->node; + if (node < 0) + continue; if (page) { - node = page_to_nid(page); if (flags & SO_CPU) { int x = 0; @@ -3249,7 +3421,7 @@ static unsigned long slab_objects(struct kmem_cache *s, } } - for_each_online_node(node) { + for_each_node_state(node, N_NORMAL_MEMORY) { struct kmem_cache_node *n = get_node(s, node); if (flags & SO_PARTIAL) { @@ -3277,7 +3449,7 @@ static unsigned long slab_objects(struct kmem_cache *s, x = sprintf(buf, "%lu", total); #ifdef CONFIG_NUMA - for_each_online_node(node) + for_each_node_state(node, N_NORMAL_MEMORY) if (nodes[node]) x += sprintf(buf + x, " N%d=%lu", node, nodes[node]); @@ -3291,13 +3463,19 @@ static int any_slab_objects(struct kmem_cache *s) int node; int cpu; - for_each_possible_cpu(cpu) - if (s->cpu_slab[cpu]) + for_each_possible_cpu(cpu) { + struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); + + if (c && c->page) 
return 1; + } - for_each_node(node) { + for_each_online_node(node) { struct kmem_cache_node *n = get_node(s, node); + if (!n) + continue; + if (n->nr_partial || atomic_long_read(&n->nr_slabs)) return 1; } diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c new file mode 100644 index 00000000000..d3b718b0c20 --- /dev/null +++ b/mm/sparse-vmemmap.c @@ -0,0 +1,148 @@ +/* + * Virtual Memory Map support + * + * (C) 2007 sgi. Christoph Lameter <clameter@sgi.com>. + * + * Virtual memory maps allow VM primitives pfn_to_page, page_to_pfn, + * virt_to_page, page_address() to be implemented as a base offset + * calculation without memory access. + * + * However, virtual mappings need a page table and TLBs. Many Linux + * architectures already map their physical space using 1-1 mappings + * via TLBs. For those arches the virtual memory map is essentially + * for free if we use the same page size as the 1-1 mappings. In that + * case the overhead consists of a few additional pages that are + * allocated to create a view of memory for vmemmap. + * + * The architecture is expected to provide a vmemmap_populate() function + * to instantiate the mapping. + */ +#include <linux/mm.h> +#include <linux/mmzone.h> +#include <linux/bootmem.h> +#include <linux/highmem.h> +#include <linux/module.h> +#include <linux/spinlock.h> +#include <linux/vmalloc.h> +#include <asm/dma.h> +#include <asm/pgalloc.h> +#include <asm/pgtable.h> + +/* + * Allocate a block of memory to be used to back the virtual memory map + * or to back the page tables that are used to create the mapping. + * Uses the main allocators if they are available, else bootmem. + */ +void * __meminit vmemmap_alloc_block(unsigned long size, int node) +{ + /* If the main allocator is up use that, fallback to bootmem.
*/ + if (slab_is_available()) { + struct page *page = alloc_pages_node(node, + GFP_KERNEL | __GFP_ZERO, get_order(size)); + if (page) + return page_address(page); + return NULL; + } else + return __alloc_bootmem_node(NODE_DATA(node), size, size, + __pa(MAX_DMA_ADDRESS)); +} + +void __meminit vmemmap_verify(pte_t *pte, int node, + unsigned long start, unsigned long end) +{ + unsigned long pfn = pte_pfn(*pte); + int actual_node = early_pfn_to_nid(pfn); + + if (actual_node != node) + printk(KERN_WARNING "[%lx-%lx] potential offnode " + "page_structs\n", start, end - 1); +} + +pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node) +{ + pte_t *pte = pte_offset_kernel(pmd, addr); + if (pte_none(*pte)) { + pte_t entry; + void *p = vmemmap_alloc_block(PAGE_SIZE, node); + if (!p) + return 0; + entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL); + set_pte_at(&init_mm, addr, pte, entry); + } + return pte; +} + +pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node) +{ + pmd_t *pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) { + void *p = vmemmap_alloc_block(PAGE_SIZE, node); + if (!p) + return 0; + pmd_populate_kernel(&init_mm, pmd, p); + } + return pmd; +} + +pud_t * __meminit vmemmap_pud_populate(pgd_t *pgd, unsigned long addr, int node) +{ + pud_t *pud = pud_offset(pgd, addr); + if (pud_none(*pud)) { + void *p = vmemmap_alloc_block(PAGE_SIZE, node); + if (!p) + return 0; + pud_populate(&init_mm, pud, p); + } + return pud; +} + +pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node) +{ + pgd_t *pgd = pgd_offset_k(addr); + if (pgd_none(*pgd)) { + void *p = vmemmap_alloc_block(PAGE_SIZE, node); + if (!p) + return 0; + pgd_populate(&init_mm, pgd, p); + } + return pgd; +} + +int __meminit vmemmap_populate_basepages(struct page *start_page, + unsigned long size, int node) +{ + unsigned long addr = (unsigned long)start_page; + unsigned long end = (unsigned long)(start_page + size); + pgd_t *pgd; + pud_t *pud; + 
pmd_t *pmd; + pte_t *pte; + + for (; addr < end; addr += PAGE_SIZE) { + pgd = vmemmap_pgd_populate(addr, node); + if (!pgd) + return -ENOMEM; + pud = vmemmap_pud_populate(pgd, addr, node); + if (!pud) + return -ENOMEM; + pmd = vmemmap_pmd_populate(pud, addr, node); + if (!pmd) + return -ENOMEM; + pte = vmemmap_pte_populate(pmd, addr, node); + if (!pte) + return -ENOMEM; + vmemmap_verify(pte, node, addr, addr + PAGE_SIZE); + } + + return 0; +} + +struct page * __meminit sparse_mem_map_populate(unsigned long pnum, int nid) +{ + struct page *map = pfn_to_page(pnum * PAGES_PER_SECTION); + int error = vmemmap_populate(map, PAGES_PER_SECTION, nid); + if (error) + return NULL; + + return map; +} diff --git a/mm/sparse.c b/mm/sparse.c index 239f5a720d3..08fb14f5eea 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -9,6 +9,8 @@ #include <linux/spinlock.h> #include <linux/vmalloc.h> #include <asm/dma.h> +#include <asm/pgalloc.h> +#include <asm/pgtable.h> /* * Permanent SPARSEMEM data: @@ -106,7 +108,7 @@ static inline int sparse_index_init(unsigned long section_nr, int nid) /* * Although written for the SPARSEMEM_EXTREME case, this happens - * to also work for the flat array case becase + * to also work for the flat array case because * NR_SECTION_ROOTS==NR_MEM_SECTIONS. 
*/ int __section_nr(struct mem_section* ms) @@ -176,7 +178,7 @@ unsigned long __init node_memmap_size_bytes(int nid, unsigned long start_pfn, if (nid != early_pfn_to_nid(pfn)) continue; - if (pfn_valid(pfn)) + if (pfn_present(pfn)) nr_pages += PAGES_PER_SECTION; } @@ -204,13 +206,16 @@ struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pn } static int __meminit sparse_init_one_section(struct mem_section *ms, - unsigned long pnum, struct page *mem_map) + unsigned long pnum, struct page *mem_map, + unsigned long *pageblock_bitmap) { - if (!valid_section(ms)) + if (!present_section(ms)) return -EINVAL; ms->section_mem_map &= ~SECTION_MAP_MASK; - ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum); + ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum) | + SECTION_HAS_MEM_MAP; + ms->pageblock_flags = pageblock_bitmap; return 1; } @@ -221,12 +226,43 @@ void *alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size) return NULL; } -static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) +static unsigned long usemap_size(void) { - struct page *map; + unsigned long size_bytes; + size_bytes = roundup(SECTION_BLOCKFLAGS_BITS, 8) / 8; + size_bytes = roundup(size_bytes, sizeof(unsigned long)); + return size_bytes; +} + +#ifdef CONFIG_MEMORY_HOTPLUG +static unsigned long *__kmalloc_section_usemap(void) +{ + return kmalloc(usemap_size(), GFP_KERNEL); +} +#endif /* CONFIG_MEMORY_HOTPLUG */ + +static unsigned long *sparse_early_usemap_alloc(unsigned long pnum) +{ + unsigned long *usemap; struct mem_section *ms = __nr_to_section(pnum); int nid = sparse_early_nid(ms); + usemap = alloc_bootmem_node(NODE_DATA(nid), usemap_size()); + if (usemap) + return usemap; + + /* Stupid: suppress gcc warning for SPARSEMEM && !NUMA */ + nid = 0; + + printk(KERN_WARNING "%s: allocation failed\n", __FUNCTION__); + return NULL; +} + +#ifndef CONFIG_SPARSEMEM_VMEMMAP +struct page __init *sparse_mem_map_populate(unsigned long pnum, int 
nid) +{ + struct page *map; + map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION); if (map) return map; @@ -238,10 +274,22 @@ static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) map = alloc_bootmem_node(NODE_DATA(nid), sizeof(struct page) * PAGES_PER_SECTION); + return map; +} +#endif /* !CONFIG_SPARSEMEM_VMEMMAP */ + +struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) +{ + struct page *map; + struct mem_section *ms = __nr_to_section(pnum); + int nid = sparse_early_nid(ms); + + map = sparse_mem_map_populate(pnum, nid); if (map) return map; - printk(KERN_WARNING "%s: allocation failed\n", __FUNCTION__); + printk(KERN_ERR "%s: sparsemem memory map backing failed " + "some memory will not be available.\n", __FUNCTION__); ms->section_mem_map = 0; return NULL; } @@ -254,19 +302,38 @@ void __init sparse_init(void) { unsigned long pnum; struct page *map; + unsigned long *usemap; for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { - if (!valid_section_nr(pnum)) + if (!present_section_nr(pnum)) continue; map = sparse_early_mem_map_alloc(pnum); if (!map) continue; - sparse_init_one_section(__nr_to_section(pnum), pnum, map); + + usemap = sparse_early_usemap_alloc(pnum); + if (!usemap) + continue; + + sparse_init_one_section(__nr_to_section(pnum), pnum, map, + usemap); } } #ifdef CONFIG_MEMORY_HOTPLUG +#ifdef CONFIG_SPARSEMEM_VMEMMAP +static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid, + unsigned long nr_pages) +{ + /* This will make the necessary allocations eventually. 
*/ + return sparse_mem_map_populate(pnum, nid); +} +static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) +{ + return; /* XXX: Not implemented yet */ +} +#else static struct page *__kmalloc_section_memmap(unsigned long nr_pages) { struct page *page, *ret; @@ -289,6 +356,12 @@ got_map_ptr: return ret; } +static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid, + unsigned long nr_pages) +{ + return __kmalloc_section_memmap(nr_pages); +} + static int vaddr_in_vmalloc_area(void *addr) { if (addr >= (void *)VMALLOC_START && @@ -305,6 +378,7 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) free_pages((unsigned long)memmap, get_order(sizeof(struct page) * nr_pages)); } +#endif /* CONFIG_SPARSEMEM_VMEMMAP */ /* * returns the number of sections whose mem_maps were properly @@ -318,6 +392,7 @@ int sparse_add_one_section(struct zone *zone, unsigned long start_pfn, struct pglist_data *pgdat = zone->zone_pgdat; struct mem_section *ms; struct page *memmap; + unsigned long *usemap; unsigned long flags; int ret; @@ -326,7 +401,8 @@ int sparse_add_one_section(struct zone *zone, unsigned long start_pfn, * plus, it does a kmalloc */ sparse_index_init(section_nr, pgdat->node_id); - memmap = __kmalloc_section_memmap(nr_pages); + memmap = kmalloc_section_memmap(section_nr, pgdat->node_id, nr_pages); + usemap = __kmalloc_section_usemap(); pgdat_resize_lock(pgdat, &flags); @@ -335,9 +411,14 @@ int sparse_add_one_section(struct zone *zone, unsigned long start_pfn, ret = -EEXIST; goto out; } + + if (!usemap) { + ret = -ENOMEM; + goto out; + } ms->section_mem_map |= SECTION_MARKED_PRESENT; - ret = sparse_init_one_section(ms, section_nr, memmap); + ret = sparse_init_one_section(ms, section_nr, memmap, usemap); out: pgdat_resize_unlock(pgdat, &flags); diff --git a/mm/swap.c b/mm/swap.c index d3cb966fe99..a65eff8a517 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -24,16 +24,19 @@ #include <linux/module.h> #include 
<linux/mm_inline.h> #include <linux/buffer_head.h> /* for try_to_release_page() */ -#include <linux/module.h> #include <linux/percpu_counter.h> #include <linux/percpu.h> #include <linux/cpu.h> #include <linux/notifier.h> -#include <linux/init.h> +#include <linux/backing-dev.h> /* How many pages do we try to swap or page in/out together? */ int page_cluster; +static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, }; +static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, }; +static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs) = { 0, }; + /* * This path almost never happens for VM activity - pages are normally * freed via pagevecs. But it gets used by networking. @@ -94,23 +97,47 @@ void put_pages_list(struct list_head *pages) EXPORT_SYMBOL(put_pages_list); /* + * pagevec_move_tail() must be called with IRQ disabled. + * Otherwise this may cause nasty races. + */ +static void pagevec_move_tail(struct pagevec *pvec) +{ + int i; + int pgmoved = 0; + struct zone *zone = NULL; + + for (i = 0; i < pagevec_count(pvec); i++) { + struct page *page = pvec->pages[i]; + struct zone *pagezone = page_zone(page); + + if (pagezone != zone) { + if (zone) + spin_unlock(&zone->lru_lock); + zone = pagezone; + spin_lock(&zone->lru_lock); + } + if (PageLRU(page) && !PageActive(page)) { + list_move_tail(&page->lru, &zone->inactive_list); + pgmoved++; + } + } + if (zone) + spin_unlock(&zone->lru_lock); + __count_vm_events(PGROTATED, pgmoved); + release_pages(pvec->pages, pvec->nr, pvec->cold); + pagevec_reinit(pvec); +} + +/* * Writeback is about to end against a page which has been marked for immediate * reclaim. If it still appears to be reclaimable, move it to the tail of the - * inactive list. The page still has PageWriteback set, which will pin it. - * - * We don't expect many pages to come through here, so don't bother batching - * things up. 
- * - * To avoid placing the page at the tail of the LRU while PG_writeback is still - * set, this function will clear PG_writeback before performing the page - * motion. Do that inside the lru lock because once PG_writeback is cleared - * we may not touch the page. + * inactive list. * * Returns zero if it cleared PG_writeback. */ int rotate_reclaimable_page(struct page *page) { - struct zone *zone; + struct pagevec *pvec; unsigned long flags; if (PageLocked(page)) @@ -122,15 +149,16 @@ int rotate_reclaimable_page(struct page *page) if (!PageLRU(page)) return 1; - zone = page_zone(page); - spin_lock_irqsave(&zone->lru_lock, flags); - if (PageLRU(page) && !PageActive(page)) { - list_move_tail(&page->lru, &zone->inactive_list); - __count_vm_event(PGROTATED); - } + page_cache_get(page); + local_irq_save(flags); + pvec = &__get_cpu_var(lru_rotate_pvecs); + if (!pagevec_add(pvec, page)) + pagevec_move_tail(pvec); + local_irq_restore(flags); + if (!test_clear_page_writeback(page)) BUG(); - spin_unlock_irqrestore(&zone->lru_lock, flags); + return 0; } @@ -174,9 +202,6 @@ EXPORT_SYMBOL(mark_page_accessed); * lru_cache_add: add a page to the page lists * @page: the page to add */ -static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, }; -static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, }; - void fastcall lru_cache_add(struct page *page) { struct pagevec *pvec = &get_cpu_var(lru_add_pvecs); @@ -197,21 +222,37 @@ void fastcall lru_cache_add_active(struct page *page) put_cpu_var(lru_add_active_pvecs); } -static void __lru_add_drain(int cpu) +/* + * Drain pages out of the cpu's pagevecs. + * Either "cpu" is the current CPU, and preemption has already been + * disabled; or "cpu" is being hot-unplugged, and is already dead. + */ +static void drain_cpu_pagevecs(int cpu) { - struct pagevec *pvec = &per_cpu(lru_add_pvecs, cpu); + struct pagevec *pvec; - /* CPU is dead, so no locking needed. 
*/ + pvec = &per_cpu(lru_add_pvecs, cpu); if (pagevec_count(pvec)) __pagevec_lru_add(pvec); + pvec = &per_cpu(lru_add_active_pvecs, cpu); if (pagevec_count(pvec)) __pagevec_lru_add_active(pvec); + + pvec = &per_cpu(lru_rotate_pvecs, cpu); + if (pagevec_count(pvec)) { + unsigned long flags; + + /* No harm done if a racing interrupt already did this */ + local_irq_save(flags); + pagevec_move_tail(pvec); + local_irq_restore(flags); + } } void lru_add_drain(void) { - __lru_add_drain(get_cpu()); + drain_cpu_pagevecs(get_cpu()); put_cpu(); } @@ -258,6 +299,7 @@ void release_pages(struct page **pages, int nr, int cold) int i; struct pagevec pages_to_free; struct zone *zone = NULL; + unsigned long uninitialized_var(flags); pagevec_init(&pages_to_free, cold); for (i = 0; i < nr; i++) { @@ -265,7 +307,7 @@ void release_pages(struct page **pages, int nr, int cold) if (unlikely(PageCompound(page))) { if (zone) { - spin_unlock_irq(&zone->lru_lock); + spin_unlock_irqrestore(&zone->lru_lock, flags); zone = NULL; } put_compound_page(page); @@ -279,9 +321,10 @@ void release_pages(struct page **pages, int nr, int cold) struct zone *pagezone = page_zone(page); if (pagezone != zone) { if (zone) - spin_unlock_irq(&zone->lru_lock); + spin_unlock_irqrestore(&zone->lru_lock, + flags); zone = pagezone; - spin_lock_irq(&zone->lru_lock); + spin_lock_irqsave(&zone->lru_lock, flags); } VM_BUG_ON(!PageLRU(page)); __ClearPageLRU(page); @@ -290,7 +333,7 @@ void release_pages(struct page **pages, int nr, int cold) if (!pagevec_add(&pages_to_free, page)) { if (zone) { - spin_unlock_irq(&zone->lru_lock); + spin_unlock_irqrestore(&zone->lru_lock, flags); zone = NULL; } __pagevec_free(&pages_to_free); @@ -298,7 +341,7 @@ void release_pages(struct page **pages, int nr, int cold) } } if (zone) - spin_unlock_irq(&zone->lru_lock); + spin_unlock_irqrestore(&zone->lru_lock, flags); pagevec_free(&pages_to_free); } @@ -491,7 +534,7 @@ static int cpu_swap_callback(struct notifier_block *nfb, if (action == 
CPU_DEAD || action == CPU_DEAD_FROZEN) { atomic_add(*committed, &vm_committed_space); *committed = 0; - __lru_add_drain((long)hcpu); + drain_cpu_pagevecs((long)hcpu); } return NOTIFY_OK; } @@ -505,6 +548,10 @@ void __init swap_setup(void) { unsigned long megs = num_physpages >> (20 - PAGE_SHIFT); +#ifdef CONFIG_SWAP + bdi_init(swapper_space.backing_dev_info); +#endif + /* Use a smaller cluster for small-memory machines */ if (megs < 16) page_cluster = 2; diff --git a/mm/swap_state.c b/mm/swap_state.c index 67daecb6031..b52635601df 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -74,6 +74,7 @@ static int __add_to_swap_cache(struct page *page, swp_entry_t entry, { int error; + BUG_ON(!PageLocked(page)); BUG_ON(PageSwapCache(page)); BUG_ON(PagePrivate(page)); error = radix_tree_preload(gfp_mask); @@ -83,7 +84,6 @@ static int __add_to_swap_cache(struct page *page, swp_entry_t entry, entry.val, page); if (!error) { page_cache_get(page); - SetPageLocked(page); SetPageSwapCache(page); set_page_private(page, entry.val); total_swapcache_pages++; @@ -99,15 +99,18 @@ static int add_to_swap_cache(struct page *page, swp_entry_t entry) { int error; + BUG_ON(PageLocked(page)); if (!swap_duplicate(entry)) { INC_CACHE_INFO(noent_race); return -ENOENT; } + SetPageLocked(page); error = __add_to_swap_cache(page, entry, GFP_KERNEL); /* * Anon pages are already on the LRU, we don't run lru_cache_add here. 
*/ if (error) { + ClearPageLocked(page); swap_free(entry); if (error == -EEXIST) INC_CACHE_INFO(exist_race); diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c index 8803471593f..d436a9c82db 100644 --- a/mm/tiny-shmem.c +++ b/mm/tiny-shmem.c @@ -66,24 +66,19 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) if (!dentry) goto put_memory; - error = -ENFILE; - file = get_empty_filp(); - if (!file) - goto put_dentry; - error = -ENOSPC; inode = ramfs_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0); if (!inode) - goto close_file; + goto put_dentry; d_instantiate(dentry, inode); - inode->i_nlink = 0; /* It is unlinked */ + error = -ENFILE; + file = alloc_file(shm_mnt, dentry, FMODE_WRITE | FMODE_READ, + &ramfs_file_operations); + if (!file) + goto put_dentry; - file->f_path.mnt = mntget(shm_mnt); - file->f_path.dentry = dentry; - file->f_mapping = inode->i_mapping; - file->f_op = &ramfs_file_operations; - file->f_mode = FMODE_WRITE | FMODE_READ; + inode->i_nlink = 0; /* It is unlinked */ /* notify everyone as to the change of file size */ error = do_truncate(dentry, size, 0, file); diff --git a/mm/truncate.c b/mm/truncate.c index 5cdfbc1a59f..cadc15653dd 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -8,6 +8,7 @@ */ #include <linux/kernel.h> +#include <linux/backing-dev.h> #include <linux/mm.h> #include <linux/swap.h> #include <linux/module.h> @@ -72,6 +73,8 @@ void cancel_dirty_page(struct page *page, unsigned int account_size) struct address_space *mapping = page->mapping; if (mapping && mapping_cap_account_dirty(mapping)) { dec_zone_page_state(page, NR_FILE_DIRTY); + dec_bdi_stat(mapping->backing_dev_info, + BDI_RECLAIMABLE); if (account_size) task_io_account_cancelled_write(account_size); } diff --git a/mm/util.c b/mm/util.c index bf340d80686..5f64026cbb4 100644 --- a/mm/util.c +++ b/mm/util.c @@ -81,14 +81,16 @@ EXPORT_SYMBOL(kmemdup); void *krealloc(const void *p, size_t new_size, gfp_t flags) { void *ret; - size_t ks; + size_t ks = 0; if 
(unlikely(!new_size)) { kfree(p); return ZERO_SIZE_PTR; } - ks = ksize(p); + if (p) + ks = ksize(p); + if (ks >= new_size) return (void *)p; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 3cee76a8c9f..2e01af36584 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -190,7 +190,8 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long fl if (unlikely(!size)) return NULL; - area = kmalloc_node(sizeof(*area), gfp_mask & GFP_LEVEL_MASK, node); + area = kmalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node); + if (unlikely(!area)) return NULL; @@ -439,7 +440,7 @@ void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, area->flags |= VM_VPAGES; } else { pages = kmalloc_node(array_size, - (gfp_mask & GFP_LEVEL_MASK) | __GFP_ZERO, + (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO, node); } area->pages = pages; diff --git a/mm/vmscan.c b/mm/vmscan.c index a6e65d02499..e1471385d00 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -932,6 +932,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, long mapped_ratio; long distress; long swap_tendency; + long imbalance; if (zone_is_near_oom(zone)) goto force_reclaim_mapped; @@ -967,6 +968,46 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, swap_tendency = mapped_ratio / 2 + distress + sc->swappiness; /* + * If there's a huge imbalance between active and inactive + * (think active 100 times larger than inactive) we should + * become more permissive, or the system will take too much + * cpu before it starts swapping during memory pressure. + * Distress is about avoiding early-oom, this is about + * making swappiness graceful despite setting it to low + * values. + * + * Avoid div by zero with nr_inactive+1, and max resulting + * value is vm_total_pages.
+ */ + imbalance = zone_page_state(zone, NR_ACTIVE); + imbalance /= zone_page_state(zone, NR_INACTIVE) + 1; + + /* + * Reduce the effect of imbalance if swappiness is low, + * this means for a swappiness very low, the imbalance + * must be much higher than 100 for this logic to make + * the difference. + * + * Max temporary value is vm_total_pages*100. + */ + imbalance *= (vm_swappiness + 1); + imbalance /= 100; + + /* + * If not much of the ram is mapped, makes the imbalance + * less relevant, it's high priority we refill the inactive + * list with mapped pages only in presence of high ratio of + * mapped pages. + * + * Max temporary value is vm_total_pages*100. + */ + imbalance *= mapped_ratio; + imbalance /= 100; + + /* apply imbalance feedback to swap_tendency */ + swap_tendency += imbalance; + + /* * Now use this metric to decide whether to start moving mapped * memory onto the inactive list. */ @@ -1067,8 +1108,6 @@ static unsigned long shrink_zone(int priority, struct zone *zone, unsigned long nr_to_scan; unsigned long nr_reclaimed = 0; - atomic_inc(&zone->reclaim_in_progress); - /* * Add one to `nr_to_scan' just to make sure that the kernel will * slowly sift through the active list. 
@@ -1107,8 +1146,6 @@ static unsigned long shrink_zone(int priority, struct zone *zone, } throttle_vm_writeout(sc->gfp_mask); - - atomic_dec(&zone->reclaim_in_progress); return nr_reclaimed; } @@ -1146,7 +1183,7 @@ static unsigned long shrink_zones(int priority, struct zone **zones, note_zone_scanning_priority(zone, priority); - if (zone->all_unreclaimable && priority != DEF_PRIORITY) + if (zone_is_all_unreclaimable(zone) && priority != DEF_PRIORITY) continue; /* Let kswapd poll it */ sc->all_unreclaimable = 0; @@ -1327,7 +1364,8 @@ loop_again: if (!populated_zone(zone)) continue; - if (zone->all_unreclaimable && priority != DEF_PRIORITY) + if (zone_is_all_unreclaimable(zone) && + priority != DEF_PRIORITY) continue; if (!zone_watermark_ok(zone, order, zone->pages_high, @@ -1362,7 +1400,8 @@ loop_again: if (!populated_zone(zone)) continue; - if (zone->all_unreclaimable && priority != DEF_PRIORITY) + if (zone_is_all_unreclaimable(zone) && + priority != DEF_PRIORITY) continue; if (!zone_watermark_ok(zone, order, zone->pages_high, @@ -1371,18 +1410,25 @@ loop_again: temp_priority[i] = priority; sc.nr_scanned = 0; note_zone_scanning_priority(zone, priority); - nr_reclaimed += shrink_zone(priority, zone, &sc); + /* + * We put equal pressure on every zone, unless one + * zone has way too many pages free already. 
+ */ + if (!zone_watermark_ok(zone, order, 8*zone->pages_high, + end_zone, 0)) + nr_reclaimed += shrink_zone(priority, zone, &sc); reclaim_state->reclaimed_slab = 0; nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, lru_pages); nr_reclaimed += reclaim_state->reclaimed_slab; total_scanned += sc.nr_scanned; - if (zone->all_unreclaimable) + if (zone_is_all_unreclaimable(zone)) continue; if (nr_slab == 0 && zone->pages_scanned >= (zone_page_state(zone, NR_ACTIVE) + zone_page_state(zone, NR_INACTIVE)) * 6) - zone->all_unreclaimable = 1; + zone_set_flag(zone, + ZONE_ALL_UNRECLAIMABLE); /* * If we've done a decent amount of scanning and * the reclaim ratio is low, start doing writepage @@ -1548,7 +1594,7 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio, if (!populated_zone(zone)) continue; - if (zone->all_unreclaimable && prio != DEF_PRIORITY) + if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY) continue; /* For pass = 0 we don't shrink the active list */ @@ -1688,9 +1734,11 @@ static int __devinit cpu_callback(struct notifier_block *nfb, { pg_data_t *pgdat; cpumask_t mask; + int nid; if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) { - for_each_online_pgdat(pgdat) { + for_each_node_state(nid, N_HIGH_MEMORY) { + pgdat = NODE_DATA(nid); mask = node_to_cpumask(pgdat->node_id); if (any_online_cpu(mask) != NR_CPUS) /* One of our CPUs online: restore mask */ @@ -1727,7 +1775,7 @@ static int __init kswapd_init(void) int nid; swap_setup(); - for_each_online_node(nid) + for_each_node_state(nid, N_HIGH_MEMORY) kswapd_run(nid); hotcpu_notifier(cpu_callback, 0); return 0; @@ -1847,8 +1895,8 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) { - cpumask_t mask; int node_id; + int ret; /* * Zone reclaim reclaims unmapped file backed pages and @@ -1866,15 +1914,13 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) <= 
zone->min_slab_pages) return 0; + if (zone_is_all_unreclaimable(zone)) + return 0; + /* - * Avoid concurrent zone reclaims, do not reclaim in a zone that does - * not have reclaimable pages and if we should not delay the allocation - * then do not scan. + * Do not scan if the allocation should not be delayed. */ - if (!(gfp_mask & __GFP_WAIT) || - zone->all_unreclaimable || - atomic_read(&zone->reclaim_in_progress) > 0 || - (current->flags & PF_MEMALLOC)) + if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC)) return 0; /* @@ -1884,9 +1930,14 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) * as wide as possible. */ node_id = zone_to_nid(zone); - mask = node_to_cpumask(node_id); - if (!cpus_empty(mask) && node_id != numa_node_id()) + if (node_state(node_id, N_CPU) && node_id != numa_node_id()) + return 0; + + if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED)) return 0; - return __zone_reclaim(zone, gfp_mask, order); + ret = __zone_reclaim(zone, gfp_mask, order); + zone_clear_flag(zone, ZONE_RECLAIM_LOCKED); + + return ret; } #endif diff --git a/mm/vmstat.c b/mm/vmstat.c index c64d169537b..4651bf153f3 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -353,23 +353,6 @@ void refresh_cpu_vm_stats(int cpu) } } -static void __refresh_cpu_vm_stats(void *dummy) -{ - refresh_cpu_vm_stats(smp_processor_id()); -} - -/* - * Consolidate all counters. - * - * Note that the result is less inaccurate but still inaccurate - * if concurrent processes are allowed to run. 
- */ -void refresh_vm_stats(void) -{ - on_each_cpu(__refresh_cpu_vm_stats, NULL, 0, 1); -} -EXPORT_SYMBOL(refresh_vm_stats); - #endif #ifdef CONFIG_NUMA @@ -398,6 +381,13 @@ void zone_statistics(struct zonelist *zonelist, struct zone *z) #include <linux/seq_file.h> +static char * const migratetype_names[MIGRATE_TYPES] = { + "Unmovable", + "Reclaimable", + "Movable", + "Reserve", +}; + static void *frag_start(struct seq_file *m, loff_t *pos) { pg_data_t *pgdat; @@ -422,28 +412,144 @@ static void frag_stop(struct seq_file *m, void *arg) { } -/* - * This walks the free areas for each zone. - */ -static int frag_show(struct seq_file *m, void *arg) +/* Walk all the zones in a node and print using a callback */ +static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat, + void (*print)(struct seq_file *m, pg_data_t *, struct zone *)) { - pg_data_t *pgdat = (pg_data_t *)arg; struct zone *zone; struct zone *node_zones = pgdat->node_zones; unsigned long flags; - int order; for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { if (!populated_zone(zone)) continue; spin_lock_irqsave(&zone->lock, flags); - seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); - for (order = 0; order < MAX_ORDER; ++order) - seq_printf(m, "%6lu ", zone->free_area[order].nr_free); + print(m, pgdat, zone); spin_unlock_irqrestore(&zone->lock, flags); + } +} + +static void frag_show_print(struct seq_file *m, pg_data_t *pgdat, + struct zone *zone) +{ + int order; + + seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); + for (order = 0; order < MAX_ORDER; ++order) + seq_printf(m, "%6lu ", zone->free_area[order].nr_free); + seq_putc(m, '\n'); +} + +/* + * This walks the free areas for each zone. 
+ */ +static int frag_show(struct seq_file *m, void *arg) +{ + pg_data_t *pgdat = (pg_data_t *)arg; + walk_zones_in_node(m, pgdat, frag_show_print); + return 0; +} + +static void pagetypeinfo_showfree_print(struct seq_file *m, + pg_data_t *pgdat, struct zone *zone) +{ + int order, mtype; + + for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) { + seq_printf(m, "Node %4d, zone %8s, type %12s ", + pgdat->node_id, + zone->name, + migratetype_names[mtype]); + for (order = 0; order < MAX_ORDER; ++order) { + unsigned long freecount = 0; + struct free_area *area; + struct list_head *curr; + + area = &(zone->free_area[order]); + + list_for_each(curr, &area->free_list[mtype]) + freecount++; + seq_printf(m, "%6lu ", freecount); + } seq_putc(m, '\n'); } +} + +/* Print out the free pages at each order for each migratetype */ +static int pagetypeinfo_showfree(struct seq_file *m, void *arg) +{ + int order; + pg_data_t *pgdat = (pg_data_t *)arg; + + /* Print header */ + seq_printf(m, "%-43s ", "Free pages count per migrate type at order"); + for (order = 0; order < MAX_ORDER; ++order) + seq_printf(m, "%6d ", order); + seq_putc(m, '\n'); + + walk_zones_in_node(m, pgdat, pagetypeinfo_showfree_print); + + return 0; +} + +static void pagetypeinfo_showblockcount_print(struct seq_file *m, + pg_data_t *pgdat, struct zone *zone) +{ + int mtype; + unsigned long pfn; + unsigned long start_pfn = zone->zone_start_pfn; + unsigned long end_pfn = start_pfn + zone->spanned_pages; + unsigned long count[MIGRATE_TYPES] = { 0, }; + + for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { + struct page *page; + + if (!pfn_valid(pfn)) + continue; + + page = pfn_to_page(pfn); + mtype = get_pageblock_migratetype(page); + + count[mtype]++; + } + + /* Print counts */ + seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); + for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) + seq_printf(m, "%12lu ", count[mtype]); + seq_putc(m, '\n'); +} + +/* Print out the number of pageblocks for each
migratetype */ +static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg) +{ + int mtype; + pg_data_t *pgdat = (pg_data_t *)arg; + + seq_printf(m, "\n%-23s", "Number of blocks type "); + for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) + seq_printf(m, "%12s ", migratetype_names[mtype]); + seq_putc(m, '\n'); + walk_zones_in_node(m, pgdat, pagetypeinfo_showblockcount_print); + + return 0; +} + +/* + * This prints out statistics in relation to grouping pages by mobility. + * It is expensive to collect so do not constantly read the file. + */ +static int pagetypeinfo_show(struct seq_file *m, void *arg) +{ + pg_data_t *pgdat = (pg_data_t *)arg; + + seq_printf(m, "Page block order: %d\n", pageblock_order); + seq_printf(m, "Pages per block: %lu\n", pageblock_nr_pages); + seq_putc(m, '\n'); + pagetypeinfo_showfree(m, pgdat); + pagetypeinfo_showblockcount(m, pgdat); + return 0; } @@ -454,6 +560,13 @@ const struct seq_operations fragmentation_op = { .show = frag_show, }; +const struct seq_operations pagetypeinfo_op = { + .start = frag_start, + .next = frag_next, + .stop = frag_stop, + .show = pagetypeinfo_show, +}; + #ifdef CONFIG_ZONE_DMA #define TEXT_FOR_DMA(xx) xx "_dma", #else @@ -532,84 +645,78 @@ static const char * const vmstat_text[] = { #endif }; -/* - * Output information about zones in @pgdat. 
- */ -static int zoneinfo_show(struct seq_file *m, void *arg) +static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, + struct zone *zone) { - pg_data_t *pgdat = arg; - struct zone *zone; - struct zone *node_zones = pgdat->node_zones; - unsigned long flags; - - for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) { - int i; - - if (!populated_zone(zone)) - continue; - - spin_lock_irqsave(&zone->lock, flags); - seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name); - seq_printf(m, - "\n pages free %lu" - "\n min %lu" - "\n low %lu" - "\n high %lu" - "\n scanned %lu (a: %lu i: %lu)" - "\n spanned %lu" - "\n present %lu", - zone_page_state(zone, NR_FREE_PAGES), - zone->pages_min, - zone->pages_low, - zone->pages_high, - zone->pages_scanned, - zone->nr_scan_active, zone->nr_scan_inactive, - zone->spanned_pages, - zone->present_pages); + int i; + seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name); + seq_printf(m, + "\n pages free %lu" + "\n min %lu" + "\n low %lu" + "\n high %lu" + "\n scanned %lu (a: %lu i: %lu)" + "\n spanned %lu" + "\n present %lu", + zone_page_state(zone, NR_FREE_PAGES), + zone->pages_min, + zone->pages_low, + zone->pages_high, + zone->pages_scanned, + zone->nr_scan_active, zone->nr_scan_inactive, + zone->spanned_pages, + zone->present_pages); - for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) - seq_printf(m, "\n %-12s %lu", vmstat_text[i], - zone_page_state(zone, i)); - - seq_printf(m, - "\n protection: (%lu", - zone->lowmem_reserve[0]); - for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++) - seq_printf(m, ", %lu", zone->lowmem_reserve[i]); - seq_printf(m, - ")" - "\n pagesets"); - for_each_online_cpu(i) { - struct per_cpu_pageset *pageset; - int j; - - pageset = zone_pcp(zone, i); - for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) { - seq_printf(m, - "\n cpu: %i pcp: %i" - "\n count: %i" - "\n high: %i" - "\n batch: %i", - i, j, - pageset->pcp[j].count, - pageset->pcp[j].high, - pageset->pcp[j].batch); 
+ for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) + seq_printf(m, "\n %-12s %lu", vmstat_text[i], + zone_page_state(zone, i)); + + seq_printf(m, + "\n protection: (%lu", + zone->lowmem_reserve[0]); + for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++) + seq_printf(m, ", %lu", zone->lowmem_reserve[i]); + seq_printf(m, + ")" + "\n pagesets"); + for_each_online_cpu(i) { + struct per_cpu_pageset *pageset; + int j; + + pageset = zone_pcp(zone, i); + for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) { + seq_printf(m, + "\n cpu: %i pcp: %i" + "\n count: %i" + "\n high: %i" + "\n batch: %i", + i, j, + pageset->pcp[j].count, + pageset->pcp[j].high, + pageset->pcp[j].batch); } #ifdef CONFIG_SMP - seq_printf(m, "\n vm stats threshold: %d", - pageset->stat_threshold); + seq_printf(m, "\n vm stats threshold: %d", + pageset->stat_threshold); #endif - } - seq_printf(m, - "\n all_unreclaimable: %u" - "\n prev_priority: %i" - "\n start_pfn: %lu", - zone->all_unreclaimable, - zone->prev_priority, - zone->zone_start_pfn); - spin_unlock_irqrestore(&zone->lock, flags); - seq_putc(m, '\n'); } + seq_printf(m, + "\n all_unreclaimable: %u" + "\n prev_priority: %i" + "\n start_pfn: %lu", + zone_is_all_unreclaimable(zone), + zone->prev_priority, + zone->zone_start_pfn); + seq_putc(m, '\n'); +} + +/* + * Output information about zones in @pgdat. + */ +static int zoneinfo_show(struct seq_file *m, void *arg) +{ + pg_data_t *pgdat = (pg_data_t *)arg; + walk_zones_in_node(m, pgdat, zoneinfo_show_print); return 0; } @@ -741,7 +848,7 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb, static struct notifier_block __cpuinitdata vmstat_notifier = { &vmstat_cpuup_callback, NULL, 0 }; -int __init setup_vmstat(void) +static int __init setup_vmstat(void) { int cpu; |