summaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
authorTrond Myklebust <Trond.Myklebust@netapp.com>2006-06-24 08:41:41 -0400
committerTrond Myklebust <Trond.Myklebust@netapp.com>2006-06-24 13:07:53 -0400
commit816724e65c72a90a44fbad0ef0b59b186c85fa90 (patch)
tree421fa29aedff988e392f92780637553e275d37a0 /mm
parent70ac4385a13f78bc478f26d317511893741b05bd (diff)
parentd384ea691fe4ea8c2dd5b9b8d9042eb181776f18 (diff)
Merge branch 'master' of /home/trondmy/kernel/linux-2.6/
Conflicts: fs/nfs/inode.c fs/super.c Fix conflicts between patch 'NFS: Split fs/nfs/inode.c' and patch 'VFS: Permit filesystem to override root dentry on mount'
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig4
-rw-r--r--mm/filemap.c183
-rw-r--r--mm/filemap.h6
-rw-r--r--mm/fremap.c9
-rw-r--r--mm/hugetlb.c282
-rw-r--r--mm/memory.c125
-rw-r--r--mm/memory_hotplug.c27
-rw-r--r--mm/mempolicy.c36
-rw-r--r--mm/migrate.c1058
-rw-r--r--mm/mmap.c12
-rw-r--r--mm/mprotect.c37
-rw-r--r--mm/msync.c3
-rw-r--r--mm/oom_kill.c9
-rw-r--r--mm/page-writeback.c3
-rw-r--r--mm/page_alloc.c184
-rw-r--r--mm/pdflush.c3
-rw-r--r--mm/rmap.c107
-rw-r--r--mm/shmem.c18
-rw-r--r--mm/slab.c249
-rw-r--r--mm/sparse.c22
-rw-r--r--mm/swap.c42
-rw-r--r--mm/swapfile.c43
-rw-r--r--mm/truncate.c22
-rw-r--r--mm/vmalloc.c122
-rw-r--r--mm/vmscan.c240
25 files changed, 1816 insertions, 1030 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 332f5c29b53..66e65ab3942 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -138,8 +138,8 @@ config SPLIT_PTLOCK_CPUS
#
config MIGRATION
bool "Page migration"
- def_bool y if NUMA
- depends on SWAP && NUMA
+ def_bool y
+ depends on NUMA
help
Allows the migration of the physical location of pages of processes
while the virtual addresses are not changed. This is useful for
diff --git a/mm/filemap.c b/mm/filemap.c
index fd57442186c..807a463fd5e 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -14,6 +14,7 @@
#include <linux/slab.h>
#include <linux/compiler.h>
#include <linux/fs.h>
+#include <linux/uaccess.h>
#include <linux/aio.h>
#include <linux/capability.h>
#include <linux/kernel_stat.h>
@@ -38,7 +39,6 @@
*/
#include <linux/buffer_head.h> /* for generic_osync_inode */
-#include <asm/uaccess.h>
#include <asm/mman.h>
static ssize_t
@@ -171,15 +171,17 @@ static int sync_page(void *word)
}
/**
- * filemap_fdatawrite_range - start writeback against all of a mapping's
- * dirty pages that lie within the byte offsets <start, end>
+ * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
* @mapping: address space structure to write
* @start: offset in bytes where the range starts
* @end: offset in bytes where the range ends (inclusive)
* @sync_mode: enable synchronous operation
*
+ * Start writeback against all of a mapping's dirty pages that lie
+ * within the byte offsets <start, end> inclusive.
+ *
* If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
- * opposed to a regular memory * cleansing writeback. The difference between
+ * opposed to a regular memory cleansing writeback. The difference between
* these two operations is that if a dirty page/buffer is encountered, it must
* be waited upon, and not just skipped over.
*/
@@ -190,8 +192,8 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
struct writeback_control wbc = {
.sync_mode = sync_mode,
.nr_to_write = mapping->nrpages * 2,
- .start = start,
- .end = end,
+ .range_start = start,
+ .range_end = end,
};
if (!mapping_cap_writeback_dirty(mapping))
@@ -204,7 +206,7 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
static inline int __filemap_fdatawrite(struct address_space *mapping,
int sync_mode)
{
- return __filemap_fdatawrite_range(mapping, 0, 0, sync_mode);
+ return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode);
}
int filemap_fdatawrite(struct address_space *mapping)
@@ -219,7 +221,10 @@ static int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
}
-/*
+/**
+ * filemap_flush - mostly a non-blocking flush
+ * @mapping: target address_space
+ *
* This is a mostly non-blocking flush. Not suitable for data-integrity
* purposes - I/O may not be started against all dirty pages.
*/
@@ -229,7 +234,12 @@ int filemap_flush(struct address_space *mapping)
}
EXPORT_SYMBOL(filemap_flush);
-/*
+/**
+ * wait_on_page_writeback_range - wait for writeback to complete
+ * @mapping: target address_space
+ * @start: beginning page index
+ * @end: ending page index
+ *
* Wait for writeback to complete against pages indexed by start->end
* inclusive
*/
@@ -276,7 +286,13 @@ int wait_on_page_writeback_range(struct address_space *mapping,
return ret;
}
-/*
+/**
+ * sync_page_range - write and wait on all pages in the passed range
+ * @inode: target inode
+ * @mapping: target address_space
+ * @pos: beginning offset in pages to write
+ * @count: number of bytes to write
+ *
* Write and wait upon all the pages in the passed range. This is a "data
* integrity" operation. It waits upon in-flight writeout before starting and
* waiting upon new writeout. If there was an IO error, return it.
@@ -305,7 +321,13 @@ int sync_page_range(struct inode *inode, struct address_space *mapping,
}
EXPORT_SYMBOL(sync_page_range);
-/*
+/**
+ * sync_page_range_nolock
+ * @inode: target inode
+ * @mapping: target address_space
+ * @pos: beginning offset in pages to write
+ * @count: number of bytes to write
+ *
* Note: Holding i_mutex across sync_page_range_nolock is not a good idea
* as it forces O_SYNC writers to different parts of the same file
* to be serialised right until io completion.
@@ -329,10 +351,11 @@ int sync_page_range_nolock(struct inode *inode, struct address_space *mapping,
EXPORT_SYMBOL(sync_page_range_nolock);
/**
- * filemap_fdatawait - walk the list of under-writeback pages of the given
- * address space and wait for all of them.
- *
+ * filemap_fdatawait - wait for all under-writeback pages to complete
* @mapping: address space structure to wait for
+ *
+ * Walk the list of under-writeback pages of the given address space
+ * and wait for all of them.
*/
int filemap_fdatawait(struct address_space *mapping)
{
@@ -368,7 +391,12 @@ int filemap_write_and_wait(struct address_space *mapping)
}
EXPORT_SYMBOL(filemap_write_and_wait);
-/*
+/**
+ * filemap_write_and_wait_range - write out & wait on a file range
+ * @mapping: the address_space for the pages
+ * @lstart: offset in bytes where the range starts
+ * @lend: offset in bytes where the range ends (inclusive)
+ *
* Write out and wait upon file offsets lstart->lend, inclusive.
*
* Note that `lend' is inclusive (describes the last byte to be written) so
@@ -394,8 +422,14 @@ int filemap_write_and_wait_range(struct address_space *mapping,
return err;
}
-/*
- * This function is used to add newly allocated pagecache pages:
+/**
+ * add_to_page_cache - add newly allocated pagecache pages
+ * @page: page to add
+ * @mapping: the page's address_space
+ * @offset: page index
+ * @gfp_mask: page allocation mode
+ *
+ * This function is used to add newly allocated pagecache pages;
* the page is new, so we can just run SetPageLocked() against it.
* The other page state flags were set by rmqueue().
*
@@ -422,7 +456,6 @@ int add_to_page_cache(struct page *page, struct address_space *mapping,
}
return error;
}
-
EXPORT_SYMBOL(add_to_page_cache);
int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
@@ -489,8 +522,7 @@ void fastcall wait_on_page_bit(struct page *page, int bit_nr)
EXPORT_SYMBOL(wait_on_page_bit);
/**
- * unlock_page() - unlock a locked page
- *
+ * unlock_page - unlock a locked page
* @page: the page
*
* Unlocks the page and wakes up sleepers in ___wait_on_page_locked().
@@ -513,8 +545,9 @@ void fastcall unlock_page(struct page *page)
}
EXPORT_SYMBOL(unlock_page);
-/*
- * End writeback against a page.
+/**
+ * end_page_writeback - end writeback against a page
+ * @page: the page
*/
void end_page_writeback(struct page *page)
{
@@ -527,10 +560,11 @@ void end_page_writeback(struct page *page)
}
EXPORT_SYMBOL(end_page_writeback);
-/*
- * Get a lock on the page, assuming we need to sleep to get it.
+/**
+ * __lock_page - get a lock on the page, assuming we need to sleep to get it
+ * @page: the page to lock
*
- * Ugly: running sync_page() in state TASK_UNINTERRUPTIBLE is scary. If some
+ * Ugly. Running sync_page() in state TASK_UNINTERRUPTIBLE is scary. If some
* random driver's requestfn sets TASK_RUNNING, we could busywait. However
* chances are that on the second loop, the block layer's plug list is empty,
* so sync_page() will then return in state TASK_UNINTERRUPTIBLE.
@@ -544,8 +578,12 @@ void fastcall __lock_page(struct page *page)
}
EXPORT_SYMBOL(__lock_page);
-/*
- * a rather lightweight function, finding and getting a reference to a
+/**
+ * find_get_page - find and get a page reference
+ * @mapping: the address_space to search
+ * @offset: the page index
+ *
+ * A rather lightweight function, finding and getting a reference to a
* hashed page atomically.
*/
struct page * find_get_page(struct address_space *mapping, unsigned long offset)
@@ -559,11 +597,14 @@ struct page * find_get_page(struct address_space *mapping, unsigned long offset)
read_unlock_irq(&mapping->tree_lock);
return page;
}
-
EXPORT_SYMBOL(find_get_page);
-/*
- * Same as above, but trylock it instead of incrementing the count.
+/**
+ * find_trylock_page - find and lock a page
+ * @mapping: the address_space to search
+ * @offset: the page index
+ *
+ * Same as find_get_page(), but trylock it instead of incrementing the count.
*/
struct page *find_trylock_page(struct address_space *mapping, unsigned long offset)
{
@@ -576,12 +617,10 @@ struct page *find_trylock_page(struct address_space *mapping, unsigned long offs
read_unlock_irq(&mapping->tree_lock);
return page;
}
-
EXPORT_SYMBOL(find_trylock_page);
/**
* find_lock_page - locate, pin and lock a pagecache page
- *
* @mapping: the address_space to search
* @offset: the page index
*
@@ -617,12 +656,10 @@ repeat:
read_unlock_irq(&mapping->tree_lock);
return page;
}
-
EXPORT_SYMBOL(find_lock_page);
/**
* find_or_create_page - locate or add a pagecache page
- *
* @mapping: the page's address_space
* @index: the page's index into the mapping
* @gfp_mask: page allocation mode
@@ -663,7 +700,6 @@ repeat:
page_cache_release(cached_page);
return page;
}
-
EXPORT_SYMBOL(find_or_create_page);
/**
@@ -729,9 +765,16 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
return i;
}
-/*
+/**
+ * find_get_pages_tag - find and return pages that match @tag
+ * @mapping: the address_space to search
+ * @index: the starting page index
+ * @tag: the tag index
+ * @nr_pages: the maximum number of pages
+ * @pages: where the resulting pages are placed
+ *
* Like find_get_pages, except we only return pages which are tagged with
- * `tag'. We update *index to index the next page for the traversal.
+ * @tag. We update @index to index the next page for the traversal.
*/
unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
int tag, unsigned int nr_pages, struct page **pages)
@@ -750,7 +793,11 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
return ret;
}
-/*
+/**
+ * grab_cache_page_nowait - returns locked page at given index in given cache
+ * @mapping: target address_space
+ * @index: the page index
+ *
* Same as grab_cache_page, but do not wait if the page is unavailable.
* This is intended for speculative data generators, where the data can
* be regenerated if the page couldn't be grabbed. This routine should
@@ -779,19 +826,25 @@ grab_cache_page_nowait(struct address_space *mapping, unsigned long index)
}
return page;
}
-
EXPORT_SYMBOL(grab_cache_page_nowait);
-/*
+/**
+ * do_generic_mapping_read - generic file read routine
+ * @mapping: address_space to be read
+ * @_ra: file's readahead state
+ * @filp: the file to read
+ * @ppos: current file position
+ * @desc: read_descriptor
+ * @actor: read method
+ *
* This is a generic file read routine, and uses the
- * mapping->a_ops->readpage() function for the actual low-level
- * stuff.
+ * mapping->a_ops->readpage() function for the actual low-level stuff.
*
* This is really ugly. But the goto's actually try to clarify some
* of the logic when it comes to error handling etc.
*
- * Note the struct file* is only passed for the use of readpage. It may be
- * NULL.
+ * Note the struct file* is only passed for the use of readpage.
+ * It may be NULL.
*/
void do_generic_mapping_read(struct address_space *mapping,
struct file_ra_state *_ra,
@@ -1004,7 +1057,6 @@ out:
if (filp)
file_accessed(filp);
}
-
EXPORT_SYMBOL(do_generic_mapping_read);
int file_read_actor(read_descriptor_t *desc, struct page *page,
@@ -1045,7 +1097,13 @@ success:
return size;
}
-/*
+/**
+ * __generic_file_aio_read - generic filesystem read routine
+ * @iocb: kernel I/O control block
+ * @iov: io vector request
+ * @nr_segs: number of segments in the iovec
+ * @ppos: current file position
+ *
* This is the "read()" routine for all filesystems
* that can use the page cache directly.
*/
@@ -1124,7 +1182,6 @@ __generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
out:
return retval;
}
-
EXPORT_SYMBOL(__generic_file_aio_read);
ssize_t
@@ -1135,7 +1192,6 @@ generic_file_aio_read(struct kiocb *iocb, char __user *buf, size_t count, loff_t
BUG_ON(iocb->ki_pos != pos);
return __generic_file_aio_read(iocb, &local_iov, 1, &iocb->ki_pos);
}
-
EXPORT_SYMBOL(generic_file_aio_read);
ssize_t
@@ -1151,7 +1207,6 @@ generic_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppo
ret = wait_on_sync_kiocb(&kiocb);
return ret;
}
-
EXPORT_SYMBOL(generic_file_read);
int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
@@ -1192,7 +1247,6 @@ ssize_t generic_file_sendfile(struct file *in_file, loff_t *ppos,
return desc.written;
return desc.error;
}
-
EXPORT_SYMBOL(generic_file_sendfile);
static ssize_t
@@ -1228,11 +1282,15 @@ asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count)
}
#ifdef CONFIG_MMU
-/*
+static int FASTCALL(page_cache_read(struct file * file, unsigned long offset));
+/**
+ * page_cache_read - adds requested page to the page cache if not already there
+ * @file: file to read
+ * @offset: page index
+ *
* This adds the requested page to the page cache if it isn't already there,
* and schedules an I/O to read in its contents from disk.
*/
-static int FASTCALL(page_cache_read(struct file * file, unsigned long offset));
static int fastcall page_cache_read(struct file * file, unsigned long offset)
{
struct address_space *mapping = file->f_mapping;
@@ -1259,7 +1317,12 @@ static int fastcall page_cache_read(struct file * file, unsigned long offset)
#define MMAP_LOTSAMISS (100)
-/*
+/**
+ * filemap_nopage - read in file data for page fault handling
+ * @area: the applicable vm_area
+ * @address: target address to read in
+ * @type: returned with VM_FAULT_{MINOR,MAJOR} if not %NULL
+ *
* filemap_nopage() is invoked via the vma operations vector for a
* mapped memory region to read in file data during a page fault.
*
@@ -1462,7 +1525,6 @@ page_not_uptodate:
page_cache_release(page);
return NULL;
}
-
EXPORT_SYMBOL(filemap_nopage);
static struct page * filemap_getpage(struct file *file, unsigned long pgoff,
@@ -1716,7 +1778,13 @@ repeat:
return page;
}
-/*
+/**
+ * read_cache_page - read into page cache, fill it if needed
+ * @mapping: the page's address_space
+ * @index: the page index
+ * @filler: function to perform the read
+ * @data: destination for read data
+ *
* Read into the page cache. If a page already exists,
* and PageUptodate() is not set, try to fill the page.
*/
@@ -1754,7 +1822,6 @@ retry:
out:
return page;
}
-
EXPORT_SYMBOL(read_cache_page);
/*
@@ -1835,7 +1902,7 @@ __filemap_copy_from_user_iovec(char *vaddr,
int copy = min(bytes, iov->iov_len - base);
base = 0;
- left = __copy_from_user_inatomic(vaddr, buf, copy);
+ left = __copy_from_user_inatomic_nocache(vaddr, buf, copy);
copied += copy;
bytes -= copy;
vaddr += copy;
@@ -1854,7 +1921,7 @@ __filemap_copy_from_user_iovec(char *vaddr,
/*
* Performs necessary checks before doing a write
*
- * Can adjust writing position aor amount of bytes to write.
+ * Can adjust writing position or amount of bytes to write.
* Returns appropriate error code that caller should return or
* zero in case that write should be allowed.
*/
diff --git a/mm/filemap.h b/mm/filemap.h
index 13793ba0ce1..5683cde2205 100644
--- a/mm/filemap.h
+++ b/mm/filemap.h
@@ -13,7 +13,7 @@
#include <linux/highmem.h>
#include <linux/uio.h>
#include <linux/config.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
size_t
__filemap_copy_from_user_iovec(char *vaddr,
@@ -34,13 +34,13 @@ filemap_copy_from_user(struct page *page, unsigned long offset,
int left;
kaddr = kmap_atomic(page, KM_USER0);
- left = __copy_from_user_inatomic(kaddr + offset, buf, bytes);
+ left = __copy_from_user_inatomic_nocache(kaddr + offset, buf, bytes);
kunmap_atomic(kaddr, KM_USER0);
if (left != 0) {
/* Do it the slow way */
kaddr = kmap(page);
- left = __copy_from_user(kaddr + offset, buf, bytes);
+ left = __copy_from_user_nocache(kaddr + offset, buf, bytes);
kunmap(page);
}
return bytes - left;
diff --git a/mm/fremap.c b/mm/fremap.c
index 9f381e58bf4..21b7d0cbc98 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -83,6 +83,7 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma,
page_add_file_rmap(page);
pte_val = *pte;
update_mmu_cache(vma, addr, pte_val);
+ lazy_mmu_prot_update(pte_val);
err = 0;
unlock:
pte_unmap_unlock(pte, ptl);
@@ -114,7 +115,13 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff));
pte_val = *pte;
- update_mmu_cache(vma, addr, pte_val);
+ /*
+ * We don't need to run update_mmu_cache() here because the "file pte"
+ * being installed by install_file_pte() is not a real pte - it's a
+ * non-present entry (like a swap entry), noting what file offset should
+ * be mapped there when there's a fault (in a non-linear vma where
+ * that's not obvious).
+ */
pte_unmap_unlock(pte, ptl);
err = 0;
out:
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 832f676ca03..df499973255 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -22,7 +22,7 @@
#include "internal.h"
const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
-static unsigned long nr_huge_pages, free_huge_pages, reserved_huge_pages;
+static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages;
unsigned long max_huge_pages;
static struct list_head hugepage_freelists[MAX_NUMNODES];
static unsigned int nr_huge_pages_node[MAX_NUMNODES];
@@ -123,39 +123,13 @@ static int alloc_fresh_huge_page(void)
static struct page *alloc_huge_page(struct vm_area_struct *vma,
unsigned long addr)
{
- struct inode *inode = vma->vm_file->f_dentry->d_inode;
struct page *page;
- int use_reserve = 0;
- unsigned long idx;
spin_lock(&hugetlb_lock);
-
- if (vma->vm_flags & VM_MAYSHARE) {
-
- /* idx = radix tree index, i.e. offset into file in
- * HPAGE_SIZE units */
- idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
- + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
-
- /* The hugetlbfs specific inode info stores the number
- * of "guaranteed available" (huge) pages. That is,
- * the first 'prereserved_hpages' pages of the inode
- * are either already instantiated, or have been
- * pre-reserved (by hugetlb_reserve_for_inode()). Here
- * we're in the process of instantiating the page, so
- * we use this to determine whether to draw from the
- * pre-reserved pool or the truly free pool. */
- if (idx < HUGETLBFS_I(inode)->prereserved_hpages)
- use_reserve = 1;
- }
-
- if (!use_reserve) {
- if (free_huge_pages <= reserved_huge_pages)
- goto fail;
- } else {
- BUG_ON(reserved_huge_pages == 0);
- reserved_huge_pages--;
- }
+ if (vma->vm_flags & VM_MAYSHARE)
+ resv_huge_pages--;
+ else if (free_huge_pages <= resv_huge_pages)
+ goto fail;
page = dequeue_huge_page(vma, addr);
if (!page)
@@ -165,96 +139,11 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
set_page_refcounted(page);
return page;
- fail:
- WARN_ON(use_reserve); /* reserved allocations shouldn't fail */
+fail:
spin_unlock(&hugetlb_lock);
return NULL;
}
-/* hugetlb_extend_reservation()
- *
- * Ensure that at least 'atleast' hugepages are, and will remain,
- * available to instantiate the first 'atleast' pages of the given
- * inode. If the inode doesn't already have this many pages reserved
- * or instantiated, set aside some hugepages in the reserved pool to
- * satisfy later faults (or fail now if there aren't enough, rather
- * than getting the SIGBUS later).
- */
-int hugetlb_extend_reservation(struct hugetlbfs_inode_info *info,
- unsigned long atleast)
-{
- struct inode *inode = &info->vfs_inode;
- unsigned long change_in_reserve = 0;
- int ret = 0;
-
- spin_lock(&hugetlb_lock);
- read_lock_irq(&inode->i_mapping->tree_lock);
-
- if (info->prereserved_hpages >= atleast)
- goto out;
-
- /* Because we always call this on shared mappings, none of the
- * pages beyond info->prereserved_hpages can have been
- * instantiated, so we need to reserve all of them now. */
- change_in_reserve = atleast - info->prereserved_hpages;
-
- if ((reserved_huge_pages + change_in_reserve) > free_huge_pages) {
- ret = -ENOMEM;
- goto out;
- }
-
- reserved_huge_pages += change_in_reserve;
- info->prereserved_hpages = atleast;
-
- out:
- read_unlock_irq(&inode->i_mapping->tree_lock);
- spin_unlock(&hugetlb_lock);
-
- return ret;
-}
-
-/* hugetlb_truncate_reservation()
- *
- * This returns pages reserved for the given inode to the general free
- * hugepage pool. If the inode has any pages prereserved, but not
- * instantiated, beyond offset (atmost << HPAGE_SIZE), then release
- * them.
- */
-void hugetlb_truncate_reservation(struct hugetlbfs_inode_info *info,
- unsigned long atmost)
-{
- struct inode *inode = &info->vfs_inode;
- struct address_space *mapping = inode->i_mapping;
- unsigned long idx;
- unsigned long change_in_reserve = 0;
- struct page *page;
-
- spin_lock(&hugetlb_lock);
- read_lock_irq(&inode->i_mapping->tree_lock);
-
- if (info->prereserved_hpages <= atmost)
- goto out;
-
- /* Count pages which were reserved, but not instantiated, and
- * which we can now release. */
- for (idx = atmost; idx < info->prereserved_hpages; idx++) {
- page = radix_tree_lookup(&mapping->page_tree, idx);
- if (!page)
- /* Pages which are already instantiated can't
- * be unreserved (and in fact have already
- * been removed from the reserved pool) */
- change_in_reserve++;
- }
-
- BUG_ON(reserved_huge_pages < change_in_reserve);
- reserved_huge_pages -= change_in_reserve;
- info->prereserved_hpages = atmost;
-
- out:
- read_unlock_irq(&inode->i_mapping->tree_lock);
- spin_unlock(&hugetlb_lock);
-}
-
static int __init hugetlb_init(void)
{
unsigned long i;
@@ -334,7 +223,7 @@ static unsigned long set_max_huge_pages(unsigned long count)
return nr_huge_pages;
spin_lock(&hugetlb_lock);
- count = max(count, reserved_huge_pages);
+ count = max(count, resv_huge_pages);
try_to_free_low(count);
while (count < nr_huge_pages) {
struct page *page = dequeue_huge_page(NULL, 0);
@@ -361,11 +250,11 @@ int hugetlb_report_meminfo(char *buf)
return sprintf(buf,
"HugePages_Total: %5lu\n"
"HugePages_Free: %5lu\n"
- "HugePages_Rsvd: %5lu\n"
+ "HugePages_Rsvd: %5lu\n"
"Hugepagesize: %5lu kB\n",
nr_huge_pages,
free_huge_pages,
- reserved_huge_pages,
+ resv_huge_pages,
HPAGE_SIZE/1024);
}
@@ -754,3 +643,156 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
flush_tlb_range(vma, start, end);
}
+struct file_region {
+ struct list_head link;
+ long from;
+ long to;
+};
+
+static long region_add(struct list_head *head, long f, long t)
+{
+ struct file_region *rg, *nrg, *trg;
+
+ /* Locate the region we are either in or before. */
+ list_for_each_entry(rg, head, link)
+ if (f <= rg->to)
+ break;
+
+ /* Round our left edge to the current segment if it encloses us. */
+ if (f > rg->from)
+ f = rg->from;
+
+ /* Check for and consume any regions we now overlap with. */
+ nrg = rg;
+ list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
+ if (&rg->link == head)
+ break;
+ if (rg->from > t)
+ break;
+
+ /* If this area reaches higher then extend our area to
+ * include it completely. If this is not the first area
+ * which we intend to reuse, free it. */
+ if (rg->to > t)
+ t = rg->to;
+ if (rg != nrg) {
+ list_del(&rg->link);
+ kfree(rg);
+ }
+ }
+ nrg->from = f;
+ nrg->to = t;
+ return 0;
+}
+
+static long region_chg(struct list_head *head, long f, long t)
+{
+ struct file_region *rg, *nrg;
+ long chg = 0;
+
+ /* Locate the region we are before or in. */
+ list_for_each_entry(rg, head, link)
+ if (f <= rg->to)
+ break;
+
+ /* If we are below the current region then a new region is required.
+ * Subtle, allocate a new region at the position but make it zero
+ * size such that we can guarentee to record the reservation. */
+ if (&rg->link == head || t < rg->from) {
+ nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
+ if (nrg == 0)
+ return -ENOMEM;
+ nrg->from = f;
+ nrg->to = f;
+ INIT_LIST_HEAD(&nrg->link);
+ list_add(&nrg->link, rg->link.prev);
+
+ return t - f;
+ }
+
+ /* Round our left edge to the current segment if it encloses us. */
+ if (f > rg->from)
+ f = rg->from;
+ chg = t - f;
+
+ /* Check for and consume any regions we now overlap with. */
+ list_for_each_entry(rg, rg->link.prev, link) {
+ if (&rg->link == head)
+ break;
+ if (rg->from > t)
+ return chg;
+
+ /* We overlap with this area, if it extends futher than
+ * us then we must extend ourselves. Account for its
+ * existing reservation. */
+ if (rg->to > t) {
+ chg += rg->to - t;
+ t = rg->to;
+ }
+ chg -= rg->to - rg->from;
+ }
+ return chg;
+}
+
+static long region_truncate(struct list_head *head, long end)
+{
+ struct file_region *rg, *trg;
+ long chg = 0;
+
+ /* Locate the region we are either in or before. */
+ list_for_each_entry(rg, head, link)
+ if (end <= rg->to)
+ break;
+ if (&rg->link == head)
+ return 0;
+
+ /* If we are in the middle of a region then adjust it. */
+ if (end > rg->from) {
+ chg = rg->to - end;
+ rg->to = end;
+ rg = list_entry(rg->link.next, typeof(*rg), link);
+ }
+
+ /* Drop any remaining regions. */
+ list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
+ if (&rg->link == head)
+ break;
+ chg += rg->to - rg->from;
+ list_del(&rg->link);
+ kfree(rg);
+ }
+ return chg;
+}
+
+static int hugetlb_acct_memory(long delta)
+{
+ int ret = -ENOMEM;
+
+ spin_lock(&hugetlb_lock);
+ if ((delta + resv_huge_pages) <= free_huge_pages) {
+ resv_huge_pages += delta;
+ ret = 0;
+ }
+ spin_unlock(&hugetlb_lock);
+ return ret;
+}
+
+int hugetlb_reserve_pages(struct inode *inode, long from, long to)
+{
+ long ret, chg;
+
+ chg = region_chg(&inode->i_mapping->private_list, from, to);
+ if (chg < 0)
+ return chg;
+ ret = hugetlb_acct_memory(chg);
+ if (ret < 0)
+ return ret;
+ region_add(&inode->i_mapping->private_list, from, to);
+ return 0;
+}
+
+void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
+{
+ long chg = region_truncate(&inode->i_mapping->private_list, offset);
+ hugetlb_acct_memory(freed - chg);
+}
diff --git a/mm/memory.c b/mm/memory.c
index 0ec7bc64427..247b5c312b9 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -434,7 +434,9 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
/* pte contains position in swap or file, so copy. */
if (unlikely(!pte_present(pte))) {
if (!pte_file(pte)) {
- swap_duplicate(pte_to_swp_entry(pte));
+ swp_entry_t entry = pte_to_swp_entry(pte);
+
+ swap_duplicate(entry);
/* make sure dst_mm is on swapoff's mmlist. */
if (unlikely(list_empty(&dst_mm->mmlist))) {
spin_lock(&mmlist_lock);
@@ -443,6 +445,16 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
&src_mm->mmlist);
spin_unlock(&mmlist_lock);
}
+ if (is_write_migration_entry(entry) &&
+ is_cow_mapping(vm_flags)) {
+ /*
+ * COW mappings require pages in both parent
+ * and child to be set to read.
+ */
+ make_migration_entry_read(&entry);
+ pte = swp_entry_to_pte(entry);
+ set_pte_at(src_mm, addr, src_pte, pte);
+ }
}
goto out_set_pte;
}
@@ -1445,25 +1457,60 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
{
struct page *old_page, *new_page;
pte_t entry;
- int ret = VM_FAULT_MINOR;
+ int reuse, ret = VM_FAULT_MINOR;
old_page = vm_normal_page(vma, address, orig_pte);
if (!old_page)
goto gotten;
- if (PageAnon(old_page) && !TestSetPageLocked(old_page)) {
- int reuse = can_share_swap_page(old_page);
- unlock_page(old_page);
- if (reuse) {
- flush_cache_page(vma, address, pte_pfn(orig_pte));
- entry = pte_mkyoung(orig_pte);
- entry = maybe_mkwrite(pte_mkdirty(entry), vma);
- ptep_set_access_flags(vma, address, page_table, entry, 1);
- update_mmu_cache(vma, address, entry);
- lazy_mmu_prot_update(entry);
- ret |= VM_FAULT_WRITE;
- goto unlock;
+ if (unlikely((vma->vm_flags & (VM_SHARED|VM_WRITE)) ==
+ (VM_SHARED|VM_WRITE))) {
+ if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
+ /*
+ * Notify the address space that the page is about to
+ * become writable so that it can prohibit this or wait
+ * for the page to get into an appropriate state.
+ *
+ * We do this without the lock held, so that it can
+ * sleep if it needs to.
+ */
+ page_cache_get(old_page);
+ pte_unmap_unlock(page_table, ptl);
+
+ if (vma->vm_ops->page_mkwrite(vma, old_page) < 0)
+ goto unwritable_page;
+
+ page_cache_release(old_page);
+
+ /*
+ * Since we dropped the lock we need to revalidate
+ * the PTE as someone else may have changed it. If
+ * they did, we just return, as we can count on the
+ * MMU to tell us if they didn't also make it writable.
+ */
+ page_table = pte_offset_map_lock(mm, pmd, address,
+ &ptl);
+ if (!pte_same(*page_table, orig_pte))
+ goto unlock;
}
+
+ reuse = 1;
+ } else if (PageAnon(old_page) && !TestSetPageLocked(old_page)) {
+ reuse = can_share_swap_page(old_page);
+ unlock_page(old_page);
+ } else {
+ reuse = 0;
+ }
+
+ if (reuse) {
+ flush_cache_page(vma, address, pte_pfn(orig_pte));
+ entry = pte_mkyoung(orig_pte);
+ entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ ptep_set_access_flags(vma, address, page_table, entry, 1);
+ update_mmu_cache(vma, address, entry);
+ lazy_mmu_prot_update(entry);
+ ret |= VM_FAULT_WRITE;
+ goto unlock;
}
/*
@@ -1523,6 +1570,10 @@ oom:
if (old_page)
page_cache_release(old_page);
return VM_FAULT_OOM;
+
+unwritable_page:
+ page_cache_release(old_page);
+ return VM_FAULT_SIGBUS;
}
/*
@@ -1879,7 +1930,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
goto out;
entry = pte_to_swp_entry(orig_pte);
-again:
+ if (is_migration_entry(entry)) {
+ migration_entry_wait(mm, pmd, address);
+ goto out;
+ }
page = lookup_swap_cache(entry);
if (!page) {
swapin_readahead(entry, address, vma);
@@ -1903,12 +1957,6 @@ again:
mark_page_accessed(page);
lock_page(page);
- if (!PageSwapCache(page)) {
- /* Page migration has occured */
- unlock_page(page);
- page_cache_release(page);
- goto again;
- }
/*
* Back out if somebody else already faulted in this pte.
@@ -2074,18 +2122,31 @@ retry:
/*
* Should we do an early C-O-W break?
*/
- if (write_access && !(vma->vm_flags & VM_SHARED)) {
- struct page *page;
+ if (write_access) {
+ if (!(vma->vm_flags & VM_SHARED)) {
+ struct page *page;
- if (unlikely(anon_vma_prepare(vma)))
- goto oom;
- page = alloc_page_vma(GFP_HIGHUSER, vma, address);
- if (!page)
- goto oom;
- copy_user_highpage(page, new_page, address);
- page_cache_release(new_page);
- new_page = page;
- anon = 1;
+ if (unlikely(anon_vma_prepare(vma)))
+ goto oom;
+ page = alloc_page_vma(GFP_HIGHUSER, vma, address);
+ if (!page)
+ goto oom;
+ copy_user_highpage(page, new_page, address);
+ page_cache_release(new_page);
+ new_page = page;
+ anon = 1;
+
+ } else {
+ /* if the page will be shareable, see if the backing
+ * address space wants to know that the page is about
+ * to become writable */
+ if (vma->vm_ops->page_mkwrite &&
+ vma->vm_ops->page_mkwrite(vma, new_page) < 0
+ ) {
+ page_cache_release(new_page);
+ return VM_FAULT_SIGBUS;
+ }
+ }
}
page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 70df5c0d957..841a077d5ae 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -26,7 +26,7 @@
extern void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn,
unsigned long size);
-static void __add_zone(struct zone *zone, unsigned long phys_start_pfn)
+static int __add_zone(struct zone *zone, unsigned long phys_start_pfn)
{
struct pglist_data *pgdat = zone->zone_pgdat;
int nr_pages = PAGES_PER_SECTION;
@@ -34,8 +34,15 @@ static void __add_zone(struct zone *zone, unsigned long phys_start_pfn)
int zone_type;
zone_type = zone - pgdat->node_zones;
+ if (!populated_zone(zone)) {
+ int ret = 0;
+ ret = init_currently_empty_zone(zone, phys_start_pfn, nr_pages);
+ if (ret < 0)
+ return ret;
+ }
memmap_init_zone(nr_pages, nid, zone_type, phys_start_pfn);
zonetable_add(zone, nid, zone_type, phys_start_pfn, nr_pages);
+ return 0;
}
extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
@@ -50,7 +57,11 @@ static int __add_section(struct zone *zone, unsigned long phys_start_pfn)
if (ret < 0)
return ret;
- __add_zone(zone, phys_start_pfn);
+ ret = __add_zone(zone, phys_start_pfn);
+
+ if (ret < 0)
+ return ret;
+
return register_new_memory(__pfn_to_section(phys_start_pfn));
}
@@ -116,6 +127,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
unsigned long flags;
unsigned long onlined_pages = 0;
struct zone *zone;
+ int need_zonelists_rebuild = 0;
/*
* This doesn't need a lock to do pfn_to_page().
@@ -128,6 +140,14 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
grow_pgdat_span(zone->zone_pgdat, pfn, pfn + nr_pages);
pgdat_resize_unlock(zone->zone_pgdat, &flags);
+ /*
+ * If this zone is not populated, then it is not in zonelist.
+ * This means the page allocator ignores this zone.
+ * So, zonelist must be updated after online.
+ */
+ if (!populated_zone(zone))
+ need_zonelists_rebuild = 1;
+
for (i = 0; i < nr_pages; i++) {
struct page *page = pfn_to_page(pfn + i);
online_page(page);
@@ -138,5 +158,8 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
setup_per_zone_pages_min();
+ if (need_zonelists_rebuild)
+ build_all_zonelists();
+ vm_total_pages = nr_free_pagecache_pages();
return 0;
}
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 8778f58880c..ec4a1a950df 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -87,6 +87,8 @@
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/migrate.h>
+#include <linux/rmap.h>
+#include <linux/security.h>
#include <asm/tlbflush.h>
#include <asm/uaccess.h>
@@ -587,6 +589,11 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
isolate_lru_page(page, pagelist);
}
+static struct page *new_node_page(struct page *page, unsigned long node, int **x)
+{
+ return alloc_pages_node(node, GFP_HIGHUSER, 0);
+}
+
/*
* Migrate pages from one node to a target node.
* Returns error or the number of pages not migrated.
@@ -603,11 +610,9 @@ int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags)
check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
flags | MPOL_MF_DISCONTIG_OK, &pagelist);
- if (!list_empty(&pagelist)) {
- err = migrate_pages_to(&pagelist, NULL, dest);
- if (!list_empty(&pagelist))
- putback_lru_pages(&pagelist);
- }
+ if (!list_empty(&pagelist))
+ err = migrate_pages(&pagelist, new_node_page, dest);
+
return err;
}
@@ -694,6 +699,12 @@ int do_migrate_pages(struct mm_struct *mm,
}
+static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
+{
+ struct vm_area_struct *vma = (struct vm_area_struct *)private;
+
+ return alloc_page_vma(GFP_HIGHUSER, vma, page_address_in_vma(page, vma));
+}
#else
static void migrate_page_add(struct page *page, struct list_head *pagelist,
@@ -706,6 +717,11 @@ int do_migrate_pages(struct mm_struct *mm,
{
return -ENOSYS;
}
+
+static struct page *new_vma_page(struct page *page, unsigned long private)
+{
+ return NULL;
+}
#endif
long do_mbind(unsigned long start, unsigned long len,
@@ -767,15 +783,13 @@ long do_mbind(unsigned long start, unsigned long len,
err = mbind_range(vma, start, end, new);
if (!list_empty(&pagelist))
- nr_failed = migrate_pages_to(&pagelist, vma, -1);
+ nr_failed = migrate_pages(&pagelist, new_vma_page,
+ (unsigned long)vma);
if (!err && nr_failed && (flags & MPOL_MF_STRICT))
err = -EIO;
}
- if (!list_empty(&pagelist))
- putback_lru_pages(&pagelist);
-
up_write(&mm->mmap_sem);
mpol_free(new);
return err;
@@ -929,6 +943,10 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
goto out;
}
+ err = security_task_movememory(task);
+ if (err)
+ goto out;
+
err = do_migrate_pages(mm, &old, &new,
capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
out:
diff --git a/mm/migrate.c b/mm/migrate.c
index 1c25040693d..1c2a71aa05c 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -15,6 +15,7 @@
#include <linux/migrate.h>
#include <linux/module.h>
#include <linux/swap.h>
+#include <linux/swapops.h>
#include <linux/pagemap.h>
#include <linux/buffer_head.h>
#include <linux/mm_inline.h>
@@ -23,13 +24,13 @@
#include <linux/topology.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
-#include <linux/swapops.h>
+#include <linux/writeback.h>
+#include <linux/mempolicy.h>
+#include <linux/vmalloc.h>
+#include <linux/security.h>
#include "internal.h"
-/* The maximum number of pages to take off the LRU for migration */
-#define MIGRATE_CHUNK_SIZE 256
-
#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
/*
@@ -64,16 +65,11 @@ int isolate_lru_page(struct page *page, struct list_head *pagelist)
}
/*
- * migrate_prep() needs to be called after we have compiled the list of pages
- * to be migrated using isolate_lru_page() but before we begin a series of calls
- * to migrate_pages().
+ * migrate_prep() needs to be called before we start compiling a list of pages
+ * to be migrated using isolate_lru_page().
*/
int migrate_prep(void)
{
- /* Must have swap device for migration */
- if (nr_swap_pages <= 0)
- return -ENODEV;
-
/*
* Clear the LRU lists so pages can be isolated.
* Note that pages may be moved off the LRU after we have
@@ -87,7 +83,6 @@ int migrate_prep(void)
static inline void move_to_lru(struct page *page)
{
- list_del(&page->lru);
if (PageActive(page)) {
/*
* lru_cache_add_active checks that
@@ -113,113 +108,200 @@ int putback_lru_pages(struct list_head *l)
int count = 0;
list_for_each_entry_safe(page, page2, l, lru) {
+ list_del(&page->lru);
move_to_lru(page);
count++;
}
return count;
}
-/*
- * Non migratable page
- */
-int fail_migrate_page(struct page *newpage, struct page *page)
+static inline int is_swap_pte(pte_t pte)
{
- return -EIO;
+ return !pte_none(pte) && !pte_present(pte) && !pte_file(pte);
}
-EXPORT_SYMBOL(fail_migrate_page);
/*
- * swapout a single page
- * page is locked upon entry, unlocked on exit
+ * Restore a potential migration pte to a working pte entry
*/
-static int swap_page(struct page *page)
+static void remove_migration_pte(struct vm_area_struct *vma,
+ struct page *old, struct page *new)
{
- struct address_space *mapping = page_mapping(page);
+ struct mm_struct *mm = vma->vm_mm;
+ swp_entry_t entry;
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *ptep, pte;
+ spinlock_t *ptl;
+ unsigned long addr = page_address_in_vma(new, vma);
+
+ if (addr == -EFAULT)
+ return;
+
+ pgd = pgd_offset(mm, addr);
+ if (!pgd_present(*pgd))
+ return;
+
+ pud = pud_offset(pgd, addr);
+ if (!pud_present(*pud))
+ return;
+
+ pmd = pmd_offset(pud, addr);
+ if (!pmd_present(*pmd))
+ return;
+
+ ptep = pte_offset_map(pmd, addr);
+
+ if (!is_swap_pte(*ptep)) {
+ pte_unmap(ptep);
+ return;
+ }
- if (page_mapped(page) && mapping)
- if (try_to_unmap(page, 1) != SWAP_SUCCESS)
- goto unlock_retry;
+ ptl = pte_lockptr(mm, pmd);
+ spin_lock(ptl);
+ pte = *ptep;
+ if (!is_swap_pte(pte))
+ goto out;
- if (PageDirty(page)) {
- /* Page is dirty, try to write it out here */
- switch(pageout(page, mapping)) {
- case PAGE_KEEP:
- case PAGE_ACTIVATE:
- goto unlock_retry;
+ entry = pte_to_swp_entry(pte);
- case PAGE_SUCCESS:
- goto retry;
+ if (!is_migration_entry(entry) || migration_entry_to_page(entry) != old)
+ goto out;
- case PAGE_CLEAN:
- ; /* try to free the page below */
- }
- }
+ get_page(new);
+ pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
+ if (is_write_migration_entry(entry))
+ pte = pte_mkwrite(pte);
+ set_pte_at(mm, addr, ptep, pte);
- if (PagePrivate(page)) {
- if (!try_to_release_page(page, GFP_KERNEL) ||
- (!mapping && page_count(page) == 1))
- goto unlock_retry;
- }
+ if (PageAnon(new))
+ page_add_anon_rmap(new, vma, addr);
+ else
+ page_add_file_rmap(new);
- if (remove_mapping(mapping, page)) {
- /* Success */
- unlock_page(page);
- return 0;
- }
+ /* No need to invalidate - it was non-present before */
+ update_mmu_cache(vma, addr, pte);
+ lazy_mmu_prot_update(pte);
-unlock_retry:
- unlock_page(page);
+out:
+ pte_unmap_unlock(ptep, ptl);
+}
-retry:
- return -EAGAIN;
+/*
+ * Note that remove_file_migration_ptes will only work on regular mappings,
+ * Nonlinear mappings do not use migration entries.
+ */
+static void remove_file_migration_ptes(struct page *old, struct page *new)
+{
+ struct vm_area_struct *vma;
+ struct address_space *mapping = page_mapping(new);
+ struct prio_tree_iter iter;
+ pgoff_t pgoff = new->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+
+ if (!mapping)
+ return;
+
+ spin_lock(&mapping->i_mmap_lock);
+
+ vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff)
+ remove_migration_pte(vma, old, new);
+
+ spin_unlock(&mapping->i_mmap_lock);
}
/*
- * Remove references for a page and establish the new page with the correct
- * basic settings to be able to stop accesses to the page.
+ * Must hold mmap_sem lock on at least one of the vmas containing
+ * the page so that the anon_vma cannot vanish.
*/
-int migrate_page_remove_references(struct page *newpage,
- struct page *page, int nr_refs)
+static void remove_anon_migration_ptes(struct page *old, struct page *new)
{
- struct address_space *mapping = page_mapping(page);
- struct page **radix_pointer;
+ struct anon_vma *anon_vma;
+ struct vm_area_struct *vma;
+ unsigned long mapping;
- /*
- * Avoid doing any of the following work if the page count
- * indicates that the page is in use or truncate has removed
- * the page.
- */
- if (!mapping || page_mapcount(page) + nr_refs != page_count(page))
- return -EAGAIN;
+ mapping = (unsigned long)new->mapping;
- /*
- * Establish swap ptes for anonymous pages or destroy pte
- * maps for files.
- *
- * In order to reestablish file backed mappings the fault handlers
- * will take the radix tree_lock which may then be used to stop
- * processses from accessing this page until the new page is ready.
- *
- * A process accessing via a swap pte (an anonymous page) will take a
- * page_lock on the old page which will block the process until the
- * migration attempt is complete. At that time the PageSwapCache bit
- * will be examined. If the page was migrated then the PageSwapCache
- * bit will be clear and the operation to retrieve the page will be
- * retried which will find the new page in the radix tree. Then a new
- * direct mapping may be generated based on the radix tree contents.
- *
- * If the page was not migrated then the PageSwapCache bit
- * is still set and the operation may continue.
- */
- if (try_to_unmap(page, 1) == SWAP_FAIL)
- /* A vma has VM_LOCKED set -> permanent failure */
- return -EPERM;
+ if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0)
+ return;
/*
- * Give up if we were unable to remove all mappings.
+ * We hold the mmap_sem lock. So no need to call page_lock_anon_vma.
*/
- if (page_mapcount(page))
- return -EAGAIN;
+ anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON);
+ spin_lock(&anon_vma->lock);
+
+ list_for_each_entry(vma, &anon_vma->head, anon_vma_node)
+ remove_migration_pte(vma, old, new);
+
+ spin_unlock(&anon_vma->lock);
+}
+
+/*
+ * Get rid of all migration entries and replace them by
+ * references to the indicated page.
+ */
+static void remove_migration_ptes(struct page *old, struct page *new)
+{
+ if (PageAnon(new))
+ remove_anon_migration_ptes(old, new);
+ else
+ remove_file_migration_ptes(old, new);
+}
+
+/*
+ * Something used the pte of a page under migration. We need to
+ * get to the page and wait until migration is finished.
+ * When we return from this function the fault will be retried.
+ *
+ * This function is called from do_swap_page().
+ */
+void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
+ unsigned long address)
+{
+ pte_t *ptep, pte;
+ spinlock_t *ptl;
+ swp_entry_t entry;
+ struct page *page;
+
+ ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
+ pte = *ptep;
+ if (!is_swap_pte(pte))
+ goto out;
+
+ entry = pte_to_swp_entry(pte);
+ if (!is_migration_entry(entry))
+ goto out;
+
+ page = migration_entry_to_page(entry);
+
+ get_page(page);
+ pte_unmap_unlock(ptep, ptl);
+ wait_on_page_locked(page);
+ put_page(page);
+ return;
+out:
+ pte_unmap_unlock(ptep, ptl);
+}
+
+/*
+ * Replace the page in the mapping.
+ *
+ * The number of remaining references must be:
+ * 1 for anonymous pages without a mapping
+ * 2 for pages with a mapping
+ * 3 for pages with a mapping and PagePrivate set.
+ */
+static int migrate_page_move_mapping(struct address_space *mapping,
+ struct page *newpage, struct page *page)
+{
+ struct page **radix_pointer;
+
+ if (!mapping) {
+ /* Anonymous page */
+ if (page_count(page) != 1)
+ return -EAGAIN;
+ return 0;
+ }
write_lock_irq(&mapping->tree_lock);
@@ -227,7 +309,7 @@ int migrate_page_remove_references(struct page *newpage,
&mapping->page_tree,
page_index(page));
- if (!page_mapping(page) || page_count(page) != nr_refs ||
+ if (page_count(page) != 2 + !!PagePrivate(page) ||
*radix_pointer != page) {
write_unlock_irq(&mapping->tree_lock);
return -EAGAIN;
@@ -235,19 +317,14 @@ int migrate_page_remove_references(struct page *newpage,
/*
* Now we know that no one else is looking at the page.
- *
- * Certain minimal information about a page must be available
- * in order for other subsystems to properly handle the page if they
- * find it through the radix tree update before we are finished
- * copying the page.
*/
get_page(newpage);
- newpage->index = page->index;
- newpage->mapping = page->mapping;
+#ifdef CONFIG_SWAP
if (PageSwapCache(page)) {
SetPageSwapCache(newpage);
set_page_private(newpage, page_private(page));
}
+#endif
*radix_pointer = newpage;
__put_page(page);
@@ -255,12 +332,11 @@ int migrate_page_remove_references(struct page *newpage,
return 0;
}
-EXPORT_SYMBOL(migrate_page_remove_references);
/*
* Copy the page to its new location
*/
-void migrate_page_copy(struct page *newpage, struct page *page)
+static void migrate_page_copy(struct page *newpage, struct page *page)
{
copy_highpage(newpage, page);
@@ -282,7 +358,9 @@ void migrate_page_copy(struct page *newpage, struct page *page)
set_page_dirty(newpage);
}
+#ifdef CONFIG_SWAP
ClearPageSwapCache(page);
+#endif
ClearPageActive(page);
ClearPagePrivate(page);
set_page_private(page, 0);
@@ -295,7 +373,18 @@ void migrate_page_copy(struct page *newpage, struct page *page)
if (PageWriteback(newpage))
end_page_writeback(newpage);
}
-EXPORT_SYMBOL(migrate_page_copy);
+
+/************************************************************
+ * Migration functions
+ ***********************************************************/
+
+/* Always fail migration. Used for mappings that are not movable */
+int fail_migrate_page(struct address_space *mapping,
+ struct page *newpage, struct page *page)
+{
+ return -EIO;
+}
+EXPORT_SYMBOL(fail_migrate_page);
/*
* Common logic to directly migrate a single page suitable for
@@ -303,51 +392,286 @@ EXPORT_SYMBOL(migrate_page_copy);
*
* Pages are locked upon entry and exit.
*/
-int migrate_page(struct page *newpage, struct page *page)
+int migrate_page(struct address_space *mapping,
+ struct page *newpage, struct page *page)
{
int rc;
BUG_ON(PageWriteback(page)); /* Writeback must be complete */
- rc = migrate_page_remove_references(newpage, page, 2);
+ rc = migrate_page_move_mapping(mapping, newpage, page);
+
+ if (rc)
+ return rc;
+
+ migrate_page_copy(newpage, page);
+ return 0;
+}
+EXPORT_SYMBOL(migrate_page);
+
+/*
+ * Migration function for pages with buffers. This function can only be used
+ * if the underlying filesystem guarantees that no other references to "page"
+ * exist.
+ */
+int buffer_migrate_page(struct address_space *mapping,
+ struct page *newpage, struct page *page)
+{
+ struct buffer_head *bh, *head;
+ int rc;
+
+ if (!page_has_buffers(page))
+ return migrate_page(mapping, newpage, page);
+
+ head = page_buffers(page);
+
+ rc = migrate_page_move_mapping(mapping, newpage, page);
if (rc)
return rc;
+ bh = head;
+ do {
+ get_bh(bh);
+ lock_buffer(bh);
+ bh = bh->b_this_page;
+
+ } while (bh != head);
+
+ ClearPagePrivate(page);
+ set_page_private(newpage, page_private(page));
+ set_page_private(page, 0);
+ put_page(page);
+ get_page(newpage);
+
+ bh = head;
+ do {
+ set_bh_page(bh, newpage, bh_offset(bh));
+ bh = bh->b_this_page;
+
+ } while (bh != head);
+
+ SetPagePrivate(newpage);
+
migrate_page_copy(newpage, page);
+ bh = head;
+ do {
+ unlock_buffer(bh);
+ put_bh(bh);
+ bh = bh->b_this_page;
+
+ } while (bh != head);
+
+ return 0;
+}
+EXPORT_SYMBOL(buffer_migrate_page);
+
+/*
+ * Writeback a page to clean the dirty state
+ */
+static int writeout(struct address_space *mapping, struct page *page)
+{
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_NONE,
+ .nr_to_write = 1,
+ .range_start = 0,
+ .range_end = LLONG_MAX,
+ .nonblocking = 1,
+ .for_reclaim = 1
+ };
+ int rc;
+
+ if (!mapping->a_ops->writepage)
+ /* No write method for the address space */
+ return -EINVAL;
+
+ if (!clear_page_dirty_for_io(page))
+ /* Someone else already triggered a write */
+ return -EAGAIN;
+
/*
- * Remove auxiliary swap entries and replace
- * them with real ptes.
- *
- * Note that a real pte entry will allow processes that are not
- * waiting on the page lock to use the new page via the page tables
- * before the new page is unlocked.
+ * A dirty page may imply that the underlying filesystem has
+ * the page on some queue. So the page must be clean for
+ * migration. Writeout may mean we loose the lock and the
+ * page state is no longer what we checked for earlier.
+ * At this point we know that the migration attempt cannot
+ * be successful.
*/
- remove_from_swap(newpage);
- return 0;
+ remove_migration_ptes(page, page);
+
+ rc = mapping->a_ops->writepage(page, &wbc);
+ if (rc < 0)
+ /* I/O Error writing */
+ return -EIO;
+
+ if (rc != AOP_WRITEPAGE_ACTIVATE)
+ /* unlocked. Relock */
+ lock_page(page);
+
+ return -EAGAIN;
+}
+
+/*
+ * Default handling if a filesystem does not provide a migration function.
+ */
+static int fallback_migrate_page(struct address_space *mapping,
+ struct page *newpage, struct page *page)
+{
+ if (PageDirty(page))
+ return writeout(mapping, page);
+
+ /*
+ * Buffers may be managed in a filesystem specific way.
+ * We must have no buffers or drop them.
+ */
+ if (page_has_buffers(page) &&
+ !try_to_release_page(page, GFP_KERNEL))
+ return -EAGAIN;
+
+ return migrate_page(mapping, newpage, page);
+}
+
+/*
+ * Move a page to a newly allocated page
+ * The page is locked and all ptes have been successfully removed.
+ *
+ * The new page will have replaced the old page if this function
+ * is successful.
+ */
+static int move_to_new_page(struct page *newpage, struct page *page)
+{
+ struct address_space *mapping;
+ int rc;
+
+ /*
+ * Block others from accessing the page when we get around to
+ * establishing additional references. We are the only one
+ * holding a reference to the new page at this point.
+ */
+ if (TestSetPageLocked(newpage))
+ BUG();
+
+ /* Prepare mapping for the new page.*/
+ newpage->index = page->index;
+ newpage->mapping = page->mapping;
+
+ mapping = page_mapping(page);
+ if (!mapping)
+ rc = migrate_page(mapping, newpage, page);
+ else if (mapping->a_ops->migratepage)
+ /*
+ * Most pages have a mapping and most filesystems
+ * should provide a migration function. Anonymous
+ * pages are part of swap space which also has its
+ * own migration function. This is the most common
+ * path for page migration.
+ */
+ rc = mapping->a_ops->migratepage(mapping,
+ newpage, page);
+ else
+ rc = fallback_migrate_page(mapping, newpage, page);
+
+ if (!rc)
+ remove_migration_ptes(page, newpage);
+ else
+ newpage->mapping = NULL;
+
+ unlock_page(newpage);
+
+ return rc;
+}
+
+/*
+ * Obtain the lock on page, remove all ptes and migrate the page
+ * to the newly allocated page in newpage.
+ */
+static int unmap_and_move(new_page_t get_new_page, unsigned long private,
+ struct page *page, int force)
+{
+ int rc = 0;
+ int *result = NULL;
+ struct page *newpage = get_new_page(page, private, &result);
+
+ if (!newpage)
+ return -ENOMEM;
+
+ if (page_count(page) == 1)
+ /* page was freed from under us. So we are done. */
+ goto move_newpage;
+
+ rc = -EAGAIN;
+ if (TestSetPageLocked(page)) {
+ if (!force)
+ goto move_newpage;
+ lock_page(page);
+ }
+
+ if (PageWriteback(page)) {
+ if (!force)
+ goto unlock;
+ wait_on_page_writeback(page);
+ }
+
+ /*
+ * Establish migration ptes or remove ptes
+ */
+ if (try_to_unmap(page, 1) != SWAP_FAIL) {
+ if (!page_mapped(page))
+ rc = move_to_new_page(newpage, page);
+ } else
+ /* A vma has VM_LOCKED set -> permanent failure */
+ rc = -EPERM;
+
+ if (rc)
+ remove_migration_ptes(page, page);
+unlock:
+ unlock_page(page);
+
+ if (rc != -EAGAIN) {
+ /*
+ * A page that has been migrated has all references
+ * removed and will be freed. A page that has not been
+ * migrated will have kepts its references and be
+ * restored.
+ */
+ list_del(&page->lru);
+ move_to_lru(page);
+ }
+
+move_newpage:
+ /*
+ * Move the new page to the LRU. If migration was not successful
+ * then this will free the page.
+ */
+ move_to_lru(newpage);
+ if (result) {
+ if (rc)
+ *result = rc;
+ else
+ *result = page_to_nid(newpage);
+ }
+ return rc;
}
-EXPORT_SYMBOL(migrate_page);
/*
* migrate_pages
*
- * Two lists are passed to this function. The first list
- * contains the pages isolated from the LRU to be migrated.
- * The second list contains new pages that the pages isolated
- * can be moved to. If the second list is NULL then all
- * pages are swapped out.
+ * The function takes one list of pages to migrate and a function
+ * that determines from the page to be migrated and the private data
+ * the target of the move and allocates the page.
*
* The function returns after 10 attempts or if no pages
* are movable anymore because to has become empty
- * or no retryable pages exist anymore.
+ * or no retryable pages exist anymore. All pages will be
+ * retruned to the LRU or freed.
*
- * Return: Number of pages not migrated when "to" ran empty.
+ * Return: Number of pages not migrated or error code.
*/
-int migrate_pages(struct list_head *from, struct list_head *to,
- struct list_head *moved, struct list_head *failed)
+int migrate_pages(struct list_head *from,
+ new_page_t get_new_page, unsigned long private)
{
- int retry;
+ int retry = 1;
int nr_failed = 0;
int pass = 0;
struct page *page;
@@ -358,305 +682,297 @@ int migrate_pages(struct list_head *from, struct list_head *to,
if (!swapwrite)
current->flags |= PF_SWAPWRITE;
-redo:
- retry = 0;
+ for(pass = 0; pass < 10 && retry; pass++) {
+ retry = 0;
+
+ list_for_each_entry_safe(page, page2, from, lru) {
+ cond_resched();
+
+ rc = unmap_and_move(get_new_page, private,
+ page, pass > 2);
+
+ switch(rc) {
+ case -ENOMEM:
+ goto out;
+ case -EAGAIN:
+ retry++;
+ break;
+ case 0:
+ break;
+ default:
+ /* Permanent failure */
+ nr_failed++;
+ break;
+ }
+ }
+ }
+ rc = 0;
+out:
+ if (!swapwrite)
+ current->flags &= ~PF_SWAPWRITE;
- list_for_each_entry_safe(page, page2, from, lru) {
- struct page *newpage = NULL;
- struct address_space *mapping;
+ putback_lru_pages(from);
- cond_resched();
+ if (rc)
+ return rc;
- rc = 0;
- if (page_count(page) == 1)
- /* page was freed from under us. So we are done. */
- goto next;
+ return nr_failed + retry;
+}
- if (to && list_empty(to))
- break;
+#ifdef CONFIG_NUMA
+/*
+ * Move a list of individual pages
+ */
+struct page_to_node {
+ unsigned long addr;
+ struct page *page;
+ int node;
+ int status;
+};
- /*
- * Skip locked pages during the first two passes to give the
- * functions holding the lock time to release the page. Later we
- * use lock_page() to have a higher chance of acquiring the
- * lock.
- */
- rc = -EAGAIN;
- if (pass > 2)
- lock_page(page);
- else
- if (TestSetPageLocked(page))
- goto next;
+static struct page *new_page_node(struct page *p, unsigned long private,
+ int **result)
+{
+ struct page_to_node *pm = (struct page_to_node *)private;
- /*
- * Only wait on writeback if we have already done a pass where
- * we we may have triggered writeouts for lots of pages.
- */
- if (pass > 0) {
- wait_on_page_writeback(page);
- } else {
- if (PageWriteback(page))
- goto unlock_page;
- }
+ while (pm->node != MAX_NUMNODES && pm->page != p)
+ pm++;
- /*
- * Anonymous pages must have swap cache references otherwise
- * the information contained in the page maps cannot be
- * preserved.
- */
- if (PageAnon(page) && !PageSwapCache(page)) {
- if (!add_to_swap(page, GFP_KERNEL)) {
- rc = -ENOMEM;
- goto unlock_page;
- }
- }
+ if (pm->node == MAX_NUMNODES)
+ return NULL;
- if (!to) {
- rc = swap_page(page);
- goto next;
- }
+ *result = &pm->status;
- newpage = lru_to_page(to);
- lock_page(newpage);
+ return alloc_pages_node(pm->node, GFP_HIGHUSER, 0);
+}
- /*
- * Pages are properly locked and writeback is complete.
- * Try to migrate the page.
- */
- mapping = page_mapping(page);
- if (!mapping)
- goto unlock_both;
+/*
+ * Move a set of pages as indicated in the pm array. The addr
+ * field must be set to the virtual address of the page to be moved
+ * and the node number must contain a valid target node.
+ */
+static int do_move_pages(struct mm_struct *mm, struct page_to_node *pm,
+ int migrate_all)
+{
+ int err;
+ struct page_to_node *pp;
+ LIST_HEAD(pagelist);
- if (mapping->a_ops->migratepage) {
- /*
- * Most pages have a mapping and most filesystems
- * should provide a migration function. Anonymous
- * pages are part of swap space which also has its
- * own migration function. This is the most common
- * path for page migration.
- */
- rc = mapping->a_ops->migratepage(newpage, page);
- goto unlock_both;
- }
-
- /* Make sure the dirty bit is up to date */
- if (try_to_unmap(page, 1) == SWAP_FAIL) {
- rc = -EPERM;
- goto unlock_both;
- }
+ down_read(&mm->mmap_sem);
- if (page_mapcount(page)) {
- rc = -EAGAIN;
- goto unlock_both;
- }
+ /*
+ * Build a list of pages to migrate
+ */
+ migrate_prep();
+ for (pp = pm; pp->node != MAX_NUMNODES; pp++) {
+ struct vm_area_struct *vma;
+ struct page *page;
/*
- * Default handling if a filesystem does not provide
- * a migration function. We can only migrate clean
- * pages so try to write out any dirty pages first.
+ * A valid page pointer that will not match any of the
+ * pages that will be moved.
*/
- if (PageDirty(page)) {
- switch (pageout(page, mapping)) {
- case PAGE_KEEP:
- case PAGE_ACTIVATE:
- goto unlock_both;
-
- case PAGE_SUCCESS:
- unlock_page(newpage);
- goto next;
-
- case PAGE_CLEAN:
- ; /* try to migrate the page below */
- }
- }
+ pp->page = ZERO_PAGE(0);
- /*
- * Buffers are managed in a filesystem specific way.
- * We must have no buffers or drop them.
- */
- if (!page_has_buffers(page) ||
- try_to_release_page(page, GFP_KERNEL)) {
- rc = migrate_page(newpage, page);
- goto unlock_both;
- }
+ err = -EFAULT;
+ vma = find_vma(mm, pp->addr);
+ if (!vma)
+ goto set_status;
- /*
- * On early passes with mapped pages simply
- * retry. There may be a lock held for some
- * buffers that may go away. Later
- * swap them out.
- */
- if (pass > 4) {
+ page = follow_page(vma, pp->addr, FOLL_GET);
+ err = -ENOENT;
+ if (!page)
+ goto set_status;
+
+ if (PageReserved(page)) /* Check for zero page */
+ goto put_and_set;
+
+ pp->page = page;
+ err = page_to_nid(page);
+
+ if (err == pp->node)
/*
- * Persistently unable to drop buffers..... As a
- * measure of last resort we fall back to
- * swap_page().
+ * Node already in the right place
*/
- unlock_page(newpage);
- newpage = NULL;
- rc = swap_page(page);
- goto next;
- }
+ goto put_and_set;
-unlock_both:
- unlock_page(newpage);
-
-unlock_page:
- unlock_page(page);
-
-next:
- if (rc == -EAGAIN) {
- retry++;
- } else if (rc) {
- /* Permanent failure */
- list_move(&page->lru, failed);
- nr_failed++;
- } else {
- if (newpage) {
- /* Successful migration. Return page to LRU */
- move_to_lru(newpage);
- }
- list_move(&page->lru, moved);
- }
+ err = -EACCES;
+ if (page_mapcount(page) > 1 &&
+ !migrate_all)
+ goto put_and_set;
+
+ err = isolate_lru_page(page, &pagelist);
+put_and_set:
+ /*
+ * Either remove the duplicate refcount from
+ * isolate_lru_page() or drop the page ref if it was
+ * not isolated.
+ */
+ put_page(page);
+set_status:
+ pp->status = err;
}
- if (retry && pass++ < 10)
- goto redo;
- if (!swapwrite)
- current->flags &= ~PF_SWAPWRITE;
+ if (!list_empty(&pagelist))
+ err = migrate_pages(&pagelist, new_page_node,
+ (unsigned long)pm);
+ else
+ err = -ENOENT;
- return nr_failed + retry;
+ up_read(&mm->mmap_sem);
+ return err;
}
/*
- * Migration function for pages with buffers. This function can only be used
- * if the underlying filesystem guarantees that no other references to "page"
- * exist.
+ * Determine the nodes of a list of pages. The addr in the pm array
+ * must have been set to the virtual address of which we want to determine
+ * the node number.
*/
-int buffer_migrate_page(struct page *newpage, struct page *page)
+static int do_pages_stat(struct mm_struct *mm, struct page_to_node *pm)
{
- struct address_space *mapping = page->mapping;
- struct buffer_head *bh, *head;
- int rc;
+ down_read(&mm->mmap_sem);
+
+ for ( ; pm->node != MAX_NUMNODES; pm++) {
+ struct vm_area_struct *vma;
+ struct page *page;
+ int err;
+
+ err = -EFAULT;
+ vma = find_vma(mm, pm->addr);
+ if (!vma)
+ goto set_status;
+
+ page = follow_page(vma, pm->addr, 0);
+ err = -ENOENT;
+ /* Use PageReserved to check for zero page */
+ if (!page || PageReserved(page))
+ goto set_status;
+
+ err = page_to_nid(page);
+set_status:
+ pm->status = err;
+ }
- if (!mapping)
- return -EAGAIN;
+ up_read(&mm->mmap_sem);
+ return 0;
+}
- if (!page_has_buffers(page))
- return migrate_page(newpage, page);
+/*
+ * Move a list of pages in the address space of the currently executing
+ * process.
+ */
+asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages,
+ const void __user * __user *pages,
+ const int __user *nodes,
+ int __user *status, int flags)
+{
+ int err = 0;
+ int i;
+ struct task_struct *task;
+ nodemask_t task_nodes;
+ struct mm_struct *mm;
+ struct page_to_node *pm = NULL;
- head = page_buffers(page);
+ /* Check flags */
+ if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
+ return -EINVAL;
- rc = migrate_page_remove_references(newpage, page, 3);
+ if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
+ return -EPERM;
- if (rc)
- return rc;
+ /* Find the mm_struct */
+ read_lock(&tasklist_lock);
+ task = pid ? find_task_by_pid(pid) : current;
+ if (!task) {
+ read_unlock(&tasklist_lock);
+ return -ESRCH;
+ }
+ mm = get_task_mm(task);
+ read_unlock(&tasklist_lock);
- bh = head;
- do {
- get_bh(bh);
- lock_buffer(bh);
- bh = bh->b_this_page;
+ if (!mm)
+ return -EINVAL;
- } while (bh != head);
+ /*
+ * Check if this process has the right to modify the specified
+ * process. The right exists if the process has administrative
+ * capabilities, superuser privileges or the same
+ * userid as the target process.
+ */
+ if ((current->euid != task->suid) && (current->euid != task->uid) &&
+ (current->uid != task->suid) && (current->uid != task->uid) &&
+ !capable(CAP_SYS_NICE)) {
+ err = -EPERM;
+ goto out2;
+ }
- ClearPagePrivate(page);
- set_page_private(newpage, page_private(page));
- set_page_private(page, 0);
- put_page(page);
- get_page(newpage);
+ err = security_task_movememory(task);
+ if (err)
+ goto out2;
- bh = head;
- do {
- set_bh_page(bh, newpage, bh_offset(bh));
- bh = bh->b_this_page;
- } while (bh != head);
+ task_nodes = cpuset_mems_allowed(task);
- SetPagePrivate(newpage);
+ /* Limit nr_pages so that the multiplication may not overflow */
+ if (nr_pages >= ULONG_MAX / sizeof(struct page_to_node) - 1) {
+ err = -E2BIG;
+ goto out2;
+ }
- migrate_page_copy(newpage, page);
+ pm = vmalloc((nr_pages + 1) * sizeof(struct page_to_node));
+ if (!pm) {
+ err = -ENOMEM;
+ goto out2;
+ }
- bh = head;
- do {
- unlock_buffer(bh);
- put_bh(bh);
- bh = bh->b_this_page;
+ /*
+ * Get parameters from user space and initialize the pm
+ * array. Return various errors if the user did something wrong.
+ */
+ for (i = 0; i < nr_pages; i++) {
+ const void *p;
- } while (bh != head);
+ err = -EFAULT;
+ if (get_user(p, pages + i))
+ goto out;
- return 0;
-}
-EXPORT_SYMBOL(buffer_migrate_page);
+ pm[i].addr = (unsigned long)p;
+ if (nodes) {
+ int node;
-/*
- * Migrate the list 'pagelist' of pages to a certain destination.
- *
- * Specify destination with either non-NULL vma or dest_node >= 0
- * Return the number of pages not migrated or error code
- */
-int migrate_pages_to(struct list_head *pagelist,
- struct vm_area_struct *vma, int dest)
-{
- LIST_HEAD(newlist);
- LIST_HEAD(moved);
- LIST_HEAD(failed);
- int err = 0;
- unsigned long offset = 0;
- int nr_pages;
- struct page *page;
- struct list_head *p;
+ if (get_user(node, nodes + i))
+ goto out;
-redo:
- nr_pages = 0;
- list_for_each(p, pagelist) {
- if (vma) {
- /*
- * The address passed to alloc_page_vma is used to
- * generate the proper interleave behavior. We fake
- * the address here by an increasing offset in order
- * to get the proper distribution of pages.
- *
- * No decision has been made as to which page
- * a certain old page is moved to so we cannot
- * specify the correct address.
- */
- page = alloc_page_vma(GFP_HIGHUSER, vma,
- offset + vma->vm_start);
- offset += PAGE_SIZE;
- }
- else
- page = alloc_pages_node(dest, GFP_HIGHUSER, 0);
+ err = -ENODEV;
+ if (!node_online(node))
+ goto out;
- if (!page) {
- err = -ENOMEM;
- goto out;
+ err = -EACCES;
+ if (!node_isset(node, task_nodes))
+ goto out;
+
+ pm[i].node = node;
}
- list_add_tail(&page->lru, &newlist);
- nr_pages++;
- if (nr_pages > MIGRATE_CHUNK_SIZE)
- break;
}
- err = migrate_pages(pagelist, &newlist, &moved, &failed);
+ /* End marker */
+ pm[nr_pages].node = MAX_NUMNODES;
+
+ if (nodes)
+ err = do_move_pages(mm, pm, flags & MPOL_MF_MOVE_ALL);
+ else
+ err = do_pages_stat(mm, pm);
- putback_lru_pages(&moved); /* Call release pages instead ?? */
+ if (err >= 0)
+ /* Return status information */
+ for (i = 0; i < nr_pages; i++)
+ if (put_user(pm[i].status, status + i))
+ err = -EFAULT;
- if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist))
- goto redo;
out:
- /* Return leftover allocated pages */
- while (!list_empty(&newlist)) {
- page = list_entry(newlist.next, struct page, lru);
- list_del(&page->lru);
- __free_page(page);
- }
- list_splice(&failed, pagelist);
- if (err < 0)
- return err;
-
- /* Calculate number of leftover pages */
- nr_pages = 0;
- list_for_each(p, pagelist)
- nr_pages++;
- return nr_pages;
+ vfree(pm);
+out2:
+ mmput(mm);
+ return err;
}
+#endif
+
diff --git a/mm/mmap.c b/mm/mmap.c
index e6ee12344b1..6446c6134b0 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1065,7 +1065,8 @@ munmap_back:
vma->vm_start = addr;
vma->vm_end = addr + len;
vma->vm_flags = vm_flags;
- vma->vm_page_prot = protection_map[vm_flags & 0x0f];
+ vma->vm_page_prot = protection_map[vm_flags &
+ (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)];
vma->vm_pgoff = pgoff;
if (file) {
@@ -1089,6 +1090,12 @@ munmap_back:
goto free_vma;
}
+ /* Don't make the VMA automatically writable if it's shared, but the
+ * backer wishes to know when pages are first written to */
+ if (vma->vm_ops && vma->vm_ops->page_mkwrite)
+ vma->vm_page_prot =
+ protection_map[vm_flags & (VM_READ|VM_WRITE|VM_EXEC)];
+
/* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform
* shmem_zero_setup (perhaps called through /dev/zero's ->mmap)
* that memory reservation must be checked; but that reservation
@@ -1921,7 +1928,8 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
vma->vm_end = addr + len;
vma->vm_pgoff = pgoff;
vma->vm_flags = flags;
- vma->vm_page_prot = protection_map[flags & 0x0f];
+ vma->vm_page_prot = protection_map[flags &
+ (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)];
vma_link(mm, vma, prev, rb_link, rb_parent);
out:
mm->total_vm += len >> PAGE_SHIFT;
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 4c14d4289b6..638edabaff7 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -19,7 +19,8 @@
#include <linux/mempolicy.h>
#include <linux/personality.h>
#include <linux/syscalls.h>
-
+#include <linux/swap.h>
+#include <linux/swapops.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/cacheflush.h>
@@ -28,12 +29,13 @@
static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
unsigned long addr, unsigned long end, pgprot_t newprot)
{
- pte_t *pte;
+ pte_t *pte, oldpte;
spinlock_t *ptl;
pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
do {
- if (pte_present(*pte)) {
+ oldpte = *pte;
+ if (pte_present(oldpte)) {
pte_t ptent;
/* Avoid an SMP race with hardware updated dirty/clean
@@ -43,7 +45,22 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
ptent = pte_modify(ptep_get_and_clear(mm, addr, pte), newprot);
set_pte_at(mm, addr, pte, ptent);
lazy_mmu_prot_update(ptent);
+#ifdef CONFIG_MIGRATION
+ } else if (!pte_file(oldpte)) {
+ swp_entry_t entry = pte_to_swp_entry(oldpte);
+
+ if (is_write_migration_entry(entry)) {
+ /*
+ * A protection check is difficult so
+ * just be safe and disable write
+ */
+ make_migration_entry_read(&entry);
+ set_pte_at(mm, addr, pte,
+ swp_entry_to_pte(entry));
+ }
+#endif
}
+
} while (pte++, addr += PAGE_SIZE, addr != end);
pte_unmap_unlock(pte - 1, ptl);
}
@@ -106,6 +123,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
unsigned long oldflags = vma->vm_flags;
long nrpages = (end - start) >> PAGE_SHIFT;
unsigned long charged = 0;
+ unsigned int mask;
pgprot_t newprot;
pgoff_t pgoff;
int error;
@@ -132,8 +150,6 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
}
}
- newprot = protection_map[newflags & 0xf];
-
/*
* First try to merge with previous and/or next vma.
*/
@@ -160,6 +176,14 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
}
success:
+ /* Don't make the VMA automatically writable if it's shared, but the
+ * backer wishes to know when pages are first written to */
+ mask = VM_READ|VM_WRITE|VM_EXEC|VM_SHARED;
+ if (vma->vm_ops && vma->vm_ops->page_mkwrite)
+ mask &= ~VM_SHARED;
+
+ newprot = protection_map[newflags & mask];
+
/*
* vm_flags and vm_page_prot are protected by the mmap_sem
* held in write mode.
@@ -205,8 +229,7 @@ sys_mprotect(unsigned long start, size_t len, unsigned long prot)
/*
* Does the application expect PROT_READ to imply PROT_EXEC:
*/
- if (unlikely((prot & PROT_READ) &&
- (current->personality & READ_IMPLIES_EXEC)))
+ if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
prot |= PROT_EXEC;
vm_flags = calc_vm_prot_bits(prot);
diff --git a/mm/msync.c b/mm/msync.c
index bc6c9537636..d083544df21 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -170,8 +170,6 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
* just ignore them, but return -ENOMEM at the end.
*/
down_read(&current->mm->mmap_sem);
- if (flags & MS_SYNC)
- current->flags |= PF_SYNCWRITE;
vma = find_vma(current->mm, start);
if (!vma) {
error = -ENOMEM;
@@ -228,7 +226,6 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
}
} while (vma && !done);
out_unlock:
- current->flags &= ~PF_SYNCWRITE;
up_read(&current->mm->mmap_sem);
out:
return error;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 042e6436c3e..d46ed0f1dc0 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -22,10 +22,11 @@
#include <linux/jiffies.h>
#include <linux/cpuset.h>
+int sysctl_panic_on_oom;
/* #define DEBUG */
/**
- * oom_badness - calculate a numeric value for how bad this task has been
+ * badness - calculate a numeric value for how bad this task has been
* @p: task struct of which task we should calculate
* @uptime: current uptime in seconds
*
@@ -200,7 +201,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints)
continue;
/*
- * This is in the process of releasing memory so for wait it
+ * This is in the process of releasing memory so wait for it
* to finish before killing some other task by mistake.
*/
releasing = test_tsk_thread_flag(p, TIF_MEMDIE) ||
@@ -306,7 +307,7 @@ static int oom_kill_process(struct task_struct *p, unsigned long points,
}
/**
- * oom_kill - kill the "best" process when we run out of memory
+ * out_of_memory - kill the "best" process when we run out of memory
*
* If we run out of memory, we have the choice between either
* killing a random task (bad), letting the system crash (worse)
@@ -344,6 +345,8 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
break;
case CONSTRAINT_NONE:
+ if (sysctl_panic_on_oom)
+ panic("out of memory. panic_on_oom is selected\n");
retry:
/*
* Rambo mode: Shoot down a process and hope it solves whatever
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 75d7f48b79b..8ccf6f1b147 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -204,6 +204,7 @@ static void balance_dirty_pages(struct address_space *mapping)
.sync_mode = WB_SYNC_NONE,
.older_than_this = NULL,
.nr_to_write = write_chunk,
+ .range_cyclic = 1,
};
get_dirty_limits(&wbs, &background_thresh,
@@ -331,6 +332,7 @@ static void background_writeout(unsigned long _min_pages)
.older_than_this = NULL,
.nr_to_write = 0,
.nonblocking = 1,
+ .range_cyclic = 1,
};
for ( ; ; ) {
@@ -407,6 +409,7 @@ static void wb_kupdate(unsigned long arg)
.nr_to_write = 0,
.nonblocking = 1,
.for_kupdate = 1,
+ .range_cyclic = 1,
};
sync_supers();
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 253a450c400..423db0db7c0 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -37,6 +37,7 @@
#include <linux/nodemask.h>
#include <linux/vmalloc.h>
#include <linux/mempolicy.h>
+#include <linux/stop_machine.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -83,8 +84,8 @@ EXPORT_SYMBOL(zone_table);
static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" };
int min_free_kbytes = 1024;
-unsigned long __initdata nr_kernel_pages;
-unsigned long __initdata nr_all_pages;
+unsigned long __meminitdata nr_kernel_pages;
+unsigned long __meminitdata nr_all_pages;
#ifdef CONFIG_DEBUG_VM
static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
@@ -286,22 +287,27 @@ __find_combined_index(unsigned long page_idx, unsigned int order)
* we can do coalesce a page and its buddy if
* (a) the buddy is not in a hole &&
* (b) the buddy is in the buddy system &&
- * (c) a page and its buddy have the same order.
+ * (c) a page and its buddy have the same order &&
+ * (d) a page and its buddy are in the same zone.
*
* For recording whether a page is in the buddy system, we use PG_buddy.
* Setting, clearing, and testing PG_buddy is serialized by zone->lock.
*
* For recording page's order, we use page_private(page).
*/
-static inline int page_is_buddy(struct page *page, int order)
+static inline int page_is_buddy(struct page *page, struct page *buddy,
+ int order)
{
#ifdef CONFIG_HOLES_IN_ZONE
- if (!pfn_valid(page_to_pfn(page)))
+ if (!pfn_valid(page_to_pfn(buddy)))
return 0;
#endif
- if (PageBuddy(page) && page_order(page) == order) {
- BUG_ON(page_count(page) != 0);
+ if (page_zone_id(page) != page_zone_id(buddy))
+ return 0;
+
+ if (PageBuddy(buddy) && page_order(buddy) == order) {
+ BUG_ON(page_count(buddy) != 0);
return 1;
}
return 0;
@@ -352,7 +358,7 @@ static inline void __free_one_page(struct page *page,
struct page *buddy;
buddy = __page_find_buddy(page, page_idx, order);
- if (!page_is_buddy(buddy, order))
+ if (!page_is_buddy(page, buddy, order))
break; /* Move the buddy up one level. */
list_del(&buddy->lru);
@@ -1485,7 +1491,7 @@ void show_free_areas(void)
}
for_each_zone(zone) {
- unsigned long nr, flags, order, total = 0;
+ unsigned long nr[MAX_ORDER], flags, order, total = 0;
show_node(zone);
printk("%s: ", zone->name);
@@ -1496,11 +1502,12 @@ void show_free_areas(void)
spin_lock_irqsave(&zone->lock, flags);
for (order = 0; order < MAX_ORDER; order++) {
- nr = zone->free_area[order].nr_free;
- total += nr << order;
- printk("%lu*%lukB ", nr, K(1UL) << order);
+ nr[order] = zone->free_area[order].nr_free;
+ total += nr[order] << order;
}
spin_unlock_irqrestore(&zone->lock, flags);
+ for (order = 0; order < MAX_ORDER; order++)
+ printk("%lu*%lukB ", nr[order], K(1UL) << order);
printk("= %lukB\n", K(total));
}
@@ -1512,7 +1519,7 @@ void show_free_areas(void)
*
* Add all populated zones of a node to the zonelist.
*/
-static int __init build_zonelists_node(pg_data_t *pgdat,
+static int __meminit build_zonelists_node(pg_data_t *pgdat,
struct zonelist *zonelist, int nr_zones, int zone_type)
{
struct zone *zone;
@@ -1548,7 +1555,7 @@ static inline int highest_zone(int zone_bits)
#ifdef CONFIG_NUMA
#define MAX_NODE_LOAD (num_online_nodes())
-static int __initdata node_load[MAX_NUMNODES];
+static int __meminitdata node_load[MAX_NUMNODES];
/**
* find_next_best_node - find the next node that should appear in a given node's fallback list
* @node: node whose fallback list we're appending
@@ -1563,7 +1570,7 @@ static int __initdata node_load[MAX_NUMNODES];
* on them otherwise.
* It returns -1 if no node is found.
*/
-static int __init find_next_best_node(int node, nodemask_t *used_node_mask)
+static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask)
{
int n, val;
int min_val = INT_MAX;
@@ -1609,7 +1616,7 @@ static int __init find_next_best_node(int node, nodemask_t *used_node_mask)
return best_node;
}
-static void __init build_zonelists(pg_data_t *pgdat)
+static void __meminit build_zonelists(pg_data_t *pgdat)
{
int i, j, k, node, local_node;
int prev_node, load;
@@ -1661,7 +1668,7 @@ static void __init build_zonelists(pg_data_t *pgdat)
#else /* CONFIG_NUMA */
-static void __init build_zonelists(pg_data_t *pgdat)
+static void __meminit build_zonelists(pg_data_t *pgdat)
{
int i, j, k, node, local_node;
@@ -1699,14 +1706,29 @@ static void __init build_zonelists(pg_data_t *pgdat)
#endif /* CONFIG_NUMA */
-void __init build_all_zonelists(void)
+/* return values int ....just for stop_machine_run() */
+static int __meminit __build_all_zonelists(void *dummy)
{
- int i;
+ int nid;
+ for_each_online_node(nid)
+ build_zonelists(NODE_DATA(nid));
+ return 0;
+}
- for_each_online_node(i)
- build_zonelists(NODE_DATA(i));
- printk("Built %i zonelists\n", num_online_nodes());
- cpuset_init_current_mems_allowed();
+void __meminit build_all_zonelists(void)
+{
+ if (system_state == SYSTEM_BOOTING) {
+ __build_all_zonelists(0);
+ cpuset_init_current_mems_allowed();
+ } else {
+ /* we have to stop all cpus to guaranntee there is no user
+ of zonelist */
+ stop_machine_run(__build_all_zonelists, NULL, NR_CPUS);
+ /* cpuset refresh routine should be here */
+ }
+ vm_total_pages = nr_free_pagecache_pages();
+ printk("Built %i zonelists. Total pages: %ld\n",
+ num_online_nodes(), vm_total_pages);
}
/*
@@ -1722,7 +1744,8 @@ void __init build_all_zonelists(void)
*/
#define PAGES_PER_WAITQUEUE 256
-static inline unsigned long wait_table_size(unsigned long pages)
+#ifndef CONFIG_MEMORY_HOTPLUG
+static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
{
unsigned long size = 1;
@@ -1740,6 +1763,29 @@ static inline unsigned long wait_table_size(unsigned long pages)
return max(size, 4UL);
}
+#else
+/*
+ * A zone's size might be changed by hot-add, so it is not possible to determine
+ * a suitable size for its wait_table. So we use the maximum size now.
+ *
+ * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie:
+ *
+ * i386 (preemption config) : 4096 x 16 = 64Kbyte.
+ * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte.
+ * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte.
+ *
+ * The maximum entries are prepared when a zone's memory is (512K + 256) pages
+ * or more by the traditional way. (See above). It equals:
+ *
+ * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte.
+ * ia64(16K page size) : = ( 8G + 4M)byte.
+ * powerpc (64K page size) : = (32G +16M)byte.
+ */
+static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
+{
+ return 4096UL;
+}
+#endif
/*
* This is an integer logarithm so that shifts can be used later
@@ -2005,23 +2051,46 @@ void __init setup_per_cpu_pageset(void)
#endif
static __meminit
-void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
+int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
{
int i;
struct pglist_data *pgdat = zone->zone_pgdat;
+ size_t alloc_size;
/*
* The per-page waitqueue mechanism uses hashed waitqueues
* per zone.
*/
- zone->wait_table_size = wait_table_size(zone_size_pages);
- zone->wait_table_bits = wait_table_bits(zone->wait_table_size);
- zone->wait_table = (wait_queue_head_t *)
- alloc_bootmem_node(pgdat, zone->wait_table_size
- * sizeof(wait_queue_head_t));
+ zone->wait_table_hash_nr_entries =
+ wait_table_hash_nr_entries(zone_size_pages);
+ zone->wait_table_bits =
+ wait_table_bits(zone->wait_table_hash_nr_entries);
+ alloc_size = zone->wait_table_hash_nr_entries
+ * sizeof(wait_queue_head_t);
+
+ if (system_state == SYSTEM_BOOTING) {
+ zone->wait_table = (wait_queue_head_t *)
+ alloc_bootmem_node(pgdat, alloc_size);
+ } else {
+ /*
+ * This case means that a zone whose size was 0 gets new memory
+ * via memory hot-add.
+ * But it may be the case that a new node was hot-added. In
+ * this case vmalloc() will not be able to use this new node's
+ * memory - this wait_table must be initialized to use this new
+ * node itself as well.
+ * To use this new node's memory, further consideration will be
+ * necessary.
+ */
+ zone->wait_table = (wait_queue_head_t *)vmalloc(alloc_size);
+ }
+ if (!zone->wait_table)
+ return -ENOMEM;
- for(i = 0; i < zone->wait_table_size; ++i)
+ for(i = 0; i < zone->wait_table_hash_nr_entries; ++i)
init_waitqueue_head(zone->wait_table + i);
+
+ return 0;
}
static __meminit void zone_pcp_init(struct zone *zone)
@@ -2043,12 +2112,15 @@ static __meminit void zone_pcp_init(struct zone *zone)
zone->name, zone->present_pages, batch);
}
-static __meminit void init_currently_empty_zone(struct zone *zone,
- unsigned long zone_start_pfn, unsigned long size)
+__meminit int init_currently_empty_zone(struct zone *zone,
+ unsigned long zone_start_pfn,
+ unsigned long size)
{
struct pglist_data *pgdat = zone->zone_pgdat;
-
- zone_wait_table_init(zone, size);
+ int ret;
+ ret = zone_wait_table_init(zone, size);
+ if (ret)
+ return ret;
pgdat->nr_zones = zone_idx(zone) + 1;
zone->zone_start_pfn = zone_start_pfn;
@@ -2056,6 +2128,8 @@ static __meminit void init_currently_empty_zone(struct zone *zone,
memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn);
zone_init_free_lists(pgdat, zone, zone->spanned_pages);
+
+ return 0;
}
/*
@@ -2064,12 +2138,13 @@ static __meminit void init_currently_empty_zone(struct zone *zone,
* - mark all memory queues empty
* - clear the memory bitmaps
*/
-static void __init free_area_init_core(struct pglist_data *pgdat,
+static void __meminit free_area_init_core(struct pglist_data *pgdat,
unsigned long *zones_size, unsigned long *zholes_size)
{
unsigned long j;
int nid = pgdat->node_id;
unsigned long zone_start_pfn = pgdat->node_start_pfn;
+ int ret;
pgdat_resize_init(pgdat);
pgdat->nr_zones = 0;
@@ -2111,7 +2186,8 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
continue;
zonetable_add(zone, nid, j, zone_start_pfn, size);
- init_currently_empty_zone(zone, zone_start_pfn, size);
+ ret = init_currently_empty_zone(zone, zone_start_pfn, size);
+ BUG_ON(ret);
zone_start_pfn += size;
}
}
@@ -2152,7 +2228,7 @@ static void __init alloc_node_mem_map(struct pglist_data *pgdat)
#endif /* CONFIG_FLAT_NODE_MEM_MAP */
}
-void __init free_area_init_node(int nid, struct pglist_data *pgdat,
+void __meminit free_area_init_node(int nid, struct pglist_data *pgdat,
unsigned long *zones_size, unsigned long node_start_pfn,
unsigned long *zholes_size)
{
@@ -2804,42 +2880,14 @@ void *__init alloc_large_system_hash(const char *tablename,
}
#ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE
-/*
- * pfn <-> page translation. out-of-line version.
- * (see asm-generic/memory_model.h)
- */
-#if defined(CONFIG_FLATMEM)
struct page *pfn_to_page(unsigned long pfn)
{
- return mem_map + (pfn - ARCH_PFN_OFFSET);
+ return __pfn_to_page(pfn);
}
unsigned long page_to_pfn(struct page *page)
{
- return (page - mem_map) + ARCH_PFN_OFFSET;
-}
-#elif defined(CONFIG_DISCONTIGMEM)
-struct page *pfn_to_page(unsigned long pfn)
-{
- int nid = arch_pfn_to_nid(pfn);
- return NODE_DATA(nid)->node_mem_map + arch_local_page_offset(pfn,nid);
-}
-unsigned long page_to_pfn(struct page *page)
-{
- struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));
- return (page - pgdat->node_mem_map) + pgdat->node_start_pfn;
-}
-#elif defined(CONFIG_SPARSEMEM)
-struct page *pfn_to_page(unsigned long pfn)
-{
- return __section_mem_map_addr(__pfn_to_section(pfn)) + pfn;
-}
-
-unsigned long page_to_pfn(struct page *page)
-{
- long section_id = page_to_section(page);
- return page - __section_mem_map_addr(__nr_to_section(section_id));
+ return __page_to_pfn(page);
}
-#endif /* CONFIG_FLATMEM/DISCONTIGMME/SPARSEMEM */
EXPORT_SYMBOL(pfn_to_page);
EXPORT_SYMBOL(page_to_pfn);
#endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */
diff --git a/mm/pdflush.c b/mm/pdflush.c
index c4b6d0afd73..df7e50b8f70 100644
--- a/mm/pdflush.c
+++ b/mm/pdflush.c
@@ -202,8 +202,7 @@ int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0)
unsigned long flags;
int ret = 0;
- if (fn == NULL)
- BUG(); /* Hard to diagnose if it's deferred */
+ BUG_ON(fn == NULL); /* Hard to diagnose if it's deferred */
spin_lock_irqsave(&pdflush_lock, flags);
if (list_empty(&pdflush_list)) {
diff --git a/mm/rmap.c b/mm/rmap.c
index 1963e269314..882a85826bb 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -103,7 +103,7 @@ int anon_vma_prepare(struct vm_area_struct *vma)
spin_lock(&mm->page_table_lock);
if (likely(!vma->anon_vma)) {
vma->anon_vma = anon_vma;
- list_add(&vma->anon_vma_node, &anon_vma->head);
+ list_add_tail(&vma->anon_vma_node, &anon_vma->head);
allocated = NULL;
}
spin_unlock(&mm->page_table_lock);
@@ -127,7 +127,7 @@ void __anon_vma_link(struct vm_area_struct *vma)
struct anon_vma *anon_vma = vma->anon_vma;
if (anon_vma) {
- list_add(&vma->anon_vma_node, &anon_vma->head);
+ list_add_tail(&vma->anon_vma_node, &anon_vma->head);
validate_anon_vma(vma);
}
}
@@ -138,7 +138,7 @@ void anon_vma_link(struct vm_area_struct *vma)
if (anon_vma) {
spin_lock(&anon_vma->lock);
- list_add(&vma->anon_vma_node, &anon_vma->head);
+ list_add_tail(&vma->anon_vma_node, &anon_vma->head);
validate_anon_vma(vma);
spin_unlock(&anon_vma->lock);
}
@@ -205,44 +205,6 @@ out:
return anon_vma;
}
-#ifdef CONFIG_MIGRATION
-/*
- * Remove an anonymous page from swap replacing the swap pte's
- * through real pte's pointing to valid pages and then releasing
- * the page from the swap cache.
- *
- * Must hold page lock on page and mmap_sem of one vma that contains
- * the page.
- */
-void remove_from_swap(struct page *page)
-{
- struct anon_vma *anon_vma;
- struct vm_area_struct *vma;
- unsigned long mapping;
-
- if (!PageSwapCache(page))
- return;
-
- mapping = (unsigned long)page->mapping;
-
- if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0)
- return;
-
- /*
- * We hold the mmap_sem lock. So no need to call page_lock_anon_vma.
- */
- anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON);
- spin_lock(&anon_vma->lock);
-
- list_for_each_entry(vma, &anon_vma->head, anon_vma_node)
- remove_vma_swap(vma, page);
-
- spin_unlock(&anon_vma->lock);
- delete_from_swap_cache(page);
-}
-EXPORT_SYMBOL(remove_from_swap);
-#endif
-
/*
* At what user virtual address is page expected in vma?
*/
@@ -578,7 +540,7 @@ void page_remove_rmap(struct page *page)
* repeatedly from either try_to_unmap_anon or try_to_unmap_file.
*/
static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
- int ignore_refs)
+ int migration)
{
struct mm_struct *mm = vma->vm_mm;
unsigned long address;
@@ -602,7 +564,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
*/
if ((vma->vm_flags & VM_LOCKED) ||
(ptep_clear_flush_young(vma, address, pte)
- && !ignore_refs)) {
+ && !migration)) {
ret = SWAP_FAIL;
goto out_unmap;
}
@@ -620,24 +582,45 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
if (PageAnon(page)) {
swp_entry_t entry = { .val = page_private(page) };
- /*
- * Store the swap location in the pte.
- * See handle_pte_fault() ...
- */
- BUG_ON(!PageSwapCache(page));
- swap_duplicate(entry);
- if (list_empty(&mm->mmlist)) {
- spin_lock(&mmlist_lock);
- if (list_empty(&mm->mmlist))
- list_add(&mm->mmlist, &init_mm.mmlist);
- spin_unlock(&mmlist_lock);
+
+ if (PageSwapCache(page)) {
+ /*
+ * Store the swap location in the pte.
+ * See handle_pte_fault() ...
+ */
+ swap_duplicate(entry);
+ if (list_empty(&mm->mmlist)) {
+ spin_lock(&mmlist_lock);
+ if (list_empty(&mm->mmlist))
+ list_add(&mm->mmlist, &init_mm.mmlist);
+ spin_unlock(&mmlist_lock);
+ }
+ dec_mm_counter(mm, anon_rss);
+#ifdef CONFIG_MIGRATION
+ } else {
+ /*
+ * Store the pfn of the page in a special migration
+ * pte. do_swap_page() will wait until the migration
+ * pte is removed and then restart fault handling.
+ */
+ BUG_ON(!migration);
+ entry = make_migration_entry(page, pte_write(pteval));
+#endif
}
set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
BUG_ON(pte_file(*pte));
- dec_mm_counter(mm, anon_rss);
} else
+#ifdef CONFIG_MIGRATION
+ if (migration) {
+ /* Establish migration entry for a file page */
+ swp_entry_t entry;
+ entry = make_migration_entry(page, pte_write(pteval));
+ set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
+ } else
+#endif
dec_mm_counter(mm, file_rss);
+
page_remove_rmap(page);
page_cache_release(page);
@@ -736,7 +719,7 @@ static void try_to_unmap_cluster(unsigned long cursor,
pte_unmap_unlock(pte - 1, ptl);
}
-static int try_to_unmap_anon(struct page *page, int ignore_refs)
+static int try_to_unmap_anon(struct page *page, int migration)
{
struct anon_vma *anon_vma;
struct vm_area_struct *vma;
@@ -747,7 +730,7 @@ static int try_to_unmap_anon(struct page *page, int ignore_refs)
return ret;
list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
- ret = try_to_unmap_one(page, vma, ignore_refs);
+ ret = try_to_unmap_one(page, vma, migration);
if (ret == SWAP_FAIL || !page_mapped(page))
break;
}
@@ -764,7 +747,7 @@ static int try_to_unmap_anon(struct page *page, int ignore_refs)
*
* This function is only called from try_to_unmap for object-based pages.
*/
-static int try_to_unmap_file(struct page *page, int ignore_refs)
+static int try_to_unmap_file(struct page *page, int migration)
{
struct address_space *mapping = page->mapping;
pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
@@ -778,7 +761,7 @@ static int try_to_unmap_file(struct page *page, int ignore_refs)
spin_lock(&mapping->i_mmap_lock);
vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
- ret = try_to_unmap_one(page, vma, ignore_refs);
+ ret = try_to_unmap_one(page, vma, migration);
if (ret == SWAP_FAIL || !page_mapped(page))
goto out;
}
@@ -863,16 +846,16 @@ out:
* SWAP_AGAIN - we missed a mapping, try again later
* SWAP_FAIL - the page is unswappable
*/
-int try_to_unmap(struct page *page, int ignore_refs)
+int try_to_unmap(struct page *page, int migration)
{
int ret;
BUG_ON(!PageLocked(page));
if (PageAnon(page))
- ret = try_to_unmap_anon(page, ignore_refs);
+ ret = try_to_unmap_anon(page, migration);
else
- ret = try_to_unmap_file(page, ignore_refs);
+ ret = try_to_unmap_file(page, migration);
if (!page_mapped(page))
ret = SWAP_SUCCESS;
diff --git a/mm/shmem.c b/mm/shmem.c
index 797eef3805c..38bc3334f26 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1081,14 +1081,6 @@ repeat:
page_cache_release(swappage);
goto repeat;
}
- if (!PageSwapCache(swappage)) {
- /* Page migration has occured */
- shmem_swp_unmap(entry);
- spin_unlock(&info->lock);
- unlock_page(swappage);
- page_cache_release(swappage);
- goto repeat;
- }
if (PageWriteback(swappage)) {
shmem_swp_unmap(entry);
spin_unlock(&info->lock);
@@ -1654,9 +1646,9 @@ static ssize_t shmem_file_sendfile(struct file *in_file, loff_t *ppos,
return desc.error;
}
-static int shmem_statfs(struct super_block *sb, struct kstatfs *buf)
+static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
{
- struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
+ struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);
buf->f_type = TMPFS_MAGIC;
buf->f_bsize = PAGE_CACHE_SIZE;
@@ -2233,10 +2225,10 @@ static struct vm_operations_struct shmem_vm_ops = {
};
-static struct super_block *shmem_get_sb(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
+static int shmem_get_sb(struct file_system_type *fs_type,
+ int flags, const char *dev_name, void *data, struct vfsmount *mnt)
{
- return get_sb_nodev(fs_type, flags, data, shmem_fill_super);
+ return get_sb_nodev(fs_type, flags, data, shmem_fill_super, mnt);
}
static struct file_system_type tmpfs_fs_type = {
diff --git a/mm/slab.c b/mm/slab.c
index f1b644eb39d..98ac20bc0de 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -331,6 +331,8 @@ static __always_inline int index_of(const size_t size)
return 0;
}
+static int slab_early_init = 1;
+
#define INDEX_AC index_of(sizeof(struct arraycache_init))
#define INDEX_L3 index_of(sizeof(struct kmem_list3))
@@ -592,6 +594,7 @@ static inline struct kmem_cache *page_get_cache(struct page *page)
{
if (unlikely(PageCompound(page)))
page = (struct page *)page_private(page);
+ BUG_ON(!PageSlab(page));
return (struct kmem_cache *)page->lru.next;
}
@@ -604,6 +607,7 @@ static inline struct slab *page_get_slab(struct page *page)
{
if (unlikely(PageCompound(page)))
page = (struct page *)page_private(page);
+ BUG_ON(!PageSlab(page));
return (struct slab *)page->lru.prev;
}
@@ -1024,6 +1028,40 @@ static void drain_alien_cache(struct kmem_cache *cachep,
}
}
}
+
+static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
+{
+ struct slab *slabp = virt_to_slab(objp);
+ int nodeid = slabp->nodeid;
+ struct kmem_list3 *l3;
+ struct array_cache *alien = NULL;
+
+ /*
+ * Make sure we are not freeing a object from another node to the array
+ * cache on this cpu.
+ */
+ if (likely(slabp->nodeid == numa_node_id()))
+ return 0;
+
+ l3 = cachep->nodelists[numa_node_id()];
+ STATS_INC_NODEFREES(cachep);
+ if (l3->alien && l3->alien[nodeid]) {
+ alien = l3->alien[nodeid];
+ spin_lock(&alien->lock);
+ if (unlikely(alien->avail == alien->limit)) {
+ STATS_INC_ACOVERFLOW(cachep);
+ __drain_alien_cache(cachep, alien, nodeid);
+ }
+ alien->entry[alien->avail++] = objp;
+ spin_unlock(&alien->lock);
+ } else {
+ spin_lock(&(cachep->nodelists[nodeid])->list_lock);
+ free_block(cachep, &objp, 1, nodeid);
+ spin_unlock(&(cachep->nodelists[nodeid])->list_lock);
+ }
+ return 1;
+}
+
#else
#define drain_alien_cache(cachep, alien) do { } while (0)
@@ -1038,6 +1076,11 @@ static inline void free_alien_cache(struct array_cache **ac_ptr)
{
}
+static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
+{
+ return 0;
+}
+
#endif
static int cpuup_callback(struct notifier_block *nfb,
@@ -1335,6 +1378,8 @@ void __init kmem_cache_init(void)
NULL, NULL);
}
+ slab_early_init = 0;
+
while (sizes->cs_size != ULONG_MAX) {
/*
* For performance, all the general caches are L1 aligned.
@@ -1450,31 +1495,29 @@ __initcall(cpucache_init);
static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
{
struct page *page;
- void *addr;
+ int nr_pages;
int i;
- flags |= cachep->gfpflags;
#ifndef CONFIG_MMU
- /* nommu uses slab's for process anonymous memory allocations, so
- * requires __GFP_COMP to properly refcount higher order allocations"
+ /*
+ * Nommu uses slab's for process anonymous memory allocations, and thus
+ * requires __GFP_COMP to properly refcount higher order allocations
*/
- page = alloc_pages_node(nodeid, (flags | __GFP_COMP), cachep->gfporder);
-#else
- page = alloc_pages_node(nodeid, flags, cachep->gfporder);
+ flags |= __GFP_COMP;
#endif
+ flags |= cachep->gfpflags;
+
+ page = alloc_pages_node(nodeid, flags, cachep->gfporder);
if (!page)
return NULL;
- addr = page_address(page);
- i = (1 << cachep->gfporder);
+ nr_pages = (1 << cachep->gfporder);
if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
- atomic_add(i, &slab_reclaim_pages);
- add_page_state(nr_slab, i);
- while (i--) {
- __SetPageSlab(page);
- page++;
- }
- return addr;
+ atomic_add(nr_pages, &slab_reclaim_pages);
+ add_page_state(nr_slab, nr_pages);
+ for (i = 0; i < nr_pages; i++)
+ __SetPageSlab(page + i);
+ return page_address(page);
}
/*
@@ -1913,8 +1956,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
void (*dtor)(void*, struct kmem_cache *, unsigned long))
{
size_t left_over, slab_size, ralign;
- struct kmem_cache *cachep = NULL;
- struct list_head *p;
+ struct kmem_cache *cachep = NULL, *pc;
/*
* Sanity checks... these are all serious usage bugs.
@@ -1934,8 +1976,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
mutex_lock(&cache_chain_mutex);
- list_for_each(p, &cache_chain) {
- struct kmem_cache *pc = list_entry(p, struct kmem_cache, next);
+ list_for_each_entry(pc, &cache_chain, next) {
mm_segment_t old_fs = get_fs();
char tmp;
int res;
@@ -2069,8 +2110,12 @@ kmem_cache_create (const char *name, size_t size, size_t align,
#endif
#endif
- /* Determine if the slab management is 'on' or 'off' slab. */
- if (size >= (PAGE_SIZE >> 3))
+ /*
+ * Determine if the slab management is 'on' or 'off' slab.
+ * (bootstrapping cannot cope with offslab caches so don't do
+ * it too early on.)
+ */
+ if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init)
/*
* Size is large, assume best to place the slab management obj
* off-slab (should allow better packing of objs).
@@ -2460,23 +2505,28 @@ static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp,
slabp->inuse--;
}
-static void set_slab_attr(struct kmem_cache *cachep, struct slab *slabp,
- void *objp)
+/*
+ * Map pages beginning at addr to the given cache and slab. This is required
+ * for the slab allocator to be able to lookup the cache and slab of a
+ * virtual address for kfree, ksize, kmem_ptr_validate, and slab debugging.
+ */
+static void slab_map_pages(struct kmem_cache *cache, struct slab *slab,
+ void *addr)
{
- int i;
+ int nr_pages;
struct page *page;
- /* Nasty!!!!!! I hope this is OK. */
- page = virt_to_page(objp);
+ page = virt_to_page(addr);
- i = 1;
+ nr_pages = 1;
if (likely(!PageCompound(page)))
- i <<= cachep->gfporder;
+ nr_pages <<= cache->gfporder;
+
do {
- page_set_cache(page, cachep);
- page_set_slab(page, slabp);
+ page_set_cache(page, cache);
+ page_set_slab(page, slab);
page++;
- } while (--i);
+ } while (--nr_pages);
}
/*
@@ -2548,7 +2598,7 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
goto opps1;
slabp->nodeid = nodeid;
- set_slab_attr(cachep, slabp, objp);
+ slab_map_pages(cachep, slabp, objp);
cache_init_objs(cachep, slabp, ctor_flags);
@@ -2596,6 +2646,28 @@ static void kfree_debugcheck(const void *objp)
}
}
+static inline void verify_redzone_free(struct kmem_cache *cache, void *obj)
+{
+ unsigned long redzone1, redzone2;
+
+ redzone1 = *dbg_redzone1(cache, obj);
+ redzone2 = *dbg_redzone2(cache, obj);
+
+ /*
+ * Redzone is ok.
+ */
+ if (redzone1 == RED_ACTIVE && redzone2 == RED_ACTIVE)
+ return;
+
+ if (redzone1 == RED_INACTIVE && redzone2 == RED_INACTIVE)
+ slab_error(cache, "double free detected");
+ else
+ slab_error(cache, "memory outside object was overwritten");
+
+ printk(KERN_ERR "%p: redzone 1:0x%lx, redzone 2:0x%lx.\n",
+ obj, redzone1, redzone2);
+}
+
static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
void *caller)
{
@@ -2607,27 +2679,10 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
kfree_debugcheck(objp);
page = virt_to_page(objp);
- if (page_get_cache(page) != cachep) {
- printk(KERN_ERR "mismatch in kmem_cache_free: expected "
- "cache %p, got %p\n",
- page_get_cache(page), cachep);
- printk(KERN_ERR "%p is %s.\n", cachep, cachep->name);
- printk(KERN_ERR "%p is %s.\n", page_get_cache(page),
- page_get_cache(page)->name);
- WARN_ON(1);
- }
slabp = page_get_slab(page);
if (cachep->flags & SLAB_RED_ZONE) {
- if (*dbg_redzone1(cachep, objp) != RED_ACTIVE ||
- *dbg_redzone2(cachep, objp) != RED_ACTIVE) {
- slab_error(cachep, "double free, or memory outside"
- " object was overwritten");
- printk(KERN_ERR "%p: redzone 1:0x%lx, "
- "redzone 2:0x%lx.\n",
- objp, *dbg_redzone1(cachep, objp),
- *dbg_redzone2(cachep, objp));
- }
+ verify_redzone_free(cachep, objp);
*dbg_redzone1(cachep, objp) = RED_INACTIVE;
*dbg_redzone2(cachep, objp) = RED_INACTIVE;
}
@@ -3087,41 +3142,9 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp)
check_irq_off();
objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
- /* Make sure we are not freeing a object from another
- * node to the array cache on this cpu.
- */
-#ifdef CONFIG_NUMA
- {
- struct slab *slabp;
- slabp = virt_to_slab(objp);
- if (unlikely(slabp->nodeid != numa_node_id())) {
- struct array_cache *alien = NULL;
- int nodeid = slabp->nodeid;
- struct kmem_list3 *l3;
-
- l3 = cachep->nodelists[numa_node_id()];
- STATS_INC_NODEFREES(cachep);
- if (l3->alien && l3->alien[nodeid]) {
- alien = l3->alien[nodeid];
- spin_lock(&alien->lock);
- if (unlikely(alien->avail == alien->limit)) {
- STATS_INC_ACOVERFLOW(cachep);
- __drain_alien_cache(cachep,
- alien, nodeid);
- }
- alien->entry[alien->avail++] = objp;
- spin_unlock(&alien->lock);
- } else {
- spin_lock(&(cachep->nodelists[nodeid])->
- list_lock);
- free_block(cachep, &objp, 1, nodeid);
- spin_unlock(&(cachep->nodelists[nodeid])->
- list_lock);
- }
- return;
- }
- }
-#endif
+ if (cache_free_alien(cachep, objp))
+ return;
+
if (likely(ac->avail < ac->limit)) {
STATS_INC_FREEHIT(cachep);
ac->entry[ac->avail++] = objp;
@@ -3254,26 +3277,10 @@ EXPORT_SYMBOL(kmalloc_node);
#endif
/**
- * kmalloc - allocate memory
+ * __do_kmalloc - allocate memory
* @size: how many bytes of memory are required.
- * @flags: the type of memory to allocate.
+ * @flags: the type of memory to allocate (see kmalloc).
* @caller: function caller for debug tracking of the caller
- *
- * kmalloc is the normal method of allocating memory
- * in the kernel.
- *
- * The @flags argument may be one of:
- *
- * %GFP_USER - Allocate memory on behalf of user. May sleep.
- *
- * %GFP_KERNEL - Allocate normal kernel ram. May sleep.
- *
- * %GFP_ATOMIC - Allocation will not sleep. Use inside interrupt handlers.
- *
- * Additionally, the %GFP_DMA flag may be set to indicate the memory
- * must be suitable for DMA. This can mean different things on different
- * platforms. For example, on i386, it means that the memory must come
- * from the first 16MB.
*/
static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
void *caller)
@@ -3371,6 +3378,8 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp)
{
unsigned long flags;
+ BUG_ON(virt_to_cache(objp) != cachep);
+
local_irq_save(flags);
__cache_free(cachep, objp);
local_irq_restore(flags);
@@ -3680,7 +3689,7 @@ void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
*/
static void cache_reap(void *unused)
{
- struct list_head *walk;
+ struct kmem_cache *searchp;
struct kmem_list3 *l3;
int node = numa_node_id();
@@ -3691,13 +3700,11 @@ static void cache_reap(void *unused)
return;
}
- list_for_each(walk, &cache_chain) {
- struct kmem_cache *searchp;
+ list_for_each_entry(searchp, &cache_chain, next) {
struct list_head *p;
int tofree;
struct slab *slabp;
- searchp = list_entry(walk, struct kmem_cache, next);
check_irq_on();
/*
@@ -3825,7 +3832,6 @@ static void s_stop(struct seq_file *m, void *p)
static int s_show(struct seq_file *m, void *p)
{
struct kmem_cache *cachep = p;
- struct list_head *q;
struct slab *slabp;
unsigned long active_objs;
unsigned long num_objs;
@@ -3846,15 +3852,13 @@ static int s_show(struct seq_file *m, void *p)
check_irq_on();
spin_lock_irq(&l3->list_lock);
- list_for_each(q, &l3->slabs_full) {
- slabp = list_entry(q, struct slab, list);
+ list_for_each_entry(slabp, &l3->slabs_full, list) {
if (slabp->inuse != cachep->num && !error)
error = "slabs_full accounting error";
active_objs += cachep->num;
active_slabs++;
}
- list_for_each(q, &l3->slabs_partial) {
- slabp = list_entry(q, struct slab, list);
+ list_for_each_entry(slabp, &l3->slabs_partial, list) {
if (slabp->inuse == cachep->num && !error)
error = "slabs_partial inuse accounting error";
if (!slabp->inuse && !error)
@@ -3862,8 +3866,7 @@ static int s_show(struct seq_file *m, void *p)
active_objs += slabp->inuse;
active_slabs++;
}
- list_for_each(q, &l3->slabs_free) {
- slabp = list_entry(q, struct slab, list);
+ list_for_each_entry(slabp, &l3->slabs_free, list) {
if (slabp->inuse && !error)
error = "slabs_free/inuse accounting error";
num_slabs++;
@@ -3956,7 +3959,7 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer,
{
char kbuf[MAX_SLABINFO_WRITE + 1], *tmp;
int limit, batchcount, shared, res;
- struct list_head *p;
+ struct kmem_cache *cachep;
if (count > MAX_SLABINFO_WRITE)
return -EINVAL;
@@ -3975,10 +3978,7 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer,
/* Find the cache in the chain of caches. */
mutex_lock(&cache_chain_mutex);
res = -EINVAL;
- list_for_each(p, &cache_chain) {
- struct kmem_cache *cachep;
-
- cachep = list_entry(p, struct kmem_cache, next);
+ list_for_each_entry(cachep, &cache_chain, next) {
if (!strcmp(cachep->name, kbuf)) {
if (limit < 1 || batchcount < 1 ||
batchcount > limit || shared < 0) {
@@ -4080,7 +4080,6 @@ static void show_symbol(struct seq_file *m, unsigned long address)
static int leaks_show(struct seq_file *m, void *p)
{
struct kmem_cache *cachep = p;
- struct list_head *q;
struct slab *slabp;
struct kmem_list3 *l3;
const char *name;
@@ -4105,14 +4104,10 @@ static int leaks_show(struct seq_file *m, void *p)
check_irq_on();
spin_lock_irq(&l3->list_lock);
- list_for_each(q, &l3->slabs_full) {
- slabp = list_entry(q, struct slab, list);
+ list_for_each_entry(slabp, &l3->slabs_full, list)
handle_slab(n, cachep, slabp);
- }
- list_for_each(q, &l3->slabs_partial) {
- slabp = list_entry(q, struct slab, list);
+ list_for_each_entry(slabp, &l3->slabs_partial, list)
handle_slab(n, cachep, slabp);
- }
spin_unlock_irq(&l3->list_lock);
}
name = cachep->name;
diff --git a/mm/sparse.c b/mm/sparse.c
index 100040c0dfb..e0a3fe48aa3 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -99,6 +99,22 @@ int __section_nr(struct mem_section* ms)
return (root_nr * SECTIONS_PER_ROOT) + (ms - root);
}
+/*
+ * During early boot, before section_mem_map is used for an actual
+ * mem_map, we use section_mem_map to store the section's NUMA
+ * node. This keeps us from having to use another data structure. The
+ * node information is cleared just before we store the real mem_map.
+ */
+static inline unsigned long sparse_encode_early_nid(int nid)
+{
+ return (nid << SECTION_NID_SHIFT);
+}
+
+static inline int sparse_early_nid(struct mem_section *section)
+{
+ return (section->section_mem_map >> SECTION_NID_SHIFT);
+}
+
/* Record a memory area against a node. */
void memory_present(int nid, unsigned long start, unsigned long end)
{
@@ -113,7 +129,8 @@ void memory_present(int nid, unsigned long start, unsigned long end)
ms = __nr_to_section(section);
if (!ms->section_mem_map)
- ms->section_mem_map = SECTION_MARKED_PRESENT;
+ ms->section_mem_map = sparse_encode_early_nid(nid) |
+ SECTION_MARKED_PRESENT;
}
}
@@ -164,6 +181,7 @@ static int sparse_init_one_section(struct mem_section *ms,
if (!valid_section(ms))
return -EINVAL;
+ ms->section_mem_map &= ~SECTION_MAP_MASK;
ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum);
return 1;
@@ -172,8 +190,8 @@ static int sparse_init_one_section(struct mem_section *ms,
static struct page *sparse_early_mem_map_alloc(unsigned long pnum)
{
struct page *map;
- int nid = early_pfn_to_nid(section_nr_to_pfn(pnum));
struct mem_section *ms = __nr_to_section(pnum);
+ int nid = sparse_early_nid(ms);
map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION);
if (map)
diff --git a/mm/swap.c b/mm/swap.c
index 88895c249bc..03ae2076f92 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -480,48 +480,6 @@ static int cpu_swap_callback(struct notifier_block *nfb,
#endif /* CONFIG_HOTPLUG_CPU */
#endif /* CONFIG_SMP */
-#ifdef CONFIG_SMP
-void percpu_counter_mod(struct percpu_counter *fbc, long amount)
-{
- long count;
- long *pcount;
- int cpu = get_cpu();
-
- pcount = per_cpu_ptr(fbc->counters, cpu);
- count = *pcount + amount;
- if (count >= FBC_BATCH || count <= -FBC_BATCH) {
- spin_lock(&fbc->lock);
- fbc->count += count;
- *pcount = 0;
- spin_unlock(&fbc->lock);
- } else {
- *pcount = count;
- }
- put_cpu();
-}
-EXPORT_SYMBOL(percpu_counter_mod);
-
-/*
- * Add up all the per-cpu counts, return the result. This is a more accurate
- * but much slower version of percpu_counter_read_positive()
- */
-long percpu_counter_sum(struct percpu_counter *fbc)
-{
- long ret;
- int cpu;
-
- spin_lock(&fbc->lock);
- ret = fbc->count;
- for_each_possible_cpu(cpu) {
- long *pcount = per_cpu_ptr(fbc->counters, cpu);
- ret += *pcount;
- }
- spin_unlock(&fbc->lock);
- return ret < 0 ? 0 : ret;
-}
-EXPORT_SYMBOL(percpu_counter_sum);
-#endif
-
/*
* Perform any setup for the swap system
*/
diff --git a/mm/swapfile.c b/mm/swapfile.c
index e5fd5385f0c..cc367f7e75d 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -395,6 +395,9 @@ void free_swap_and_cache(swp_entry_t entry)
struct swap_info_struct * p;
struct page *page = NULL;
+ if (is_migration_entry(entry))
+ return;
+
p = swap_info_get(entry);
if (p) {
if (swap_entry_free(p, swp_offset(entry)) == 1) {
@@ -615,15 +618,6 @@ static int unuse_mm(struct mm_struct *mm,
return 0;
}
-#ifdef CONFIG_MIGRATION
-int remove_vma_swap(struct vm_area_struct *vma, struct page *page)
-{
- swp_entry_t entry = { .val = page_private(page) };
-
- return unuse_vma(vma, entry, page);
-}
-#endif
-
/*
* Scan swap_map from current position to next entry still in use.
* Recycle to start on reaching the end, returning 0 when empty.
@@ -716,7 +710,6 @@ static int try_to_unuse(unsigned int type)
*/
swap_map = &si->swap_map[i];
entry = swp_entry(type, i);
-again:
page = read_swap_cache_async(entry, NULL, 0);
if (!page) {
/*
@@ -751,12 +744,6 @@ again:
wait_on_page_locked(page);
wait_on_page_writeback(page);
lock_page(page);
- if (!PageSwapCache(page)) {
- /* Page migration has occured */
- unlock_page(page);
- page_cache_release(page);
- goto again;
- }
wait_on_page_writeback(page);
/*
@@ -785,10 +772,8 @@ again:
while (*swap_map > 1 && !retval &&
(p = p->next) != &start_mm->mmlist) {
mm = list_entry(p, struct mm_struct, mmlist);
- if (atomic_inc_return(&mm->mm_users) == 1) {
- atomic_dec(&mm->mm_users);
+ if (!atomic_inc_not_zero(&mm->mm_users))
continue;
- }
spin_unlock(&mmlist_lock);
mmput(prev_mm);
prev_mm = mm;
@@ -1407,19 +1392,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
if (!(p->flags & SWP_USED))
break;
error = -EPERM;
- /*
- * Test if adding another swap device is possible. There are
- * two limiting factors: 1) the number of bits for the swap
- * type swp_entry_t definition and 2) the number of bits for
- * the swap type in the swap ptes as defined by the different
- * architectures. To honor both limitations a swap entry
- * with swap offset 0 and swap type ~0UL is created, encoded
- * to a swap pte, decoded to a swp_entry_t again and finally
- * the swap type part is extracted. This will mask all bits
- * from the initial ~0UL that can't be encoded in either the
- * swp_entry_t or the architecture definition of a swap pte.
- */
- if (type > swp_type(pte_to_swp_entry(swp_entry_to_pte(swp_entry(~0UL,0))))) {
+ if (type >= MAX_SWAPFILES) {
spin_unlock(&swap_lock);
goto out;
}
@@ -1504,8 +1477,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
error = -EINVAL;
goto bad_swap;
}
- page = read_cache_page(mapping, 0,
- (filler_t *)mapping->a_ops->readpage, swap_file);
+ page = read_mapping_page(mapping, 0, swap_file);
if (IS_ERR(page)) {
error = PTR_ERR(page);
goto bad_swap;
@@ -1709,6 +1681,9 @@ int swap_duplicate(swp_entry_t entry)
unsigned long offset, type;
int result = 0;
+ if (is_migration_entry(entry))
+ return 1;
+
type = swp_type(entry);
if (type >= nr_swapfiles)
goto bad_file;
diff --git a/mm/truncate.c b/mm/truncate.c
index 6cb3fff25f6..cf1b015df4a 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -230,14 +230,24 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
+ pgoff_t index;
+ int lock_failed;
- if (TestSetPageLocked(page)) {
- next++;
- continue;
- }
- if (page->index > next)
- next = page->index;
+ lock_failed = TestSetPageLocked(page);
+
+ /*
+ * We really shouldn't be looking at the ->index of an
+ * unlocked page. But we're not allowed to lock these
+ * pages. So we rely upon nobody altering the ->index
+ * of this (pinned-by-us) page.
+ */
+ index = page->index;
+ if (index > next)
+ next = index;
next++;
+ if (lock_failed)
+ continue;
+
if (PageDirty(page) || PageWriteback(page))
goto unlock;
if (page_mapped(page))
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index c0504f1e34e..35f8553f893 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -257,6 +257,19 @@ struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, int
}
/* Caller must hold vmlist_lock */
+static struct vm_struct *__find_vm_area(void *addr)
+{
+ struct vm_struct *tmp;
+
+ for (tmp = vmlist; tmp != NULL; tmp = tmp->next) {
+ if (tmp->addr == addr)
+ break;
+ }
+
+ return tmp;
+}
+
+/* Caller must hold vmlist_lock */
struct vm_struct *__remove_vm_area(void *addr)
{
struct vm_struct **p, *tmp;
@@ -498,11 +511,33 @@ EXPORT_SYMBOL(__vmalloc);
*/
void *vmalloc(unsigned long size)
{
- return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL);
+ return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL);
}
EXPORT_SYMBOL(vmalloc);
/**
+ * vmalloc_user - allocate virtually contiguous memory which has
+ * been zeroed so it can be mapped to userspace without
+ * leaking data.
+ *
+ * @size: allocation size
+ */
+void *vmalloc_user(unsigned long size)
+{
+ struct vm_struct *area;
+ void *ret;
+
+ ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL);
+ write_lock(&vmlist_lock);
+ area = __find_vm_area(ret);
+ area->flags |= VM_USERMAP;
+ write_unlock(&vmlist_lock);
+
+ return ret;
+}
+EXPORT_SYMBOL(vmalloc_user);
+
+/**
* vmalloc_node - allocate memory on a specific node
*
* @size: allocation size
@@ -516,7 +551,7 @@ EXPORT_SYMBOL(vmalloc);
*/
void *vmalloc_node(unsigned long size, int node)
{
- return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, node);
+ return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, node);
}
EXPORT_SYMBOL(vmalloc_node);
@@ -556,6 +591,28 @@ void *vmalloc_32(unsigned long size)
}
EXPORT_SYMBOL(vmalloc_32);
+/**
+ * vmalloc_32_user - allocate virtually contiguous memory (32bit
+ * addressable) which is zeroed so it can be
+ * mapped to userspace without leaking data.
+ *
+ * @size: allocation size
+ */
+void *vmalloc_32_user(unsigned long size)
+{
+ struct vm_struct *area;
+ void *ret;
+
+ ret = __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL);
+ write_lock(&vmlist_lock);
+ area = __find_vm_area(ret);
+ area->flags |= VM_USERMAP;
+ write_unlock(&vmlist_lock);
+
+ return ret;
+}
+EXPORT_SYMBOL(vmalloc_32_user);
+
long vread(char *buf, char *addr, unsigned long count)
{
struct vm_struct *tmp;
@@ -630,3 +687,64 @@ finished:
read_unlock(&vmlist_lock);
return buf - buf_start;
}
+
+/**
+ * remap_vmalloc_range - map vmalloc pages to userspace
+ *
+ * @vma: vma to cover (map full range of vma)
+ * @addr: vmalloc memory
+ * @pgoff: number of pages into addr before first page to map
+ * @returns: 0 for success, -Exxx on failure
+ *
+ * This function checks that addr is a valid vmalloc'ed area, and
+ * that it is big enough to cover the vma. Will return failure if
+ * that criteria isn't met.
+ *
+ * Similar to remap_pfn_range (see mm/memory.c)
+ */
+int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
+ unsigned long pgoff)
+{
+ struct vm_struct *area;
+ unsigned long uaddr = vma->vm_start;
+ unsigned long usize = vma->vm_end - vma->vm_start;
+ int ret;
+
+ if ((PAGE_SIZE-1) & (unsigned long)addr)
+ return -EINVAL;
+
+ read_lock(&vmlist_lock);
+ area = __find_vm_area(addr);
+ if (!area)
+ goto out_einval_locked;
+
+ if (!(area->flags & VM_USERMAP))
+ goto out_einval_locked;
+
+ if (usize + (pgoff << PAGE_SHIFT) > area->size - PAGE_SIZE)
+ goto out_einval_locked;
+ read_unlock(&vmlist_lock);
+
+ addr += pgoff << PAGE_SHIFT;
+ do {
+ struct page *page = vmalloc_to_page(addr);
+ ret = vm_insert_page(vma, uaddr, page);
+ if (ret)
+ return ret;
+
+ uaddr += PAGE_SIZE;
+ addr += PAGE_SIZE;
+ usize -= PAGE_SIZE;
+ } while (usize > 0);
+
+ /* Prevent "things" like memory migration? VM_flags need a cleanup... */
+ vma->vm_flags |= VM_RESERVED;
+
+ return ret;
+
+out_einval_locked:
+ read_unlock(&vmlist_lock);
+ return -EINVAL;
+}
+EXPORT_SYMBOL(remap_vmalloc_range);
+
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 440a733fe2e..72babac71de 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -61,6 +61,8 @@ struct scan_control {
* In this context, it doesn't matter that we scan the
* whole list at once. */
int swap_cluster_max;
+
+ int swappiness;
};
/*
@@ -108,7 +110,7 @@ struct shrinker {
* From 0 .. 100. Higher means more swappy.
*/
int vm_swappiness = 60;
-static long total_memory;
+long vm_total_pages; /* The total number of pages which the VM controls */
static LIST_HEAD(shrinker_list);
static DECLARE_RWSEM(shrinker_rwsem);
@@ -288,11 +290,23 @@ static void handle_write_error(struct address_space *mapping,
unlock_page(page);
}
+/* possible outcome of pageout() */
+typedef enum {
+ /* failed to write page out, page is locked */
+ PAGE_KEEP,
+ /* move page to the active list, page is locked */
+ PAGE_ACTIVATE,
+ /* page has been sent to the disk successfully, page is unlocked */
+ PAGE_SUCCESS,
+ /* page is clean and locked */
+ PAGE_CLEAN,
+} pageout_t;
+
/*
* pageout is called by shrink_page_list() for each dirty page.
* Calls ->writepage().
*/
-pageout_t pageout(struct page *page, struct address_space *mapping)
+static pageout_t pageout(struct page *page, struct address_space *mapping)
{
/*
* If the page is dirty, only perform writeback if that write
@@ -337,6 +351,8 @@ pageout_t pageout(struct page *page, struct address_space *mapping)
struct writeback_control wbc = {
.sync_mode = WB_SYNC_NONE,
.nr_to_write = SWAP_CLUSTER_MAX,
+ .range_start = 0,
+ .range_end = LLONG_MAX,
.nonblocking = 1,
.for_reclaim = 1,
};
@@ -727,7 +743,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
* how much memory
* is mapped.
*/
- mapped_ratio = (sc->nr_mapped * 100) / total_memory;
+ mapped_ratio = (sc->nr_mapped * 100) / vm_total_pages;
/*
* Now decide how much we really want to unmap some pages. The
@@ -741,7 +757,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
* A 100% value of vm_swappiness overrides this algorithm
* altogether.
*/
- swap_tendency = mapped_ratio / 2 + distress + vm_swappiness;
+ swap_tendency = mapped_ratio / 2 + distress + sc->swappiness;
/*
* Now use this metric to decide whether to start moving mapped
@@ -957,6 +973,7 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
.may_writepage = !laptop_mode,
.swap_cluster_max = SWAP_CLUSTER_MAX,
.may_swap = 1,
+ .swappiness = vm_swappiness,
};
inc_page_state(allocstall);
@@ -1021,10 +1038,6 @@ out:
* For kswapd, balance_pgdat() will work across all this node's zones until
* they are all at pages_high.
*
- * If `nr_pages' is non-zero then it is the number of pages which are to be
- * reclaimed, regardless of the zone occupancies. This is a software suspend
- * special.
- *
* Returns the number of pages which were actually freed.
*
* There is special handling here for zones which are full of pinned pages.
@@ -1042,10 +1055,8 @@ out:
* the page allocator fallback scheme to ensure that aging of pages is balanced
* across the zones.
*/
-static unsigned long balance_pgdat(pg_data_t *pgdat, unsigned long nr_pages,
- int order)
+static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
{
- unsigned long to_free = nr_pages;
int all_zones_ok;
int priority;
int i;
@@ -1055,7 +1066,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, unsigned long nr_pages,
struct scan_control sc = {
.gfp_mask = GFP_KERNEL,
.may_swap = 1,
- .swap_cluster_max = nr_pages ? nr_pages : SWAP_CLUSTER_MAX,
+ .swap_cluster_max = SWAP_CLUSTER_MAX,
+ .swappiness = vm_swappiness,
};
loop_again:
@@ -1082,31 +1094,26 @@ loop_again:
all_zones_ok = 1;
- if (nr_pages == 0) {
- /*
- * Scan in the highmem->dma direction for the highest
- * zone which needs scanning
- */
- for (i = pgdat->nr_zones - 1; i >= 0; i--) {
- struct zone *zone = pgdat->node_zones + i;
+ /*
+ * Scan in the highmem->dma direction for the highest
+ * zone which needs scanning
+ */
+ for (i = pgdat->nr_zones - 1; i >= 0; i--) {
+ struct zone *zone = pgdat->node_zones + i;
- if (!populated_zone(zone))
- continue;
+ if (!populated_zone(zone))
+ continue;
- if (zone->all_unreclaimable &&
- priority != DEF_PRIORITY)
- continue;
+ if (zone->all_unreclaimable && priority != DEF_PRIORITY)
+ continue;
- if (!zone_watermark_ok(zone, order,
- zone->pages_high, 0, 0)) {
- end_zone = i;
- goto scan;
- }
+ if (!zone_watermark_ok(zone, order, zone->pages_high,
+ 0, 0)) {
+ end_zone = i;
+ goto scan;
}
- goto out;
- } else {
- end_zone = pgdat->nr_zones - 1;
}
+ goto out;
scan:
for (i = 0; i <= end_zone; i++) {
struct zone *zone = pgdat->node_zones + i;
@@ -1133,11 +1140,9 @@ scan:
if (zone->all_unreclaimable && priority != DEF_PRIORITY)
continue;
- if (nr_pages == 0) { /* Not software suspend */
- if (!zone_watermark_ok(zone, order,
- zone->pages_high, end_zone, 0))
- all_zones_ok = 0;
- }
+ if (!zone_watermark_ok(zone, order, zone->pages_high,
+ end_zone, 0))
+ all_zones_ok = 0;
zone->temp_priority = priority;
if (zone->prev_priority > priority)
zone->prev_priority = priority;
@@ -1162,8 +1167,6 @@ scan:
total_scanned > nr_reclaimed + nr_reclaimed / 2)
sc.may_writepage = 1;
}
- if (nr_pages && to_free > nr_reclaimed)
- continue; /* swsusp: need to do more work */
if (all_zones_ok)
break; /* kswapd: all done */
/*
@@ -1179,7 +1182,7 @@ scan:
* matches the direct reclaim path behaviour in terms of impact
* on zone->*_priority.
*/
- if ((nr_reclaimed >= SWAP_CLUSTER_MAX) && !nr_pages)
+ if (nr_reclaimed >= SWAP_CLUSTER_MAX)
break;
}
out:
@@ -1261,7 +1264,7 @@ static int kswapd(void *p)
}
finish_wait(&pgdat->kswapd_wait, &wait);
- balance_pgdat(pgdat, 0, order);
+ balance_pgdat(pgdat, order);
}
return 0;
}
@@ -1290,35 +1293,154 @@ void wakeup_kswapd(struct zone *zone, int order)
#ifdef CONFIG_PM
/*
- * Try to free `nr_pages' of memory, system-wide. Returns the number of freed
- * pages.
+ * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages
+ * from LRU lists system-wide, for given pass and priority, and returns the
+ * number of reclaimed pages
+ *
+ * For pass > 3 we also try to shrink the LRU lists that contain a few pages
+ */
+static unsigned long shrink_all_zones(unsigned long nr_pages, int pass,
+ int prio, struct scan_control *sc)
+{
+ struct zone *zone;
+ unsigned long nr_to_scan, ret = 0;
+
+ for_each_zone(zone) {
+
+ if (!populated_zone(zone))
+ continue;
+
+ if (zone->all_unreclaimable && prio != DEF_PRIORITY)
+ continue;
+
+ /* For pass = 0 we don't shrink the active list */
+ if (pass > 0) {
+ zone->nr_scan_active += (zone->nr_active >> prio) + 1;
+ if (zone->nr_scan_active >= nr_pages || pass > 3) {
+ zone->nr_scan_active = 0;
+ nr_to_scan = min(nr_pages, zone->nr_active);
+ shrink_active_list(nr_to_scan, zone, sc);
+ }
+ }
+
+ zone->nr_scan_inactive += (zone->nr_inactive >> prio) + 1;
+ if (zone->nr_scan_inactive >= nr_pages || pass > 3) {
+ zone->nr_scan_inactive = 0;
+ nr_to_scan = min(nr_pages, zone->nr_inactive);
+ ret += shrink_inactive_list(nr_to_scan, zone, sc);
+ if (ret >= nr_pages)
+ return ret;
+ }
+ }
+
+ return ret;
+}
+
+/*
+ * Try to free `nr_pages' of memory, system-wide, and return the number of
+ * freed pages.
+ *
+ * Rather than trying to age LRUs the aim is to preserve the overall
+ * LRU order by reclaiming preferentially
+ * inactive > active > active referenced > active mapped
*/
unsigned long shrink_all_memory(unsigned long nr_pages)
{
- pg_data_t *pgdat;
- unsigned long nr_to_free = nr_pages;
+ unsigned long lru_pages, nr_slab;
unsigned long ret = 0;
- unsigned retry = 2;
- struct reclaim_state reclaim_state = {
- .reclaimed_slab = 0,
+ int pass;
+ struct reclaim_state reclaim_state;
+ struct zone *zone;
+ struct scan_control sc = {
+ .gfp_mask = GFP_KERNEL,
+ .may_swap = 0,
+ .swap_cluster_max = nr_pages,
+ .may_writepage = 1,
+ .swappiness = vm_swappiness,
};
current->reclaim_state = &reclaim_state;
-repeat:
- for_each_online_pgdat(pgdat) {
- unsigned long freed;
- freed = balance_pgdat(pgdat, nr_to_free, 0);
- ret += freed;
- nr_to_free -= freed;
- if ((long)nr_to_free <= 0)
+ lru_pages = 0;
+ for_each_zone(zone)
+ lru_pages += zone->nr_active + zone->nr_inactive;
+
+ nr_slab = read_page_state(nr_slab);
+ /* If slab caches are huge, it's better to hit them first */
+ while (nr_slab >= lru_pages) {
+ reclaim_state.reclaimed_slab = 0;
+ shrink_slab(nr_pages, sc.gfp_mask, lru_pages);
+ if (!reclaim_state.reclaimed_slab)
break;
+
+ ret += reclaim_state.reclaimed_slab;
+ if (ret >= nr_pages)
+ goto out;
+
+ nr_slab -= reclaim_state.reclaimed_slab;
}
- if (retry-- && ret < nr_pages) {
- blk_congestion_wait(WRITE, HZ/5);
- goto repeat;
+
+ /*
+ * We try to shrink LRUs in 5 passes:
+ * 0 = Reclaim from inactive_list only
+ * 1 = Reclaim from active list but don't reclaim mapped
+ * 2 = 2nd pass of type 1
+ * 3 = Reclaim mapped (normal reclaim)
+ * 4 = 2nd pass of type 3
+ */
+ for (pass = 0; pass < 5; pass++) {
+ int prio;
+
+ /* Needed for shrinking slab caches later on */
+ if (!lru_pages)
+ for_each_zone(zone) {
+ lru_pages += zone->nr_active;
+ lru_pages += zone->nr_inactive;
+ }
+
+ /* Force reclaiming mapped pages in the passes #3 and #4 */
+ if (pass > 2) {
+ sc.may_swap = 1;
+ sc.swappiness = 100;
+ }
+
+ for (prio = DEF_PRIORITY; prio >= 0; prio--) {
+ unsigned long nr_to_scan = nr_pages - ret;
+
+ sc.nr_mapped = read_page_state(nr_mapped);
+ sc.nr_scanned = 0;
+
+ ret += shrink_all_zones(nr_to_scan, prio, pass, &sc);
+ if (ret >= nr_pages)
+ goto out;
+
+ reclaim_state.reclaimed_slab = 0;
+ shrink_slab(sc.nr_scanned, sc.gfp_mask, lru_pages);
+ ret += reclaim_state.reclaimed_slab;
+ if (ret >= nr_pages)
+ goto out;
+
+ if (sc.nr_scanned && prio < DEF_PRIORITY - 2)
+ blk_congestion_wait(WRITE, HZ / 10);
+ }
+
+ lru_pages = 0;
}
+
+ /*
+ * If ret = 0, we could not shrink LRUs, but there may be something
+ * in slab caches
+ */
+ if (!ret)
+ do {
+ reclaim_state.reclaimed_slab = 0;
+ shrink_slab(nr_pages, sc.gfp_mask, lru_pages);
+ ret += reclaim_state.reclaimed_slab;
+ } while (ret < nr_pages && reclaim_state.reclaimed_slab > 0);
+
+out:
current->reclaim_state = NULL;
+
return ret;
}
#endif
@@ -1360,7 +1482,6 @@ static int __init kswapd_init(void)
pgdat->kswapd = find_task_by_pid(pid);
read_unlock(&tasklist_lock);
}
- total_memory = nr_free_pagecache_pages();
hotcpu_notifier(cpu_callback, 0);
return 0;
}
@@ -1416,6 +1537,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
.swap_cluster_max = max_t(unsigned long, nr_pages,
SWAP_CLUSTER_MAX),
.gfp_mask = gfp_mask,
+ .swappiness = vm_swappiness,
};
disable_swap_token();