summaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig9
-rw-r--r--mm/Makefile6
-rw-r--r--mm/bootmem.c58
-rw-r--r--mm/fadvise.c5
-rw-r--r--mm/filemap.c160
-rw-r--r--mm/filemap_xip.c8
-rw-r--r--mm/hugetlb.c194
-rw-r--r--mm/internal.h21
-rw-r--r--mm/madvise.c35
-rw-r--r--mm/memory.c34
-rw-r--r--mm/memory_hotplug.c1
-rw-r--r--mm/mempolicy.c692
-rw-r--r--mm/mlock.c1
-rw-r--r--mm/mmap.c1
-rw-r--r--mm/mremap.c1
-rw-r--r--mm/msync.c2
-rw-r--r--mm/nommu.c7
-rw-r--r--mm/oom_kill.c8
-rw-r--r--mm/page-writeback.c17
-rw-r--r--mm/page_alloc.c503
-rw-r--r--mm/pdflush.c2
-rw-r--r--mm/readahead.c15
-rw-r--r--mm/rmap.c74
-rw-r--r--mm/shmem.c81
-rw-r--r--mm/slab.c1194
-rw-r--r--mm/slob.c385
-rw-r--r--mm/sparse.c4
-rw-r--r--mm/swap.c55
-rw-r--r--mm/swap_state.c8
-rw-r--r--mm/swapfile.c60
-rw-r--r--mm/tiny-shmem.c29
-rw-r--r--mm/truncate.c45
-rw-r--r--mm/util.c39
-rw-r--r--mm/vmscan.c450
34 files changed, 2898 insertions, 1306 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 21eb51d4da8..a9cb80ae640 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -11,7 +11,7 @@ choice
config FLATMEM_MANUAL
bool "Flat Memory"
- depends on !ARCH_DISCONTIGMEM_ENABLE || ARCH_FLATMEM_ENABLE
+ depends on !(ARCH_DISCONTIGMEM_ENABLE || ARCH_SPARSEMEM_ENABLE) || ARCH_FLATMEM_ENABLE
help
This option allows you to change some of the ways that
Linux manages its memory internally. Most users will
@@ -132,3 +132,10 @@ config SPLIT_PTLOCK_CPUS
default "4096" if ARM && !CPU_CACHE_VIPT
default "4096" if PARISC && !PA20
default "4"
+
+#
+# support for page migration
+#
+config MIGRATION
+ def_bool y if NUMA || SPARSEMEM || DISCONTIGMEM
+ depends on SWAP
diff --git a/mm/Makefile b/mm/Makefile
index 2fa6d2ca9f2..9aa03fa1dcc 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -9,8 +9,8 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
page_alloc.o page-writeback.o pdflush.o \
- readahead.o slab.o swap.o truncate.o vmscan.o \
- prio_tree.o $(mmu-y)
+ readahead.o swap.o truncate.o vmscan.o \
+ prio_tree.o util.o $(mmu-y)
obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o
obj-$(CONFIG_HUGETLBFS) += hugetlb.o
@@ -18,5 +18,7 @@ obj-$(CONFIG_NUMA) += mempolicy.o
obj-$(CONFIG_SPARSEMEM) += sparse.o
obj-$(CONFIG_SHMEM) += shmem.o
obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o
+obj-$(CONFIG_SLOB) += slob.o
+obj-$(CONFIG_SLAB) += slab.o
obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
obj-$(CONFIG_FS_XIP) += filemap_xip.o
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 16b9465eb4e..35c32290f71 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -296,20 +296,12 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
unsigned long v = ~map[i / BITS_PER_LONG];
if (gofast && v == ~0UL) {
- int j, order;
+ int order;
page = pfn_to_page(pfn);
count += BITS_PER_LONG;
- __ClearPageReserved(page);
order = ffs(BITS_PER_LONG) - 1;
- set_page_refs(page, order);
- for (j = 1; j < BITS_PER_LONG; j++) {
- if (j + 16 < BITS_PER_LONG)
- prefetchw(page + j + 16);
- __ClearPageReserved(page + j);
- set_page_count(page + j, 0);
- }
- __free_pages(page, order);
+ __free_pages_bootmem(page, order);
i += BITS_PER_LONG;
page += BITS_PER_LONG;
} else if (v) {
@@ -319,9 +311,7 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
for (m = 1; m && i < idx; m<<=1, page++, i++) {
if (v & m) {
count++;
- __ClearPageReserved(page);
- set_page_refs(page, 0);
- __free_page(page);
+ __free_pages_bootmem(page, 0);
}
}
} else {
@@ -339,9 +329,7 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
count = 0;
for (i = 0; i < ((bdata->node_low_pfn-(bdata->node_boot_start >> PAGE_SHIFT))/8 + PAGE_SIZE-1)/PAGE_SIZE; i++,page++) {
count++;
- __ClearPageReserved(page);
- set_page_count(page, 1);
- __free_page(page);
+ __free_pages_bootmem(page, 0);
}
total += count;
bdata->node_bootmem_map = NULL;
@@ -393,15 +381,14 @@ unsigned long __init free_all_bootmem (void)
return(free_all_bootmem_core(NODE_DATA(0)));
}
-void * __init __alloc_bootmem_limit (unsigned long size, unsigned long align, unsigned long goal,
- unsigned long limit)
+void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned long goal)
{
pg_data_t *pgdat = pgdat_list;
void *ptr;
for_each_pgdat(pgdat)
if ((ptr = __alloc_bootmem_core(pgdat->bdata, size,
- align, goal, limit)))
+ align, goal, 0)))
return(ptr);
/*
@@ -413,15 +400,40 @@ void * __init __alloc_bootmem_limit (unsigned long size, unsigned long align, un
}
-void * __init __alloc_bootmem_node_limit (pg_data_t *pgdat, unsigned long size, unsigned long align,
- unsigned long goal, unsigned long limit)
+void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigned long align,
+ unsigned long goal)
{
void *ptr;
- ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, limit);
+ ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, 0);
if (ptr)
return (ptr);
- return __alloc_bootmem_limit(size, align, goal, limit);
+ return __alloc_bootmem(size, align, goal);
+}
+
+#define LOW32LIMIT 0xffffffff
+
+void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, unsigned long goal)
+{
+ pg_data_t *pgdat = pgdat_list;
+ void *ptr;
+
+ for_each_pgdat(pgdat)
+ if ((ptr = __alloc_bootmem_core(pgdat->bdata, size,
+ align, goal, LOW32LIMIT)))
+ return(ptr);
+
+ /*
+ * Whoops, we cannot satisfy the allocation request.
+ */
+ printk(KERN_ALERT "low bootmem alloc of %lu bytes failed!\n", size);
+ panic("Out of low memory");
+ return NULL;
}
+void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
+ unsigned long align, unsigned long goal)
+{
+ return __alloc_bootmem_core(pgdat->bdata, size, align, goal, LOW32LIMIT);
+}
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 5f19e87bc5a..d257c89e770 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -37,6 +37,11 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
if (!file)
return -EBADF;
+ if (S_ISFIFO(file->f_dentry->d_inode->i_mode)) {
+ ret = -ESPIPE;
+ goto out;
+ }
+
mapping = file->f_mapping;
if (!mapping || len < 0) {
ret = -EINVAL;
diff --git a/mm/filemap.c b/mm/filemap.c
index 33a28bfde15..44da3d47699 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -15,6 +15,7 @@
#include <linux/compiler.h>
#include <linux/fs.h>
#include <linux/aio.h>
+#include <linux/capability.h>
#include <linux/kernel_stat.h>
#include <linux/mm.h>
#include <linux/swap.h>
@@ -61,7 +62,7 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
* ->swap_lock (exclusive_swap_page, others)
* ->mapping->tree_lock
*
- * ->i_sem
+ * ->i_mutex
* ->i_mmap_lock (truncate->unmap_mapping_range)
*
* ->mmap_sem
@@ -73,9 +74,9 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
* ->lock_page (access_process_vm)
*
* ->mmap_sem
- * ->i_sem (msync)
+ * ->i_mutex (msync)
*
- * ->i_sem
+ * ->i_mutex
* ->i_alloc_sem (various)
*
* ->inode_lock
@@ -93,6 +94,7 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
* ->private_lock (try_to_unmap_one)
* ->tree_lock (try_to_unmap_one)
* ->zone.lru_lock (follow_page->mark_page_accessed)
+ * ->zone.lru_lock (check_pte_range->isolate_lru_page)
* ->private_lock (page_remove_rmap->set_page_dirty)
* ->tree_lock (page_remove_rmap->set_page_dirty)
* ->inode_lock (page_remove_rmap->set_page_dirty)
@@ -276,11 +278,11 @@ static int wait_on_page_writeback_range(struct address_space *mapping,
* integrity" operation. It waits upon in-flight writeout before starting and
* waiting upon new writeout. If there was an IO error, return it.
*
- * We need to re-take i_sem during the generic_osync_inode list walk because
+ * We need to re-take i_mutex during the generic_osync_inode list walk because
* it is otherwise livelockable.
*/
int sync_page_range(struct inode *inode, struct address_space *mapping,
- loff_t pos, size_t count)
+ loff_t pos, loff_t count)
{
pgoff_t start = pos >> PAGE_CACHE_SHIFT;
pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
@@ -290,9 +292,9 @@ int sync_page_range(struct inode *inode, struct address_space *mapping,
return 0;
ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
if (ret == 0) {
- down(&inode->i_sem);
+ mutex_lock(&inode->i_mutex);
ret = generic_osync_inode(inode, mapping, OSYNC_METADATA);
- up(&inode->i_sem);
+ mutex_unlock(&inode->i_mutex);
}
if (ret == 0)
ret = wait_on_page_writeback_range(mapping, start, end);
@@ -301,13 +303,12 @@ int sync_page_range(struct inode *inode, struct address_space *mapping,
EXPORT_SYMBOL(sync_page_range);
/*
- * Note: Holding i_sem across sync_page_range_nolock is not a good idea
+ * Note: Holding i_mutex across sync_page_range_nolock is not a good idea
* as it forces O_SYNC writers to different parts of the same file
* to be serialised right until io completion.
*/
-static int sync_page_range_nolock(struct inode *inode,
- struct address_space *mapping,
- loff_t pos, size_t count)
+int sync_page_range_nolock(struct inode *inode, struct address_space *mapping,
+ loff_t pos, loff_t count)
{
pgoff_t start = pos >> PAGE_CACHE_SHIFT;
pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
@@ -322,6 +323,7 @@ static int sync_page_range_nolock(struct inode *inode,
ret = wait_on_page_writeback_range(mapping, start, end);
return ret;
}
+EXPORT_SYMBOL(sync_page_range_nolock);
/**
* filemap_fdatawait - walk the list of under-writeback pages of the given
@@ -343,30 +345,44 @@ EXPORT_SYMBOL(filemap_fdatawait);
int filemap_write_and_wait(struct address_space *mapping)
{
- int retval = 0;
+ int err = 0;
if (mapping->nrpages) {
- retval = filemap_fdatawrite(mapping);
- if (retval == 0)
- retval = filemap_fdatawait(mapping);
+ err = filemap_fdatawrite(mapping);
+ /*
+ * Even if the above returned error, the pages may be
+ * written partially (e.g. -ENOSPC), so we wait for it.
+ * But the -EIO is special case, it may indicate the worst
+ * thing (e.g. bug) happened, so we avoid waiting for it.
+ */
+ if (err != -EIO) {
+ int err2 = filemap_fdatawait(mapping);
+ if (!err)
+ err = err2;
+ }
}
- return retval;
+ return err;
}
+EXPORT_SYMBOL(filemap_write_and_wait);
int filemap_write_and_wait_range(struct address_space *mapping,
loff_t lstart, loff_t lend)
{
- int retval = 0;
+ int err = 0;
if (mapping->nrpages) {
- retval = __filemap_fdatawrite_range(mapping, lstart, lend,
- WB_SYNC_ALL);
- if (retval == 0)
- retval = wait_on_page_writeback_range(mapping,
- lstart >> PAGE_CACHE_SHIFT,
- lend >> PAGE_CACHE_SHIFT);
+ err = __filemap_fdatawrite_range(mapping, lstart, lend,
+ WB_SYNC_ALL);
+ /* See comment of filemap_write_and_wait() */
+ if (err != -EIO) {
+ int err2 = wait_on_page_writeback_range(mapping,
+ lstart >> PAGE_CACHE_SHIFT,
+ lend >> PAGE_CACHE_SHIFT);
+ if (!err)
+ err = err2;
+ }
}
- return retval;
+ return err;
}
/*
@@ -555,11 +571,12 @@ repeat:
page_cache_get(page);
if (TestSetPageLocked(page)) {
read_unlock_irq(&mapping->tree_lock);
- lock_page(page);
+ __lock_page(page);
read_lock_irq(&mapping->tree_lock);
/* Has the page been truncated while we slept? */
- if (page->mapping != mapping || page->index != offset) {
+ if (unlikely(page->mapping != mapping ||
+ page->index != offset)) {
unlock_page(page);
page_cache_release(page);
goto repeat;
@@ -831,8 +848,13 @@ readpage:
/* Start the actual read. The read will unlock the page. */
error = mapping->a_ops->readpage(filp, page);
- if (unlikely(error))
+ if (unlikely(error)) {
+ if (error == AOP_TRUNCATED_PAGE) {
+ page_cache_release(page);
+ goto find_page;
+ }
goto readpage_error;
+ }
if (!PageUptodate(page)) {
lock_page(page);
@@ -1152,26 +1174,24 @@ static int fastcall page_cache_read(struct file * file, unsigned long offset)
{
struct address_space *mapping = file->f_mapping;
struct page *page;
- int error;
+ int ret;
- page = page_cache_alloc_cold(mapping);
- if (!page)
- return -ENOMEM;
+ do {
+ page = page_cache_alloc_cold(mapping);
+ if (!page)
+ return -ENOMEM;
+
+ ret = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL);
+ if (ret == 0)
+ ret = mapping->a_ops->readpage(file, page);
+ else if (ret == -EEXIST)
+ ret = 0; /* losing race to add is OK */
- error = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL);
- if (!error) {
- error = mapping->a_ops->readpage(file, page);
page_cache_release(page);
- return error;
- }
- /*
- * We arrive here in the unlikely event that someone
- * raced with us and added our page to the cache first
- * or we are out of memory for radix-tree nodes.
- */
- page_cache_release(page);
- return error == -EEXIST ? 0 : error;
+ } while (ret == AOP_TRUNCATED_PAGE);
+
+ return ret;
}
#define MMAP_LOTSAMISS (100)
@@ -1331,10 +1351,14 @@ page_not_uptodate:
goto success;
}
- if (!mapping->a_ops->readpage(file, page)) {
+ error = mapping->a_ops->readpage(file, page);
+ if (!error) {
wait_on_page_locked(page);
if (PageUptodate(page))
goto success;
+ } else if (error == AOP_TRUNCATED_PAGE) {
+ page_cache_release(page);
+ goto retry_find;
}
/*
@@ -1358,10 +1382,14 @@ page_not_uptodate:
goto success;
}
ClearPageError(page);
- if (!mapping->a_ops->readpage(file, page)) {
+ error = mapping->a_ops->readpage(file, page);
+ if (!error) {
wait_on_page_locked(page);
if (PageUptodate(page))
goto success;
+ } else if (error == AOP_TRUNCATED_PAGE) {
+ page_cache_release(page);
+ goto retry_find;
}
/*
@@ -1444,10 +1472,14 @@ page_not_uptodate:
goto success;
}
- if (!mapping->a_ops->readpage(file, page)) {
+ error = mapping->a_ops->readpage(file, page);
+ if (!error) {
wait_on_page_locked(page);
if (PageUptodate(page))
goto success;
+ } else if (error == AOP_TRUNCATED_PAGE) {
+ page_cache_release(page);
+ goto retry_find;
}
/*
@@ -1470,10 +1502,14 @@ page_not_uptodate:
}
ClearPageError(page);
- if (!mapping->a_ops->readpage(file, page)) {
+ error = mapping->a_ops->readpage(file, page);
+ if (!error) {
wait_on_page_locked(page);
if (PageUptodate(page))
goto success;
+ } else if (error == AOP_TRUNCATED_PAGE) {
+ page_cache_release(page);
+ goto retry_find;
}
/*
@@ -1858,7 +1894,7 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
/*
* Sync the fs metadata but not the minor inode changes and
* of course not the data as we did direct DMA for the IO.
- * i_sem is held, which protects generic_osync_inode() from
+ * i_mutex is held, which protects generic_osync_inode() from
* livelocking.
*/
if (written >= 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
@@ -1934,12 +1970,16 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
status = a_ops->prepare_write(file, page, offset, offset+bytes);
if (unlikely(status)) {
loff_t isize = i_size_read(inode);
+
+ if (status != AOP_TRUNCATED_PAGE)
+ unlock_page(page);
+ page_cache_release(page);
+ if (status == AOP_TRUNCATED_PAGE)
+ continue;
/*
* prepare_write() may have instantiated a few blocks
* outside i_size. Trim these off again.
*/
- unlock_page(page);
- page_cache_release(page);
if (pos + bytes > isize)
vmtruncate(inode, isize);
break;
@@ -1952,6 +1992,10 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
cur_iov, iov_base, bytes);
flush_dcache_page(page);
status = a_ops->commit_write(file, page, offset, offset+bytes);
+ if (status == AOP_TRUNCATED_PAGE) {
+ page_cache_release(page);
+ continue;
+ }
if (likely(copied > 0)) {
if (!status)
status = copied;
@@ -2066,7 +2110,7 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
if (err)
goto out;
- inode_update_time(inode, 1);
+ file_update_time(file);
/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
if (unlikely(file->f_flags & O_DIRECT)) {
@@ -2153,10 +2197,10 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const char __user *buf,
BUG_ON(iocb->ki_pos != pos);
- down(&inode->i_sem);
+ mutex_lock(&inode->i_mutex);
ret = __generic_file_aio_write_nolock(iocb, &local_iov, 1,
&iocb->ki_pos);
- up(&inode->i_sem);
+ mutex_unlock(&inode->i_mutex);
if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
ssize_t err;
@@ -2178,9 +2222,9 @@ ssize_t generic_file_write(struct file *file, const char __user *buf,
struct iovec local_iov = { .iov_base = (void __user *)buf,
.iov_len = count };
- down(&inode->i_sem);
+ mutex_lock(&inode->i_mutex);
ret = __generic_file_write_nolock(file, &local_iov, 1, ppos);
- up(&inode->i_sem);
+ mutex_unlock(&inode->i_mutex);
if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
ssize_t err;
@@ -2214,9 +2258,9 @@ ssize_t generic_file_writev(struct file *file, const struct iovec *iov,
struct inode *inode = mapping->host;
ssize_t ret;
- down(&inode->i_sem);
+ mutex_lock(&inode->i_mutex);
ret = __generic_file_write_nolock(file, iov, nr_segs, ppos);
- up(&inode->i_sem);
+ mutex_unlock(&inode->i_mutex);
if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
int err;
@@ -2230,7 +2274,7 @@ ssize_t generic_file_writev(struct file *file, const struct iovec *iov,
EXPORT_SYMBOL(generic_file_writev);
/*
- * Called under i_sem for writes to S_ISREG files. Returns -EIO if something
+ * Called under i_mutex for writes to S_ISREG files. Returns -EIO if something
* went wrong during pagecache shootdown.
*/
static ssize_t
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 9cf687e4a29..b960ac8e591 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -338,7 +338,7 @@ __xip_file_write(struct file *filp, const char __user *buf,
*ppos = pos;
/*
* No need to use i_size_read() here, the i_size
- * cannot change under us because we hold i_sem.
+ * cannot change under us because we hold i_mutex.
*/
if (pos > inode->i_size) {
i_size_write(inode, pos);
@@ -358,7 +358,7 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len,
loff_t pos;
ssize_t ret;
- down(&inode->i_sem);
+ mutex_lock(&inode->i_mutex);
if (!access_ok(VERIFY_READ, buf, len)) {
ret=-EFAULT;
@@ -383,14 +383,14 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len,
if (ret)
goto out_backing;
- inode_update_time(inode, 1);
+ file_update_time(filp);
ret = __xip_file_write (filp, buf, count, pos, ppos);
out_backing:
current->backing_dev_info = NULL;
out_up:
- up(&inode->i_sem);
+ mutex_unlock(&inode->i_mutex);
return ret;
}
EXPORT_SYMBOL_GPL(xip_file_write);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 3e52df7c471..b21d78c941b 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -11,6 +11,9 @@
#include <linux/highmem.h>
#include <linux/nodemask.h>
#include <linux/pagemap.h>
+#include <linux/mempolicy.h>
+#include <linux/cpuset.h>
+
#include <asm/page.h>
#include <asm/pgtable.h>
@@ -36,18 +39,22 @@ static void enqueue_huge_page(struct page *page)
free_huge_pages_node[nid]++;
}
-static struct page *dequeue_huge_page(void)
+static struct page *dequeue_huge_page(struct vm_area_struct *vma,
+ unsigned long address)
{
int nid = numa_node_id();
struct page *page = NULL;
+ struct zonelist *zonelist = huge_zonelist(vma, address);
+ struct zone **z;
- if (list_empty(&hugepage_freelists[nid])) {
- for (nid = 0; nid < MAX_NUMNODES; ++nid)
- if (!list_empty(&hugepage_freelists[nid]))
- break;
+ for (z = zonelist->zones; *z; z++) {
+ nid = (*z)->zone_pgdat->node_id;
+ if (cpuset_zone_allowed(*z, GFP_HIGHUSER) &&
+ !list_empty(&hugepage_freelists[nid]))
+ break;
}
- if (nid >= 0 && nid < MAX_NUMNODES &&
- !list_empty(&hugepage_freelists[nid])) {
+
+ if (*z) {
page = list_entry(hugepage_freelists[nid].next,
struct page, lru);
list_del(&page->lru);
@@ -85,13 +92,13 @@ void free_huge_page(struct page *page)
spin_unlock(&hugetlb_lock);
}
-struct page *alloc_huge_page(void)
+struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr)
{
struct page *page;
int i;
spin_lock(&hugetlb_lock);
- page = dequeue_huge_page();
+ page = dequeue_huge_page(vma, addr);
if (!page) {
spin_unlock(&hugetlb_lock);
return NULL;
@@ -194,7 +201,7 @@ static unsigned long set_max_huge_pages(unsigned long count)
spin_lock(&hugetlb_lock);
try_to_free_low(count);
while (count < nr_huge_pages) {
- struct page *page = dequeue_huge_page();
+ struct page *page = dequeue_huge_page(NULL, 0);
if (!page)
break;
update_and_free_page(page);
@@ -261,11 +268,12 @@ struct vm_operations_struct hugetlb_vm_ops = {
.nopage = hugetlb_nopage,
};
-static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page)
+static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
+ int writable)
{
pte_t entry;
- if (vma->vm_flags & VM_WRITE) {
+ if (writable) {
entry =
pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
} else {
@@ -277,12 +285,27 @@ static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page)
return entry;
}
+static void set_huge_ptep_writable(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep)
+{
+ pte_t entry;
+
+ entry = pte_mkwrite(pte_mkdirty(*ptep));
+ ptep_set_access_flags(vma, address, ptep, entry, 1);
+ update_mmu_cache(vma, address, entry);
+ lazy_mmu_prot_update(entry);
+}
+
+
int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
struct vm_area_struct *vma)
{
pte_t *src_pte, *dst_pte, entry;
struct page *ptepage;
unsigned long addr;
+ int cow;
+
+ cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
src_pte = huge_pte_offset(src, addr);
@@ -294,6 +317,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
spin_lock(&dst->page_table_lock);
spin_lock(&src->page_table_lock);
if (!pte_none(*src_pte)) {
+ if (cow)
+ ptep_set_wrprotect(src, addr, src_pte);
entry = *src_pte;
ptepage = pte_page(entry);
get_page(ptepage);
@@ -345,57 +370,63 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
flush_tlb_range(vma, start, end);
}
-static struct page *find_lock_huge_page(struct address_space *mapping,
- unsigned long idx)
+static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep, pte_t pte)
{
- struct page *page;
- int err;
- struct inode *inode = mapping->host;
- unsigned long size;
+ struct page *old_page, *new_page;
+ int i, avoidcopy;
-retry:
- page = find_lock_page(mapping, idx);
- if (page)
- goto out;
+ old_page = pte_page(pte);
- /* Check to make sure the mapping hasn't been truncated */
- size = i_size_read(inode) >> HPAGE_SHIFT;
- if (idx >= size)
- goto out;
+ /* If no-one else is actually using this page, avoid the copy
+ * and just make the page writable */
+ avoidcopy = (page_count(old_page) == 1);
+ if (avoidcopy) {
+ set_huge_ptep_writable(vma, address, ptep);
+ return VM_FAULT_MINOR;
+ }
- if (hugetlb_get_quota(mapping))
- goto out;
- page = alloc_huge_page();
- if (!page) {
- hugetlb_put_quota(mapping);
- goto out;
+ page_cache_get(old_page);
+ new_page = alloc_huge_page(vma, address);
+
+ if (!new_page) {
+ page_cache_release(old_page);
+
+ /* Logically this is OOM, not a SIGBUS, but an OOM
+ * could cause the kernel to go killing other
+ * processes which won't help the hugepage situation
+ * at all (?) */
+ return VM_FAULT_SIGBUS;
}
- err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
- if (err) {
- put_page(page);
- hugetlb_put_quota(mapping);
- if (err == -EEXIST)
- goto retry;
- page = NULL;
+ spin_unlock(&mm->page_table_lock);
+ for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++)
+ copy_user_highpage(new_page + i, old_page + i,
+ address + i*PAGE_SIZE);
+ spin_lock(&mm->page_table_lock);
+
+ ptep = huge_pte_offset(mm, address & HPAGE_MASK);
+ if (likely(pte_same(*ptep, pte))) {
+ /* Break COW */
+ set_huge_pte_at(mm, address, ptep,
+ make_huge_pte(vma, new_page, 1));
+ /* Make the old page be freed below */
+ new_page = old_page;
}
-out:
- return page;
+ page_cache_release(new_page);
+ page_cache_release(old_page);
+ return VM_FAULT_MINOR;
}
-int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, int write_access)
+int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep, int write_access)
{
int ret = VM_FAULT_SIGBUS;
unsigned long idx;
unsigned long size;
- pte_t *pte;
struct page *page;
struct address_space *mapping;
-
- pte = huge_pte_alloc(mm, address);
- if (!pte)
- goto out;
+ pte_t new_pte;
mapping = vma->vm_file->f_mapping;
idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
@@ -405,9 +436,31 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* Use page lock to guard against racing truncation
* before we get page_table_lock.
*/
- page = find_lock_huge_page(mapping, idx);
- if (!page)
- goto out;
+retry:
+ page = find_lock_page(mapping, idx);
+ if (!page) {
+ if (hugetlb_get_quota(mapping))
+ goto out;
+ page = alloc_huge_page(vma, address);
+ if (!page) {
+ hugetlb_put_quota(mapping);
+ goto out;
+ }
+
+ if (vma->vm_flags & VM_SHARED) {
+ int err;
+
+ err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
+ if (err) {
+ put_page(page);
+ hugetlb_put_quota(mapping);
+ if (err == -EEXIST)
+ goto retry;
+ goto out;
+ }
+ } else
+ lock_page(page);
+ }
spin_lock(&mm->page_table_lock);
size = i_size_read(mapping->host) >> HPAGE_SHIFT;
@@ -415,11 +468,19 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
goto backout;
ret = VM_FAULT_MINOR;
- if (!pte_none(*pte))
+ if (!pte_none(*ptep))
goto backout;
add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
- set_huge_pte_at(mm, address, pte, make_huge_pte(vma, page));
+ new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
+ && (vma->vm_flags & VM_SHARED)));
+ set_huge_pte_at(mm, address, ptep, new_pte);
+
+ if (write_access && !(vma->vm_flags & VM_SHARED)) {
+ /* Optimization, do the COW without a second fault */
+ ret = hugetlb_cow(mm, vma, address, ptep, new_pte);
+ }
+
spin_unlock(&mm->page_table_lock);
unlock_page(page);
out:
@@ -433,6 +494,33 @@ backout:
goto out;
}
+int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, int write_access)
+{
+ pte_t *ptep;
+ pte_t entry;
+ int ret;
+
+ ptep = huge_pte_alloc(mm, address);
+ if (!ptep)
+ return VM_FAULT_OOM;
+
+ entry = *ptep;
+ if (pte_none(entry))
+ return hugetlb_no_page(mm, vma, address, ptep, write_access);
+
+ ret = VM_FAULT_MINOR;
+
+ spin_lock(&mm->page_table_lock);
+ /* Check for a racing update before calling hugetlb_cow */
+ if (likely(pte_same(entry, *ptep)))
+ if (write_access && !pte_write(entry))
+ ret = hugetlb_cow(mm, vma, address, ptep, entry);
+ spin_unlock(&mm->page_table_lock);
+
+ return ret;
+}
+
int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
struct page **pages, struct vm_area_struct **vmas,
unsigned long *position, int *length, int i)
diff --git a/mm/internal.h b/mm/internal.h
index 6bf134e8fb3..17256bb2f4e 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -9,5 +9,22 @@
* 2 of the License, or (at your option) any later version.
*/
-/* page_alloc.c */
-extern void set_page_refs(struct page *page, int order);
+static inline void set_page_refs(struct page *page, int order)
+{
+#ifdef CONFIG_MMU
+ set_page_count(page, 1);
+#else
+ int i;
+
+ /*
+ * We need to reference all the pages for this order, otherwise if
+ * anyone accesses one of the pages with (get/put) it will be freed.
+ * - eg: access_process_vm()
+ */
+ for (i = 0; i < (1 << order); i++)
+ set_page_count(page + i, 1);
+#endif /* CONFIG_MMU */
+}
+
+extern void fastcall __init __free_pages_bootmem(struct page *page,
+ unsigned int order);
diff --git a/mm/madvise.c b/mm/madvise.c
index 2b7cf0400a2..ae0ae3ea299 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -140,6 +140,36 @@ static long madvise_dontneed(struct vm_area_struct * vma,
return 0;
}
+/*
+ * Application wants to free up the pages and associated backing store.
+ * This is effectively punching a hole into the middle of a file.
+ *
+ * NOTE: Currently, only shmfs/tmpfs is supported for this operation.
+ * Other filesystems return -ENOSYS.
+ */
+static long madvise_remove(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end)
+{
+ struct address_space *mapping;
+ loff_t offset, endoff;
+
+ if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB))
+ return -EINVAL;
+
+ if (!vma->vm_file || !vma->vm_file->f_mapping
+ || !vma->vm_file->f_mapping->host) {
+ return -EINVAL;
+ }
+
+ mapping = vma->vm_file->f_mapping;
+
+ offset = (loff_t)(start - vma->vm_start)
+ + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
+ endoff = (loff_t)(end - vma->vm_start - 1)
+ + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
+ return vmtruncate_range(mapping->host, offset, endoff);
+}
+
static long
madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
unsigned long start, unsigned long end, int behavior)
@@ -152,6 +182,9 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
case MADV_RANDOM:
error = madvise_behavior(vma, prev, start, end, behavior);
break;
+ case MADV_REMOVE:
+ error = madvise_remove(vma, start, end);
+ break;
case MADV_WILLNEED:
error = madvise_willneed(vma, prev, start, end);
@@ -190,6 +223,8 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
* some pages ahead.
* MADV_DONTNEED - the application is finished with the given range,
* so the kernel can free resources associated with it.
+ * MADV_REMOVE - the application wants to free up the given range of
+ * pages and associated backing store.
*
* return values:
* zero - success
diff --git a/mm/memory.c b/mm/memory.c
index d8dde07a365..7a11ddd5060 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1498,7 +1498,7 @@ gotten:
update_mmu_cache(vma, address, entry);
lazy_mmu_prot_update(entry);
lru_cache_add_active(new_page);
- page_add_anon_rmap(new_page, vma, address);
+ page_add_new_anon_rmap(new_page, vma, address);
/* Free the old page.. */
new_page = old_page;
@@ -1770,9 +1770,32 @@ out_big:
out_busy:
return -ETXTBSY;
}
-
EXPORT_SYMBOL(vmtruncate);
+int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
+{
+ struct address_space *mapping = inode->i_mapping;
+
+ /*
+ * If the underlying filesystem is not going to provide
+ * a way to truncate a range of blocks (punch a hole) -
+ * we should return failure right now.
+ */
+ if (!inode->i_op || !inode->i_op->truncate_range)
+ return -ENOSYS;
+
+ mutex_lock(&inode->i_mutex);
+ down_write(&inode->i_alloc_sem);
+ unmap_mapping_range(mapping, offset, (end - offset), 1);
+ truncate_inode_pages_range(mapping, offset, end);
+ inode->i_op->truncate_range(inode, offset, end);
+ up_write(&inode->i_alloc_sem);
+ mutex_unlock(&inode->i_mutex);
+
+ return 0;
+}
+EXPORT_SYMBOL(vmtruncate_range);
+
/*
* Primitive swap readahead code. We simply read an aligned block of
* (1 << page_cluster) entries in the swap area. This method is chosen
@@ -1954,8 +1977,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
goto release;
inc_mm_counter(mm, anon_rss);
lru_cache_add_active(page);
- SetPageReferenced(page);
- page_add_anon_rmap(page, vma, address);
+ page_add_new_anon_rmap(page, vma, address);
} else {
/* Map the ZERO_PAGE - vm_page_prot is readonly */
page = ZERO_PAGE(address);
@@ -2086,7 +2108,7 @@ retry:
if (anon) {
inc_mm_counter(mm, anon_rss);
lru_cache_add_active(new_page);
- page_add_anon_rmap(new_page, vma, address);
+ page_add_new_anon_rmap(new_page, vma, address);
} else {
inc_mm_counter(mm, file_rss);
page_add_file_rmap(new_page);
@@ -2245,6 +2267,8 @@ int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
}
+EXPORT_SYMBOL_GPL(__handle_mm_fault);
+
#ifndef __PAGETABLE_PUD_FOLDED
/*
* Allocate page upper directory.
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index f6d4af8af8a..a918f77f02f 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -42,7 +42,6 @@ extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
int nr_pages);
static int __add_section(struct zone *zone, unsigned long phys_start_pfn)
{
- struct pglist_data *pgdat = zone->zone_pgdat;
int nr_pages = PAGES_PER_SECTION;
int ret;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 72f402cc9c9..73790188b0e 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -83,9 +83,18 @@
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/mempolicy.h>
+#include <linux/swap.h>
+#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+
#include <asm/tlbflush.h>
#include <asm/uaccess.h>
+/* Internal flags */
+#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */
+#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */
+#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2) /* Gather statistics */
+
static kmem_cache_t *policy_cache;
static kmem_cache_t *sn_cache;
@@ -93,7 +102,7 @@ static kmem_cache_t *sn_cache;
/* Highest zone. An specific allocation for a zone below that is not
policied. */
-static int policy_zone;
+int policy_zone = ZONE_DMA;
struct mempolicy default_policy = {
.refcnt = ATOMIC_INIT(1), /* never free it */
@@ -131,17 +140,8 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes)
if (!zl)
return NULL;
num = 0;
- for_each_node_mask(nd, *nodes) {
- int k;
- for (k = MAX_NR_ZONES-1; k >= 0; k--) {
- struct zone *z = &NODE_DATA(nd)->node_zones[k];
- if (!z->present_pages)
- continue;
- zl->zones[num++] = z;
- if (k > policy_zone)
- policy_zone = k;
- }
- }
+ for_each_node_mask(nd, *nodes)
+ zl->zones[num++] = &NODE_DATA(nd)->node_zones[policy_zone];
zl->zones[num] = NULL;
return zl;
}
@@ -180,12 +180,19 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
break;
}
policy->policy = mode;
+ policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
return policy;
}
-/* Ensure all existing pages follow the policy. */
+static void gather_stats(struct page *, void *);
+static void migrate_page_add(struct page *page, struct list_head *pagelist,
+ unsigned long flags);
+
+/* Scan through pages checking if pages follow certain conditions. */
static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
- unsigned long addr, unsigned long end, nodemask_t *nodes)
+ unsigned long addr, unsigned long end,
+ const nodemask_t *nodes, unsigned long flags,
+ void *private)
{
pte_t *orig_pte;
pte_t *pte;
@@ -201,8 +208,28 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
page = vm_normal_page(vma, addr, *pte);
if (!page)
continue;
+ /*
+ * The check for PageReserved here is important to avoid
+ * handling zero pages and other pages that may have been
+ * marked special by the system.
+ *
+ * If the PageReserved would not be checked here then f.e.
+ * the location of the zero page could have an influence
+ * on MPOL_MF_STRICT, zero pages would be counted for
+ * the per node stats, and there would be useless attempts
+ * to put zero pages on the migration list.
+ */
+ if (PageReserved(page))
+ continue;
nid = page_to_nid(page);
- if (!node_isset(nid, *nodes))
+ if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
+ continue;
+
+ if (flags & MPOL_MF_STATS)
+ gather_stats(page, private);
+ else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+ migrate_page_add(page, private, flags);
+ else
break;
} while (pte++, addr += PAGE_SIZE, addr != end);
pte_unmap_unlock(orig_pte, ptl);
@@ -210,7 +237,9 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
}
static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
- unsigned long addr, unsigned long end, nodemask_t *nodes)
+ unsigned long addr, unsigned long end,
+ const nodemask_t *nodes, unsigned long flags,
+ void *private)
{
pmd_t *pmd;
unsigned long next;
@@ -220,14 +249,17 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
next = pmd_addr_end(addr, end);
if (pmd_none_or_clear_bad(pmd))
continue;
- if (check_pte_range(vma, pmd, addr, next, nodes))
+ if (check_pte_range(vma, pmd, addr, next, nodes,
+ flags, private))
return -EIO;
} while (pmd++, addr = next, addr != end);
return 0;
}
static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
- unsigned long addr, unsigned long end, nodemask_t *nodes)
+ unsigned long addr, unsigned long end,
+ const nodemask_t *nodes, unsigned long flags,
+ void *private)
{
pud_t *pud;
unsigned long next;
@@ -237,14 +269,17 @@ static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
next = pud_addr_end(addr, end);
if (pud_none_or_clear_bad(pud))
continue;
- if (check_pmd_range(vma, pud, addr, next, nodes))
+ if (check_pmd_range(vma, pud, addr, next, nodes,
+ flags, private))
return -EIO;
} while (pud++, addr = next, addr != end);
return 0;
}
static inline int check_pgd_range(struct vm_area_struct *vma,
- unsigned long addr, unsigned long end, nodemask_t *nodes)
+ unsigned long addr, unsigned long end,
+ const nodemask_t *nodes, unsigned long flags,
+ void *private)
{
pgd_t *pgd;
unsigned long next;
@@ -254,36 +289,61 @@ static inline int check_pgd_range(struct vm_area_struct *vma,
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
continue;
- if (check_pud_range(vma, pgd, addr, next, nodes))
+ if (check_pud_range(vma, pgd, addr, next, nodes,
+ flags, private))
return -EIO;
} while (pgd++, addr = next, addr != end);
return 0;
}
-/* Step 1: check the range */
+/* Check if a vma is migratable */
+static inline int vma_migratable(struct vm_area_struct *vma)
+{
+ if (vma->vm_flags & (
+ VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP|VM_RESERVED))
+ return 0;
+ return 1;
+}
+
+/*
+ * Check if all pages in a range are on a set of nodes.
+ * If pagelist != NULL then isolate pages from the LRU and
+ * put them on the pagelist.
+ */
static struct vm_area_struct *
check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
- nodemask_t *nodes, unsigned long flags)
+ const nodemask_t *nodes, unsigned long flags, void *private)
{
int err;
struct vm_area_struct *first, *vma, *prev;
+ /* Clear the LRU lists so pages can be isolated */
+ if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+ lru_add_drain_all();
+
first = find_vma(mm, start);
if (!first)
return ERR_PTR(-EFAULT);
prev = NULL;
for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
- if (!vma->vm_next && vma->vm_end < end)
- return ERR_PTR(-EFAULT);
- if (prev && prev->vm_end < vma->vm_start)
- return ERR_PTR(-EFAULT);
- if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) {
+ if (!(flags & MPOL_MF_DISCONTIG_OK)) {
+ if (!vma->vm_next && vma->vm_end < end)
+ return ERR_PTR(-EFAULT);
+ if (prev && prev->vm_end < vma->vm_start)
+ return ERR_PTR(-EFAULT);
+ }
+ if (!is_vm_hugetlb_page(vma) &&
+ ((flags & MPOL_MF_STRICT) ||
+ ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
+ vma_migratable(vma)))) {
unsigned long endvma = vma->vm_end;
+
if (endvma > end)
endvma = end;
if (vma->vm_start > start)
start = vma->vm_start;
- err = check_pgd_range(vma, start, endvma, nodes);
+ err = check_pgd_range(vma, start, endvma, nodes,
+ flags, private);
if (err) {
first = ERR_PTR(err);
break;
@@ -342,51 +402,10 @@ static int contextualize_policy(int mode, nodemask_t *nodes)
if (!nodes)
return 0;
- /* Update current mems_allowed */
- cpuset_update_current_mems_allowed();
- /* Ignore nodes not set in current->mems_allowed */
- cpuset_restrict_to_mems_allowed(nodes->bits);
- return mpol_check_policy(mode, nodes);
-}
-
-long do_mbind(unsigned long start, unsigned long len,
- unsigned long mode, nodemask_t *nmask, unsigned long flags)
-{
- struct vm_area_struct *vma;
- struct mm_struct *mm = current->mm;
- struct mempolicy *new;
- unsigned long end;
- int err;
-
- if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
- return -EINVAL;
- if (start & ~PAGE_MASK)
- return -EINVAL;
- if (mode == MPOL_DEFAULT)
- flags &= ~MPOL_MF_STRICT;
- len = (len + PAGE_SIZE - 1) & PAGE_MASK;
- end = start + len;
- if (end < start)
+ cpuset_update_task_memory_state();
+ if (!cpuset_nodes_subset_current_mems_allowed(*nodes))
return -EINVAL;
- if (end == start)
- return 0;
- if (mpol_check_policy(mode, nmask))
- return -EINVAL;
- new = mpol_new(mode, nmask);
- if (IS_ERR(new))
- return PTR_ERR(new);
-
- PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
- mode,nodes_addr(nodes)[0]);
-
- down_write(&mm->mmap_sem);
- vma = check_range(mm, start, end, nmask, flags);
- err = PTR_ERR(vma);
- if (!IS_ERR(vma))
- err = mbind_range(vma, start, end, new);
- up_write(&mm->mmap_sem);
- mpol_free(new);
- return err;
+ return mpol_check_policy(mode, nodes);
}
/* Set the process memory policy */
@@ -457,7 +476,7 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask,
struct vm_area_struct *vma = NULL;
struct mempolicy *pol = current->mempolicy;
- cpuset_update_current_mems_allowed();
+ cpuset_update_task_memory_state();
if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
return -EINVAL;
if (flags & MPOL_F_ADDR) {
@@ -509,11 +528,141 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask,
}
/*
+ * page migration
+ */
+
+static void migrate_page_add(struct page *page, struct list_head *pagelist,
+ unsigned long flags)
+{
+ /*
+ * Avoid migrating a page that is shared with others.
+ */
+ if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
+ if (isolate_lru_page(page))
+ list_add(&page->lru, pagelist);
+ }
+}
+
+static int swap_pages(struct list_head *pagelist)
+{
+ LIST_HEAD(moved);
+ LIST_HEAD(failed);
+ int n;
+
+ n = migrate_pages(pagelist, NULL, &moved, &failed);
+ putback_lru_pages(&failed);
+ putback_lru_pages(&moved);
+
+ return n;
+}
+
+/*
+ * For now migrate_pages simply swaps out the pages from nodes that are in
+ * the source set but not in the target set. In the future, we would
+ * want a function that moves pages between the two nodesets in such
+ * a way as to preserve the physical layout as much as possible.
+ *
+ * Returns the number of page that could not be moved.
+ */
+int do_migrate_pages(struct mm_struct *mm,
+ const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
+{
+ LIST_HEAD(pagelist);
+ int count = 0;
+ nodemask_t nodes;
+
+ nodes_andnot(nodes, *from_nodes, *to_nodes);
+
+ down_read(&mm->mmap_sem);
+ check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nodes,
+ flags | MPOL_MF_DISCONTIG_OK, &pagelist);
+
+ if (!list_empty(&pagelist)) {
+ count = swap_pages(&pagelist);
+ putback_lru_pages(&pagelist);
+ }
+
+ up_read(&mm->mmap_sem);
+ return count;
+}
+
+long do_mbind(unsigned long start, unsigned long len,
+ unsigned long mode, nodemask_t *nmask, unsigned long flags)
+{
+ struct vm_area_struct *vma;
+ struct mm_struct *mm = current->mm;
+ struct mempolicy *new;
+ unsigned long end;
+ int err;
+ LIST_HEAD(pagelist);
+
+ if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
+ MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+ || mode > MPOL_MAX)
+ return -EINVAL;
+ if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE))
+ return -EPERM;
+
+ if (start & ~PAGE_MASK)
+ return -EINVAL;
+
+ if (mode == MPOL_DEFAULT)
+ flags &= ~MPOL_MF_STRICT;
+
+ len = (len + PAGE_SIZE - 1) & PAGE_MASK;
+ end = start + len;
+
+ if (end < start)
+ return -EINVAL;
+ if (end == start)
+ return 0;
+
+ if (mpol_check_policy(mode, nmask))
+ return -EINVAL;
+
+ new = mpol_new(mode, nmask);
+ if (IS_ERR(new))
+ return PTR_ERR(new);
+
+ /*
+ * If we are using the default policy then operation
+ * on discontinuous address spaces is okay after all
+ */
+ if (!new)
+ flags |= MPOL_MF_DISCONTIG_OK;
+
+ PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
+ mode,nodes_addr(nodes)[0]);
+
+ down_write(&mm->mmap_sem);
+ vma = check_range(mm, start, end, nmask,
+ flags | MPOL_MF_INVERT, &pagelist);
+
+ err = PTR_ERR(vma);
+ if (!IS_ERR(vma)) {
+ int nr_failed = 0;
+
+ err = mbind_range(vma, start, end, new);
+ if (!list_empty(&pagelist))
+ nr_failed = swap_pages(&pagelist);
+
+ if (!err && nr_failed && (flags & MPOL_MF_STRICT))
+ err = -EIO;
+ }
+ if (!list_empty(&pagelist))
+ putback_lru_pages(&pagelist);
+
+ up_write(&mm->mmap_sem);
+ mpol_free(new);
+ return err;
+}
+
+/*
* User space interface with variable sized bitmaps for nodelists.
*/
/* Copy a node mask from user space. */
-static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask,
+static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
unsigned long maxnode)
{
unsigned long k;
@@ -602,6 +751,65 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
return do_set_mempolicy(mode, &nodes);
}
+asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
+ const unsigned long __user *old_nodes,
+ const unsigned long __user *new_nodes)
+{
+ struct mm_struct *mm;
+ struct task_struct *task;
+ nodemask_t old;
+ nodemask_t new;
+ nodemask_t task_nodes;
+ int err;
+
+ err = get_nodes(&old, old_nodes, maxnode);
+ if (err)
+ return err;
+
+ err = get_nodes(&new, new_nodes, maxnode);
+ if (err)
+ return err;
+
+ /* Find the mm_struct */
+ read_lock(&tasklist_lock);
+ task = pid ? find_task_by_pid(pid) : current;
+ if (!task) {
+ read_unlock(&tasklist_lock);
+ return -ESRCH;
+ }
+ mm = get_task_mm(task);
+ read_unlock(&tasklist_lock);
+
+ if (!mm)
+ return -EINVAL;
+
+ /*
+ * Check if this process has the right to modify the specified
+ * process. The right exists if the process has administrative
+ * capabilities, superuser priviledges or the same
+ * userid as the target process.
+ */
+ if ((current->euid != task->suid) && (current->euid != task->uid) &&
+ (current->uid != task->suid) && (current->uid != task->uid) &&
+ !capable(CAP_SYS_ADMIN)) {
+ err = -EPERM;
+ goto out;
+ }
+
+ task_nodes = cpuset_mems_allowed(task);
+ /* Is the user allowed to access the target nodes? */
+ if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_ADMIN)) {
+ err = -EPERM;
+ goto out;
+ }
+
+ err = do_migrate_pages(mm, &old, &new, MPOL_MF_MOVE);
+out:
+ mmput(mm);
+ return err;
+}
+
+
/* Retrieve NUMA policy */
asmlinkage long sys_get_mempolicy(int __user *policy,
unsigned long __user *nmask,
@@ -708,8 +916,8 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
#endif
/* Return effective policy for a VMA */
-struct mempolicy *
-get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr)
+static struct mempolicy * get_vma_policy(struct task_struct *task,
+ struct vm_area_struct *vma, unsigned long addr)
{
struct mempolicy *pol = task->mempolicy;
@@ -768,6 +976,33 @@ static unsigned interleave_nodes(struct mempolicy *policy)
return nid;
}
+/*
+ * Depending on the memory policy provide a node from which to allocate the
+ * next slab entry.
+ */
+unsigned slab_node(struct mempolicy *policy)
+{
+ switch (policy->policy) {
+ case MPOL_INTERLEAVE:
+ return interleave_nodes(policy);
+
+ case MPOL_BIND:
+ /*
+ * Follow bind policy behavior and start allocation at the
+ * first node.
+ */
+ return policy->v.zonelist->zones[0]->zone_pgdat->node_id;
+
+ case MPOL_PREFERRED:
+ if (policy->v.preferred_node >= 0)
+ return policy->v.preferred_node;
+ /* Fall through */
+
+ default:
+ return numa_node_id();
+ }
+}
+
/* Do static interleaving for a VMA with known offset. */
static unsigned offset_il_node(struct mempolicy *pol,
struct vm_area_struct *vma, unsigned long off)
@@ -785,6 +1020,34 @@ static unsigned offset_il_node(struct mempolicy *pol,
return nid;
}
+/* Determine a node number for interleave */
+static inline unsigned interleave_nid(struct mempolicy *pol,
+ struct vm_area_struct *vma, unsigned long addr, int shift)
+{
+ if (vma) {
+ unsigned long off;
+
+ off = vma->vm_pgoff;
+ off += (addr - vma->vm_start) >> shift;
+ return offset_il_node(pol, vma, off);
+ } else
+ return interleave_nodes(pol);
+}
+
+/* Return a zonelist suitable for a huge page allocation. */
+struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
+{
+ struct mempolicy *pol = get_vma_policy(current, vma, addr);
+
+ if (pol->policy == MPOL_INTERLEAVE) {
+ unsigned nid;
+
+ nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
+ return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER);
+ }
+ return zonelist_policy(GFP_HIGHUSER, pol);
+}
+
/* Allocate a page in interleaved policy.
Own path because it needs to do special accounting. */
static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
@@ -829,19 +1092,12 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
{
struct mempolicy *pol = get_vma_policy(current, vma, addr);
- cpuset_update_current_mems_allowed();
+ cpuset_update_task_memory_state();
if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
unsigned nid;
- if (vma) {
- unsigned long off;
- off = vma->vm_pgoff;
- off += (addr - vma->vm_start) >> PAGE_SHIFT;
- nid = offset_il_node(pol, vma, off);
- } else {
- /* fall back to process interleaving */
- nid = interleave_nodes(pol);
- }
+
+ nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
return alloc_page_interleave(gfp, 0, nid);
}
return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
@@ -862,7 +1118,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
* interrupt context and apply the current process NUMA policy.
* Returns NULL when no page can be allocated.
*
- * Don't call cpuset_update_current_mems_allowed() unless
+ * Don't call cpuset_update_task_memory_state() unless
* 1) it's ok to take cpuset_sem (can WAIT), and
* 2) allocating for current task (not interrupt).
*/
@@ -871,7 +1127,7 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
struct mempolicy *pol = current->mempolicy;
if ((gfp & __GFP_WAIT) && !in_interrupt())
- cpuset_update_current_mems_allowed();
+ cpuset_update_task_memory_state();
if (!pol || in_interrupt())
pol = &default_policy;
if (pol->policy == MPOL_INTERLEAVE)
@@ -880,6 +1136,15 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
}
EXPORT_SYMBOL(alloc_pages_current);
+/*
+ * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it
+ * rebinds the mempolicy its copying by calling mpol_rebind_policy()
+ * with the mems_allowed returned by cpuset_mems_allowed(). This
+ * keeps mempolicies cpuset relative after its cpuset moves. See
+ * further kernel/cpuset.c update_nodemask().
+ */
+void *cpuset_being_rebound;
+
/* Slow path of a mempolicy copy */
struct mempolicy *__mpol_copy(struct mempolicy *old)
{
@@ -887,6 +1152,10 @@ struct mempolicy *__mpol_copy(struct mempolicy *old)
if (!new)
return ERR_PTR(-ENOMEM);
+ if (current_cpuset_is_being_rebound()) {
+ nodemask_t mems = cpuset_mems_allowed(current);
+ mpol_rebind_policy(old, &mems);
+ }
*new = *old;
atomic_set(&new->refcnt, 1);
if (new->policy == MPOL_BIND) {
@@ -940,54 +1209,6 @@ void __mpol_free(struct mempolicy *p)
}
/*
- * Hugetlb policy. Same as above, just works with node numbers instead of
- * zonelists.
- */
-
-/* Find first node suitable for an allocation */
-int mpol_first_node(struct vm_area_struct *vma, unsigned long addr)
-{
- struct mempolicy *pol = get_vma_policy(current, vma, addr);
-
- switch (pol->policy) {
- case MPOL_DEFAULT:
- return numa_node_id();
- case MPOL_BIND:
- return pol->v.zonelist->zones[0]->zone_pgdat->node_id;
- case MPOL_INTERLEAVE:
- return interleave_nodes(pol);
- case MPOL_PREFERRED:
- return pol->v.preferred_node >= 0 ?
- pol->v.preferred_node : numa_node_id();
- }
- BUG();
- return 0;
-}
-
-/* Find secondary valid nodes for an allocation */
-int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr)
-{
- struct mempolicy *pol = get_vma_policy(current, vma, addr);
-
- switch (pol->policy) {
- case MPOL_PREFERRED:
- case MPOL_DEFAULT:
- case MPOL_INTERLEAVE:
- return 1;
- case MPOL_BIND: {
- struct zone **z;
- for (z = pol->v.zonelist->zones; *z; z++)
- if ((*z)->zone_pgdat->node_id == nid)
- return 1;
- return 0;
- }
- default:
- BUG();
- return 0;
- }
-}
-
-/*
* Shared memory backing store policy support.
*
* Remember policies even when nobody has shared memory mapped.
@@ -1141,6 +1362,30 @@ restart:
return 0;
}
+void mpol_shared_policy_init(struct shared_policy *info, int policy,
+ nodemask_t *policy_nodes)
+{
+ info->root = RB_ROOT;
+ spin_lock_init(&info->lock);
+
+ if (policy != MPOL_DEFAULT) {
+ struct mempolicy *newpol;
+
+ /* Falls back to MPOL_DEFAULT on any error */
+ newpol = mpol_new(policy, policy_nodes);
+ if (!IS_ERR(newpol)) {
+ /* Create pseudo-vma that contains just the policy */
+ struct vm_area_struct pvma;
+
+ memset(&pvma, 0, sizeof(struct vm_area_struct));
+ /* Policy covers entire file */
+ pvma.vm_end = TASK_SIZE;
+ mpol_set_shared_policy(info, &pvma, newpol);
+ mpol_free(newpol);
+ }
+ }
+}
+
int mpol_set_shared_policy(struct shared_policy *info,
struct vm_area_struct *vma, struct mempolicy *npol)
{
@@ -1209,25 +1454,31 @@ void numa_default_policy(void)
}
/* Migrate a policy to a different set of nodes */
-static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
- const nodemask_t *new)
+void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
{
+ nodemask_t *mpolmask;
nodemask_t tmp;
if (!pol)
return;
+ mpolmask = &pol->cpuset_mems_allowed;
+ if (nodes_equal(*mpolmask, *newmask))
+ return;
switch (pol->policy) {
case MPOL_DEFAULT:
break;
case MPOL_INTERLEAVE:
- nodes_remap(tmp, pol->v.nodes, *old, *new);
+ nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
pol->v.nodes = tmp;
- current->il_next = node_remap(current->il_next, *old, *new);
+ *mpolmask = *newmask;
+ current->il_next = node_remap(current->il_next,
+ *mpolmask, *newmask);
break;
case MPOL_PREFERRED:
pol->v.preferred_node = node_remap(pol->v.preferred_node,
- *old, *new);
+ *mpolmask, *newmask);
+ *mpolmask = *newmask;
break;
case MPOL_BIND: {
nodemask_t nodes;
@@ -1237,7 +1488,7 @@ static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
nodes_clear(nodes);
for (z = pol->v.zonelist->zones; *z; z++)
node_set((*z)->zone_pgdat->node_id, nodes);
- nodes_remap(tmp, nodes, *old, *new);
+ nodes_remap(tmp, nodes, *mpolmask, *newmask);
nodes = tmp;
zonelist = bind_zonelist(&nodes);
@@ -1252,6 +1503,7 @@ static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
kfree(pol->v.zonelist);
pol->v.zonelist = zonelist;
}
+ *mpolmask = *newmask;
break;
}
default:
@@ -1261,12 +1513,156 @@ static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
}
/*
- * Someone moved this task to different nodes. Fixup mempolicies.
+ * Wrapper for mpol_rebind_policy() that just requires task
+ * pointer, and updates task mempolicy.
+ */
+
+void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
+{
+ mpol_rebind_policy(tsk->mempolicy, new);
+}
+
+/*
+ * Rebind each vma in mm to new nodemask.
*
- * TODO - fixup current->mm->vma and shmfs/tmpfs/hugetlbfs policies as well,
- * once we have a cpuset mechanism to mark which cpuset subtree is migrating.
+ * Call holding a reference to mm. Takes mm->mmap_sem during call.
+ */
+
+void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
+{
+ struct vm_area_struct *vma;
+
+ down_write(&mm->mmap_sem);
+ for (vma = mm->mmap; vma; vma = vma->vm_next)
+ mpol_rebind_policy(vma->vm_policy, new);
+ up_write(&mm->mmap_sem);
+}
+
+/*
+ * Display pages allocated per node and memory policy via /proc.
+ */
+
+static const char *policy_types[] = { "default", "prefer", "bind",
+ "interleave" };
+
+/*
+ * Convert a mempolicy into a string.
+ * Returns the number of characters in buffer (if positive)
+ * or an error (negative)
*/
-void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new)
+static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
+{
+ char *p = buffer;
+ int l;
+ nodemask_t nodes;
+ int mode = pol ? pol->policy : MPOL_DEFAULT;
+
+ switch (mode) {
+ case MPOL_DEFAULT:
+ nodes_clear(nodes);
+ break;
+
+ case MPOL_PREFERRED:
+ nodes_clear(nodes);
+ node_set(pol->v.preferred_node, nodes);
+ break;
+
+ case MPOL_BIND:
+ get_zonemask(pol, &nodes);
+ break;
+
+ case MPOL_INTERLEAVE:
+ nodes = pol->v.nodes;
+ break;
+
+ default:
+ BUG();
+ return -EFAULT;
+ }
+
+ l = strlen(policy_types[mode]);
+ if (buffer + maxlen < p + l + 1)
+ return -ENOSPC;
+
+ strcpy(p, policy_types[mode]);
+ p += l;
+
+ if (!nodes_empty(nodes)) {
+ if (buffer + maxlen < p + 2)
+ return -ENOSPC;
+ *p++ = '=';
+ p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
+ }
+ return p - buffer;
+}
+
+struct numa_maps {
+ unsigned long pages;
+ unsigned long anon;
+ unsigned long mapped;
+ unsigned long mapcount_max;
+ unsigned long node[MAX_NUMNODES];
+};
+
+static void gather_stats(struct page *page, void *private)
+{
+ struct numa_maps *md = private;
+ int count = page_mapcount(page);
+
+ if (count)
+ md->mapped++;
+
+ if (count > md->mapcount_max)
+ md->mapcount_max = count;
+
+ md->pages++;
+
+ if (PageAnon(page))
+ md->anon++;
+
+ md->node[page_to_nid(page)]++;
+ cond_resched();
+}
+
+int show_numa_map(struct seq_file *m, void *v)
{
- rebind_policy(current->mempolicy, old, new);
+ struct task_struct *task = m->private;
+ struct vm_area_struct *vma = v;
+ struct numa_maps *md;
+ int n;
+ char buffer[50];
+
+ if (!vma->vm_mm)
+ return 0;
+
+ md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
+ if (!md)
+ return 0;
+
+ check_pgd_range(vma, vma->vm_start, vma->vm_end,
+ &node_online_map, MPOL_MF_STATS, md);
+
+ if (md->pages) {
+ mpol_to_str(buffer, sizeof(buffer),
+ get_vma_policy(task, vma, vma->vm_start));
+
+ seq_printf(m, "%08lx %s pages=%lu mapped=%lu maxref=%lu",
+ vma->vm_start, buffer, md->pages,
+ md->mapped, md->mapcount_max);
+
+ if (md->anon)
+ seq_printf(m," anon=%lu",md->anon);
+
+ for_each_online_node(n)
+ if (md->node[n])
+ seq_printf(m, " N%d=%lu", n, md->node[n]);
+
+ seq_putc(m, '\n');
+ }
+ kfree(md);
+
+ if (m->count < m->size)
+ m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0;
+ return 0;
}
+
diff --git a/mm/mlock.c b/mm/mlock.c
index 4ae3a46ff76..b90c59573ab 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -5,6 +5,7 @@
* (C) Copyright 2002 Christoph Hellwig
*/
+#include <linux/capability.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/mempolicy.h>
diff --git a/mm/mmap.c b/mm/mmap.c
index 64ba4dbcb7d..47556d2b3e9 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -13,6 +13,7 @@
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/syscalls.h>
+#include <linux/capability.h>
#include <linux/init.h>
#include <linux/file.h>
#include <linux/fs.h>
diff --git a/mm/mremap.c b/mm/mremap.c
index ddaeee9a0b6..1903bdf65e4 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -13,6 +13,7 @@
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/swap.h>
+#include <linux/capability.h>
#include <linux/fs.h>
#include <linux/highmem.h>
#include <linux/security.h>
diff --git a/mm/msync.c b/mm/msync.c
index 1b5b6f662dc..3563a56e1a5 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -137,7 +137,7 @@ static int msync_interval(struct vm_area_struct *vma,
ret = filemap_fdatawrite(mapping);
if (file->f_op && file->f_op->fsync) {
/*
- * We don't take i_sem here because mmap_sem
+ * We don't take i_mutex here because mmap_sem
* is already held.
*/
err = file->f_op->fsync(file,file->f_dentry,1);
diff --git a/mm/nommu.c b/mm/nommu.c
index c1196812876..c10262d6823 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1177,3 +1177,10 @@ int in_gate_area_no_task(unsigned long addr)
{
return 0;
}
+
+struct page *filemap_nopage(struct vm_area_struct *area,
+ unsigned long address, int *type)
+{
+ BUG();
+ return NULL;
+}
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index d348b903595..14bd4ec7959 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -274,6 +274,7 @@ void out_of_memory(gfp_t gfp_mask, int order)
show_mem();
}
+ cpuset_lock();
read_lock(&tasklist_lock);
retry:
p = select_bad_process();
@@ -284,6 +285,7 @@ retry:
/* Found nothing?!?! Either we hang forever, or we panic. */
if (!p) {
read_unlock(&tasklist_lock);
+ cpuset_unlock();
panic("Out of memory and no killable processes...\n");
}
@@ -293,12 +295,14 @@ retry:
out:
read_unlock(&tasklist_lock);
+ cpuset_unlock();
if (mm)
mmput(mm);
/*
* Give "p" a good chance of killing itself before we
- * retry to allocate memory.
+ * retry to allocate memory unless "p" is current
*/
- schedule_timeout_interruptible(1);
+ if (!test_thread_flag(TIF_MEMDIE))
+ schedule_timeout_interruptible(1);
}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 0166ea15c9e..945559fb63d 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -46,7 +46,7 @@
static long ratelimit_pages = 32;
static long total_pages; /* The total number of pages in the machine. */
-static int dirty_exceeded; /* Dirty mem may be over limit */
+static int dirty_exceeded __cacheline_aligned_in_smp; /* Dirty mem may be over limit */
/*
* When balance_dirty_pages decides that the caller needs to perform some
@@ -212,7 +212,8 @@ static void balance_dirty_pages(struct address_space *mapping)
if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh)
break;
- dirty_exceeded = 1;
+ if (!dirty_exceeded)
+ dirty_exceeded = 1;
/* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
* Unstable writes are a feature of certain networked
@@ -234,7 +235,7 @@ static void balance_dirty_pages(struct address_space *mapping)
blk_congestion_wait(WRITE, HZ/10);
}
- if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh)
+ if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh && dirty_exceeded)
dirty_exceeded = 0;
if (writeback_in_progress(bdi))
@@ -550,11 +551,17 @@ void __init page_writeback_init(void)
int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
+ int ret;
+
if (wbc->nr_to_write <= 0)
return 0;
+ wbc->for_writepages = 1;
if (mapping->a_ops->writepages)
- return mapping->a_ops->writepages(mapping, wbc);
- return generic_writepages(mapping, wbc);
+ ret = mapping->a_ops->writepages(mapping, wbc);
+ else
+ ret = generic_writepages(mapping, wbc);
+ wbc->for_writepages = 0;
+ return ret;
}
/**
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index fe14a8c87fc..df54e2fc8ee 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -36,6 +36,7 @@
#include <linux/memory_hotplug.h>
#include <linux/nodemask.h>
#include <linux/vmalloc.h>
+#include <linux/mempolicy.h>
#include <asm/tlbflush.h>
#include "internal.h"
@@ -52,6 +53,9 @@ struct pglist_data *pgdat_list __read_mostly;
unsigned long totalram_pages __read_mostly;
unsigned long totalhigh_pages __read_mostly;
long nr_swap_pages;
+int percpu_pagelist_fraction;
+
+static void fastcall free_hot_cold_page(struct page *page, int cold);
/*
* results with 256, 32 in the lowmem_reserve sysctl:
@@ -81,6 +85,7 @@ int min_free_kbytes = 1024;
unsigned long __initdata nr_kernel_pages;
unsigned long __initdata nr_all_pages;
+#ifdef CONFIG_DEBUG_VM
static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
{
int ret = 0;
@@ -122,16 +127,23 @@ static int bad_range(struct zone *zone, struct page *page)
return 0;
}
-static void bad_page(const char *function, struct page *page)
+#else
+static inline int bad_range(struct zone *zone, struct page *page)
{
- printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n",
- function, current->comm, page);
- printk(KERN_EMERG "flags:0x%0*lx mapping:%p mapcount:%d count:%d\n",
- (int)(2*sizeof(unsigned long)), (unsigned long)page->flags,
- page->mapping, page_mapcount(page), page_count(page));
- printk(KERN_EMERG "Backtrace:\n");
+ return 0;
+}
+#endif
+
+static void bad_page(struct page *page)
+{
+ printk(KERN_EMERG "Bad page state in process '%s'\n"
+ KERN_EMERG "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n"
+ KERN_EMERG "Trying to fix it up, but a reboot is needed\n"
+ KERN_EMERG "Backtrace:\n",
+ current->comm, page, (int)(2*sizeof(unsigned long)),
+ (unsigned long)page->flags, page->mapping,
+ page_mapcount(page), page_count(page));
dump_stack();
- printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n");
page->flags &= ~(1 << PG_lru |
1 << PG_private |
1 << PG_locked |
@@ -184,19 +196,15 @@ static void destroy_compound_page(struct page *page, unsigned long order)
int i;
int nr_pages = 1 << order;
- if (!PageCompound(page))
- return;
-
- if (page[1].index != order)
- bad_page(__FUNCTION__, page);
+ if (unlikely(page[1].index != order))
+ bad_page(page);
for (i = 0; i < nr_pages; i++) {
struct page *p = page + i;
- if (!PageCompound(p))
- bad_page(__FUNCTION__, page);
- if (page_private(p) != (unsigned long)page)
- bad_page(__FUNCTION__, page);
+ if (unlikely(!PageCompound(p) |
+ (page_private(p) != (unsigned long)page)))
+ bad_page(page);
ClearPageCompound(p);
}
}
@@ -255,14 +263,20 @@ __find_combined_index(unsigned long page_idx, unsigned int order)
/*
* This function checks whether a page is free && is the buddy
* we can do coalesce a page and its buddy if
- * (a) the buddy is free &&
- * (b) the buddy is on the buddy system &&
- * (c) a page and its buddy have the same order.
+ * (a) the buddy is not in a hole &&
+ * (b) the buddy is free &&
+ * (c) the buddy is on the buddy system &&
+ * (d) a page and its buddy have the same order.
* for recording page's order, we use page_private(page) and PG_private.
*
*/
static inline int page_is_buddy(struct page *page, int order)
{
+#ifdef CONFIG_HOLES_IN_ZONE
+ if (!pfn_valid(page_to_pfn(page)))
+ return 0;
+#endif
+
if (PagePrivate(page) &&
(page_order(page) == order) &&
page_count(page) == 0)
@@ -294,13 +308,13 @@ static inline int page_is_buddy(struct page *page, int order)
* -- wli
*/
-static inline void __free_pages_bulk (struct page *page,
+static inline void __free_one_page(struct page *page,
struct zone *zone, unsigned int order)
{
unsigned long page_idx;
int order_size = 1 << order;
- if (unlikely(order))
+ if (unlikely(PageCompound(page)))
destroy_compound_page(page, order);
page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
@@ -314,17 +328,15 @@ static inline void __free_pages_bulk (struct page *page,
struct free_area *area;
struct page *buddy;
- combined_idx = __find_combined_index(page_idx, order);
buddy = __page_find_buddy(page, page_idx, order);
-
- if (bad_range(zone, buddy))
- break;
if (!page_is_buddy(buddy, order))
break; /* Move the buddy up one level. */
+
list_del(&buddy->lru);
area = zone->free_area + order;
area->nr_free--;
rmv_page_order(buddy);
+ combined_idx = __find_combined_index(page_idx, order);
page = page + (combined_idx - page_idx);
page_idx = combined_idx;
order++;
@@ -334,11 +346,11 @@ static inline void __free_pages_bulk (struct page *page,
zone->free_area[order].nr_free++;
}
-static inline int free_pages_check(const char *function, struct page *page)
+static inline int free_pages_check(struct page *page)
{
- if ( page_mapcount(page) ||
- page->mapping != NULL ||
- page_count(page) != 0 ||
+ if (unlikely(page_mapcount(page) |
+ (page->mapping != NULL) |
+ (page_count(page) != 0) |
(page->flags & (
1 << PG_lru |
1 << PG_private |
@@ -348,8 +360,8 @@ static inline int free_pages_check(const char *function, struct page *page)
1 << PG_slab |
1 << PG_swapcache |
1 << PG_writeback |
- 1 << PG_reserved )))
- bad_page(function, page);
+ 1 << PG_reserved ))))
+ bad_page(page);
if (PageDirty(page))
__ClearPageDirty(page);
/*
@@ -371,51 +383,90 @@ static inline int free_pages_check(const char *function, struct page *page)
* And clear the zone's pages_scanned counter, to hold off the "all pages are
* pinned" detection logic.
*/
-static int
-free_pages_bulk(struct zone *zone, int count,
- struct list_head *list, unsigned int order)
+static void free_pages_bulk(struct zone *zone, int count,
+ struct list_head *list, int order)
{
- unsigned long flags;
- struct page *page = NULL;
- int ret = 0;
-
- spin_lock_irqsave(&zone->lock, flags);
+ spin_lock(&zone->lock);
zone->all_unreclaimable = 0;
zone->pages_scanned = 0;
- while (!list_empty(list) && count--) {
+ while (count--) {
+ struct page *page;
+
+ BUG_ON(list_empty(list));
page = list_entry(list->prev, struct page, lru);
- /* have to delete it as __free_pages_bulk list manipulates */
+ /* have to delete it as __free_one_page list manipulates */
list_del(&page->lru);
- __free_pages_bulk(page, zone, order);
- ret++;
+ __free_one_page(page, zone, order);
}
- spin_unlock_irqrestore(&zone->lock, flags);
- return ret;
+ spin_unlock(&zone->lock);
}
-void __free_pages_ok(struct page *page, unsigned int order)
+static void free_one_page(struct zone *zone, struct page *page, int order)
{
LIST_HEAD(list);
+ list_add(&page->lru, &list);
+ free_pages_bulk(zone, 1, &list, order);
+}
+
+static void __free_pages_ok(struct page *page, unsigned int order)
+{
+ unsigned long flags;
int i;
int reserved = 0;
arch_free_page(page, order);
+ if (!PageHighMem(page))
+ mutex_debug_check_no_locks_freed(page_address(page),
+ PAGE_SIZE<<order);
#ifndef CONFIG_MMU
- if (order > 0)
- for (i = 1 ; i < (1 << order) ; ++i)
- __put_page(page + i);
+ for (i = 1 ; i < (1 << order) ; ++i)
+ __put_page(page + i);
#endif
for (i = 0 ; i < (1 << order) ; ++i)
- reserved += free_pages_check(__FUNCTION__, page + i);
+ reserved += free_pages_check(page + i);
if (reserved)
return;
- list_add(&page->lru, &list);
- mod_page_state(pgfree, 1 << order);
- kernel_map_pages(page, 1<<order, 0);
- free_pages_bulk(page_zone(page), 1, &list, order);
+ kernel_map_pages(page, 1 << order, 0);
+ local_irq_save(flags);
+ __mod_page_state(pgfree, 1 << order);
+ free_one_page(page_zone(page), page, order);
+ local_irq_restore(flags);
+}
+
+/*
+ * permit the bootmem allocator to evade page validation on high-order frees
+ */
+void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order)
+{
+ if (order == 0) {
+ __ClearPageReserved(page);
+ set_page_count(page, 0);
+
+ free_hot_cold_page(page, 0);
+ } else {
+ LIST_HEAD(list);
+ int loop;
+
+ for (loop = 0; loop < BITS_PER_LONG; loop++) {
+ struct page *p = &page[loop];
+
+ if (loop + 16 < BITS_PER_LONG)
+ prefetchw(p + 16);
+ __ClearPageReserved(p);
+ set_page_count(p, 0);
+ }
+
+ arch_free_page(page, order);
+
+ mod_page_state(pgfree, 1 << order);
+
+ list_add(&page->lru, &list);
+ kernel_map_pages(page, 1 << order, 0);
+ free_pages_bulk(page_zone(page), 1, &list, order);
+ }
}
@@ -433,8 +484,7 @@ void __free_pages_ok(struct page *page, unsigned int order)
*
* -- wli
*/
-static inline struct page *
-expand(struct zone *zone, struct page *page,
+static inline void expand(struct zone *zone, struct page *page,
int low, int high, struct free_area *area)
{
unsigned long size = 1 << high;
@@ -448,24 +498,6 @@ expand(struct zone *zone, struct page *page,
area->nr_free++;
set_page_order(&page[size], high);
}
- return page;
-}
-
-void set_page_refs(struct page *page, int order)
-{
-#ifdef CONFIG_MMU
- set_page_count(page, 1);
-#else
- int i;
-
- /*
- * We need to reference all the pages for this order, otherwise if
- * anyone accesses one of the pages with (get/put) it will be freed.
- * - eg: access_process_vm()
- */
- for (i = 0; i < (1 << order); i++)
- set_page_count(page + i, 1);
-#endif /* CONFIG_MMU */
}
/*
@@ -473,9 +505,9 @@ void set_page_refs(struct page *page, int order)
*/
static int prep_new_page(struct page *page, int order)
{
- if ( page_mapcount(page) ||
- page->mapping != NULL ||
- page_count(page) != 0 ||
+ if (unlikely(page_mapcount(page) |
+ (page->mapping != NULL) |
+ (page_count(page) != 0) |
(page->flags & (
1 << PG_lru |
1 << PG_private |
@@ -486,8 +518,8 @@ static int prep_new_page(struct page *page, int order)
1 << PG_slab |
1 << PG_swapcache |
1 << PG_writeback |
- 1 << PG_reserved )))
- bad_page(__FUNCTION__, page);
+ 1 << PG_reserved ))))
+ bad_page(page);
/*
* For now, we report if PG_reserved was found set, but do not
@@ -525,7 +557,8 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order)
rmv_page_order(page);
area->nr_free--;
zone->free_pages -= 1UL << order;
- return expand(zone, page, order, current_order, area);
+ expand(zone, page, order, current_order, area);
+ return page;
}
return NULL;
@@ -539,21 +572,17 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order)
static int rmqueue_bulk(struct zone *zone, unsigned int order,
unsigned long count, struct list_head *list)
{
- unsigned long flags;
int i;
- int allocated = 0;
- struct page *page;
- spin_lock_irqsave(&zone->lock, flags);
+ spin_lock(&zone->lock);
for (i = 0; i < count; ++i) {
- page = __rmqueue(zone, order);
- if (page == NULL)
+ struct page *page = __rmqueue(zone, order);
+ if (unlikely(page == NULL))
break;
- allocated++;
list_add_tail(&page->lru, list);
}
- spin_unlock_irqrestore(&zone->lock, flags);
- return allocated;
+ spin_unlock(&zone->lock);
+ return i;
}
#ifdef CONFIG_NUMA
@@ -572,14 +601,13 @@ void drain_remote_pages(void)
if (zone->zone_pgdat->node_id == numa_node_id())
continue;
- pset = zone->pageset[smp_processor_id()];
+ pset = zone_pcp(zone, smp_processor_id());
for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
struct per_cpu_pages *pcp;
pcp = &pset->pcp[i];
- if (pcp->count)
- pcp->count -= free_pages_bulk(zone, pcp->count,
- &pcp->list, 0);
+ free_pages_bulk(zone, pcp->count, &pcp->list, 0);
+ pcp->count = 0;
}
}
local_irq_restore(flags);
@@ -589,6 +617,7 @@ void drain_remote_pages(void)
#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU)
static void __drain_pages(unsigned int cpu)
{
+ unsigned long flags;
struct zone *zone;
int i;
@@ -600,8 +629,10 @@ static void __drain_pages(unsigned int cpu)
struct per_cpu_pages *pcp;
pcp = &pset->pcp[i];
- pcp->count -= free_pages_bulk(zone, pcp->count,
- &pcp->list, 0);
+ local_irq_save(flags);
+ free_pages_bulk(zone, pcp->count, &pcp->list, 0);
+ pcp->count = 0;
+ local_irq_restore(flags);
}
}
}
@@ -647,18 +678,14 @@ void drain_local_pages(void)
}
#endif /* CONFIG_PM */
-static void zone_statistics(struct zonelist *zonelist, struct zone *z)
+static void zone_statistics(struct zonelist *zonelist, struct zone *z, int cpu)
{
#ifdef CONFIG_NUMA
- unsigned long flags;
- int cpu;
pg_data_t *pg = z->zone_pgdat;
pg_data_t *orig = zonelist->zones[0]->zone_pgdat;
struct per_cpu_pageset *p;
- local_irq_save(flags);
- cpu = smp_processor_id();
- p = zone_pcp(z,cpu);
+ p = zone_pcp(z, cpu);
if (pg == orig) {
p->numa_hit++;
} else {
@@ -669,14 +696,12 @@ static void zone_statistics(struct zonelist *zonelist, struct zone *z)
p->local_node++;
else
p->other_node++;
- local_irq_restore(flags);
#endif
}
/*
* Free a 0-order page
*/
-static void FASTCALL(free_hot_cold_page(struct page *page, int cold));
static void fastcall free_hot_cold_page(struct page *page, int cold)
{
struct zone *zone = page_zone(page);
@@ -687,18 +712,20 @@ static void fastcall free_hot_cold_page(struct page *page, int cold)
if (PageAnon(page))
page->mapping = NULL;
- if (free_pages_check(__FUNCTION__, page))
+ if (free_pages_check(page))
return;
- inc_page_state(pgfree);
kernel_map_pages(page, 1, 0);
pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
local_irq_save(flags);
+ __inc_page_state(pgfree);
list_add(&page->lru, &pcp->list);
pcp->count++;
- if (pcp->count >= pcp->high)
- pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
+ if (pcp->count >= pcp->high) {
+ free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
+ pcp->count -= pcp->batch;
+ }
local_irq_restore(flags);
put_cpu();
}
@@ -727,49 +754,58 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
* we cheat by calling it from here, in the order > 0 path. Saves a branch
* or two.
*/
-static struct page *
-buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags)
+static struct page *buffered_rmqueue(struct zonelist *zonelist,
+ struct zone *zone, int order, gfp_t gfp_flags)
{
unsigned long flags;
struct page *page;
int cold = !!(gfp_flags & __GFP_COLD);
+ int cpu;
again:
- if (order == 0) {
+ cpu = get_cpu();
+ if (likely(order == 0)) {
struct per_cpu_pages *pcp;
- page = NULL;
- pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
+ pcp = &zone_pcp(zone, cpu)->pcp[cold];
local_irq_save(flags);
- if (pcp->count <= pcp->low)
+ if (!pcp->count) {
pcp->count += rmqueue_bulk(zone, 0,
pcp->batch, &pcp->list);
- if (pcp->count) {
- page = list_entry(pcp->list.next, struct page, lru);
- list_del(&page->lru);
- pcp->count--;
+ if (unlikely(!pcp->count))
+ goto failed;
}
- local_irq_restore(flags);
- put_cpu();
+ page = list_entry(pcp->list.next, struct page, lru);
+ list_del(&page->lru);
+ pcp->count--;
} else {
spin_lock_irqsave(&zone->lock, flags);
page = __rmqueue(zone, order);
- spin_unlock_irqrestore(&zone->lock, flags);
+ spin_unlock(&zone->lock);
+ if (!page)
+ goto failed;
}
- if (page != NULL) {
- BUG_ON(bad_range(zone, page));
- mod_page_state_zone(zone, pgalloc, 1 << order);
- if (prep_new_page(page, order))
- goto again;
+ __mod_page_state_zone(zone, pgalloc, 1 << order);
+ zone_statistics(zonelist, zone, cpu);
+ local_irq_restore(flags);
+ put_cpu();
+
+ BUG_ON(bad_range(zone, page));
+ if (prep_new_page(page, order))
+ goto again;
- if (gfp_flags & __GFP_ZERO)
- prep_zero_page(page, order, gfp_flags);
+ if (gfp_flags & __GFP_ZERO)
+ prep_zero_page(page, order, gfp_flags);
- if (order && (gfp_flags & __GFP_COMP))
- prep_compound_page(page, order);
- }
+ if (order && (gfp_flags & __GFP_COMP))
+ prep_compound_page(page, order);
return page;
+
+failed:
+ local_irq_restore(flags);
+ put_cpu();
+ return NULL;
}
#define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */
@@ -842,12 +878,13 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
mark = (*z)->pages_high;
if (!zone_watermark_ok(*z, order, mark,
classzone_idx, alloc_flags))
- continue;
+ if (!zone_reclaim_mode ||
+ !zone_reclaim(*z, gfp_mask, order))
+ continue;
}
- page = buffered_rmqueue(*z, order, gfp_mask);
+ page = buffered_rmqueue(zonelist, *z, order, gfp_mask);
if (page) {
- zone_statistics(zonelist, *z);
break;
}
} while (*(++z) != NULL);
@@ -896,15 +933,15 @@ restart:
*
* The caller may dip into page reserves a bit more if the caller
* cannot run direct reclaim, or if the caller has realtime scheduling
- * policy.
+ * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
+ * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
*/
alloc_flags = ALLOC_WMARK_MIN;
if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait)
alloc_flags |= ALLOC_HARDER;
if (gfp_mask & __GFP_HIGH)
alloc_flags |= ALLOC_HIGH;
- if (wait)
- alloc_flags |= ALLOC_CPUSET;
+ alloc_flags |= ALLOC_CPUSET;
/*
* Go through the zonelist again. Let __GFP_HIGH and allocations
@@ -926,7 +963,7 @@ restart:
nofail_alloc:
/* go through the zonelist yet again, ignoring mins */
page = get_page_from_freelist(gfp_mask, order,
- zonelist, ALLOC_NO_WATERMARKS|ALLOC_CPUSET);
+ zonelist, ALLOC_NO_WATERMARKS);
if (page)
goto got_pg;
if (gfp_mask & __GFP_NOFAIL) {
@@ -945,6 +982,7 @@ rebalance:
cond_resched();
/* We now go into synchronous reclaim */
+ cpuset_memory_pressure_bump();
p->flags |= PF_MEMALLOC;
reclaim_state.reclaimed_slab = 0;
p->reclaim_state = &reclaim_state;
@@ -1171,7 +1209,7 @@ EXPORT_SYMBOL(nr_pagecache);
DEFINE_PER_CPU(long, nr_pagecache_local) = 0;
#endif
-void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask)
+static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask)
{
int cpu = 0;
@@ -1224,7 +1262,7 @@ void get_full_page_state(struct page_state *ret)
__get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask);
}
-unsigned long __read_page_state(unsigned long offset)
+unsigned long read_page_state_offset(unsigned long offset)
{
unsigned long ret = 0;
int cpu;
@@ -1238,18 +1276,26 @@ unsigned long __read_page_state(unsigned long offset)
return ret;
}
-void __mod_page_state(unsigned long offset, unsigned long delta)
+void __mod_page_state_offset(unsigned long offset, unsigned long delta)
+{
+ void *ptr;
+
+ ptr = &__get_cpu_var(page_states);
+ *(unsigned long *)(ptr + offset) += delta;
+}
+EXPORT_SYMBOL(__mod_page_state_offset);
+
+void mod_page_state_offset(unsigned long offset, unsigned long delta)
{
unsigned long flags;
- void* ptr;
+ void *ptr;
local_irq_save(flags);
ptr = &__get_cpu_var(page_states);
- *(unsigned long*)(ptr + offset) += delta;
+ *(unsigned long *)(ptr + offset) += delta;
local_irq_restore(flags);
}
-
-EXPORT_SYMBOL(__mod_page_state);
+EXPORT_SYMBOL(mod_page_state_offset);
void __get_zone_counts(unsigned long *active, unsigned long *inactive,
unsigned long *free, struct pglist_data *pgdat)
@@ -1335,7 +1381,7 @@ void show_free_areas(void)
show_node(zone);
printk("%s per-cpu:", zone->name);
- if (!zone->present_pages) {
+ if (!populated_zone(zone)) {
printk(" empty\n");
continue;
} else
@@ -1347,10 +1393,9 @@ void show_free_areas(void)
pageset = zone_pcp(zone, cpu);
for (temperature = 0; temperature < 2; temperature++)
- printk("cpu %d %s: low %d, high %d, batch %d used:%d\n",
+ printk("cpu %d %s: high %d, batch %d used:%d\n",
cpu,
temperature ? "cold" : "hot",
- pageset->pcp[temperature].low,
pageset->pcp[temperature].high,
pageset->pcp[temperature].batch,
pageset->pcp[temperature].count);
@@ -1413,7 +1458,7 @@ void show_free_areas(void)
show_node(zone);
printk("%s: ", zone->name);
- if (!zone->present_pages) {
+ if (!populated_zone(zone)) {
printk("empty\n");
continue;
}
@@ -1433,36 +1478,29 @@ void show_free_areas(void)
/*
* Builds allocation fallback zone lists.
+ *
+ * Add all populated zones of a node to the zonelist.
*/
-static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k)
-{
- switch (k) {
- struct zone *zone;
- default:
- BUG();
- case ZONE_HIGHMEM:
- zone = pgdat->node_zones + ZONE_HIGHMEM;
- if (zone->present_pages) {
+static int __init build_zonelists_node(pg_data_t *pgdat,
+ struct zonelist *zonelist, int nr_zones, int zone_type)
+{
+ struct zone *zone;
+
+ BUG_ON(zone_type > ZONE_HIGHMEM);
+
+ do {
+ zone = pgdat->node_zones + zone_type;
+ if (populated_zone(zone)) {
#ifndef CONFIG_HIGHMEM
- BUG();
+ BUG_ON(zone_type > ZONE_NORMAL);
#endif
- zonelist->zones[j++] = zone;
+ zonelist->zones[nr_zones++] = zone;
+ check_highest_zone(zone_type);
}
- case ZONE_NORMAL:
- zone = pgdat->node_zones + ZONE_NORMAL;
- if (zone->present_pages)
- zonelist->zones[j++] = zone;
- case ZONE_DMA32:
- zone = pgdat->node_zones + ZONE_DMA32;
- if (zone->present_pages)
- zonelist->zones[j++] = zone;
- case ZONE_DMA:
- zone = pgdat->node_zones + ZONE_DMA;
- if (zone->present_pages)
- zonelist->zones[j++] = zone;
- }
+ zone_type--;
- return j;
+ } while (zone_type >= 0);
+ return nr_zones;
}
static inline int highest_zone(int zone_bits)
@@ -1559,13 +1597,22 @@ static void __init build_zonelists(pg_data_t *pgdat)
prev_node = local_node;
nodes_clear(used_mask);
while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
+ int distance = node_distance(local_node, node);
+
+ /*
+ * If another node is sufficiently far away then it is better
+ * to reclaim pages in a zone before going off node.
+ */
+ if (distance > RECLAIM_DISTANCE)
+ zone_reclaim_mode = 1;
+
/*
* We don't want to pressure a particular node.
* So adding penalty to the first node in same
* distance group to make it round-robin.
*/
- if (node_distance(local_node, node) !=
- node_distance(local_node, prev_node))
+
+ if (distance != node_distance(local_node, prev_node))
node_load[node] += load;
prev_node = node;
load--;
@@ -1699,18 +1746,16 @@ static void __init calculate_zone_totalpages(struct pglist_data *pgdat,
* up by free_all_bootmem() once the early boot process is
* done. Non-atomic initialization, single-pass.
*/
-void __devinit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
+void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
unsigned long start_pfn)
{
struct page *page;
unsigned long end_pfn = start_pfn + size;
unsigned long pfn;
- for (pfn = start_pfn; pfn < end_pfn; pfn++, page++) {
+ for (pfn = start_pfn; pfn < end_pfn; pfn++) {
if (!early_pfn_valid(pfn))
continue;
- if (!early_pfn_in_nid(pfn, nid))
- continue;
page = pfn_to_page(pfn);
set_page_links(page, zone, nid, pfn);
set_page_count(page, 1);
@@ -1754,7 +1799,7 @@ void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn,
memmap_init_zone((size), (nid), (zone), (start_pfn))
#endif
-static int __devinit zone_batchsize(struct zone *zone)
+static int __meminit zone_batchsize(struct zone *zone)
{
int batch;
@@ -1794,19 +1839,35 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
pcp = &p->pcp[0]; /* hot */
pcp->count = 0;
- pcp->low = 0;
pcp->high = 6 * batch;
pcp->batch = max(1UL, 1 * batch);
INIT_LIST_HEAD(&pcp->list);
pcp = &p->pcp[1]; /* cold*/
pcp->count = 0;
- pcp->low = 0;
pcp->high = 2 * batch;
pcp->batch = max(1UL, batch/2);
INIT_LIST_HEAD(&pcp->list);
}
+/*
+ * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist
+ * to the value high for the pageset p.
+ */
+
+static void setup_pagelist_highmark(struct per_cpu_pageset *p,
+ unsigned long high)
+{
+ struct per_cpu_pages *pcp;
+
+ pcp = &p->pcp[0]; /* hot list */
+ pcp->high = high;
+ pcp->batch = max(1UL, high/4);
+ if ((high/4) > (PAGE_SHIFT * 8))
+ pcp->batch = PAGE_SHIFT * 8;
+}
+
+
#ifdef CONFIG_NUMA
/*
* Boot pageset table. One per cpu which is going to be used for all
@@ -1832,18 +1893,22 @@ static struct per_cpu_pageset
* Dynamically allocate memory for the
* per cpu pageset array in struct zone.
*/
-static int __devinit process_zones(int cpu)
+static int __meminit process_zones(int cpu)
{
struct zone *zone, *dzone;
for_each_zone(zone) {
- zone->pageset[cpu] = kmalloc_node(sizeof(struct per_cpu_pageset),
+ zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
GFP_KERNEL, cpu_to_node(cpu));
- if (!zone->pageset[cpu])
+ if (!zone_pcp(zone, cpu))
goto bad;
- setup_pageset(zone->pageset[cpu], zone_batchsize(zone));
+ setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone));
+
+ if (percpu_pagelist_fraction)
+ setup_pagelist_highmark(zone_pcp(zone, cpu),
+ (zone->present_pages / percpu_pagelist_fraction));
}
return 0;
@@ -1851,15 +1916,14 @@ bad:
for_each_zone(dzone) {
if (dzone == zone)
break;
- kfree(dzone->pageset[cpu]);
- dzone->pageset[cpu] = NULL;
+ kfree(zone_pcp(dzone, cpu));
+ zone_pcp(dzone, cpu) = NULL;
}
return -ENOMEM;
}
static inline void free_zone_pagesets(int cpu)
{
-#ifdef CONFIG_NUMA
struct zone *zone;
for_each_zone(zone) {
@@ -1868,10 +1932,9 @@ static inline void free_zone_pagesets(int cpu)
zone_pcp(zone, cpu) = NULL;
kfree(pset);
}
-#endif
}
-static int __devinit pageset_cpuup_callback(struct notifier_block *nfb,
+static int __meminit pageset_cpuup_callback(struct notifier_block *nfb,
unsigned long action,
void *hcpu)
{
@@ -1911,7 +1974,7 @@ void __init setup_per_cpu_pageset(void)
#endif
-static __devinit
+static __meminit
void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
{
int i;
@@ -1931,7 +1994,7 @@ void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
init_waitqueue_head(zone->wait_table + i);
}
-static __devinit void zone_pcp_init(struct zone *zone)
+static __meminit void zone_pcp_init(struct zone *zone)
{
int cpu;
unsigned long batch = zone_batchsize(zone);
@@ -1939,7 +2002,7 @@ static __devinit void zone_pcp_init(struct zone *zone)
for (cpu = 0; cpu < NR_CPUS; cpu++) {
#ifdef CONFIG_NUMA
/* Early boot. Slab allocator not functional yet */
- zone->pageset[cpu] = &boot_pageset[cpu];
+ zone_pcp(zone, cpu) = &boot_pageset[cpu];
setup_pageset(&boot_pageset[cpu],0);
#else
setup_pageset(zone_pcp(zone,cpu), batch);
@@ -1949,7 +2012,7 @@ static __devinit void zone_pcp_init(struct zone *zone)
zone->name, zone->present_pages, batch);
}
-static __devinit void init_currently_empty_zone(struct zone *zone,
+static __meminit void init_currently_empty_zone(struct zone *zone,
unsigned long zone_start_pfn, unsigned long size)
{
struct pglist_data *pgdat = zone->zone_pgdat;
@@ -2116,7 +2179,7 @@ static int frag_show(struct seq_file *m, void *arg)
int order;
for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
- if (!zone->present_pages)
+ if (!populated_zone(zone))
continue;
spin_lock_irqsave(&zone->lock, flags);
@@ -2149,7 +2212,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) {
int i;
- if (!zone->present_pages)
+ if (!populated_zone(zone))
continue;
spin_lock_irqsave(&zone->lock, flags);
@@ -2182,7 +2245,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
seq_printf(m,
")"
"\n pagesets");
- for (i = 0; i < ARRAY_SIZE(zone->pageset); i++) {
+ for_each_online_cpu(i) {
struct per_cpu_pageset *pageset;
int j;
@@ -2197,12 +2260,10 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
seq_printf(m,
"\n cpu: %i pcp: %i"
"\n count: %i"
- "\n low: %i"
"\n high: %i"
"\n batch: %i",
i, j,
pageset->pcp[j].count,
- pageset->pcp[j].low,
pageset->pcp[j].high,
pageset->pcp[j].batch);
}
@@ -2257,32 +2318,40 @@ static char *vmstat_text[] = {
"pgpgout",
"pswpin",
"pswpout",
- "pgalloc_high",
+ "pgalloc_high",
"pgalloc_normal",
+ "pgalloc_dma32",
"pgalloc_dma",
+
"pgfree",
"pgactivate",
"pgdeactivate",
"pgfault",
"pgmajfault",
+
"pgrefill_high",
"pgrefill_normal",
+ "pgrefill_dma32",
"pgrefill_dma",
"pgsteal_high",
"pgsteal_normal",
+ "pgsteal_dma32",
"pgsteal_dma",
+
"pgscan_kswapd_high",
"pgscan_kswapd_normal",
-
+ "pgscan_kswapd_dma32",
"pgscan_kswapd_dma",
+
"pgscan_direct_high",
"pgscan_direct_normal",
+ "pgscan_direct_dma32",
"pgscan_direct_dma",
- "pginodesteal",
+ "pginodesteal",
"slabs_scanned",
"kswapd_steal",
"kswapd_inodesteal",
@@ -2539,6 +2608,32 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
return 0;
}
+/*
+ * percpu_pagelist_fraction - changes the pcp->high for each zone on each
+ * cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist
+ * can have before it gets flushed back to buddy allocator.
+ */
+
+int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
+ struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+{
+ struct zone *zone;
+ unsigned int cpu;
+ int ret;
+
+ ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
+ if (!write || (ret == -EINVAL))
+ return ret;
+ for_each_zone(zone) {
+ for_each_online_cpu(cpu) {
+ unsigned long high;
+ high = zone->present_pages / percpu_pagelist_fraction;
+ setup_pagelist_highmark(zone_pcp(zone, cpu), high);
+ }
+ }
+ return 0;
+}
+
__initdata int hashdist = HASHDIST_DEFAULT;
#ifdef CONFIG_NUMA
diff --git a/mm/pdflush.c b/mm/pdflush.c
index 52822c98c48..c4b6d0afd73 100644
--- a/mm/pdflush.c
+++ b/mm/pdflush.c
@@ -90,7 +90,7 @@ struct pdflush_work {
static int __pdflush(struct pdflush_work *my_work)
{
- current->flags |= PF_FLUSHER;
+ current->flags |= PF_FLUSHER | PF_SWAPWRITE;
my_work->fn = NULL;
my_work->who = current;
INIT_LIST_HEAD(&my_work->list);
diff --git a/mm/readahead.c b/mm/readahead.c
index 72e7adbb87c..8d6eeaaa629 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -158,7 +158,7 @@ static int read_pages(struct address_space *mapping, struct file *filp,
{
unsigned page_idx;
struct pagevec lru_pvec;
- int ret = 0;
+ int ret;
if (mapping->a_ops->readpages) {
ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
@@ -171,14 +171,17 @@ static int read_pages(struct address_space *mapping, struct file *filp,
list_del(&page->lru);
if (!add_to_page_cache(page, mapping,
page->index, GFP_KERNEL)) {
- mapping->a_ops->readpage(filp, page);
- if (!pagevec_add(&lru_pvec, page))
- __pagevec_lru_add(&lru_pvec);
- } else {
- page_cache_release(page);
+ ret = mapping->a_ops->readpage(filp, page);
+ if (ret != AOP_TRUNCATED_PAGE) {
+ if (!pagevec_add(&lru_pvec, page))
+ __pagevec_lru_add(&lru_pvec);
+ continue;
+ } /* else fall through to release */
}
+ page_cache_release(page);
}
pagevec_lru_add(&lru_pvec);
+ ret = 0;
out:
return ret;
}
diff --git a/mm/rmap.c b/mm/rmap.c
index f853c6def15..d85a99d28c0 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -20,20 +20,20 @@
/*
* Lock ordering in mm:
*
- * inode->i_sem (while writing or truncating, not reading or faulting)
+ * inode->i_mutex (while writing or truncating, not reading or faulting)
* inode->i_alloc_sem
*
* When a page fault occurs in writing from user to file, down_read
- * of mmap_sem nests within i_sem; in sys_msync, i_sem nests within
- * down_read of mmap_sem; i_sem and down_write of mmap_sem are never
- * taken together; in truncation, i_sem is taken outermost.
+ * of mmap_sem nests within i_mutex; in sys_msync, i_mutex nests within
+ * down_read of mmap_sem; i_mutex and down_write of mmap_sem are never
+ * taken together; in truncation, i_mutex is taken outermost.
*
* mm->mmap_sem
* page->flags PG_locked (lock_page)
* mapping->i_mmap_lock
* anon_vma->lock
* mm->page_table_lock or pte_lock
- * zone->lru_lock (in mark_page_accessed)
+ * zone->lru_lock (in mark_page_accessed, isolate_lru_page)
* swap_lock (in swap_duplicate, swap_info_get)
* mmlist_lock (in mmput, drain_mmlist and others)
* mapping->private_lock (in __set_page_dirty_buffers)
@@ -435,6 +435,30 @@ int page_referenced(struct page *page, int is_locked)
}
/**
+ * page_set_anon_rmap - setup new anonymous rmap
+ * @page: the page to add the mapping to
+ * @vma: the vm area in which the mapping is added
+ * @address: the user virtual address mapped
+ */
+static void __page_set_anon_rmap(struct page *page,
+ struct vm_area_struct *vma, unsigned long address)
+{
+ struct anon_vma *anon_vma = vma->anon_vma;
+
+ BUG_ON(!anon_vma);
+ anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
+ page->mapping = (struct address_space *) anon_vma;
+
+ page->index = linear_page_index(vma, address);
+
+ /*
+ * nr_mapped state can be updated without turning off
+ * interrupts because it is not modified via interrupt.
+ */
+ __inc_page_state(nr_mapped);
+}
+
+/**
* page_add_anon_rmap - add pte mapping to an anonymous page
* @page: the page to add the mapping to
* @vma: the vm area in which the mapping is added
@@ -445,20 +469,27 @@ int page_referenced(struct page *page, int is_locked)
void page_add_anon_rmap(struct page *page,
struct vm_area_struct *vma, unsigned long address)
{
- if (atomic_inc_and_test(&page->_mapcount)) {
- struct anon_vma *anon_vma = vma->anon_vma;
-
- BUG_ON(!anon_vma);
- anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
- page->mapping = (struct address_space *) anon_vma;
-
- page->index = linear_page_index(vma, address);
-
- inc_page_state(nr_mapped);
- }
+ if (atomic_inc_and_test(&page->_mapcount))
+ __page_set_anon_rmap(page, vma, address);
/* else checking page index and mapping is racy */
}
+/*
+ * page_add_new_anon_rmap - add pte mapping to a new anonymous page
+ * @page: the page to add the mapping to
+ * @vma: the vm area in which the mapping is added
+ * @address: the user virtual address mapped
+ *
+ * Same as page_add_anon_rmap but must only be called on *new* pages.
+ * This means the inc-and-test can be bypassed.
+ */
+void page_add_new_anon_rmap(struct page *page,
+ struct vm_area_struct *vma, unsigned long address)
+{
+ atomic_set(&page->_mapcount, 0); /* elevate count by 1 (starts at -1) */
+ __page_set_anon_rmap(page, vma, address);
+}
+
/**
* page_add_file_rmap - add pte mapping to a file page
* @page: the page to add the mapping to
@@ -471,7 +502,7 @@ void page_add_file_rmap(struct page *page)
BUG_ON(!pfn_valid(page_to_pfn(page)));
if (atomic_inc_and_test(&page->_mapcount))
- inc_page_state(nr_mapped);
+ __inc_page_state(nr_mapped);
}
/**
@@ -483,6 +514,13 @@ void page_add_file_rmap(struct page *page)
void page_remove_rmap(struct page *page)
{
if (atomic_add_negative(-1, &page->_mapcount)) {
+ if (page_mapcount(page) < 0) {
+ printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page));
+ printk (KERN_EMERG " page->flags = %lx\n", page->flags);
+ printk (KERN_EMERG " page->count = %x\n", page_count(page));
+ printk (KERN_EMERG " page->mapping = %p\n", page->mapping);
+ }
+
BUG_ON(page_mapcount(page) < 0);
/*
* It would be tidy to reset the PageAnon mapping here,
@@ -495,7 +533,7 @@ void page_remove_rmap(struct page *page)
*/
if (page_test_and_clear_dirty(page))
set_page_dirty(page);
- dec_page_state(nr_mapped);
+ __dec_page_state(nr_mapped);
}
}
diff --git a/mm/shmem.c b/mm/shmem.c
index dc25565a61e..ce501bce1c2 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -457,7 +457,7 @@ static void shmem_free_pages(struct list_head *next)
} while (next);
}
-static void shmem_truncate(struct inode *inode)
+static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
{
struct shmem_inode_info *info = SHMEM_I(inode);
unsigned long idx;
@@ -475,18 +475,27 @@ static void shmem_truncate(struct inode *inode)
long nr_swaps_freed = 0;
int offset;
int freed;
+ int punch_hole = 0;
inode->i_ctime = inode->i_mtime = CURRENT_TIME;
- idx = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
if (idx >= info->next_index)
return;
spin_lock(&info->lock);
info->flags |= SHMEM_TRUNCATE;
- limit = info->next_index;
- info->next_index = idx;
+ if (likely(end == (loff_t) -1)) {
+ limit = info->next_index;
+ info->next_index = idx;
+ } else {
+ limit = (end + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ if (limit > info->next_index)
+ limit = info->next_index;
+ punch_hole = 1;
+ }
+
topdir = info->i_indirect;
- if (topdir && idx <= SHMEM_NR_DIRECT) {
+ if (topdir && idx <= SHMEM_NR_DIRECT && !punch_hole) {
info->i_indirect = NULL;
nr_pages_to_free++;
list_add(&topdir->lru, &pages_to_free);
@@ -573,11 +582,12 @@ static void shmem_truncate(struct inode *inode)
set_page_private(subdir, page_private(subdir) - freed);
if (offset)
spin_unlock(&info->lock);
- BUG_ON(page_private(subdir) > offset);
+ if (!punch_hole)
+ BUG_ON(page_private(subdir) > offset);
}
if (offset)
offset = 0;
- else if (subdir) {
+ else if (subdir && !page_private(subdir)) {
dir[diroff] = NULL;
nr_pages_to_free++;
list_add(&subdir->lru, &pages_to_free);
@@ -594,7 +604,7 @@ done2:
* Also, though shmem_getpage checks i_size before adding to
* cache, no recheck after: so fix the narrow window there too.
*/
- truncate_inode_pages(inode->i_mapping, inode->i_size);
+ truncate_inode_pages_range(inode->i_mapping, start, end);
}
spin_lock(&info->lock);
@@ -614,6 +624,11 @@ done2:
}
}
+static void shmem_truncate(struct inode *inode)
+{
+ shmem_truncate_range(inode, inode->i_size, (loff_t)-1);
+}
+
static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
{
struct inode *inode = dentry->d_inode;
@@ -855,7 +870,7 @@ unlock:
swap_free(swap);
redirty:
set_page_dirty(page);
- return WRITEPAGE_ACTIVATE; /* Return with the page locked */
+ return AOP_WRITEPAGE_ACTIVATE; /* Return with the page locked */
}
#ifdef CONFIG_NUMA
@@ -1255,7 +1270,7 @@ out_nomem:
return retval;
}
-static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
+int shmem_mmap(struct file *file, struct vm_area_struct *vma)
{
file_accessed(file);
vma->vm_ops = &shmem_vm_ops;
@@ -1301,7 +1316,8 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
case S_IFREG:
inode->i_op = &shmem_inode_operations;
inode->i_fop = &shmem_file_operations;
- mpol_shared_policy_init(&info->policy);
+ mpol_shared_policy_init(&info->policy, sbinfo->policy,
+ &sbinfo->policy_nodes);
break;
case S_IFDIR:
inode->i_nlink++;
@@ -1315,7 +1331,8 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
* Must not load anything in the rbtree,
* mpol_free_shared_policy will not be called.
*/
- mpol_shared_policy_init(&info->policy);
+ mpol_shared_policy_init(&info->policy, MPOL_DEFAULT,
+ NULL);
break;
}
} else if (sbinfo->max_inodes) {
@@ -1355,7 +1372,7 @@ shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t
if (!access_ok(VERIFY_READ, buf, count))
return -EFAULT;
- down(&inode->i_sem);
+ mutex_lock(&inode->i_mutex);
pos = *ppos;
written = 0;
@@ -1440,7 +1457,7 @@ shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t
if (written)
err = written;
out:
- up(&inode->i_sem);
+ mutex_unlock(&inode->i_mutex);
return err;
}
@@ -1476,7 +1493,7 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_
/*
* We must evaluate after, since reads (unlike writes)
- * are called without i_sem protection against truncate
+ * are called without i_mutex protection against truncate
*/
nr = PAGE_CACHE_SIZE;
i_size = i_size_read(inode);
@@ -1828,7 +1845,9 @@ static struct inode_operations shmem_symlink_inode_operations = {
.put_link = shmem_put_link,
};
-static int shmem_parse_options(char *options, int *mode, uid_t *uid, gid_t *gid, unsigned long *blocks, unsigned long *inodes)
+static int shmem_parse_options(char *options, int *mode, uid_t *uid,
+ gid_t *gid, unsigned long *blocks, unsigned long *inodes,
+ int *policy, nodemask_t *policy_nodes)
{
char *this_char, *value, *rest;
@@ -1882,6 +1901,19 @@ static int shmem_parse_options(char *options, int *mode, uid_t *uid, gid_t *gid,
*gid = simple_strtoul(value,&rest,0);
if (*rest)
goto bad_val;
+ } else if (!strcmp(this_char,"mpol")) {
+ if (!strcmp(value,"default"))
+ *policy = MPOL_DEFAULT;
+ else if (!strcmp(value,"preferred"))
+ *policy = MPOL_PREFERRED;
+ else if (!strcmp(value,"bind"))
+ *policy = MPOL_BIND;
+ else if (!strcmp(value,"interleave"))
+ *policy = MPOL_INTERLEAVE;
+ else
+ goto bad_val;
+ } else if (!strcmp(this_char,"mpol_nodelist")) {
+ nodelist_parse(value, *policy_nodes);
} else {
printk(KERN_ERR "tmpfs: Bad mount option %s\n",
this_char);
@@ -1902,12 +1934,14 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
unsigned long max_blocks = sbinfo->max_blocks;
unsigned long max_inodes = sbinfo->max_inodes;
+ int policy = sbinfo->policy;
+ nodemask_t policy_nodes = sbinfo->policy_nodes;
unsigned long blocks;
unsigned long inodes;
int error = -EINVAL;
- if (shmem_parse_options(data, NULL, NULL, NULL,
- &max_blocks, &max_inodes))
+ if (shmem_parse_options(data, NULL, NULL, NULL, &max_blocks,
+ &max_inodes, &policy, &policy_nodes))
return error;
spin_lock(&sbinfo->stat_lock);
@@ -1933,6 +1967,8 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
sbinfo->free_blocks = max_blocks - blocks;
sbinfo->max_inodes = max_inodes;
sbinfo->free_inodes = max_inodes - inodes;
+ sbinfo->policy = policy;
+ sbinfo->policy_nodes = policy_nodes;
out:
spin_unlock(&sbinfo->stat_lock);
return error;
@@ -1957,6 +1993,8 @@ static int shmem_fill_super(struct super_block *sb,
struct shmem_sb_info *sbinfo;
unsigned long blocks = 0;
unsigned long inodes = 0;
+ int policy = MPOL_DEFAULT;
+ nodemask_t policy_nodes = node_online_map;
#ifdef CONFIG_TMPFS
/*
@@ -1969,8 +2007,8 @@ static int shmem_fill_super(struct super_block *sb,
inodes = totalram_pages - totalhigh_pages;
if (inodes > blocks)
inodes = blocks;
- if (shmem_parse_options(data, &mode, &uid, &gid,
- &blocks, &inodes))
+ if (shmem_parse_options(data, &mode, &uid, &gid, &blocks,
+ &inodes, &policy, &policy_nodes))
return -EINVAL;
}
#else
@@ -1988,6 +2026,8 @@ static int shmem_fill_super(struct super_block *sb,
sbinfo->free_blocks = blocks;
sbinfo->max_inodes = inodes;
sbinfo->free_inodes = inodes;
+ sbinfo->policy = policy;
+ sbinfo->policy_nodes = policy_nodes;
sb->s_fs_info = sbinfo;
sb->s_maxbytes = SHMEM_MAX_BYTES;
@@ -2083,6 +2123,7 @@ static struct file_operations shmem_file_operations = {
static struct inode_operations shmem_inode_operations = {
.truncate = shmem_truncate,
.setattr = shmem_notify_change,
+ .truncate_range = shmem_truncate_range,
};
static struct inode_operations shmem_dir_inode_operations = {
diff --git a/mm/slab.c b/mm/slab.c
index e5ec26e0c46..6f8495e2185 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -68,7 +68,7 @@
* Further notes from the original documentation:
*
* 11 April '97. Started multi-threading - markhe
- * The global cache-chain is protected by the semaphore 'cache_chain_sem'.
+ * The global cache-chain is protected by the mutex 'cache_chain_mutex'.
* The sem is only needed when accessing/extending the cache-chain, which
* can never happen inside an interrupt (kmem_cache_create(),
* kmem_cache_shrink() and kmem_cache_reap()).
@@ -103,6 +103,8 @@
#include <linux/rcupdate.h>
#include <linux/string.h>
#include <linux/nodemask.h>
+#include <linux/mempolicy.h>
+#include <linux/mutex.h>
#include <asm/uaccess.h>
#include <asm/cacheflush.h>
@@ -130,7 +132,6 @@
#define FORCED_DEBUG 0
#endif
-
/* Shouldn't this be in a header file somewhere? */
#define BYTES_PER_WORD sizeof(void *)
@@ -217,12 +218,12 @@ static unsigned long offslab_limit;
* Slabs are chained into three list: fully used, partial, fully free slabs.
*/
struct slab {
- struct list_head list;
- unsigned long colouroff;
- void *s_mem; /* including colour offset */
- unsigned int inuse; /* num of objs active in slab */
- kmem_bufctl_t free;
- unsigned short nodeid;
+ struct list_head list;
+ unsigned long colouroff;
+ void *s_mem; /* including colour offset */
+ unsigned int inuse; /* num of objs active in slab */
+ kmem_bufctl_t free;
+ unsigned short nodeid;
};
/*
@@ -242,9 +243,9 @@ struct slab {
* We assume struct slab_rcu can overlay struct slab when destroying.
*/
struct slab_rcu {
- struct rcu_head head;
- kmem_cache_t *cachep;
- void *addr;
+ struct rcu_head head;
+ kmem_cache_t *cachep;
+ void *addr;
};
/*
@@ -279,23 +280,23 @@ struct array_cache {
#define BOOT_CPUCACHE_ENTRIES 1
struct arraycache_init {
struct array_cache cache;
- void * entries[BOOT_CPUCACHE_ENTRIES];
+ void *entries[BOOT_CPUCACHE_ENTRIES];
};
/*
* The slab lists for all objects.
*/
struct kmem_list3 {
- struct list_head slabs_partial; /* partial list first, better asm code */
- struct list_head slabs_full;
- struct list_head slabs_free;
- unsigned long free_objects;
- unsigned long next_reap;
- int free_touched;
- unsigned int free_limit;
- spinlock_t list_lock;
- struct array_cache *shared; /* shared per node */
- struct array_cache **alien; /* on other nodes */
+ struct list_head slabs_partial; /* partial list first, better asm code */
+ struct list_head slabs_full;
+ struct list_head slabs_free;
+ unsigned long free_objects;
+ unsigned long next_reap;
+ int free_touched;
+ unsigned int free_limit;
+ spinlock_t list_lock;
+ struct array_cache *shared; /* shared per node */
+ struct array_cache **alien; /* on other nodes */
};
/*
@@ -367,63 +368,63 @@ static inline void kmem_list3_init(struct kmem_list3 *parent)
*
* manages a cache.
*/
-
+
struct kmem_cache {
/* 1) per-cpu data, touched during every alloc/free */
- struct array_cache *array[NR_CPUS];
- unsigned int batchcount;
- unsigned int limit;
- unsigned int shared;
- unsigned int objsize;
+ struct array_cache *array[NR_CPUS];
+ unsigned int batchcount;
+ unsigned int limit;
+ unsigned int shared;
+ unsigned int objsize;
/* 2) touched by every alloc & free from the backend */
- struct kmem_list3 *nodelists[MAX_NUMNODES];
- unsigned int flags; /* constant flags */
- unsigned int num; /* # of objs per slab */
- spinlock_t spinlock;
+ struct kmem_list3 *nodelists[MAX_NUMNODES];
+ unsigned int flags; /* constant flags */
+ unsigned int num; /* # of objs per slab */
+ spinlock_t spinlock;
/* 3) cache_grow/shrink */
/* order of pgs per slab (2^n) */
- unsigned int gfporder;
+ unsigned int gfporder;
/* force GFP flags, e.g. GFP_DMA */
- gfp_t gfpflags;
+ gfp_t gfpflags;
- size_t colour; /* cache colouring range */
- unsigned int colour_off; /* colour offset */
- unsigned int colour_next; /* cache colouring */
- kmem_cache_t *slabp_cache;
- unsigned int slab_size;
- unsigned int dflags; /* dynamic flags */
+ size_t colour; /* cache colouring range */
+ unsigned int colour_off; /* colour offset */
+ unsigned int colour_next; /* cache colouring */
+ kmem_cache_t *slabp_cache;
+ unsigned int slab_size;
+ unsigned int dflags; /* dynamic flags */
/* constructor func */
- void (*ctor)(void *, kmem_cache_t *, unsigned long);
+ void (*ctor) (void *, kmem_cache_t *, unsigned long);
/* de-constructor func */
- void (*dtor)(void *, kmem_cache_t *, unsigned long);
+ void (*dtor) (void *, kmem_cache_t *, unsigned long);
/* 4) cache creation/removal */
- const char *name;
- struct list_head next;
+ const char *name;
+ struct list_head next;
/* 5) statistics */
#if STATS
- unsigned long num_active;
- unsigned long num_allocations;
- unsigned long high_mark;
- unsigned long grown;
- unsigned long reaped;
- unsigned long errors;
- unsigned long max_freeable;
- unsigned long node_allocs;
- unsigned long node_frees;
- atomic_t allochit;
- atomic_t allocmiss;
- atomic_t freehit;
- atomic_t freemiss;
+ unsigned long num_active;
+ unsigned long num_allocations;
+ unsigned long high_mark;
+ unsigned long grown;
+ unsigned long reaped;
+ unsigned long errors;
+ unsigned long max_freeable;
+ unsigned long node_allocs;
+ unsigned long node_frees;
+ atomic_t allochit;
+ atomic_t allocmiss;
+ atomic_t freehit;
+ atomic_t freemiss;
#endif
#if DEBUG
- int dbghead;
- int reallen;
+ int dbghead;
+ int reallen;
#endif
};
@@ -523,14 +524,15 @@ static unsigned long *dbg_redzone2(kmem_cache_t *cachep, void *objp)
{
BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
if (cachep->flags & SLAB_STORE_USER)
- return (unsigned long*) (objp+cachep->objsize-2*BYTES_PER_WORD);
- return (unsigned long*) (objp+cachep->objsize-BYTES_PER_WORD);
+ return (unsigned long *)(objp + cachep->objsize -
+ 2 * BYTES_PER_WORD);
+ return (unsigned long *)(objp + cachep->objsize - BYTES_PER_WORD);
}
static void **dbg_userword(kmem_cache_t *cachep, void *objp)
{
BUG_ON(!(cachep->flags & SLAB_STORE_USER));
- return (void**)(objp+cachep->objsize-BYTES_PER_WORD);
+ return (void **)(objp + cachep->objsize - BYTES_PER_WORD);
}
#else
@@ -607,31 +609,31 @@ struct cache_names {
static struct cache_names __initdata cache_names[] = {
#define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" },
#include <linux/kmalloc_sizes.h>
- { NULL, }
+ {NULL,}
#undef CACHE
};
static struct arraycache_init initarray_cache __initdata =
- { { 0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
+ { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
static struct arraycache_init initarray_generic =
- { { 0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
+ { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
/* internal cache of cache description objs */
static kmem_cache_t cache_cache = {
- .batchcount = 1,
- .limit = BOOT_CPUCACHE_ENTRIES,
- .shared = 1,
- .objsize = sizeof(kmem_cache_t),
- .flags = SLAB_NO_REAP,
- .spinlock = SPIN_LOCK_UNLOCKED,
- .name = "kmem_cache",
+ .batchcount = 1,
+ .limit = BOOT_CPUCACHE_ENTRIES,
+ .shared = 1,
+ .objsize = sizeof(kmem_cache_t),
+ .flags = SLAB_NO_REAP,
+ .spinlock = SPIN_LOCK_UNLOCKED,
+ .name = "kmem_cache",
#if DEBUG
- .reallen = sizeof(kmem_cache_t),
+ .reallen = sizeof(kmem_cache_t),
#endif
};
/* Guard access to the cache-chain. */
-static struct semaphore cache_chain_sem;
+static DEFINE_MUTEX(cache_chain_mutex);
static struct list_head cache_chain;
/*
@@ -655,9 +657,9 @@ static enum {
static DEFINE_PER_CPU(struct work_struct, reap_work);
-static void free_block(kmem_cache_t* cachep, void** objpp, int len, int node);
-static void enable_cpucache (kmem_cache_t *cachep);
-static void cache_reap (void *unused);
+static void free_block(kmem_cache_t *cachep, void **objpp, int len, int node);
+static void enable_cpucache(kmem_cache_t *cachep);
+static void cache_reap(void *unused);
static int __node_shrink(kmem_cache_t *cachep, int node);
static inline struct array_cache *ac_data(kmem_cache_t *cachep)
@@ -671,9 +673,9 @@ static inline kmem_cache_t *__find_general_cachep(size_t size, gfp_t gfpflags)
#if DEBUG
/* This happens if someone tries to call
- * kmem_cache_create(), or __kmalloc(), before
- * the generic caches are initialized.
- */
+ * kmem_cache_create(), or __kmalloc(), before
+ * the generic caches are initialized.
+ */
BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL);
#endif
while (size > csizep->cs_size)
@@ -697,10 +699,10 @@ EXPORT_SYMBOL(kmem_find_general_cachep);
/* Cal the num objs, wastage, and bytes left over for a given slab size. */
static void cache_estimate(unsigned long gfporder, size_t size, size_t align,
- int flags, size_t *left_over, unsigned int *num)
+ int flags, size_t *left_over, unsigned int *num)
{
int i;
- size_t wastage = PAGE_SIZE<<gfporder;
+ size_t wastage = PAGE_SIZE << gfporder;
size_t extra = 0;
size_t base = 0;
@@ -709,7 +711,7 @@ static void cache_estimate(unsigned long gfporder, size_t size, size_t align,
extra = sizeof(kmem_bufctl_t);
}
i = 0;
- while (i*size + ALIGN(base+i*extra, align) <= wastage)
+ while (i * size + ALIGN(base + i * extra, align) <= wastage)
i++;
if (i > 0)
i--;
@@ -718,8 +720,8 @@ static void cache_estimate(unsigned long gfporder, size_t size, size_t align,
i = SLAB_LIMIT;
*num = i;
- wastage -= i*size;
- wastage -= ALIGN(base+i*extra, align);
+ wastage -= i * size;
+ wastage -= ALIGN(base + i * extra, align);
*left_over = wastage;
}
@@ -728,7 +730,7 @@ static void cache_estimate(unsigned long gfporder, size_t size, size_t align,
static void __slab_error(const char *function, kmem_cache_t *cachep, char *msg)
{
printk(KERN_ERR "slab error in %s(): cache `%s': %s\n",
- function, cachep->name, msg);
+ function, cachep->name, msg);
dump_stack();
}
@@ -755,9 +757,9 @@ static void __devinit start_cpu_timer(int cpu)
}
static struct array_cache *alloc_arraycache(int node, int entries,
- int batchcount)
+ int batchcount)
{
- int memsize = sizeof(void*)*entries+sizeof(struct array_cache);
+ int memsize = sizeof(void *) * entries + sizeof(struct array_cache);
struct array_cache *nc = NULL;
nc = kmalloc_node(memsize, GFP_KERNEL, node);
@@ -772,10 +774,12 @@ static struct array_cache *alloc_arraycache(int node, int entries,
}
#ifdef CONFIG_NUMA
+static void *__cache_alloc_node(kmem_cache_t *, gfp_t, int);
+
static inline struct array_cache **alloc_alien_cache(int node, int limit)
{
struct array_cache **ac_ptr;
- int memsize = sizeof(void*)*MAX_NUMNODES;
+ int memsize = sizeof(void *) * MAX_NUMNODES;
int i;
if (limit > 1)
@@ -789,7 +793,7 @@ static inline struct array_cache **alloc_alien_cache(int node, int limit)
}
ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d);
if (!ac_ptr[i]) {
- for (i--; i <=0; i--)
+ for (i--; i <= 0; i--)
kfree(ac_ptr[i]);
kfree(ac_ptr);
return NULL;
@@ -807,12 +811,13 @@ static inline void free_alien_cache(struct array_cache **ac_ptr)
return;
for_each_node(i)
- kfree(ac_ptr[i]);
+ kfree(ac_ptr[i]);
kfree(ac_ptr);
}
-static inline void __drain_alien_cache(kmem_cache_t *cachep, struct array_cache *ac, int node)
+static inline void __drain_alien_cache(kmem_cache_t *cachep,
+ struct array_cache *ac, int node)
{
struct kmem_list3 *rl3 = cachep->nodelists[node];
@@ -826,7 +831,7 @@ static inline void __drain_alien_cache(kmem_cache_t *cachep, struct array_cache
static void drain_alien_cache(kmem_cache_t *cachep, struct kmem_list3 *l3)
{
- int i=0;
+ int i = 0;
struct array_cache *ac;
unsigned long flags;
@@ -846,18 +851,17 @@ static void drain_alien_cache(kmem_cache_t *cachep, struct kmem_list3 *l3)
#endif
static int __devinit cpuup_callback(struct notifier_block *nfb,
- unsigned long action, void *hcpu)
+ unsigned long action, void *hcpu)
{
long cpu = (long)hcpu;
- kmem_cache_t* cachep;
+ kmem_cache_t *cachep;
struct kmem_list3 *l3 = NULL;
int node = cpu_to_node(cpu);
int memsize = sizeof(struct kmem_list3);
- struct array_cache *nc = NULL;
switch (action) {
case CPU_UP_PREPARE:
- down(&cache_chain_sem);
+ mutex_lock(&cache_chain_mutex);
/* we need to do this right in the beginning since
* alloc_arraycache's are going to use this list.
* kmalloc_node allows us to add the slab to the right
@@ -871,27 +875,29 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
*/
if (!cachep->nodelists[node]) {
if (!(l3 = kmalloc_node(memsize,
- GFP_KERNEL, node)))
+ GFP_KERNEL, node)))
goto bad;
kmem_list3_init(l3);
l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
- ((unsigned long)cachep)%REAPTIMEOUT_LIST3;
+ ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
cachep->nodelists[node] = l3;
}
spin_lock_irq(&cachep->nodelists[node]->list_lock);
cachep->nodelists[node]->free_limit =
- (1 + nr_cpus_node(node)) *
- cachep->batchcount + cachep->num;
+ (1 + nr_cpus_node(node)) *
+ cachep->batchcount + cachep->num;
spin_unlock_irq(&cachep->nodelists[node]->list_lock);
}
/* Now we can go ahead with allocating the shared array's
- & array cache's */
+ & array cache's */
list_for_each_entry(cachep, &cache_chain, next) {
+ struct array_cache *nc;
+
nc = alloc_arraycache(node, cachep->limit,
- cachep->batchcount);
+ cachep->batchcount);
if (!nc)
goto bad;
cachep->array[cpu] = nc;
@@ -900,16 +906,17 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
BUG_ON(!l3);
if (!l3->shared) {
if (!(nc = alloc_arraycache(node,
- cachep->shared*cachep->batchcount,
- 0xbaadf00d)))
- goto bad;
+ cachep->shared *
+ cachep->batchcount,
+ 0xbaadf00d)))
+ goto bad;
/* we are serialised from CPU_DEAD or
- CPU_UP_CANCELLED by the cpucontrol lock */
+ CPU_UP_CANCELLED by the cpucontrol lock */
l3->shared = nc;
}
}
- up(&cache_chain_sem);
+ mutex_unlock(&cache_chain_mutex);
break;
case CPU_ONLINE:
start_cpu_timer(cpu);
@@ -918,7 +925,7 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
case CPU_DEAD:
/* fall thru */
case CPU_UP_CANCELED:
- down(&cache_chain_sem);
+ mutex_lock(&cache_chain_mutex);
list_for_each_entry(cachep, &cache_chain, next) {
struct array_cache *nc;
@@ -942,13 +949,13 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
free_block(cachep, nc->entry, nc->avail, node);
if (!cpus_empty(mask)) {
- spin_unlock(&l3->list_lock);
- goto unlock_cache;
- }
+ spin_unlock(&l3->list_lock);
+ goto unlock_cache;
+ }
if (l3->shared) {
free_block(cachep, l3->shared->entry,
- l3->shared->avail, node);
+ l3->shared->avail, node);
kfree(l3->shared);
l3->shared = NULL;
}
@@ -966,17 +973,17 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
} else {
spin_unlock(&l3->list_lock);
}
-unlock_cache:
+ unlock_cache:
spin_unlock_irq(&cachep->spinlock);
kfree(nc);
}
- up(&cache_chain_sem);
+ mutex_unlock(&cache_chain_mutex);
break;
#endif
}
return NOTIFY_OK;
-bad:
- up(&cache_chain_sem);
+ bad:
+ mutex_unlock(&cache_chain_mutex);
return NOTIFY_BAD;
}
@@ -985,8 +992,7 @@ static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 };
/*
* swap the static kmem_list3 with kmalloced memory
*/
-static void init_list(kmem_cache_t *cachep, struct kmem_list3 *list,
- int nodeid)
+static void init_list(kmem_cache_t *cachep, struct kmem_list3 *list, int nodeid)
{
struct kmem_list3 *ptr;
@@ -1045,7 +1051,6 @@ void __init kmem_cache_init(void)
*/
/* 1) create the cache_cache */
- init_MUTEX(&cache_chain_sem);
INIT_LIST_HEAD(&cache_chain);
list_add(&cache_cache.next, &cache_chain);
cache_cache.colour_off = cache_line_size();
@@ -1055,14 +1060,14 @@ void __init kmem_cache_init(void)
cache_cache.objsize = ALIGN(cache_cache.objsize, cache_line_size());
cache_estimate(0, cache_cache.objsize, cache_line_size(), 0,
- &left_over, &cache_cache.num);
+ &left_over, &cache_cache.num);
if (!cache_cache.num)
BUG();
- cache_cache.colour = left_over/cache_cache.colour_off;
+ cache_cache.colour = left_over / cache_cache.colour_off;
cache_cache.colour_next = 0;
- cache_cache.slab_size = ALIGN(cache_cache.num*sizeof(kmem_bufctl_t) +
- sizeof(struct slab), cache_line_size());
+ cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) +
+ sizeof(struct slab), cache_line_size());
/* 2+3) create the kmalloc caches */
sizes = malloc_sizes;
@@ -1074,14 +1079,18 @@ void __init kmem_cache_init(void)
*/
sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name,
- sizes[INDEX_AC].cs_size, ARCH_KMALLOC_MINALIGN,
- (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL);
+ sizes[INDEX_AC].cs_size,
+ ARCH_KMALLOC_MINALIGN,
+ (ARCH_KMALLOC_FLAGS |
+ SLAB_PANIC), NULL, NULL);
if (INDEX_AC != INDEX_L3)
sizes[INDEX_L3].cs_cachep =
- kmem_cache_create(names[INDEX_L3].name,
- sizes[INDEX_L3].cs_size, ARCH_KMALLOC_MINALIGN,
- (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL);
+ kmem_cache_create(names[INDEX_L3].name,
+ sizes[INDEX_L3].cs_size,
+ ARCH_KMALLOC_MINALIGN,
+ (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL,
+ NULL);
while (sizes->cs_size != ULONG_MAX) {
/*
@@ -1091,35 +1100,41 @@ void __init kmem_cache_init(void)
* Note for systems short on memory removing the alignment will
* allow tighter packing of the smaller caches.
*/
- if(!sizes->cs_cachep)
+ if (!sizes->cs_cachep)
sizes->cs_cachep = kmem_cache_create(names->name,
- sizes->cs_size, ARCH_KMALLOC_MINALIGN,
- (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL);
+ sizes->cs_size,
+ ARCH_KMALLOC_MINALIGN,
+ (ARCH_KMALLOC_FLAGS
+ | SLAB_PANIC),
+ NULL, NULL);
/* Inc off-slab bufctl limit until the ceiling is hit. */
if (!(OFF_SLAB(sizes->cs_cachep))) {
- offslab_limit = sizes->cs_size-sizeof(struct slab);
+ offslab_limit = sizes->cs_size - sizeof(struct slab);
offslab_limit /= sizeof(kmem_bufctl_t);
}
sizes->cs_dmacachep = kmem_cache_create(names->name_dma,
- sizes->cs_size, ARCH_KMALLOC_MINALIGN,
- (ARCH_KMALLOC_FLAGS | SLAB_CACHE_DMA | SLAB_PANIC),
- NULL, NULL);
+ sizes->cs_size,
+ ARCH_KMALLOC_MINALIGN,
+ (ARCH_KMALLOC_FLAGS |
+ SLAB_CACHE_DMA |
+ SLAB_PANIC), NULL,
+ NULL);
sizes++;
names++;
}
/* 4) Replace the bootstrap head arrays */
{
- void * ptr;
+ void *ptr;
ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
local_irq_disable();
BUG_ON(ac_data(&cache_cache) != &initarray_cache.cache);
memcpy(ptr, ac_data(&cache_cache),
- sizeof(struct arraycache_init));
+ sizeof(struct arraycache_init));
cache_cache.array[smp_processor_id()] = ptr;
local_irq_enable();
@@ -1127,11 +1142,11 @@ void __init kmem_cache_init(void)
local_irq_disable();
BUG_ON(ac_data(malloc_sizes[INDEX_AC].cs_cachep)
- != &initarray_generic.cache);
+ != &initarray_generic.cache);
memcpy(ptr, ac_data(malloc_sizes[INDEX_AC].cs_cachep),
- sizeof(struct arraycache_init));
+ sizeof(struct arraycache_init));
malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] =
- ptr;
+ ptr;
local_irq_enable();
}
/* 5) Replace the bootstrap kmem_list3's */
@@ -1139,16 +1154,16 @@ void __init kmem_cache_init(void)
int node;
/* Replace the static kmem_list3 structures for the boot cpu */
init_list(&cache_cache, &initkmem_list3[CACHE_CACHE],
- numa_node_id());
+ numa_node_id());
for_each_online_node(node) {
init_list(malloc_sizes[INDEX_AC].cs_cachep,
- &initkmem_list3[SIZE_AC+node], node);
+ &initkmem_list3[SIZE_AC + node], node);
if (INDEX_AC != INDEX_L3) {
init_list(malloc_sizes[INDEX_L3].cs_cachep,
- &initkmem_list3[SIZE_L3+node],
- node);
+ &initkmem_list3[SIZE_L3 + node],
+ node);
}
}
}
@@ -1156,10 +1171,10 @@ void __init kmem_cache_init(void)
/* 6) resize the head arrays to their final sizes */
{
kmem_cache_t *cachep;
- down(&cache_chain_sem);
+ mutex_lock(&cache_chain_mutex);
list_for_each_entry(cachep, &cache_chain, next)
- enable_cpucache(cachep);
- up(&cache_chain_sem);
+ enable_cpucache(cachep);
+ mutex_unlock(&cache_chain_mutex);
}
/* Done! */
@@ -1184,7 +1199,7 @@ static int __init cpucache_init(void)
* pages to gfp.
*/
for_each_online_cpu(cpu)
- start_cpu_timer(cpu);
+ start_cpu_timer(cpu);
return 0;
}
@@ -1226,7 +1241,7 @@ static void *kmem_getpages(kmem_cache_t *cachep, gfp_t flags, int nodeid)
*/
static void kmem_freepages(kmem_cache_t *cachep, void *addr)
{
- unsigned long i = (1<<cachep->gfporder);
+ unsigned long i = (1 << cachep->gfporder);
struct page *page = virt_to_page(addr);
const unsigned long nr_freed = i;
@@ -1239,13 +1254,13 @@ static void kmem_freepages(kmem_cache_t *cachep, void *addr)
if (current->reclaim_state)
current->reclaim_state->reclaimed_slab += nr_freed;
free_pages((unsigned long)addr, cachep->gfporder);
- if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
- atomic_sub(1<<cachep->gfporder, &slab_reclaim_pages);
+ if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
+ atomic_sub(1 << cachep->gfporder, &slab_reclaim_pages);
}
static void kmem_rcu_free(struct rcu_head *head)
{
- struct slab_rcu *slab_rcu = (struct slab_rcu *) head;
+ struct slab_rcu *slab_rcu = (struct slab_rcu *)head;
kmem_cache_t *cachep = slab_rcu->cachep;
kmem_freepages(cachep, slab_rcu->addr);
@@ -1257,19 +1272,19 @@ static void kmem_rcu_free(struct rcu_head *head)
#ifdef CONFIG_DEBUG_PAGEALLOC
static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr,
- unsigned long caller)
+ unsigned long caller)
{
int size = obj_reallen(cachep);
- addr = (unsigned long *)&((char*)addr)[obj_dbghead(cachep)];
+ addr = (unsigned long *)&((char *)addr)[obj_dbghead(cachep)];
- if (size < 5*sizeof(unsigned long))
+ if (size < 5 * sizeof(unsigned long))
return;
- *addr++=0x12345678;
- *addr++=caller;
- *addr++=smp_processor_id();
- size -= 3*sizeof(unsigned long);
+ *addr++ = 0x12345678;
+ *addr++ = caller;
+ *addr++ = smp_processor_id();
+ size -= 3 * sizeof(unsigned long);
{
unsigned long *sptr = &caller;
unsigned long svalue;
@@ -1277,7 +1292,7 @@ static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr,
while (!kstack_end(sptr)) {
svalue = *sptr++;
if (kernel_text_address(svalue)) {
- *addr++=svalue;
+ *addr++ = svalue;
size -= sizeof(unsigned long);
if (size <= sizeof(unsigned long))
break;
@@ -1285,25 +1300,25 @@ static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr,
}
}
- *addr++=0x87654321;
+ *addr++ = 0x87654321;
}
#endif
static void poison_obj(kmem_cache_t *cachep, void *addr, unsigned char val)
{
int size = obj_reallen(cachep);
- addr = &((char*)addr)[obj_dbghead(cachep)];
+ addr = &((char *)addr)[obj_dbghead(cachep)];
memset(addr, val, size);
- *(unsigned char *)(addr+size-1) = POISON_END;
+ *(unsigned char *)(addr + size - 1) = POISON_END;
}
static void dump_line(char *data, int offset, int limit)
{
int i;
printk(KERN_ERR "%03x:", offset);
- for (i=0;i<limit;i++) {
- printk(" %02x", (unsigned char)data[offset+i]);
+ for (i = 0; i < limit; i++) {
+ printk(" %02x", (unsigned char)data[offset + i]);
}
printk("\n");
}
@@ -1318,24 +1333,24 @@ static void print_objinfo(kmem_cache_t *cachep, void *objp, int lines)
if (cachep->flags & SLAB_RED_ZONE) {
printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n",
- *dbg_redzone1(cachep, objp),
- *dbg_redzone2(cachep, objp));
+ *dbg_redzone1(cachep, objp),
+ *dbg_redzone2(cachep, objp));
}
if (cachep->flags & SLAB_STORE_USER) {
printk(KERN_ERR "Last user: [<%p>]",
- *dbg_userword(cachep, objp));
+ *dbg_userword(cachep, objp));
print_symbol("(%s)",
- (unsigned long)*dbg_userword(cachep, objp));
+ (unsigned long)*dbg_userword(cachep, objp));
printk("\n");
}
- realobj = (char*)objp+obj_dbghead(cachep);
+ realobj = (char *)objp + obj_dbghead(cachep);
size = obj_reallen(cachep);
- for (i=0; i<size && lines;i+=16, lines--) {
+ for (i = 0; i < size && lines; i += 16, lines--) {
int limit;
limit = 16;
- if (i+limit > size)
- limit = size-i;
+ if (i + limit > size)
+ limit = size - i;
dump_line(realobj, i, limit);
}
}
@@ -1346,27 +1361,28 @@ static void check_poison_obj(kmem_cache_t *cachep, void *objp)
int size, i;
int lines = 0;
- realobj = (char*)objp+obj_dbghead(cachep);
+ realobj = (char *)objp + obj_dbghead(cachep);
size = obj_reallen(cachep);
- for (i=0;i<size;i++) {
+ for (i = 0; i < size; i++) {
char exp = POISON_FREE;
- if (i == size-1)
+ if (i == size - 1)
exp = POISON_END;
if (realobj[i] != exp) {
int limit;
/* Mismatch ! */
/* Print header */
if (lines == 0) {
- printk(KERN_ERR "Slab corruption: start=%p, len=%d\n",
- realobj, size);
+ printk(KERN_ERR
+ "Slab corruption: start=%p, len=%d\n",
+ realobj, size);
print_objinfo(cachep, objp, 0);
}
/* Hexdump the affected line */
- i = (i/16)*16;
+ i = (i / 16) * 16;
limit = 16;
- if (i+limit > size)
- limit = size-i;
+ if (i + limit > size)
+ limit = size - i;
dump_line(realobj, i, limit);
i += 16;
lines++;
@@ -1382,19 +1398,19 @@ static void check_poison_obj(kmem_cache_t *cachep, void *objp)
struct slab *slabp = page_get_slab(virt_to_page(objp));
int objnr;
- objnr = (objp-slabp->s_mem)/cachep->objsize;
+ objnr = (objp - slabp->s_mem) / cachep->objsize;
if (objnr) {
- objp = slabp->s_mem+(objnr-1)*cachep->objsize;
- realobj = (char*)objp+obj_dbghead(cachep);
+ objp = slabp->s_mem + (objnr - 1) * cachep->objsize;
+ realobj = (char *)objp + obj_dbghead(cachep);
printk(KERN_ERR "Prev obj: start=%p, len=%d\n",
- realobj, size);
+ realobj, size);
print_objinfo(cachep, objp, 2);
}
- if (objnr+1 < cachep->num) {
- objp = slabp->s_mem+(objnr+1)*cachep->objsize;
- realobj = (char*)objp+obj_dbghead(cachep);
+ if (objnr + 1 < cachep->num) {
+ objp = slabp->s_mem + (objnr + 1) * cachep->objsize;
+ realobj = (char *)objp + obj_dbghead(cachep);
printk(KERN_ERR "Next obj: start=%p, len=%d\n",
- realobj, size);
+ realobj, size);
print_objinfo(cachep, objp, 2);
}
}
@@ -1405,7 +1421,7 @@ static void check_poison_obj(kmem_cache_t *cachep, void *objp)
* Before calling the slab must have been unlinked from the cache.
* The cache-lock is not held/needed.
*/
-static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp)
+static void slab_destroy(kmem_cache_t *cachep, struct slab *slabp)
{
void *addr = slabp->s_mem - slabp->colouroff;
@@ -1416,8 +1432,11 @@ static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp)
if (cachep->flags & SLAB_POISON) {
#ifdef CONFIG_DEBUG_PAGEALLOC
- if ((cachep->objsize%PAGE_SIZE)==0 && OFF_SLAB(cachep))
- kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE,1);
+ if ((cachep->objsize % PAGE_SIZE) == 0
+ && OFF_SLAB(cachep))
+ kernel_map_pages(virt_to_page(objp),
+ cachep->objsize / PAGE_SIZE,
+ 1);
else
check_poison_obj(cachep, objp);
#else
@@ -1427,20 +1446,20 @@ static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp)
if (cachep->flags & SLAB_RED_ZONE) {
if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
slab_error(cachep, "start of a freed object "
- "was overwritten");
+ "was overwritten");
if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
slab_error(cachep, "end of a freed object "
- "was overwritten");
+ "was overwritten");
}
if (cachep->dtor && !(cachep->flags & SLAB_POISON))
- (cachep->dtor)(objp+obj_dbghead(cachep), cachep, 0);
+ (cachep->dtor) (objp + obj_dbghead(cachep), cachep, 0);
}
#else
if (cachep->dtor) {
int i;
for (i = 0; i < cachep->num; i++) {
- void* objp = slabp->s_mem+cachep->objsize*i;
- (cachep->dtor)(objp, cachep, 0);
+ void *objp = slabp->s_mem + cachep->objsize * i;
+ (cachep->dtor) (objp, cachep, 0);
}
}
#endif
@@ -1448,7 +1467,7 @@ static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp)
if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) {
struct slab_rcu *slab_rcu;
- slab_rcu = (struct slab_rcu *) slabp;
+ slab_rcu = (struct slab_rcu *)slabp;
slab_rcu->cachep = cachep;
slab_rcu->addr = addr;
call_rcu(&slab_rcu->head, kmem_rcu_free);
@@ -1466,11 +1485,58 @@ static inline void set_up_list3s(kmem_cache_t *cachep, int index)
int node;
for_each_online_node(node) {
- cachep->nodelists[node] = &initkmem_list3[index+node];
+ cachep->nodelists[node] = &initkmem_list3[index + node];
cachep->nodelists[node]->next_reap = jiffies +
- REAPTIMEOUT_LIST3 +
- ((unsigned long)cachep)%REAPTIMEOUT_LIST3;
+ REAPTIMEOUT_LIST3 +
+ ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
+ }
+}
+
+/**
+ * calculate_slab_order - calculate size (page order) of slabs and the number
+ * of objects per slab.
+ *
+ * This could be made much more intelligent. For now, try to avoid using
+ * high order pages for slabs. When the gfp() functions are more friendly
+ * towards high-order requests, this should be changed.
+ */
+static inline size_t calculate_slab_order(kmem_cache_t *cachep, size_t size,
+ size_t align, gfp_t flags)
+{
+ size_t left_over = 0;
+
+ for (;; cachep->gfporder++) {
+ unsigned int num;
+ size_t remainder;
+
+ if (cachep->gfporder > MAX_GFP_ORDER) {
+ cachep->num = 0;
+ break;
+ }
+
+ cache_estimate(cachep->gfporder, size, align, flags,
+ &remainder, &num);
+ if (!num)
+ continue;
+ /* More than offslab_limit objects will cause problems */
+ if (flags & CFLGS_OFF_SLAB && cachep->num > offslab_limit)
+ break;
+
+ cachep->num = num;
+ left_over = remainder;
+
+ /*
+ * Large number of objects is good, but very large slabs are
+ * currently bad for the gfp()s.
+ */
+ if (cachep->gfporder >= slab_break_gfp_order)
+ break;
+
+ if ((left_over * 8) <= (PAGE_SIZE << cachep->gfporder))
+ /* Acceptable internal fragmentation */
+ break;
}
+ return left_over;
}
/**
@@ -1519,16 +1585,15 @@ kmem_cache_create (const char *name, size_t size, size_t align,
* Sanity checks... these are all serious usage bugs.
*/
if ((!name) ||
- in_interrupt() ||
- (size < BYTES_PER_WORD) ||
- (size > (1<<MAX_OBJ_ORDER)*PAGE_SIZE) ||
- (dtor && !ctor)) {
- printk(KERN_ERR "%s: Early error in slab %s\n",
- __FUNCTION__, name);
- BUG();
- }
+ in_interrupt() ||
+ (size < BYTES_PER_WORD) ||
+ (size > (1 << MAX_OBJ_ORDER) * PAGE_SIZE) || (dtor && !ctor)) {
+ printk(KERN_ERR "%s: Early error in slab %s\n",
+ __FUNCTION__, name);
+ BUG();
+ }
- down(&cache_chain_sem);
+ mutex_lock(&cache_chain_mutex);
list_for_each(p, &cache_chain) {
kmem_cache_t *pc = list_entry(p, kmem_cache_t, next);
@@ -1546,11 +1611,11 @@ kmem_cache_create (const char *name, size_t size, size_t align,
set_fs(old_fs);
if (res) {
printk("SLAB: cache with size %d has lost its name\n",
- pc->objsize);
+ pc->objsize);
continue;
}
- if (!strcmp(pc->name,name)) {
+ if (!strcmp(pc->name, name)) {
printk("kmem_cache_create: duplicate cache %s\n", name);
dump_stack();
goto oops;
@@ -1562,10 +1627,9 @@ kmem_cache_create (const char *name, size_t size, size_t align,
if ((flags & SLAB_DEBUG_INITIAL) && !ctor) {
/* No constructor, but inital state check requested */
printk(KERN_ERR "%s: No con, but init state check "
- "requested - %s\n", __FUNCTION__, name);
+ "requested - %s\n", __FUNCTION__, name);
flags &= ~SLAB_DEBUG_INITIAL;
}
-
#if FORCED_DEBUG
/*
* Enable redzoning and last user accounting, except for caches with
@@ -1573,8 +1637,9 @@ kmem_cache_create (const char *name, size_t size, size_t align,
* above the next power of two: caches with object sizes just above a
* power of two have a significant amount of internal fragmentation.
*/
- if ((size < 4096 || fls(size-1) == fls(size-1+3*BYTES_PER_WORD)))
- flags |= SLAB_RED_ZONE|SLAB_STORE_USER;
+ if ((size < 4096
+ || fls(size - 1) == fls(size - 1 + 3 * BYTES_PER_WORD)))
+ flags |= SLAB_RED_ZONE | SLAB_STORE_USER;
if (!(flags & SLAB_DESTROY_BY_RCU))
flags |= SLAB_POISON;
#endif
@@ -1595,9 +1660,9 @@ kmem_cache_create (const char *name, size_t size, size_t align,
* unaligned accesses for some archs when redzoning is used, and makes
* sure any on-slab bufctl's are also correctly aligned.
*/
- if (size & (BYTES_PER_WORD-1)) {
- size += (BYTES_PER_WORD-1);
- size &= ~(BYTES_PER_WORD-1);
+ if (size & (BYTES_PER_WORD - 1)) {
+ size += (BYTES_PER_WORD - 1);
+ size &= ~(BYTES_PER_WORD - 1);
}
/* calculate out the final buffer alignment: */
@@ -1608,7 +1673,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
* objects into one cacheline.
*/
ralign = cache_line_size();
- while (size <= ralign/2)
+ while (size <= ralign / 2)
ralign /= 2;
} else {
ralign = BYTES_PER_WORD;
@@ -1617,13 +1682,13 @@ kmem_cache_create (const char *name, size_t size, size_t align,
if (ralign < ARCH_SLAB_MINALIGN) {
ralign = ARCH_SLAB_MINALIGN;
if (ralign > BYTES_PER_WORD)
- flags &= ~(SLAB_RED_ZONE|SLAB_STORE_USER);
+ flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
}
/* 3) caller mandated alignment: disables debug if necessary */
if (ralign < align) {
ralign = align;
if (ralign > BYTES_PER_WORD)
- flags &= ~(SLAB_RED_ZONE|SLAB_STORE_USER);
+ flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
}
/* 4) Store it. Note that the debug code below can reduce
* the alignment to BYTES_PER_WORD.
@@ -1645,7 +1710,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
/* add space for red zone words */
cachep->dbghead += BYTES_PER_WORD;
- size += 2*BYTES_PER_WORD;
+ size += 2 * BYTES_PER_WORD;
}
if (flags & SLAB_STORE_USER) {
/* user store requires word alignment and
@@ -1656,7 +1721,8 @@ kmem_cache_create (const char *name, size_t size, size_t align,
size += BYTES_PER_WORD;
}
#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
- if (size >= malloc_sizes[INDEX_L3+1].cs_size && cachep->reallen > cache_line_size() && size < PAGE_SIZE) {
+ if (size >= malloc_sizes[INDEX_L3 + 1].cs_size
+ && cachep->reallen > cache_line_size() && size < PAGE_SIZE) {
cachep->dbghead += PAGE_SIZE - size;
size = PAGE_SIZE;
}
@@ -1664,7 +1730,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
#endif
/* Determine if the slab management is 'on' or 'off' slab. */
- if (size >= (PAGE_SIZE>>3))
+ if (size >= (PAGE_SIZE >> 3))
/*
* Size is large, assume best to place the slab management obj
* off-slab (should allow better packing of objs).
@@ -1681,47 +1747,9 @@ kmem_cache_create (const char *name, size_t size, size_t align,
*/
cachep->gfporder = 0;
cache_estimate(cachep->gfporder, size, align, flags,
- &left_over, &cachep->num);
- } else {
- /*
- * Calculate size (in pages) of slabs, and the num of objs per
- * slab. This could be made much more intelligent. For now,
- * try to avoid using high page-orders for slabs. When the
- * gfp() funcs are more friendly towards high-order requests,
- * this should be changed.
- */
- do {
- unsigned int break_flag = 0;
-cal_wastage:
- cache_estimate(cachep->gfporder, size, align, flags,
- &left_over, &cachep->num);
- if (break_flag)
- break;
- if (cachep->gfporder >= MAX_GFP_ORDER)
- break;
- if (!cachep->num)
- goto next;
- if (flags & CFLGS_OFF_SLAB &&
- cachep->num > offslab_limit) {
- /* This num of objs will cause problems. */
- cachep->gfporder--;
- break_flag++;
- goto cal_wastage;
- }
-
- /*
- * Large num of objs is good, but v. large slabs are
- * currently bad for the gfp()s.
- */
- if (cachep->gfporder >= slab_break_gfp_order)
- break;
-
- if ((left_over*8) <= (PAGE_SIZE<<cachep->gfporder))
- break; /* Acceptable internal fragmentation. */
-next:
- cachep->gfporder++;
- } while (1);
- }
+ &left_over, &cachep->num);
+ } else
+ left_over = calculate_slab_order(cachep, size, align, flags);
if (!cachep->num) {
printk("kmem_cache_create: couldn't create cache %s.\n", name);
@@ -1729,8 +1757,8 @@ next:
cachep = NULL;
goto oops;
}
- slab_size = ALIGN(cachep->num*sizeof(kmem_bufctl_t)
- + sizeof(struct slab), align);
+ slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
+ + sizeof(struct slab), align);
/*
* If the slab has been placed off-slab, and we have enough space then
@@ -1743,14 +1771,15 @@ next:
if (flags & CFLGS_OFF_SLAB) {
/* really off slab. No need for manual alignment */
- slab_size = cachep->num*sizeof(kmem_bufctl_t)+sizeof(struct slab);
+ slab_size =
+ cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab);
}
cachep->colour_off = cache_line_size();
/* Offset must be a multiple of the alignment. */
if (cachep->colour_off < align)
cachep->colour_off = align;
- cachep->colour = left_over/cachep->colour_off;
+ cachep->colour = left_over / cachep->colour_off;
cachep->slab_size = slab_size;
cachep->flags = flags;
cachep->gfpflags = 0;
@@ -1777,7 +1806,7 @@ next:
* the creation of further caches will BUG().
*/
cachep->array[smp_processor_id()] =
- &initarray_generic.cache;
+ &initarray_generic.cache;
/* If the cache that's used by
* kmalloc(sizeof(kmem_list3)) is the first cache,
@@ -1791,8 +1820,7 @@ next:
g_cpucache_up = PARTIAL_AC;
} else {
cachep->array[smp_processor_id()] =
- kmalloc(sizeof(struct arraycache_init),
- GFP_KERNEL);
+ kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
if (g_cpucache_up == PARTIAL_AC) {
set_up_list3s(cachep, SIZE_L3);
@@ -1802,16 +1830,18 @@ next:
for_each_online_node(node) {
cachep->nodelists[node] =
- kmalloc_node(sizeof(struct kmem_list3),
- GFP_KERNEL, node);
+ kmalloc_node(sizeof
+ (struct kmem_list3),
+ GFP_KERNEL, node);
BUG_ON(!cachep->nodelists[node]);
- kmem_list3_init(cachep->nodelists[node]);
+ kmem_list3_init(cachep->
+ nodelists[node]);
}
}
}
cachep->nodelists[numa_node_id()]->next_reap =
- jiffies + REAPTIMEOUT_LIST3 +
- ((unsigned long)cachep)%REAPTIMEOUT_LIST3;
+ jiffies + REAPTIMEOUT_LIST3 +
+ ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
BUG_ON(!ac_data(cachep));
ac_data(cachep)->avail = 0;
@@ -1820,16 +1850,16 @@ next:
ac_data(cachep)->touched = 0;
cachep->batchcount = 1;
cachep->limit = BOOT_CPUCACHE_ENTRIES;
- }
+ }
/* cache setup completed, link it into the list */
list_add(&cachep->next, &cache_chain);
unlock_cpu_hotplug();
-oops:
+ oops:
if (!cachep && (flags & SLAB_PANIC))
panic("kmem_cache_create(): failed to create slab `%s'\n",
- name);
- up(&cache_chain_sem);
+ name);
+ mutex_unlock(&cache_chain_mutex);
return cachep;
}
EXPORT_SYMBOL(kmem_cache_create);
@@ -1871,7 +1901,7 @@ static inline void check_spinlock_acquired_node(kmem_cache_t *cachep, int node)
/*
* Waits for all CPUs to execute func().
*/
-static void smp_call_function_all_cpus(void (*func) (void *arg), void *arg)
+static void smp_call_function_all_cpus(void (*func)(void *arg), void *arg)
{
check_irq_on();
preempt_disable();
@@ -1886,12 +1916,12 @@ static void smp_call_function_all_cpus(void (*func) (void *arg), void *arg)
preempt_enable();
}
-static void drain_array_locked(kmem_cache_t* cachep,
- struct array_cache *ac, int force, int node);
+static void drain_array_locked(kmem_cache_t *cachep, struct array_cache *ac,
+ int force, int node);
static void do_drain(void *arg)
{
- kmem_cache_t *cachep = (kmem_cache_t*)arg;
+ kmem_cache_t *cachep = (kmem_cache_t *) arg;
struct array_cache *ac;
int node = numa_node_id();
@@ -1911,7 +1941,7 @@ static void drain_cpu_caches(kmem_cache_t *cachep)
smp_call_function_all_cpus(do_drain, cachep);
check_irq_on();
spin_lock_irq(&cachep->spinlock);
- for_each_online_node(node) {
+ for_each_online_node(node) {
l3 = cachep->nodelists[node];
if (l3) {
spin_lock(&l3->list_lock);
@@ -1949,8 +1979,7 @@ static int __node_shrink(kmem_cache_t *cachep, int node)
slab_destroy(cachep, slabp);
spin_lock_irq(&l3->list_lock);
}
- ret = !list_empty(&l3->slabs_full) ||
- !list_empty(&l3->slabs_partial);
+ ret = !list_empty(&l3->slabs_full) || !list_empty(&l3->slabs_partial);
return ret;
}
@@ -2006,7 +2035,7 @@ EXPORT_SYMBOL(kmem_cache_shrink);
* The caller must guarantee that noone will allocate memory from the cache
* during the kmem_cache_destroy().
*/
-int kmem_cache_destroy(kmem_cache_t * cachep)
+int kmem_cache_destroy(kmem_cache_t *cachep)
{
int i;
struct kmem_list3 *l3;
@@ -2018,18 +2047,18 @@ int kmem_cache_destroy(kmem_cache_t * cachep)
lock_cpu_hotplug();
/* Find the cache in the chain of caches. */
- down(&cache_chain_sem);
+ mutex_lock(&cache_chain_mutex);
/*
* the chain is never empty, cache_cache is never destroyed
*/
list_del(&cachep->next);
- up(&cache_chain_sem);
+ mutex_unlock(&cache_chain_mutex);
if (__cache_shrink(cachep)) {
slab_error(cachep, "Can't free all objects");
- down(&cache_chain_sem);
- list_add(&cachep->next,&cache_chain);
- up(&cache_chain_sem);
+ mutex_lock(&cache_chain_mutex);
+ list_add(&cachep->next, &cache_chain);
+ mutex_unlock(&cache_chain_mutex);
unlock_cpu_hotplug();
return 1;
}
@@ -2038,7 +2067,7 @@ int kmem_cache_destroy(kmem_cache_t * cachep)
synchronize_rcu();
for_each_online_cpu(i)
- kfree(cachep->array[i]);
+ kfree(cachep->array[i]);
/* NUMA: free the list3 structures */
for_each_online_node(i) {
@@ -2057,39 +2086,39 @@ int kmem_cache_destroy(kmem_cache_t * cachep)
EXPORT_SYMBOL(kmem_cache_destroy);
/* Get the memory for a slab management obj. */
-static struct slab* alloc_slabmgmt(kmem_cache_t *cachep, void *objp,
- int colour_off, gfp_t local_flags)
+static struct slab *alloc_slabmgmt(kmem_cache_t *cachep, void *objp,
+ int colour_off, gfp_t local_flags)
{
struct slab *slabp;
-
+
if (OFF_SLAB(cachep)) {
/* Slab management obj is off-slab. */
slabp = kmem_cache_alloc(cachep->slabp_cache, local_flags);
if (!slabp)
return NULL;
} else {
- slabp = objp+colour_off;
+ slabp = objp + colour_off;
colour_off += cachep->slab_size;
}
slabp->inuse = 0;
slabp->colouroff = colour_off;
- slabp->s_mem = objp+colour_off;
+ slabp->s_mem = objp + colour_off;
return slabp;
}
static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp)
{
- return (kmem_bufctl_t *)(slabp+1);
+ return (kmem_bufctl_t *) (slabp + 1);
}
static void cache_init_objs(kmem_cache_t *cachep,
- struct slab *slabp, unsigned long ctor_flags)
+ struct slab *slabp, unsigned long ctor_flags)
{
int i;
for (i = 0; i < cachep->num; i++) {
- void *objp = slabp->s_mem+cachep->objsize*i;
+ void *objp = slabp->s_mem + cachep->objsize * i;
#if DEBUG
/* need to poison the objs? */
if (cachep->flags & SLAB_POISON)
@@ -2107,25 +2136,28 @@ static void cache_init_objs(kmem_cache_t *cachep,
* Otherwise, deadlock. They must also be threaded.
*/
if (cachep->ctor && !(cachep->flags & SLAB_POISON))
- cachep->ctor(objp+obj_dbghead(cachep), cachep, ctor_flags);
+ cachep->ctor(objp + obj_dbghead(cachep), cachep,
+ ctor_flags);
if (cachep->flags & SLAB_RED_ZONE) {
if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
slab_error(cachep, "constructor overwrote the"
- " end of an object");
+ " end of an object");
if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
slab_error(cachep, "constructor overwrote the"
- " start of an object");
+ " start of an object");
}
- if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep) && cachep->flags & SLAB_POISON)
- kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 0);
+ if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)
+ && cachep->flags & SLAB_POISON)
+ kernel_map_pages(virt_to_page(objp),
+ cachep->objsize / PAGE_SIZE, 0);
#else
if (cachep->ctor)
cachep->ctor(objp, cachep, ctor_flags);
#endif
- slab_bufctl(slabp)[i] = i+1;
+ slab_bufctl(slabp)[i] = i + 1;
}
- slab_bufctl(slabp)[i-1] = BUFCTL_END;
+ slab_bufctl(slabp)[i - 1] = BUFCTL_END;
slabp->free = 0;
}
@@ -2161,17 +2193,17 @@ static void set_slab_attr(kmem_cache_t *cachep, struct slab *slabp, void *objp)
*/
static int cache_grow(kmem_cache_t *cachep, gfp_t flags, int nodeid)
{
- struct slab *slabp;
- void *objp;
- size_t offset;
- gfp_t local_flags;
- unsigned long ctor_flags;
+ struct slab *slabp;
+ void *objp;
+ size_t offset;
+ gfp_t local_flags;
+ unsigned long ctor_flags;
struct kmem_list3 *l3;
/* Be lazy and only check for valid flags here,
- * keeping it out of the critical path in kmem_cache_alloc().
+ * keeping it out of the critical path in kmem_cache_alloc().
*/
- if (flags & ~(SLAB_DMA|SLAB_LEVEL_MASK|SLAB_NO_GROW))
+ if (flags & ~(SLAB_DMA | SLAB_LEVEL_MASK | SLAB_NO_GROW))
BUG();
if (flags & SLAB_NO_GROW)
return 0;
@@ -2237,9 +2269,9 @@ static int cache_grow(kmem_cache_t *cachep, gfp_t flags, int nodeid)
l3->free_objects += cachep->num;
spin_unlock(&l3->list_lock);
return 1;
-opps1:
+ opps1:
kmem_freepages(cachep, objp);
-failed:
+ failed:
if (local_flags & __GFP_WAIT)
local_irq_disable();
return 0;
@@ -2259,18 +2291,19 @@ static void kfree_debugcheck(const void *objp)
if (!virt_addr_valid(objp)) {
printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n",
- (unsigned long)objp);
- BUG();
+ (unsigned long)objp);
+ BUG();
}
page = virt_to_page(objp);
if (!PageSlab(page)) {
- printk(KERN_ERR "kfree_debugcheck: bad ptr %lxh.\n", (unsigned long)objp);
+ printk(KERN_ERR "kfree_debugcheck: bad ptr %lxh.\n",
+ (unsigned long)objp);
BUG();
}
}
static void *cache_free_debugcheck(kmem_cache_t *cachep, void *objp,
- void *caller)
+ void *caller)
{
struct page *page;
unsigned int objnr;
@@ -2281,20 +2314,26 @@ static void *cache_free_debugcheck(kmem_cache_t *cachep, void *objp,
page = virt_to_page(objp);
if (page_get_cache(page) != cachep) {
- printk(KERN_ERR "mismatch in kmem_cache_free: expected cache %p, got %p\n",
- page_get_cache(page),cachep);
+ printk(KERN_ERR
+ "mismatch in kmem_cache_free: expected cache %p, got %p\n",
+ page_get_cache(page), cachep);
printk(KERN_ERR "%p is %s.\n", cachep, cachep->name);
- printk(KERN_ERR "%p is %s.\n", page_get_cache(page), page_get_cache(page)->name);
+ printk(KERN_ERR "%p is %s.\n", page_get_cache(page),
+ page_get_cache(page)->name);
WARN_ON(1);
}
slabp = page_get_slab(page);
if (cachep->flags & SLAB_RED_ZONE) {
- if (*dbg_redzone1(cachep, objp) != RED_ACTIVE || *dbg_redzone2(cachep, objp) != RED_ACTIVE) {
- slab_error(cachep, "double free, or memory outside"
- " object was overwritten");
- printk(KERN_ERR "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n",
- objp, *dbg_redzone1(cachep, objp), *dbg_redzone2(cachep, objp));
+ if (*dbg_redzone1(cachep, objp) != RED_ACTIVE
+ || *dbg_redzone2(cachep, objp) != RED_ACTIVE) {
+ slab_error(cachep,
+ "double free, or memory outside"
+ " object was overwritten");
+ printk(KERN_ERR
+ "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n",
+ objp, *dbg_redzone1(cachep, objp),
+ *dbg_redzone2(cachep, objp));
}
*dbg_redzone1(cachep, objp) = RED_INACTIVE;
*dbg_redzone2(cachep, objp) = RED_INACTIVE;
@@ -2302,30 +2341,31 @@ static void *cache_free_debugcheck(kmem_cache_t *cachep, void *objp,
if (cachep->flags & SLAB_STORE_USER)
*dbg_userword(cachep, objp) = caller;
- objnr = (objp-slabp->s_mem)/cachep->objsize;
+ objnr = (objp - slabp->s_mem) / cachep->objsize;
BUG_ON(objnr >= cachep->num);
- BUG_ON(objp != slabp->s_mem + objnr*cachep->objsize);
+ BUG_ON(objp != slabp->s_mem + objnr * cachep->objsize);
if (cachep->flags & SLAB_DEBUG_INITIAL) {
/* Need to call the slab's constructor so the
* caller can perform a verify of its state (debugging).
* Called without the cache-lock held.
*/
- cachep->ctor(objp+obj_dbghead(cachep),
- cachep, SLAB_CTOR_CONSTRUCTOR|SLAB_CTOR_VERIFY);
+ cachep->ctor(objp + obj_dbghead(cachep),
+ cachep, SLAB_CTOR_CONSTRUCTOR | SLAB_CTOR_VERIFY);
}
if (cachep->flags & SLAB_POISON && cachep->dtor) {
/* we want to cache poison the object,
* call the destruction callback
*/
- cachep->dtor(objp+obj_dbghead(cachep), cachep, 0);
+ cachep->dtor(objp + obj_dbghead(cachep), cachep, 0);
}
if (cachep->flags & SLAB_POISON) {
#ifdef CONFIG_DEBUG_PAGEALLOC
if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) {
store_stackinfo(cachep, objp, (unsigned long)caller);
- kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 0);
+ kernel_map_pages(virt_to_page(objp),
+ cachep->objsize / PAGE_SIZE, 0);
} else {
poison_obj(cachep, objp, POISON_FREE);
}
@@ -2340,7 +2380,7 @@ static void check_slabp(kmem_cache_t *cachep, struct slab *slabp)
{
kmem_bufctl_t i;
int entries = 0;
-
+
/* Check slab's freelist to see if this obj is there. */
for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) {
entries++;
@@ -2348,13 +2388,16 @@ static void check_slabp(kmem_cache_t *cachep, struct slab *slabp)
goto bad;
}
if (entries != cachep->num - slabp->inuse) {
-bad:
- printk(KERN_ERR "slab: Internal list corruption detected in cache '%s'(%d), slabp %p(%d). Hexdump:\n",
- cachep->name, cachep->num, slabp, slabp->inuse);
- for (i=0;i<sizeof(slabp)+cachep->num*sizeof(kmem_bufctl_t);i++) {
- if ((i%16)==0)
+ bad:
+ printk(KERN_ERR
+ "slab: Internal list corruption detected in cache '%s'(%d), slabp %p(%d). Hexdump:\n",
+ cachep->name, cachep->num, slabp, slabp->inuse);
+ for (i = 0;
+ i < sizeof(slabp) + cachep->num * sizeof(kmem_bufctl_t);
+ i++) {
+ if ((i % 16) == 0)
printk("\n%03x:", i);
- printk(" %02x", ((unsigned char*)slabp)[i]);
+ printk(" %02x", ((unsigned char *)slabp)[i]);
}
printk("\n");
BUG();
@@ -2374,7 +2417,7 @@ static void *cache_alloc_refill(kmem_cache_t *cachep, gfp_t flags)
check_irq_off();
ac = ac_data(cachep);
-retry:
+ retry:
batchcount = ac->batchcount;
if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
/* if there was little recent activity on this
@@ -2396,8 +2439,8 @@ retry:
shared_array->avail -= batchcount;
ac->avail = batchcount;
memcpy(ac->entry,
- &(shared_array->entry[shared_array->avail]),
- sizeof(void*)*batchcount);
+ &(shared_array->entry[shared_array->avail]),
+ sizeof(void *) * batchcount);
shared_array->touched = 1;
goto alloc_done;
}
@@ -2425,7 +2468,7 @@ retry:
/* get obj pointer */
ac->entry[ac->avail++] = slabp->s_mem +
- slabp->free*cachep->objsize;
+ slabp->free * cachep->objsize;
slabp->inuse++;
next = slab_bufctl(slabp)[slabp->free];
@@ -2433,7 +2476,7 @@ retry:
slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
WARN_ON(numa_node_id() != slabp->nodeid);
#endif
- slabp->free = next;
+ slabp->free = next;
}
check_slabp(cachep, slabp);
@@ -2445,9 +2488,9 @@ retry:
list_add(&slabp->list, &l3->slabs_partial);
}
-must_grow:
+ must_grow:
l3->free_objects -= ac->avail;
-alloc_done:
+ alloc_done:
spin_unlock(&l3->list_lock);
if (unlikely(!ac->avail)) {
@@ -2459,7 +2502,7 @@ alloc_done:
if (!x && ac->avail == 0) // no objects in sight? abort
return NULL;
- if (!ac->avail) // objects refilled by interrupt?
+ if (!ac->avail) // objects refilled by interrupt?
goto retry;
}
ac->touched = 1;
@@ -2476,16 +2519,16 @@ cache_alloc_debugcheck_before(kmem_cache_t *cachep, gfp_t flags)
}
#if DEBUG
-static void *
-cache_alloc_debugcheck_after(kmem_cache_t *cachep,
- gfp_t flags, void *objp, void *caller)
+static void *cache_alloc_debugcheck_after(kmem_cache_t *cachep, gfp_t flags,
+ void *objp, void *caller)
{
- if (!objp)
+ if (!objp)
return objp;
- if (cachep->flags & SLAB_POISON) {
+ if (cachep->flags & SLAB_POISON) {
#ifdef CONFIG_DEBUG_PAGEALLOC
if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep))
- kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 1);
+ kernel_map_pages(virt_to_page(objp),
+ cachep->objsize / PAGE_SIZE, 1);
else
check_poison_obj(cachep, objp);
#else
@@ -2497,24 +2540,28 @@ cache_alloc_debugcheck_after(kmem_cache_t *cachep,
*dbg_userword(cachep, objp) = caller;
if (cachep->flags & SLAB_RED_ZONE) {
- if (*dbg_redzone1(cachep, objp) != RED_INACTIVE || *dbg_redzone2(cachep, objp) != RED_INACTIVE) {
- slab_error(cachep, "double free, or memory outside"
- " object was overwritten");
- printk(KERN_ERR "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n",
- objp, *dbg_redzone1(cachep, objp), *dbg_redzone2(cachep, objp));
+ if (*dbg_redzone1(cachep, objp) != RED_INACTIVE
+ || *dbg_redzone2(cachep, objp) != RED_INACTIVE) {
+ slab_error(cachep,
+ "double free, or memory outside"
+ " object was overwritten");
+ printk(KERN_ERR
+ "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n",
+ objp, *dbg_redzone1(cachep, objp),
+ *dbg_redzone2(cachep, objp));
}
*dbg_redzone1(cachep, objp) = RED_ACTIVE;
*dbg_redzone2(cachep, objp) = RED_ACTIVE;
}
objp += obj_dbghead(cachep);
if (cachep->ctor && cachep->flags & SLAB_POISON) {
- unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR;
+ unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR;
if (!(flags & __GFP_WAIT))
ctor_flags |= SLAB_CTOR_ATOMIC;
cachep->ctor(objp, cachep, ctor_flags);
- }
+ }
return objp;
}
#else
@@ -2523,9 +2570,18 @@ cache_alloc_debugcheck_after(kmem_cache_t *cachep,
static inline void *____cache_alloc(kmem_cache_t *cachep, gfp_t flags)
{
- void* objp;
+ void *objp;
struct array_cache *ac;
+#ifdef CONFIG_NUMA
+ if (unlikely(current->mempolicy && !in_interrupt())) {
+ int nid = slab_node(current->mempolicy);
+
+ if (nid != numa_node_id())
+ return __cache_alloc_node(cachep, flags, nid);
+ }
+#endif
+
check_irq_off();
ac = ac_data(cachep);
if (likely(ac->avail)) {
@@ -2542,7 +2598,7 @@ static inline void *____cache_alloc(kmem_cache_t *cachep, gfp_t flags)
static inline void *__cache_alloc(kmem_cache_t *cachep, gfp_t flags)
{
unsigned long save_flags;
- void* objp;
+ void *objp;
cache_alloc_debugcheck_before(cachep, flags);
@@ -2550,7 +2606,7 @@ static inline void *__cache_alloc(kmem_cache_t *cachep, gfp_t flags)
objp = ____cache_alloc(cachep, flags);
local_irq_restore(save_flags);
objp = cache_alloc_debugcheck_after(cachep, flags, objp,
- __builtin_return_address(0));
+ __builtin_return_address(0));
prefetchw(objp);
return objp;
}
@@ -2562,74 +2618,75 @@ static inline void *__cache_alloc(kmem_cache_t *cachep, gfp_t flags)
static void *__cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid)
{
struct list_head *entry;
- struct slab *slabp;
- struct kmem_list3 *l3;
- void *obj;
- kmem_bufctl_t next;
- int x;
-
- l3 = cachep->nodelists[nodeid];
- BUG_ON(!l3);
-
-retry:
- spin_lock(&l3->list_lock);
- entry = l3->slabs_partial.next;
- if (entry == &l3->slabs_partial) {
- l3->free_touched = 1;
- entry = l3->slabs_free.next;
- if (entry == &l3->slabs_free)
- goto must_grow;
- }
-
- slabp = list_entry(entry, struct slab, list);
- check_spinlock_acquired_node(cachep, nodeid);
- check_slabp(cachep, slabp);
-
- STATS_INC_NODEALLOCS(cachep);
- STATS_INC_ACTIVE(cachep);
- STATS_SET_HIGH(cachep);
-
- BUG_ON(slabp->inuse == cachep->num);
-
- /* get obj pointer */
- obj = slabp->s_mem + slabp->free*cachep->objsize;
- slabp->inuse++;
- next = slab_bufctl(slabp)[slabp->free];
+ struct slab *slabp;
+ struct kmem_list3 *l3;
+ void *obj;
+ kmem_bufctl_t next;
+ int x;
+
+ l3 = cachep->nodelists[nodeid];
+ BUG_ON(!l3);
+
+ retry:
+ spin_lock(&l3->list_lock);
+ entry = l3->slabs_partial.next;
+ if (entry == &l3->slabs_partial) {
+ l3->free_touched = 1;
+ entry = l3->slabs_free.next;
+ if (entry == &l3->slabs_free)
+ goto must_grow;
+ }
+
+ slabp = list_entry(entry, struct slab, list);
+ check_spinlock_acquired_node(cachep, nodeid);
+ check_slabp(cachep, slabp);
+
+ STATS_INC_NODEALLOCS(cachep);
+ STATS_INC_ACTIVE(cachep);
+ STATS_SET_HIGH(cachep);
+
+ BUG_ON(slabp->inuse == cachep->num);
+
+ /* get obj pointer */
+ obj = slabp->s_mem + slabp->free * cachep->objsize;
+ slabp->inuse++;
+ next = slab_bufctl(slabp)[slabp->free];
#if DEBUG
- slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
+ slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
#endif
- slabp->free = next;
- check_slabp(cachep, slabp);
- l3->free_objects--;
- /* move slabp to correct slabp list: */
- list_del(&slabp->list);
-
- if (slabp->free == BUFCTL_END) {
- list_add(&slabp->list, &l3->slabs_full);
- } else {
- list_add(&slabp->list, &l3->slabs_partial);
- }
+ slabp->free = next;
+ check_slabp(cachep, slabp);
+ l3->free_objects--;
+ /* move slabp to correct slabp list: */
+ list_del(&slabp->list);
+
+ if (slabp->free == BUFCTL_END) {
+ list_add(&slabp->list, &l3->slabs_full);
+ } else {
+ list_add(&slabp->list, &l3->slabs_partial);
+ }
- spin_unlock(&l3->list_lock);
- goto done;
+ spin_unlock(&l3->list_lock);
+ goto done;
-must_grow:
- spin_unlock(&l3->list_lock);
- x = cache_grow(cachep, flags, nodeid);
+ must_grow:
+ spin_unlock(&l3->list_lock);
+ x = cache_grow(cachep, flags, nodeid);
- if (!x)
- return NULL;
+ if (!x)
+ return NULL;
- goto retry;
-done:
- return obj;
+ goto retry;
+ done:
+ return obj;
}
#endif
/*
* Caller needs to acquire correct kmem_list's list_lock
*/
-static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects, int node)
+static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects,
+ int node)
{
int i;
struct kmem_list3 *l3;
@@ -2652,7 +2709,7 @@ static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects, int n
if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) {
printk(KERN_ERR "slab: double free detected in cache "
- "'%s', objp %p\n", cachep->name, objp);
+ "'%s', objp %p\n", cachep->name, objp);
BUG();
}
#endif
@@ -2696,20 +2753,19 @@ static void cache_flusharray(kmem_cache_t *cachep, struct array_cache *ac)
spin_lock(&l3->list_lock);
if (l3->shared) {
struct array_cache *shared_array = l3->shared;
- int max = shared_array->limit-shared_array->avail;
+ int max = shared_array->limit - shared_array->avail;
if (max) {
if (batchcount > max)
batchcount = max;
memcpy(&(shared_array->entry[shared_array->avail]),
- ac->entry,
- sizeof(void*)*batchcount);
+ ac->entry, sizeof(void *) * batchcount);
shared_array->avail += batchcount;
goto free_done;
}
}
free_block(cachep, ac->entry, batchcount, node);
-free_done:
+ free_done:
#if STATS
{
int i = 0;
@@ -2731,10 +2787,9 @@ free_done:
spin_unlock(&l3->list_lock);
ac->avail -= batchcount;
memmove(ac->entry, &(ac->entry[batchcount]),
- sizeof(void*)*ac->avail);
+ sizeof(void *) * ac->avail);
}
-
/*
* __cache_free
* Release an obj back to its cache. If the obj has a constructed
@@ -2759,7 +2814,8 @@ static inline void __cache_free(kmem_cache_t *cachep, void *objp)
if (unlikely(slabp->nodeid != numa_node_id())) {
struct array_cache *alien = NULL;
int nodeid = slabp->nodeid;
- struct kmem_list3 *l3 = cachep->nodelists[numa_node_id()];
+ struct kmem_list3 *l3 =
+ cachep->nodelists[numa_node_id()];
STATS_INC_NODEFREES(cachep);
if (l3->alien && l3->alien[nodeid]) {
@@ -2767,15 +2823,15 @@ static inline void __cache_free(kmem_cache_t *cachep, void *objp)
spin_lock(&alien->lock);
if (unlikely(alien->avail == alien->limit))
__drain_alien_cache(cachep,
- alien, nodeid);
+ alien, nodeid);
alien->entry[alien->avail++] = objp;
spin_unlock(&alien->lock);
} else {
spin_lock(&(cachep->nodelists[nodeid])->
- list_lock);
+ list_lock);
free_block(cachep, &objp, 1, nodeid);
spin_unlock(&(cachep->nodelists[nodeid])->
- list_lock);
+ list_lock);
}
return;
}
@@ -2822,9 +2878,9 @@ EXPORT_SYMBOL(kmem_cache_alloc);
*/
int fastcall kmem_ptr_validate(kmem_cache_t *cachep, void *ptr)
{
- unsigned long addr = (unsigned long) ptr;
+ unsigned long addr = (unsigned long)ptr;
unsigned long min_addr = PAGE_OFFSET;
- unsigned long align_mask = BYTES_PER_WORD-1;
+ unsigned long align_mask = BYTES_PER_WORD - 1;
unsigned long size = cachep->objsize;
struct page *page;
@@ -2844,7 +2900,7 @@ int fastcall kmem_ptr_validate(kmem_cache_t *cachep, void *ptr)
if (unlikely(page_get_cache(page) != cachep))
goto out;
return 1;
-out:
+ out:
return 0;
}
@@ -2871,8 +2927,10 @@ void *kmem_cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid)
if (unlikely(!cachep->nodelists[nodeid])) {
/* Fall back to __cache_alloc if we run into trouble */
- printk(KERN_WARNING "slab: not allocating in inactive node %d for cache %s\n", nodeid, cachep->name);
- return __cache_alloc(cachep,flags);
+ printk(KERN_WARNING
+ "slab: not allocating in inactive node %d for cache %s\n",
+ nodeid, cachep->name);
+ return __cache_alloc(cachep, flags);
}
cache_alloc_debugcheck_before(cachep, flags);
@@ -2882,7 +2940,9 @@ void *kmem_cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid)
else
ptr = __cache_alloc_node(cachep, flags, nodeid);
local_irq_restore(save_flags);
- ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, __builtin_return_address(0));
+ ptr =
+ cache_alloc_debugcheck_after(cachep, flags, ptr,
+ __builtin_return_address(0));
return ptr;
}
@@ -2944,12 +3004,11 @@ EXPORT_SYMBOL(__kmalloc);
* Objects should be dereferenced using the per_cpu_ptr macro only.
*
* @size: how many bytes of memory are required.
- * @align: the alignment, which can't be greater than SMP_CACHE_BYTES.
*/
-void *__alloc_percpu(size_t size, size_t align)
+void *__alloc_percpu(size_t size)
{
int i;
- struct percpu_data *pdata = kmalloc(sizeof (*pdata), GFP_KERNEL);
+ struct percpu_data *pdata = kmalloc(sizeof(*pdata), GFP_KERNEL);
if (!pdata)
return NULL;
@@ -2973,9 +3032,9 @@ void *__alloc_percpu(size_t size, size_t align)
}
/* Catch derefs w/o wrappers */
- return (void *) (~(unsigned long) pdata);
+ return (void *)(~(unsigned long)pdata);
-unwind_oom:
+ unwind_oom:
while (--i >= 0) {
if (!cpu_possible(i))
continue;
@@ -3006,20 +3065,6 @@ void kmem_cache_free(kmem_cache_t *cachep, void *objp)
EXPORT_SYMBOL(kmem_cache_free);
/**
- * kzalloc - allocate memory. The memory is set to zero.
- * @size: how many bytes of memory are required.
- * @flags: the type of memory to allocate.
- */
-void *kzalloc(size_t size, gfp_t flags)
-{
- void *ret = kmalloc(size, flags);
- if (ret)
- memset(ret, 0, size);
- return ret;
-}
-EXPORT_SYMBOL(kzalloc);
-
-/**
* kfree - free previously allocated memory
* @objp: pointer returned by kmalloc.
*
@@ -3038,7 +3083,8 @@ void kfree(const void *objp)
local_irq_save(flags);
kfree_debugcheck(objp);
c = page_get_cache(virt_to_page(objp));
- __cache_free(c, (void*)objp);
+ mutex_debug_check_no_locks_freed(objp, obj_reallen(c));
+ __cache_free(c, (void *)objp);
local_irq_restore(flags);
}
EXPORT_SYMBOL(kfree);
@@ -3051,17 +3097,16 @@ EXPORT_SYMBOL(kfree);
* Don't free memory not originally allocated by alloc_percpu()
* The complemented objp is to check for that.
*/
-void
-free_percpu(const void *objp)
+void free_percpu(const void *objp)
{
int i;
- struct percpu_data *p = (struct percpu_data *) (~(unsigned long) objp);
+ struct percpu_data *p = (struct percpu_data *)(~(unsigned long)objp);
/*
* We allocate for all cpus so we cannot use for online cpu here.
*/
for_each_cpu(i)
- kfree(p->ptrs[i]);
+ kfree(p->ptrs[i]);
kfree(p);
}
EXPORT_SYMBOL(free_percpu);
@@ -3095,44 +3140,44 @@ static int alloc_kmemlist(kmem_cache_t *cachep)
if (!(new_alien = alloc_alien_cache(node, cachep->limit)))
goto fail;
#endif
- if (!(new = alloc_arraycache(node, (cachep->shared*
- cachep->batchcount), 0xbaadf00d)))
+ if (!(new = alloc_arraycache(node, (cachep->shared *
+ cachep->batchcount),
+ 0xbaadf00d)))
goto fail;
if ((l3 = cachep->nodelists[node])) {
spin_lock_irq(&l3->list_lock);
if ((nc = cachep->nodelists[node]->shared))
- free_block(cachep, nc->entry,
- nc->avail, node);
+ free_block(cachep, nc->entry, nc->avail, node);
l3->shared = new;
if (!cachep->nodelists[node]->alien) {
l3->alien = new_alien;
new_alien = NULL;
}
- l3->free_limit = (1 + nr_cpus_node(node))*
- cachep->batchcount + cachep->num;
+ l3->free_limit = (1 + nr_cpus_node(node)) *
+ cachep->batchcount + cachep->num;
spin_unlock_irq(&l3->list_lock);
kfree(nc);
free_alien_cache(new_alien);
continue;
}
if (!(l3 = kmalloc_node(sizeof(struct kmem_list3),
- GFP_KERNEL, node)))
+ GFP_KERNEL, node)))
goto fail;
kmem_list3_init(l3);
l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
- ((unsigned long)cachep)%REAPTIMEOUT_LIST3;
+ ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
l3->shared = new;
l3->alien = new_alien;
- l3->free_limit = (1 + nr_cpus_node(node))*
- cachep->batchcount + cachep->num;
+ l3->free_limit = (1 + nr_cpus_node(node)) *
+ cachep->batchcount + cachep->num;
cachep->nodelists[node] = l3;
}
return err;
-fail:
+ fail:
err = -ENOMEM;
return err;
}
@@ -3154,18 +3199,19 @@ static void do_ccupdate_local(void *info)
new->new[smp_processor_id()] = old;
}
-
static int do_tune_cpucache(kmem_cache_t *cachep, int limit, int batchcount,
- int shared)
+ int shared)
{
struct ccupdate_struct new;
int i, err;
- memset(&new.new,0,sizeof(new.new));
+ memset(&new.new, 0, sizeof(new.new));
for_each_online_cpu(i) {
- new.new[i] = alloc_arraycache(cpu_to_node(i), limit, batchcount);
+ new.new[i] =
+ alloc_arraycache(cpu_to_node(i), limit, batchcount);
if (!new.new[i]) {
- for (i--; i >= 0; i--) kfree(new.new[i]);
+ for (i--; i >= 0; i--)
+ kfree(new.new[i]);
return -ENOMEM;
}
}
@@ -3193,13 +3239,12 @@ static int do_tune_cpucache(kmem_cache_t *cachep, int limit, int batchcount,
err = alloc_kmemlist(cachep);
if (err) {
printk(KERN_ERR "alloc_kmemlist failed for %s, error %d.\n",
- cachep->name, -err);
+ cachep->name, -err);
BUG();
}
return 0;
}
-
static void enable_cpucache(kmem_cache_t *cachep)
{
int err;
@@ -3246,14 +3291,14 @@ static void enable_cpucache(kmem_cache_t *cachep)
if (limit > 32)
limit = 32;
#endif
- err = do_tune_cpucache(cachep, limit, (limit+1)/2, shared);
+ err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared);
if (err)
printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
- cachep->name, -err);
+ cachep->name, -err);
}
-static void drain_array_locked(kmem_cache_t *cachep,
- struct array_cache *ac, int force, int node)
+static void drain_array_locked(kmem_cache_t *cachep, struct array_cache *ac,
+ int force, int node)
{
int tofree;
@@ -3261,14 +3306,14 @@ static void drain_array_locked(kmem_cache_t *cachep,
if (ac->touched && !force) {
ac->touched = 0;
} else if (ac->avail) {
- tofree = force ? ac->avail : (ac->limit+4)/5;
+ tofree = force ? ac->avail : (ac->limit + 4) / 5;
if (tofree > ac->avail) {
- tofree = (ac->avail+1)/2;
+ tofree = (ac->avail + 1) / 2;
}
free_block(cachep, ac->entry, tofree, node);
ac->avail -= tofree;
memmove(ac->entry, &(ac->entry[tofree]),
- sizeof(void*)*ac->avail);
+ sizeof(void *) * ac->avail);
}
}
@@ -3281,7 +3326,7 @@ static void drain_array_locked(kmem_cache_t *cachep,
* - clear the per-cpu caches for this CPU.
* - return freeable pages to the main free memory pool.
*
- * If we cannot acquire the cache chain semaphore then just give up - we'll
+ * If we cannot acquire the cache chain mutex then just give up - we'll
* try again on the next iteration.
*/
static void cache_reap(void *unused)
@@ -3289,15 +3334,16 @@ static void cache_reap(void *unused)
struct list_head *walk;
struct kmem_list3 *l3;
- if (down_trylock(&cache_chain_sem)) {
+ if (!mutex_trylock(&cache_chain_mutex)) {
/* Give up. Setup the next iteration. */
- schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC);
+ schedule_delayed_work(&__get_cpu_var(reap_work),
+ REAPTIMEOUT_CPUC);
return;
}
list_for_each(walk, &cache_chain) {
kmem_cache_t *searchp;
- struct list_head* p;
+ struct list_head *p;
int tofree;
struct slab *slabp;
@@ -3314,7 +3360,7 @@ static void cache_reap(void *unused)
spin_lock_irq(&l3->list_lock);
drain_array_locked(searchp, ac_data(searchp), 0,
- numa_node_id());
+ numa_node_id());
if (time_after(l3->next_reap, jiffies))
goto next_unlock;
@@ -3323,14 +3369,16 @@ static void cache_reap(void *unused)
if (l3->shared)
drain_array_locked(searchp, l3->shared, 0,
- numa_node_id());
+ numa_node_id());
if (l3->free_touched) {
l3->free_touched = 0;
goto next_unlock;
}
- tofree = (l3->free_limit+5*searchp->num-1)/(5*searchp->num);
+ tofree =
+ (l3->free_limit + 5 * searchp->num -
+ 1) / (5 * searchp->num);
do {
p = l3->slabs_free.next;
if (p == &(l3->slabs_free))
@@ -3350,14 +3398,14 @@ static void cache_reap(void *unused)
spin_unlock_irq(&l3->list_lock);
slab_destroy(searchp, slabp);
spin_lock_irq(&l3->list_lock);
- } while(--tofree > 0);
-next_unlock:
+ } while (--tofree > 0);
+ next_unlock:
spin_unlock_irq(&l3->list_lock);
-next:
+ next:
cond_resched();
}
check_irq_on();
- up(&cache_chain_sem);
+ mutex_unlock(&cache_chain_mutex);
drain_remote_pages();
/* Setup the next iteration */
schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC);
@@ -3365,32 +3413,37 @@ next:
#ifdef CONFIG_PROC_FS
-static void *s_start(struct seq_file *m, loff_t *pos)
+static void print_slabinfo_header(struct seq_file *m)
{
- loff_t n = *pos;
- struct list_head *p;
-
- down(&cache_chain_sem);
- if (!n) {
- /*
- * Output format version, so at least we can change it
- * without _too_ many complaints.
- */
+ /*
+ * Output format version, so at least we can change it
+ * without _too_ many complaints.
+ */
#if STATS
- seq_puts(m, "slabinfo - version: 2.1 (statistics)\n");
+ seq_puts(m, "slabinfo - version: 2.1 (statistics)\n");
#else
- seq_puts(m, "slabinfo - version: 2.1\n");
+ seq_puts(m, "slabinfo - version: 2.1\n");
#endif
- seq_puts(m, "# name <active_objs> <num_objs> <objsize> <objperslab> <pagesperslab>");
- seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
- seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
+ seq_puts(m, "# name <active_objs> <num_objs> <objsize> "
+ "<objperslab> <pagesperslab>");
+ seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
+ seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
#if STATS
- seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped>"
- " <error> <maxfreeable> <nodeallocs> <remotefrees>");
- seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
+ seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> "
+ "<error> <maxfreeable> <nodeallocs> <remotefrees>");
+ seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
#endif
- seq_putc(m, '\n');
- }
+ seq_putc(m, '\n');
+}
+
+static void *s_start(struct seq_file *m, loff_t *pos)
+{
+ loff_t n = *pos;
+ struct list_head *p;
+
+ mutex_lock(&cache_chain_mutex);
+ if (!n)
+ print_slabinfo_header(m);
p = cache_chain.next;
while (n--) {
p = p->next;
@@ -3405,23 +3458,23 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos)
kmem_cache_t *cachep = p;
++*pos;
return cachep->next.next == &cache_chain ? NULL
- : list_entry(cachep->next.next, kmem_cache_t, next);
+ : list_entry(cachep->next.next, kmem_cache_t, next);
}
static void s_stop(struct seq_file *m, void *p)
{
- up(&cache_chain_sem);
+ mutex_unlock(&cache_chain_mutex);
}
static int s_show(struct seq_file *m, void *p)
{
kmem_cache_t *cachep = p;
struct list_head *q;
- struct slab *slabp;
- unsigned long active_objs;
- unsigned long num_objs;
- unsigned long active_slabs = 0;
- unsigned long num_slabs, free_objects = 0, shared_avail = 0;
+ struct slab *slabp;
+ unsigned long active_objs;
+ unsigned long num_objs;
+ unsigned long active_slabs = 0;
+ unsigned long num_slabs, free_objects = 0, shared_avail = 0;
const char *name;
char *error = NULL;
int node;
@@ -3438,14 +3491,14 @@ static int s_show(struct seq_file *m, void *p)
spin_lock(&l3->list_lock);
- list_for_each(q,&l3->slabs_full) {
+ list_for_each(q, &l3->slabs_full) {
slabp = list_entry(q, struct slab, list);
if (slabp->inuse != cachep->num && !error)
error = "slabs_full accounting error";
active_objs += cachep->num;
active_slabs++;
}
- list_for_each(q,&l3->slabs_partial) {
+ list_for_each(q, &l3->slabs_partial) {
slabp = list_entry(q, struct slab, list);
if (slabp->inuse == cachep->num && !error)
error = "slabs_partial inuse accounting error";
@@ -3454,7 +3507,7 @@ static int s_show(struct seq_file *m, void *p)
active_objs += slabp->inuse;
active_slabs++;
}
- list_for_each(q,&l3->slabs_free) {
+ list_for_each(q, &l3->slabs_free) {
slabp = list_entry(q, struct slab, list);
if (slabp->inuse && !error)
error = "slabs_free/inuse accounting error";
@@ -3465,25 +3518,24 @@ static int s_show(struct seq_file *m, void *p)
spin_unlock(&l3->list_lock);
}
- num_slabs+=active_slabs;
- num_objs = num_slabs*cachep->num;
+ num_slabs += active_slabs;
+ num_objs = num_slabs * cachep->num;
if (num_objs - active_objs != free_objects && !error)
error = "free_objects accounting error";
- name = cachep->name;
+ name = cachep->name;
if (error)
printk(KERN_ERR "slab: cache %s error: %s\n", name, error);
seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
- name, active_objs, num_objs, cachep->objsize,
- cachep->num, (1<<cachep->gfporder));
+ name, active_objs, num_objs, cachep->objsize,
+ cachep->num, (1 << cachep->gfporder));
seq_printf(m, " : tunables %4u %4u %4u",
- cachep->limit, cachep->batchcount,
- cachep->shared);
+ cachep->limit, cachep->batchcount, cachep->shared);
seq_printf(m, " : slabdata %6lu %6lu %6lu",
- active_slabs, num_slabs, shared_avail);
+ active_slabs, num_slabs, shared_avail);
#if STATS
- { /* list3 stats */
+ { /* list3 stats */
unsigned long high = cachep->high_mark;
unsigned long allocs = cachep->num_allocations;
unsigned long grown = cachep->grown;
@@ -3494,9 +3546,7 @@ static int s_show(struct seq_file *m, void *p)
unsigned long node_frees = cachep->node_frees;
seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \
- %4lu %4lu %4lu %4lu",
- allocs, high, grown, reaped, errors,
- max_freeable, node_allocs, node_frees);
+ %4lu %4lu %4lu %4lu", allocs, high, grown, reaped, errors, max_freeable, node_allocs, node_frees);
}
/* cpu stats */
{
@@ -3506,7 +3556,7 @@ static int s_show(struct seq_file *m, void *p)
unsigned long freemiss = atomic_read(&cachep->freemiss);
seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu",
- allochit, allocmiss, freehit, freemiss);
+ allochit, allocmiss, freehit, freemiss);
}
#endif
seq_putc(m, '\n');
@@ -3529,10 +3579,10 @@ static int s_show(struct seq_file *m, void *p)
*/
struct seq_operations slabinfo_op = {
- .start = s_start,
- .next = s_next,
- .stop = s_stop,
- .show = s_show,
+ .start = s_start,
+ .next = s_next,
+ .stop = s_stop,
+ .show = s_show,
};
#define MAX_SLABINFO_WRITE 128
@@ -3543,18 +3593,18 @@ struct seq_operations slabinfo_op = {
* @count: data length
* @ppos: unused
*/
-ssize_t slabinfo_write(struct file *file, const char __user *buffer,
- size_t count, loff_t *ppos)
+ssize_t slabinfo_write(struct file *file, const char __user * buffer,
+ size_t count, loff_t *ppos)
{
- char kbuf[MAX_SLABINFO_WRITE+1], *tmp;
+ char kbuf[MAX_SLABINFO_WRITE + 1], *tmp;
int limit, batchcount, shared, res;
struct list_head *p;
-
+
if (count > MAX_SLABINFO_WRITE)
return -EINVAL;
if (copy_from_user(&kbuf, buffer, count))
return -EFAULT;
- kbuf[MAX_SLABINFO_WRITE] = '\0';
+ kbuf[MAX_SLABINFO_WRITE] = '\0';
tmp = strchr(kbuf, ' ');
if (!tmp)
@@ -3565,25 +3615,24 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer,
return -EINVAL;
/* Find the cache in the chain of caches. */
- down(&cache_chain_sem);
+ mutex_lock(&cache_chain_mutex);
res = -EINVAL;
- list_for_each(p,&cache_chain) {
+ list_for_each(p, &cache_chain) {
kmem_cache_t *cachep = list_entry(p, kmem_cache_t, next);
if (!strcmp(cachep->name, kbuf)) {
if (limit < 1 ||
batchcount < 1 ||
- batchcount > limit ||
- shared < 0) {
+ batchcount > limit || shared < 0) {
res = 0;
} else {
res = do_tune_cpucache(cachep, limit,
- batchcount, shared);
+ batchcount, shared);
}
break;
}
}
- up(&cache_chain_sem);
+ mutex_unlock(&cache_chain_mutex);
if (res >= 0)
res = count;
return res;
@@ -3609,26 +3658,3 @@ unsigned int ksize(const void *objp)
return obj_reallen(page_get_cache(virt_to_page(objp)));
}
-
-
-/*
- * kstrdup - allocate space for and copy an existing string
- *
- * @s: the string to duplicate
- * @gfp: the GFP mask used in the kmalloc() call when allocating memory
- */
-char *kstrdup(const char *s, gfp_t gfp)
-{
- size_t len;
- char *buf;
-
- if (!s)
- return NULL;
-
- len = strlen(s) + 1;
- buf = kmalloc(len, gfp);
- if (buf)
- memcpy(buf, s, len);
- return buf;
-}
-EXPORT_SYMBOL(kstrdup);
diff --git a/mm/slob.c b/mm/slob.c
new file mode 100644
index 00000000000..1c240c4b71d
--- /dev/null
+++ b/mm/slob.c
@@ -0,0 +1,385 @@
+/*
+ * SLOB Allocator: Simple List Of Blocks
+ *
+ * Matt Mackall <mpm@selenic.com> 12/30/03
+ *
+ * How SLOB works:
+ *
+ * The core of SLOB is a traditional K&R style heap allocator, with
+ * support for returning aligned objects. The granularity of this
+ * allocator is 8 bytes on x86, though it's perhaps possible to reduce
+ * this to 4 if it's deemed worth the effort. The slob heap is a
+ * singly-linked list of pages from __get_free_page, grown on demand
+ * and allocation from the heap is currently first-fit.
+ *
+ * Above this is an implementation of kmalloc/kfree. Blocks returned
+ * from kmalloc are 8-byte aligned and prepended with a 8-byte header.
+ * If kmalloc is asked for objects of PAGE_SIZE or larger, it calls
+ * __get_free_pages directly so that it can return page-aligned blocks
+ * and keeps a linked list of such pages and their orders. These
+ * objects are detected in kfree() by their page alignment.
+ *
+ * SLAB is emulated on top of SLOB by simply calling constructors and
+ * destructors for every SLAB allocation. Objects are returned with
+ * the 8-byte alignment unless the SLAB_MUST_HWCACHE_ALIGN flag is
+ * set, in which case the low-level allocator will fragment blocks to
+ * create the proper alignment. Again, objects of page-size or greater
+ * are allocated by calling __get_free_pages. As SLAB objects know
+ * their size, no separate size bookkeeping is necessary and there is
+ * essentially no allocation space overhead.
+ */
+
+#include <linux/config.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/cache.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/timer.h>
+
+struct slob_block {
+ int units;
+ struct slob_block *next;
+};
+typedef struct slob_block slob_t;
+
+#define SLOB_UNIT sizeof(slob_t)
+#define SLOB_UNITS(size) (((size) + SLOB_UNIT - 1)/SLOB_UNIT)
+#define SLOB_ALIGN L1_CACHE_BYTES
+
+struct bigblock {
+ int order;
+ void *pages;
+ struct bigblock *next;
+};
+typedef struct bigblock bigblock_t;
+
+static slob_t arena = { .next = &arena, .units = 1 };
+static slob_t *slobfree = &arena;
+static bigblock_t *bigblocks;
+static DEFINE_SPINLOCK(slob_lock);
+static DEFINE_SPINLOCK(block_lock);
+
+static void slob_free(void *b, int size);
+
+static void *slob_alloc(size_t size, gfp_t gfp, int align)
+{
+ slob_t *prev, *cur, *aligned = 0;
+ int delta = 0, units = SLOB_UNITS(size);
+ unsigned long flags;
+
+ spin_lock_irqsave(&slob_lock, flags);
+ prev = slobfree;
+ for (cur = prev->next; ; prev = cur, cur = cur->next) {
+ if (align) {
+ aligned = (slob_t *)ALIGN((unsigned long)cur, align);
+ delta = aligned - cur;
+ }
+ if (cur->units >= units + delta) { /* room enough? */
+ if (delta) { /* need to fragment head to align? */
+ aligned->units = cur->units - delta;
+ aligned->next = cur->next;
+ cur->next = aligned;
+ cur->units = delta;
+ prev = cur;
+ cur = aligned;
+ }
+
+ if (cur->units == units) /* exact fit? */
+ prev->next = cur->next; /* unlink */
+ else { /* fragment */
+ prev->next = cur + units;
+ prev->next->units = cur->units - units;
+ prev->next->next = cur->next;
+ cur->units = units;
+ }
+
+ slobfree = prev;
+ spin_unlock_irqrestore(&slob_lock, flags);
+ return cur;
+ }
+ if (cur == slobfree) {
+ spin_unlock_irqrestore(&slob_lock, flags);
+
+ if (size == PAGE_SIZE) /* trying to shrink arena? */
+ return 0;
+
+ cur = (slob_t *)__get_free_page(gfp);
+ if (!cur)
+ return 0;
+
+ slob_free(cur, PAGE_SIZE);
+ spin_lock_irqsave(&slob_lock, flags);
+ cur = slobfree;
+ }
+ }
+}
+
+static void slob_free(void *block, int size)
+{
+ slob_t *cur, *b = (slob_t *)block;
+ unsigned long flags;
+
+ if (!block)
+ return;
+
+ if (size)
+ b->units = SLOB_UNITS(size);
+
+ /* Find reinsertion point */
+ spin_lock_irqsave(&slob_lock, flags);
+ for (cur = slobfree; !(b > cur && b < cur->next); cur = cur->next)
+ if (cur >= cur->next && (b > cur || b < cur->next))
+ break;
+
+ if (b + b->units == cur->next) {
+ b->units += cur->next->units;
+ b->next = cur->next->next;
+ } else
+ b->next = cur->next;
+
+ if (cur + cur->units == b) {
+ cur->units += b->units;
+ cur->next = b->next;
+ } else
+ cur->next = b;
+
+ slobfree = cur;
+
+ spin_unlock_irqrestore(&slob_lock, flags);
+}
+
+static int FASTCALL(find_order(int size));
+static int fastcall find_order(int size)
+{
+ int order = 0;
+ for ( ; size > 4096 ; size >>=1)
+ order++;
+ return order;
+}
+
+void *kmalloc(size_t size, gfp_t gfp)
+{
+ slob_t *m;
+ bigblock_t *bb;
+ unsigned long flags;
+
+ if (size < PAGE_SIZE - SLOB_UNIT) {
+ m = slob_alloc(size + SLOB_UNIT, gfp, 0);
+ return m ? (void *)(m + 1) : 0;
+ }
+
+ bb = slob_alloc(sizeof(bigblock_t), gfp, 0);
+ if (!bb)
+ return 0;
+
+ bb->order = find_order(size);
+ bb->pages = (void *)__get_free_pages(gfp, bb->order);
+
+ if (bb->pages) {
+ spin_lock_irqsave(&block_lock, flags);
+ bb->next = bigblocks;
+ bigblocks = bb;
+ spin_unlock_irqrestore(&block_lock, flags);
+ return bb->pages;
+ }
+
+ slob_free(bb, sizeof(bigblock_t));
+ return 0;
+}
+
+EXPORT_SYMBOL(kmalloc);
+
+void kfree(const void *block)
+{
+ bigblock_t *bb, **last = &bigblocks;
+ unsigned long flags;
+
+ if (!block)
+ return;
+
+ if (!((unsigned long)block & (PAGE_SIZE-1))) {
+ /* might be on the big block list */
+ spin_lock_irqsave(&block_lock, flags);
+ for (bb = bigblocks; bb; last = &bb->next, bb = bb->next) {
+ if (bb->pages == block) {
+ *last = bb->next;
+ spin_unlock_irqrestore(&block_lock, flags);
+ free_pages((unsigned long)block, bb->order);
+ slob_free(bb, sizeof(bigblock_t));
+ return;
+ }
+ }
+ spin_unlock_irqrestore(&block_lock, flags);
+ }
+
+ slob_free((slob_t *)block - 1, 0);
+ return;
+}
+
+EXPORT_SYMBOL(kfree);
+
+unsigned int ksize(const void *block)
+{
+ bigblock_t *bb;
+ unsigned long flags;
+
+ if (!block)
+ return 0;
+
+ if (!((unsigned long)block & (PAGE_SIZE-1))) {
+ spin_lock_irqsave(&block_lock, flags);
+ for (bb = bigblocks; bb; bb = bb->next)
+ if (bb->pages == block) {
+ spin_unlock_irqrestore(&slob_lock, flags);
+ return PAGE_SIZE << bb->order;
+ }
+ spin_unlock_irqrestore(&block_lock, flags);
+ }
+
+ return ((slob_t *)block - 1)->units * SLOB_UNIT;
+}
+
+struct kmem_cache {
+ unsigned int size, align;
+ const char *name;
+ void (*ctor)(void *, struct kmem_cache *, unsigned long);
+ void (*dtor)(void *, struct kmem_cache *, unsigned long);
+};
+
+struct kmem_cache *kmem_cache_create(const char *name, size_t size,
+ size_t align, unsigned long flags,
+ void (*ctor)(void*, struct kmem_cache *, unsigned long),
+ void (*dtor)(void*, struct kmem_cache *, unsigned long))
+{
+ struct kmem_cache *c;
+
+ c = slob_alloc(sizeof(struct kmem_cache), flags, 0);
+
+ if (c) {
+ c->name = name;
+ c->size = size;
+ c->ctor = ctor;
+ c->dtor = dtor;
+ /* ignore alignment unless it's forced */
+ c->align = (flags & SLAB_MUST_HWCACHE_ALIGN) ? SLOB_ALIGN : 0;
+ if (c->align < align)
+ c->align = align;
+ }
+
+ return c;
+}
+EXPORT_SYMBOL(kmem_cache_create);
+
+int kmem_cache_destroy(struct kmem_cache *c)
+{
+ slob_free(c, sizeof(struct kmem_cache));
+ return 0;
+}
+EXPORT_SYMBOL(kmem_cache_destroy);
+
+void *kmem_cache_alloc(struct kmem_cache *c, gfp_t flags)
+{
+ void *b;
+
+ if (c->size < PAGE_SIZE)
+ b = slob_alloc(c->size, flags, c->align);
+ else
+ b = (void *)__get_free_pages(flags, find_order(c->size));
+
+ if (c->ctor)
+ c->ctor(b, c, SLAB_CTOR_CONSTRUCTOR);
+
+ return b;
+}
+EXPORT_SYMBOL(kmem_cache_alloc);
+
+void kmem_cache_free(struct kmem_cache *c, void *b)
+{
+ if (c->dtor)
+ c->dtor(b, c, 0);
+
+ if (c->size < PAGE_SIZE)
+ slob_free(b, c->size);
+ else
+ free_pages((unsigned long)b, find_order(c->size));
+}
+EXPORT_SYMBOL(kmem_cache_free);
+
+unsigned int kmem_cache_size(struct kmem_cache *c)
+{
+ return c->size;
+}
+EXPORT_SYMBOL(kmem_cache_size);
+
+const char *kmem_cache_name(struct kmem_cache *c)
+{
+ return c->name;
+}
+EXPORT_SYMBOL(kmem_cache_name);
+
+static struct timer_list slob_timer = TIMER_INITIALIZER(
+ (void (*)(unsigned long))kmem_cache_init, 0, 0);
+
+void kmem_cache_init(void)
+{
+ void *p = slob_alloc(PAGE_SIZE, 0, PAGE_SIZE-1);
+
+ if (p)
+ free_page((unsigned long)p);
+
+ mod_timer(&slob_timer, jiffies + HZ);
+}
+
+atomic_t slab_reclaim_pages = ATOMIC_INIT(0);
+EXPORT_SYMBOL(slab_reclaim_pages);
+
+#ifdef CONFIG_SMP
+
+void *__alloc_percpu(size_t size, size_t align)
+{
+ int i;
+ struct percpu_data *pdata = kmalloc(sizeof (*pdata), GFP_KERNEL);
+
+ if (!pdata)
+ return NULL;
+
+ for (i = 0; i < NR_CPUS; i++) {
+ if (!cpu_possible(i))
+ continue;
+ pdata->ptrs[i] = kmalloc(size, GFP_KERNEL);
+ if (!pdata->ptrs[i])
+ goto unwind_oom;
+ memset(pdata->ptrs[i], 0, size);
+ }
+
+ /* Catch derefs w/o wrappers */
+ return (void *) (~(unsigned long) pdata);
+
+unwind_oom:
+ while (--i >= 0) {
+ if (!cpu_possible(i))
+ continue;
+ kfree(pdata->ptrs[i]);
+ }
+ kfree(pdata);
+ return NULL;
+}
+EXPORT_SYMBOL(__alloc_percpu);
+
+void
+free_percpu(const void *objp)
+{
+ int i;
+ struct percpu_data *p = (struct percpu_data *) (~(unsigned long) objp);
+
+ for (i = 0; i < NR_CPUS; i++) {
+ if (!cpu_possible(i))
+ continue;
+ kfree(p->ptrs[i]);
+ }
+ kfree(p);
+}
+EXPORT_SYMBOL(free_percpu);
+
+#endif
diff --git a/mm/sparse.c b/mm/sparse.c
index 72079b538e2..0a51f36ba3a 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -18,10 +18,10 @@
*/
#ifdef CONFIG_SPARSEMEM_EXTREME
struct mem_section *mem_section[NR_SECTION_ROOTS]
- ____cacheline_maxaligned_in_smp;
+ ____cacheline_internodealigned_in_smp;
#else
struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]
- ____cacheline_maxaligned_in_smp;
+ ____cacheline_internodealigned_in_smp;
#endif
EXPORT_SYMBOL(mem_section);
diff --git a/mm/swap.c b/mm/swap.c
index 73d351439ef..bc2442a7b0e 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -156,18 +156,50 @@ void fastcall lru_cache_add_active(struct page *page)
put_cpu_var(lru_add_active_pvecs);
}
-void lru_add_drain(void)
+static void __lru_add_drain(int cpu)
{
- struct pagevec *pvec = &get_cpu_var(lru_add_pvecs);
+ struct pagevec *pvec = &per_cpu(lru_add_pvecs, cpu);
+ /* CPU is dead, so no locking needed. */
if (pagevec_count(pvec))
__pagevec_lru_add(pvec);
- pvec = &__get_cpu_var(lru_add_active_pvecs);
+ pvec = &per_cpu(lru_add_active_pvecs, cpu);
if (pagevec_count(pvec))
__pagevec_lru_add_active(pvec);
- put_cpu_var(lru_add_pvecs);
}
+void lru_add_drain(void)
+{
+ __lru_add_drain(get_cpu());
+ put_cpu();
+}
+
+#ifdef CONFIG_NUMA
+static void lru_add_drain_per_cpu(void *dummy)
+{
+ lru_add_drain();
+}
+
+/*
+ * Returns 0 for success
+ */
+int lru_add_drain_all(void)
+{
+ return schedule_on_each_cpu(lru_add_drain_per_cpu, NULL);
+}
+
+#else
+
+/*
+ * Returns 0 for success
+ */
+int lru_add_drain_all(void)
+{
+ lru_add_drain();
+ return 0;
+}
+#endif
+
/*
* This path almost never happens for VM activity - pages are normally
* freed via pagevecs. But it gets used by networking.
@@ -378,6 +410,8 @@ unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
return pagevec_count(pvec);
}
+EXPORT_SYMBOL(pagevec_lookup);
+
unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
pgoff_t *index, int tag, unsigned nr_pages)
{
@@ -412,17 +446,6 @@ void vm_acct_memory(long pages)
}
#ifdef CONFIG_HOTPLUG_CPU
-static void lru_drain_cache(unsigned int cpu)
-{
- struct pagevec *pvec = &per_cpu(lru_add_pvecs, cpu);
-
- /* CPU is dead, so no locking needed. */
- if (pagevec_count(pvec))
- __pagevec_lru_add(pvec);
- pvec = &per_cpu(lru_add_active_pvecs, cpu);
- if (pagevec_count(pvec))
- __pagevec_lru_add_active(pvec);
-}
/* Drop the CPU's cached committed space back into the central pool. */
static int cpu_swap_callback(struct notifier_block *nfb,
@@ -435,7 +458,7 @@ static int cpu_swap_callback(struct notifier_block *nfb,
if (action == CPU_DEAD) {
atomic_add(*committed, &vm_committed_space);
*committed = 0;
- lru_drain_cache((long)hcpu);
+ __lru_add_drain((long)hcpu);
}
return NOTIFY_OK;
}
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 0df9a57b1de..7b09ac503fe 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -14,6 +14,7 @@
#include <linux/pagemap.h>
#include <linux/buffer_head.h>
#include <linux/backing-dev.h>
+#include <linux/pagevec.h>
#include <asm/pgtable.h>
@@ -140,7 +141,7 @@ void __delete_from_swap_cache(struct page *page)
* Allocate swap space for the page and add the page to the
* swap cache. Caller needs to hold the page lock.
*/
-int add_to_swap(struct page * page)
+int add_to_swap(struct page * page, gfp_t gfp_mask)
{
swp_entry_t entry;
int err;
@@ -165,7 +166,7 @@ int add_to_swap(struct page * page)
* Add it to the swap cache and mark it dirty
*/
err = __add_to_swap_cache(page, entry,
- GFP_ATOMIC|__GFP_NOMEMALLOC|__GFP_NOWARN);
+ gfp_mask|__GFP_NOMEMALLOC|__GFP_NOWARN);
switch (err) {
case 0: /* Success */
@@ -272,12 +273,11 @@ void free_page_and_swap_cache(struct page *page)
*/
void free_pages_and_swap_cache(struct page **pages, int nr)
{
- int chunk = 16;
struct page **pagep = pages;
lru_add_drain();
while (nr) {
- int todo = min(chunk, nr);
+ int todo = min(nr, PAGEVEC_SIZE);
int i;
for (i = 0; i < todo; i++)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index edafeace301..f1e69c30d20 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -25,6 +25,8 @@
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/backing-dev.h>
+#include <linux/mutex.h>
+#include <linux/capability.h>
#include <linux/syscalls.h>
#include <asm/pgtable.h>
@@ -45,12 +47,12 @@ struct swap_list_t swap_list = {-1, -1};
struct swap_info_struct swap_info[MAX_SWAPFILES];
-static DECLARE_MUTEX(swapon_sem);
+static DEFINE_MUTEX(swapon_mutex);
/*
* We need this because the bdev->unplug_fn can sleep and we cannot
* hold swap_lock while calling the unplug_fn. And swap_lock
- * cannot be turned into a semaphore.
+ * cannot be turned into a mutex.
*/
static DECLARE_RWSEM(swap_unplug_sem);
@@ -211,6 +213,26 @@ noswap:
return (swp_entry_t) {0};
}
+swp_entry_t get_swap_page_of_type(int type)
+{
+ struct swap_info_struct *si;
+ pgoff_t offset;
+
+ spin_lock(&swap_lock);
+ si = swap_info + type;
+ if (si->flags & SWP_WRITEOK) {
+ nr_swap_pages--;
+ offset = scan_swap_map(si);
+ if (offset) {
+ spin_unlock(&swap_lock);
+ return swp_entry(type, offset);
+ }
+ nr_swap_pages++;
+ }
+ spin_unlock(&swap_lock);
+ return (swp_entry_t) {0};
+}
+
static struct swap_info_struct * swap_info_get(swp_entry_t entry)
{
struct swap_info_struct * p;
@@ -1140,7 +1162,7 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
up_write(&swap_unplug_sem);
destroy_swap_extents(p);
- down(&swapon_sem);
+ mutex_lock(&swapon_mutex);
spin_lock(&swap_lock);
drain_mmlist();
@@ -1159,7 +1181,7 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
p->swap_map = NULL;
p->flags = 0;
spin_unlock(&swap_lock);
- up(&swapon_sem);
+ mutex_unlock(&swapon_mutex);
vfree(swap_map);
inode = mapping->host;
if (S_ISBLK(inode->i_mode)) {
@@ -1167,9 +1189,9 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
set_blocksize(bdev, p->old_block_size);
bd_release(bdev);
} else {
- down(&inode->i_sem);
+ mutex_lock(&inode->i_mutex);
inode->i_flags &= ~S_SWAPFILE;
- up(&inode->i_sem);
+ mutex_unlock(&inode->i_mutex);
}
filp_close(swap_file, NULL);
err = 0;
@@ -1188,7 +1210,7 @@ static void *swap_start(struct seq_file *swap, loff_t *pos)
int i;
loff_t l = *pos;
- down(&swapon_sem);
+ mutex_lock(&swapon_mutex);
for (i = 0; i < nr_swapfiles; i++, ptr++) {
if (!(ptr->flags & SWP_USED) || !ptr->swap_map)
@@ -1217,7 +1239,7 @@ static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
static void swap_stop(struct seq_file *swap, void *v)
{
- up(&swapon_sem);
+ mutex_unlock(&swapon_mutex);
}
static int swap_show(struct seq_file *swap, void *v)
@@ -1386,7 +1408,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
p->bdev = bdev;
} else if (S_ISREG(inode->i_mode)) {
p->bdev = inode->i_sb->s_bdev;
- down(&inode->i_sem);
+ mutex_lock(&inode->i_mutex);
did_down = 1;
if (IS_SWAPFILE(inode)) {
error = -EBUSY;
@@ -1422,7 +1444,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
else if (!memcmp("SWAPSPACE2",swap_header->magic.magic,10))
swap_header_version = 2;
else {
- printk("Unable to find swap-space signature\n");
+ printk(KERN_ERR "Unable to find swap-space signature\n");
error = -EINVAL;
goto bad_swap;
}
@@ -1473,7 +1495,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
goto bad_swap;
if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
goto bad_swap;
-
+
/* OK, set up the swap map and apply the bad block list */
if (!(p->swap_map = vmalloc(maxpages * sizeof(short)))) {
error = -ENOMEM;
@@ -1482,17 +1504,17 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
error = 0;
memset(p->swap_map, 0, maxpages * sizeof(short));
- for (i=0; i<swap_header->info.nr_badpages; i++) {
- int page = swap_header->info.badpages[i];
- if (page <= 0 || page >= swap_header->info.last_page)
+ for (i = 0; i < swap_header->info.nr_badpages; i++) {
+ int page_nr = swap_header->info.badpages[i];
+ if (page_nr <= 0 || page_nr >= swap_header->info.last_page)
error = -EINVAL;
else
- p->swap_map[page] = SWAP_MAP_BAD;
+ p->swap_map[page_nr] = SWAP_MAP_BAD;
}
nr_good_pages = swap_header->info.last_page -
swap_header->info.nr_badpages -
1 /* header page */;
- if (error)
+ if (error)
goto bad_swap;
}
@@ -1519,7 +1541,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
goto bad_swap;
}
- down(&swapon_sem);
+ mutex_lock(&swapon_mutex);
spin_lock(&swap_lock);
p->flags = SWP_ACTIVE;
nr_swap_pages += nr_good_pages;
@@ -1545,7 +1567,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
swap_info[prev].next = p - swap_info;
}
spin_unlock(&swap_lock);
- up(&swapon_sem);
+ mutex_unlock(&swapon_mutex);
error = 0;
goto out;
bad_swap:
@@ -1576,7 +1598,7 @@ out:
if (did_down) {
if (!error)
inode->i_flags |= S_SWAPFILE;
- up(&inode->i_sem);
+ mutex_unlock(&inode->i_mutex);
}
return error;
}
diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c
index b58abcf44ed..f9d6a9cc91c 100644
--- a/mm/tiny-shmem.c
+++ b/mm/tiny-shmem.c
@@ -81,13 +81,19 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
goto close_file;
d_instantiate(dentry, inode);
- inode->i_size = size;
inode->i_nlink = 0; /* It is unlinked */
+
file->f_vfsmnt = mntget(shm_mnt);
file->f_dentry = dentry;
file->f_mapping = inode->i_mapping;
file->f_op = &ramfs_file_operations;
file->f_mode = FMODE_WRITE | FMODE_READ;
+
+ /* notify everyone as to the change of file size */
+ error = do_truncate(dentry, size, 0, file);
+ if (error < 0)
+ goto close_file;
+
return file;
close_file:
@@ -123,3 +129,24 @@ int shmem_unuse(swp_entry_t entry, struct page *page)
{
return 0;
}
+
+int shmem_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ file_accessed(file);
+#ifndef CONFIG_MMU
+ return ramfs_nommu_mmap(file, vma);
+#else
+ return 0;
+#endif
+}
+
+#ifndef CONFIG_MMU
+unsigned long shmem_get_unmapped_area(struct file *file,
+ unsigned long addr,
+ unsigned long len,
+ unsigned long pgoff,
+ unsigned long flags)
+{
+ return ramfs_nommu_get_unmapped_area(file, addr, len, pgoff, flags);
+}
+#endif
diff --git a/mm/truncate.c b/mm/truncate.c
index 9173ab50060..6cb3fff25f6 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -82,12 +82,15 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
}
/**
- * truncate_inode_pages - truncate *all* the pages from an offset
+ * truncate_inode_pages - truncate range of pages specified by start and
+ * end byte offsets
* @mapping: mapping to truncate
* @lstart: offset from which to truncate
+ * @lend: offset to which to truncate
*
- * Truncate the page cache at a set offset, removing the pages that are beyond
- * that offset (and zeroing out partial pages).
+ * Truncate the page cache, removing the pages that are between
+ * specified offsets (and zeroing out partial page
+ * (if lstart is not page aligned)).
*
* Truncate takes two passes - the first pass is nonblocking. It will not
* block on page locks and it will not block on writeback. The second pass
@@ -101,12 +104,12 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
* We pass down the cache-hot hint to the page freeing code. Even if the
* mapping is large, it is probably the case that the final pages are the most
* recently touched, and freeing happens in ascending file offset order.
- *
- * Called under (and serialised by) inode->i_sem.
*/
-void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
+void truncate_inode_pages_range(struct address_space *mapping,
+ loff_t lstart, loff_t lend)
{
const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
+ pgoff_t end;
const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
struct pagevec pvec;
pgoff_t next;
@@ -115,13 +118,22 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
if (mapping->nrpages == 0)
return;
+ BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1));
+ end = (lend >> PAGE_CACHE_SHIFT);
+
pagevec_init(&pvec, 0);
next = start;
- while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
+ while (next <= end &&
+ pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
pgoff_t page_index = page->index;
+ if (page_index > end) {
+ next = page_index;
+ break;
+ }
+
if (page_index > next)
next = page_index;
next++;
@@ -157,9 +169,15 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
next = start;
continue;
}
+ if (pvec.pages[0]->index > end) {
+ pagevec_release(&pvec);
+ break;
+ }
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
+ if (page->index > end)
+ break;
lock_page(page);
wait_on_page_writeback(page);
if (page->index > next)
@@ -171,7 +189,19 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
pagevec_release(&pvec);
}
}
+EXPORT_SYMBOL(truncate_inode_pages_range);
+/**
+ * truncate_inode_pages - truncate *all* the pages from an offset
+ * @mapping: mapping to truncate
+ * @lstart: offset from which to truncate
+ *
+ * Called under (and serialised by) inode->i_mutex.
+ */
+void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
+{
+ truncate_inode_pages_range(mapping, lstart, (loff_t)-1);
+}
EXPORT_SYMBOL(truncate_inode_pages);
/**
@@ -219,7 +249,6 @@ unlock:
break;
}
pagevec_release(&pvec);
- cond_resched();
}
return ret;
}
diff --git a/mm/util.c b/mm/util.c
new file mode 100644
index 00000000000..5f4bb59da63
--- /dev/null
+++ b/mm/util.c
@@ -0,0 +1,39 @@
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/module.h>
+
+/**
+ * kzalloc - allocate memory. The memory is set to zero.
+ * @size: how many bytes of memory are required.
+ * @flags: the type of memory to allocate.
+ */
+void *kzalloc(size_t size, gfp_t flags)
+{
+ void *ret = kmalloc(size, flags);
+ if (ret)
+ memset(ret, 0, size);
+ return ret;
+}
+EXPORT_SYMBOL(kzalloc);
+
+/*
+ * kstrdup - allocate space for and copy an existing string
+ *
+ * @s: the string to duplicate
+ * @gfp: the GFP mask used in the kmalloc() call when allocating memory
+ */
+char *kstrdup(const char *s, gfp_t gfp)
+{
+ size_t len;
+ char *buf;
+
+ if (!s)
+ return NULL;
+
+ len = strlen(s) + 1;
+ buf = kmalloc(len, gfp);
+ if (buf)
+ memcpy(buf, s, len);
+ return buf;
+}
+EXPORT_SYMBOL(kstrdup);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b0cd81c32de..2e34b61a70c 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -63,9 +63,6 @@ struct scan_control {
unsigned long nr_mapped; /* From page_state */
- /* How many pages shrink_cache() should reclaim */
- int nr_to_reclaim;
-
/* Ask shrink_caches, or shrink_zone to scan at this priority */
unsigned int priority;
@@ -186,8 +183,7 @@ EXPORT_SYMBOL(remove_shrinker);
*
* Returns the number of slab objects which we shrunk.
*/
-static int shrink_slab(unsigned long scanned, gfp_t gfp_mask,
- unsigned long lru_pages)
+int shrink_slab(unsigned long scanned, gfp_t gfp_mask, unsigned long lru_pages)
{
struct shrinker *shrinker;
int ret = 0;
@@ -275,9 +271,7 @@ static inline int is_page_cache_freeable(struct page *page)
static int may_write_to_queue(struct backing_dev_info *bdi)
{
- if (current_is_kswapd())
- return 1;
- if (current_is_pdflush()) /* This is unlikely, but why not... */
+ if (current->flags & PF_SWAPWRITE)
return 1;
if (!bdi_write_congested(bdi))
return 1;
@@ -367,7 +361,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
res = mapping->a_ops->writepage(page, &wbc);
if (res < 0)
handle_write_error(mapping, page, res);
- if (res == WRITEPAGE_ACTIVATE) {
+ if (res == AOP_WRITEPAGE_ACTIVATE) {
ClearPageReclaim(page);
return PAGE_ACTIVATE;
}
@@ -382,6 +376,43 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
return PAGE_CLEAN;
}
+static int remove_mapping(struct address_space *mapping, struct page *page)
+{
+ if (!mapping)
+ return 0; /* truncate got there first */
+
+ write_lock_irq(&mapping->tree_lock);
+
+ /*
+ * The non-racy check for busy page. It is critical to check
+ * PageDirty _after_ making sure that the page is freeable and
+ * not in use by anybody. (pagecache + us == 2)
+ */
+ if (unlikely(page_count(page) != 2))
+ goto cannot_free;
+ smp_rmb();
+ if (unlikely(PageDirty(page)))
+ goto cannot_free;
+
+ if (PageSwapCache(page)) {
+ swp_entry_t swap = { .val = page_private(page) };
+ __delete_from_swap_cache(page);
+ write_unlock_irq(&mapping->tree_lock);
+ swap_free(swap);
+ __put_page(page); /* The pagecache ref */
+ return 1;
+ }
+
+ __remove_from_page_cache(page);
+ write_unlock_irq(&mapping->tree_lock);
+ __put_page(page);
+ return 1;
+
+cannot_free:
+ write_unlock_irq(&mapping->tree_lock);
+ return 0;
+}
+
/*
* shrink_list adds the number of reclaimed pages to sc->nr_reclaimed
*/
@@ -432,7 +463,7 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
if (PageAnon(page) && !PageSwapCache(page)) {
if (!sc->may_swap)
goto keep_locked;
- if (!add_to_swap(page))
+ if (!add_to_swap(page, GFP_ATOMIC))
goto activate_locked;
}
#endif /* CONFIG_SWAP */
@@ -515,36 +546,8 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
goto free_it;
}
- if (!mapping)
- goto keep_locked; /* truncate got there first */
-
- write_lock_irq(&mapping->tree_lock);
-
- /*
- * The non-racy check for busy page. It is critical to check
- * PageDirty _after_ making sure that the page is freeable and
- * not in use by anybody. (pagecache + us == 2)
- */
- if (unlikely(page_count(page) != 2))
- goto cannot_free;
- smp_rmb();
- if (unlikely(PageDirty(page)))
- goto cannot_free;
-
-#ifdef CONFIG_SWAP
- if (PageSwapCache(page)) {
- swp_entry_t swap = { .val = page_private(page) };
- __delete_from_swap_cache(page);
- write_unlock_irq(&mapping->tree_lock);
- swap_free(swap);
- __put_page(page); /* The pagecache ref */
- goto free_it;
- }
-#endif /* CONFIG_SWAP */
-
- __remove_from_page_cache(page);
- write_unlock_irq(&mapping->tree_lock);
- __put_page(page);
+ if (!remove_mapping(mapping, page))
+ goto keep_locked;
free_it:
unlock_page(page);
@@ -553,10 +556,6 @@ free_it:
__pagevec_release_nonlru(&freed_pvec);
continue;
-cannot_free:
- write_unlock_irq(&mapping->tree_lock);
- goto keep_locked;
-
activate_locked:
SetPageActive(page);
pgactivate++;
@@ -574,6 +573,228 @@ keep:
return reclaimed;
}
+#ifdef CONFIG_MIGRATION
+static inline void move_to_lru(struct page *page)
+{
+ list_del(&page->lru);
+ if (PageActive(page)) {
+ /*
+ * lru_cache_add_active checks that
+ * the PG_active bit is off.
+ */
+ ClearPageActive(page);
+ lru_cache_add_active(page);
+ } else {
+ lru_cache_add(page);
+ }
+ put_page(page);
+}
+
+/*
+ * Add isolated pages on the list back to the LRU.
+ *
+ * returns the number of pages put back.
+ */
+int putback_lru_pages(struct list_head *l)
+{
+ struct page *page;
+ struct page *page2;
+ int count = 0;
+
+ list_for_each_entry_safe(page, page2, l, lru) {
+ move_to_lru(page);
+ count++;
+ }
+ return count;
+}
+
+/*
+ * swapout a single page
+ * page is locked upon entry, unlocked on exit
+ */
+static int swap_page(struct page *page)
+{
+ struct address_space *mapping = page_mapping(page);
+
+ if (page_mapped(page) && mapping)
+ if (try_to_unmap(page) != SWAP_SUCCESS)
+ goto unlock_retry;
+
+ if (PageDirty(page)) {
+ /* Page is dirty, try to write it out here */
+ switch(pageout(page, mapping)) {
+ case PAGE_KEEP:
+ case PAGE_ACTIVATE:
+ goto unlock_retry;
+
+ case PAGE_SUCCESS:
+ goto retry;
+
+ case PAGE_CLEAN:
+ ; /* try to free the page below */
+ }
+ }
+
+ if (PagePrivate(page)) {
+ if (!try_to_release_page(page, GFP_KERNEL) ||
+ (!mapping && page_count(page) == 1))
+ goto unlock_retry;
+ }
+
+ if (remove_mapping(mapping, page)) {
+ /* Success */
+ unlock_page(page);
+ return 0;
+ }
+
+unlock_retry:
+ unlock_page(page);
+
+retry:
+ return -EAGAIN;
+}
+/*
+ * migrate_pages
+ *
+ * Two lists are passed to this function. The first list
+ * contains the pages isolated from the LRU to be migrated.
+ * The second list contains new pages that the pages isolated
+ * can be moved to. If the second list is NULL then all
+ * pages are swapped out.
+ *
+ * The function returns after 10 attempts or if no pages
+ * are movable anymore because t has become empty
+ * or no retryable pages exist anymore.
+ *
+ * SIMPLIFIED VERSION: This implementation of migrate_pages
+ * is only swapping out pages and never touches the second
+ * list. The direct migration patchset
+ * extends this function to avoid the use of swap.
+ *
+ * Return: Number of pages not migrated when "to" ran empty.
+ */
+int migrate_pages(struct list_head *from, struct list_head *to,
+ struct list_head *moved, struct list_head *failed)
+{
+ int retry;
+ int nr_failed = 0;
+ int pass = 0;
+ struct page *page;
+ struct page *page2;
+ int swapwrite = current->flags & PF_SWAPWRITE;
+ int rc;
+
+ if (!swapwrite)
+ current->flags |= PF_SWAPWRITE;
+
+redo:
+ retry = 0;
+
+ list_for_each_entry_safe(page, page2, from, lru) {
+ cond_resched();
+
+ rc = 0;
+ if (page_count(page) == 1)
+ /* page was freed from under us. So we are done. */
+ goto next;
+
+ /*
+ * Skip locked pages during the first two passes to give the
+ * functions holding the lock time to release the page. Later we
+ * use lock_page() to have a higher chance of acquiring the
+ * lock.
+ */
+ rc = -EAGAIN;
+ if (pass > 2)
+ lock_page(page);
+ else
+ if (TestSetPageLocked(page))
+ goto next;
+
+ /*
+ * Only wait on writeback if we have already done a pass where
+ * we we may have triggered writeouts for lots of pages.
+ */
+ if (pass > 0) {
+ wait_on_page_writeback(page);
+ } else {
+ if (PageWriteback(page))
+ goto unlock_page;
+ }
+
+ /*
+ * Anonymous pages must have swap cache references otherwise
+ * the information contained in the page maps cannot be
+ * preserved.
+ */
+ if (PageAnon(page) && !PageSwapCache(page)) {
+ if (!add_to_swap(page, GFP_KERNEL)) {
+ rc = -ENOMEM;
+ goto unlock_page;
+ }
+ }
+
+ /*
+ * Page is properly locked and writeback is complete.
+ * Try to migrate the page.
+ */
+ rc = swap_page(page);
+ goto next;
+
+unlock_page:
+ unlock_page(page);
+
+next:
+ if (rc == -EAGAIN) {
+ retry++;
+ } else if (rc) {
+ /* Permanent failure */
+ list_move(&page->lru, failed);
+ nr_failed++;
+ } else {
+ /* Success */
+ list_move(&page->lru, moved);
+ }
+ }
+ if (retry && pass++ < 10)
+ goto redo;
+
+ if (!swapwrite)
+ current->flags &= ~PF_SWAPWRITE;
+
+ return nr_failed + retry;
+}
+
+/*
+ * Isolate one page from the LRU lists and put it on the
+ * indicated list with elevated refcount.
+ *
+ * Result:
+ * 0 = page not on LRU list
+ * 1 = page removed from LRU list and added to the specified list.
+ */
+int isolate_lru_page(struct page *page)
+{
+ int ret = 0;
+
+ if (PageLRU(page)) {
+ struct zone *zone = page_zone(page);
+ spin_lock_irq(&zone->lru_lock);
+ if (TestClearPageLRU(page)) {
+ ret = 1;
+ get_page(page);
+ if (PageActive(page))
+ del_page_from_active_list(zone, page);
+ else
+ del_page_from_inactive_list(zone, page);
+ }
+ spin_unlock_irq(&zone->lru_lock);
+ }
+
+ return ret;
+}
+#endif
+
/*
* zone->lru_lock is heavily contended. Some of the functions that
* shrink the lists perform better by taking out a batch of pages
@@ -653,17 +874,17 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc)
goto done;
max_scan -= nr_scan;
- if (current_is_kswapd())
- mod_page_state_zone(zone, pgscan_kswapd, nr_scan);
- else
- mod_page_state_zone(zone, pgscan_direct, nr_scan);
nr_freed = shrink_list(&page_list, sc);
- if (current_is_kswapd())
- mod_page_state(kswapd_steal, nr_freed);
- mod_page_state_zone(zone, pgsteal, nr_freed);
- sc->nr_to_reclaim -= nr_freed;
- spin_lock_irq(&zone->lru_lock);
+ local_irq_disable();
+ if (current_is_kswapd()) {
+ __mod_page_state_zone(zone, pgscan_kswapd, nr_scan);
+ __mod_page_state(kswapd_steal, nr_freed);
+ } else
+ __mod_page_state_zone(zone, pgscan_direct, nr_scan);
+ __mod_page_state_zone(zone, pgsteal, nr_freed);
+
+ spin_lock(&zone->lru_lock);
/*
* Put back any unfreeable pages.
*/
@@ -825,11 +1046,13 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
}
}
zone->nr_active += pgmoved;
- spin_unlock_irq(&zone->lru_lock);
- pagevec_release(&pvec);
+ spin_unlock(&zone->lru_lock);
+
+ __mod_page_state_zone(zone, pgrefill, pgscanned);
+ __mod_page_state(pgdeactivate, pgdeactivate);
+ local_irq_enable();
- mod_page_state_zone(zone, pgrefill, pgscanned);
- mod_page_state(pgdeactivate, pgdeactivate);
+ pagevec_release(&pvec);
}
/*
@@ -861,8 +1084,6 @@ shrink_zone(struct zone *zone, struct scan_control *sc)
else
nr_inactive = 0;
- sc->nr_to_reclaim = sc->swap_cluster_max;
-
while (nr_active || nr_inactive) {
if (nr_active) {
sc->nr_to_scan = min(nr_active,
@@ -876,8 +1097,6 @@ shrink_zone(struct zone *zone, struct scan_control *sc)
(unsigned long)sc->swap_cluster_max);
nr_inactive -= sc->nr_to_scan;
shrink_cache(zone, sc);
- if (sc->nr_to_reclaim <= 0)
- break;
}
}
@@ -910,7 +1129,7 @@ shrink_caches(struct zone **zones, struct scan_control *sc)
for (i = 0; zones[i] != NULL; i++) {
struct zone *zone = zones[i];
- if (zone->present_pages == 0)
+ if (!populated_zone(zone))
continue;
if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
@@ -1084,7 +1303,7 @@ loop_again:
for (i = pgdat->nr_zones - 1; i >= 0; i--) {
struct zone *zone = pgdat->node_zones + i;
- if (zone->present_pages == 0)
+ if (!populated_zone(zone))
continue;
if (zone->all_unreclaimable &&
@@ -1121,7 +1340,7 @@ scan:
struct zone *zone = pgdat->node_zones + i;
int nr_slab;
- if (zone->present_pages == 0)
+ if (!populated_zone(zone))
continue;
if (zone->all_unreclaimable && priority != DEF_PRIORITY)
@@ -1238,7 +1457,7 @@ static int kswapd(void *p)
* us from recursively trying to free more memory as we're
* trying to free the first piece of memory in the first place).
*/
- tsk->flags |= PF_MEMALLOC|PF_KSWAPD;
+ tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
order = 0;
for ( ; ; ) {
@@ -1273,7 +1492,7 @@ void wakeup_kswapd(struct zone *zone, int order)
{
pg_data_t *pgdat;
- if (zone->present_pages == 0)
+ if (!populated_zone(zone))
return;
pgdat = zone->zone_pgdat;
@@ -1354,30 +1573,51 @@ static int __init kswapd_init(void)
module_init(kswapd_init)
+#ifdef CONFIG_NUMA
+/*
+ * Zone reclaim mode
+ *
+ * If non-zero call zone_reclaim when the number of free pages falls below
+ * the watermarks.
+ *
+ * In the future we may add flags to the mode. However, the page allocator
+ * should only have to check that zone_reclaim_mode != 0 before calling
+ * zone_reclaim().
+ */
+int zone_reclaim_mode __read_mostly;
/*
+ * Mininum time between zone reclaim scans
+ */
+#define ZONE_RECLAIM_INTERVAL HZ/2
+/*
* Try to free up some pages from this zone through reclaim.
*/
int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
{
- struct scan_control sc;
int nr_pages = 1 << order;
- int total_reclaimed = 0;
+ struct task_struct *p = current;
+ struct reclaim_state reclaim_state;
+ struct scan_control sc = {
+ .gfp_mask = gfp_mask,
+ .may_writepage = 0,
+ .may_swap = 0,
+ .nr_mapped = read_page_state(nr_mapped),
+ .nr_scanned = 0,
+ .nr_reclaimed = 0,
+ .priority = 0
+ };
- /* The reclaim may sleep, so don't do it if sleep isn't allowed */
- if (!(gfp_mask & __GFP_WAIT))
- return 0;
- if (zone->all_unreclaimable)
- return 0;
+ if (!(gfp_mask & __GFP_WAIT) ||
+ zone->zone_pgdat->node_id != numa_node_id() ||
+ zone->all_unreclaimable ||
+ atomic_read(&zone->reclaim_in_progress) > 0)
+ return 0;
+
+ if (time_before(jiffies,
+ zone->last_unsuccessful_zone_reclaim + ZONE_RECLAIM_INTERVAL))
+ return 0;
- sc.gfp_mask = gfp_mask;
- sc.may_writepage = 0;
- sc.may_swap = 0;
- sc.nr_mapped = read_page_state(nr_mapped);
- sc.nr_scanned = 0;
- sc.nr_reclaimed = 0;
- /* scan at the highest priority */
- sc.priority = 0;
disable_swap_token();
if (nr_pages > SWAP_CLUSTER_MAX)
@@ -1385,44 +1625,18 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
else
sc.swap_cluster_max = SWAP_CLUSTER_MAX;
- /* Don't reclaim the zone if there are other reclaimers active */
- if (atomic_read(&zone->reclaim_in_progress) > 0)
- goto out;
-
+ cond_resched();
+ p->flags |= PF_MEMALLOC;
+ reclaim_state.reclaimed_slab = 0;
+ p->reclaim_state = &reclaim_state;
shrink_zone(zone, &sc);
- total_reclaimed = sc.nr_reclaimed;
-
- out:
- return total_reclaimed;
-}
-
-asmlinkage long sys_set_zone_reclaim(unsigned int node, unsigned int zone,
- unsigned int state)
-{
- struct zone *z;
- int i;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EACCES;
+ p->reclaim_state = NULL;
+ current->flags &= ~PF_MEMALLOC;
- if (node >= MAX_NUMNODES || !node_online(node))
- return -EINVAL;
+ if (sc.nr_reclaimed == 0)
+ zone->last_unsuccessful_zone_reclaim = jiffies;
- /* This will break if we ever add more zones */
- if (!(zone & (1<<ZONE_DMA|1<<ZONE_NORMAL|1<<ZONE_HIGHMEM)))
- return -EINVAL;
-
- for (i = 0; i < MAX_NR_ZONES; i++) {
- if (!(zone & 1<<i))
- continue;
-
- z = &NODE_DATA(node)->node_zones[i];
-
- if (state)
- z->reclaim_pages = 1;
- else
- z->reclaim_pages = 0;
- }
-
- return 0;
+ return sc.nr_reclaimed > nr_pages;
}
+#endif
+