Diffstat (limited to 'mm')
-rw-r--r--  mm/allocpercpu.c     |    9
-rw-r--r--  mm/bootmem.c         |    6
-rw-r--r--  mm/bounce.c          |    4
-rw-r--r--  mm/fadvise.c         |    2
-rw-r--r--  mm/filemap.c         |   13
-rw-r--r--  mm/filemap_xip.c     |    4
-rw-r--r--  mm/fremap.c          |    4
-rw-r--r--  mm/hugetlb.c         |   30
-rw-r--r--  mm/memory.c          |   51
-rw-r--r--  mm/memory_hotplug.c  |    7
-rw-r--r--  mm/mempolicy.c       |   12
-rw-r--r--  mm/migrate.c         |   19
-rw-r--r--  mm/mincore.c         |  183
-rw-r--r--  mm/mlock.c           |    2
-rw-r--r--  mm/mmap.c            |   14
-rw-r--r--  mm/mmzone.c          |    5
-rw-r--r--  mm/nommu.c           |   30
-rw-r--r--  mm/oom_kill.c        |   62
-rw-r--r--  mm/page-writeback.c  |  106
-rw-r--r--  mm/page_alloc.c      |  406
-rw-r--r--  mm/page_io.c         |   45
-rw-r--r--  mm/pdflush.c         |    1
-rw-r--r--  mm/readahead.c       |   12
-rw-r--r--  mm/rmap.c            |   36
-rw-r--r--  mm/shmem.c           |   35
-rw-r--r--  mm/slab.c            |  411
-rw-r--r--  mm/slob.c            |   27
-rw-r--r--  mm/sparse.c          |   23
-rw-r--r--  mm/swap.c            |   10
-rw-r--r--  mm/swapfile.c        |  102
-rw-r--r--  mm/thrash.c          |  116
-rw-r--r--  mm/tiny-shmem.c      |    4
-rw-r--r--  mm/truncate.c        |   41
-rw-r--r--  mm/vmscan.c          |   60
-rw-r--r--  mm/vmstat.c          |   22
35 files changed, 1224 insertions(+), 690 deletions(-)
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c
index eaa9abeea53..b2486cf887a 100644
--- a/mm/allocpercpu.c
+++ b/mm/allocpercpu.c
@@ -17,10 +17,9 @@
void percpu_depopulate(void *__pdata, int cpu)
{
struct percpu_data *pdata = __percpu_disguise(__pdata);
- if (pdata->ptrs[cpu]) {
- kfree(pdata->ptrs[cpu]);
- pdata->ptrs[cpu] = NULL;
- }
+
+ kfree(pdata->ptrs[cpu]);
+ pdata->ptrs[cpu] = NULL;
}
EXPORT_SYMBOL_GPL(percpu_depopulate);
@@ -123,6 +122,8 @@ EXPORT_SYMBOL_GPL(__percpu_alloc_mask);
*/
void percpu_free(void *__pdata)
{
+ if (unlikely(!__pdata))
+ return;
__percpu_depopulate_mask(__pdata, &cpu_possible_map);
kfree(__percpu_disguise(__pdata));
}
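
The net effect of the allocpercpu.c hunks is that percpu_free() (and percpu_depopulate()) now tolerate NULL, mirroring kfree(). A minimal caller-side sketch, illustrative only and not part of the diff; percpu_alloc() is assumed to be the <linux/percpu.h> wrapper of this kernel generation:

/*
 * Illustrative sketch only, not part of the commit above.  With
 * percpu_free() accepting NULL, teardown paths no longer need a
 * guard before freeing.  percpu_alloc() is assumed to be the
 * <linux/percpu.h> wrapper of this kernel generation.
 */
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/errno.h>

struct foo_stats {
	long *counters;		/* per-CPU allocation, may be NULL */
};

static int foo_stats_init(struct foo_stats *st)
{
	st->counters = percpu_alloc(sizeof(long), GFP_KERNEL);
	return st->counters ? 0 : -ENOMEM;
}

static void foo_stats_exit(struct foo_stats *st)
{
	percpu_free(st->counters);	/* safe even if init failed */
	st->counters = NULL;
}
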
diff --git a/mm/bootmem.c b/mm/bootmem.c
index d53112fcb40..00a96970b23 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -27,8 +27,6 @@ unsigned long max_low_pfn;
unsigned long min_low_pfn;
unsigned long max_pfn;
-EXPORT_UNUSED_SYMBOL(max_pfn); /* June 2006 */
-
static LIST_HEAD(bdata_list);
#ifdef CONFIG_CRASH_DUMP
/*
@@ -196,6 +194,10 @@ __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size,
if (limit && bdata->node_boot_start >= limit)
return NULL;
+ /* on nodes without memory - bootmem_map is NULL */
+ if (!bdata->node_bootmem_map)
+ return NULL;
+
end_pfn = bdata->node_low_pfn;
limit = PFN_DOWN(limit);
if (limit && end_pfn > limit)
diff --git a/mm/bounce.c b/mm/bounce.c
index e4b62d2a402..643efbe8240 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -237,6 +237,8 @@ static void __blk_queue_bounce(request_queue_t *q, struct bio **bio_orig,
if (!bio)
return;
+ blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE);
+
/*
* at least one page was bounced, fill in possible non-highmem
* pages
@@ -291,8 +293,6 @@ void blk_queue_bounce(request_queue_t *q, struct bio **bio_orig)
pool = isa_page_pool;
}
- blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE);
-
/*
* slow path
*/
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 168c78a121b..0df4c899e97 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -38,7 +38,7 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
if (!file)
return -EBADF;
- if (S_ISFIFO(file->f_dentry->d_inode->i_mode)) {
+ if (S_ISFIFO(file->f_path.dentry->d_inode->i_mode)) {
ret = -ESPIPE;
goto out;
}
diff --git a/mm/filemap.c b/mm/filemap.c
index 7b84dc81434..8332c77b1bd 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1181,8 +1181,6 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
if (pos < size) {
retval = generic_file_direct_IO(READ, iocb,
iov, pos, nr_segs);
- if (retval > 0 && !is_sync_kiocb(iocb))
- retval = -EIOCBQUEUED;
if (retval > 0)
*ppos = pos + retval;
}
@@ -1445,7 +1443,6 @@ no_cached_page:
* effect.
*/
error = page_cache_read(file, pgoff);
- grab_swap_token();
/*
* The page we want has now been added to the page cache.
@@ -1893,6 +1890,7 @@ int should_remove_suid(struct dentry *dentry)
return 0;
}
+EXPORT_SYMBOL(should_remove_suid);
int __remove_suid(struct dentry *dentry, int kill)
{
@@ -2047,15 +2045,14 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
* Sync the fs metadata but not the minor inode changes and
* of course not the data as we did direct DMA for the IO.
* i_mutex is held, which protects generic_osync_inode() from
- * livelocking.
+ * livelocking. AIO O_DIRECT ops attempt to sync metadata here.
*/
- if (written >= 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
+ if ((written >= 0 || written == -EIOCBQUEUED) &&
+ ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
int err = generic_osync_inode(inode, mapping, OSYNC_METADATA);
if (err < 0)
written = err;
}
- if (written == count && !is_sync_kiocb(iocb))
- written = -EIOCBQUEUED;
return written;
}
EXPORT_SYMBOL(generic_file_direct_write);
@@ -2269,7 +2266,7 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
if (count == 0)
goto out;
- err = remove_suid(file->f_dentry);
+ err = remove_suid(file->f_path.dentry);
if (err)
goto out;
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index b4fd0d7c9bf..45b3553865c 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -189,7 +189,7 @@ __xip_unmap (struct address_space * mapping,
/* Nuke the page table entry. */
flush_cache_page(vma, address, pte_pfn(*pte));
pteval = ptep_clear_flush(vma, address, pte);
- page_remove_rmap(page);
+ page_remove_rmap(page, vma);
dec_mm_counter(mm, file_rss);
BUG_ON(pte_dirty(pteval));
pte_unmap_unlock(pte, ptl);
@@ -379,7 +379,7 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len,
if (count == 0)
goto out_backing;
- ret = remove_suid(filp->f_dentry);
+ ret = remove_suid(filp->f_path.dentry);
if (ret)
goto out_backing;
diff --git a/mm/fremap.c b/mm/fremap.c
index 7a9d0f5d246..4e3f53dd5fd 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -33,7 +33,7 @@ static int zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
if (page) {
if (pte_dirty(pte))
set_page_dirty(page);
- page_remove_rmap(page);
+ page_remove_rmap(page, vma);
page_cache_release(page);
}
} else {
@@ -101,7 +101,6 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
{
int err = -ENOMEM;
pte_t *pte;
- pte_t pte_val;
spinlock_t *ptl;
pte = get_locked_pte(mm, addr, &ptl);
@@ -114,7 +113,6 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
}
set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff));
- pte_val = *pte;
/*
* We don't need to run update_mmu_cache() here because the "file pte"
* being installed by install_file_pte() is not a real pte - it's a
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a088f593a80..cb362f761f1 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -44,14 +44,14 @@ static void clear_huge_page(struct page *page, unsigned long addr)
}
static void copy_huge_page(struct page *dst, struct page *src,
- unsigned long addr)
+ unsigned long addr, struct vm_area_struct *vma)
{
int i;
might_sleep();
for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) {
cond_resched();
- copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE);
+ copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
}
}
@@ -73,7 +73,7 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma,
for (z = zonelist->zones; *z; z++) {
nid = zone_to_nid(*z);
- if (cpuset_zone_allowed(*z, GFP_HIGHUSER) &&
+ if (cpuset_zone_allowed_softwall(*z, GFP_HIGHUSER) &&
!list_empty(&hugepage_freelists[nid]))
break;
}
@@ -109,7 +109,7 @@ static int alloc_fresh_huge_page(void)
if (nid == MAX_NUMNODES)
nid = first_node(node_online_map);
if (page) {
- page[1].lru.next = (void *)free_huge_page; /* dtor */
+ set_compound_page_dtor(page, free_huge_page);
spin_lock(&hugetlb_lock);
nr_huge_pages++;
nr_huge_pages_node[page_to_nid(page)]++;
@@ -344,7 +344,6 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
entry = *src_pte;
ptepage = pte_page(entry);
get_page(ptepage);
- add_mm_counter(dst, file_rss, HPAGE_SIZE / PAGE_SIZE);
set_huge_pte_at(dst, addr, dst_pte, entry);
}
spin_unlock(&src->page_table_lock);
@@ -365,6 +364,11 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
pte_t pte;
struct page *page;
struct page *tmp;
+ /*
+ * A page gathering list, protected by per file i_mmap_lock. The
+ * lock is used to avoid list corruption from multiple unmapping
+ * of the same page since we are using page->lru.
+ */
LIST_HEAD(page_list);
WARN_ON(!is_vm_hugetlb_page(vma));
@@ -372,24 +376,21 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
BUG_ON(end & ~HPAGE_MASK);
spin_lock(&mm->page_table_lock);
-
- /* Update high watermark before we lower rss */
- update_hiwater_rss(mm);
-
for (address = start; address < end; address += HPAGE_SIZE) {
ptep = huge_pte_offset(mm, address);
if (!ptep)
continue;
+ if (huge_pmd_unshare(mm, &address, ptep))
+ continue;
+
pte = huge_ptep_get_and_clear(mm, address, ptep);
if (pte_none(pte))
continue;
page = pte_page(pte);
list_add(&page->lru, &page_list);
- add_mm_counter(mm, file_rss, (int) -(HPAGE_SIZE / PAGE_SIZE));
}
-
spin_unlock(&mm->page_table_lock);
flush_tlb_range(vma, start, end);
list_for_each_entry_safe(page, tmp, &page_list, lru) {
@@ -441,7 +442,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
}
spin_unlock(&mm->page_table_lock);
- copy_huge_page(new_page, old_page, address);
+ copy_huge_page(new_page, old_page, address, vma);
spin_lock(&mm->page_table_lock);
ptep = huge_pte_offset(mm, address & HPAGE_MASK);
@@ -515,7 +516,6 @@ retry:
if (!pte_none(*ptep))
goto backout;
- add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
&& (vma->vm_flags & VM_SHARED)));
set_huge_pte_at(mm, address, ptep, new_pte);
@@ -653,11 +653,14 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
BUG_ON(address >= end);
flush_cache_range(vma, address, end);
+ spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
spin_lock(&mm->page_table_lock);
for (; address < end; address += HPAGE_SIZE) {
ptep = huge_pte_offset(mm, address);
if (!ptep)
continue;
+ if (huge_pmd_unshare(mm, &address, ptep))
+ continue;
if (!pte_none(*ptep)) {
pte = huge_ptep_get_and_clear(mm, address, ptep);
pte = pte_mkhuge(pte_modify(pte, newprot));
@@ -666,6 +669,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
}
}
spin_unlock(&mm->page_table_lock);
+ spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
flush_tlb_range(vma, start, end);
}
diff --git a/mm/memory.c b/mm/memory.c
index 156861fcac4..af227d26e10 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -681,7 +681,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
mark_page_accessed(page);
file_rss--;
}
- page_remove_rmap(page);
+ page_remove_rmap(page, vma);
tlb_remove_page(tlb, page);
continue;
}
@@ -1091,7 +1091,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
if (pages) {
pages[i] = page;
- flush_anon_page(page, start);
+ flush_anon_page(vma, page, start);
flush_dcache_page(page);
}
if (vmas)
@@ -1110,23 +1110,29 @@ static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd,
{
pte_t *pte;
spinlock_t *ptl;
+ int err = 0;
pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
if (!pte)
- return -ENOMEM;
+ return -EAGAIN;
arch_enter_lazy_mmu_mode();
do {
struct page *page = ZERO_PAGE(addr);
pte_t zero_pte = pte_wrprotect(mk_pte(page, prot));
+
+ if (unlikely(!pte_none(*pte))) {
+ err = -EEXIST;
+ pte++;
+ break;
+ }
page_cache_get(page);
page_add_file_rmap(page);
inc_mm_counter(mm, file_rss);
- BUG_ON(!pte_none(*pte));
set_pte_at(mm, addr, pte, zero_pte);
} while (pte++, addr += PAGE_SIZE, addr != end);
arch_leave_lazy_mmu_mode();
pte_unmap_unlock(pte - 1, ptl);
- return 0;
+ return err;
}
static inline int zeromap_pmd_range(struct mm_struct *mm, pud_t *pud,
@@ -1134,16 +1140,18 @@ static inline int zeromap_pmd_range(struct mm_struct *mm, pud_t *pud,
{
pmd_t *pmd;
unsigned long next;
+ int err;
pmd = pmd_alloc(mm, pud, addr);
if (!pmd)
- return -ENOMEM;
+ return -EAGAIN;
do {
next = pmd_addr_end(addr, end);
- if (zeromap_pte_range(mm, pmd, addr, next, prot))
- return -ENOMEM;
+ err = zeromap_pte_range(mm, pmd, addr, next, prot);
+ if (err)
+ break;
} while (pmd++, addr = next, addr != end);
- return 0;
+ return err;
}
static inline int zeromap_pud_range(struct mm_struct *mm, pgd_t *pgd,
@@ -1151,16 +1159,18 @@ static inline int zeromap_pud_range(struct mm_struct *mm, pgd_t *pgd,
{
pud_t *pud;
unsigned long next;
+ int err;
pud = pud_alloc(mm, pgd, addr);
if (!pud)
- return -ENOMEM;
+ return -EAGAIN;
do {
next = pud_addr_end(addr, end);
- if (zeromap_pmd_range(mm, pud, addr, next, prot))
- return -ENOMEM;
+ err = zeromap_pmd_range(mm, pud, addr, next, prot);
+ if (err)
+ break;
} while (pud++, addr = next, addr != end);
- return 0;
+ return err;
}
int zeromap_page_range(struct vm_area_struct *vma,
@@ -1431,7 +1441,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
return pte;
}
-static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va)
+static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
{
/*
* If the source page was a PFN mapping, we don't have
@@ -1454,9 +1464,9 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
kunmap_atomic(kaddr, KM_USER0);
flush_dcache_page(dst);
return;
-
+
}
- copy_user_highpage(dst, src, va);
+ copy_user_highpage(dst, src, va, vma);
}
/*
@@ -1567,7 +1577,7 @@ gotten:
new_page = alloc_page_vma(GFP_HIGHUSER, vma, address);
if (!new_page)
goto oom;
- cow_user_page(new_page, old_page, address);
+ cow_user_page(new_page, old_page, address, vma);
}
/*
@@ -1576,7 +1586,7 @@ gotten:
page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
if (likely(pte_same(*page_table, orig_pte))) {
if (old_page) {
- page_remove_rmap(old_page);
+ page_remove_rmap(old_page, vma);
if (!PageAnon(old_page)) {
dec_mm_counter(mm, file_rss);
inc_mm_counter(mm, anon_rss);
@@ -1902,7 +1912,6 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
return 0;
}
-EXPORT_UNUSED_SYMBOL(vmtruncate_range); /* June 2006 */
/**
* swapin_readahead - swap in pages in hope we need them soon
@@ -1991,6 +2000,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
delayacct_set_flag(DELAYACCT_PF_SWAPIN);
page = lookup_swap_cache(entry);
if (!page) {
+ grab_swap_token(); /* Contend for token _before_ read-in */
swapin_readahead(entry, address, vma);
page = read_swap_cache_async(entry, vma, address);
if (!page) {
@@ -2008,7 +2018,6 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
/* Had to read the page from swap area: Major fault */
ret = VM_FAULT_MAJOR;
count_vm_event(PGMAJFAULT);
- grab_swap_token();
}
delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
@@ -2191,7 +2200,7 @@ retry:
page = alloc_page_vma(GFP_HIGHUSER, vma, address);
if (!page)
goto oom;
- copy_user_highpage(page, new_page, address);
+ copy_user_highpage(page, new_page, address, vma);
page_cache_release(new_page);
new_page = page;
anon = 1;
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index fd678a662ea..84279127fcd 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -67,12 +67,13 @@ static int __add_zone(struct zone *zone, unsigned long phys_start_pfn)
zone_type = zone - pgdat->node_zones;
if (!populated_zone(zone)) {
int ret = 0;
- ret = init_currently_empty_zone(zone, phys_start_pfn, nr_pages);
+ ret = init_currently_empty_zone(zone, phys_start_pfn,
+ nr_pages, MEMMAP_HOTPLUG);
if (ret < 0)
return ret;
}
- memmap_init_zone(nr_pages, nid, zone_type, phys_start_pfn);
- zonetable_add(zone, nid, zone_type, phys_start_pfn, nr_pages);
+ memmap_init_zone(nr_pages, nid, zone_type,
+ phys_start_pfn, MEMMAP_HOTPLUG);
return 0;
}
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 617fb31086e..da946394655 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -141,9 +141,11 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes)
enum zone_type k;
max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
+ max++; /* space for zlcache_ptr (see mmzone.h) */
zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
if (!zl)
return NULL;
+ zl->zlcache_ptr = NULL;
num = 0;
/* First put in the highest zones from all nodes, then all the next
lower zones etc. Avoid empty zones because the memory allocator
@@ -219,7 +221,7 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
do {
struct page *page;
- unsigned int nid;
+ int nid;
if (!pte_present(*pte))
continue;
@@ -1324,7 +1326,7 @@ struct mempolicy *__mpol_copy(struct mempolicy *old)
atomic_set(&new->refcnt, 1);
if (new->policy == MPOL_BIND) {
int sz = ksize(old->v.zonelist);
- new->v.zonelist = kmemdup(old->v.zonelist, sz, SLAB_KERNEL);
+ new->v.zonelist = kmemdup(old->v.zonelist, sz, GFP_KERNEL);
if (!new->v.zonelist) {
kmem_cache_free(policy_cache, new);
return ERR_PTR(-ENOMEM);
@@ -1705,8 +1707,8 @@ void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
* Display pages allocated per node and memory policy via /proc.
*/
-static const char *policy_types[] = { "default", "prefer", "bind",
- "interleave" };
+static const char * const policy_types[] =
+ { "default", "prefer", "bind", "interleave" };
/*
* Convert a mempolicy into a string.
@@ -1855,7 +1857,7 @@ int show_numa_map(struct seq_file *m, void *v)
if (file) {
seq_printf(m, " file=");
- seq_path(m, file->f_vfsmnt, file->f_dentry, "\n\t= ");
+ seq_path(m, file->f_path.mnt, file->f_path.dentry, "\n\t= ");
} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
seq_printf(m, " heap");
} else if (vma->vm_start <= mm->start_stack &&
diff --git a/mm/migrate.c b/mm/migrate.c
index b4979d423d2..e9b161bde95 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -294,7 +294,7 @@ out:
static int migrate_page_move_mapping(struct address_space *mapping,
struct page *newpage, struct page *page)
{
- struct page **radix_pointer;
+ void **pslot;
if (!mapping) {
/* Anonymous page */
@@ -305,12 +305,11 @@ static int migrate_page_move_mapping(struct address_space *mapping,
write_lock_irq(&mapping->tree_lock);
- radix_pointer = (struct page **)radix_tree_lookup_slot(
- &mapping->page_tree,
- page_index(page));
+ pslot = radix_tree_lookup_slot(&mapping->page_tree,
+ page_index(page));
if (page_count(page) != 2 + !!PagePrivate(page) ||
- *radix_pointer != page) {
+ (struct page *)radix_tree_deref_slot(pslot) != page) {
write_unlock_irq(&mapping->tree_lock);
return -EAGAIN;
}
@@ -318,7 +317,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
/*
* Now we know that no one else is looking at the page.
*/
- get_page(newpage);
+ get_page(newpage); /* add cache reference */
#ifdef CONFIG_SWAP
if (PageSwapCache(page)) {
SetPageSwapCache(newpage);
@@ -326,8 +325,14 @@ static int migrate_page_move_mapping(struct address_space *mapping,
}
#endif
- *radix_pointer = newpage;
+ radix_tree_replace_slot(pslot, newpage);
+
+ /*
+ * Drop cache reference from old page.
+ * We know this isn't the last reference.
+ */
__put_page(page);
+
write_unlock_irq(&mapping->tree_lock);
return 0;
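
The migrate.c conversion above follows the usual radix-tree slot idiom. A generic sketch of that idiom, illustrative only and using just the radix-tree calls already visible in the hunk; the caller is assumed to hold whatever lock protects the tree (mapping->tree_lock in the migration case):

/*
 * Illustrative sketch, not part of the commit: the lookup-slot /
 * deref / replace idiom used by migrate_page_move_mapping() above,
 * applied to an arbitrary radix tree.
 */
#include <linux/radix-tree.h>
#include <linux/errno.h>

static int replace_item(struct radix_tree_root *root, unsigned long index,
			void *old, void *new)
{
	void **pslot;

	pslot = radix_tree_lookup_slot(root, index);
	if (!pslot || radix_tree_deref_slot(pslot) != old)
		return -EAGAIN;			/* item changed under us */

	radix_tree_replace_slot(pslot, new);	/* tags stay untouched */
	return 0;
}
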
diff --git a/mm/mincore.c b/mm/mincore.c
index 72890780c1c..8aca6f7167b 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -1,7 +1,7 @@
/*
* linux/mm/mincore.c
*
- * Copyright (C) 1994-1999 Linus Torvalds
+ * Copyright (C) 1994-2006 Linus Torvalds
*/
/*
@@ -38,46 +38,51 @@ static unsigned char mincore_page(struct vm_area_struct * vma,
return present;
}
-static long mincore_vma(struct vm_area_struct * vma,
- unsigned long start, unsigned long end, unsigned char __user * vec)
+/*
+ * Do a chunk of "sys_mincore()". We've already checked
+ * all the arguments, we hold the mmap semaphore: we should
+ * just return the amount of info we're asked for.
+ */
+static long do_mincore(unsigned long addr, unsigned char *vec, unsigned long pages)
{
- long error, i, remaining;
- unsigned char * tmp;
-
- error = -ENOMEM;
- if (!vma->vm_file)
- return error;
-
- start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
- if (end > vma->vm_end)
- end = vma->vm_end;
- end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+ unsigned long i, nr, pgoff;
+ struct vm_area_struct *vma = find_vma(current->mm, addr);
- error = -EAGAIN;
- tmp = (unsigned char *) __get_free_page(GFP_KERNEL);
- if (!tmp)
- return error;
+ /*
+ * find_vma() didn't find anything above us, or we're
+ * in an unmapped hole in the address space: ENOMEM.
+ */
+ if (!vma || addr < vma->vm_start)
+ return -ENOMEM;
- /* (end - start) is # of pages, and also # of bytes in "vec */
- remaining = (end - start),
+ /*
+ * Ok, got it. But check whether it's a segment we support
+ * mincore() on. Right now, we don't do any anonymous mappings.
+ *
+ * FIXME: This is just stupid. And returning ENOMEM is
+ * stupid too. We should just look at the page tables. But
+ * this is what we've traditionally done, so we'll just
+ * continue doing it.
+ */
+ if (!vma->vm_file)
+ return -ENOMEM;
- error = 0;
- for (i = 0; remaining > 0; remaining -= PAGE_SIZE, i++) {
- int j = 0;
- long thispiece = (remaining < PAGE_SIZE) ?
- remaining : PAGE_SIZE;
+ /*
+ * Calculate how many pages there are left in the vma, and
+ * what the pgoff is for our address.
+ */
+ nr = (vma->vm_end - addr) >> PAGE_SHIFT;
+ if (nr > pages)
+ nr = pages;
- while (j < thispiece)
- tmp[j++] = mincore_page(vma, start++);
+ pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
+ pgoff += vma->vm_pgoff;
- if (copy_to_user(vec + PAGE_SIZE * i, tmp, thispiece)) {
- error = -EFAULT;
- break;
- }
- }
+ /* And then we just fill the sucker in.. */
+ for (i = 0 ; i < nr; i++, pgoff++)
+ vec[i] = mincore_page(vma, pgoff);
- free_page((unsigned long) tmp);
- return error;
+ return nr;
}
/*
@@ -107,82 +112,50 @@ static long mincore_vma(struct vm_area_struct * vma,
asmlinkage long sys_mincore(unsigned long start, size_t len,
unsigned char __user * vec)
{
- int index = 0;
- unsigned long end, limit;
- struct vm_area_struct * vma;
- size_t max;
- int unmapped_error = 0;
- long error;
-
- /* check the arguments */
- if (start & ~PAGE_CACHE_MASK)
- goto einval;
-
- limit = TASK_SIZE;
- if (start >= limit)
- goto enomem;
-
- if (!len)
- return 0;
-
- max = limit - start;
- len = PAGE_CACHE_ALIGN(len);
- if (len > max || !len)
- goto enomem;
+ long retval;
+ unsigned long pages;
+ unsigned char *tmp;
- end = start + len;
+ /* Check the start address: needs to be page-aligned.. */
+ if (start & ~PAGE_CACHE_MASK)
+ return -EINVAL;
- /* check the output buffer whilst holding the lock */
- error = -EFAULT;
- down_read(&current->mm->mmap_sem);
+ /* ..and we need to be passed a valid user-space range */
+ if (!access_ok(VERIFY_READ, (void __user *) start, len))
+ return -ENOMEM;
- if (!access_ok(VERIFY_WRITE, vec, len >> PAGE_SHIFT))
- goto out;
+ /* This also avoids any overflows on PAGE_CACHE_ALIGN */
+ pages = len >> PAGE_SHIFT;
+ pages += (len & ~PAGE_MASK) != 0;
- /*
- * If the interval [start,end) covers some unmapped address
- * ranges, just ignore them, but return -ENOMEM at the end.
- */
- error = 0;
-
- vma = find_vma(current->mm, start);
- while (vma) {
- /* Here start < vma->vm_end. */
- if (start < vma->vm_start) {
- unmapped_error = -ENOMEM;
- start = vma->vm_start;
- }
+ if (!access_ok(VERIFY_WRITE, vec, pages))
+ return -EFAULT;
- /* Here vma->vm_start <= start < vma->vm_end. */
- if (end <= vma->vm_end) {
- if (start < end) {
- error = mincore_vma(vma, start, end,
- &vec[index]);
- if (error)
- goto out;
- }
- error = unmapped_error;
- goto out;
+ tmp = (void *) __get_free_page(GFP_USER);
+ if (!tmp)
+ return -EAGAIN;
+
+ retval = 0;
+ while (pages) {
+ /*
+ * Do at most PAGE_SIZE entries per iteration, due to
+ * the temporary buffer size.
+ */
+ down_read(&current->mm->mmap_sem);
+ retval = do_mincore(start, tmp, min(pages, PAGE_SIZE));
+ up_read(&current->mm->mmap_sem);
+
+ if (retval <= 0)
+ break;
+ if (copy_to_user(vec, tmp, retval)) {
+ retval = -EFAULT;
+ break;
}
-
- /* Here vma->vm_start <= start < vma->vm_end < end. */
- error = mincore_vma(vma, start, vma->vm_end, &vec[index]);
- if (error)
- goto out;
- index += (vma->vm_end - start) >> PAGE_CACHE_SHIFT;
- start = vma->vm_end;
- vma = vma->vm_next;
+ pages -= retval;
+ vec += retval;
+ start += retval << PAGE_SHIFT;
+ retval = 0;
}
-
- /* we found a hole in the area queried if we arrive here */
- error = -ENOMEM;
-
-out:
- up_read(&current->mm->mmap_sem);
- return error;
-
-einval:
- return -EINVAL;
-enomem:
- return -ENOMEM;
+ free_page((unsigned long) tmp);
+ return retval;
}
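
From user space the interface is unchanged; the rework only changes how the kernel walks the range (at most PAGE_SIZE vector bytes per mmap_sem hold). A minimal user-space sketch of a mincore(2) call, for context only and not part of the diff:

/*
 * User-space sketch, for context only; not part of the commit.
 * Note the FIXME above: on this kernel mincore() still wants a
 * file-backed vma, so we probe a mapped file rather than anonymous
 * memory.
 */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/stat.h>

int main(int argc, char **argv)
{
	const char *path = argc > 1 ? argv[1] : "/etc/passwd";
	long page = sysconf(_SC_PAGESIZE);
	int fd = open(path, O_RDONLY);
	struct stat st;

	if (fd < 0 || fstat(fd, &st) < 0 || st.st_size == 0)
		return 1;

	size_t pages = (st.st_size + page - 1) / page;
	unsigned char *vec = malloc(pages);
	void *map = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);

	if (!vec || map == MAP_FAILED)
		return 1;

	char first = ((char *)map)[0];	/* fault in the first page */
	(void)first;

	if (mincore(map, st.st_size, vec) == 0)
		for (size_t i = 0; i < pages; i++)
			printf("page %zu: %s\n", i,
			       (vec[i] & 1) ? "resident" : "not resident");
	return 0;
}
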
diff --git a/mm/mlock.c b/mm/mlock.c
index b90c59573ab..3446b7ef731 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -65,7 +65,7 @@ success:
ret = make_pages_present(start, end);
}
- vma->vm_mm->locked_vm -= pages;
+ mm->locked_vm -= pages;
out:
if (ret == -ENOMEM)
ret = -EAGAIN;
diff --git a/mm/mmap.c b/mm/mmap.c
index 7b40abd7cba..9717337293c 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -188,7 +188,7 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma,
struct file *file, struct address_space *mapping)
{
if (vma->vm_flags & VM_DENYWRITE)
- atomic_inc(&file->f_dentry->d_inode->i_writecount);
+ atomic_inc(&file->f_path.dentry->d_inode->i_writecount);
if (vma->vm_flags & VM_SHARED)
mapping->i_mmap_writable--;
@@ -399,7 +399,7 @@ static inline void __vma_link_file(struct vm_area_struct *vma)
struct address_space *mapping = file->f_mapping;
if (vma->vm_flags & VM_DENYWRITE)
- atomic_dec(&file->f_dentry->d_inode->i_writecount);
+ atomic_dec(&file->f_path.dentry->d_inode->i_writecount);
if (vma->vm_flags & VM_SHARED)
mapping->i_mmap_writable++;
@@ -907,7 +907,7 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
* mounted, in which case we dont add PROT_EXEC.)
*/
if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
- if (!(file && (file->f_vfsmnt->mnt_flags & MNT_NOEXEC)))
+ if (!(file && (file->f_path.mnt->mnt_flags & MNT_NOEXEC)))
prot |= PROT_EXEC;
if (!len)
@@ -960,7 +960,7 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
return -EAGAIN;
}
- inode = file ? file->f_dentry->d_inode : NULL;
+ inode = file ? file->f_path.dentry->d_inode : NULL;
if (file) {
switch (flags & MAP_TYPE) {
@@ -989,7 +989,7 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
case MAP_PRIVATE:
if (!(file->f_mode & FMODE_READ))
return -EACCES;
- if (file->f_vfsmnt->mnt_flags & MNT_NOEXEC) {
+ if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) {
if (vm_flags & VM_EXEC)
return -EPERM;
vm_flags &= ~VM_MAYEXEC;
@@ -1736,7 +1736,7 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
if (mm->map_count >= sysctl_max_map_count)
return -ENOMEM;
- new = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+ new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
if (!new)
return -ENOMEM;
@@ -2057,7 +2057,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
vma_start < new_vma->vm_end)
*vmap = new_vma;
} else {
- new_vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+ new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
if (new_vma) {
*new_vma = *vma;
pol = mpol_copy(vma_policy(vma));
diff --git a/mm/mmzone.c b/mm/mmzone.c
index febea1c9816..eb5838634f1 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -14,8 +14,6 @@ struct pglist_data *first_online_pgdat(void)
return NODE_DATA(first_online_node);
}
-EXPORT_UNUSED_SYMBOL(first_online_pgdat); /* June 2006 */
-
struct pglist_data *next_online_pgdat(struct pglist_data *pgdat)
{
int nid = next_online_node(pgdat->node_id);
@@ -24,8 +22,6 @@ struct pglist_data *next_online_pgdat(struct pglist_data *pgdat)
return NULL;
return NODE_DATA(nid);
}
-EXPORT_UNUSED_SYMBOL(next_online_pgdat); /* June 2006 */
-
/*
* next_zone - helper magic for for_each_zone()
@@ -45,5 +41,4 @@ struct zone *next_zone(struct zone *zone)
}
return zone;
}
-EXPORT_UNUSED_SYMBOL(next_zone); /* June 2006 */
diff --git a/mm/nommu.c b/mm/nommu.c
index 8bdde9508f3..23fb033e596 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -497,15 +497,17 @@ static int validate_mmap_request(struct file *file,
(flags & MAP_TYPE) != MAP_SHARED)
return -EINVAL;
- if (PAGE_ALIGN(len) == 0)
- return addr;
-
- if (len > TASK_SIZE)
+ if (!len)
return -EINVAL;
+ /* Careful about overflows.. */
+ len = PAGE_ALIGN(len);
+ if (!len || len > TASK_SIZE)
+ return -ENOMEM;
+
/* offset overflow? */
if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
- return -EINVAL;
+ return -EOVERFLOW;
if (file) {
/* validate file mapping requests */
@@ -521,7 +523,7 @@ static int validate_mmap_request(struct file *file,
*/
mapping = file->f_mapping;
if (!mapping)
- mapping = file->f_dentry->d_inode->i_mapping;
+ mapping = file->f_path.dentry->d_inode->i_mapping;
capabilities = 0;
if (mapping && mapping->backing_dev_info)
@@ -530,7 +532,7 @@ static int validate_mmap_request(struct file *file,
if (!capabilities) {
/* no explicit capabilities set, so assume some
* defaults */
- switch (file->f_dentry->d_inode->i_mode & S_IFMT) {
+ switch (file->f_path.dentry->d_inode->i_mode & S_IFMT) {
case S_IFREG:
case S_IFBLK:
capabilities = BDI_CAP_MAP_COPY;
@@ -561,11 +563,11 @@ static int validate_mmap_request(struct file *file,
!(file->f_mode & FMODE_WRITE))
return -EACCES;
- if (IS_APPEND(file->f_dentry->d_inode) &&
+ if (IS_APPEND(file->f_path.dentry->d_inode) &&
(file->f_mode & FMODE_WRITE))
return -EACCES;
- if (locks_verify_locked(file->f_dentry->d_inode))
+ if (locks_verify_locked(file->f_path.dentry->d_inode))
return -EAGAIN;
if (!(capabilities & BDI_CAP_MAP_DIRECT))
@@ -596,7 +598,7 @@ static int validate_mmap_request(struct file *file,
/* handle executable mappings and implied executable
* mappings */
- if (file->f_vfsmnt->mnt_flags & MNT_NOEXEC) {
+ if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) {
if (prot & PROT_EXEC)
return -EPERM;
}
@@ -806,10 +808,9 @@ unsigned long do_mmap_pgoff(struct file *file,
vm_flags = determine_vm_flags(file, prot, flags, capabilities);
/* we're going to need to record the mapping if it works */
- vml = kmalloc(sizeof(struct vm_list_struct), GFP_KERNEL);
+ vml = kzalloc(sizeof(struct vm_list_struct), GFP_KERNEL);
if (!vml)
goto error_getting_vml;
- memset(vml, 0, sizeof(*vml));
down_write(&nommu_vma_sem);
@@ -832,7 +833,7 @@ unsigned long do_mmap_pgoff(struct file *file,
continue;
/* search for overlapping mappings on the same file */
- if (vma->vm_file->f_dentry->d_inode != file->f_dentry->d_inode)
+ if (vma->vm_file->f_path.dentry->d_inode != file->f_path.dentry->d_inode)
continue;
if (vma->vm_pgoff >= pgoff + pglen)
@@ -885,11 +886,10 @@ unsigned long do_mmap_pgoff(struct file *file,
}
/* we're going to need a VMA struct as well */
- vma = kmalloc(sizeof(struct vm_area_struct), GFP_KERNEL);
+ vma = kzalloc(sizeof(struct vm_area_struct), GFP_KERNEL);
if (!vma)
goto error_getting_vma;
- memset(vma, 0, sizeof(*vma));
INIT_LIST_HEAD(&vma->anon_vma_node);
atomic_set(&vma->vm_usage, 1);
if (file)
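
The "Careful about overflows" hunk matters because PAGE_ALIGN() wraps to 0 for lengths within a page of ULONG_MAX. A tiny user-space sketch of the failure the new check catches; PAGE_ALIGN is shown with its usual <linux/mm.h> definition purely for illustration:

/*
 * Illustrative sketch only: why validate_mmap_request() re-checks
 * "!len" after PAGE_ALIGN().  User-space stand-in for the kernel
 * macros, assuming 4 KiB pages.
 */
#include <stdio.h>

#define PAGE_SIZE	4096UL
#define PAGE_MASK	(~(PAGE_SIZE - 1))
#define PAGE_ALIGN(addr)	(((addr) + PAGE_SIZE - 1) & PAGE_MASK)

int main(void)
{
	unsigned long len = ~0UL - 100;		/* just short of ULONG_MAX */

	len = PAGE_ALIGN(len);			/* wraps around to 0 */
	printf("aligned len = %lu\n", len);	/* prints 0 */

	/* Hence the new "if (!len || len > TASK_SIZE) return -ENOMEM;" */
	return 0;
}
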
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 2e3ce3a928b..b278b8d60ee 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -61,12 +61,6 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
}
/*
- * swapoff can easily use up all memory, so kill those first.
- */
- if (p->flags & PF_SWAPOFF)
- return ULONG_MAX;
-
- /*
* The memory size of the process is the basis for the badness.
*/
points = mm->total_vm;
@@ -77,6 +71,12 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
task_unlock(p);
/*
+ * swapoff can easily use up all memory, so kill those first.
+ */
+ if (p->flags & PF_SWAPOFF)
+ return ULONG_MAX;
+
+ /*
* Processes which fork a lot of child processes are likely
* a good choice. We add half the vmsize of the children if they
* have an own mm. This prevents forking servers to flood the
@@ -174,10 +174,15 @@ static inline int constrained_alloc(struct zonelist *zonelist, gfp_t gfp_mask)
{
#ifdef CONFIG_NUMA
struct zone **z;
- nodemask_t nodes = node_online_map;
+ nodemask_t nodes;
+ int node;
+ /* node has memory ? */
+ for_each_online_node(node)
+ if (NODE_DATA(node)->node_present_pages)
+ node_set(node, nodes);
for (z = zonelist->zones; *z; z++)
- if (cpuset_zone_allowed(*z, gfp_mask))
+ if (cpuset_zone_allowed_softwall(*z, gfp_mask))
node_clear(zone_to_nid(*z), nodes);
else
return CONSTRAINT_CPUSET;
@@ -264,7 +269,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints)
* flag though it's unlikely that we select a process with CAP_SYS_RAW_IO
* set.
*/
-static void __oom_kill_task(struct task_struct *p, const char *message)
+static void __oom_kill_task(struct task_struct *p, int verbose)
{
if (is_init(p)) {
WARN_ON(1);
@@ -278,10 +283,8 @@ static void __oom_kill_task(struct task_struct *p, const char *message)
return;
}
- if (message) {
- printk(KERN_ERR "%s: Killed process %d (%s).\n",
- message, p->pid, p->comm);
- }
+ if (verbose)
+ printk(KERN_ERR "Killed process %d (%s)\n", p->pid, p->comm);
/*
* We give our sacrificial lamb high priority and access to
@@ -294,7 +297,7 @@ static void __oom_kill_task(struct task_struct *p, const char *message)
force_sig(SIGKILL, p);
}
-static int oom_kill_task(struct task_struct *p, const char *message)
+static int oom_kill_task(struct task_struct *p)
{
struct mm_struct *mm;
struct task_struct *g, *q;
@@ -313,15 +316,25 @@ static int oom_kill_task(struct task_struct *p, const char *message)
if (mm == NULL)
return 1;
- __oom_kill_task(p, message);
+ /*
+ * Don't kill the process if any threads are set to OOM_DISABLE
+ */
+ do_each_thread(g, q) {
+ if (q->mm == mm && p->oomkilladj == OOM_DISABLE)
+ return 1;
+ } while_each_thread(g, q);
+
+ __oom_kill_task(p, 1);
+
/*
* kill all processes that share the ->mm (i.e. all threads),
- * but are in a different thread group
+ * but are in a different thread group. Don't let them have access
+ * to memory reserves though, otherwise we might deplete all memory.
*/
- do_each_thread(g, q)
+ do_each_thread(g, q) {
if (q->mm == mm && q->tgid != p->tgid)
- __oom_kill_task(q, message);
- while_each_thread(g, q);
+ force_sig(SIGKILL, p);
+ } while_each_thread(g, q);
return 0;
}
@@ -337,21 +350,22 @@ static int oom_kill_process(struct task_struct *p, unsigned long points,
* its children or threads, just set TIF_MEMDIE so it can die quickly
*/
if (p->flags & PF_EXITING) {
- __oom_kill_task(p, NULL);
+ __oom_kill_task(p, 0);
return 0;
}
- printk(KERN_ERR "Out of Memory: Kill process %d (%s) score %li"
- " and children.\n", p->pid, p->comm, points);
+ printk(KERN_ERR "%s: kill process %d (%s) score %li or a child\n",
+ message, p->pid, p->comm, points);
+
/* Try to kill a child first */
list_for_each(tsk, &p->children) {
c = list_entry(tsk, struct task_struct, sibling);
if (c->mm == p->mm)
continue;
- if (!oom_kill_task(c, message))
+ if (!oom_kill_task(c))
return 0;
}
- return oom_kill_task(p, message);
+ return oom_kill_task(p);
}
static BLOCKING_NOTIFIER_HEAD(oom_notify_list);
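
The new per-thread OOM_DISABLE check in oom_kill_task() is driven by the same knob administrators already set from user space. A small sketch of marking the current process unkillable, assuming the /proc/<pid>/oom_adj interface and that OOM_DISABLE is -17 in this kernel generation; illustrative only:

/*
 * User-space sketch, for context only; not part of the commit.
 * Assumes /proc/<pid>/oom_adj and OOM_DISABLE == -17.
 */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/self/oom_adj", "w");

	if (!f)
		return 1;
	fprintf(f, "-17\n");	/* OOM_DISABLE: never OOM-kill this mm */
	fclose(f);
	return 0;
}
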
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 8d9b19f239c..1d2fc89ca56 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -21,6 +21,7 @@
#include <linux/writeback.h>
#include <linux/init.h>
#include <linux/backing-dev.h>
+#include <linux/task_io_accounting_ops.h>
#include <linux/blkdev.h>
#include <linux/mpage.h>
#include <linux/rmap.h>
@@ -761,23 +762,24 @@ int __set_page_dirty_nobuffers(struct page *page)
struct address_space *mapping = page_mapping(page);
struct address_space *mapping2;
- if (mapping) {
- write_lock_irq(&mapping->tree_lock);
- mapping2 = page_mapping(page);
- if (mapping2) { /* Race with truncate? */
- BUG_ON(mapping2 != mapping);
- if (mapping_cap_account_dirty(mapping))
- __inc_zone_page_state(page,
- NR_FILE_DIRTY);
- radix_tree_tag_set(&mapping->page_tree,
- page_index(page), PAGECACHE_TAG_DIRTY);
- }
- write_unlock_irq(&mapping->tree_lock);
- if (mapping->host) {
- /* !PageAnon && !swapper_space */
- __mark_inode_dirty(mapping->host,
- I_DIRTY_PAGES);
+ if (!mapping)
+ return 1;
+
+ write_lock_irq(&mapping->tree_lock);
+ mapping2 = page_mapping(page);
+ if (mapping2) { /* Race with truncate? */
+ BUG_ON(mapping2 != mapping);
+ if (mapping_cap_account_dirty(mapping)) {
+ __inc_zone_page_state(page, NR_FILE_DIRTY);
+ task_io_account_write(PAGE_CACHE_SIZE);
}
+ radix_tree_tag_set(&mapping->page_tree,
+ page_index(page), PAGECACHE_TAG_DIRTY);
+ }
+ write_unlock_irq(&mapping->tree_lock);
+ if (mapping->host) {
+ /* !PageAnon && !swapper_space */
+ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
}
return 1;
}
@@ -843,39 +845,6 @@ int set_page_dirty_lock(struct page *page)
EXPORT_SYMBOL(set_page_dirty_lock);
/*
- * Clear a page's dirty flag, while caring for dirty memory accounting.
- * Returns true if the page was previously dirty.
- */
-int test_clear_page_dirty(struct page *page)
-{
- struct address_space *mapping = page_mapping(page);
- unsigned long flags;
-
- if (mapping) {
- write_lock_irqsave(&mapping->tree_lock, flags);
- if (TestClearPageDirty(page)) {
- radix_tree_tag_clear(&mapping->page_tree,
- page_index(page),
- PAGECACHE_TAG_DIRTY);
- write_unlock_irqrestore(&mapping->tree_lock, flags);
- /*
- * We can continue to use `mapping' here because the
- * page is locked, which pins the address_space
- */
- if (mapping_cap_account_dirty(mapping)) {
- page_mkclean(page);
- dec_zone_page_state(page, NR_FILE_DIRTY);
- }
- return 1;
- }
- write_unlock_irqrestore(&mapping->tree_lock, flags);
- return 0;
- }
- return TestClearPageDirty(page);
-}
-EXPORT_SYMBOL(test_clear_page_dirty);
-
-/*
* Clear a page's dirty flag, while caring for dirty memory accounting.
* Returns true if the page was previously dirty.
*
@@ -893,12 +862,41 @@ int clear_page_dirty_for_io(struct page *page)
{
struct address_space *mapping = page_mapping(page);
- if (mapping) {
+ if (mapping && mapping_cap_account_dirty(mapping)) {
+ /*
+ * Yes, Virginia, this is indeed insane.
+ *
+ * We use this sequence to make sure that
+ * (a) we account for dirty stats properly
+ * (b) we tell the low-level filesystem to
+ * mark the whole page dirty if it was
+ * dirty in a pagetable. Only to then
+ * (c) clean the page again and return 1 to
+ * cause the writeback.
+ *
+ * This way we avoid all nasty races with the
+ * dirty bit in multiple places and clearing
+ * them concurrently from different threads.
+ *
+ * Note! Normally the "set_page_dirty(page)"
+ * has no effect on the actual dirty bit - since
+ * that will already usually be set. But we
+ * need the side effects, and it can help us
+ * avoid races.
+ *
+ * We basically use the page "master dirty bit"
+ * as a serialization point for all the different
+ * threads doing their things.
+ *
+ * FIXME! We still have a race here: if somebody
+ * adds the page back to the page tables in
+ * between the "page_mkclean()" and the "TestClearPageDirty()",
+ * we might have it mapped without the dirty bit set.
+ */
+ if (page_mkclean(page))
+ set_page_dirty(page);
if (TestClearPageDirty(page)) {
- if (mapping_cap_account_dirty(mapping)) {
- page_mkclean(page);
- dec_zone_page_state(page, NR_FILE_DIRTY);
- }
+ dec_zone_page_state(page, NR_FILE_DIRTY);
return 1;
}
return 0;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index aa6fcc7ca66..fc5b5442e94 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -40,6 +40,7 @@
#include <linux/sort.h>
#include <linux/pfn.h>
#include <linux/backing-dev.h>
+#include <linux/fault-inject.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -83,14 +84,7 @@ int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
EXPORT_SYMBOL(totalram_pages);
-/*
- * Used by page_zone() to look up the address of the struct zone whose
- * id is encoded in the upper bits of page->flags
- */
-struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly;
-EXPORT_SYMBOL(zone_table);
-
-static char *zone_names[MAX_NR_ZONES] = {
+static char * const zone_names[MAX_NR_ZONES] = {
"DMA",
#ifdef CONFIG_ZONE_DMA32
"DMA32",
@@ -237,7 +231,7 @@ static void prep_compound_page(struct page *page, unsigned long order)
int i;
int nr_pages = 1 << order;
- page[1].lru.next = (void *)free_compound_page; /* set dtor */
+ set_compound_page_dtor(page, free_compound_page);
page[1].lru.prev = (void *)order;
for (i = 0; i < nr_pages; i++) {
struct page *p = page + i;
@@ -486,7 +480,7 @@ static void free_one_page(struct zone *zone, struct page *page, int order)
spin_lock(&zone->lock);
zone->all_unreclaimable = 0;
zone->pages_scanned = 0;
- __free_one_page(page, zone ,order);
+ __free_one_page(page, zone, order);
spin_unlock(&zone->lock);
}
@@ -605,6 +599,8 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
1 << PG_checked | 1 << PG_mappedtodisk);
set_page_private(page, 0);
set_page_refcounted(page);
+
+ arch_alloc_page(page, order);
kernel_map_pages(page, 1 << order, 1);
if (gfp_flags & __GFP_ZERO)
@@ -690,9 +686,15 @@ void drain_node_pages(int nodeid)
pcp = &pset->pcp[i];
if (pcp->count) {
+ int to_drain;
+
local_irq_save(flags);
- free_pages_bulk(zone, pcp->count, &pcp->list, 0);
- pcp->count = 0;
+ if (pcp->count >= pcp->batch)
+ to_drain = pcp->batch;
+ else
+ to_drain = pcp->count;
+ free_pages_bulk(zone, to_drain, &pcp->list, 0);
+ pcp->count -= to_drain;
local_irq_restore(flags);
}
}
@@ -700,7 +702,6 @@ void drain_node_pages(int nodeid)
}
#endif
-#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU)
static void __drain_pages(unsigned int cpu)
{
unsigned long flags;
@@ -710,6 +711,9 @@ static void __drain_pages(unsigned int cpu)
for_each_zone(zone) {
struct per_cpu_pageset *pset;
+ if (!populated_zone(zone))
+ continue;
+
pset = zone_pcp(zone, cpu);
for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
struct per_cpu_pages *pcp;
@@ -722,7 +726,6 @@ static void __drain_pages(unsigned int cpu)
}
}
}
-#endif /* CONFIG_PM || CONFIG_HOTPLUG_CPU */
#ifdef CONFIG_PM
@@ -893,6 +896,91 @@ failed:
#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
+#ifdef CONFIG_FAIL_PAGE_ALLOC
+
+static struct fail_page_alloc_attr {
+ struct fault_attr attr;
+
+ u32 ignore_gfp_highmem;
+ u32 ignore_gfp_wait;
+
+#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
+
+ struct dentry *ignore_gfp_highmem_file;
+ struct dentry *ignore_gfp_wait_file;
+
+#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
+
+} fail_page_alloc = {
+ .attr = FAULT_ATTR_INITIALIZER,
+ .ignore_gfp_wait = 1,
+ .ignore_gfp_highmem = 1,
+};
+
+static int __init setup_fail_page_alloc(char *str)
+{
+ return setup_fault_attr(&fail_page_alloc.attr, str);
+}
+__setup("fail_page_alloc=", setup_fail_page_alloc);
+
+static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
+{
+ if (gfp_mask & __GFP_NOFAIL)
+ return 0;
+ if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
+ return 0;
+ if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT))
+ return 0;
+
+ return should_fail(&fail_page_alloc.attr, 1 << order);
+}
+
+#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
+
+static int __init fail_page_alloc_debugfs(void)
+{
+ mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
+ struct dentry *dir;
+ int err;
+
+ err = init_fault_attr_dentries(&fail_page_alloc.attr,
+ "fail_page_alloc");
+ if (err)
+ return err;
+ dir = fail_page_alloc.attr.dentries.dir;
+
+ fail_page_alloc.ignore_gfp_wait_file =
+ debugfs_create_bool("ignore-gfp-wait", mode, dir,
+ &fail_page_alloc.ignore_gfp_wait);
+
+ fail_page_alloc.ignore_gfp_highmem_file =
+ debugfs_create_bool("ignore-gfp-highmem", mode, dir,
+ &fail_page_alloc.ignore_gfp_highmem);
+
+ if (!fail_page_alloc.ignore_gfp_wait_file ||
+ !fail_page_alloc.ignore_gfp_highmem_file) {
+ err = -ENOMEM;
+ debugfs_remove(fail_page_alloc.ignore_gfp_wait_file);
+ debugfs_remove(fail_page_alloc.ignore_gfp_highmem_file);
+ cleanup_fault_attr_dentries(&fail_page_alloc.attr);
+ }
+
+ return err;
+}
+
+late_initcall(fail_page_alloc_debugfs);
+
+#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
+
+#else /* CONFIG_FAIL_PAGE_ALLOC */
+
+static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
+{
+ return 0;
+}
+
+#endif /* CONFIG_FAIL_PAGE_ALLOC */
+
/*
* Return 1 if free pages are above 'mark'. This takes into account the order
* of the allocation.
@@ -925,31 +1013,160 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
return 1;
}
+#ifdef CONFIG_NUMA
+/*
+ * zlc_setup - Setup for "zonelist cache". Uses cached zone data to
+ * skip over zones that are not allowed by the cpuset, or that have
+ * been recently (in last second) found to be nearly full. See further
+ * comments in mmzone.h. Reduces cache footprint of zonelist scans
+ * that have to skip over a lot of full or unallowed zones.
+ *
+ * If the zonelist cache is present in the passed in zonelist, then
+ * returns a pointer to the allowed node mask (either the current
+ * tasks mems_allowed, or node_online_map.)
+ *
+ * If the zonelist cache is not available for this zonelist, does
+ * nothing and returns NULL.
+ *
+ * If the fullzones BITMAP in the zonelist cache is stale (more than
+ * a second since last zap'd) then we zap it out (clear its bits.)
+ *
+ * We hold off even calling zlc_setup, until after we've checked the
+ * first zone in the zonelist, on the theory that most allocations will
+ * be satisfied from that first zone, so best to examine that zone as
+ * quickly as we can.
+ */
+static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
+{
+ struct zonelist_cache *zlc; /* cached zonelist speedup info */
+ nodemask_t *allowednodes; /* zonelist_cache approximation */
+
+ zlc = zonelist->zlcache_ptr;
+ if (!zlc)
+ return NULL;
+
+ if (jiffies - zlc->last_full_zap > 1 * HZ) {
+ bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
+ zlc->last_full_zap = jiffies;
+ }
+
+ allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
+ &cpuset_current_mems_allowed :
+ &node_online_map;
+ return allowednodes;
+}
+
+/*
+ * Given 'z' scanning a zonelist, run a couple of quick checks to see
+ * if it is worth looking at further for free memory:
+ * 1) Check that the zone isn't thought to be full (doesn't have its
+ * bit set in the zonelist_cache fullzones BITMAP).
+ * 2) Check that the zones node (obtained from the zonelist_cache
+ * z_to_n[] mapping) is allowed in the passed in allowednodes mask.
+ * Return true (non-zero) if zone is worth looking at further, or
+ * else return false (zero) if it is not.
+ *
+ * This check -ignores- the distinction between various watermarks,
+ * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is
+ * found to be full for any variation of these watermarks, it will
+ * be considered full for up to one second by all requests, unless
+ * we are so low on memory on all allowed nodes that we are forced
+ * into the second scan of the zonelist.
+ *
+ * In the second scan we ignore this zonelist cache and exactly
+ * apply the watermarks to all zones, even if it is slower to do so.
+ * We are low on memory in the second scan, and should leave no stone
+ * unturned looking for a free page.
+ */
+static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z,
+ nodemask_t *allowednodes)
+{
+ struct zonelist_cache *zlc; /* cached zonelist speedup info */
+ int i; /* index of *z in zonelist zones */
+ int n; /* node that zone *z is on */
+
+ zlc = zonelist->zlcache_ptr;
+ if (!zlc)
+ return 1;
+
+ i = z - zonelist->zones;
+ n = zlc->z_to_n[i];
+
+ /* This zone is worth trying if it is allowed but not full */
+ return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones);
+}
+
+/*
+ * Given 'z' scanning a zonelist, set the corresponding bit in
+ * zlc->fullzones, so that subsequent attempts to allocate a page
+ * from that zone don't waste time re-examining it.
+ */
+static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z)
+{
+ struct zonelist_cache *zlc; /* cached zonelist speedup info */
+ int i; /* index of *z in zonelist zones */
+
+ zlc = zonelist->zlcache_ptr;
+ if (!zlc)
+ return;
+
+ i = z - zonelist->zones;
+
+ set_bit(i, zlc->fullzones);
+}
+
+#else /* CONFIG_NUMA */
+
+static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
+{
+ return NULL;
+}
+
+static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z,
+ nodemask_t *allowednodes)
+{
+ return 1;
+}
+
+static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z)
+{
+}
+#endif /* CONFIG_NUMA */
+
/*
- * get_page_from_freeliest goes through the zonelist trying to allocate
+ * get_page_from_freelist goes through the zonelist trying to allocate
* a page.
*/
static struct page *
get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, int alloc_flags)
{
- struct zone **z = zonelist->zones;
+ struct zone **z;
struct page *page = NULL;
- int classzone_idx = zone_idx(*z);
+ int classzone_idx = zone_idx(zonelist->zones[0]);
struct zone *zone;
+ nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
+ int zlc_active = 0; /* set if using zonelist_cache */
+ int did_zlc_setup = 0; /* just call zlc_setup() one time */
+zonelist_scan:
/*
- * Go through the zonelist once, looking for a zone with enough free.
+ * Scan zonelist, looking for a zone with enough free.
* See also cpuset_zone_allowed() comment in kernel/cpuset.c.
*/
+ z = zonelist->zones;
+
do {
+ if (NUMA_BUILD && zlc_active &&
+ !zlc_zone_worth_trying(zonelist, z, allowednodes))
+ continue;
zone = *z;
if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) &&
zone->zone_pgdat != zonelist->zones[0]->zone_pgdat))
break;
if ((alloc_flags & ALLOC_CPUSET) &&
- !cpuset_zone_allowed(zone, gfp_mask))
- continue;
+ !cpuset_zone_allowed_softwall(zone, gfp_mask))
+ goto try_next_zone;
if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
unsigned long mark;
@@ -959,18 +1176,34 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
mark = zone->pages_low;
else
mark = zone->pages_high;
- if (!zone_watermark_ok(zone , order, mark,
- classzone_idx, alloc_flags))
+ if (!zone_watermark_ok(zone, order, mark,
+ classzone_idx, alloc_flags)) {
if (!zone_reclaim_mode ||
!zone_reclaim(zone, gfp_mask, order))
- continue;
+ goto this_zone_full;
+ }
}
page = buffered_rmqueue(zonelist, zone, order, gfp_mask);
- if (page) {
+ if (page)
break;
+this_zone_full:
+ if (NUMA_BUILD)
+ zlc_mark_zone_full(zonelist, z);
+try_next_zone:
+ if (NUMA_BUILD && !did_zlc_setup) {
+ /* we do zlc_setup after the first zone is tried */
+ allowednodes = zlc_setup(zonelist, alloc_flags);
+ zlc_active = 1;
+ did_zlc_setup = 1;
}
} while (*(++z) != NULL);
+
+ if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
+ /* Disable zlc cache for second zonelist scan */
+ zlc_active = 0;
+ goto zonelist_scan;
+ }
return page;
}
@@ -992,6 +1225,9 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order,
might_sleep_if(wait);
+ if (should_fail_alloc_page(gfp_mask, order))
+ return NULL;
+
restart:
z = zonelist->zones; /* the list of zones suitable for gfp_mask */
@@ -1005,9 +1241,19 @@ restart:
if (page)
goto got_pg;
- do {
+ /*
+ * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
+ * __GFP_NOWARN set) should not cause reclaim since the subsystem
+ * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim
+ * using a larger set of nodes after it has established that the
+ * allowed per node queues are empty and that nodes are
+ * over allocated.
+ */
+ if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
+ goto nopage;
+
+ for (z = zonelist->zones; *z; z++)
wakeup_kswapd(*z, order);
- } while (*(++z));
/*
* OK, we're below the kswapd watermark and have kicked background
@@ -1041,6 +1287,7 @@ restart:
/* This allocation should allow future memory freeing. */
+rebalance:
if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
&& !in_interrupt()) {
if (!(gfp_mask & __GFP_NOMEMALLOC)) {
@@ -1062,7 +1309,6 @@ nofail_alloc:
if (!wait)
goto nopage;
-rebalance:
cond_resched();
/* We now go into synchronous reclaim */
@@ -1262,7 +1508,7 @@ unsigned int nr_free_pagecache_pages(void)
static inline void show_node(struct zone *zone)
{
if (NUMA_BUILD)
- printk("Node %ld ", zone_to_nid(zone));
+ printk("Node %d ", zone_to_nid(zone));
}
void si_meminfo(struct sysinfo *val)
@@ -1542,6 +1788,24 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
}
}
+/* Construct the zonelist performance cache - see further mmzone.h */
+static void __meminit build_zonelist_cache(pg_data_t *pgdat)
+{
+ int i;
+
+ for (i = 0; i < MAX_NR_ZONES; i++) {
+ struct zonelist *zonelist;
+ struct zonelist_cache *zlc;
+ struct zone **z;
+
+ zonelist = pgdat->node_zonelists + i;
+ zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
+ bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
+ for (z = zonelist->zones; *z; z++)
+ zlc->z_to_n[z - zonelist->zones] = zone_to_nid(*z);
+ }
+}
+
#else /* CONFIG_NUMA */
static void __meminit build_zonelists(pg_data_t *pgdat)
@@ -1579,14 +1843,26 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
}
}
+/* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
+static void __meminit build_zonelist_cache(pg_data_t *pgdat)
+{
+ int i;
+
+ for (i = 0; i < MAX_NR_ZONES; i++)
+ pgdat->node_zonelists[i].zlcache_ptr = NULL;
+}
+
#endif /* CONFIG_NUMA */
/* return values int ....just for stop_machine_run() */
static int __meminit __build_all_zonelists(void *dummy)
{
int nid;
- for_each_online_node(nid)
+
+ for_each_online_node(nid) {
build_zonelists(NODE_DATA(nid));
+ build_zonelist_cache(NODE_DATA(nid));
+ }
return 0;
}
@@ -1680,17 +1956,24 @@ static inline unsigned long wait_table_bits(unsigned long size)
* done. Non-atomic initialization, single-pass.
*/
void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
- unsigned long start_pfn)
+ unsigned long start_pfn, enum memmap_context context)
{
struct page *page;
unsigned long end_pfn = start_pfn + size;
unsigned long pfn;
for (pfn = start_pfn; pfn < end_pfn; pfn++) {
- if (!early_pfn_valid(pfn))
- continue;
- if (!early_pfn_in_nid(pfn, nid))
- continue;
+ /*
+ * There can be holes in boot-time mem_map[]s
+ * handed to this function. They do not
+ * exist on hotplugged memory.
+ */
+ if (context == MEMMAP_EARLY) {
+ if (!early_pfn_valid(pfn))
+ continue;
+ if (!early_pfn_in_nid(pfn, nid))
+ continue;
+ }
page = pfn_to_page(pfn);
set_page_links(page, zone, nid, pfn);
init_page_count(page);
@@ -1715,23 +1998,9 @@ void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
}
}
-#define ZONETABLE_INDEX(x, zone_nr) ((x << ZONES_SHIFT) | zone_nr)
-void zonetable_add(struct zone *zone, int nid, enum zone_type zid,
- unsigned long pfn, unsigned long size)
-{
- unsigned long snum = pfn_to_section_nr(pfn);
- unsigned long end = pfn_to_section_nr(pfn + size);
-
- if (FLAGS_HAS_NODE)
- zone_table[ZONETABLE_INDEX(nid, zid)] = zone;
- else
- for (; snum <= end; snum++)
- zone_table[ZONETABLE_INDEX(snum, zid)] = zone;
-}
-
#ifndef __HAVE_ARCH_MEMMAP_INIT
#define memmap_init(size, nid, zone, start_pfn) \
- memmap_init_zone((size), (nid), (zone), (start_pfn))
+ memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
#endif
static int __cpuinit zone_batchsize(struct zone *zone)
@@ -1881,16 +2150,16 @@ static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb,
int ret = NOTIFY_OK;
switch (action) {
- case CPU_UP_PREPARE:
- if (process_zones(cpu))
- ret = NOTIFY_BAD;
- break;
- case CPU_UP_CANCELED:
- case CPU_DEAD:
- free_zone_pagesets(cpu);
- break;
- default:
- break;
+ case CPU_UP_PREPARE:
+ if (process_zones(cpu))
+ ret = NOTIFY_BAD;
+ break;
+ case CPU_UP_CANCELED:
+ case CPU_DEAD:
+ free_zone_pagesets(cpu);
+ break;
+ default:
+ break;
}
return ret;
}
@@ -1977,7 +2246,8 @@ static __meminit void zone_pcp_init(struct zone *zone)
__meminit int init_currently_empty_zone(struct zone *zone,
unsigned long zone_start_pfn,
- unsigned long size)
+ unsigned long size,
+ enum memmap_context context)
{
struct pglist_data *pgdat = zone->zone_pgdat;
int ret;
@@ -2421,8 +2691,8 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat,
if (!size)
continue;
- zonetable_add(zone, nid, j, zone_start_pfn, size);
- ret = init_currently_empty_zone(zone, zone_start_pfn, size);
+ ret = init_currently_empty_zone(zone, zone_start_pfn,
+ size, MEMMAP_EARLY);
BUG_ON(ret);
zone_start_pfn += size;
}
@@ -2736,7 +3006,6 @@ void __init free_area_init(unsigned long *zones_size)
__pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
}
-#ifdef CONFIG_HOTPLUG_CPU
static int page_alloc_cpu_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
{
@@ -2751,7 +3020,6 @@ static int page_alloc_cpu_notify(struct notifier_block *self,
}
return NOTIFY_OK;
}
-#endif /* CONFIG_HOTPLUG_CPU */
void __init page_alloc_init(void)
{
@@ -3055,7 +3323,7 @@ void *__init alloc_large_system_hash(const char *tablename,
/* allow the kernel cmdline to have a say */
if (!numentries) {
/* round applicable memory size up to nearest megabyte */
- numentries = (flags & HASH_HIGHMEM) ? nr_all_pages : nr_kernel_pages;
+ numentries = nr_kernel_pages;
numentries += (1UL << (20 - PAGE_SHIFT)) - 1;
numentries >>= 20 - PAGE_SHIFT;
numentries <<= 20 - PAGE_SHIFT;
@@ -3065,6 +3333,10 @@ void *__init alloc_large_system_hash(const char *tablename,
numentries >>= (scale - PAGE_SHIFT);
else
numentries <<= (PAGE_SHIFT - scale);
+
+ /* Make sure we've got at least a 0-order allocation.. */
+ if (unlikely((numentries * bucketsize) < PAGE_SIZE))
+ numentries = PAGE_SIZE / bucketsize;
}
numentries = roundup_pow_of_two(numentries);
@@ -3077,7 +3349,7 @@ void *__init alloc_large_system_hash(const char *tablename,
if (numentries > max)
numentries = max;
- log2qty = long_log2(numentries);
+ log2qty = ilog2(numentries);
do {
size = bucketsize << log2qty;
@@ -3099,7 +3371,7 @@ void *__init alloc_large_system_hash(const char *tablename,
printk("%s hash table entries: %d (order: %d, %lu bytes)\n",
tablename,
(1U << log2qty),
- long_log2(size) - PAGE_SHIFT,
+ ilog2(size) - PAGE_SHIFT,
size);
if (_hash_shift)
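
The alloc_large_system_hash() hunks above drop the HASH_HIGHMEM sizing, switch to ilog2(), and clamp the table to at least one page worth of buckets. The stand-alone C sketch below walks through that sizing arithmetic for made-up inputs; the constants, helper names and sample values are illustrative assumptions, not the kernel's exact code path.

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

static unsigned long roundup_pow_of_two_ul(unsigned long x)
{
	unsigned long r = 1;

	while (r < x)
		r <<= 1;
	return r;
}

static int demo_ilog2(unsigned long x)
{
	int log = -1;

	while (x) {
		x >>= 1;
		log++;
	}
	return log;
}

int main(void)
{
	unsigned long nr_kernel_pages = 32768;	/* pretend 128 MB of low memory */
	unsigned long bucketsize = 16;		/* bytes per hash bucket */
	int scale = 14;				/* one entry per 16 KB, say */
	unsigned long numentries, size;
	int log2qty;

	/* round applicable memory size up to nearest megabyte */
	numentries = nr_kernel_pages;
	numentries += (1UL << (20 - PAGE_SHIFT)) - 1;
	numentries >>= 20 - PAGE_SHIFT;
	numentries <<= 20 - PAGE_SHIFT;

	/* limit to one bucket per 2^scale bytes of low memory */
	if (scale > PAGE_SHIFT)
		numentries >>= (scale - PAGE_SHIFT);
	else
		numentries <<= (PAGE_SHIFT - scale);

	/* the new clamp: never ask for less than a 0-order allocation */
	if (numentries * bucketsize < PAGE_SIZE)
		numentries = PAGE_SIZE / bucketsize;

	numentries = roundup_pow_of_two_ul(numentries);
	log2qty = demo_ilog2(numentries);
	size = bucketsize << log2qty;

	printf("entries: %lu (order: %d, %lu bytes)\n",
	       numentries, demo_ilog2(size) - PAGE_SHIFT, size);
	return 0;
}
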
diff --git a/mm/page_io.c b/mm/page_io.c
index d4840ecbf8f..dbffec0d78c 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -147,48 +147,3 @@ int swap_readpage(struct file *file, struct page *page)
out:
return ret;
}
-
-#ifdef CONFIG_SOFTWARE_SUSPEND
-/*
- * A scruffy utility function to read or write an arbitrary swap page
- * and wait on the I/O. The caller must have a ref on the page.
- *
- * We use end_swap_bio_read() even for writes, because it happens to do what
- * we want.
- */
-int rw_swap_page_sync(int rw, swp_entry_t entry, struct page *page,
- struct bio **bio_chain)
-{
- struct bio *bio;
- int ret = 0;
- int bio_rw;
-
- lock_page(page);
-
- bio = get_swap_bio(GFP_KERNEL, entry.val, page, end_swap_bio_read);
- if (bio == NULL) {
- unlock_page(page);
- ret = -ENOMEM;
- goto out;
- }
-
- bio_rw = rw;
- if (!bio_chain)
- bio_rw |= (1 << BIO_RW_SYNC);
- if (bio_chain)
- bio_get(bio);
- submit_bio(bio_rw, bio);
- if (bio_chain == NULL) {
- wait_on_page_locked(page);
-
- if (!PageUptodate(page) || PageError(page))
- ret = -EIO;
- }
- if (bio_chain) {
- bio->bi_private = *bio_chain;
- *bio_chain = bio;
- }
-out:
- return ret;
-}
-#endif
diff --git a/mm/pdflush.c b/mm/pdflush.c
index b02102feeb4..8ce0900dc95 100644
--- a/mm/pdflush.c
+++ b/mm/pdflush.c
@@ -21,6 +21,7 @@
#include <linux/writeback.h> // Prototypes pdflush_operation()
#include <linux/kthread.h>
#include <linux/cpuset.h>
+#include <linux/freezer.h>
/*
diff --git a/mm/readahead.c b/mm/readahead.c
index 23cb61a01c6..0f539e8e827 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -13,6 +13,7 @@
#include <linux/module.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
+#include <linux/task_io_accounting_ops.h>
#include <linux/pagevec.h>
void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
@@ -148,15 +149,10 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages,
if (!pagevec_add(&lru_pvec, page))
__pagevec_lru_add(&lru_pvec);
if (ret) {
- while (!list_empty(pages)) {
- struct page *victim;
-
- victim = list_to_page(pages);
- list_del(&victim->lru);
- page_cache_release(victim);
- }
+ put_pages_list(pages);
break;
}
+ task_io_account_read(PAGE_CACHE_SIZE);
}
pagevec_lru_add(&lru_pvec);
return ret;
@@ -456,7 +452,7 @@ static int make_ahead_window(struct address_space *mapping, struct file *filp,
*
* Note that @filp is purely used for passing on to the ->readpage[s]()
* handler: it may refer to a different file from @mapping (so we may not use
- * @filp->f_mapping or @filp->f_dentry->d_inode here).
+ * @filp->f_mapping or @filp->f_path.dentry->d_inode here).
* Also, @ra may not be equal to &@filp->f_ra.
*
*/
diff --git a/mm/rmap.c b/mm/rmap.c
index d8a842a586d..669acb22b57 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -47,6 +47,7 @@
#include <linux/rmap.h>
#include <linux/rcupdate.h>
#include <linux/module.h>
+#include <linux/kallsyms.h>
#include <asm/tlbflush.h>
@@ -432,7 +433,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma)
{
struct mm_struct *mm = vma->vm_mm;
unsigned long address;
- pte_t *pte, entry;
+ pte_t *pte;
spinlock_t *ptl;
int ret = 0;
@@ -444,17 +445,18 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma)
if (!pte)
goto out;
- if (!pte_dirty(*pte) && !pte_write(*pte))
- goto unlock;
+ if (pte_dirty(*pte) || pte_write(*pte)) {
+ pte_t entry;
- entry = ptep_get_and_clear(mm, address, pte);
- entry = pte_mkclean(entry);
- entry = pte_wrprotect(entry);
- ptep_establish(vma, address, pte, entry);
- lazy_mmu_prot_update(entry);
- ret = 1;
+ flush_cache_page(vma, address, pte_pfn(*pte));
+ entry = ptep_clear_flush(vma, address, pte);
+ entry = pte_wrprotect(entry);
+ entry = pte_mkclean(entry);
+ set_pte_at(mm, address, pte, entry);
+ lazy_mmu_prot_update(entry);
+ ret = 1;
+ }
-unlock:
pte_unmap_unlock(pte, ptl);
out:
return ret;
@@ -489,6 +491,8 @@ int page_mkclean(struct page *page)
if (mapping)
ret = page_mkclean_file(mapping, page);
}
+ if (page_test_and_clear_dirty(page))
+ ret = 1;
return ret;
}
@@ -567,14 +571,20 @@ void page_add_file_rmap(struct page *page)
*
* The caller needs to hold the pte lock.
*/
-void page_remove_rmap(struct page *page)
+void page_remove_rmap(struct page *page, struct vm_area_struct *vma)
{
if (atomic_add_negative(-1, &page->_mapcount)) {
if (unlikely(page_mapcount(page) < 0)) {
printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page));
+ printk (KERN_EMERG " page pfn = %lx\n", page_to_pfn(page));
printk (KERN_EMERG " page->flags = %lx\n", page->flags);
printk (KERN_EMERG " page->count = %x\n", page_count(page));
printk (KERN_EMERG " page->mapping = %p\n", page->mapping);
+ print_symbol (KERN_EMERG " vma->vm_ops = %s\n", (unsigned long)vma->vm_ops);
+ if (vma->vm_ops)
+ print_symbol (KERN_EMERG " vma->vm_ops->nopage = %s\n", (unsigned long)vma->vm_ops->nopage);
+ if (vma->vm_file && vma->vm_file->f_op)
+ print_symbol (KERN_EMERG " vma->vm_file->f_op->mmap = %s\n", (unsigned long)vma->vm_file->f_op->mmap);
BUG();
}
@@ -679,7 +689,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
dec_mm_counter(mm, file_rss);
- page_remove_rmap(page);
+ page_remove_rmap(page, vma);
page_cache_release(page);
out_unmap:
@@ -769,7 +779,7 @@ static void try_to_unmap_cluster(unsigned long cursor,
if (pte_dirty(pteval))
set_page_dirty(page);
- page_remove_rmap(page);
+ page_remove_rmap(page, vma);
page_cache_release(page);
dec_mm_counter(mm, file_rss);
(*mapcount)--;
diff --git a/mm/shmem.c b/mm/shmem.c
index 4959535fc14..70da7a0981b 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -177,7 +177,7 @@ static inline void shmem_unacct_blocks(unsigned long flags, long pages)
static struct super_operations shmem_ops;
static const struct address_space_operations shmem_aops;
-static struct file_operations shmem_file_operations;
+static const struct file_operations shmem_file_operations;
static struct inode_operations shmem_inode_operations;
static struct inode_operations shmem_dir_inode_operations;
static struct inode_operations shmem_special_inode_operations;
@@ -515,7 +515,12 @@ static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
size = SHMEM_NR_DIRECT;
nr_swaps_freed = shmem_free_swp(ptr+idx, ptr+size);
}
- if (!topdir)
+
+ /*
+ * If there are no indirect blocks or we are punching a hole
+ * below indirect blocks, nothing to be done.
+ */
+ if (!topdir || (punch_hole && (limit <= SHMEM_NR_DIRECT)))
goto done2;
BUG_ON(limit <= SHMEM_NR_DIRECT);
@@ -1225,7 +1230,7 @@ failed:
struct page *shmem_nopage(struct vm_area_struct *vma, unsigned long address, int *type)
{
- struct inode *inode = vma->vm_file->f_dentry->d_inode;
+ struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
struct page *page = NULL;
unsigned long idx;
int error;
@@ -1248,7 +1253,7 @@ static int shmem_populate(struct vm_area_struct *vma,
unsigned long addr, unsigned long len,
pgprot_t prot, unsigned long pgoff, int nonblock)
{
- struct inode *inode = vma->vm_file->f_dentry->d_inode;
+ struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
struct mm_struct *mm = vma->vm_mm;
enum sgp_type sgp = nonblock? SGP_QUICK: SGP_CACHE;
unsigned long size;
@@ -1293,14 +1298,14 @@ static int shmem_populate(struct vm_area_struct *vma,
#ifdef CONFIG_NUMA
int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
{
- struct inode *i = vma->vm_file->f_dentry->d_inode;
+ struct inode *i = vma->vm_file->f_path.dentry->d_inode;
return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new);
}
struct mempolicy *
shmem_get_policy(struct vm_area_struct *vma, unsigned long addr)
{
- struct inode *i = vma->vm_file->f_dentry->d_inode;
+ struct inode *i = vma->vm_file->f_path.dentry->d_inode;
unsigned long idx;
idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
@@ -1310,7 +1315,7 @@ shmem_get_policy(struct vm_area_struct *vma, unsigned long addr)
int shmem_lock(struct file *file, int lock, struct user_struct *user)
{
- struct inode *inode = file->f_dentry->d_inode;
+ struct inode *inode = file->f_path.dentry->d_inode;
struct shmem_inode_info *info = SHMEM_I(inode);
int retval = -ENOMEM;
@@ -1422,7 +1427,7 @@ shmem_prepare_write(struct file *file, struct page *page, unsigned offset, unsig
static ssize_t
shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
{
- struct inode *inode = file->f_dentry->d_inode;
+ struct inode *inode = file->f_path.dentry->d_inode;
loff_t pos;
unsigned long written;
ssize_t err;
@@ -1442,7 +1447,7 @@ shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t
if (err || !count)
goto out;
- err = remove_suid(file->f_dentry);
+ err = remove_suid(file->f_path.dentry);
if (err)
goto out;
@@ -1524,7 +1529,7 @@ out:
static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor)
{
- struct inode *inode = filp->f_dentry->d_inode;
+ struct inode *inode = filp->f_path.dentry->d_inode;
struct address_space *mapping = inode->i_mapping;
unsigned long index, offset;
@@ -1943,7 +1948,7 @@ static int shmem_xattr_security_set(struct inode *inode, const char *name,
return security_inode_setsecurity(inode, name, value, size, flags);
}
-struct xattr_handler shmem_xattr_security_handler = {
+static struct xattr_handler shmem_xattr_security_handler = {
.prefix = XATTR_SECURITY_PREFIX,
.list = shmem_xattr_security_list,
.get = shmem_xattr_security_get,
@@ -2263,7 +2268,7 @@ static struct kmem_cache *shmem_inode_cachep;
static struct inode *shmem_alloc_inode(struct super_block *sb)
{
struct shmem_inode_info *p;
- p = (struct shmem_inode_info *)kmem_cache_alloc(shmem_inode_cachep, SLAB_KERNEL);
+ p = (struct shmem_inode_info *)kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL);
if (!p)
return NULL;
return &p->vfs_inode;
@@ -2319,7 +2324,7 @@ static const struct address_space_operations shmem_aops = {
.migratepage = migrate_page,
};
-static struct file_operations shmem_file_operations = {
+static const struct file_operations shmem_file_operations = {
.mmap = shmem_mmap,
#ifdef CONFIG_TMPFS
.llseek = generic_file_llseek,
@@ -2493,8 +2498,8 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
d_instantiate(dentry, inode);
inode->i_size = size;
inode->i_nlink = 0; /* It is unlinked */
- file->f_vfsmnt = mntget(shm_mnt);
- file->f_dentry = dentry;
+ file->f_path.mnt = mntget(shm_mnt);
+ file->f_path.dentry = dentry;
file->f_mapping = inode->i_mapping;
file->f_op = &shmem_file_operations;
file->f_mode = FMODE_WRITE | FMODE_READ;
diff --git a/mm/slab.c b/mm/slab.c
index 3c4a7e34edd..c6100628a6e 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -103,12 +103,14 @@
#include <linux/module.h>
#include <linux/rcupdate.h>
#include <linux/string.h>
+#include <linux/uaccess.h>
#include <linux/nodemask.h>
#include <linux/mempolicy.h>
#include <linux/mutex.h>
+#include <linux/fault-inject.h>
#include <linux/rtmutex.h>
+#include <linux/reciprocal_div.h>
-#include <asm/uaccess.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
#include <asm/page.h>
@@ -313,7 +315,7 @@ static int drain_freelist(struct kmem_cache *cache,
static void free_block(struct kmem_cache *cachep, void **objpp, int len,
int node);
static int enable_cpucache(struct kmem_cache *cachep);
-static void cache_reap(void *unused);
+static void cache_reap(struct work_struct *unused);
/*
* This function must be completely optimized away if a constant is passed to
@@ -385,6 +387,7 @@ struct kmem_cache {
unsigned int shared;
unsigned int buffer_size;
+ u32 reciprocal_buffer_size;
/* 3) touched by every alloc & free from the backend */
struct kmem_list3 *nodelists[MAX_NUMNODES];
@@ -626,10 +629,17 @@ static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab,
return slab->s_mem + cache->buffer_size * idx;
}
-static inline unsigned int obj_to_index(struct kmem_cache *cache,
- struct slab *slab, void *obj)
+/*
+ * We want to avoid an expensive divide : (offset / cache->buffer_size)
+ * Using the fact that buffer_size is a constant for a particular cache,
+ * we can replace (offset / cache->buffer_size) by
+ * reciprocal_divide(offset, cache->reciprocal_buffer_size)
+ */
+static inline unsigned int obj_to_index(const struct kmem_cache *cache,
+ const struct slab *slab, void *obj)
{
- return (unsigned)(obj - slab->s_mem) / cache->buffer_size;
+ u32 offset = (obj - slab->s_mem);
+ return reciprocal_divide(offset, cache->reciprocal_buffer_size);
}
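
The new obj_to_index() above trades a per-object division for a multiply-and-shift using a precomputed reciprocal. The user-space sketch below shows the usual ceil(2^32 / d) construction behind that trick and checks it against plain division; the helper names are demo assumptions, and the in-kernel reciprocal_value()/reciprocal_divide() may differ in detail. The check only uses offsets that are whole multiples of the object size, which is exactly the slab case and what keeps the shortcut exact.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t demo_reciprocal_value(uint32_t d)
{
	/* ceil(2^32 / d), so multiples of d divide exactly via the fast path */
	uint64_t val = ((uint64_t)1 << 32) + (d - 1);

	return (uint32_t)(val / d);
}

static uint32_t demo_reciprocal_divide(uint32_t a, uint32_t recip)
{
	return (uint32_t)(((uint64_t)a * recip) >> 32);
}

int main(void)
{
	uint32_t buffer_size = 192;	/* a plausible object size */
	uint32_t recip = demo_reciprocal_value(buffer_size);
	uint32_t offset;

	for (offset = 0; offset < 100000; offset += buffer_size) {
		uint32_t idx_slow = offset / buffer_size;
		uint32_t idx_fast = demo_reciprocal_divide(offset, recip);

		assert(idx_slow == idx_fast);
	}
	printf("reciprocal of %u is %u, indices match\n", buffer_size, recip);
	return 0;
}
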
/*
@@ -730,7 +740,10 @@ static inline void init_lock_keys(void)
}
#endif
-/* Guard access to the cache-chain. */
+/*
+ * 1. Guard access to the cache-chain.
+ * 2. Protect sanity of cpu_online_map against cpu hotplug events
+ */
static DEFINE_MUTEX(cache_chain_mutex);
static struct list_head cache_chain;
@@ -753,7 +766,7 @@ int slab_is_available(void)
return g_cpucache_up == FULL;
}
-static DEFINE_PER_CPU(struct work_struct, reap_work);
+static DEFINE_PER_CPU(struct delayed_work, reap_work);
static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
{
@@ -866,6 +879,22 @@ static void __slab_error(const char *function, struct kmem_cache *cachep,
dump_stack();
}
+/*
+ * By default on NUMA we use alien caches to stage the freeing of
+ * objects allocated from other nodes. This causes massive memory
+ * inefficiencies when using fake NUMA setup to split memory into a
+ * large number of small nodes, so it can be disabled on the command
+ * line
+ */
+
+static int use_alien_caches __read_mostly = 1;
+static int __init noaliencache_setup(char *s)
+{
+ use_alien_caches = 0;
+ return 1;
+}
+__setup("noaliencache", noaliencache_setup);
+
#ifdef CONFIG_NUMA
/*
* Special reaping functions for NUMA systems called from cache_reap().
@@ -916,17 +945,18 @@ static void next_reap_node(void)
*/
static void __devinit start_cpu_timer(int cpu)
{
- struct work_struct *reap_work = &per_cpu(reap_work, cpu);
+ struct delayed_work *reap_work = &per_cpu(reap_work, cpu);
/*
* When this gets called from do_initcalls via cpucache_init(),
* init_workqueues() has already run, so keventd will be setup
* at that time.
*/
- if (keventd_up() && reap_work->func == NULL) {
+ if (keventd_up() && reap_work->work.func == NULL) {
init_reap_node(cpu);
- INIT_WORK(reap_work, cache_reap, NULL);
- schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu);
+ INIT_DELAYED_WORK(reap_work, cache_reap);
+ schedule_delayed_work_on(cpu, reap_work,
+ __round_jiffies_relative(HZ, cpu));
}
}
@@ -996,7 +1026,7 @@ static inline void *alternate_node_alloc(struct kmem_cache *cachep,
return NULL;
}
-static inline void *__cache_alloc_node(struct kmem_cache *cachep,
+static inline void *____cache_alloc_node(struct kmem_cache *cachep,
gfp_t flags, int nodeid)
{
return NULL;
@@ -1004,7 +1034,7 @@ static inline void *__cache_alloc_node(struct kmem_cache *cachep,
#else /* CONFIG_NUMA */
-static void *__cache_alloc_node(struct kmem_cache *, gfp_t, int);
+static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int);
static void *alternate_node_alloc(struct kmem_cache *, gfp_t);
static struct array_cache **alloc_alien_cache(int node, int limit)
@@ -1114,7 +1144,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
 * Make sure we are not freeing an object from another node to the array
* cache on this cpu.
*/
- if (likely(slabp->nodeid == node))
+ if (likely(slabp->nodeid == node) || unlikely(!use_alien_caches))
return 0;
l3 = cachep->nodelists[node];
@@ -1192,7 +1222,7 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
list_for_each_entry(cachep, &cache_chain, next) {
struct array_cache *nc;
struct array_cache *shared;
- struct array_cache **alien;
+ struct array_cache **alien = NULL;
nc = alloc_arraycache(node, cachep->limit,
cachep->batchcount);
@@ -1204,9 +1234,11 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
if (!shared)
goto bad;
- alien = alloc_alien_cache(node, cachep->limit);
- if (!alien)
- goto bad;
+ if (use_alien_caches) {
+ alien = alloc_alien_cache(node, cachep->limit);
+ if (!alien)
+ goto bad;
+ }
cachep->array[cpu] = nc;
l3 = cachep->nodelists[node];
BUG_ON(!l3);
@@ -1230,12 +1262,18 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
kfree(shared);
free_alien_cache(alien);
}
- mutex_unlock(&cache_chain_mutex);
break;
case CPU_ONLINE:
+ mutex_unlock(&cache_chain_mutex);
start_cpu_timer(cpu);
break;
#ifdef CONFIG_HOTPLUG_CPU
+ case CPU_DOWN_PREPARE:
+ mutex_lock(&cache_chain_mutex);
+ break;
+ case CPU_DOWN_FAILED:
+ mutex_unlock(&cache_chain_mutex);
+ break;
case CPU_DEAD:
/*
* Even if all the cpus of a node are down, we don't free the
@@ -1246,8 +1284,8 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
* gets destroyed at kmem_cache_destroy().
*/
/* fall thru */
+#endif
case CPU_UP_CANCELED:
- mutex_lock(&cache_chain_mutex);
list_for_each_entry(cachep, &cache_chain, next) {
struct array_cache *nc;
struct array_cache *shared;
@@ -1308,11 +1346,9 @@ free_array_cache:
}
mutex_unlock(&cache_chain_mutex);
break;
-#endif
}
return NOTIFY_OK;
bad:
- mutex_unlock(&cache_chain_mutex);
return NOTIFY_BAD;
}
@@ -1400,6 +1436,8 @@ void __init kmem_cache_init(void)
cache_cache.buffer_size = ALIGN(cache_cache.buffer_size,
cache_line_size());
+ cache_cache.reciprocal_buffer_size =
+ reciprocal_value(cache_cache.buffer_size);
for (order = 0; order < MAX_ORDER; order++) {
cache_estimate(order, cache_cache.buffer_size,
@@ -1580,12 +1618,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
flags |= __GFP_COMP;
#endif
- /*
- * Under NUMA we want memory on the indicated node. We will handle
- * the needed fallback ourselves since we want to serve from our
- * per node object lists first for other nodes.
- */
- flags |= cachep->gfpflags | GFP_THISNODE;
+ flags |= cachep->gfpflags;
page = alloc_pages_node(nodeid, flags, cachep->gfporder);
if (!page)
@@ -2098,15 +2131,12 @@ kmem_cache_create (const char *name, size_t size, size_t align,
}
/*
- * Prevent CPUs from coming and going.
- * lock_cpu_hotplug() nests outside cache_chain_mutex
+ * We use cache_chain_mutex to ensure a consistent view of
+ * cpu_online_map as well. Please see cpuup_callback
*/
- lock_cpu_hotplug();
-
mutex_lock(&cache_chain_mutex);
list_for_each_entry(pc, &cache_chain, next) {
- mm_segment_t old_fs = get_fs();
char tmp;
int res;
@@ -2115,9 +2145,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
* destroy its slab cache and no-one else reuses the vmalloc
* area of the module. Print a warning.
*/
- set_fs(KERNEL_DS);
- res = __get_user(tmp, pc->name);
- set_fs(old_fs);
+ res = probe_kernel_address(pc->name, tmp);
if (res) {
printk("SLAB: cache with size %d has lost its name\n",
pc->buffer_size);
@@ -2197,25 +2225,24 @@ kmem_cache_create (const char *name, size_t size, size_t align,
if (flags & SLAB_RED_ZONE || flags & SLAB_STORE_USER)
ralign = BYTES_PER_WORD;
- /* 2) arch mandated alignment: disables debug if necessary */
+ /* 2) arch mandated alignment */
if (ralign < ARCH_SLAB_MINALIGN) {
ralign = ARCH_SLAB_MINALIGN;
- if (ralign > BYTES_PER_WORD)
- flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
}
- /* 3) caller mandated alignment: disables debug if necessary */
+ /* 3) caller mandated alignment */
if (ralign < align) {
ralign = align;
- if (ralign > BYTES_PER_WORD)
- flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
}
+ /* disable debug if necessary */
+ if (ralign > BYTES_PER_WORD)
+ flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
/*
* 4) Store it.
*/
align = ralign;
/* Get cache's description obj. */
- cachep = kmem_cache_zalloc(&cache_cache, SLAB_KERNEL);
+ cachep = kmem_cache_zalloc(&cache_cache, GFP_KERNEL);
if (!cachep)
goto oops;
@@ -2297,6 +2324,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
if (flags & SLAB_CACHE_DMA)
cachep->gfpflags |= GFP_DMA;
cachep->buffer_size = size;
+ cachep->reciprocal_buffer_size = reciprocal_value(size);
if (flags & CFLGS_OFF_SLAB) {
cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u);
@@ -2326,7 +2354,6 @@ oops:
panic("kmem_cache_create(): failed to create slab `%s'\n",
name);
mutex_unlock(&cache_chain_mutex);
- unlock_cpu_hotplug();
return cachep;
}
EXPORT_SYMBOL(kmem_cache_create);
@@ -2444,6 +2471,7 @@ out:
return nr_freed;
}
+/* Called with cache_chain_mutex held to protect against cpu hotplug */
static int __cache_shrink(struct kmem_cache *cachep)
{
int ret = 0, i = 0;
@@ -2474,9 +2502,13 @@ static int __cache_shrink(struct kmem_cache *cachep)
*/
int kmem_cache_shrink(struct kmem_cache *cachep)
{
+ int ret;
BUG_ON(!cachep || in_interrupt());
- return __cache_shrink(cachep);
+ mutex_lock(&cache_chain_mutex);
+ ret = __cache_shrink(cachep);
+ mutex_unlock(&cache_chain_mutex);
+ return ret;
}
EXPORT_SYMBOL(kmem_cache_shrink);
@@ -2500,23 +2532,16 @@ void kmem_cache_destroy(struct kmem_cache *cachep)
{
BUG_ON(!cachep || in_interrupt());
- /* Don't let CPUs to come and go */
- lock_cpu_hotplug();
-
/* Find the cache in the chain of caches. */
mutex_lock(&cache_chain_mutex);
/*
* the chain is never empty, cache_cache is never destroyed
*/
list_del(&cachep->next);
- mutex_unlock(&cache_chain_mutex);
-
if (__cache_shrink(cachep)) {
slab_error(cachep, "Can't free all objects");
- mutex_lock(&cache_chain_mutex);
list_add(&cachep->next, &cache_chain);
mutex_unlock(&cache_chain_mutex);
- unlock_cpu_hotplug();
return;
}
@@ -2524,7 +2549,7 @@ void kmem_cache_destroy(struct kmem_cache *cachep)
synchronize_rcu();
__kmem_cache_destroy(cachep);
- unlock_cpu_hotplug();
+ mutex_unlock(&cache_chain_mutex);
}
EXPORT_SYMBOL(kmem_cache_destroy);
@@ -2548,7 +2573,7 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
if (OFF_SLAB(cachep)) {
/* Slab management obj is off-slab. */
slabp = kmem_cache_alloc_node(cachep->slabp_cache,
- local_flags, nodeid);
+ local_flags & ~GFP_THISNODE, nodeid);
if (!slabp)
return NULL;
} else {
@@ -2618,7 +2643,7 @@ static void cache_init_objs(struct kmem_cache *cachep,
static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
{
- if (flags & SLAB_DMA)
+ if (flags & GFP_DMA)
BUG_ON(!(cachep->gfpflags & GFP_DMA));
else
BUG_ON(cachep->gfpflags & GFP_DMA);
@@ -2689,10 +2714,10 @@ static void slab_map_pages(struct kmem_cache *cache, struct slab *slab,
* Grow (by 1) the number of slabs within a cache. This is called by
* kmem_cache_alloc() when there are no active objs left in a cache.
*/
-static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
+static int cache_grow(struct kmem_cache *cachep,
+ gfp_t flags, int nodeid, void *objp)
{
struct slab *slabp;
- void *objp;
size_t offset;
gfp_t local_flags;
unsigned long ctor_flags;
@@ -2702,12 +2727,12 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
* Be lazy and only check for valid flags here, keeping it out of the
* critical path in kmem_cache_alloc().
*/
- BUG_ON(flags & ~(SLAB_DMA | SLAB_LEVEL_MASK | SLAB_NO_GROW));
- if (flags & SLAB_NO_GROW)
+ BUG_ON(flags & ~(GFP_DMA | GFP_LEVEL_MASK | __GFP_NO_GROW));
+ if (flags & __GFP_NO_GROW)
return 0;
ctor_flags = SLAB_CTOR_CONSTRUCTOR;
- local_flags = (flags & SLAB_LEVEL_MASK);
+ local_flags = (flags & GFP_LEVEL_MASK);
if (!(local_flags & __GFP_WAIT))
/*
* Not allowed to sleep. Need to tell a constructor about
@@ -2744,12 +2769,14 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
* Get mem for the objs. Attempt to allocate a physical page from
* 'nodeid'.
*/
- objp = kmem_getpages(cachep, flags, nodeid);
+ if (!objp)
+ objp = kmem_getpages(cachep, flags, nodeid);
if (!objp)
goto failed;
/* Get slab management. */
- slabp = alloc_slabmgmt(cachep, objp, offset, local_flags, nodeid);
+ slabp = alloc_slabmgmt(cachep, objp, offset,
+ local_flags & ~GFP_THISNODE, nodeid);
if (!slabp)
goto opps1;
@@ -2987,7 +3014,7 @@ alloc_done:
if (unlikely(!ac->avail)) {
int x;
- x = cache_grow(cachep, flags, node);
+ x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);
/* cache_grow can reenable interrupts, then ac could change. */
ac = cpu_cache_get(cachep);
@@ -3063,18 +3090,101 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
cachep->ctor(objp, cachep, ctor_flags);
}
+#if ARCH_SLAB_MINALIGN
+ if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) {
+ printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n",
+ objp, ARCH_SLAB_MINALIGN);
+ }
+#endif
return objp;
}
#else
#define cache_alloc_debugcheck_after(a,b,objp,d) (objp)
#endif
+#ifdef CONFIG_FAILSLAB
+
+static struct failslab_attr {
+
+ struct fault_attr attr;
+
+ u32 ignore_gfp_wait;
+#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
+ struct dentry *ignore_gfp_wait_file;
+#endif
+
+} failslab = {
+ .attr = FAULT_ATTR_INITIALIZER,
+ .ignore_gfp_wait = 1,
+};
+
+static int __init setup_failslab(char *str)
+{
+ return setup_fault_attr(&failslab.attr, str);
+}
+__setup("failslab=", setup_failslab);
+
+static int should_failslab(struct kmem_cache *cachep, gfp_t flags)
+{
+ if (cachep == &cache_cache)
+ return 0;
+ if (flags & __GFP_NOFAIL)
+ return 0;
+ if (failslab.ignore_gfp_wait && (flags & __GFP_WAIT))
+ return 0;
+
+ return should_fail(&failslab.attr, obj_size(cachep));
+}
+
+#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
+
+static int __init failslab_debugfs(void)
+{
+ mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
+ struct dentry *dir;
+ int err;
+
+ err = init_fault_attr_dentries(&failslab.attr, "failslab");
+ if (err)
+ return err;
+ dir = failslab.attr.dentries.dir;
+
+ failslab.ignore_gfp_wait_file =
+ debugfs_create_bool("ignore-gfp-wait", mode, dir,
+ &failslab.ignore_gfp_wait);
+
+ if (!failslab.ignore_gfp_wait_file) {
+ err = -ENOMEM;
+ debugfs_remove(failslab.ignore_gfp_wait_file);
+ cleanup_fault_attr_dentries(&failslab.attr);
+ }
+
+ return err;
+}
+
+late_initcall(failslab_debugfs);
+
+#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
+
+#else /* CONFIG_FAILSLAB */
+
+static inline int should_failslab(struct kmem_cache *cachep, gfp_t flags)
+{
+ return 0;
+}
+
+#endif /* CONFIG_FAILSLAB */
+
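
should_failslab() above gates fault injection before consulting the fault attribute: the bootstrap cache and __GFP_NOFAIL allocations are never failed, and __GFP_WAIT allocations are skipped while ignore_gfp_wait is set. The sketch below replays that gating order in user space with a made-up every-Nth should_fail() stub; the real decision logic lives in lib/fault-inject.c and is more involved.

#include <stdbool.h>
#include <stdio.h>

#define DEMO_GFP_WAIT   0x10u
#define DEMO_GFP_NOFAIL 0x800u

struct demo_cache { const char *name; bool is_cache_cache; };

static unsigned int calls, interval = 4;	/* stub: fail every 4th call */
static bool ignore_gfp_wait = true;

static bool stub_should_fail(void)
{
	return ++calls % interval == 0;
}

static bool demo_should_failslab(const struct demo_cache *c, unsigned int flags)
{
	if (c->is_cache_cache)
		return false;
	if (flags & DEMO_GFP_NOFAIL)
		return false;
	if (ignore_gfp_wait && (flags & DEMO_GFP_WAIT))
		return false;
	return stub_should_fail();
}

int main(void)
{
	struct demo_cache generic = { "demo-cache", false };
	unsigned int i;

	for (i = 0; i < 10; i++)
		printf("atomic alloc %u: %s\n", i,
		       demo_should_failslab(&generic, 0) ? "inject failure" : "ok");
	return 0;
}
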
static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
void *objp;
struct array_cache *ac;
check_irq_off();
+
+ if (should_failslab(cachep, flags))
+ return NULL;
+
ac = cpu_cache_get(cachep);
if (likely(ac->avail)) {
STATS_INC_ALLOCHIT(cachep);
@@ -3105,10 +3215,10 @@ static __always_inline void *__cache_alloc(struct kmem_cache *cachep,
objp = ____cache_alloc(cachep, flags);
/*
* We may just have run out of memory on the local node.
- * __cache_alloc_node() knows how to locate memory on other nodes
+ * ____cache_alloc_node() knows how to locate memory on other nodes
*/
if (NUMA_BUILD && !objp)
- objp = __cache_alloc_node(cachep, flags, numa_node_id());
+ objp = ____cache_alloc_node(cachep, flags, numa_node_id());
local_irq_restore(save_flags);
objp = cache_alloc_debugcheck_after(cachep, flags, objp,
caller);
@@ -3135,15 +3245,17 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
else if (current->mempolicy)
nid_alloc = slab_node(current->mempolicy);
if (nid_alloc != nid_here)
- return __cache_alloc_node(cachep, flags, nid_alloc);
+ return ____cache_alloc_node(cachep, flags, nid_alloc);
return NULL;
}
/*
* Fallback function if there was no memory available and no objects on a
- * certain node and we are allowed to fall back. We mimick the behavior of
- * the page allocator. We fall back according to a zonelist determined by
- * the policy layer while obeying cpuset constraints.
+ * certain node and fall back is permitted. First we scan all the
+ * available nodelists for available objects. If that fails then we
+ * perform an allocation without specifying a node. This allows the page
+ * allocator to do its reclaim / fallback magic. We then insert the
+ * slab into the proper nodelist and then allocate from it.
*/
void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
{
@@ -3151,15 +3263,57 @@ void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
->node_zonelists[gfp_zone(flags)];
struct zone **z;
void *obj = NULL;
+ int nid;
+ gfp_t local_flags = (flags & GFP_LEVEL_MASK);
+retry:
+ /*
+ * Look through allowed nodes for objects available
+ * from existing per node queues.
+ */
for (z = zonelist->zones; *z && !obj; z++) {
- int nid = zone_to_nid(*z);
+ nid = zone_to_nid(*z);
- if (zone_idx(*z) <= ZONE_NORMAL &&
- cpuset_zone_allowed(*z, flags) &&
- cache->nodelists[nid])
- obj = __cache_alloc_node(cache,
- flags | __GFP_THISNODE, nid);
+ if (cpuset_zone_allowed_hardwall(*z, flags) &&
+ cache->nodelists[nid] &&
+ cache->nodelists[nid]->free_objects)
+ obj = ____cache_alloc_node(cache,
+ flags | GFP_THISNODE, nid);
+ }
+
+ if (!obj && !(flags & __GFP_NO_GROW)) {
+ /*
+ * This allocation will be performed within the constraints
+ * of the current cpuset / memory policy requirements.
+ * We may trigger various forms of reclaim on the allowed
+ * set and go into memory reserves if necessary.
+ */
+ if (local_flags & __GFP_WAIT)
+ local_irq_enable();
+ kmem_flagcheck(cache, flags);
+ obj = kmem_getpages(cache, flags, -1);
+ if (local_flags & __GFP_WAIT)
+ local_irq_disable();
+ if (obj) {
+ /*
+ * Insert into the appropriate per node queues
+ */
+ nid = page_to_nid(virt_to_page(obj));
+ if (cache_grow(cache, flags, nid, obj)) {
+ obj = ____cache_alloc_node(cache,
+ flags | GFP_THISNODE, nid);
+ if (!obj)
+ /*
+ * Another processor may allocate the
+ * objects in the slab since we are
+ * not holding any locks.
+ */
+ goto retry;
+ } else {
+ /* cache_grow already freed obj */
+ obj = NULL;
+ }
+ }
}
return obj;
}
@@ -3167,7 +3321,7 @@ void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
/*
 * An interface to enable slab creation on nodeid
*/
-static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
+static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
int nodeid)
{
struct list_head *entry;
@@ -3216,7 +3370,7 @@ retry:
must_grow:
spin_unlock(&l3->list_lock);
- x = cache_grow(cachep, flags, nodeid);
+ x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL);
if (x)
goto retry;
@@ -3399,7 +3553,7 @@ EXPORT_SYMBOL(kmem_cache_zalloc);
*
* Currently only used for dentry validation.
*/
-int fastcall kmem_ptr_validate(struct kmem_cache *cachep, void *ptr)
+int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr)
{
unsigned long addr = (unsigned long)ptr;
unsigned long min_addr = PAGE_OFFSET;
@@ -3433,36 +3587,61 @@ out:
* @cachep: The cache to allocate from.
* @flags: See kmalloc().
* @nodeid: node number of the target node.
+ * @caller: return address of caller, used for debug information
+ *
+ * Identical to kmem_cache_alloc but it will allocate memory on the given
+ * node, which can improve the performance for cpu bound structures.
*
- * Identical to kmem_cache_alloc, except that this function is slow
- * and can sleep. And it will allocate memory on the given node, which
- * can improve the performance for cpu bound structures.
- * New and improved: it will now make sure that the object gets
- * put on the correct node list so that there is no false sharing.
+ * Fallback to other node is possible if __GFP_THISNODE is not set.
*/
-void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
+static __always_inline void *
+__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
+ int nodeid, void *caller)
{
unsigned long save_flags;
- void *ptr;
+ void *ptr = NULL;
cache_alloc_debugcheck_before(cachep, flags);
local_irq_save(save_flags);
- if (nodeid == -1 || nodeid == numa_node_id() ||
- !cachep->nodelists[nodeid])
- ptr = ____cache_alloc(cachep, flags);
- else
- ptr = __cache_alloc_node(cachep, flags, nodeid);
- local_irq_restore(save_flags);
+ if (unlikely(nodeid == -1))
+ nodeid = numa_node_id();
- ptr = cache_alloc_debugcheck_after(cachep, flags, ptr,
- __builtin_return_address(0));
+ if (likely(cachep->nodelists[nodeid])) {
+ if (nodeid == numa_node_id()) {
+ /*
+ * Use the locally cached objects if possible.
+ * However ____cache_alloc does not allow fallback
+ * to other nodes. It may fail while we still have
+ * objects on other nodes available.
+ */
+ ptr = ____cache_alloc(cachep, flags);
+ }
+ if (!ptr) {
+ /* ____cache_alloc_node can fall back to other nodes */
+ ptr = ____cache_alloc_node(cachep, flags, nodeid);
+ }
+ } else {
+ /* Node not bootstrapped yet */
+ if (!(flags & __GFP_THISNODE))
+ ptr = fallback_alloc(cachep, flags);
+ }
+
+ local_irq_restore(save_flags);
+ ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
return ptr;
}
+
+void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
+{
+ return __cache_alloc_node(cachep, flags, nodeid,
+ __builtin_return_address(0));
+}
EXPORT_SYMBOL(kmem_cache_alloc_node);
-void *__kmalloc_node(size_t size, gfp_t flags, int node)
+static __always_inline void *
+__do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller)
{
struct kmem_cache *cachep;
@@ -3471,8 +3650,29 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node)
return NULL;
return kmem_cache_alloc_node(cachep, flags, node);
}
+
+#ifdef CONFIG_DEBUG_SLAB
+void *__kmalloc_node(size_t size, gfp_t flags, int node)
+{
+ return __do_kmalloc_node(size, flags, node,
+ __builtin_return_address(0));
+}
EXPORT_SYMBOL(__kmalloc_node);
-#endif
+
+void *__kmalloc_node_track_caller(size_t size, gfp_t flags,
+ int node, void *caller)
+{
+ return __do_kmalloc_node(size, flags, node, caller);
+}
+EXPORT_SYMBOL(__kmalloc_node_track_caller);
+#else
+void *__kmalloc_node(size_t size, gfp_t flags, int node)
+{
+ return __do_kmalloc_node(size, flags, node, NULL);
+}
+EXPORT_SYMBOL(__kmalloc_node);
+#endif /* CONFIG_DEBUG_SLAB */
+#endif /* CONFIG_NUMA */
/**
* __do_kmalloc - allocate memory
@@ -3583,13 +3783,15 @@ static int alloc_kmemlist(struct kmem_cache *cachep)
int node;
struct kmem_list3 *l3;
struct array_cache *new_shared;
- struct array_cache **new_alien;
+ struct array_cache **new_alien = NULL;
for_each_online_node(node) {
- new_alien = alloc_alien_cache(node, cachep->limit);
- if (!new_alien)
- goto fail;
+ if (use_alien_caches) {
+ new_alien = alloc_alien_cache(node, cachep->limit);
+ if (!new_alien)
+ goto fail;
+ }
new_shared = alloc_arraycache(node,
cachep->shared*cachep->batchcount,
@@ -3815,7 +4017,7 @@ void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
* If we cannot acquire the cache chain mutex then just give up - we'll try
* again on the next iteration.
*/
-static void cache_reap(void *unused)
+static void cache_reap(struct work_struct *unused)
{
struct kmem_cache *searchp;
struct kmem_list3 *l3;
@@ -3824,7 +4026,7 @@ static void cache_reap(void *unused)
if (!mutex_trylock(&cache_chain_mutex)) {
/* Give up. Setup the next iteration. */
schedule_delayed_work(&__get_cpu_var(reap_work),
- REAPTIMEOUT_CPUC);
+ round_jiffies_relative(REAPTIMEOUT_CPUC));
return;
}
@@ -3870,7 +4072,8 @@ next:
next_reap_node();
refresh_cpu_vm_stats(smp_processor_id());
/* Set up the next iteration */
- schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC);
+ schedule_delayed_work(&__get_cpu_var(reap_work),
+ round_jiffies_relative(REAPTIMEOUT_CPUC));
}
#ifdef CONFIG_PROC_FS
@@ -4038,7 +4241,7 @@ static int s_show(struct seq_file *m, void *p)
* + further values on SMP and with statistics enabled
*/
-struct seq_operations slabinfo_op = {
+const struct seq_operations slabinfo_op = {
.start = s_start,
.next = s_next,
.stop = s_stop,
@@ -4236,7 +4439,7 @@ static int leaks_show(struct seq_file *m, void *p)
return 0;
}
-struct seq_operations slabstats_op = {
+const struct seq_operations slabstats_op = {
.start = leaks_start,
.next = s_next,
.stop = s_stop,
diff --git a/mm/slob.c b/mm/slob.c
index 542394184a5..5adc29cb58d 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -60,6 +60,8 @@ static DEFINE_SPINLOCK(slob_lock);
static DEFINE_SPINLOCK(block_lock);
static void slob_free(void *b, int size);
+static void slob_timer_cbk(void);
+
static void *slob_alloc(size_t size, gfp_t gfp, int align)
{
@@ -157,7 +159,7 @@ static int fastcall find_order(int size)
return order;
}
-void *kmalloc(size_t size, gfp_t gfp)
+void *__kmalloc(size_t size, gfp_t gfp)
{
slob_t *m;
bigblock_t *bb;
@@ -186,8 +188,7 @@ void *kmalloc(size_t size, gfp_t gfp)
slob_free(bb, sizeof(bigblock_t));
return 0;
}
-
-EXPORT_SYMBOL(kmalloc);
+EXPORT_SYMBOL(__kmalloc);
void kfree(const void *block)
{
@@ -327,9 +328,25 @@ const char *kmem_cache_name(struct kmem_cache *c)
EXPORT_SYMBOL(kmem_cache_name);
static struct timer_list slob_timer = TIMER_INITIALIZER(
- (void (*)(unsigned long))kmem_cache_init, 0, 0);
+ (void (*)(unsigned long))slob_timer_cbk, 0, 0);
+
+int kmem_cache_shrink(struct kmem_cache *d)
+{
+ return 0;
+}
+EXPORT_SYMBOL(kmem_cache_shrink);
+
+int kmem_ptr_validate(struct kmem_cache *a, const void *b)
+{
+ return 0;
+}
+
+void __init kmem_cache_init(void)
+{
+ slob_timer_cbk();
+}
-void kmem_cache_init(void)
+static void slob_timer_cbk(void)
{
void *p = slob_alloc(PAGE_SIZE, 0, PAGE_SIZE-1);
diff --git a/mm/sparse.c b/mm/sparse.c
index b3c82ba3001..ac26eb0d73c 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -24,6 +24,25 @@ struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]
#endif
EXPORT_SYMBOL(mem_section);
+#ifdef NODE_NOT_IN_PAGE_FLAGS
+/*
+ * If we did not store the node number in the page then we have to
+ * do a lookup in the section_to_node_table in order to find which
+ * node the page belongs to.
+ */
+#if MAX_NUMNODES <= 256
+static u8 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
+#else
+static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
+#endif
+
+int page_to_nid(struct page *page)
+{
+ return section_to_node_table[page_to_section(page)];
+}
+EXPORT_SYMBOL(page_to_nid);
+#endif
+
#ifdef CONFIG_SPARSEMEM_EXTREME
static struct mem_section *sparse_index_alloc(int nid)
{
@@ -49,6 +68,10 @@ static int sparse_index_init(unsigned long section_nr, int nid)
struct mem_section *section;
int ret = 0;
+#ifdef NODE_NOT_IN_PAGE_FLAGS
+ section_to_node_table[section_nr] = nid;
+#endif
+
if (mem_section[root])
return -EEXIST;
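
The NODE_NOT_IN_PAGE_FLAGS path above records each section's node at sparse_index_init() time and resolves page_to_nid() through that table (u8 for up to 256 nodes, u16 beyond). Below is a minimal user-space sketch of the same lookup; the section size and layout are demo assumptions.

#include <stdint.h>
#include <stdio.h>

#define DEMO_SECTION_SHIFT 15			/* pretend 2^15 pages per section */
#define DEMO_NR_SECTIONS   64

/* u8 is enough for <= 256 nodes; a u16 table would be used beyond that */
static uint8_t section_to_node_table[DEMO_NR_SECTIONS];

static unsigned int demo_page_to_section(unsigned long pfn)
{
	return (unsigned int)(pfn >> DEMO_SECTION_SHIFT);
}

static int demo_page_to_nid(unsigned long pfn)
{
	return section_to_node_table[demo_page_to_section(pfn)];
}

int main(void)
{
	unsigned int s;

	/* the sparse_index_init() step: remember which node owns each section */
	for (s = 0; s < 8; s++)
		section_to_node_table[s] = (uint8_t)(s < 4 ? 0 : 1);

	printf("pfn 0x10000 -> node %d\n", demo_page_to_nid(0x10000));
	printf("pfn 0x28000 -> node %d\n", demo_page_to_nid(0x28000));
	return 0;
}
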
diff --git a/mm/swap.c b/mm/swap.c
index 2e0e871f542..2ed7be39795 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -57,9 +57,9 @@ static void put_compound_page(struct page *page)
{
page = (struct page *)page_private(page);
if (put_page_testzero(page)) {
- void (*dtor)(struct page *page);
+ compound_page_dtor *dtor;
- dtor = (void (*)(struct page *))page[1].lru.next;
+ dtor = get_compound_page_dtor(page);
(*dtor)(page);
}
}
@@ -216,7 +216,7 @@ void lru_add_drain(void)
}
#ifdef CONFIG_NUMA
-static void lru_add_drain_per_cpu(void *dummy)
+static void lru_add_drain_per_cpu(struct work_struct *dummy)
{
lru_add_drain();
}
@@ -226,7 +226,7 @@ static void lru_add_drain_per_cpu(void *dummy)
*/
int lru_add_drain_all(void)
{
- return schedule_on_each_cpu(lru_add_drain_per_cpu, NULL);
+ return schedule_on_each_cpu(lru_add_drain_per_cpu);
}
#else
@@ -514,5 +514,7 @@ void __init swap_setup(void)
 * Right now other parts of the system mean that we
* _really_ don't want to cluster much more
*/
+#ifdef CONFIG_HOTPLUG_CPU
hotcpu_notifier(cpu_swap_callback, 0);
+#endif
}
diff --git a/mm/swapfile.c b/mm/swapfile.c
index a15def63f28..a2d9bb4e80d 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -427,34 +427,54 @@ void free_swap_and_cache(swp_entry_t entry)
#ifdef CONFIG_SOFTWARE_SUSPEND
/*
- * Find the swap type that corresponds to given device (if any)
+ * Find the swap type that corresponds to given device (if any).
*
- * This is needed for software suspend and is done in such a way that inode
- * aliasing is allowed.
+ * @offset - number of the PAGE_SIZE-sized block of the device, starting
+ * from 0, in which the swap header is expected to be located.
+ *
+ * This is needed for the suspend to disk (aka swsusp).
*/
-int swap_type_of(dev_t device)
+int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
{
+ struct block_device *bdev = NULL;
int i;
+ if (device)
+ bdev = bdget(device);
+
spin_lock(&swap_lock);
for (i = 0; i < nr_swapfiles; i++) {
- struct inode *inode;
+ struct swap_info_struct *sis = swap_info + i;
- if (!(swap_info[i].flags & SWP_WRITEOK))
+ if (!(sis->flags & SWP_WRITEOK))
continue;
- if (!device) {
+ if (!bdev) {
+ if (bdev_p)
+ *bdev_p = sis->bdev;
+
spin_unlock(&swap_lock);
return i;
}
- inode = swap_info[i].swap_file->f_dentry->d_inode;
- if (S_ISBLK(inode->i_mode) &&
- device == MKDEV(imajor(inode), iminor(inode))) {
- spin_unlock(&swap_lock);
- return i;
+ if (bdev == sis->bdev) {
+ struct swap_extent *se;
+
+ se = list_entry(sis->extent_list.next,
+ struct swap_extent, list);
+ if (se->start_block == offset) {
+ if (bdev_p)
+ *bdev_p = sis->bdev;
+
+ spin_unlock(&swap_lock);
+ bdput(bdev);
+ return i;
+ }
}
}
spin_unlock(&swap_lock);
+ if (bdev)
+ bdput(bdev);
+
return -ENODEV;
}
@@ -931,6 +951,23 @@ sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset)
}
}
+#ifdef CONFIG_SOFTWARE_SUSPEND
+/*
+ * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev
+ * corresponding to given index in swap_info (swap type).
+ */
+sector_t swapdev_block(int swap_type, pgoff_t offset)
+{
+ struct swap_info_struct *sis;
+
+ if (swap_type >= nr_swapfiles)
+ return 0;
+
+ sis = swap_info + swap_type;
+ return (sis->flags & SWP_WRITEOK) ? map_swap_page(sis, offset) : 0;
+}
+#endif /* CONFIG_SOFTWARE_SUSPEND */
+
/*
* Free all of a swapdev's extent information
*/
@@ -1274,10 +1311,13 @@ static void *swap_start(struct seq_file *swap, loff_t *pos)
mutex_lock(&swapon_mutex);
+ if (!l)
+ return SEQ_START_TOKEN;
+
for (i = 0; i < nr_swapfiles; i++, ptr++) {
if (!(ptr->flags & SWP_USED) || !ptr->swap_map)
continue;
- if (!l--)
+ if (!--l)
return ptr;
}
@@ -1286,10 +1326,17 @@ static void *swap_start(struct seq_file *swap, loff_t *pos)
static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
{
- struct swap_info_struct *ptr = v;
+ struct swap_info_struct *ptr;
struct swap_info_struct *endptr = swap_info + nr_swapfiles;
- for (++ptr; ptr < endptr; ptr++) {
+ if (v == SEQ_START_TOKEN)
+ ptr = swap_info;
+ else {
+ ptr = v;
+ ptr++;
+ }
+
+ for (; ptr < endptr; ptr++) {
if (!(ptr->flags & SWP_USED) || !ptr->swap_map)
continue;
++*pos;
@@ -1310,14 +1357,16 @@ static int swap_show(struct seq_file *swap, void *v)
struct file *file;
int len;
- if (v == swap_info)
- seq_puts(swap, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
+ if (ptr == SEQ_START_TOKEN) {
+ seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
+ return 0;
+ }
file = ptr->swap_file;
- len = seq_path(swap, file->f_vfsmnt, file->f_dentry, " \t\n\\");
+ len = seq_path(swap, file->f_path.mnt, file->f_path.dentry, " \t\n\\");
seq_printf(swap, "%*s%s\t%u\t%u\t%d\n",
len < 40 ? 40 - len : 1, " ",
- S_ISBLK(file->f_dentry->d_inode->i_mode) ?
+ S_ISBLK(file->f_path.dentry->d_inode->i_mode) ?
"partition" : "file\t",
ptr->pages << (PAGE_SHIFT - 10),
ptr->inuse_pages << (PAGE_SHIFT - 10),
@@ -1325,7 +1374,7 @@ static int swap_show(struct seq_file *swap, void *v)
return 0;
}
-static struct seq_operations swaps_op = {
+static const struct seq_operations swaps_op = {
.start = swap_start,
.next = swap_next,
.stop = swap_stop,
@@ -1337,7 +1386,7 @@ static int swaps_open(struct inode *inode, struct file *file)
return seq_open(file, &swaps_op);
}
-static struct file_operations proc_swaps_operations = {
+static const struct file_operations proc_swaps_operations = {
.open = swaps_open,
.read = seq_read,
.llseek = seq_lseek,
@@ -1540,6 +1589,11 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
error = -EINVAL;
if (!maxpages)
goto bad_swap;
+ if (swapfilesize && maxpages > swapfilesize) {
+ printk(KERN_WARNING
+ "Swap area shorter than signature indicates\n");
+ goto bad_swap;
+ }
if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
goto bad_swap;
if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
@@ -1567,12 +1621,6 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
goto bad_swap;
}
- if (swapfilesize && maxpages > swapfilesize) {
- printk(KERN_WARNING
- "Swap area shorter than signature indicates\n");
- error = -EINVAL;
- goto bad_swap;
- }
if (nr_good_pages) {
p->swap_map[0] = SWAP_MAP_BAD;
p->max = maxpages;
diff --git a/mm/thrash.c b/mm/thrash.c
index f4c560b4a2b..9ef9071f99b 100644
--- a/mm/thrash.c
+++ b/mm/thrash.c
@@ -7,100 +7,74 @@
*
* Simple token based thrashing protection, using the algorithm
* described in: http://www.cs.wm.edu/~sjiang/token.pdf
+ *
+ * Sep 2006, Ashwin Chaugule <ashwin.chaugule@celunite.com>
+ * Improved algorithm to pass token:
+ * Each task has a priority which is incremented if it contended
+ * for the token in an interval less than its previous attempt.
+ * If the token is acquired, that task's priority is boosted to prevent
+ * the token from bouncing around too often and to let the task make
+ * some progress in its execution.
*/
+
#include <linux/jiffies.h>
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/swap.h>
static DEFINE_SPINLOCK(swap_token_lock);
-static unsigned long swap_token_timeout;
-static unsigned long swap_token_check;
-struct mm_struct * swap_token_mm = &init_mm;
-
-#define SWAP_TOKEN_CHECK_INTERVAL (HZ * 2)
-#define SWAP_TOKEN_TIMEOUT (300 * HZ)
-/*
- * Currently disabled; Needs further code to work at HZ * 300.
- */
-unsigned long swap_token_default_timeout = SWAP_TOKEN_TIMEOUT;
-
-/*
- * Take the token away if the process had no page faults
- * in the last interval, or if it has held the token for
- * too long.
- */
-#define SWAP_TOKEN_ENOUGH_RSS 1
-#define SWAP_TOKEN_TIMED_OUT 2
-static int should_release_swap_token(struct mm_struct *mm)
-{
- int ret = 0;
- if (!mm->recent_pagein)
- ret = SWAP_TOKEN_ENOUGH_RSS;
- else if (time_after(jiffies, swap_token_timeout))
- ret = SWAP_TOKEN_TIMED_OUT;
- mm->recent_pagein = 0;
- return ret;
-}
+struct mm_struct *swap_token_mm;
+static unsigned int global_faults;
-/*
- * Try to grab the swapout protection token. We only try to
- * grab it once every TOKEN_CHECK_INTERVAL, both to prevent
- * SMP lock contention and to check that the process that held
- * the token before is no longer thrashing.
- */
void grab_swap_token(void)
{
- struct mm_struct *mm;
- int reason;
+ int current_interval;
- /* We have the token. Let others know we still need it. */
- if (has_swap_token(current->mm)) {
- current->mm->recent_pagein = 1;
- if (unlikely(!swap_token_default_timeout))
- disable_swap_token();
- return;
- }
-
- if (time_after(jiffies, swap_token_check)) {
+ global_faults++;
- if (!swap_token_default_timeout) {
- swap_token_check = jiffies + SWAP_TOKEN_CHECK_INTERVAL;
- return;
- }
-
- /* ... or if we recently held the token. */
- if (time_before(jiffies, current->mm->swap_token_time))
- return;
+ current_interval = global_faults - current->mm->faultstamp;
- if (!spin_trylock(&swap_token_lock))
- return;
+ if (!spin_trylock(&swap_token_lock))
+ return;
- swap_token_check = jiffies + SWAP_TOKEN_CHECK_INTERVAL;
+ /* First come first served */
+ if (swap_token_mm == NULL) {
+ current->mm->token_priority = current->mm->token_priority + 2;
+ swap_token_mm = current->mm;
+ goto out;
+ }
- mm = swap_token_mm;
- if ((reason = should_release_swap_token(mm))) {
- unsigned long eligible = jiffies;
- if (reason == SWAP_TOKEN_TIMED_OUT) {
- eligible += swap_token_default_timeout;
- }
- mm->swap_token_time = eligible;
- swap_token_timeout = jiffies + swap_token_default_timeout;
+ if (current->mm != swap_token_mm) {
+ if (current_interval < current->mm->last_interval)
+ current->mm->token_priority++;
+ else {
+ current->mm->token_priority--;
+ if (unlikely(current->mm->token_priority < 0))
+ current->mm->token_priority = 0;
+ }
+ /* Check if we deserve the token */
+ if (current->mm->token_priority >
+ swap_token_mm->token_priority) {
+ current->mm->token_priority += 2;
swap_token_mm = current->mm;
}
- spin_unlock(&swap_token_lock);
+ } else {
+ /* Token holder came in again! */
+ current->mm->token_priority += 2;
}
- return;
+
+out:
+ current->mm->faultstamp = global_faults;
+ current->mm->last_interval = current_interval;
+ spin_unlock(&swap_token_lock);
+return;
}
/* Called on process exit. */
void __put_swap_token(struct mm_struct *mm)
{
spin_lock(&swap_token_lock);
- if (likely(mm == swap_token_mm)) {
- mm->swap_token_time = jiffies + SWAP_TOKEN_CHECK_INTERVAL;
- swap_token_mm = &init_mm;
- swap_token_check = jiffies;
- }
+ if (likely(mm == swap_token_mm))
+ swap_token_mm = NULL;
spin_unlock(&swap_token_lock);
}
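
The comment block above summarises the new token-passing rules; the small simulation below replays them for two made-up address spaces so the priority bookkeeping is easier to follow. It mirrors the grab_swap_token() logic in the hunk (first come first served, +1 for contending within a shorter interval than before, +2 on winning or on faulting again while holding), but the fault sequence and the types used are purely illustrative.

#include <stdio.h>

struct demo_mm {
	const char *name;
	int token_priority;
	unsigned int faultstamp;
	unsigned int last_interval;
};

static unsigned int global_faults;
static struct demo_mm *swap_token_mm;

static void demo_grab_swap_token(struct demo_mm *mm)
{
	unsigned int current_interval;

	global_faults++;
	current_interval = global_faults - mm->faultstamp;

	if (swap_token_mm == NULL) {
		/* First come first served */
		mm->token_priority += 2;
		swap_token_mm = mm;
	} else if (mm != swap_token_mm) {
		if (current_interval < mm->last_interval)
			mm->token_priority++;
		else if (--mm->token_priority < 0)
			mm->token_priority = 0;
		/* preempt only a strictly lower-priority holder */
		if (mm->token_priority > swap_token_mm->token_priority) {
			mm->token_priority += 2;
			swap_token_mm = mm;
		}
	} else {
		/* Token holder came in again! */
		mm->token_priority += 2;
	}

	mm->faultstamp = global_faults;
	mm->last_interval = current_interval;
}

int main(void)
{
	struct demo_mm a = { "A", 0, 0, 0 }, b = { "B", 0, 0, 0 };
	struct demo_mm *faults[] = { &a, &b, &b, &b, &a, &b, &b, &b };
	unsigned int i;

	for (i = 0; i < sizeof(faults) / sizeof(faults[0]); i++) {
		demo_grab_swap_token(faults[i]);
		printf("fault by %s -> holder %s (A=%d, B=%d)\n",
		       faults[i]->name, swap_token_mm->name,
		       a.token_priority, b.token_priority);
	}
	return 0;
}
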
diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c
index 5f2cbf0f153..c7f6e1914bc 100644
--- a/mm/tiny-shmem.c
+++ b/mm/tiny-shmem.c
@@ -79,8 +79,8 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
d_instantiate(dentry, inode);
inode->i_nlink = 0; /* It is unlinked */
- file->f_vfsmnt = mntget(shm_mnt);
- file->f_dentry = dentry;
+ file->f_path.mnt = mntget(shm_mnt);
+ file->f_path.dentry = dentry;
file->f_mapping = inode->i_mapping;
file->f_op = &ramfs_file_operations;
file->f_mode = FMODE_WRITE | FMODE_READ;
diff --git a/mm/truncate.c b/mm/truncate.c
index e07b1e682c3..6c79ca4a1ca 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -13,6 +13,7 @@
#include <linux/module.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
+#include <linux/task_io_accounting_ops.h>
#include <linux/buffer_head.h> /* grr. try_to_release_page,
do_invalidatepage */
@@ -50,6 +51,26 @@ static inline void truncate_partial_page(struct page *page, unsigned partial)
do_invalidatepage(page, partial);
}
+void cancel_dirty_page(struct page *page, unsigned int account_size)
+{
+ /* If we're cancelling the page, it had better not be mapped any more */
+ if (page_mapped(page)) {
+ static unsigned int warncount;
+
+ WARN_ON(++warncount < 5);
+ }
+
+ if (TestClearPageDirty(page)) {
+ struct address_space *mapping = page->mapping;
+ if (mapping && mapping_cap_account_dirty(mapping)) {
+ dec_zone_page_state(page, NR_FILE_DIRTY);
+ if (account_size)
+ task_io_account_cancelled_write(account_size);
+ }
+ }
+}
+EXPORT_SYMBOL(cancel_dirty_page);
+
/*
* If truncate cannot remove the fs-private metadata from the page, the page
* becomes anonymous. It will be left on the LRU and may even be mapped into
@@ -66,10 +87,11 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
if (page->mapping != mapping)
return;
+ cancel_dirty_page(page, PAGE_CACHE_SIZE);
+
if (PagePrivate(page))
do_invalidatepage(page, 0);
- clear_page_dirty(page);
ClearPageUptodate(page);
ClearPageMappedToDisk(page);
remove_from_page_cache(page);
@@ -319,6 +341,15 @@ failed:
return 0;
}
+static int do_launder_page(struct address_space *mapping, struct page *page)
+{
+ if (!PageDirty(page))
+ return 0;
+ if (page->mapping != mapping || mapping->a_ops->launder_page == NULL)
+ return 0;
+ return mapping->a_ops->launder_page(page);
+}
+
/**
* invalidate_inode_pages2_range - remove range of pages from an address_space
* @mapping: the address_space
@@ -348,7 +379,6 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
for (i = 0; !ret && i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
pgoff_t page_index;
- int was_dirty;
lock_page(page);
if (page->mapping != mapping) {
@@ -384,12 +414,9 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
PAGE_CACHE_SIZE, 0);
}
}
- was_dirty = test_clear_page_dirty(page);
- if (!invalidate_complete_page2(mapping, page)) {
- if (was_dirty)
- set_page_dirty(page);
+ ret = do_launder_page(mapping, page);
+ if (ret == 0 && !invalidate_complete_page2(mapping, page))
ret = -EIO;
- }
unlock_page(page);
}
pagevec_release(&pvec);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 518540a4a2a..7430df68cb6 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -36,6 +36,7 @@
#include <linux/rwsem.h>
#include <linux/delay.h>
#include <linux/kthread.h>
+#include <linux/freezer.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -691,7 +692,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
__count_vm_events(KSWAPD_STEAL, nr_freed);
} else
__count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan);
- __count_vm_events(PGACTIVATE, nr_freed);
+ __count_zone_vm_events(PGSTEAL, zone, nr_freed);
if (nr_taken == 0)
goto done;
@@ -983,7 +984,7 @@ static unsigned long shrink_zones(int priority, struct zone **zones,
if (!populated_zone(zone))
continue;
- if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
+ if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
continue;
note_zone_scanning_priority(zone, priority);
@@ -1033,7 +1034,7 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
for (i = 0; zones[i] != NULL; i++) {
struct zone *zone = zones[i];
- if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
+ if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
continue;
lru_pages += zone->nr_active + zone->nr_inactive;
@@ -1088,7 +1089,7 @@ out:
for (i = 0; zones[i] != 0; i++) {
struct zone *zone = zones[i];
- if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
+ if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
continue;
zone->prev_priority = priority;
@@ -1172,11 +1173,12 @@ loop_again:
if (!zone_watermark_ok(zone, order, zone->pages_high,
0, 0)) {
end_zone = i;
- goto scan;
+ break;
}
}
- goto out;
-scan:
+ if (i < 0)
+ goto out;
+
for (i = 0; i <= end_zone; i++) {
struct zone *zone = pgdat->node_zones + i;
@@ -1259,6 +1261,9 @@ out:
}
if (!all_zones_ok) {
cond_resched();
+
+ try_to_freeze();
+
goto loop_again;
}
@@ -1349,7 +1354,7 @@ void wakeup_kswapd(struct zone *zone, int order)
return;
if (pgdat->kswapd_max_order < order)
pgdat->kswapd_max_order = order;
- if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
+ if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
return;
if (!waitqueue_active(&pgdat->kswapd_wait))
return;
@@ -1364,8 +1369,8 @@ void wakeup_kswapd(struct zone *zone, int order)
*
* For pass > 3 we also try to shrink the LRU lists that contain a few pages
*/
-static unsigned long shrink_all_zones(unsigned long nr_pages, int pass,
- int prio, struct scan_control *sc)
+static unsigned long shrink_all_zones(unsigned long nr_pages, int prio,
+ int pass, struct scan_control *sc)
{
struct zone *zone;
unsigned long nr_to_scan, ret = 0;
@@ -1401,6 +1406,16 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int pass,
return ret;
}
+static unsigned long count_lru_pages(void)
+{
+ struct zone *zone;
+ unsigned long ret = 0;
+
+ for_each_zone(zone)
+ ret += zone->nr_active + zone->nr_inactive;
+ return ret;
+}
+
/*
* Try to free `nr_pages' of memory, system-wide, and return the number of
* freed pages.
@@ -1415,7 +1430,6 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
unsigned long ret = 0;
int pass;
struct reclaim_state reclaim_state;
- struct zone *zone;
struct scan_control sc = {
.gfp_mask = GFP_KERNEL,
.may_swap = 0,
@@ -1426,10 +1440,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
current->reclaim_state = &reclaim_state;
- lru_pages = 0;
- for_each_zone(zone)
- lru_pages += zone->nr_active + zone->nr_inactive;
-
+ lru_pages = count_lru_pages();
nr_slab = global_page_state(NR_SLAB_RECLAIMABLE);
/* If slab caches are huge, it's better to hit them first */
while (nr_slab >= lru_pages) {
@@ -1456,13 +1467,6 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
for (pass = 0; pass < 5; pass++) {
int prio;
- /* Needed for shrinking slab caches later on */
- if (!lru_pages)
- for_each_zone(zone) {
- lru_pages += zone->nr_active;
- lru_pages += zone->nr_inactive;
- }
-
/* Force reclaiming mapped pages in the passes #3 and #4 */
if (pass > 2) {
sc.may_swap = 1;
@@ -1478,7 +1482,8 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
goto out;
reclaim_state.reclaimed_slab = 0;
- shrink_slab(sc.nr_scanned, sc.gfp_mask, lru_pages);
+ shrink_slab(sc.nr_scanned, sc.gfp_mask,
+ count_lru_pages());
ret += reclaim_state.reclaimed_slab;
if (ret >= nr_pages)
goto out;
@@ -1486,20 +1491,19 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
if (sc.nr_scanned && prio < DEF_PRIORITY - 2)
congestion_wait(WRITE, HZ / 10);
}
-
- lru_pages = 0;
}
/*
* If ret = 0, we could not shrink LRUs, but there may be something
* in slab caches
*/
- if (!ret)
+ if (!ret) {
do {
reclaim_state.reclaimed_slab = 0;
- shrink_slab(nr_pages, sc.gfp_mask, lru_pages);
+ shrink_slab(nr_pages, sc.gfp_mask, count_lru_pages());
ret += reclaim_state.reclaimed_slab;
} while (ret < nr_pages && reclaim_state.reclaimed_slab > 0);
+ }
out:
current->reclaim_state = NULL;
@@ -1508,7 +1512,6 @@ out:
}
#endif
-#ifdef CONFIG_HOTPLUG_CPU
/* It's optimal to keep kswapds on the same CPUs as their memory, but
not required for correctness. So if the last cpu in a node goes
away, we get changed to run anywhere: as the first one comes back,
@@ -1529,7 +1532,6 @@ static int __devinit cpu_callback(struct notifier_block *nfb,
}
return NOTIFY_OK;
}
-#endif /* CONFIG_HOTPLUG_CPU */
/*
* This kswapd start function will be called by init and node-hot-add.
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 8614e8f6743..dc005a0c96a 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -430,7 +430,7 @@ static int frag_show(struct seq_file *m, void *arg)
return 0;
}
-struct seq_operations fragmentation_op = {
+const struct seq_operations fragmentation_op = {
.start = frag_start,
.next = frag_next,
.stop = frag_stop,
@@ -452,7 +452,7 @@ struct seq_operations fragmentation_op = {
#define TEXTS_FOR_ZONES(xx) xx "_dma", TEXT_FOR_DMA32(xx) xx "_normal", \
TEXT_FOR_HIGHMEM(xx)
-static char *vmstat_text[] = {
+static const char * const vmstat_text[] = {
/* Zoned VM counters */
"nr_anon_pages",
"nr_mapped",
@@ -597,7 +597,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
return 0;
}
-struct seq_operations zoneinfo_op = {
+const struct seq_operations zoneinfo_op = {
.start = frag_start, /* iterate over all zones. The same as in
* fragmentation. */
.next = frag_next,
@@ -660,7 +660,7 @@ static void vmstat_stop(struct seq_file *m, void *arg)
m->private = NULL;
}
-struct seq_operations vmstat_op = {
+const struct seq_operations vmstat_op = {
.start = vmstat_start,
.next = vmstat_next,
.stop = vmstat_stop,
@@ -679,13 +679,13 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
void *hcpu)
{
switch (action) {
- case CPU_UP_PREPARE:
- case CPU_UP_CANCELED:
- case CPU_DEAD:
- refresh_zone_stat_thresholds();
- break;
- default:
- break;
+ case CPU_UP_PREPARE:
+ case CPU_UP_CANCELED:
+ case CPU_DEAD:
+ refresh_zone_stat_thresholds();
+ break;
+ default:
+ break;
}
return NOTIFY_OK;
}