summaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/fadvise.c10
-rw-r--r--mm/failslab.c18
-rw-r--r--mm/filemap.c2
-rw-r--r--mm/filemap_xip.c2
-rw-r--r--mm/fremap.c2
-rw-r--r--mm/ksm.c12
-rw-r--r--mm/memcontrol.c2
-rw-r--r--mm/memory-failure.c5
-rw-r--r--mm/memory.c161
-rw-r--r--mm/memory_hotplug.c4
-rw-r--r--mm/mempolicy.c112
-rw-r--r--mm/migrate.c4
-rw-r--r--mm/mlock.c12
-rw-r--r--mm/mmap.c151
-rw-r--r--mm/mremap.c9
-rw-r--r--mm/nommu.c6
-rw-r--r--mm/oom_kill.c4
-rw-r--r--mm/page_alloc.c53
-rw-r--r--mm/readahead.c6
-rw-r--r--mm/rmap.c185
-rw-r--r--mm/slab.c13
-rw-r--r--mm/slub.c337
-rw-r--r--mm/swap.c2
-rw-r--r--mm/swapfile.c40
-rw-r--r--mm/vmscan.c177
-rw-r--r--mm/vmstat.c2
26 files changed, 795 insertions, 536 deletions
diff --git a/mm/fadvise.c b/mm/fadvise.c
index e43359214f6..8d723c9e8b7 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -77,12 +77,20 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
switch (advice) {
case POSIX_FADV_NORMAL:
file->f_ra.ra_pages = bdi->ra_pages;
+ spin_lock(&file->f_lock);
+ file->f_mode &= ~FMODE_RANDOM;
+ spin_unlock(&file->f_lock);
break;
case POSIX_FADV_RANDOM:
- file->f_ra.ra_pages = 0;
+ spin_lock(&file->f_lock);
+ file->f_mode |= FMODE_RANDOM;
+ spin_unlock(&file->f_lock);
break;
case POSIX_FADV_SEQUENTIAL:
file->f_ra.ra_pages = bdi->ra_pages * 2;
+ spin_lock(&file->f_lock);
+ file->f_mode &= ~FMODE_RANDOM;
+ spin_unlock(&file->f_lock);
break;
case POSIX_FADV_WILLNEED:
if (!mapping->a_ops->readpage) {
diff --git a/mm/failslab.c b/mm/failslab.c
index 9339de5f0a9..bb41f98dd8b 100644
--- a/mm/failslab.c
+++ b/mm/failslab.c
@@ -1,18 +1,22 @@
#include <linux/fault-inject.h>
#include <linux/gfp.h>
+#include <linux/slab.h>
static struct {
struct fault_attr attr;
u32 ignore_gfp_wait;
+ int cache_filter;
#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
struct dentry *ignore_gfp_wait_file;
+ struct dentry *cache_filter_file;
#endif
} failslab = {
.attr = FAULT_ATTR_INITIALIZER,
.ignore_gfp_wait = 1,
+ .cache_filter = 0,
};
-bool should_failslab(size_t size, gfp_t gfpflags)
+bool should_failslab(size_t size, gfp_t gfpflags, unsigned long cache_flags)
{
if (gfpflags & __GFP_NOFAIL)
return false;
@@ -20,6 +24,9 @@ bool should_failslab(size_t size, gfp_t gfpflags)
if (failslab.ignore_gfp_wait && (gfpflags & __GFP_WAIT))
return false;
+ if (failslab.cache_filter && !(cache_flags & SLAB_FAILSLAB))
+ return false;
+
return should_fail(&failslab.attr, size);
}
@@ -30,7 +37,6 @@ static int __init setup_failslab(char *str)
__setup("failslab=", setup_failslab);
#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
-
static int __init failslab_debugfs_init(void)
{
mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
@@ -46,8 +52,14 @@ static int __init failslab_debugfs_init(void)
debugfs_create_bool("ignore-gfp-wait", mode, dir,
&failslab.ignore_gfp_wait);
- if (!failslab.ignore_gfp_wait_file) {
+ failslab.cache_filter_file =
+ debugfs_create_bool("cache-filter", mode, dir,
+ &failslab.cache_filter);
+
+ if (!failslab.ignore_gfp_wait_file ||
+ !failslab.cache_filter_file) {
err = -ENOMEM;
+ debugfs_remove(failslab.cache_filter_file);
debugfs_remove(failslab.ignore_gfp_wait_file);
cleanup_fault_attr_dentries(&failslab.attr);
}
diff --git a/mm/filemap.c b/mm/filemap.c
index 148b52a5bb7..045b31c3765 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1986,7 +1986,7 @@ EXPORT_SYMBOL(iov_iter_single_seg_count);
inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk)
{
struct inode *inode = file->f_mapping->host;
- unsigned long limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
+ unsigned long limit = rlimit(RLIMIT_FSIZE);
if (unlikely(*pos < 0))
return -EINVAL;
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 1888b2d71bb..78b94f0b6d5 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -194,7 +194,7 @@ retry:
flush_cache_page(vma, address, pte_pfn(*pte));
pteval = ptep_clear_flush_notify(vma, address, pte);
page_remove_rmap(page);
- dec_mm_counter(mm, file_rss);
+ dec_mm_counter(mm, MM_FILEPAGES);
BUG_ON(pte_dirty(pteval));
pte_unmap_unlock(pte, ptl);
page_cache_release(page);
diff --git a/mm/fremap.c b/mm/fremap.c
index b6ec85abbb3..46f5dacf90a 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -40,7 +40,7 @@ static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
page_remove_rmap(page);
page_cache_release(page);
update_hiwater_rss(mm);
- dec_mm_counter(mm, file_rss);
+ dec_mm_counter(mm, MM_FILEPAGES);
}
} else {
if (!pte_file(pte))
diff --git a/mm/ksm.c b/mm/ksm.c
index 56a0da1f997..a93f1b7f508 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1563,10 +1563,12 @@ int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg,
again:
hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
struct anon_vma *anon_vma = rmap_item->anon_vma;
+ struct anon_vma_chain *vmac;
struct vm_area_struct *vma;
spin_lock(&anon_vma->lock);
- list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
+ list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
+ vma = vmac->vma;
if (rmap_item->address < vma->vm_start ||
rmap_item->address >= vma->vm_end)
continue;
@@ -1614,10 +1616,12 @@ int try_to_unmap_ksm(struct page *page, enum ttu_flags flags)
again:
hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
struct anon_vma *anon_vma = rmap_item->anon_vma;
+ struct anon_vma_chain *vmac;
struct vm_area_struct *vma;
spin_lock(&anon_vma->lock);
- list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
+ list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
+ vma = vmac->vma;
if (rmap_item->address < vma->vm_start ||
rmap_item->address >= vma->vm_end)
continue;
@@ -1664,10 +1668,12 @@ int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *,
again:
hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
struct anon_vma *anon_vma = rmap_item->anon_vma;
+ struct anon_vma_chain *vmac;
struct vm_area_struct *vma;
spin_lock(&anon_vma->lock);
- list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
+ list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
+ vma = vmac->vma;
if (rmap_item->address < vma->vm_start ||
rmap_item->address >= vma->vm_end)
continue;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 954032b80be..d813823ab08 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2545,7 +2545,7 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
pc = list_entry(list->prev, struct page_cgroup, lru);
if (busy == pc) {
list_move(&pc->lru, list);
- busy = 0;
+ busy = NULL;
spin_unlock_irqrestore(&zone->lru_lock, flags);
continue;
}
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 17299fd4577..d1f33516297 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -383,9 +383,12 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
if (av == NULL) /* Not actually mapped anymore */
goto out;
for_each_process (tsk) {
+ struct anon_vma_chain *vmac;
+
if (!task_early_kill(tsk))
continue;
- list_for_each_entry (vma, &av->head, anon_vma_node) {
+ list_for_each_entry(vmac, &av->head, same_anon_vma) {
+ vma = vmac->vma;
if (!page_mapped_in_vma(page, vma))
continue;
if (vma->vm_mm == tsk->mm)
diff --git a/mm/memory.c b/mm/memory.c
index 72fb5f39bcc..d1153e37e9b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -121,6 +121,80 @@ static int __init init_zero_pfn(void)
}
core_initcall(init_zero_pfn);
+
+#if defined(SPLIT_RSS_COUNTING)
+
+void __sync_task_rss_stat(struct task_struct *task, struct mm_struct *mm)
+{
+ int i;
+
+ for (i = 0; i < NR_MM_COUNTERS; i++) {
+ if (task->rss_stat.count[i]) {
+ add_mm_counter(mm, i, task->rss_stat.count[i]);
+ task->rss_stat.count[i] = 0;
+ }
+ }
+ task->rss_stat.events = 0;
+}
+
+static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
+{
+ struct task_struct *task = current;
+
+ if (likely(task->mm == mm))
+ task->rss_stat.count[member] += val;
+ else
+ add_mm_counter(mm, member, val);
+}
+#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
+#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)
+
+/* sync counter once per 64 page faults */
+#define TASK_RSS_EVENTS_THRESH (64)
+static void check_sync_rss_stat(struct task_struct *task)
+{
+ if (unlikely(task != current))
+ return;
+ if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
+ __sync_task_rss_stat(task, task->mm);
+}
+
+unsigned long get_mm_counter(struct mm_struct *mm, int member)
+{
+ long val = 0;
+
+ /*
+ * Don't use task->mm here...for avoiding to use task_get_mm()..
+ * The caller must guarantee task->mm is not invalid.
+ */
+ val = atomic_long_read(&mm->rss_stat.count[member]);
+ /*
+ * counter is updated in asynchronous manner and may go to minus.
+ * But it's never be expected number for users.
+ */
+ if (val < 0)
+ return 0;
+ return (unsigned long)val;
+}
+
+void sync_mm_rss(struct task_struct *task, struct mm_struct *mm)
+{
+ __sync_task_rss_stat(task, mm);
+}
+#else
+
+#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
+#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)
+
+static void check_sync_rss_stat(struct task_struct *task)
+{
+}
+
+void sync_mm_rss(struct task_struct *task, struct mm_struct *mm)
+{
+}
+#endif
+
/*
* If a p?d_bad entry is found while walking page tables, report
* the error, before resetting entry to p?d_none. Usually (but
@@ -300,7 +374,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
* Hide vma from rmap and truncate_pagecache before freeing
* pgtables
*/
- anon_vma_unlink(vma);
+ unlink_anon_vmas(vma);
unlink_file_vma(vma);
if (is_vm_hugetlb_page(vma)) {
@@ -314,7 +388,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
&& !is_vm_hugetlb_page(next)) {
vma = next;
next = vma->vm_next;
- anon_vma_unlink(vma);
+ unlink_anon_vmas(vma);
unlink_file_vma(vma);
}
free_pgd_range(tlb, addr, vma->vm_end,
@@ -376,12 +450,20 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
return 0;
}
-static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
+static inline void init_rss_vec(int *rss)
{
- if (file_rss)
- add_mm_counter(mm, file_rss, file_rss);
- if (anon_rss)
- add_mm_counter(mm, anon_rss, anon_rss);
+ memset(rss, 0, sizeof(int) * NR_MM_COUNTERS);
+}
+
+static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
+{
+ int i;
+
+ if (current->mm == mm)
+ sync_mm_rss(current, mm);
+ for (i = 0; i < NR_MM_COUNTERS; i++)
+ if (rss[i])
+ add_mm_counter(mm, i, rss[i]);
}
/*
@@ -597,7 +679,9 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
&src_mm->mmlist);
spin_unlock(&mmlist_lock);
}
- if (is_write_migration_entry(entry) &&
+ if (likely(!non_swap_entry(entry)))
+ rss[MM_SWAPENTS]++;
+ else if (is_write_migration_entry(entry) &&
is_cow_mapping(vm_flags)) {
/*
* COW mappings require pages in both parent
@@ -632,7 +716,10 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
if (page) {
get_page(page);
page_dup_rmap(page);
- rss[PageAnon(page)]++;
+ if (PageAnon(page))
+ rss[MM_ANONPAGES]++;
+ else
+ rss[MM_FILEPAGES]++;
}
out_set_pte:
@@ -648,11 +735,12 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pte_t *src_pte, *dst_pte;
spinlock_t *src_ptl, *dst_ptl;
int progress = 0;
- int rss[2];
+ int rss[NR_MM_COUNTERS];
swp_entry_t entry = (swp_entry_t){0};
again:
- rss[1] = rss[0] = 0;
+ init_rss_vec(rss);
+
dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
if (!dst_pte)
return -ENOMEM;
@@ -688,7 +776,7 @@ again:
arch_leave_lazy_mmu_mode();
spin_unlock(src_ptl);
pte_unmap_nested(orig_src_pte);
- add_mm_rss(dst_mm, rss[0], rss[1]);
+ add_mm_rss_vec(dst_mm, rss);
pte_unmap_unlock(orig_dst_pte, dst_ptl);
cond_resched();
@@ -816,8 +904,9 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
struct mm_struct *mm = tlb->mm;
pte_t *pte;
spinlock_t *ptl;
- int file_rss = 0;
- int anon_rss = 0;
+ int rss[NR_MM_COUNTERS];
+
+ init_rss_vec(rss);
pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
arch_enter_lazy_mmu_mode();
@@ -863,14 +952,14 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
set_pte_at(mm, addr, pte,
pgoff_to_pte(page->index));
if (PageAnon(page))
- anon_rss--;
+ rss[MM_ANONPAGES]--;
else {
if (pte_dirty(ptent))
set_page_dirty(page);
if (pte_young(ptent) &&
likely(!VM_SequentialReadHint(vma)))
mark_page_accessed(page);
- file_rss--;
+ rss[MM_FILEPAGES]--;
}
page_remove_rmap(page);
if (unlikely(page_mapcount(page) < 0))
@@ -887,13 +976,18 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
if (pte_file(ptent)) {
if (unlikely(!(vma->vm_flags & VM_NONLINEAR)))
print_bad_pte(vma, addr, ptent, NULL);
- } else if
- (unlikely(!free_swap_and_cache(pte_to_swp_entry(ptent))))
- print_bad_pte(vma, addr, ptent, NULL);
+ } else {
+ swp_entry_t entry = pte_to_swp_entry(ptent);
+
+ if (!non_swap_entry(entry))
+ rss[MM_SWAPENTS]--;
+ if (unlikely(!free_swap_and_cache(entry)))
+ print_bad_pte(vma, addr, ptent, NULL);
+ }
pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
} while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
- add_mm_rss(mm, file_rss, anon_rss);
+ add_mm_rss_vec(mm, rss);
arch_leave_lazy_mmu_mode();
pte_unmap_unlock(pte - 1, ptl);
@@ -1527,7 +1621,7 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr,
/* Ok, finally just insert the thing.. */
get_page(page);
- inc_mm_counter(mm, file_rss);
+ inc_mm_counter_fast(mm, MM_FILEPAGES);
page_add_file_rmap(page);
set_pte_at(mm, addr, pte, mk_pte(page, prot));
@@ -2044,6 +2138,13 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
page_cache_release(old_page);
}
reuse = reuse_swap_page(old_page);
+ if (reuse)
+ /*
+ * The page is all ours. Move it to our anon_vma so
+ * the rmap code will not search our parent or siblings.
+ * Protected against the rmap code by the page lock.
+ */
+ page_move_anon_rmap(old_page, vma, address);
unlock_page(old_page);
} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
(VM_WRITE|VM_SHARED))) {
@@ -2163,11 +2264,11 @@ gotten:
if (likely(pte_same(*page_table, orig_pte))) {
if (old_page) {
if (!PageAnon(old_page)) {
- dec_mm_counter(mm, file_rss);
- inc_mm_counter(mm, anon_rss);
+ dec_mm_counter_fast(mm, MM_FILEPAGES);
+ inc_mm_counter_fast(mm, MM_ANONPAGES);
}
} else
- inc_mm_counter(mm, anon_rss);
+ inc_mm_counter_fast(mm, MM_ANONPAGES);
flush_cache_page(vma, address, pte_pfn(orig_pte));
entry = mk_pte(new_page, vma->vm_page_prot);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -2604,7 +2705,8 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
* discarded at swap_free().
*/
- inc_mm_counter(mm, anon_rss);
+ inc_mm_counter_fast(mm, MM_ANONPAGES);
+ dec_mm_counter_fast(mm, MM_SWAPENTS);
pte = mk_pte(page, vma->vm_page_prot);
if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
@@ -2688,7 +2790,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
if (!pte_none(*page_table))
goto release;
- inc_mm_counter(mm, anon_rss);
+ inc_mm_counter_fast(mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, address);
setpte:
set_pte_at(mm, address, page_table, entry);
@@ -2842,10 +2944,10 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (flags & FAULT_FLAG_WRITE)
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
if (anon) {
- inc_mm_counter(mm, anon_rss);
+ inc_mm_counter_fast(mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, address);
} else {
- inc_mm_counter(mm, file_rss);
+ inc_mm_counter_fast(mm, MM_FILEPAGES);
page_add_file_rmap(page);
if (flags & FAULT_FLAG_WRITE) {
dirty_page = page;
@@ -3023,6 +3125,9 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
count_vm_event(PGFAULT);
+ /* do counter updates before entering really critical section. */
+ check_sync_rss_stat(current);
+
if (unlikely(is_vm_hugetlb_page(vma)))
return hugetlb_fault(mm, vma, address, flags);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 030ce8a5bb0..78e34e63c7b 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -28,6 +28,7 @@
#include <linux/pfn.h>
#include <linux/suspend.h>
#include <linux/mm_inline.h>
+#include <linux/firmware-map.h>
#include <asm/tlbflush.h>
@@ -523,6 +524,9 @@ int __ref add_memory(int nid, u64 start, u64 size)
BUG_ON(ret);
}
+ /* create new memmap entry */
+ firmware_map_add_hotplug(start, start + size, "System RAM");
+
goto out;
error:
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 290fb5bf044..bda230e52ac 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -563,24 +563,50 @@ static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
}
/* Step 2: apply policy to a range and do splits. */
-static int mbind_range(struct vm_area_struct *vma, unsigned long start,
- unsigned long end, struct mempolicy *new)
+static int mbind_range(struct mm_struct *mm, unsigned long start,
+ unsigned long end, struct mempolicy *new_pol)
{
struct vm_area_struct *next;
- int err;
+ struct vm_area_struct *prev;
+ struct vm_area_struct *vma;
+ int err = 0;
+ pgoff_t pgoff;
+ unsigned long vmstart;
+ unsigned long vmend;
- err = 0;
- for (; vma && vma->vm_start < end; vma = next) {
+ vma = find_vma_prev(mm, start, &prev);
+ if (!vma || vma->vm_start > start)
+ return -EFAULT;
+
+ for (; vma && vma->vm_start < end; prev = vma, vma = next) {
next = vma->vm_next;
- if (vma->vm_start < start)
- err = split_vma(vma->vm_mm, vma, start, 1);
- if (!err && vma->vm_end > end)
- err = split_vma(vma->vm_mm, vma, end, 0);
- if (!err)
- err = policy_vma(vma, new);
+ vmstart = max(start, vma->vm_start);
+ vmend = min(end, vma->vm_end);
+
+ pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
+ prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
+ vma->anon_vma, vma->vm_file, pgoff, new_pol);
+ if (prev) {
+ vma = prev;
+ next = vma->vm_next;
+ continue;
+ }
+ if (vma->vm_start != vmstart) {
+ err = split_vma(vma->vm_mm, vma, vmstart, 1);
+ if (err)
+ goto out;
+ }
+ if (vma->vm_end != vmend) {
+ err = split_vma(vma->vm_mm, vma, vmend, 0);
+ if (err)
+ goto out;
+ }
+ err = policy_vma(vma, new_pol);
if (err)
- break;
+ goto out;
}
+
+ out:
return err;
}
@@ -862,36 +888,36 @@ int do_migrate_pages(struct mm_struct *mm,
if (err)
goto out;
-/*
- * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
- * bit in 'to' is not also set in 'tmp'. Clear the found 'source'
- * bit in 'tmp', and return that <source, dest> pair for migration.
- * The pair of nodemasks 'to' and 'from' define the map.
- *
- * If no pair of bits is found that way, fallback to picking some
- * pair of 'source' and 'dest' bits that are not the same. If the
- * 'source' and 'dest' bits are the same, this represents a node
- * that will be migrating to itself, so no pages need move.
- *
- * If no bits are left in 'tmp', or if all remaining bits left
- * in 'tmp' correspond to the same bit in 'to', return false
- * (nothing left to migrate).
- *
- * This lets us pick a pair of nodes to migrate between, such that
- * if possible the dest node is not already occupied by some other
- * source node, minimizing the risk of overloading the memory on a
- * node that would happen if we migrated incoming memory to a node
- * before migrating outgoing memory source that same node.
- *
- * A single scan of tmp is sufficient. As we go, we remember the
- * most recent <s, d> pair that moved (s != d). If we find a pair
- * that not only moved, but what's better, moved to an empty slot
- * (d is not set in tmp), then we break out then, with that pair.
- * Otherwise when we finish scannng from_tmp, we at least have the
- * most recent <s, d> pair that moved. If we get all the way through
- * the scan of tmp without finding any node that moved, much less
- * moved to an empty node, then there is nothing left worth migrating.
- */
+ /*
+ * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
+ * bit in 'to' is not also set in 'tmp'. Clear the found 'source'
+ * bit in 'tmp', and return that <source, dest> pair for migration.
+ * The pair of nodemasks 'to' and 'from' define the map.
+ *
+ * If no pair of bits is found that way, fallback to picking some
+ * pair of 'source' and 'dest' bits that are not the same. If the
+ * 'source' and 'dest' bits are the same, this represents a node
+ * that will be migrating to itself, so no pages need move.
+ *
+ * If no bits are left in 'tmp', or if all remaining bits left
+ * in 'tmp' correspond to the same bit in 'to', return false
+ * (nothing left to migrate).
+ *
+ * This lets us pick a pair of nodes to migrate between, such that
+ * if possible the dest node is not already occupied by some other
+ * source node, minimizing the risk of overloading the memory on a
+ * node that would happen if we migrated incoming memory to a node
+ * before migrating outgoing memory source that same node.
+ *
+ * A single scan of tmp is sufficient. As we go, we remember the
+ * most recent <s, d> pair that moved (s != d). If we find a pair
+ * that not only moved, but what's better, moved to an empty slot
+ * (d is not set in tmp), then we break out then, with that pair.
+ * Otherwise when we finish scannng from_tmp, we at least have the
+ * most recent <s, d> pair that moved. If we get all the way through
+ * the scan of tmp without finding any node that moved, much less
+ * moved to an empty node, then there is nothing left worth migrating.
+ */
tmp = *from_nodes;
while (!nodes_empty(tmp)) {
@@ -1047,7 +1073,7 @@ static long do_mbind(unsigned long start, unsigned long len,
if (!IS_ERR(vma)) {
int nr_failed = 0;
- err = mbind_range(vma, start, end, new);
+ err = mbind_range(mm, start, end, new);
if (!list_empty(&pagelist))
nr_failed = migrate_pages(&pagelist, new_vma_page,
diff --git a/mm/migrate.c b/mm/migrate.c
index edb6101ed77..88000b89fc9 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -275,8 +275,6 @@ static int migrate_page_move_mapping(struct address_space *mapping,
*/
static void migrate_page_copy(struct page *newpage, struct page *page)
{
- int anon;
-
copy_highpage(newpage, page);
if (PageError(page))
@@ -313,8 +311,6 @@ static void migrate_page_copy(struct page *newpage, struct page *page)
ClearPageSwapCache(page);
ClearPagePrivate(page);
set_page_private(page, 0);
- /* page->mapping contains a flag for PageAnon() */
- anon = PageAnon(page);
page->mapping = NULL;
/*
diff --git a/mm/mlock.c b/mm/mlock.c
index 2b8335a8940..8f4e2dfceec 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -25,7 +25,7 @@ int can_do_mlock(void)
{
if (capable(CAP_IPC_LOCK))
return 1;
- if (current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur != 0)
+ if (rlimit(RLIMIT_MEMLOCK) != 0)
return 1;
return 0;
}
@@ -487,7 +487,7 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
locked = len >> PAGE_SHIFT;
locked += current->mm->locked_vm;
- lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
+ lock_limit = rlimit(RLIMIT_MEMLOCK);
lock_limit >>= PAGE_SHIFT;
/* check against resource limits */
@@ -550,7 +550,7 @@ SYSCALL_DEFINE1(mlockall, int, flags)
down_write(&current->mm->mmap_sem);
- lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
+ lock_limit = rlimit(RLIMIT_MEMLOCK);
lock_limit >>= PAGE_SHIFT;
ret = -ENOMEM;
@@ -584,7 +584,7 @@ int user_shm_lock(size_t size, struct user_struct *user)
int allowed = 0;
locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
- lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
+ lock_limit = rlimit(RLIMIT_MEMLOCK);
if (lock_limit == RLIM_INFINITY)
allowed = 1;
lock_limit >>= PAGE_SHIFT;
@@ -618,12 +618,12 @@ int account_locked_memory(struct mm_struct *mm, struct rlimit *rlim,
down_write(&mm->mmap_sem);
- lim = rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT;
+ lim = ACCESS_ONCE(rlim[RLIMIT_AS].rlim_cur) >> PAGE_SHIFT;
vm = mm->total_vm + pgsz;
if (lim < vm)
goto out;
- lim = rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
+ lim = ACCESS_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur) >> PAGE_SHIFT;
vm = mm->locked_vm + pgsz;
if (lim < vm)
goto out;
diff --git a/mm/mmap.c b/mm/mmap.c
index ee2298936fe..f1b4448626b 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -265,7 +265,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
* segment grow beyond its set limit the in case where the limit is
* not page aligned -Ram Gupta
*/
- rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur;
+ rlim = rlimit(RLIMIT_DATA);
if (rlim < RLIM_INFINITY && (brk - mm->start_brk) +
(mm->end_data - mm->start_data) > rlim)
goto out;
@@ -437,7 +437,6 @@ __vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
{
__vma_link_list(mm, vma, prev, rb_parent);
__vma_link_rb(mm, vma, rb_link, rb_parent);
- __anon_vma_link(vma);
}
static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -499,7 +498,7 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
* are necessary. The "insert" vma (if any) is to be inserted
* before we drop the necessary locks.
*/
-void vma_adjust(struct vm_area_struct *vma, unsigned long start,
+int vma_adjust(struct vm_area_struct *vma, unsigned long start,
unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert)
{
struct mm_struct *mm = vma->vm_mm;
@@ -542,6 +541,26 @@ again: remove_next = 1 + (end > next->vm_end);
}
}
+ /*
+ * When changing only vma->vm_end, we don't really need anon_vma lock.
+ */
+ if (vma->anon_vma && (insert || importer || start != vma->vm_start))
+ anon_vma = vma->anon_vma;
+ if (anon_vma) {
+ /*
+ * Easily overlooked: when mprotect shifts the boundary,
+ * make sure the expanding vma has anon_vma set if the
+ * shrinking vma had, to cover any anon pages imported.
+ */
+ if (importer && !importer->anon_vma) {
+ /* Block reverse map lookups until things are set up. */
+ if (anon_vma_clone(importer, vma)) {
+ return -ENOMEM;
+ }
+ importer->anon_vma = anon_vma;
+ }
+ }
+
if (file) {
mapping = file->f_mapping;
if (!(vma->vm_flags & VM_NONLINEAR))
@@ -567,25 +586,6 @@ again: remove_next = 1 + (end > next->vm_end);
}
}
- /*
- * When changing only vma->vm_end, we don't really need
- * anon_vma lock.
- */
- if (vma->anon_vma && (insert || importer || start != vma->vm_start))
- anon_vma = vma->anon_vma;
- if (anon_vma) {
- spin_lock(&anon_vma->lock);
- /*
- * Easily overlooked: when mprotect shifts the boundary,
- * make sure the expanding vma has anon_vma set if the
- * shrinking vma had, to cover any anon pages imported.
- */
- if (importer && !importer->anon_vma) {
- importer->anon_vma = anon_vma;
- __anon_vma_link(importer);
- }
- }
-
if (root) {
flush_dcache_mmap_lock(mapping);
vma_prio_tree_remove(vma, root);
@@ -616,8 +616,6 @@ again: remove_next = 1 + (end > next->vm_end);
__vma_unlink(mm, next, vma);
if (file)
__remove_shared_vm_struct(next, file, mapping);
- if (next->anon_vma)
- __anon_vma_merge(vma, next);
} else if (insert) {
/*
* split_vma has split insert from vma, and needs
@@ -627,8 +625,6 @@ again: remove_next = 1 + (end > next->vm_end);
__insert_vm_struct(mm, insert);
}
- if (anon_vma)
- spin_unlock(&anon_vma->lock);
if (mapping)
spin_unlock(&mapping->i_mmap_lock);
@@ -638,6 +634,8 @@ again: remove_next = 1 + (end > next->vm_end);
if (next->vm_flags & VM_EXECUTABLE)
removed_exe_file_vma(mm);
}
+ if (next->anon_vma)
+ anon_vma_merge(vma, next);
mm->map_count--;
mpol_put(vma_policy(next));
kmem_cache_free(vm_area_cachep, next);
@@ -653,6 +651,8 @@ again: remove_next = 1 + (end > next->vm_end);
}
validate_mm(mm);
+
+ return 0;
}
/*
@@ -759,6 +759,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
{
pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
struct vm_area_struct *area, *next;
+ int err;
/*
* We later require that vma->vm_flags == vm_flags,
@@ -792,11 +793,13 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
is_mergeable_anon_vma(prev->anon_vma,
next->anon_vma)) {
/* cases 1, 6 */
- vma_adjust(prev, prev->vm_start,
+ err = vma_adjust(prev, prev->vm_start,
next->vm_end, prev->vm_pgoff, NULL);
} else /* cases 2, 5, 7 */
- vma_adjust(prev, prev->vm_start,
+ err = vma_adjust(prev, prev->vm_start,
end, prev->vm_pgoff, NULL);
+ if (err)
+ return NULL;
return prev;
}
@@ -808,11 +811,13 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
can_vma_merge_before(next, vm_flags,
anon_vma, file, pgoff+pglen)) {
if (prev && addr < prev->vm_end) /* case 4 */
- vma_adjust(prev, prev->vm_start,
+ err = vma_adjust(prev, prev->vm_start,
addr, prev->vm_pgoff, NULL);
else /* cases 3, 8 */
- vma_adjust(area, addr, next->vm_end,
+ err = vma_adjust(area, addr, next->vm_end,
next->vm_pgoff - pglen, NULL);
+ if (err)
+ return NULL;
return area;
}
@@ -967,7 +972,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
unsigned long locked, lock_limit;
locked = len >> PAGE_SHIFT;
locked += mm->locked_vm;
- lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
+ lock_limit = rlimit(RLIMIT_MEMLOCK);
lock_limit >>= PAGE_SHIFT;
if (locked > lock_limit && !capable(CAP_IPC_LOCK))
return -EAGAIN;
@@ -1205,6 +1210,7 @@ munmap_back:
vma->vm_flags = vm_flags;
vma->vm_page_prot = vm_get_page_prot(vm_flags);
vma->vm_pgoff = pgoff;
+ INIT_LIST_HEAD(&vma->anon_vma_chain);
if (file) {
error = -EINVAL;
@@ -1265,13 +1271,8 @@ out:
mm->total_vm += len >> PAGE_SHIFT;
vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
if (vm_flags & VM_LOCKED) {
- /*
- * makes pages present; downgrades, drops, reacquires mmap_sem
- */
- long nr_pages = mlock_vma_pages_range(vma, addr, addr + len);
- if (nr_pages < 0)
- return nr_pages; /* vma gone! */
- mm->locked_vm += (len >> PAGE_SHIFT) - nr_pages;
+ if (!mlock_vma_pages_range(vma, addr, addr + len))
+ mm->locked_vm += (len >> PAGE_SHIFT);
} else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK))
make_pages_present(addr, addr + len);
return addr;
@@ -1599,7 +1600,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
return -ENOMEM;
/* Stack limit test */
- if (size > rlim[RLIMIT_STACK].rlim_cur)
+ if (size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur))
return -ENOMEM;
/* mlock limit tests */
@@ -1607,7 +1608,8 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
unsigned long locked;
unsigned long limit;
locked = mm->locked_vm + grow;
- limit = rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
+ limit = ACCESS_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur);
+ limit >>= PAGE_SHIFT;
if (locked > limit && !capable(CAP_IPC_LOCK))
return -ENOMEM;
}
@@ -1754,8 +1756,7 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
if (!prev || expand_stack(prev, addr))
return NULL;
if (prev->vm_flags & VM_LOCKED) {
- if (mlock_vma_pages_range(prev, addr, prev->vm_end) < 0)
- return NULL; /* vma gone! */
+ mlock_vma_pages_range(prev, addr, prev->vm_end);
}
return prev;
}
@@ -1783,8 +1784,7 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr)
if (expand_stack(vma, addr))
return NULL;
if (vma->vm_flags & VM_LOCKED) {
- if (mlock_vma_pages_range(vma, addr, start) < 0)
- return NULL; /* vma gone! */
+ mlock_vma_pages_range(vma, addr, start);
}
return vma;
}
@@ -1871,6 +1871,7 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
{
struct mempolicy *pol;
struct vm_area_struct *new;
+ int err = -ENOMEM;
if (is_vm_hugetlb_page(vma) && (addr &
~(huge_page_mask(hstate_vma(vma)))))
@@ -1878,11 +1879,13 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
if (!new)
- return -ENOMEM;
+ goto out_err;
/* most fields are the same, copy all, and then fixup */
*new = *vma;
+ INIT_LIST_HEAD(&new->anon_vma_chain);
+
if (new_below)
new->vm_end = addr;
else {
@@ -1892,11 +1895,14 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
pol = mpol_dup(vma_policy(vma));
if (IS_ERR(pol)) {
- kmem_cache_free(vm_area_cachep, new);
- return PTR_ERR(pol);
+ err = PTR_ERR(pol);
+ goto out_free_vma;
}
vma_set_policy(new, pol);
+ if (anon_vma_clone(new, vma))
+ goto out_free_mpol;
+
if (new->vm_file) {
get_file(new->vm_file);
if (vma->vm_flags & VM_EXECUTABLE)
@@ -1907,12 +1913,28 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
new->vm_ops->open(new);
if (new_below)
- vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff +
+ err = vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff +
((addr - new->vm_start) >> PAGE_SHIFT), new);
else
- vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new);
+ err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new);
- return 0;
+ /* Success. */
+ if (!err)
+ return 0;
+
+ /* Clean everything up if vma_adjust failed. */
+ new->vm_ops->close(new);
+ if (new->vm_file) {
+ if (vma->vm_flags & VM_EXECUTABLE)
+ removed_exe_file_vma(mm);
+ fput(new->vm_file);
+ }
+ out_free_mpol:
+ mpol_put(pol);
+ out_free_vma:
+ kmem_cache_free(vm_area_cachep, new);
+ out_err:
+ return err;
}
/*
@@ -2074,7 +2096,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
unsigned long locked, lock_limit;
locked = len >> PAGE_SHIFT;
locked += mm->locked_vm;
- lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
+ lock_limit = rlimit(RLIMIT_MEMLOCK);
lock_limit >>= PAGE_SHIFT;
if (locked > lock_limit && !capable(CAP_IPC_LOCK))
return -EAGAIN;
@@ -2122,6 +2144,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
return -ENOMEM;
}
+ INIT_LIST_HEAD(&vma->anon_vma_chain);
vma->vm_mm = mm;
vma->vm_start = addr;
vma->vm_end = addr + len;
@@ -2258,10 +2281,11 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
if (new_vma) {
*new_vma = *vma;
pol = mpol_dup(vma_policy(vma));
- if (IS_ERR(pol)) {
- kmem_cache_free(vm_area_cachep, new_vma);
- return NULL;
- }
+ if (IS_ERR(pol))
+ goto out_free_vma;
+ INIT_LIST_HEAD(&new_vma->anon_vma_chain);
+ if (anon_vma_clone(new_vma, vma))
+ goto out_free_mempol;
vma_set_policy(new_vma, pol);
new_vma->vm_start = addr;
new_vma->vm_end = addr + len;
@@ -2277,6 +2301,12 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
}
}
return new_vma;
+
+ out_free_mempol:
+ mpol_put(pol);
+ out_free_vma:
+ kmem_cache_free(vm_area_cachep, new_vma);
+ return NULL;
}
/*
@@ -2288,7 +2318,7 @@ int may_expand_vm(struct mm_struct *mm, unsigned long npages)
unsigned long cur = mm->total_vm; /* pages */
unsigned long lim;
- lim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT;
+ lim = rlimit(RLIMIT_AS) >> PAGE_SHIFT;
if (cur + npages > lim)
return 0;
@@ -2354,6 +2384,7 @@ int install_special_mapping(struct mm_struct *mm,
if (unlikely(vma == NULL))
return -ENOMEM;
+ INIT_LIST_HEAD(&vma->anon_vma_chain);
vma->vm_mm = mm;
vma->vm_start = addr;
vma->vm_end = addr + len;
@@ -2454,6 +2485,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
int mm_take_all_locks(struct mm_struct *mm)
{
struct vm_area_struct *vma;
+ struct anon_vma_chain *avc;
int ret = -EINTR;
BUG_ON(down_read_trylock(&mm->mmap_sem));
@@ -2471,7 +2503,8 @@ int mm_take_all_locks(struct mm_struct *mm)
if (signal_pending(current))
goto out_unlock;
if (vma->anon_vma)
- vm_lock_anon_vma(mm, vma->anon_vma);
+ list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
+ vm_lock_anon_vma(mm, avc->anon_vma);
}
ret = 0;
@@ -2526,13 +2559,15 @@ static void vm_unlock_mapping(struct address_space *mapping)
void mm_drop_all_locks(struct mm_struct *mm)
{
struct vm_area_struct *vma;
+ struct anon_vma_chain *avc;
BUG_ON(down_read_trylock(&mm->mmap_sem));
BUG_ON(!mutex_is_locked(&mm_all_locks_mutex));
for (vma = mm->mmap; vma; vma = vma->vm_next) {
if (vma->anon_vma)
- vm_unlock_anon_vma(vma->anon_vma);
+ list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
+ vm_unlock_anon_vma(avc->anon_vma);
if (vma->vm_file && vma->vm_file->f_mapping)
vm_unlock_mapping(vma->vm_file->f_mapping);
}
diff --git a/mm/mremap.c b/mm/mremap.c
index 845190898d5..e9c75efce60 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -285,7 +285,7 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
if (vma->vm_flags & VM_LOCKED) {
unsigned long locked, lock_limit;
locked = mm->locked_vm << PAGE_SHIFT;
- lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
+ lock_limit = rlimit(RLIMIT_MEMLOCK);
locked += new_len - old_len;
if (locked > lock_limit && !capable(CAP_IPC_LOCK))
goto Eagain;
@@ -460,8 +460,11 @@ unsigned long do_mremap(unsigned long addr,
if (vma_expandable(vma, new_len - old_len)) {
int pages = (new_len - old_len) >> PAGE_SHIFT;
- vma_adjust(vma, vma->vm_start,
- addr + new_len, vma->vm_pgoff, NULL);
+ if (vma_adjust(vma, vma->vm_start, addr + new_len,
+ vma->vm_pgoff, NULL)) {
+ ret = -ENOMEM;
+ goto out;
+ }
mm->total_vm += pages;
vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages);
diff --git a/mm/nommu.c b/mm/nommu.c
index 48a2ecfaf05..b9b5cceb1b6 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -146,7 +146,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
(VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
for (i = 0; i < nr_pages; i++) {
- vma = find_vma(mm, start);
+ vma = find_extend_vma(mm, start);
if (!vma)
goto finish_or_fault;
@@ -764,7 +764,7 @@ EXPORT_SYMBOL(find_vma);
*/
struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr)
{
- return find_vma(mm, addr);
+ return find_vma(mm, addr & PAGE_MASK);
}
/*
@@ -1209,7 +1209,7 @@ unsigned long do_mmap_pgoff(struct file *file,
region->vm_flags = vm_flags;
region->vm_pgoff = pgoff;
- INIT_LIST_HEAD(&vma->anon_vma_node);
+ INIT_LIST_HEAD(&vma->anon_vma_chain);
vma->vm_flags = vm_flags;
vma->vm_pgoff = pgoff;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 237050478f2..35755a4156d 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -401,8 +401,8 @@ static void __oom_kill_task(struct task_struct *p, int verbose)
"vsz:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
task_pid_nr(p), p->comm,
K(p->mm->total_vm),
- K(get_mm_counter(p->mm, anon_rss)),
- K(get_mm_counter(p->mm, file_rss)));
+ K(get_mm_counter(p->mm, MM_ANONPAGES)),
+ K(get_mm_counter(p->mm, MM_FILEPAGES)));
task_unlock(p);
/*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a6b17aa4740..a8182c89de5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -76,6 +76,31 @@ unsigned long totalreserve_pages __read_mostly;
int percpu_pagelist_fraction;
gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
+#ifdef CONFIG_PM_SLEEP
+/*
+ * The following functions are used by the suspend/hibernate code to temporarily
+ * change gfp_allowed_mask in order to avoid using I/O during memory allocations
+ * while devices are suspended. To avoid races with the suspend/hibernate code,
+ * they should always be called with pm_mutex held (gfp_allowed_mask also should
+ * only be modified with pm_mutex held, unless the suspend/hibernate code is
+ * guaranteed not to run in parallel with that modification).
+ */
+void set_gfp_allowed_mask(gfp_t mask)
+{
+ WARN_ON(!mutex_is_locked(&pm_mutex));
+ gfp_allowed_mask = mask;
+}
+
+gfp_t clear_gfp_allowed_mask(gfp_t mask)
+{
+ gfp_t ret = gfp_allowed_mask;
+
+ WARN_ON(!mutex_is_locked(&pm_mutex));
+ gfp_allowed_mask &= ~mask;
+ return ret;
+}
+#endif /* CONFIG_PM_SLEEP */
+
#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
int pageblock_order __read_mostly;
#endif
@@ -530,7 +555,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
int batch_free = 0;
spin_lock(&zone->lock);
- zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
+ zone->all_unreclaimable = 0;
zone->pages_scanned = 0;
__mod_zone_page_state(zone, NR_FREE_PAGES, count);
@@ -568,7 +593,7 @@ static void free_one_page(struct zone *zone, struct page *page, int order,
int migratetype)
{
spin_lock(&zone->lock);
- zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
+ zone->all_unreclaimable = 0;
zone->pages_scanned = 0;
__mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
@@ -583,6 +608,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
int bad = 0;
int wasMlocked = __TestClearPageMlocked(page);
+ trace_mm_page_free_direct(page, order);
kmemcheck_free_shadow(page, order);
for (i = 0 ; i < (1 << order) ; ++i)
@@ -1073,8 +1099,9 @@ void mark_free_pages(struct zone *zone)
/*
* Free a 0-order page
+ * cold == 1 ? free a cold page : free a hot page
*/
-static void free_hot_cold_page(struct page *page, int cold)
+void free_hot_cold_page(struct page *page, int cold)
{
struct zone *zone = page_zone(page);
struct per_cpu_pages *pcp;
@@ -1082,6 +1109,7 @@ static void free_hot_cold_page(struct page *page, int cold)
int migratetype;
int wasMlocked = __TestClearPageMlocked(page);
+ trace_mm_page_free_direct(page, 0);
kmemcheck_free_shadow(page, 0);
if (PageAnon(page))
@@ -1133,12 +1161,6 @@ out:
local_irq_restore(flags);
}
-void free_hot_page(struct page *page)
-{
- trace_mm_page_free_direct(page, 0);
- free_hot_cold_page(page, 0);
-}
-
/*
* split_page takes a non-compound higher-order page, and splits it into
* n (1<<order) sub-pages: page[0..n]
@@ -2008,9 +2030,8 @@ void __pagevec_free(struct pagevec *pvec)
void __free_pages(struct page *page, unsigned int order)
{
if (put_page_testzero(page)) {
- trace_mm_page_free_direct(page, order);
if (order == 0)
- free_hot_page(page);
+ free_hot_cold_page(page, 0);
else
__free_pages_ok(page, order);
}
@@ -2266,7 +2287,7 @@ void show_free_areas(void)
K(zone_page_state(zone, NR_BOUNCE)),
K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
zone->pages_scanned,
- (zone_is_all_unreclaimable(zone) ? "yes" : "no")
+ (zone->all_unreclaimable ? "yes" : "no")
);
printk("lowmem_reserve[]:");
for (i = 0; i < MAX_NR_ZONES; i++)
@@ -4371,8 +4392,12 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
for (i = 0; i < MAX_NR_ZONES; i++) {
if (i == ZONE_MOVABLE)
continue;
- printk(" %-8s %0#10lx -> %0#10lx\n",
- zone_names[i],
+ printk(" %-8s ", zone_names[i]);
+ if (arch_zone_lowest_possible_pfn[i] ==
+ arch_zone_highest_possible_pfn[i])
+ printk("empty\n");
+ else
+ printk("%0#10lx -> %0#10lx\n",
arch_zone_lowest_possible_pfn[i],
arch_zone_highest_possible_pfn[i]);
}
diff --git a/mm/readahead.c b/mm/readahead.c
index 033bc135a41..337b20e946f 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -501,6 +501,12 @@ void page_cache_sync_readahead(struct address_space *mapping,
if (!ra->ra_pages)
return;
+ /* be dumb */
+ if (filp->f_mode & FMODE_RANDOM) {
+ force_page_cache_readahead(mapping, filp, offset, req_size);
+ return;
+ }
+
/* do read-ahead */
ondemand_readahead(mapping, ra, filp, false, offset, req_size);
}
diff --git a/mm/rmap.c b/mm/rmap.c
index 278cd277bde..fcd593c9c99 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -62,6 +62,7 @@
#include "internal.h"
static struct kmem_cache *anon_vma_cachep;
+static struct kmem_cache *anon_vma_chain_cachep;
static inline struct anon_vma *anon_vma_alloc(void)
{
@@ -73,6 +74,16 @@ void anon_vma_free(struct anon_vma *anon_vma)
kmem_cache_free(anon_vma_cachep, anon_vma);
}
+static inline struct anon_vma_chain *anon_vma_chain_alloc(void)
+{
+ return kmem_cache_alloc(anon_vma_chain_cachep, GFP_KERNEL);
+}
+
+void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
+{
+ kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain);
+}
+
/**
* anon_vma_prepare - attach an anon_vma to a memory region
* @vma: the memory region in question
@@ -103,18 +114,23 @@ void anon_vma_free(struct anon_vma *anon_vma)
int anon_vma_prepare(struct vm_area_struct *vma)
{
struct anon_vma *anon_vma = vma->anon_vma;
+ struct anon_vma_chain *avc;
might_sleep();
if (unlikely(!anon_vma)) {
struct mm_struct *mm = vma->vm_mm;
struct anon_vma *allocated;
+ avc = anon_vma_chain_alloc();
+ if (!avc)
+ goto out_enomem;
+
anon_vma = find_mergeable_anon_vma(vma);
allocated = NULL;
if (!anon_vma) {
anon_vma = anon_vma_alloc();
if (unlikely(!anon_vma))
- return -ENOMEM;
+ goto out_enomem_free_avc;
allocated = anon_vma;
}
spin_lock(&anon_vma->lock);
@@ -123,53 +139,113 @@ int anon_vma_prepare(struct vm_area_struct *vma)
spin_lock(&mm->page_table_lock);
if (likely(!vma->anon_vma)) {
vma->anon_vma = anon_vma;
- list_add_tail(&vma->anon_vma_node, &anon_vma->head);
+ avc->anon_vma = anon_vma;
+ avc->vma = vma;
+ list_add(&avc->same_vma, &vma->anon_vma_chain);
+ list_add(&avc->same_anon_vma, &anon_vma->head);
allocated = NULL;
}
spin_unlock(&mm->page_table_lock);
spin_unlock(&anon_vma->lock);
- if (unlikely(allocated))
+ if (unlikely(allocated)) {
anon_vma_free(allocated);
+ anon_vma_chain_free(avc);
+ }
}
return 0;
+
+ out_enomem_free_avc:
+ anon_vma_chain_free(avc);
+ out_enomem:
+ return -ENOMEM;
}
-void __anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next)
+static void anon_vma_chain_link(struct vm_area_struct *vma,
+ struct anon_vma_chain *avc,
+ struct anon_vma *anon_vma)
{
- BUG_ON(vma->anon_vma != next->anon_vma);
- list_del(&next->anon_vma_node);
+ avc->vma = vma;
+ avc->anon_vma = anon_vma;
+ list_add(&avc->same_vma, &vma->anon_vma_chain);
+
+ spin_lock(&anon_vma->lock);
+ list_add_tail(&avc->same_anon_vma, &anon_vma->head);
+ spin_unlock(&anon_vma->lock);
}
-void __anon_vma_link(struct vm_area_struct *vma)
+/*
+ * Attach the anon_vmas from src to dst.
+ * Returns 0 on success, -ENOMEM on failure.
+ */
+int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
{
- struct anon_vma *anon_vma = vma->anon_vma;
+ struct anon_vma_chain *avc, *pavc;
- if (anon_vma)
- list_add_tail(&vma->anon_vma_node, &anon_vma->head);
+ list_for_each_entry(pavc, &src->anon_vma_chain, same_vma) {
+ avc = anon_vma_chain_alloc();
+ if (!avc)
+ goto enomem_failure;
+ anon_vma_chain_link(dst, avc, pavc->anon_vma);
+ }
+ return 0;
+
+ enomem_failure:
+ unlink_anon_vmas(dst);
+ return -ENOMEM;
}
-void anon_vma_link(struct vm_area_struct *vma)
+/*
+ * Attach vma to its own anon_vma, as well as to the anon_vmas that
+ * the corresponding VMA in the parent process is attached to.
+ * Returns 0 on success, non-zero on failure.
+ */
+int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
{
- struct anon_vma *anon_vma = vma->anon_vma;
+ struct anon_vma_chain *avc;
+ struct anon_vma *anon_vma;
- if (anon_vma) {
- spin_lock(&anon_vma->lock);
- list_add_tail(&vma->anon_vma_node, &anon_vma->head);
- spin_unlock(&anon_vma->lock);
- }
+ /* Don't bother if the parent process has no anon_vma here. */
+ if (!pvma->anon_vma)
+ return 0;
+
+ /*
+ * First, attach the new VMA to the parent VMA's anon_vmas,
+ * so rmap can find non-COWed pages in child processes.
+ */
+ if (anon_vma_clone(vma, pvma))
+ return -ENOMEM;
+
+ /* Then add our own anon_vma. */
+ anon_vma = anon_vma_alloc();
+ if (!anon_vma)
+ goto out_error;
+ avc = anon_vma_chain_alloc();
+ if (!avc)
+ goto out_error_free_anon_vma;
+ anon_vma_chain_link(vma, avc, anon_vma);
+ /* Mark this anon_vma as the one where our new (COWed) pages go. */
+ vma->anon_vma = anon_vma;
+
+ return 0;
+
+ out_error_free_anon_vma:
+ anon_vma_free(anon_vma);
+ out_error:
+ return -ENOMEM;
}
-void anon_vma_unlink(struct vm_area_struct *vma)
+static void anon_vma_unlink(struct anon_vma_chain *anon_vma_chain)
{
- struct anon_vma *anon_vma = vma->anon_vma;
+ struct anon_vma *anon_vma = anon_vma_chain->anon_vma;
int empty;
+ /* If anon_vma_fork fails, we can get an empty anon_vma_chain. */
if (!anon_vma)
return;
spin_lock(&anon_vma->lock);
- list_del(&vma->anon_vma_node);
+ list_del(&anon_vma_chain->same_anon_vma);
/* We must garbage collect the anon_vma if it's empty */
empty = list_empty(&anon_vma->head) && !ksm_refcount(anon_vma);
@@ -179,6 +255,18 @@ void anon_vma_unlink(struct vm_area_struct *vma)
anon_vma_free(anon_vma);
}
+void unlink_anon_vmas(struct vm_area_struct *vma)
+{
+ struct anon_vma_chain *avc, *next;
+
+ /* Unlink each anon_vma chained to the VMA. */
+ list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
+ anon_vma_unlink(avc);
+ list_del(&avc->same_vma);
+ anon_vma_chain_free(avc);
+ }
+}
+
static void anon_vma_ctor(void *data)
{
struct anon_vma *anon_vma = data;
@@ -192,6 +280,7 @@ void __init anon_vma_init(void)
{
anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor);
+ anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain, SLAB_PANIC);
}
/*
@@ -396,7 +485,7 @@ static int page_referenced_anon(struct page *page,
{
unsigned int mapcount;
struct anon_vma *anon_vma;
- struct vm_area_struct *vma;
+ struct anon_vma_chain *avc;
int referenced = 0;
anon_vma = page_lock_anon_vma(page);
@@ -404,7 +493,8 @@ static int page_referenced_anon(struct page *page,
return referenced;
mapcount = page_mapcount(page);
- list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
+ list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
+ struct vm_area_struct *vma = avc->vma;
unsigned long address = vma_address(page, vma);
if (address == -EFAULT)
continue;
@@ -511,9 +601,6 @@ int page_referenced(struct page *page,
int referenced = 0;
int we_locked = 0;
- if (TestClearPageReferenced(page))
- referenced++;
-
*vm_flags = 0;
if (page_mapped(page) && page_rmapping(page)) {
if (!is_locked && (!PageAnon(page) || PageKsm(page))) {
@@ -614,6 +701,30 @@ int page_mkclean(struct page *page)
EXPORT_SYMBOL_GPL(page_mkclean);
/**
+ * page_move_anon_rmap - move a page to our anon_vma
+ * @page: the page to move to our anon_vma
+ * @vma: the vma the page belongs to
+ * @address: the user virtual address mapped
+ *
+ * When a page belongs exclusively to one process after a COW event,
+ * that page can be moved into the anon_vma that belongs to just that
+ * process, so the rmap code will not search the parent or sibling
+ * processes.
+ */
+void page_move_anon_rmap(struct page *page,
+ struct vm_area_struct *vma, unsigned long address)
+{
+ struct anon_vma *anon_vma = vma->anon_vma;
+
+ VM_BUG_ON(!PageLocked(page));
+ VM_BUG_ON(!anon_vma);
+ VM_BUG_ON(page->index != linear_page_index(vma, address));
+
+ anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
+ page->mapping = (struct address_space *) anon_vma;
+}
+
+/**
* __page_set_anon_rmap - setup new anonymous rmap
* @page: the page to add the mapping to
* @vma: the vm area in which the mapping is added
@@ -652,9 +763,6 @@ static void __page_check_anon_rmap(struct page *page,
* are initially only visible via the pagetables, and the pte is locked
* over the call to page_add_new_anon_rmap.
*/
- struct anon_vma *anon_vma = vma->anon_vma;
- anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
- BUG_ON(page->mapping != (struct address_space *)anon_vma);
BUG_ON(page->index != linear_page_index(vma, address));
#endif
}
@@ -815,9 +923,9 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
if (PageAnon(page))
- dec_mm_counter(mm, anon_rss);
+ dec_mm_counter(mm, MM_ANONPAGES);
else
- dec_mm_counter(mm, file_rss);
+ dec_mm_counter(mm, MM_FILEPAGES);
set_pte_at(mm, address, pte,
swp_entry_to_pte(make_hwpoison_entry(page)));
} else if (PageAnon(page)) {
@@ -839,7 +947,8 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
list_add(&mm->mmlist, &init_mm.mmlist);
spin_unlock(&mmlist_lock);
}
- dec_mm_counter(mm, anon_rss);
+ dec_mm_counter(mm, MM_ANONPAGES);
+ inc_mm_counter(mm, MM_SWAPENTS);
} else if (PAGE_MIGRATION) {
/*
* Store the pfn of the page in a special migration
@@ -857,7 +966,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
entry = make_migration_entry(page, pte_write(pteval));
set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
} else
- dec_mm_counter(mm, file_rss);
+ dec_mm_counter(mm, MM_FILEPAGES);
page_remove_rmap(page);
page_cache_release(page);
@@ -996,7 +1105,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
page_remove_rmap(page);
page_cache_release(page);
- dec_mm_counter(mm, file_rss);
+ dec_mm_counter(mm, MM_FILEPAGES);
(*mapcount)--;
}
pte_unmap_unlock(pte - 1, ptl);
@@ -1024,14 +1133,15 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
{
struct anon_vma *anon_vma;
- struct vm_area_struct *vma;
+ struct anon_vma_chain *avc;
int ret = SWAP_AGAIN;
anon_vma = page_lock_anon_vma(page);
if (!anon_vma)
return ret;
- list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
+ list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
+ struct vm_area_struct *vma = avc->vma;
unsigned long address = vma_address(page, vma);
if (address == -EFAULT)
continue;
@@ -1222,7 +1332,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
struct vm_area_struct *, unsigned long, void *), void *arg)
{
struct anon_vma *anon_vma;
- struct vm_area_struct *vma;
+ struct anon_vma_chain *avc;
int ret = SWAP_AGAIN;
/*
@@ -1237,7 +1347,8 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
if (!anon_vma)
return ret;
spin_lock(&anon_vma->lock);
- list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
+ list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
+ struct vm_area_struct *vma = avc->vma;
unsigned long address = vma_address(page, vma);
if (address == -EFAULT)
continue;
diff --git a/mm/slab.c b/mm/slab.c
index 7451bdacaf1..a9f325b28be 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -935,7 +935,6 @@ static int transfer_objects(struct array_cache *to,
from->avail -= nr;
to->avail += nr;
- to->touched = 1;
return nr;
}
@@ -983,13 +982,11 @@ static struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
if (limit > 1)
limit = 12;
- ac_ptr = kmalloc_node(memsize, gfp, node);
+ ac_ptr = kzalloc_node(memsize, gfp, node);
if (ac_ptr) {
for_each_node(i) {
- if (i == node || !node_online(i)) {
- ac_ptr[i] = NULL;
+ if (i == node || !node_online(i))
continue;
- }
ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d, gfp);
if (!ac_ptr[i]) {
for (i--; i >= 0; i--)
@@ -2963,8 +2960,10 @@ retry:
spin_lock(&l3->list_lock);
/* See if we can refill from the shared array */
- if (l3->shared && transfer_objects(ac, l3->shared, batchcount))
+ if (l3->shared && transfer_objects(ac, l3->shared, batchcount)) {
+ l3->shared->touched = 1;
goto alloc_done;
+ }
while (batchcount > 0) {
struct list_head *entry;
@@ -3101,7 +3100,7 @@ static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags)
if (cachep == &cache_cache)
return false;
- return should_failslab(obj_size(cachep), flags);
+ return should_failslab(obj_size(cachep), flags, cachep->flags);
}
static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
diff --git a/mm/slub.c b/mm/slub.c
index 8d71aaf888d..0bfd3863d52 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -151,7 +151,8 @@
* Set of flags that will prevent slab merging
*/
#define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
- SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE)
+ SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \
+ SLAB_FAILSLAB)
#define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \
SLAB_CACHE_DMA | SLAB_NOTRACK)
@@ -217,10 +218,10 @@ static inline void sysfs_slab_remove(struct kmem_cache *s)
#endif
-static inline void stat(struct kmem_cache_cpu *c, enum stat_item si)
+static inline void stat(struct kmem_cache *s, enum stat_item si)
{
#ifdef CONFIG_SLUB_STATS
- c->stat[si]++;
+ __this_cpu_inc(s->cpu_slab->stat[si]);
#endif
}
@@ -242,15 +243,6 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
#endif
}
-static inline struct kmem_cache_cpu *get_cpu_slab(struct kmem_cache *s, int cpu)
-{
-#ifdef CONFIG_SMP
- return s->cpu_slab[cpu];
-#else
- return &s->cpu_slab;
-#endif
-}
-
/* Verify that a pointer has an address that is valid within a slab page */
static inline int check_valid_pointer(struct kmem_cache *s,
struct page *page, const void *object)
@@ -269,13 +261,6 @@ static inline int check_valid_pointer(struct kmem_cache *s,
return 1;
}
-/*
- * Slow version of get and set free pointer.
- *
- * This version requires touching the cache lines of kmem_cache which
- * we avoid to do in the fast alloc free paths. There we obtain the offset
- * from the page struct.
- */
static inline void *get_freepointer(struct kmem_cache *s, void *object)
{
return *(void **)(object + s->offset);
@@ -1020,6 +1005,9 @@ static int __init setup_slub_debug(char *str)
case 't':
slub_debug |= SLAB_TRACE;
break;
+ case 'a':
+ slub_debug |= SLAB_FAILSLAB;
+ break;
default:
printk(KERN_ERR "slub_debug option '%c' "
"unknown. skipped\n", *str);
@@ -1124,7 +1112,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
if (!page)
return NULL;
- stat(get_cpu_slab(s, raw_smp_processor_id()), ORDER_FALLBACK);
+ stat(s, ORDER_FALLBACK);
}
if (kmemcheck_enabled
@@ -1422,23 +1410,22 @@ static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node)
static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
{
struct kmem_cache_node *n = get_node(s, page_to_nid(page));
- struct kmem_cache_cpu *c = get_cpu_slab(s, smp_processor_id());
__ClearPageSlubFrozen(page);
if (page->inuse) {
if (page->freelist) {
add_partial(n, page, tail);
- stat(c, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD);
+ stat(s, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD);
} else {
- stat(c, DEACTIVATE_FULL);
+ stat(s, DEACTIVATE_FULL);
if (SLABDEBUG && PageSlubDebug(page) &&
(s->flags & SLAB_STORE_USER))
add_full(n, page);
}
slab_unlock(page);
} else {
- stat(c, DEACTIVATE_EMPTY);
+ stat(s, DEACTIVATE_EMPTY);
if (n->nr_partial < s->min_partial) {
/*
* Adding an empty slab to the partial slabs in order
@@ -1454,7 +1441,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
slab_unlock(page);
} else {
slab_unlock(page);
- stat(get_cpu_slab(s, raw_smp_processor_id()), FREE_SLAB);
+ stat(s, FREE_SLAB);
discard_slab(s, page);
}
}
@@ -1469,7 +1456,7 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
int tail = 1;
if (page->freelist)
- stat(c, DEACTIVATE_REMOTE_FREES);
+ stat(s, DEACTIVATE_REMOTE_FREES);
/*
* Merge cpu freelist into slab freelist. Typically we get here
* because both freelists are empty. So this is unlikely
@@ -1482,10 +1469,10 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
/* Retrieve object from cpu_freelist */
object = c->freelist;
- c->freelist = c->freelist[c->offset];
+ c->freelist = get_freepointer(s, c->freelist);
/* And put onto the regular freelist */
- object[c->offset] = page->freelist;
+ set_freepointer(s, object, page->freelist);
page->freelist = object;
page->inuse--;
}
@@ -1495,7 +1482,7 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
{
- stat(c, CPUSLAB_FLUSH);
+ stat(s, CPUSLAB_FLUSH);
slab_lock(c->page);
deactivate_slab(s, c);
}
@@ -1507,7 +1494,7 @@ static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
*/
static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
{
- struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
+ struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
if (likely(c && c->page))
flush_slab(s, c);
@@ -1635,7 +1622,7 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
if (unlikely(!node_match(c, node)))
goto another_slab;
- stat(c, ALLOC_REFILL);
+ stat(s, ALLOC_REFILL);
load_freelist:
object = c->page->freelist;
@@ -1644,13 +1631,13 @@ load_freelist:
if (unlikely(SLABDEBUG && PageSlubDebug(c->page)))
goto debug;
- c->freelist = object[c->offset];
+ c->freelist = get_freepointer(s, object);
c->page->inuse = c->page->objects;
c->page->freelist = NULL;
c->node = page_to_nid(c->page);
unlock_out:
slab_unlock(c->page);
- stat(c, ALLOC_SLOWPATH);
+ stat(s, ALLOC_SLOWPATH);
return object;
another_slab:
@@ -1660,7 +1647,7 @@ new_slab:
new = get_partial(s, gfpflags, node);
if (new) {
c->page = new;
- stat(c, ALLOC_FROM_PARTIAL);
+ stat(s, ALLOC_FROM_PARTIAL);
goto load_freelist;
}
@@ -1673,8 +1660,8 @@ new_slab:
local_irq_disable();
if (new) {
- c = get_cpu_slab(s, smp_processor_id());
- stat(c, ALLOC_SLAB);
+ c = __this_cpu_ptr(s->cpu_slab);
+ stat(s, ALLOC_SLAB);
if (c->page)
flush_slab(s, c);
slab_lock(new);
@@ -1690,7 +1677,7 @@ debug:
goto another_slab;
c->page->inuse++;
- c->page->freelist = object[c->offset];
+ c->page->freelist = get_freepointer(s, object);
c->node = -1;
goto unlock_out;
}
@@ -1711,35 +1698,33 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
void **object;
struct kmem_cache_cpu *c;
unsigned long flags;
- unsigned int objsize;
gfpflags &= gfp_allowed_mask;
lockdep_trace_alloc(gfpflags);
might_sleep_if(gfpflags & __GFP_WAIT);
- if (should_failslab(s->objsize, gfpflags))
+ if (should_failslab(s->objsize, gfpflags, s->flags))
return NULL;
local_irq_save(flags);
- c = get_cpu_slab(s, smp_processor_id());
- objsize = c->objsize;
- if (unlikely(!c->freelist || !node_match(c, node)))
+ c = __this_cpu_ptr(s->cpu_slab);
+ object = c->freelist;
+ if (unlikely(!object || !node_match(c, node)))
object = __slab_alloc(s, gfpflags, node, addr, c);
else {
- object = c->freelist;
- c->freelist = object[c->offset];
- stat(c, ALLOC_FASTPATH);
+ c->freelist = get_freepointer(s, object);
+ stat(s, ALLOC_FASTPATH);
}
local_irq_restore(flags);
if (unlikely(gfpflags & __GFP_ZERO) && object)
- memset(object, 0, objsize);
+ memset(object, 0, s->objsize);
- kmemcheck_slab_alloc(s, gfpflags, object, c->objsize);
- kmemleak_alloc_recursive(object, objsize, 1, s->flags, gfpflags);
+ kmemcheck_slab_alloc(s, gfpflags, object, s->objsize);
+ kmemleak_alloc_recursive(object, s->objsize, 1, s->flags, gfpflags);
return object;
}
@@ -1794,26 +1779,25 @@ EXPORT_SYMBOL(kmem_cache_alloc_node_notrace);
* handling required then we can return immediately.
*/
static void __slab_free(struct kmem_cache *s, struct page *page,
- void *x, unsigned long addr, unsigned int offset)
+ void *x, unsigned long addr)
{
void *prior;
void **object = (void *)x;
- struct kmem_cache_cpu *c;
- c = get_cpu_slab(s, raw_smp_processor_id());
- stat(c, FREE_SLOWPATH);
+ stat(s, FREE_SLOWPATH);
slab_lock(page);
if (unlikely(SLABDEBUG && PageSlubDebug(page)))
goto debug;
checks_ok:
- prior = object[offset] = page->freelist;
+ prior = page->freelist;
+ set_freepointer(s, object, prior);
page->freelist = object;
page->inuse--;
if (unlikely(PageSlubFrozen(page))) {
- stat(c, FREE_FROZEN);
+ stat(s, FREE_FROZEN);
goto out_unlock;
}
@@ -1826,7 +1810,7 @@ checks_ok:
*/
if (unlikely(!prior)) {
add_partial(get_node(s, page_to_nid(page)), page, 1);
- stat(c, FREE_ADD_PARTIAL);
+ stat(s, FREE_ADD_PARTIAL);
}
out_unlock:
@@ -1839,10 +1823,10 @@ slab_empty:
* Slab still on the partial list.
*/
remove_partial(s, page);
- stat(c, FREE_REMOVE_PARTIAL);
+ stat(s, FREE_REMOVE_PARTIAL);
}
slab_unlock(page);
- stat(c, FREE_SLAB);
+ stat(s, FREE_SLAB);
discard_slab(s, page);
return;
@@ -1872,17 +1856,17 @@ static __always_inline void slab_free(struct kmem_cache *s,
kmemleak_free_recursive(x, s->flags);
local_irq_save(flags);
- c = get_cpu_slab(s, smp_processor_id());
- kmemcheck_slab_free(s, object, c->objsize);
- debug_check_no_locks_freed(object, c->objsize);
+ c = __this_cpu_ptr(s->cpu_slab);
+ kmemcheck_slab_free(s, object, s->objsize);
+ debug_check_no_locks_freed(object, s->objsize);
if (!(s->flags & SLAB_DEBUG_OBJECTS))
- debug_check_no_obj_freed(object, c->objsize);
+ debug_check_no_obj_freed(object, s->objsize);
if (likely(page == c->page && c->node >= 0)) {
- object[c->offset] = c->freelist;
+ set_freepointer(s, object, c->freelist);
c->freelist = object;
- stat(c, FREE_FASTPATH);
+ stat(s, FREE_FASTPATH);
} else
- __slab_free(s, page, x, addr, c->offset);
+ __slab_free(s, page, x, addr);
local_irq_restore(flags);
}
@@ -2069,19 +2053,6 @@ static unsigned long calculate_alignment(unsigned long flags,
return ALIGN(align, sizeof(void *));
}
-static void init_kmem_cache_cpu(struct kmem_cache *s,
- struct kmem_cache_cpu *c)
-{
- c->page = NULL;
- c->freelist = NULL;
- c->node = 0;
- c->offset = s->offset / sizeof(void *);
- c->objsize = s->objsize;
-#ifdef CONFIG_SLUB_STATS
- memset(c->stat, 0, NR_SLUB_STAT_ITEMS * sizeof(unsigned));
-#endif
-}
-
static void
init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s)
{
@@ -2095,130 +2066,24 @@ init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s)
#endif
}
-#ifdef CONFIG_SMP
-/*
- * Per cpu array for per cpu structures.
- *
- * The per cpu array places all kmem_cache_cpu structures from one processor
- * close together meaning that it becomes possible that multiple per cpu
- * structures are contained in one cacheline. This may be particularly
- * beneficial for the kmalloc caches.
- *
- * A desktop system typically has around 60-80 slabs. With 100 here we are
- * likely able to get per cpu structures for all caches from the array defined
- * here. We must be able to cover all kmalloc caches during bootstrap.
- *
- * If the per cpu array is exhausted then fall back to kmalloc
- * of individual cachelines. No sharing is possible then.
- */
-#define NR_KMEM_CACHE_CPU 100
-
-static DEFINE_PER_CPU(struct kmem_cache_cpu [NR_KMEM_CACHE_CPU],
- kmem_cache_cpu);
-
-static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free);
-static DECLARE_BITMAP(kmem_cach_cpu_free_init_once, CONFIG_NR_CPUS);
-
-static struct kmem_cache_cpu *alloc_kmem_cache_cpu(struct kmem_cache *s,
- int cpu, gfp_t flags)
-{
- struct kmem_cache_cpu *c = per_cpu(kmem_cache_cpu_free, cpu);
-
- if (c)
- per_cpu(kmem_cache_cpu_free, cpu) =
- (void *)c->freelist;
- else {
- /* Table overflow: So allocate ourselves */
- c = kmalloc_node(
- ALIGN(sizeof(struct kmem_cache_cpu), cache_line_size()),
- flags, cpu_to_node(cpu));
- if (!c)
- return NULL;
- }
-
- init_kmem_cache_cpu(s, c);
- return c;
-}
-
-static void free_kmem_cache_cpu(struct kmem_cache_cpu *c, int cpu)
-{
- if (c < per_cpu(kmem_cache_cpu, cpu) ||
- c >= per_cpu(kmem_cache_cpu, cpu) + NR_KMEM_CACHE_CPU) {
- kfree(c);
- return;
- }
- c->freelist = (void *)per_cpu(kmem_cache_cpu_free, cpu);
- per_cpu(kmem_cache_cpu_free, cpu) = c;
-}
-
-static void free_kmem_cache_cpus(struct kmem_cache *s)
-{
- int cpu;
-
- for_each_online_cpu(cpu) {
- struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
-
- if (c) {
- s->cpu_slab[cpu] = NULL;
- free_kmem_cache_cpu(c, cpu);
- }
- }
-}
-
-static int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags)
-{
- int cpu;
-
- for_each_online_cpu(cpu) {
- struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
-
- if (c)
- continue;
-
- c = alloc_kmem_cache_cpu(s, cpu, flags);
- if (!c) {
- free_kmem_cache_cpus(s);
- return 0;
- }
- s->cpu_slab[cpu] = c;
- }
- return 1;
-}
-
-/*
- * Initialize the per cpu array.
- */
-static void init_alloc_cpu_cpu(int cpu)
-{
- int i;
+static DEFINE_PER_CPU(struct kmem_cache_cpu, kmalloc_percpu[KMALLOC_CACHES]);
- if (cpumask_test_cpu(cpu, to_cpumask(kmem_cach_cpu_free_init_once)))
- return;
-
- for (i = NR_KMEM_CACHE_CPU - 1; i >= 0; i--)
- free_kmem_cache_cpu(&per_cpu(kmem_cache_cpu, cpu)[i], cpu);
-
- cpumask_set_cpu(cpu, to_cpumask(kmem_cach_cpu_free_init_once));
-}
-
-static void __init init_alloc_cpu(void)
+static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags)
{
- int cpu;
-
- for_each_online_cpu(cpu)
- init_alloc_cpu_cpu(cpu);
- }
+ if (s < kmalloc_caches + KMALLOC_CACHES && s >= kmalloc_caches)
+ /*
+ * Boot time creation of the kmalloc array. Use static per cpu data
+ * since the per cpu allocator is not available yet.
+ */
+ s->cpu_slab = kmalloc_percpu + (s - kmalloc_caches);
+ else
+ s->cpu_slab = alloc_percpu(struct kmem_cache_cpu);
-#else
-static inline void free_kmem_cache_cpus(struct kmem_cache *s) {}
-static inline void init_alloc_cpu(void) {}
+ if (!s->cpu_slab)
+ return 0;
-static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags)
-{
- init_kmem_cache_cpu(s, &s->cpu_slab);
return 1;
}
-#endif
#ifdef CONFIG_NUMA
/*
@@ -2287,7 +2152,8 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
int node;
int local_node;
- if (slab_state >= UP)
+ if (slab_state >= UP && (s < kmalloc_caches ||
+ s > kmalloc_caches + KMALLOC_CACHES))
local_node = page_to_nid(virt_to_page(s));
else
local_node = 0;
@@ -2502,6 +2368,7 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
if (alloc_kmem_cache_cpus(s, gfpflags & ~SLUB_DMA))
return 1;
+
free_kmem_cache_nodes(s);
error:
if (flags & SLAB_PANIC)
@@ -2609,9 +2476,8 @@ static inline int kmem_cache_close(struct kmem_cache *s)
int node;
flush_all(s);
-
+ free_percpu(s->cpu_slab);
/* Attempt to free all objects */
- free_kmem_cache_cpus(s);
for_each_node_state(node, N_NORMAL_MEMORY) {
struct kmem_cache_node *n = get_node(s, node);
@@ -2651,7 +2517,7 @@ EXPORT_SYMBOL(kmem_cache_destroy);
* Kmalloc subsystem
*******************************************************************/
-struct kmem_cache kmalloc_caches[SLUB_PAGE_SHIFT] __cacheline_aligned;
+struct kmem_cache kmalloc_caches[KMALLOC_CACHES] __cacheline_aligned;
EXPORT_SYMBOL(kmalloc_caches);
static int __init setup_slub_min_order(char *str)
@@ -2741,6 +2607,7 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags)
char *text;
size_t realsize;
unsigned long slabflags;
+ int i;
s = kmalloc_caches_dma[index];
if (s)
@@ -2760,7 +2627,14 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags)
realsize = kmalloc_caches[index].objsize;
text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d",
(unsigned int)realsize);
- s = kmalloc(kmem_size, flags & ~SLUB_DMA);
+
+ s = NULL;
+ for (i = 0; i < KMALLOC_CACHES; i++)
+ if (!kmalloc_caches[i].size)
+ break;
+
+ BUG_ON(i >= KMALLOC_CACHES);
+ s = kmalloc_caches + i;
/*
* Must defer sysfs creation to a workqueue because we don't know
@@ -2772,9 +2646,9 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags)
if (slab_state >= SYSFS)
slabflags |= __SYSFS_ADD_DEFERRED;
- if (!s || !text || !kmem_cache_open(s, flags, text,
+ if (!text || !kmem_cache_open(s, flags, text,
realsize, ARCH_KMALLOC_MINALIGN, slabflags, NULL)) {
- kfree(s);
+ s->size = 0;
kfree(text);
goto unlock_out;
}
@@ -3176,8 +3050,6 @@ void __init kmem_cache_init(void)
int i;
int caches = 0;
- init_alloc_cpu();
-
#ifdef CONFIG_NUMA
/*
* Must first have the slab cache available for the allocations of the
@@ -3261,8 +3133,10 @@ void __init kmem_cache_init(void)
#ifdef CONFIG_SMP
register_cpu_notifier(&slab_notifier);
- kmem_size = offsetof(struct kmem_cache, cpu_slab) +
- nr_cpu_ids * sizeof(struct kmem_cache_cpu *);
+#endif
+#ifdef CONFIG_NUMA
+ kmem_size = offsetof(struct kmem_cache, node) +
+ nr_node_ids * sizeof(struct kmem_cache_node *);
#else
kmem_size = sizeof(struct kmem_cache);
#endif
@@ -3351,22 +3225,12 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
down_write(&slub_lock);
s = find_mergeable(size, align, flags, name, ctor);
if (s) {
- int cpu;
-
s->refcount++;
/*
* Adjust the object sizes so that we clear
* the complete object on kzalloc.
*/
s->objsize = max(s->objsize, (int)size);
-
- /*
- * And then we need to update the object size in the
- * per cpu structures
- */
- for_each_online_cpu(cpu)
- get_cpu_slab(s, cpu)->objsize = s->objsize;
-
s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
up_write(&slub_lock);
@@ -3420,29 +3284,15 @@ static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb,
unsigned long flags;
switch (action) {
- case CPU_UP_PREPARE:
- case CPU_UP_PREPARE_FROZEN:
- init_alloc_cpu_cpu(cpu);
- down_read(&slub_lock);
- list_for_each_entry(s, &slab_caches, list)
- s->cpu_slab[cpu] = alloc_kmem_cache_cpu(s, cpu,
- GFP_KERNEL);
- up_read(&slub_lock);
- break;
-
case CPU_UP_CANCELED:
case CPU_UP_CANCELED_FROZEN:
case CPU_DEAD:
case CPU_DEAD_FROZEN:
down_read(&slub_lock);
list_for_each_entry(s, &slab_caches, list) {
- struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
-
local_irq_save(flags);
__flush_cpu_slab(s, cpu);
local_irq_restore(flags);
- free_kmem_cache_cpu(c, cpu);
- s->cpu_slab[cpu] = NULL;
}
up_read(&slub_lock);
break;
@@ -3928,7 +3778,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
int cpu;
for_each_possible_cpu(cpu) {
- struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
+ struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
if (!c || c->node < 0)
continue;
@@ -4171,6 +4021,23 @@ static ssize_t trace_store(struct kmem_cache *s, const char *buf,
}
SLAB_ATTR(trace);
+#ifdef CONFIG_FAILSLAB
+static ssize_t failslab_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB));
+}
+
+static ssize_t failslab_store(struct kmem_cache *s, const char *buf,
+ size_t length)
+{
+ s->flags &= ~SLAB_FAILSLAB;
+ if (buf[0] == '1')
+ s->flags |= SLAB_FAILSLAB;
+ return length;
+}
+SLAB_ATTR(failslab);
+#endif
+
static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf)
{
return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
@@ -4353,7 +4220,7 @@ static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si)
return -ENOMEM;
for_each_online_cpu(cpu) {
- unsigned x = get_cpu_slab(s, cpu)->stat[si];
+ unsigned x = per_cpu_ptr(s->cpu_slab, cpu)->stat[si];
data[cpu] = x;
sum += x;
@@ -4376,7 +4243,7 @@ static void clear_stat(struct kmem_cache *s, enum stat_item si)
int cpu;
for_each_online_cpu(cpu)
- get_cpu_slab(s, cpu)->stat[si] = 0;
+ per_cpu_ptr(s->cpu_slab, cpu)->stat[si] = 0;
}
#define STAT_ATTR(si, text) \
@@ -4467,6 +4334,10 @@ static struct attribute *slab_attrs[] = {
&deactivate_remote_frees_attr.attr,
&order_fallback_attr.attr,
#endif
+#ifdef CONFIG_FAILSLAB
+ &failslab_attr.attr,
+#endif
+
NULL
};
diff --git a/mm/swap.c b/mm/swap.c
index 308e57d8d7e..9036b89813a 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -55,7 +55,7 @@ static void __page_cache_release(struct page *page)
del_page_from_lru(zone, page);
spin_unlock_irqrestore(&zone->lru_lock, flags);
}
- free_hot_page(page);
+ free_hot_cold_page(page, 0);
}
static void put_compound_page(struct page *page)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 6c0585b1641..84374d8cf81 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -840,7 +840,8 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
goto out;
}
- inc_mm_counter(vma->vm_mm, anon_rss);
+ dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
+ inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
get_page(page);
set_pte_at(vma->vm_mm, addr, pte,
pte_mkold(mk_pte(page, vma->vm_page_prot)));
@@ -1759,11 +1760,11 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
unsigned int type;
int i, prev;
int error;
- union swap_header *swap_header = NULL;
- unsigned int nr_good_pages = 0;
+ union swap_header *swap_header;
+ unsigned int nr_good_pages;
int nr_extents = 0;
sector_t span;
- unsigned long maxpages = 1;
+ unsigned long maxpages;
unsigned long swapfilepages;
unsigned char *swap_map = NULL;
struct page *page = NULL;
@@ -1922,9 +1923,13 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
* swap pte.
*/
maxpages = swp_offset(pte_to_swp_entry(
- swp_entry_to_pte(swp_entry(0, ~0UL)))) - 1;
- if (maxpages > swap_header->info.last_page)
- maxpages = swap_header->info.last_page;
+ swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
+ if (maxpages > swap_header->info.last_page) {
+ maxpages = swap_header->info.last_page + 1;
+ /* p->max is an unsigned int: don't overflow it */
+ if ((unsigned int)maxpages == 0)
+ maxpages = UINT_MAX;
+ }
p->highest_bit = maxpages - 1;
error = -EINVAL;
@@ -1948,23 +1953,24 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
}
memset(swap_map, 0, maxpages);
+ nr_good_pages = maxpages - 1; /* omit header page */
+
for (i = 0; i < swap_header->info.nr_badpages; i++) {
- int page_nr = swap_header->info.badpages[i];
- if (page_nr <= 0 || page_nr >= swap_header->info.last_page) {
+ unsigned int page_nr = swap_header->info.badpages[i];
+ if (page_nr == 0 || page_nr > swap_header->info.last_page) {
error = -EINVAL;
goto bad_swap;
}
- swap_map[page_nr] = SWAP_MAP_BAD;
+ if (page_nr < maxpages) {
+ swap_map[page_nr] = SWAP_MAP_BAD;
+ nr_good_pages--;
+ }
}
error = swap_cgroup_swapon(type, maxpages);
if (error)
goto bad_swap;
- nr_good_pages = swap_header->info.last_page -
- swap_header->info.nr_badpages -
- 1 /* header page */;
-
if (nr_good_pages) {
swap_map[0] = SWAP_MAP_BAD;
p->max = maxpages;
@@ -2155,7 +2161,11 @@ void swap_shmem_alloc(swp_entry_t entry)
}
/*
- * increase reference count of swap entry by 1.
+ * Increase reference count of swap entry by 1.
+ * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required
+ * but could not be atomically allocated. Returns 0, just as if it succeeded,
+ * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which
+ * might occur if a page table entry has got corrupted.
*/
int swap_duplicate(swp_entry_t entry)
{
diff --git a/mm/vmscan.c b/mm/vmscan.c
index c26986c85ce..79c809895fb 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -262,27 +262,6 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
return ret;
}
-/* Called without lock on whether page is mapped, so answer is unstable */
-static inline int page_mapping_inuse(struct page *page)
-{
- struct address_space *mapping;
-
- /* Page is in somebody's page tables. */
- if (page_mapped(page))
- return 1;
-
- /* Be more reluctant to reclaim swapcache than pagecache */
- if (PageSwapCache(page))
- return 1;
-
- mapping = page_mapping(page);
- if (!mapping)
- return 0;
-
- /* File is mmap'd by somebody? */
- return mapping_mapped(mapping);
-}
-
static inline int is_page_cache_freeable(struct page *page)
{
/*
@@ -579,6 +558,65 @@ redo:
put_page(page); /* drop ref from isolate */
}
+enum page_references {
+ PAGEREF_RECLAIM,
+ PAGEREF_RECLAIM_CLEAN,
+ PAGEREF_KEEP,
+ PAGEREF_ACTIVATE,
+};
+
+static enum page_references page_check_references(struct page *page,
+ struct scan_control *sc)
+{
+ int referenced_ptes, referenced_page;
+ unsigned long vm_flags;
+
+ referenced_ptes = page_referenced(page, 1, sc->mem_cgroup, &vm_flags);
+ referenced_page = TestClearPageReferenced(page);
+
+ /* Lumpy reclaim - ignore references */
+ if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
+ return PAGEREF_RECLAIM;
+
+ /*
+ * Mlock lost the isolation race with us. Let try_to_unmap()
+ * move the page to the unevictable list.
+ */
+ if (vm_flags & VM_LOCKED)
+ return PAGEREF_RECLAIM;
+
+ if (referenced_ptes) {
+ if (PageAnon(page))
+ return PAGEREF_ACTIVATE;
+ /*
+ * All mapped pages start out with page table
+ * references from the instantiating fault, so we need
+ * to look twice if a mapped file page is used more
+ * than once.
+ *
+ * Mark it and spare it for another trip around the
+ * inactive list. Another page table reference will
+ * lead to its activation.
+ *
+ * Note: the mark is set for activated pages as well
+ * so that recently deactivated but used pages are
+ * quickly recovered.
+ */
+ SetPageReferenced(page);
+
+ if (referenced_page)
+ return PAGEREF_ACTIVATE;
+
+ return PAGEREF_KEEP;
+ }
+
+ /* Reclaim if clean, defer dirty pages to writeback */
+ if (referenced_page)
+ return PAGEREF_RECLAIM_CLEAN;
+
+ return PAGEREF_RECLAIM;
+}
+
/*
* shrink_page_list() returns the number of reclaimed pages
*/
@@ -590,16 +628,15 @@ static unsigned long shrink_page_list(struct list_head *page_list,
struct pagevec freed_pvec;
int pgactivate = 0;
unsigned long nr_reclaimed = 0;
- unsigned long vm_flags;
cond_resched();
pagevec_init(&freed_pvec, 1);
while (!list_empty(page_list)) {
+ enum page_references references;
struct address_space *mapping;
struct page *page;
int may_enter_fs;
- int referenced;
cond_resched();
@@ -641,17 +678,16 @@ static unsigned long shrink_page_list(struct list_head *page_list,
goto keep_locked;
}
- referenced = page_referenced(page, 1,
- sc->mem_cgroup, &vm_flags);
- /*
- * In active use or really unfreeable? Activate it.
- * If page which have PG_mlocked lost isoltation race,
- * try_to_unmap moves it to unevictable list
- */
- if (sc->order <= PAGE_ALLOC_COSTLY_ORDER &&
- referenced && page_mapping_inuse(page)
- && !(vm_flags & VM_LOCKED))
+ references = page_check_references(page, sc);
+ switch (references) {
+ case PAGEREF_ACTIVATE:
goto activate_locked;
+ case PAGEREF_KEEP:
+ goto keep_locked;
+ case PAGEREF_RECLAIM:
+ case PAGEREF_RECLAIM_CLEAN:
+ ; /* try to reclaim the page below */
+ }
/*
* Anonymous process memory has backing store?
@@ -685,7 +721,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
}
if (PageDirty(page)) {
- if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && referenced)
+ if (references == PAGEREF_RECLAIM_CLEAN)
goto keep_locked;
if (!may_enter_fs)
goto keep_locked;
@@ -1350,9 +1386,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
continue;
}
- /* page_referenced clears PageReferenced */
- if (page_mapping_inuse(page) &&
- page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) {
+ if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) {
nr_rotated++;
/*
* Identify referenced, file-backed active pages and
@@ -1501,6 +1535,13 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
unsigned long ap, fp;
struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
+ /* If we have no swap space, do not bother scanning anon pages. */
+ if (!sc->may_swap || (nr_swap_pages <= 0)) {
+ percent[0] = 0;
+ percent[1] = 100;
+ return;
+ }
+
anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
@@ -1598,22 +1639,20 @@ static void shrink_zone(int priority, struct zone *zone,
unsigned long nr_reclaimed = sc->nr_reclaimed;
unsigned long nr_to_reclaim = sc->nr_to_reclaim;
struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
- int noswap = 0;
- /* If we have no swap space, do not bother scanning anon pages. */
- if (!sc->may_swap || (nr_swap_pages <= 0)) {
- noswap = 1;
- percent[0] = 0;
- percent[1] = 100;
- } else
- get_scan_ratio(zone, sc, percent);
+ get_scan_ratio(zone, sc, percent);
for_each_evictable_lru(l) {
int file = is_file_lru(l);
unsigned long scan;
+ if (percent[file] == 0) {
+ nr[l] = 0;
+ continue;
+ }
+
scan = zone_nr_lru_pages(zone, sc, l);
- if (priority || noswap) {
+ if (priority) {
scan >>= priority;
scan = (scan * percent[file]) / 100;
}
@@ -1694,8 +1733,7 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
continue;
note_zone_scanning_priority(zone, priority);
- if (zone_is_all_unreclaimable(zone) &&
- priority != DEF_PRIORITY)
+ if (zone->all_unreclaimable && priority != DEF_PRIORITY)
continue; /* Let kswapd poll it */
sc->all_unreclaimable = 0;
} else {
@@ -1922,7 +1960,7 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)
if (!populated_zone(zone))
continue;
- if (zone_is_all_unreclaimable(zone))
+ if (zone->all_unreclaimable)
continue;
if (!zone_watermark_ok(zone, order, high_wmark_pages(zone),
@@ -2012,8 +2050,7 @@ loop_again:
if (!populated_zone(zone))
continue;
- if (zone_is_all_unreclaimable(zone) &&
- priority != DEF_PRIORITY)
+ if (zone->all_unreclaimable && priority != DEF_PRIORITY)
continue;
/*
@@ -2056,13 +2093,9 @@ loop_again:
if (!populated_zone(zone))
continue;
- if (zone_is_all_unreclaimable(zone) &&
- priority != DEF_PRIORITY)
+ if (zone->all_unreclaimable && priority != DEF_PRIORITY)
continue;
- if (!zone_watermark_ok(zone, order,
- high_wmark_pages(zone), end_zone, 0))
- all_zones_ok = 0;
temp_priority[i] = priority;
sc.nr_scanned = 0;
note_zone_scanning_priority(zone, priority);
@@ -2087,12 +2120,11 @@ loop_again:
lru_pages);
sc.nr_reclaimed += reclaim_state->reclaimed_slab;
total_scanned += sc.nr_scanned;
- if (zone_is_all_unreclaimable(zone))
+ if (zone->all_unreclaimable)
continue;
- if (nr_slab == 0 && zone->pages_scanned >=
- (zone_reclaimable_pages(zone) * 6))
- zone_set_flag(zone,
- ZONE_ALL_UNRECLAIMABLE);
+ if (nr_slab == 0 &&
+ zone->pages_scanned >= (zone_reclaimable_pages(zone) * 6))
+ zone->all_unreclaimable = 1;
/*
* If we've done a decent amount of scanning and
* the reclaim ratio is low, start doing writepage
@@ -2102,13 +2134,18 @@ loop_again:
total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
sc.may_writepage = 1;
- /*
- * We are still under min water mark. it mean we have
- * GFP_ATOMIC allocation failure risk. Hurry up!
- */
- if (!zone_watermark_ok(zone, order, min_wmark_pages(zone),
- end_zone, 0))
- has_under_min_watermark_zone = 1;
+ if (!zone_watermark_ok(zone, order,
+ high_wmark_pages(zone), end_zone, 0)) {
+ all_zones_ok = 0;
+ /*
+ * We are still under min water mark. This
+ * means that we have a GFP_ATOMIC allocation
+ * failure risk. Hurry up!
+ */
+ if (!zone_watermark_ok(zone, order,
+ min_wmark_pages(zone), end_zone, 0))
+ has_under_min_watermark_zone = 1;
+ }
}
if (all_zones_ok)
@@ -2550,6 +2587,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
* and RECLAIM_SWAP.
*/
p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
+ lockdep_set_current_reclaim_state(gfp_mask);
reclaim_state.reclaimed_slab = 0;
p->reclaim_state = &reclaim_state;
@@ -2593,6 +2631,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
p->reclaim_state = NULL;
current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
+ lockdep_clear_current_reclaim_state();
return sc.nr_reclaimed >= nr_pages;
}
@@ -2615,7 +2654,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)
return ZONE_RECLAIM_FULL;
- if (zone_is_all_unreclaimable(zone))
+ if (zone->all_unreclaimable)
return ZONE_RECLAIM_FULL;
/*
diff --git a/mm/vmstat.c b/mm/vmstat.c
index fc5aa183bc4..7f760cbc73f 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -763,7 +763,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
"\n prev_priority: %i"
"\n start_pfn: %lu"
"\n inactive_ratio: %u",
- zone_is_all_unreclaimable(zone),
+ zone->all_unreclaimable,
zone->prev_priority,
zone->zone_start_pfn,
zone->inactive_ratio);