summaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/compaction.c1
-rw-r--r--mm/filemap.c37
-rw-r--r--mm/hugetlb.c3
-rw-r--r--mm/ksm.c7
-rw-r--r--mm/memcontrol.c101
-rw-r--r--mm/memory-failure.c8
-rw-r--r--mm/memory_hotplug.c31
-rw-r--r--mm/mempolicy.c3
-rw-r--r--mm/migrate.c2
-rw-r--r--mm/mmap.c16
-rw-r--r--mm/nommu.c29
-rw-r--r--mm/page-writeback.c2
-rw-r--r--mm/page_alloc.c33
-rw-r--r--mm/pagewalk.c5
-rw-r--r--mm/percpu.c10
-rw-r--r--mm/shmem.c9
-rw-r--r--mm/slab.c76
-rw-r--r--mm/slob.c5
-rw-r--r--mm/slub.c77
-rw-r--r--mm/truncate.c4
-rw-r--r--mm/util.c21
-rw-r--r--mm/vmalloc.c28
-rw-r--r--mm/vmscan.c9
-rw-r--r--mm/vmstat.c155
24 files changed, 394 insertions, 278 deletions
diff --git a/mm/compaction.c b/mm/compaction.c
index 4d709ee5901..1a8894eadf7 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -279,7 +279,6 @@ static unsigned long isolate_migratepages(struct zone *zone,
/* Successfully isolated */
del_page_from_lru_list(zone, page, page_lru(page));
list_add(&page->lru, migratelist);
- mem_cgroup_del_lru(page);
cc->nr_migratepages++;
/* Avoid isolating too much */
diff --git a/mm/filemap.c b/mm/filemap.c
index 61ba5e40579..ca389394fa2 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -102,9 +102,6 @@
* ->inode_lock (zap_pte_range->set_page_dirty)
* ->private_lock (zap_pte_range->__set_page_dirty_buffers)
*
- * ->task->proc_lock
- * ->dcache_lock (proc_pid_lookup)
- *
* (code doesn't rely on that order, so you could switch it around)
* ->tasklist_lock (memory_failure, collect_procs_ao)
* ->i_mmap_lock
@@ -143,13 +140,18 @@ void __remove_from_page_cache(struct page *page)
void remove_from_page_cache(struct page *page)
{
struct address_space *mapping = page->mapping;
+ void (*freepage)(struct page *);
BUG_ON(!PageLocked(page));
+ freepage = mapping->a_ops->freepage;
spin_lock_irq(&mapping->tree_lock);
__remove_from_page_cache(page);
spin_unlock_irq(&mapping->tree_lock);
mem_cgroup_uncharge_cache_page(page);
+
+ if (freepage)
+ freepage(page);
}
EXPORT_SYMBOL(remove_from_page_cache);
@@ -644,7 +646,9 @@ repeat:
pagep = radix_tree_lookup_slot(&mapping->page_tree, offset);
if (pagep) {
page = radix_tree_deref_slot(pagep);
- if (unlikely(!page || page == RADIX_TREE_RETRY))
+ if (unlikely(!page))
+ goto out;
+ if (radix_tree_deref_retry(page))
goto repeat;
if (!page_cache_get_speculative(page))
@@ -660,6 +664,7 @@ repeat:
goto repeat;
}
}
+out:
rcu_read_unlock();
return page;
@@ -777,12 +782,11 @@ repeat:
page = radix_tree_deref_slot((void **)pages[i]);
if (unlikely(!page))
continue;
- /*
- * this can only trigger if nr_found == 1, making livelock
- * a non issue.
- */
- if (unlikely(page == RADIX_TREE_RETRY))
+ if (radix_tree_deref_retry(page)) {
+ if (ret)
+ start = pages[ret-1]->index;
goto restart;
+ }
if (!page_cache_get_speculative(page))
goto repeat;
@@ -830,11 +834,7 @@ repeat:
page = radix_tree_deref_slot((void **)pages[i]);
if (unlikely(!page))
continue;
- /*
- * this can only trigger if nr_found == 1, making livelock
- * a non issue.
- */
- if (unlikely(page == RADIX_TREE_RETRY))
+ if (radix_tree_deref_retry(page))
goto restart;
if (page->mapping == NULL || page->index != index)
@@ -887,11 +887,7 @@ repeat:
page = radix_tree_deref_slot((void **)pages[i]);
if (unlikely(!page))
continue;
- /*
- * this can only trigger if nr_found == 1, making livelock
- * a non issue.
- */
- if (unlikely(page == RADIX_TREE_RETRY))
+ if (radix_tree_deref_retry(page))
goto restart;
if (!page_cache_get_speculative(page))
@@ -1029,6 +1025,9 @@ find_page:
goto page_not_up_to_date;
if (!trylock_page(page))
goto page_not_up_to_date;
+ /* Did it get truncated before we got the lock? */
+ if (!page->mapping)
+ goto page_not_up_to_date_locked;
if (!mapping->a_ops->is_partially_uptodate(page,
desc, offset))
goto page_not_up_to_date_locked;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index c4a3558589a..85855240933 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2738,7 +2738,8 @@ out_page_table_lock:
unlock_page(pagecache_page);
put_page(pagecache_page);
}
- unlock_page(page);
+ if (page != pagecache_page)
+ unlock_page(page);
out_mutex:
mutex_unlock(&hugetlb_instantiation_mutex);
diff --git a/mm/ksm.c b/mm/ksm.c
index 65ab5c7067d..43bc893470b 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1724,8 +1724,13 @@ static int ksm_memory_callback(struct notifier_block *self,
/*
* Keep it very simple for now: just lock out ksmd and
* MADV_UNMERGEABLE while any memory is going offline.
+ * mutex_lock_nested() is necessary because lockdep was alarmed
+ * that here we take ksm_thread_mutex inside notifier chain
+ * mutex, and later take notifier chain mutex inside
+ * ksm_thread_mutex to unlock it. But that's safe because both
+ * are inside mem_hotplug_mutex.
*/
- mutex_lock(&ksm_thread_mutex);
+ mutex_lock_nested(&ksm_thread_mutex, SINGLE_DEPTH_NESTING);
break;
case MEM_OFFLINE:
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 9a99cfaf0a1..00bb8a64d02 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -61,7 +61,14 @@ struct mem_cgroup *root_mem_cgroup __read_mostly;
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
int do_swap_account __read_mostly;
-static int really_do_swap_account __initdata = 1; /* for remember boot option*/
+
+/* for remember boot option*/
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED
+static int really_do_swap_account __initdata = 1;
+#else
+static int really_do_swap_account __initdata = 0;
+#endif
+
#else
#define do_swap_account (0)
#endif
@@ -278,13 +285,14 @@ enum move_type {
/* "mc" and its members are protected by cgroup_mutex */
static struct move_charge_struct {
- spinlock_t lock; /* for from, to, moving_task */
+ spinlock_t lock; /* for from, to */
struct mem_cgroup *from;
struct mem_cgroup *to;
unsigned long precharge;
unsigned long moved_charge;
unsigned long moved_swap;
struct task_struct *moving_task; /* a task moving charges */
+ struct mm_struct *mm;
wait_queue_head_t waitq; /* a waitq for other context */
} mc = {
.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
@@ -1917,19 +1925,18 @@ again:
rcu_read_lock();
p = rcu_dereference(mm->owner);
- VM_BUG_ON(!p);
/*
- * because we don't have task_lock(), "p" can exit while
- * we're here. In that case, "mem" can point to root
- * cgroup but never be NULL. (and task_struct itself is freed
- * by RCU, cgroup itself is RCU safe.) Then, we have small
- * risk here to get wrong cgroup. But such kind of mis-account
- * by race always happens because we don't have cgroup_mutex().
- * It's overkill and we allow that small race, here.
+ * Because we don't have task_lock(), "p" can exit.
+ * In that case, "mem" can point to root or p can be NULL with
+ * race with swapoff. Then, we have small risk of mis-accouning.
+ * But such kind of mis-account by race always happens because
+ * we don't have cgroup_mutex(). It's overkill and we allo that
+ * small race, here.
+ * (*) swapoff at el will charge against mm-struct not against
+ * task-struct. So, mm->owner can be NULL.
*/
mem = mem_cgroup_from_task(p);
- VM_BUG_ON(!mem);
- if (mem_cgroup_is_root(mem)) {
+ if (!mem || mem_cgroup_is_root(mem)) {
rcu_read_unlock();
goto done;
}
@@ -2152,7 +2159,7 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc,
{
VM_BUG_ON(from == to);
VM_BUG_ON(PageLRU(pc->page));
- VM_BUG_ON(!PageCgroupLocked(pc));
+ VM_BUG_ON(!page_is_cgroup_locked(pc));
VM_BUG_ON(!PageCgroupUsed(pc));
VM_BUG_ON(pc->mem_cgroup != from);
@@ -4208,15 +4215,17 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
memset(mem, 0, size);
mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
- if (!mem->stat) {
- if (size < PAGE_SIZE)
- kfree(mem);
- else
- vfree(mem);
- mem = NULL;
- }
+ if (!mem->stat)
+ goto out_free;
spin_lock_init(&mem->pcp_counter_lock);
return mem;
+
+out_free:
+ if (size < PAGE_SIZE)
+ kfree(mem);
+ else
+ vfree(mem);
+ return NULL;
}
/*
@@ -4629,7 +4638,7 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
unsigned long precharge;
struct vm_area_struct *vma;
- down_read(&mm->mmap_sem);
+ /* We've already held the mmap_sem */
for (vma = mm->mmap; vma; vma = vma->vm_next) {
struct mm_walk mem_cgroup_count_precharge_walk = {
.pmd_entry = mem_cgroup_count_precharge_pte_range,
@@ -4641,7 +4650,6 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
walk_page_range(vma->vm_start, vma->vm_end,
&mem_cgroup_count_precharge_walk);
}
- up_read(&mm->mmap_sem);
precharge = mc.precharge;
mc.precharge = 0;
@@ -4692,11 +4700,16 @@ static void mem_cgroup_clear_mc(void)
mc.moved_swap = 0;
}
+ if (mc.mm) {
+ up_read(&mc.mm->mmap_sem);
+ mmput(mc.mm);
+ }
spin_lock(&mc.lock);
mc.from = NULL;
mc.to = NULL;
- mc.moving_task = NULL;
spin_unlock(&mc.lock);
+ mc.moving_task = NULL;
+ mc.mm = NULL;
mem_cgroup_end_move(from);
memcg_oom_recover(from);
memcg_oom_recover(to);
@@ -4722,12 +4735,21 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
return 0;
/* We move charges only when we move a owner of the mm */
if (mm->owner == p) {
+ /*
+ * We do all the move charge works under one mmap_sem to
+ * avoid deadlock with down_write(&mmap_sem)
+ * -> try_charge() -> if (mc.moving_task) -> sleep.
+ */
+ down_read(&mm->mmap_sem);
+
VM_BUG_ON(mc.from);
VM_BUG_ON(mc.to);
VM_BUG_ON(mc.precharge);
VM_BUG_ON(mc.moved_charge);
VM_BUG_ON(mc.moved_swap);
VM_BUG_ON(mc.moving_task);
+ VM_BUG_ON(mc.mm);
+
mem_cgroup_start_move(from);
spin_lock(&mc.lock);
mc.from = from;
@@ -4735,14 +4757,16 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
mc.precharge = 0;
mc.moved_charge = 0;
mc.moved_swap = 0;
- mc.moving_task = current;
spin_unlock(&mc.lock);
+ mc.moving_task = current;
+ mc.mm = mm;
ret = mem_cgroup_precharge_mc(mm);
if (ret)
mem_cgroup_clear_mc();
- }
- mmput(mm);
+ /* We call up_read() and mmput() in clear_mc(). */
+ } else
+ mmput(mm);
}
return ret;
}
@@ -4830,7 +4854,7 @@ static void mem_cgroup_move_charge(struct mm_struct *mm)
struct vm_area_struct *vma;
lru_add_drain_all();
- down_read(&mm->mmap_sem);
+ /* We've already held the mmap_sem */
for (vma = mm->mmap; vma; vma = vma->vm_next) {
int ret;
struct mm_walk mem_cgroup_move_charge_walk = {
@@ -4849,7 +4873,6 @@ static void mem_cgroup_move_charge(struct mm_struct *mm)
*/
break;
}
- up_read(&mm->mmap_sem);
}
static void mem_cgroup_move_task(struct cgroup_subsys *ss,
@@ -4858,17 +4881,11 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
struct task_struct *p,
bool threadgroup)
{
- struct mm_struct *mm;
-
- if (!mc.to)
+ if (!mc.mm)
/* no need to move charge */
return;
- mm = get_task_mm(p);
- if (mm) {
- mem_cgroup_move_charge(mm);
- mmput(mm);
- }
+ mem_cgroup_move_charge(mc.mm);
mem_cgroup_clear_mc();
}
#else /* !CONFIG_MMU */
@@ -4909,10 +4926,20 @@ struct cgroup_subsys mem_cgroup_subsys = {
};
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
+static int __init enable_swap_account(char *s)
+{
+ /* consider enabled if no parameter or 1 is given */
+ if (!s || !strcmp(s, "1"))
+ really_do_swap_account = 1;
+ else if (!strcmp(s, "0"))
+ really_do_swap_account = 0;
+ return 1;
+}
+__setup("swapaccount", enable_swap_account);
static int __init disable_swap_account(char *s)
{
- really_do_swap_account = 0;
+ enable_swap_account("0");
return 1;
}
__setup("noswapaccount", disable_swap_account);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 124324134ff..46ab2c044b0 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -51,6 +51,7 @@
#include <linux/slab.h>
#include <linux/swapops.h>
#include <linux/hugetlb.h>
+#include <linux/memory_hotplug.h>
#include "internal.h"
int sysctl_memory_failure_early_kill __read_mostly = 0;
@@ -1230,11 +1231,10 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
return 1;
/*
- * The lock_system_sleep prevents a race with memory hotplug,
- * because the isolation assumes there's only a single user.
+ * The lock_memory_hotplug prevents a race with memory hotplug.
* This is a big hammer, a better would be nicer.
*/
- lock_system_sleep();
+ lock_memory_hotplug();
/*
* Isolate the page, so that it doesn't get reallocated if it
@@ -1264,7 +1264,7 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
ret = 1;
}
unset_migratetype_isolate(p);
- unlock_system_sleep();
+ unlock_memory_hotplug();
return ret;
}
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 9260314a221..2c6523af547 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -34,6 +34,23 @@
#include "internal.h"
+DEFINE_MUTEX(mem_hotplug_mutex);
+
+void lock_memory_hotplug(void)
+{
+ mutex_lock(&mem_hotplug_mutex);
+
+ /* for exclusive hibernation if CONFIG_HIBERNATION=y */
+ lock_system_sleep();
+}
+
+void unlock_memory_hotplug(void)
+{
+ unlock_system_sleep();
+ mutex_unlock(&mem_hotplug_mutex);
+}
+
+
/* add this memory to iomem resource */
static struct resource *register_memory_resource(u64 start, u64 size)
{
@@ -493,7 +510,7 @@ int mem_online_node(int nid)
pg_data_t *pgdat;
int ret;
- lock_system_sleep();
+ lock_memory_hotplug();
pgdat = hotadd_new_pgdat(nid, 0);
if (pgdat) {
ret = -ENOMEM;
@@ -504,7 +521,7 @@ int mem_online_node(int nid)
BUG_ON(ret);
out:
- unlock_system_sleep();
+ unlock_memory_hotplug();
return ret;
}
@@ -516,7 +533,7 @@ int __ref add_memory(int nid, u64 start, u64 size)
struct resource *res;
int ret;
- lock_system_sleep();
+ lock_memory_hotplug();
res = register_memory_resource(start, size);
ret = -EEXIST;
@@ -563,7 +580,7 @@ error:
release_memory_resource(res);
out:
- unlock_system_sleep();
+ unlock_memory_hotplug();
return ret;
}
EXPORT_SYMBOL_GPL(add_memory);
@@ -791,7 +808,7 @@ static int offline_pages(unsigned long start_pfn,
if (!test_pages_in_a_zone(start_pfn, end_pfn))
return -EINVAL;
- lock_system_sleep();
+ lock_memory_hotplug();
zone = page_zone(pfn_to_page(start_pfn));
node = zone_to_nid(zone);
@@ -880,7 +897,7 @@ repeat:
writeback_set_ratelimit();
memory_notify(MEM_OFFLINE, &arg);
- unlock_system_sleep();
+ unlock_memory_hotplug();
return 0;
failed_removal:
@@ -891,7 +908,7 @@ failed_removal:
undo_isolate_page_range(start_pfn, end_pfn);
out:
- unlock_system_sleep();
+ unlock_memory_hotplug();
return ret;
}
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 4a57f135b76..11ff260fb28 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1307,15 +1307,18 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
goto out;
/* Find the mm_struct */
+ rcu_read_lock();
read_lock(&tasklist_lock);
task = pid ? find_task_by_vpid(pid) : current;
if (!task) {
read_unlock(&tasklist_lock);
+ rcu_read_unlock();
err = -ESRCH;
goto out;
}
mm = get_task_mm(task);
read_unlock(&tasklist_lock);
+ rcu_read_unlock();
err = -EINVAL;
if (!mm)
diff --git a/mm/migrate.c b/mm/migrate.c
index fe5a3c6a542..6ae8a66a704 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -35,6 +35,8 @@
#include <linux/hugetlb.h>
#include <linux/gfp.h>
+#include <asm/tlbflush.h>
+
#include "internal.h"
#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
diff --git a/mm/mmap.c b/mm/mmap.c
index b179abb1474..50a4aa0255a 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2462,6 +2462,7 @@ int install_special_mapping(struct mm_struct *mm,
unsigned long addr, unsigned long len,
unsigned long vm_flags, struct page **pages)
{
+ int ret;
struct vm_area_struct *vma;
vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
@@ -2479,16 +2480,23 @@ int install_special_mapping(struct mm_struct *mm,
vma->vm_ops = &special_mapping_vmops;
vma->vm_private_data = pages;
- if (unlikely(insert_vm_struct(mm, vma))) {
- kmem_cache_free(vm_area_cachep, vma);
- return -ENOMEM;
- }
+ ret = security_file_mmap(NULL, 0, 0, 0, vma->vm_start, 1);
+ if (ret)
+ goto out;
+
+ ret = insert_vm_struct(mm, vma);
+ if (ret)
+ goto out;
mm->total_vm += len >> PAGE_SHIFT;
perf_event_mmap(vma);
return 0;
+
+out:
+ kmem_cache_free(vm_area_cachep, vma);
+ return ret;
}
static DEFINE_MUTEX(mm_all_locks_mutex);
diff --git a/mm/nommu.c b/mm/nommu.c
index 3613517c759..ef4045d010d 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -10,7 +10,7 @@
* Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com>
* Copyright (c) 2000-2001 D Jeff Dionne <jeff@uClinux.org>
* Copyright (c) 2002 Greg Ungerer <gerg@snapgear.com>
- * Copyright (c) 2007-2009 Paul Mundt <lethal@linux-sh.org>
+ * Copyright (c) 2007-2010 Paul Mundt <lethal@linux-sh.org>
*/
#include <linux/module.h>
@@ -328,6 +328,7 @@ void *vmalloc_node(unsigned long size, int node)
{
return vmalloc(size);
}
+EXPORT_SYMBOL(vmalloc_node);
/**
* vzalloc_node - allocate memory on a specific node with zero fill
@@ -440,6 +441,31 @@ void __attribute__((weak)) vmalloc_sync_all(void)
{
}
+/**
+ * alloc_vm_area - allocate a range of kernel address space
+ * @size: size of the area
+ *
+ * Returns: NULL on failure, vm_struct on success
+ *
+ * This function reserves a range of kernel address space, and
+ * allocates pagetables to map that range. No actual mappings
+ * are created. If the kernel address space is not shared
+ * between processes, it syncs the pagetable across all
+ * processes.
+ */
+struct vm_struct *alloc_vm_area(size_t size)
+{
+ BUG();
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(alloc_vm_area);
+
+void free_vm_area(struct vm_struct *area)
+{
+ BUG();
+}
+EXPORT_SYMBOL_GPL(free_vm_area);
+
int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
struct page *page)
{
@@ -1717,6 +1743,7 @@ void exit_mmap(struct mm_struct *mm)
mm->mmap = vma->vm_next;
delete_vma_from_mm(vma);
delete_vma(mm, vma);
+ cond_resched();
}
kleave("");
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index b840afa8976..b4edfe7ce06 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -563,7 +563,7 @@ static void balance_dirty_pages(struct address_space *mapping,
break; /* We've done our duty */
}
trace_wbc_balance_dirty_wait(&wbc, bdi);
- __set_current_state(TASK_INTERRUPTIBLE);
+ __set_current_state(TASK_UNINTERRUPTIBLE);
io_schedule_timeout(pause);
/*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 07a654486f7..ff7e1587239 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -104,19 +104,24 @@ gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
* only be modified with pm_mutex held, unless the suspend/hibernate code is
* guaranteed not to run in parallel with that modification).
*/
-void set_gfp_allowed_mask(gfp_t mask)
+
+static gfp_t saved_gfp_mask;
+
+void pm_restore_gfp_mask(void)
{
WARN_ON(!mutex_is_locked(&pm_mutex));
- gfp_allowed_mask = mask;
+ if (saved_gfp_mask) {
+ gfp_allowed_mask = saved_gfp_mask;
+ saved_gfp_mask = 0;
+ }
}
-gfp_t clear_gfp_allowed_mask(gfp_t mask)
+void pm_restrict_gfp_mask(void)
{
- gfp_t ret = gfp_allowed_mask;
-
WARN_ON(!mutex_is_locked(&pm_mutex));
- gfp_allowed_mask &= ~mask;
- return ret;
+ WARN_ON(saved_gfp_mask);
+ saved_gfp_mask = gfp_allowed_mask;
+ gfp_allowed_mask &= ~GFP_IOFS;
}
#endif /* CONFIG_PM_SLEEP */
@@ -3008,14 +3013,6 @@ static __init_refok int __build_all_zonelists(void *data)
build_zonelist_cache(pgdat);
}
-#ifdef CONFIG_MEMORY_HOTPLUG
- /* Setup real pagesets for the new zone */
- if (data) {
- struct zone *zone = data;
- setup_zone_pageset(zone);
- }
-#endif
-
/*
* Initialize the boot_pagesets that are going to be used
* for bootstrapping processors. The real pagesets for
@@ -3064,7 +3061,11 @@ void build_all_zonelists(void *data)
} else {
/* we have to stop all cpus to guarantee there is no user
of zonelist */
- stop_machine(__build_all_zonelists, data, NULL);
+#ifdef CONFIG_MEMORY_HOTPLUG
+ if (data)
+ setup_zone_pageset((struct zone *)data);
+#endif
+ stop_machine(__build_all_zonelists, NULL, NULL);
/* cpuset refresh routine should be here */
}
vm_total_pages = nr_free_pagecache_pages();
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 8b1a2ce21ee..38cc58b8b2b 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -139,7 +139,6 @@ int walk_page_range(unsigned long addr, unsigned long end,
pgd_t *pgd;
unsigned long next;
int err = 0;
- struct vm_area_struct *vma;
if (addr >= end)
return err;
@@ -149,15 +148,17 @@ int walk_page_range(unsigned long addr, unsigned long end,
pgd = pgd_offset(walk->mm, addr);
do {
+ struct vm_area_struct *uninitialized_var(vma);
+
next = pgd_addr_end(addr, end);
+#ifdef CONFIG_HUGETLB_PAGE
/*
* handle hugetlb vma individually because pagetable walk for
* the hugetlb page is dependent on the architecture and
* we can't handled it in the same manner as non-huge pages.
*/
vma = find_vma(walk->mm, addr);
-#ifdef CONFIG_HUGETLB_PAGE
if (vma && is_vm_hugetlb_page(vma)) {
if (vma->vm_end < next)
next = vma->vm_end;
diff --git a/mm/percpu.c b/mm/percpu.c
index efe816856a9..3dd4984bdef 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -293,12 +293,8 @@ static void *pcpu_mem_alloc(size_t size)
if (size <= PAGE_SIZE)
return kzalloc(size, GFP_KERNEL);
- else {
- void *ptr = vmalloc(size);
- if (ptr)
- memset(ptr, 0, size);
- return ptr;
- }
+ else
+ return vzalloc(size);
}
/**
@@ -1268,7 +1264,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
/* we're done parsing the input, undefine BUG macro and dump config */
#undef PCPU_SETUP_BUG_ON
- pcpu_dump_alloc_info(KERN_INFO, ai);
+ pcpu_dump_alloc_info(KERN_DEBUG, ai);
pcpu_nr_groups = ai->nr_groups;
pcpu_group_offsets = group_offsets;
diff --git a/mm/shmem.c b/mm/shmem.c
index 47fdeeb9d63..5ee67c99060 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2415,13 +2415,20 @@ static struct inode *shmem_alloc_inode(struct super_block *sb)
return &p->vfs_inode;
}
+static void shmem_i_callback(struct rcu_head *head)
+{
+ struct inode *inode = container_of(head, struct inode, i_rcu);
+ INIT_LIST_HEAD(&inode->i_dentry);
+ kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
+}
+
static void shmem_destroy_inode(struct inode *inode)
{
if ((inode->i_mode & S_IFMT) == S_IFREG) {
/* only struct inode is valid if it's an inline symlink */
mpol_free_shared_policy(&SHMEM_I(inode)->policy);
}
- kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
+ call_rcu(&inode->i_rcu, shmem_i_callback);
}
static void init_once(void *foo)
diff --git a/mm/slab.c b/mm/slab.c
index b1e40dafbab..264037449f0 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -829,12 +829,12 @@ static void init_reap_node(int cpu)
static void next_reap_node(void)
{
- int node = __get_cpu_var(slab_reap_node);
+ int node = __this_cpu_read(slab_reap_node);
node = next_node(node, node_online_map);
if (unlikely(node >= MAX_NUMNODES))
node = first_node(node_online_map);
- __get_cpu_var(slab_reap_node) = node;
+ __this_cpu_write(slab_reap_node, node);
}
#else
@@ -1012,7 +1012,7 @@ static void __drain_alien_cache(struct kmem_cache *cachep,
*/
static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3)
{
- int node = __get_cpu_var(slab_reap_node);
+ int node = __this_cpu_read(slab_reap_node);
if (l3->alien) {
struct array_cache *ac = l3->alien[node];
@@ -1293,7 +1293,7 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
* anything expensive but will only modify reap_work
* and reschedule the timer.
*/
- cancel_rearming_delayed_work(&per_cpu(slab_reap_work, cpu));
+ cancel_delayed_work_sync(&per_cpu(slab_reap_work, cpu));
/* Now the cache_reaper is guaranteed to be not running. */
per_cpu(slab_reap_work, cpu).work.func = NULL;
break;
@@ -2781,7 +2781,7 @@ static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp,
/*
* Map pages beginning at addr to the given cache and slab. This is required
* for the slab allocator to be able to lookup the cache and slab of a
- * virtual address for kfree, ksize, kmem_ptr_validate, and slab debugging.
+ * virtual address for kfree, ksize, and slab debugging.
*/
static void slab_map_pages(struct kmem_cache *cache, struct slab *slab,
void *addr)
@@ -3653,42 +3653,19 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
EXPORT_SYMBOL(kmem_cache_alloc);
#ifdef CONFIG_TRACING
-void *kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags)
+void *
+kmem_cache_alloc_trace(size_t size, struct kmem_cache *cachep, gfp_t flags)
{
- return __cache_alloc(cachep, flags, __builtin_return_address(0));
-}
-EXPORT_SYMBOL(kmem_cache_alloc_notrace);
-#endif
+ void *ret;
-/**
- * kmem_ptr_validate - check if an untrusted pointer might be a slab entry.
- * @cachep: the cache we're checking against
- * @ptr: pointer to validate
- *
- * This verifies that the untrusted pointer looks sane;
- * it is _not_ a guarantee that the pointer is actually
- * part of the slab cache in question, but it at least
- * validates that the pointer can be dereferenced and
- * looks half-way sane.
- *
- * Currently only used for dentry validation.
- */
-int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr)
-{
- unsigned long size = cachep->buffer_size;
- struct page *page;
+ ret = __cache_alloc(cachep, flags, __builtin_return_address(0));
- if (unlikely(!kern_ptr_validate(ptr, size)))
- goto out;
- page = virt_to_page(ptr);
- if (unlikely(!PageSlab(page)))
- goto out;
- if (unlikely(page_get_cache(page) != cachep))
- goto out;
- return 1;
-out:
- return 0;
+ trace_kmalloc(_RET_IP_, ret,
+ size, slab_buffer_size(cachep), flags);
+ return ret;
}
+EXPORT_SYMBOL(kmem_cache_alloc_trace);
+#endif
#ifdef CONFIG_NUMA
void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
@@ -3705,31 +3682,32 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
EXPORT_SYMBOL(kmem_cache_alloc_node);
#ifdef CONFIG_TRACING
-void *kmem_cache_alloc_node_notrace(struct kmem_cache *cachep,
- gfp_t flags,
- int nodeid)
+void *kmem_cache_alloc_node_trace(size_t size,
+ struct kmem_cache *cachep,
+ gfp_t flags,
+ int nodeid)
{
- return __cache_alloc_node(cachep, flags, nodeid,
+ void *ret;
+
+ ret = __cache_alloc_node(cachep, flags, nodeid,
__builtin_return_address(0));
+ trace_kmalloc_node(_RET_IP_, ret,
+ size, slab_buffer_size(cachep),
+ flags, nodeid);
+ return ret;
}
-EXPORT_SYMBOL(kmem_cache_alloc_node_notrace);
+EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
#endif
static __always_inline void *
__do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller)
{
struct kmem_cache *cachep;
- void *ret;
cachep = kmem_find_general_cachep(size, flags);
if (unlikely(ZERO_OR_NULL_PTR(cachep)))
return cachep;
- ret = kmem_cache_alloc_node_notrace(cachep, flags, node);
-
- trace_kmalloc_node((unsigned long) caller, ret,
- size, cachep->buffer_size, flags, node);
-
- return ret;
+ return kmem_cache_alloc_node_trace(size, cachep, flags, node);
}
#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING)
diff --git a/mm/slob.c b/mm/slob.c
index 617b6d6c42c..3588eaaef72 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -678,11 +678,6 @@ int kmem_cache_shrink(struct kmem_cache *d)
}
EXPORT_SYMBOL(kmem_cache_shrink);
-int kmem_ptr_validate(struct kmem_cache *a, const void *b)
-{
- return 0;
-}
-
static unsigned int slob_ready __read_mostly;
int slab_is_available(void)
diff --git a/mm/slub.c b/mm/slub.c
index 8fd5401bb07..008cd743a36 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -28,6 +28,8 @@
#include <linux/math64.h>
#include <linux/fault-inject.h>
+#include <trace/events/kmem.h>
+
/*
* Lock order:
* 1. slab_lock(page)
@@ -1774,11 +1776,21 @@ void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
EXPORT_SYMBOL(kmem_cache_alloc);
#ifdef CONFIG_TRACING
-void *kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags)
+void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
+{
+ void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_);
+ trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags);
+ return ret;
+}
+EXPORT_SYMBOL(kmem_cache_alloc_trace);
+
+void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
{
- return slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_);
+ void *ret = kmalloc_order(size, flags, order);
+ trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << order, flags);
+ return ret;
}
-EXPORT_SYMBOL(kmem_cache_alloc_notrace);
+EXPORT_SYMBOL(kmalloc_order_trace);
#endif
#ifdef CONFIG_NUMA
@@ -1794,13 +1806,17 @@ void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
EXPORT_SYMBOL(kmem_cache_alloc_node);
#ifdef CONFIG_TRACING
-void *kmem_cache_alloc_node_notrace(struct kmem_cache *s,
+void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
gfp_t gfpflags,
- int node)
+ int node, size_t size)
{
- return slab_alloc(s, gfpflags, node, _RET_IP_);
+ void *ret = slab_alloc(s, gfpflags, node, _RET_IP_);
+
+ trace_kmalloc_node(_RET_IP_, ret,
+ size, s->size, gfpflags, node);
+ return ret;
}
-EXPORT_SYMBOL(kmem_cache_alloc_node_notrace);
+EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
#endif
#endif
@@ -1917,17 +1933,6 @@ void kmem_cache_free(struct kmem_cache *s, void *x)
}
EXPORT_SYMBOL(kmem_cache_free);
-/* Figure out on which slab page the object resides */
-static struct page *get_object_page(const void *x)
-{
- struct page *page = virt_to_head_page(x);
-
- if (!PageSlab(page))
- return NULL;
-
- return page;
-}
-
/*
* Object placement in a slab is made very easy because we always start at
* offset 0. If we tune the size of the object to the alignment then we can
@@ -2386,35 +2391,6 @@ error:
}
/*
- * Check if a given pointer is valid
- */
-int kmem_ptr_validate(struct kmem_cache *s, const void *object)
-{
- struct page *page;
-
- if (!kern_ptr_validate(object, s->size))
- return 0;
-
- page = get_object_page(object);
-
- if (!page || s != page->slab)
- /* No slab or wrong slab */
- return 0;
-
- if (!check_valid_pointer(s, page, object))
- return 0;
-
- /*
- * We could also check if the object is on the slabs freelist.
- * But this would be too expensive and it seems that the main
- * purpose of kmem_ptr_valid() is to check if the object belongs
- * to a certain slab.
- */
- return 1;
-}
-EXPORT_SYMBOL(kmem_ptr_validate);
-
-/*
* Determine the size of a slab object
*/
unsigned int kmem_cache_size(struct kmem_cache *s)
@@ -3273,9 +3249,9 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
kfree(n);
kfree(s);
}
+err:
up_write(&slub_lock);
-err:
if (flags & SLAB_PANIC)
panic("Cannot create slabcache %s\n", name);
else
@@ -3401,13 +3377,13 @@ static int validate_slab(struct kmem_cache *s, struct page *page,
for_each_free_object(p, s, page->freelist) {
set_bit(slab_index(p, s, addr), map);
- if (!check_object(s, page, p, 0))
+ if (!check_object(s, page, p, SLUB_RED_INACTIVE))
return 0;
}
for_each_object(p, s, addr, page->objects)
if (!test_bit(slab_index(p, s, addr), map))
- if (!check_object(s, page, p, 1))
+ if (!check_object(s, page, p, SLUB_RED_ACTIVE))
return 0;
return 1;
}
@@ -3862,6 +3838,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
x += sprintf(buf + x, " N%d=%lu",
node, nodes[node]);
#endif
+ up_read(&slub_lock);
kfree(nodes);
return x + sprintf(buf + x, "\n");
}
diff --git a/mm/truncate.c b/mm/truncate.c
index ba887bff48c..3c2d5ddfa0d 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -390,6 +390,10 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
__remove_from_page_cache(page);
spin_unlock_irq(&mapping->tree_lock);
mem_cgroup_uncharge_cache_page(page);
+
+ if (mapping->a_ops->freepage)
+ mapping->a_ops->freepage(page);
+
page_cache_release(page); /* pagecache ref */
return 1;
failed:
diff --git a/mm/util.c b/mm/util.c
index 73dac81e9f7..f126975ef23 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -186,27 +186,6 @@ void kzfree(const void *p)
}
EXPORT_SYMBOL(kzfree);
-int kern_ptr_validate(const void *ptr, unsigned long size)
-{
- unsigned long addr = (unsigned long)ptr;
- unsigned long min_addr = PAGE_OFFSET;
- unsigned long align_mask = sizeof(void *) - 1;
-
- if (unlikely(addr < min_addr))
- goto out;
- if (unlikely(addr > (unsigned long)high_memory - size))
- goto out;
- if (unlikely(addr & align_mask))
- goto out;
- if (unlikely(!kern_addr_valid(addr)))
- goto out;
- if (unlikely(!kern_addr_valid(addr + size - 1)))
- goto out;
- return 1;
-out:
- return 0;
-}
-
/*
* strndup_user - duplicate an existing string from user space
* @s: The string to duplicate
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index a3d66b3dc5c..eb5cc7d00c5 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -31,8 +31,6 @@
#include <asm/tlbflush.h>
#include <asm/shmparam.h>
-bool vmap_lazy_unmap __read_mostly = true;
-
/*** Page table manipulation functions ***/
static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end)
@@ -503,9 +501,6 @@ static unsigned long lazy_max_pages(void)
{
unsigned int log;
- if (!vmap_lazy_unmap)
- return 0;
-
log = fls(num_online_cpus());
return log * (32UL * 1024 * 1024 / PAGE_SIZE);
@@ -566,7 +561,6 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
if (va->va_end > *end)
*end = va->va_end;
nr += (va->va_end - va->va_start) >> PAGE_SHIFT;
- unmap_vmap_area(va);
list_add_tail(&va->purge_list, &valist);
va->flags |= VM_LAZY_FREEING;
va->flags &= ~VM_LAZY_FREE;
@@ -611,10 +605,11 @@ static void purge_vmap_area_lazy(void)
}
/*
- * Free and unmap a vmap area, caller ensuring flush_cache_vunmap had been
- * called for the correct range previously.
+ * Free a vmap area, caller ensuring that the area has been unmapped
+ * and flush_cache_vunmap had been called for the correct range
+ * previously.
*/
-static void free_unmap_vmap_area_noflush(struct vmap_area *va)
+static void free_vmap_area_noflush(struct vmap_area *va)
{
va->flags |= VM_LAZY_FREE;
atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr);
@@ -623,6 +618,16 @@ static void free_unmap_vmap_area_noflush(struct vmap_area *va)
}
/*
+ * Free and unmap a vmap area, caller ensuring flush_cache_vunmap had been
+ * called for the correct range previously.
+ */
+static void free_unmap_vmap_area_noflush(struct vmap_area *va)
+{
+ unmap_vmap_area(va);
+ free_vmap_area_noflush(va);
+}
+
+/*
* Free and unmap a vmap area
*/
static void free_unmap_vmap_area(struct vmap_area *va)
@@ -798,7 +803,7 @@ static void free_vmap_block(struct vmap_block *vb)
spin_unlock(&vmap_block_tree_lock);
BUG_ON(tmp != vb);
- free_unmap_vmap_area_noflush(vb->va);
+ free_vmap_area_noflush(vb->va);
call_rcu(&vb->rcu_head, rcu_free_vb);
}
@@ -936,6 +941,8 @@ static void vb_free(const void *addr, unsigned long size)
rcu_read_unlock();
BUG_ON(!vb);
+ vunmap_page_range((unsigned long)addr, (unsigned long)addr + size);
+
spin_lock(&vb->lock);
BUG_ON(bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order));
@@ -988,7 +995,6 @@ void vm_unmap_aliases(void)
s = vb->va->va_start + (i << PAGE_SHIFT);
e = vb->va->va_start + (j << PAGE_SHIFT);
- vunmap_page_range(s, e);
flush = 1;
if (s < start)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b8a6fdc2131..9ca587c6927 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -494,9 +494,16 @@ static int __remove_mapping(struct address_space *mapping, struct page *page)
spin_unlock_irq(&mapping->tree_lock);
swapcache_free(swap, page);
} else {
+ void (*freepage)(struct page *);
+
+ freepage = mapping->a_ops->freepage;
+
__remove_from_page_cache(page);
spin_unlock_irq(&mapping->tree_lock);
mem_cgroup_uncharge_cache_page(page);
+
+ if (freepage != NULL)
+ freepage(page);
}
return 1;
@@ -913,7 +920,7 @@ keep_lumpy:
* back off and wait for congestion to clear because further reclaim
* will encounter the same problem
*/
- if (nr_dirty == nr_congested)
+ if (nr_dirty == nr_congested && nr_dirty != 0)
zone_set_flag(zone, ZONE_CONGESTED);
free_page_list(&free_pages);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 42eac4d3321..312d728976f 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -167,36 +167,24 @@ static void refresh_zone_stat_thresholds(void)
void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
int delta)
{
- struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
-
- s8 *p = pcp->vm_stat_diff + item;
+ struct per_cpu_pageset __percpu *pcp = zone->pageset;
+ s8 __percpu *p = pcp->vm_stat_diff + item;
long x;
+ long t;
+
+ x = delta + __this_cpu_read(*p);
- x = delta + *p;
+ t = __this_cpu_read(pcp->stat_threshold);
- if (unlikely(x > pcp->stat_threshold || x < -pcp->stat_threshold)) {
+ if (unlikely(x > t || x < -t)) {
zone_page_state_add(x, zone, item);
x = 0;
}
- *p = x;
+ __this_cpu_write(*p, x);
}
EXPORT_SYMBOL(__mod_zone_page_state);
/*
- * For an unknown interrupt state
- */
-void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
- int delta)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- __mod_zone_page_state(zone, item, delta);
- local_irq_restore(flags);
-}
-EXPORT_SYMBOL(mod_zone_page_state);
-
-/*
* Optimized increment and decrement functions.
*
* These are only for a single page and therefore can take a struct page *
@@ -221,16 +209,17 @@ EXPORT_SYMBOL(mod_zone_page_state);
*/
void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
- struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
- s8 *p = pcp->vm_stat_diff + item;
-
- (*p)++;
+ struct per_cpu_pageset __percpu *pcp = zone->pageset;
+ s8 __percpu *p = pcp->vm_stat_diff + item;
+ s8 v, t;
- if (unlikely(*p > pcp->stat_threshold)) {
- int overstep = pcp->stat_threshold / 2;
+ v = __this_cpu_inc_return(*p);
+ t = __this_cpu_read(pcp->stat_threshold);
+ if (unlikely(v > t)) {
+ s8 overstep = t >> 1;
- zone_page_state_add(*p + overstep, zone, item);
- *p = -overstep;
+ zone_page_state_add(v + overstep, zone, item);
+ __this_cpu_write(*p, -overstep);
}
}
@@ -242,16 +231,17 @@ EXPORT_SYMBOL(__inc_zone_page_state);
void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
{
- struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
- s8 *p = pcp->vm_stat_diff + item;
+ struct per_cpu_pageset __percpu *pcp = zone->pageset;
+ s8 __percpu *p = pcp->vm_stat_diff + item;
+ s8 v, t;
- (*p)--;
+ v = __this_cpu_dec_return(*p);
+ t = __this_cpu_read(pcp->stat_threshold);
+ if (unlikely(v < - t)) {
+ s8 overstep = t >> 1;
- if (unlikely(*p < - pcp->stat_threshold)) {
- int overstep = pcp->stat_threshold / 2;
-
- zone_page_state_add(*p - overstep, zone, item);
- *p = overstep;
+ zone_page_state_add(v - overstep, zone, item);
+ __this_cpu_write(*p, overstep);
}
}
@@ -261,6 +251,92 @@ void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
}
EXPORT_SYMBOL(__dec_zone_page_state);
+#ifdef CONFIG_CMPXCHG_LOCAL
+/*
+ * If we have cmpxchg_local support then we do not need to incur the overhead
+ * that comes with local_irq_save/restore if we use this_cpu_cmpxchg.
+ *
+ * mod_state() modifies the zone counter state through atomic per cpu
+ * operations.
+ *
+ * Overstep mode specifies how overstep should handled:
+ * 0 No overstepping
+ * 1 Overstepping half of threshold
+ * -1 Overstepping minus half of threshold
+*/
+static inline void mod_state(struct zone *zone,
+ enum zone_stat_item item, int delta, int overstep_mode)
+{
+ struct per_cpu_pageset __percpu *pcp = zone->pageset;
+ s8 __percpu *p = pcp->vm_stat_diff + item;
+ long o, n, t, z;
+
+ do {
+ z = 0; /* overflow to zone counters */
+
+ /*
+ * The fetching of the stat_threshold is racy. We may apply
+ * a counter threshold to the wrong the cpu if we get
+ * rescheduled while executing here. However, the following
+ * will apply the threshold again and therefore bring the
+ * counter under the threshold.
+ */
+ t = this_cpu_read(pcp->stat_threshold);
+
+ o = this_cpu_read(*p);
+ n = delta + o;
+
+ if (n > t || n < -t) {
+ int os = overstep_mode * (t >> 1) ;
+
+ /* Overflow must be added to zone counters */
+ z = n + os;
+ n = -os;
+ }
+ } while (this_cpu_cmpxchg(*p, o, n) != o);
+
+ if (z)
+ zone_page_state_add(z, zone, item);
+}
+
+void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
+ int delta)
+{
+ mod_state(zone, item, delta, 0);
+}
+EXPORT_SYMBOL(mod_zone_page_state);
+
+void inc_zone_state(struct zone *zone, enum zone_stat_item item)
+{
+ mod_state(zone, item, 1, 1);
+}
+
+void inc_zone_page_state(struct page *page, enum zone_stat_item item)
+{
+ mod_state(page_zone(page), item, 1, 1);
+}
+EXPORT_SYMBOL(inc_zone_page_state);
+
+void dec_zone_page_state(struct page *page, enum zone_stat_item item)
+{
+ mod_state(page_zone(page), item, -1, -1);
+}
+EXPORT_SYMBOL(dec_zone_page_state);
+#else
+/*
+ * Use interrupt disable to serialize counter updates
+ */
+void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
+ int delta)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __mod_zone_page_state(zone, item, delta);
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL(mod_zone_page_state);
+
void inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
unsigned long flags;
@@ -291,6 +367,7 @@ void dec_zone_page_state(struct page *page, enum zone_stat_item item)
local_irq_restore(flags);
}
EXPORT_SYMBOL(dec_zone_page_state);
+#endif
/*
* Update the zone counters for one cpu.
@@ -750,8 +827,6 @@ static const char * const vmstat_text[] = {
"nr_shmem",
"nr_dirtied",
"nr_written",
- "nr_dirty_threshold",
- "nr_dirty_background_threshold",
#ifdef CONFIG_NUMA
"numa_hit",
@@ -761,6 +836,8 @@ static const char * const vmstat_text[] = {
"numa_local",
"numa_other",
#endif
+ "nr_dirty_threshold",
+ "nr_dirty_background_threshold",
#ifdef CONFIG_VM_EVENT_COUNTERS
"pgpgin",
@@ -1033,7 +1110,7 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
break;
case CPU_DOWN_PREPARE:
case CPU_DOWN_PREPARE_FROZEN:
- cancel_rearming_delayed_work(&per_cpu(vmstat_work, cpu));
+ cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
per_cpu(vmstat_work, cpu).work.func = NULL;
break;
case CPU_DOWN_FAILED: