From f6ac2354d791195ca40822b84d73d48a4e8b7f2b Mon Sep 17 00:00:00 2001
From: Christoph Lameter
Date: Fri, 30 Jun 2006 01:55:32 -0700
Subject: [PATCH] zoned vm counters: create vmstat.c/.h from page_alloc.c/.h

NOTE: ZVCs are *not* the lightweight event counters.  ZVCs are reliable
whereas event counters do not need to be.

Zone based VM statistics are necessary to determine the state of memory
in one zone.  In a NUMA system this can be helpful for local reclaim and
other memory optimizations that may be able to shift VM load in order to
get more balanced memory use.  It is also useful to know how the
computing load affects the memory allocations on various zones.  This
patchset allows the retrieval of that data from userspace.

The patchset introduces a framework for counters that is a cross between
the existing page_stats --which are simply global counters split per
cpu-- and the approach of deferred incremental updates implemented for
nr_pagecache.

Small per cpu 8 bit counters are added to struct zone.  If a counter
exceeds certain thresholds then its value is accumulated in an array of
atomic_longs in the zone and in a global array that sums up all zone
values.  The small 8 bit counters sit next to the per cpu page pointers
and so they will be hot in the cpu cache when pages are allocated and
freed.

Access to VM counter information for a zone and for the whole machine is
then possible by simply indexing an array (thanks to Nick Piggin for
pointing out that approach).  Access to the total number of pages of
various types no longer requires summing up all per cpu counters.

Benefits of this patchset right now:

- Ability for UP and SMP configurations to determine how memory is
  balanced between the DMA, NORMAL and HIGHMEM zones.

- Loops over all processors are avoided in writeback and reclaim paths.
  We can avoid caching the writeback information because the needed
  information is directly accessible.

- Special handling for nr_pagecache removed.

- zone_reclaim_interval vanishes since VM stats can now determine when
  local reclaim is worthwhile.

- Fast inline per node page state determination.

- Accurate counters in /sys/devices/system/node/node*/meminfo.  The
  current counters simply count on which processor a page was allocated
  and guesstimate from that, so they are not useful for showing the
  actual distribution of page use across the zones.

- The swap_prefetch patch requires per node statistics in order to
  figure out when processors of a node can prefetch.  This patch
  provides some of the needed numbers.

- Detailed VM counters available in more /proc and /sys status files.

References to earlier discussions:
V1 http://marc.theaimsgroup.com/?l=linux-kernel&m=113511649910826&w=2
V2 http://marc.theaimsgroup.com/?l=linux-kernel&m=114980851924230&w=2
V3 http://marc.theaimsgroup.com/?l=linux-kernel&m=115014697910351&w=2
V4 http://marc.theaimsgroup.com/?l=linux-kernel&m=115024767318740&w=2

Performance tests with AIM7 did not show any regressions; it seems to be
a tad faster even.  Tested on ia64/NUMA.  Builds fine on i386, SMP and
UP.  Includes fixes for s390/arm/uml arch code.

This patch:

Move counter code from page_alloc.c/page-flags.h to vmstat.c/h.  Create
vmstat.c/vmstat.h by separating the counter code and the proc functions.
Move the vm_stat_text array before zoneinfo_show.
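The counter scheme the message describes is easiest to see in miniature.
The following is not code from this series; it is a minimal user-space C
model of the same idea, with made-up names (CPUS, NR_ITEMS, zvc_add,
zvc_read).  The threshold value 32 matches the STAT_THRESHOLD that
appears later in the series.  Each "cpu" keeps a small signed
differential, and only when the differential crosses the threshold is it
folded into the shared totals, so readers get an O(1), approximately
correct value by indexing an array.

#include <stdio.h>

#define CPUS       4
#define THRESHOLD  32          /* analogous to STAT_THRESHOLD in the series */
#define NR_ITEMS   2           /* pretend we track two counter items */

static long vm_stat[NR_ITEMS];             /* global consolidated totals  */
static signed char diff[CPUS][NR_ITEMS];   /* small per-cpu differentials */

/* Update path: cheap per-cpu increment, occasional fold into vm_stat. */
static void zvc_add(int cpu, int item, int delta)
{
	long x = diff[cpu][item] + delta;

	if (x > THRESHOLD || x < -THRESHOLD) {
		vm_stat[item] += x;	/* the kernel uses atomic_long_add() here */
		x = 0;
	}
	diff[cpu][item] = x;
}

/* Read path: one array access; off by at most CPUS * THRESHOLD events. */
static long zvc_read(int item)
{
	return vm_stat[item];
}

int main(void)
{
	for (int i = 0; i < 1000; i++)
		zvc_add(i % CPUS, 0, 1);

	/* 1000 events recorded; the consolidated view lags slightly. */
	printf("consolidated: %ld (true count 1000)\n", zvc_read(0));
	return 0;
}

The bounded drift (at most CPUS * THRESHOLD unconsolidated events at any
moment) is what makes ZVCs "reliable" in the sense above, in contrast to
event counters, which are allowed to be lossy.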
[akpm@osdl.org: s390 build fix]
[akpm@osdl.org: HOTPLUG_CPU build fix]
Signed-off-by: Christoph Lameter
Cc: Heiko Carstens
Cc: Martin Schwidefsky
Cc: Trond Myklebust
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/vmstat.c | 417 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 417 insertions(+)
 create mode 100644 mm/vmstat.c

diff --git a/mm/vmstat.c b/mm/vmstat.c
new file mode 100644
index 00000000000..ad456202ff1
--- /dev/null
+++ b/mm/vmstat.c
@@ -0,0 +1,417 @@
+/*
+ *  linux/mm/vmstat.c
+ *
+ *  Manages VM statistics
+ *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
+ */
+
+#include <linux/config.h>
+#include <linux/mm.h>
+
+/*
+ * Accumulate the page_state information across all CPUs.
+ * The result is unavoidably approximate - it can change
+ * during and after execution of this function.
+ */
+DEFINE_PER_CPU(struct page_state, page_states) = {0};
+
+atomic_t nr_pagecache = ATOMIC_INIT(0);
+EXPORT_SYMBOL(nr_pagecache);
+#ifdef CONFIG_SMP
+DEFINE_PER_CPU(long, nr_pagecache_local) = 0;
+#endif
+
+static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask)
+{
+	unsigned cpu;
+
+	memset(ret, 0, nr * sizeof(unsigned long));
+	cpus_and(*cpumask, *cpumask, cpu_online_map);
+
+	for_each_cpu_mask(cpu, *cpumask) {
+		unsigned long *in;
+		unsigned long *out;
+		unsigned off;
+		unsigned next_cpu;
+
+		in = (unsigned long *)&per_cpu(page_states, cpu);
+
+		next_cpu = next_cpu(cpu, *cpumask);
+		if (likely(next_cpu < NR_CPUS))
+			prefetch(&per_cpu(page_states, next_cpu));
+
+		out = (unsigned long *)ret;
+		for (off = 0; off < nr; off++)
+			*out++ += *in++;
+	}
+}
+
+void get_page_state_node(struct page_state *ret, int node)
+{
+	int nr;
+	cpumask_t mask = node_to_cpumask(node);
+
+	nr = offsetof(struct page_state, GET_PAGE_STATE_LAST);
+	nr /= sizeof(unsigned long);
+
+	__get_page_state(ret, nr+1, &mask);
+}
+
+void get_page_state(struct page_state *ret)
+{
+	int nr;
+	cpumask_t mask = CPU_MASK_ALL;
+
+	nr = offsetof(struct page_state, GET_PAGE_STATE_LAST);
+	nr /= sizeof(unsigned long);
+
+	__get_page_state(ret, nr + 1, &mask);
+}
+
+void get_full_page_state(struct page_state *ret)
+{
+	cpumask_t mask = CPU_MASK_ALL;
+
+	__get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask);
+}
+
+unsigned long read_page_state_offset(unsigned long offset)
+{
+	unsigned long ret = 0;
+	int cpu;
+
+	for_each_online_cpu(cpu) {
+		unsigned long in;
+
+		in = (unsigned long)&per_cpu(page_states, cpu) + offset;
+		ret += *((unsigned long *)in);
+	}
+	return ret;
+}
+
+void __mod_page_state_offset(unsigned long offset, unsigned long delta)
+{
+	void *ptr;
+
+	ptr = &__get_cpu_var(page_states);
+	*(unsigned long *)(ptr + offset) += delta;
+}
+EXPORT_SYMBOL(__mod_page_state_offset);
+
+void mod_page_state_offset(unsigned long offset, unsigned long delta)
+{
+	unsigned long flags;
+	void *ptr;
+
+	local_irq_save(flags);
+	ptr = &__get_cpu_var(page_states);
+	*(unsigned long *)(ptr + offset) += delta;
+	local_irq_restore(flags);
+}
+EXPORT_SYMBOL(mod_page_state_offset);
+
+void __get_zone_counts(unsigned long *active, unsigned long *inactive,
+			unsigned long *free, struct pglist_data *pgdat)
+{
+	struct zone *zones = pgdat->node_zones;
+	int i;
+
+	*active = 0;
+	*inactive = 0;
+	*free = 0;
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		*active += zones[i].nr_active;
+		*inactive += zones[i].nr_inactive;
+		*free += zones[i].free_pages;
+	}
+}
+
+void get_zone_counts(unsigned long *active,
+		unsigned long *inactive, unsigned long *free)
+{
+	struct pglist_data *pgdat;
+
+	*active = 0;
+	*inactive = 0;
+	*free = 0;
+	for_each_online_pgdat(pgdat) {
+		unsigned long l, m, n;
+		__get_zone_counts(&l, &m, &n, pgdat);
+		*active += l;
+		*inactive += m;
+		*free += n;
+	}
+}
+
+#ifdef CONFIG_PROC_FS
+
+#include <linux/seq_file.h>
+
+static void *frag_start(struct seq_file *m, loff_t *pos)
+{
+	pg_data_t *pgdat;
+	loff_t node = *pos;
+	for (pgdat = first_online_pgdat();
+	     pgdat && node;
+	     pgdat = next_online_pgdat(pgdat))
+		--node;
+
+	return pgdat;
+}
+
+static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
+{
+	pg_data_t *pgdat = (pg_data_t *)arg;
+
+	(*pos)++;
+	return next_online_pgdat(pgdat);
+}
+
+static void frag_stop(struct seq_file *m, void *arg)
+{
+}
+
+/*
+ * This walks the free areas for each zone.
+ */
+static int frag_show(struct seq_file *m, void *arg)
+{
+	pg_data_t *pgdat = (pg_data_t *)arg;
+	struct zone *zone;
+	struct zone *node_zones = pgdat->node_zones;
+	unsigned long flags;
+	int order;
+
+	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
+		if (!populated_zone(zone))
+			continue;
+
+		spin_lock_irqsave(&zone->lock, flags);
+		seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
+		for (order = 0; order < MAX_ORDER; ++order)
+			seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
+		spin_unlock_irqrestore(&zone->lock, flags);
+		seq_putc(m, '\n');
+	}
+	return 0;
+}
+
+struct seq_operations fragmentation_op = {
+	.start	= frag_start,
+	.next	= frag_next,
+	.stop	= frag_stop,
+	.show	= frag_show,
+};
+
+static char *vmstat_text[] = {
+	"nr_dirty",
+	"nr_writeback",
+	"nr_unstable",
+	"nr_page_table_pages",
+	"nr_mapped",
+	"nr_slab",
+
+	"pgpgin",
+	"pgpgout",
+	"pswpin",
+	"pswpout",
+
+	"pgalloc_high",
+	"pgalloc_normal",
+	"pgalloc_dma32",
+	"pgalloc_dma",
+
+	"pgfree",
+	"pgactivate",
+	"pgdeactivate",
+
+	"pgfault",
+	"pgmajfault",
+
+	"pgrefill_high",
+	"pgrefill_normal",
+	"pgrefill_dma32",
+	"pgrefill_dma",
+
+	"pgsteal_high",
+	"pgsteal_normal",
+	"pgsteal_dma32",
+	"pgsteal_dma",
+
+	"pgscan_kswapd_high",
+	"pgscan_kswapd_normal",
+	"pgscan_kswapd_dma32",
+	"pgscan_kswapd_dma",
+
+	"pgscan_direct_high",
+	"pgscan_direct_normal",
+	"pgscan_direct_dma32",
+	"pgscan_direct_dma",
+
+	"pginodesteal",
+	"slabs_scanned",
+	"kswapd_steal",
+	"kswapd_inodesteal",
+	"pageoutrun",
+	"allocstall",
+
+	"pgrotated",
+	"nr_bounce",
+};
+
+/*
+ * Output information about zones in @pgdat.
+ */
+static int zoneinfo_show(struct seq_file *m, void *arg)
+{
+	pg_data_t *pgdat = arg;
+	struct zone *zone;
+	struct zone *node_zones = pgdat->node_zones;
+	unsigned long flags;
+
+	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) {
+		int i;
+
+		if (!populated_zone(zone))
+			continue;
+
+		spin_lock_irqsave(&zone->lock, flags);
+		seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
+		seq_printf(m,
+			   "\n  pages free     %lu"
+			   "\n        min      %lu"
+			   "\n        low      %lu"
+			   "\n        high     %lu"
+			   "\n        active   %lu"
+			   "\n        inactive %lu"
+			   "\n        scanned  %lu (a: %lu i: %lu)"
+			   "\n        spanned  %lu"
+			   "\n        present  %lu",
+			   zone->free_pages,
+			   zone->pages_min,
+			   zone->pages_low,
+			   zone->pages_high,
+			   zone->nr_active,
+			   zone->nr_inactive,
+			   zone->pages_scanned,
+			   zone->nr_scan_active, zone->nr_scan_inactive,
+			   zone->spanned_pages,
+			   zone->present_pages);
+		seq_printf(m,
+			   "\n        protection: (%lu",
+			   zone->lowmem_reserve[0]);
+		for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
+			seq_printf(m, ", %lu", zone->lowmem_reserve[i]);
+		seq_printf(m,
+			   ")"
+			   "\n  pagesets");
+		for_each_online_cpu(i) {
+			struct per_cpu_pageset *pageset;
+			int j;
+
+			pageset = zone_pcp(zone, i);
+			for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
+				if (pageset->pcp[j].count)
+					break;
+			}
+			if (j == ARRAY_SIZE(pageset->pcp))
+				continue;
+			for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
+				seq_printf(m,
+					   "\n    cpu: %i pcp: %i"
+					   "\n              count: %i"
+					   "\n              high:  %i"
+					   "\n              batch: %i",
+					   i, j,
+					   pageset->pcp[j].count,
+					   pageset->pcp[j].high,
+					   pageset->pcp[j].batch);
+			}
+#ifdef CONFIG_NUMA
+			seq_printf(m,
+				   "\n            numa_hit:       %lu"
+				   "\n            numa_miss:      %lu"
+				   "\n            numa_foreign:   %lu"
+				   "\n            interleave_hit: %lu"
+				   "\n            local_node:     %lu"
+				   "\n            other_node:     %lu",
+				   pageset->numa_hit,
+				   pageset->numa_miss,
+				   pageset->numa_foreign,
+				   pageset->interleave_hit,
+				   pageset->local_node,
+				   pageset->other_node);
+#endif
+		}
+		seq_printf(m,
+			   "\n  all_unreclaimable: %u"
+			   "\n  prev_priority:     %i"
+			   "\n  temp_priority:     %i"
+			   "\n  start_pfn:         %lu",
+			   zone->all_unreclaimable,
+			   zone->prev_priority,
+			   zone->temp_priority,
+			   zone->zone_start_pfn);
+		spin_unlock_irqrestore(&zone->lock, flags);
+		seq_putc(m, '\n');
+	}
+	return 0;
+}
+
+struct seq_operations zoneinfo_op = {
+	.start	= frag_start, /* iterate over all zones. The same as in
+			       * fragmentation. */
+	.next	= frag_next,
+	.stop	= frag_stop,
+	.show	= zoneinfo_show,
+};
+
+static void *vmstat_start(struct seq_file *m, loff_t *pos)
+{
+	struct page_state *ps;
+
+	if (*pos >= ARRAY_SIZE(vmstat_text))
+		return NULL;
+
+	ps = kmalloc(sizeof(*ps), GFP_KERNEL);
+	m->private = ps;
+	if (!ps)
+		return ERR_PTR(-ENOMEM);
+	get_full_page_state(ps);
+	ps->pgpgin /= 2;		/* sectors -> kbytes */
+	ps->pgpgout /= 2;
+	return (unsigned long *)ps + *pos;
+}
+
+static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
+{
+	(*pos)++;
+	if (*pos >= ARRAY_SIZE(vmstat_text))
+		return NULL;
+	return (unsigned long *)m->private + *pos;
+}
+
+static int vmstat_show(struct seq_file *m, void *arg)
+{
+	unsigned long *l = arg;
+	unsigned long off = l - (unsigned long *)m->private;
+
+	seq_printf(m, "%s %lu\n", vmstat_text[off], *l);
+	return 0;
+}
+
+static void vmstat_stop(struct seq_file *m, void *arg)
+{
+	kfree(m->private);
+	m->private = NULL;
+}
+
+struct seq_operations vmstat_op = {
+	.start	= vmstat_start,
+	.next	= vmstat_next,
+	.stop	= vmstat_stop,
+	.show	= vmstat_show,
+};
+
+#endif /* CONFIG_PROC_FS */
+

From 2244b95a7bcf8d24196f8a3a44187ba5dfff754c Mon Sep 17 00:00:00 2001
From: Christoph Lameter
Date: Fri, 30 Jun 2006 01:55:33 -0700
Subject: [PATCH] zoned vm counters: basic ZVC (zoned vm counter) implementation

Per zone counter infrastructure

The counters that we currently have for the VM are split per processor.
The processor however has not much to do with the zone these pages belong
to.  We cannot tell f.e. how many ZONE_DMA pages are dirty.

So we are blind to potential imbalances in the usage of memory in various
zones.  F.e. in a NUMA system we cannot tell how many pages are dirty on
a particular node.  If we knew then we could put measures into the VM to
balance the use of memory between different zones and different nodes in
a NUMA system.  For example it would be possible to limit the dirty pages
per node so that fast local memory is kept available even if a process is
dirtying huge amounts of pages.

Another example is zone reclaim.  We do not know how many unmapped pages
exist per zone.  So we just have to try to reclaim.  If it is not working
then we pause and try again later.  It would be better if we knew when it
makes sense to reclaim unmapped pages from a zone.  This patchset allows
the determination of the number of unmapped pages per zone.  We can
remove the zone reclaim interval with the counters introduced here.

Furthermore, the ability to have various usage statistics available will
allow the development of new NUMA balancing algorithms that may be able
to improve the decision making in the scheduler of when to move a process
to another node and hopefully will also enable automatic page migration
through a user space program that can analyse the memory load
distribution and then rebalance memory use in order to increase
performance.

The counter framework here implements differential counters for each
processor in struct zone.  The differential counters are consolidated
when a threshold is exceeded (as done in the current implementation for
nr_pagecache), when slab reaping occurs or when a consolidation function
is called.

Consolidation uses atomic operations and accumulates counters per zone in
the zone structure and also globally in the vm_stat array.  VM functions
can access the counts by simply indexing a global or zone specific array.

The arrangement of counters in an array also simplifies processing when
output has to be generated for /proc/*.
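To make the consolidation step concrete before the diff, here is a small
user-space C sketch of it (not code from this patch; refresh_stats,
zone_stat and diff are illustrative names).  The kernel counterpart,
refresh_cpu_vm_stats() later in this series, does the folds with
atomic_long_add() and with interrupts disabled around each one.

#include <stdio.h>

#define CPUS      4
#define NR_ITEMS  2

static long vm_stat[NR_ITEMS];            /* global totals (atomic in the kernel) */
static long zone_stat[NR_ITEMS];          /* per-zone totals, one zone shown here */
static signed char diff[CPUS][NR_ITEMS];  /* per-cpu residuals below the threshold */

/* Model of the consolidation pass: fold every cpu's residual into the
 * zone and global totals so readers see an exact value afterwards. */
static void refresh_stats(void)
{
	for (int cpu = 0; cpu < CPUS; cpu++)
		for (int i = 0; i < NR_ITEMS; i++)
			if (diff[cpu][i]) {
				zone_stat[i] += diff[cpu][i];
				vm_stat[i]   += diff[cpu][i];
				diff[cpu][i] = 0;
			}
}

int main(void)
{
	diff[0][0] = 17;	/* pretend some updates were left unconsolidated */
	diff[3][0] = -5;
	refresh_stats();
	printf("item 0: zone %ld, global %ld\n", zone_stat[0], vm_stat[0]);
	return 0;
}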
Counters can be updated by calling inc/dec_zone_page_state or
__inc/dec_zone_page_state analogous to *_page_state.  The second group
of functions can be called if it is known that interrupts are disabled.

Special optimized increment and decrement functions are provided.  These
can avoid certain checks and use increment or decrement instructions that
an architecture may provide.

We also add a new CONFIG_DMA_IS_NORMAL that signifies that an
architecture can do DMA to all memory and therefore ZONE_NORMAL will not
be populated.  This is only currently set for IA64 SGI SN2 and currently
only affects node_page_state().  In the best case node_page_state can be
reduced to retrieving a single counter for the one zone on the node.

[akpm@osdl.org: cleanups]
[akpm@osdl.org: export vm_stat[] for filesystems]
Signed-off-by: Christoph Lameter
Cc: Trond Myklebust
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 arch/ia64/Kconfig      |   5 ++
 include/linux/mmzone.h |   9 ++
 include/linux/vmstat.h | 129 ++++++++++++++++++++++++++++-
 mm/page_alloc.c        |   2 +
 mm/slab.c              |   1 +
 mm/vmstat.c            | 218 ++++++++++++++++++++++++++++++++++++++++++++++++-
 6 files changed, 359 insertions(+), 5 deletions(-)

diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index b487e227a1f..47de9ee6bcd 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -70,6 +70,11 @@ config DMA_IS_DMA32
 	bool
 	default y
 
+config DMA_IS_NORMAL
+	bool
+	depends on IA64_SGI_SN2
+	default y
+
 choice
 	prompt "System type"
 	default IA64_GENERIC

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index d6120fa6911..543f9e41156 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -46,6 +46,9 @@ struct zone_padding {
 #define ZONE_PADDING(name)
 #endif
 
+enum zone_stat_item {
+	NR_VM_ZONE_STAT_ITEMS };
+
 struct per_cpu_pages {
 	int count;		/* number of pages in the list */
 	int high;		/* high watermark, emptying needed */
@@ -55,6 +58,10 @@ struct per_cpu_pages {
 
 struct per_cpu_pageset {
 	struct per_cpu_pages pcp[2];	/* 0: hot.  1: cold */
+#ifdef CONFIG_SMP
+	s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS];
+#endif
+
 #ifdef CONFIG_NUMA
 	unsigned long numa_hit;		/* allocated in intended node */
 	unsigned long numa_miss;	/* allocated in non intended node */
@@ -165,6 +172,8 @@ struct zone {
 	/* A count of how many reclaimers are scanning this zone */
 	atomic_t reclaim_in_progress;
 
+	/* Zone statistics */
+	atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
 	/*
 	 * timestamp (in jiffies) of the last zone reclaim that did not
 	 * result in freeing of pages. This is used to avoid repeated scans

diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 3ca0c1989fc..3fd5c11e544 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -3,6 +3,9 @@
 
 #include <linux/types.h>
 #include <linux/percpu.h>
+#include <linux/config.h>
+#include <linux/mmzone.h>
+#include <asm/atomic.h>
 
 /*
  * Global page accounting.  One instance per CPU.  Only unsigned longs are
@@ -134,5 +137,129 @@ extern void __mod_page_state_offset(unsigned long offset, unsigned long delta);
 
 DECLARE_PER_CPU(struct page_state, page_states);
 
-#endif /* _LINUX_VMSTAT_H */
+/*
+ * Zone based page accounting with per cpu differentials.
+ */ +extern atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; + +static inline void zone_page_state_add(long x, struct zone *zone, + enum zone_stat_item item) +{ + atomic_long_add(x, &zone->vm_stat[item]); + atomic_long_add(x, &vm_stat[item]); +} + +static inline unsigned long global_page_state(enum zone_stat_item item) +{ + long x = atomic_long_read(&vm_stat[item]); +#ifdef CONFIG_SMP + if (x < 0) + x = 0; +#endif + return x; +} + +static inline unsigned long zone_page_state(struct zone *zone, + enum zone_stat_item item) +{ + long x = atomic_long_read(&zone->vm_stat[item]); +#ifdef CONFIG_SMP + if (x < 0) + x = 0; +#endif + return x; +} + +#ifdef CONFIG_NUMA +/* + * Determine the per node value of a stat item. This function + * is called frequently in a NUMA machine, so try to be as + * frugal as possible. + */ +static inline unsigned long node_page_state(int node, + enum zone_stat_item item) +{ + struct zone *zones = NODE_DATA(node)->node_zones; + + return +#ifndef CONFIG_DMA_IS_NORMAL +#if !defined(CONFIG_DMA_IS_DMA32) && BITS_PER_LONG >= 64 + zone_page_state(&zones[ZONE_DMA32], item) + +#endif + zone_page_state(&zones[ZONE_NORMAL], item) + +#endif +#ifdef CONFIG_HIGHMEM + zone_page_state(&zones[ZONE_HIGHMEM], item) + +#endif + zone_page_state(&zones[ZONE_DMA], item); +} +#else +#define node_page_state(node, item) global_page_state(item) +#endif + +#define __add_zone_page_state(__z, __i, __d) \ + __mod_zone_page_state(__z, __i, __d) +#define __sub_zone_page_state(__z, __i, __d) \ + __mod_zone_page_state(__z, __i,-(__d)) + +#define add_zone_page_state(__z, __i, __d) mod_zone_page_state(__z, __i, __d) +#define sub_zone_page_state(__z, __i, __d) mod_zone_page_state(__z, __i, -(__d)) + +static inline void zap_zone_vm_stats(struct zone *zone) +{ + memset(zone->vm_stat, 0, sizeof(zone->vm_stat)); +} + +#ifdef CONFIG_SMP +void __mod_zone_page_state(struct zone *, enum zone_stat_item item, int); +void __inc_zone_page_state(struct page *, enum zone_stat_item); +void __dec_zone_page_state(struct page *, enum zone_stat_item); +void mod_zone_page_state(struct zone *, enum zone_stat_item, int); +void inc_zone_page_state(struct page *, enum zone_stat_item); +void dec_zone_page_state(struct page *, enum zone_stat_item); + +extern void inc_zone_state(struct zone *, enum zone_stat_item); + +void refresh_cpu_vm_stats(int); +void refresh_vm_stats(void); + +#else /* CONFIG_SMP */ + +/* + * We do not maintain differentials in a single processor configuration. + * The functions directly modify the zone and global counters. + */ +static inline void __mod_zone_page_state(struct zone *zone, + enum zone_stat_item item, int delta) +{ + zone_page_state_add(delta, zone, item); +} + +static inline void __inc_zone_page_state(struct page *page, + enum zone_stat_item item) +{ + atomic_long_inc(&page_zone(page)->vm_stat[item]); + atomic_long_inc(&vm_stat[item]); +} + +static inline void __dec_zone_page_state(struct page *page, + enum zone_stat_item item) +{ + atomic_long_dec(&page_zone(page)->vm_stat[item]); + atomic_long_dec(&vm_stat[item]); +} + +/* + * We only use atomic operations to update counters. So there is no need to + * disable interrupts. 
+ */
+#define inc_zone_page_state __inc_zone_page_state
+#define dec_zone_page_state __dec_zone_page_state
+#define mod_zone_page_state __mod_zone_page_state
+
+static inline void refresh_cpu_vm_stats(int cpu) { }
+static inline void refresh_vm_stats(void) { }
+#endif
+
+#endif /* _LINUX_VMSTAT_H */

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 87dc1297fe3..3a877fecc30 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2045,6 +2045,7 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat,
 		zone->nr_scan_inactive = 0;
 		zone->nr_active = 0;
 		zone->nr_inactive = 0;
+		zap_zone_vm_stats(zone);
 		atomic_set(&zone->reclaim_in_progress, 0);
 		if (!size)
 			continue;
@@ -2147,6 +2148,7 @@ static int page_alloc_cpu_notify(struct notifier_block *self,
 		}
 		local_irq_enable();
+		refresh_cpu_vm_stats(cpu);
 	}
 	return NOTIFY_OK;
 }

diff --git a/mm/slab.c b/mm/slab.c
index 233e39d14ca..0c33820038c 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3763,6 +3763,7 @@ next:
 	check_irq_on();
 	mutex_unlock(&cache_chain_mutex);
 	next_reap_node();
+	refresh_cpu_vm_stats(smp_processor_id());
 	/* Set up the next iteration */
 	schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC);
 }

diff --git a/mm/vmstat.c b/mm/vmstat.c
index ad456202ff1..210f9bbbb04 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -3,10 +3,15 @@
  *
  *  Manages VM statistics
  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
+ *
+ *  zoned VM statistics
+ *  Copyright (C) 2006 Silicon Graphics, Inc.,
+ *		Christoph Lameter
  */
 
 #include <linux/config.h>
 #include <linux/mm.h>
+#include <linux/module.h>
 
 /*
  * Accumulate the page_state information across all CPUs.
@@ -143,6 +148,197 @@ void get_zone_counts(unsigned long *active,
 	}
 }
 
+/*
+ * Manage combined zone based / global counters
+ *
+ * vm_stat contains the global counters
+ */
+atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
+EXPORT_SYMBOL(vm_stat);
+
+#ifdef CONFIG_SMP
+
+#define STAT_THRESHOLD 32
+
+/*
+ * Determine pointer to currently valid differential byte given a zone and
+ * the item number.
+ *
+ * Preemption must be off
+ */
+static inline s8 *diff_pointer(struct zone *zone, enum zone_stat_item item)
+{
+	return &zone_pcp(zone, smp_processor_id())->vm_stat_diff[item];
+}
+
+/*
+ * For use when we know that interrupts are disabled.
+ */
+void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
+				int delta)
+{
+	s8 *p;
+	long x;
+
+	p = diff_pointer(zone, item);
+	x = delta + *p;
+
+	if (unlikely(x > STAT_THRESHOLD || x < -STAT_THRESHOLD)) {
+		zone_page_state_add(x, zone, item);
+		x = 0;
+	}
+
+	*p = x;
+}
+EXPORT_SYMBOL(__mod_zone_page_state);
+
+/*
+ * For an unknown interrupt state
+ */
+void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
+					int delta)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	__mod_zone_page_state(zone, item, delta);
+	local_irq_restore(flags);
+}
+EXPORT_SYMBOL(mod_zone_page_state);
+
+/*
+ * Optimized increment and decrement functions.
+ *
+ * These are only for a single page and therefore can take a struct page *
+ * argument instead of struct zone *.  This allows the inclusion of the code
+ * generated for page_zone(page) into the optimized functions.
+ *
+ * No overflow check is necessary and therefore the differential can be
+ * incremented or decremented in place which may allow the compilers to
+ * generate better code.
+ *
+ * The increment or decrement is known and therefore one boundary check can
+ * be omitted.
+ *
+ * Some processors have inc/dec instructions that are atomic vs an interrupt.
+ * However, the code must first determine the differential location in a zone + * based on the processor number and then inc/dec the counter. There is no + * guarantee without disabling preemption that the processor will not change + * in between and therefore the atomicity vs. interrupt cannot be exploited + * in a useful way here. + */ +void __inc_zone_page_state(struct page *page, enum zone_stat_item item) +{ + struct zone *zone = page_zone(page); + s8 *p = diff_pointer(zone, item); + + (*p)++; + + if (unlikely(*p > STAT_THRESHOLD)) { + zone_page_state_add(*p, zone, item); + *p = 0; + } +} +EXPORT_SYMBOL(__inc_zone_page_state); + +void __dec_zone_page_state(struct page *page, enum zone_stat_item item) +{ + struct zone *zone = page_zone(page); + s8 *p = diff_pointer(zone, item); + + (*p)--; + + if (unlikely(*p < -STAT_THRESHOLD)) { + zone_page_state_add(*p, zone, item); + *p = 0; + } +} +EXPORT_SYMBOL(__dec_zone_page_state); + +void inc_zone_page_state(struct page *page, enum zone_stat_item item) +{ + unsigned long flags; + struct zone *zone; + s8 *p; + + zone = page_zone(page); + local_irq_save(flags); + p = diff_pointer(zone, item); + + (*p)++; + + if (unlikely(*p > STAT_THRESHOLD)) { + zone_page_state_add(*p, zone, item); + *p = 0; + } + local_irq_restore(flags); +} +EXPORT_SYMBOL(inc_zone_page_state); + +void dec_zone_page_state(struct page *page, enum zone_stat_item item) +{ + unsigned long flags; + struct zone *zone; + s8 *p; + + zone = page_zone(page); + local_irq_save(flags); + p = diff_pointer(zone, item); + + (*p)--; + + if (unlikely(*p < -STAT_THRESHOLD)) { + zone_page_state_add(*p, zone, item); + *p = 0; + } + local_irq_restore(flags); +} +EXPORT_SYMBOL(dec_zone_page_state); + +/* + * Update the zone counters for one cpu. + */ +void refresh_cpu_vm_stats(int cpu) +{ + struct zone *zone; + int i; + unsigned long flags; + + for_each_zone(zone) { + struct per_cpu_pageset *pcp; + + pcp = zone_pcp(zone, cpu); + + for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) + if (pcp->vm_stat_diff[i]) { + local_irq_save(flags); + zone_page_state_add(pcp->vm_stat_diff[i], + zone, i); + pcp->vm_stat_diff[i] = 0; + local_irq_restore(flags); + } + } +} + +static void __refresh_cpu_vm_stats(void *dummy) +{ + refresh_cpu_vm_stats(smp_processor_id()); +} + +/* + * Consolidate all counters. + * + * Note that the result is less inaccurate but still inaccurate + * if concurrent processes are allowed to run. 
+ */ +void refresh_vm_stats(void) +{ + on_each_cpu(__refresh_cpu_vm_stats, NULL, 0, 1); +} +EXPORT_SYMBOL(refresh_vm_stats); + +#endif + #ifdef CONFIG_PROC_FS #include @@ -204,6 +400,9 @@ struct seq_operations fragmentation_op = { }; static char *vmstat_text[] = { + /* Zoned VM counters */ + + /* Page state */ "nr_dirty", "nr_writeback", "nr_unstable", @@ -297,6 +496,11 @@ static int zoneinfo_show(struct seq_file *m, void *arg) zone->nr_scan_active, zone->nr_scan_inactive, zone->spanned_pages, zone->present_pages); + + for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) + seq_printf(m, "\n %-12s %lu", vmstat_text[i], + zone_page_state(zone, i)); + seq_printf(m, "\n protection: (%lu", zone->lowmem_reserve[0]); @@ -368,19 +572,25 @@ struct seq_operations zoneinfo_op = { static void *vmstat_start(struct seq_file *m, loff_t *pos) { + unsigned long *v; struct page_state *ps; + int i; if (*pos >= ARRAY_SIZE(vmstat_text)) return NULL; - ps = kmalloc(sizeof(*ps), GFP_KERNEL); - m->private = ps; - if (!ps) + v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) + + sizeof(*ps), GFP_KERNEL); + m->private = v; + if (!v) return ERR_PTR(-ENOMEM); + for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) + v[i] = global_page_state(i); + ps = (struct page_state *)(v + NR_VM_ZONE_STAT_ITEMS); get_full_page_state(ps); ps->pgpgin /= 2; /* sectors -> kbytes */ ps->pgpgout /= 2; - return (unsigned long *)ps + *pos; + return v + *pos; } static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos) -- cgit v1.2.3-70-g09d2 From 65ba55f500a37272985d071c9bbb35256a2f7c14 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 30 Jun 2006 01:55:34 -0700 Subject: [PATCH] zoned vm counters: convert nr_mapped to per zone counter nr_mapped is important because it allows a determination of how many pages of a zone are not mapped, which would allow a more efficient means of determining when we need to reclaim memory in a zone. We take the nr_mapped field out of the page state structure and define a new per zone counter named NR_FILE_MAPPED (the anonymous pages will be split off from NR_MAPPED in the next patch). We replace the use of nr_mapped in various kernel locations. This avoids the looping over all processors in try_to_free_pages(), writeback, reclaim (swap + zone reclaim). 
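The drivers/base/node.c hunk in the diff that follows switches node
meminfo over to node_page_state(nid, NR_FILE_MAPPED).  As a standalone
illustration of what that read now costs (a few per-zone array reads
rather than a walk over every cpu's page_state), here is a user-space C
model; the array contents and the _model suffixes are made up:

#include <stdio.h>

#define MAX_NR_ZONES 3	/* e.g. DMA, NORMAL, HIGHMEM */
#define NR_NODES     2

/* Illustrative per-zone consolidated counters for one item
 * (NR_FILE_MAPPED), indexed [node][zone]. */
static long nr_file_mapped[NR_NODES][MAX_NR_ZONES] = {
	{ 10, 2000, 300 },
	{  5, 1500, 100 },
};

/* Model of node_page_state(): sum the node's zones, no per-cpu loop. */
static long node_page_state_model(int node)
{
	long sum = 0;
	for (int z = 0; z < MAX_NR_ZONES; z++)
		sum += nr_file_mapped[node][z];
	return sum;
}

int main(void)
{
	for (int n = 0; n < NR_NODES; n++)
		printf("Node %d Mapped: %ld pages\n", n, node_page_state_model(n));
	return 0;
}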
[akpm@osdl.org: bugfix] Signed-off-by: Christoph Lameter Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/mm/pgtable.c | 2 +- drivers/base/node.c | 4 +--- fs/proc/proc_misc.c | 2 +- include/linux/mmzone.h | 3 +++ include/linux/vmstat.h | 2 -- mm/page-writeback.c | 2 +- mm/page_alloc.c | 2 +- mm/rmap.c | 6 +++--- mm/vmscan.c | 8 ++++---- mm/vmstat.c | 2 +- 10 files changed, 16 insertions(+), 17 deletions(-) (limited to 'mm/vmstat.c') diff --git a/arch/i386/mm/pgtable.c b/arch/i386/mm/pgtable.c index 2889567e21a..f85f1a40e5c 100644 --- a/arch/i386/mm/pgtable.c +++ b/arch/i386/mm/pgtable.c @@ -61,7 +61,7 @@ void show_mem(void) get_page_state(&ps); printk(KERN_INFO "%lu pages dirty\n", ps.nr_dirty); printk(KERN_INFO "%lu pages writeback\n", ps.nr_writeback); - printk(KERN_INFO "%lu pages mapped\n", ps.nr_mapped); + printk(KERN_INFO "%lu pages mapped\n", global_page_state(NR_FILE_MAPPED)); printk(KERN_INFO "%lu pages slab\n", ps.nr_slab); printk(KERN_INFO "%lu pages pagetables\n", ps.nr_page_table_pages); } diff --git a/drivers/base/node.c b/drivers/base/node.c index eae2bdc183b..8b1232320a9 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -54,8 +54,6 @@ static ssize_t node_read_meminfo(struct sys_device * dev, char * buf) ps.nr_dirty = 0; if ((long)ps.nr_writeback < 0) ps.nr_writeback = 0; - if ((long)ps.nr_mapped < 0) - ps.nr_mapped = 0; if ((long)ps.nr_slab < 0) ps.nr_slab = 0; @@ -84,7 +82,7 @@ static ssize_t node_read_meminfo(struct sys_device * dev, char * buf) nid, K(i.freeram - i.freehigh), nid, K(ps.nr_dirty), nid, K(ps.nr_writeback), - nid, K(ps.nr_mapped), + nid, K(node_page_state(nid, NR_FILE_MAPPED)), nid, K(ps.nr_slab)); n += hugetlb_report_node_meminfo(nid, buf + n); return n; diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index 5c10ea15742..bc7d9abca74 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -190,7 +190,7 @@ static int meminfo_read_proc(char *page, char **start, off_t off, K(i.freeswap), K(ps.nr_dirty), K(ps.nr_writeback), - K(ps.nr_mapped), + K(global_page_state(NR_FILE_MAPPED)), K(ps.nr_slab), K(allowed), K(committed), diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 543f9e41156..eb42c127702 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -47,6 +47,9 @@ struct zone_padding { #endif enum zone_stat_item { + NR_FILE_MAPPED, /* mapped into pagetables. + only modified from process context */ + NR_VM_ZONE_STAT_ITEMS }; struct per_cpu_pages { diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 3fd5c11e544..8ab8229523e 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -26,8 +26,6 @@ struct page_state { unsigned long nr_writeback; /* Pages under writeback */ unsigned long nr_unstable; /* NFS unstable pages */ unsigned long nr_page_table_pages;/* Pages used for pagetables */ - unsigned long nr_mapped; /* mapped into pagetables. 
- * only modified from process context */ unsigned long nr_slab; /* In slab */ #define GET_PAGE_STATE_LAST nr_slab diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 4ec7026c7ba..60c7244c42e 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -111,7 +111,7 @@ static void get_writeback_state(struct writeback_state *wbs) { wbs->nr_dirty = read_page_state(nr_dirty); wbs->nr_unstable = read_page_state(nr_unstable); - wbs->nr_mapped = read_page_state(nr_mapped); + wbs->nr_mapped = global_page_state(NR_FILE_MAPPED); wbs->nr_writeback = read_page_state(nr_writeback); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3a877fecc30..04dd2b01b2b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1319,7 +1319,7 @@ void show_free_areas(void) ps.nr_unstable, nr_free_pages(), ps.nr_slab, - ps.nr_mapped, + global_page_state(NR_FILE_MAPPED), ps.nr_page_table_pages); for_each_zone(zone) { diff --git a/mm/rmap.c b/mm/rmap.c index e76909e880c..af5e9808e65 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -455,7 +455,7 @@ static void __page_set_anon_rmap(struct page *page, * nr_mapped state can be updated without turning off * interrupts because it is not modified via interrupt. */ - __inc_page_state(nr_mapped); + __inc_zone_page_state(page, NR_FILE_MAPPED); } /** @@ -499,7 +499,7 @@ void page_add_new_anon_rmap(struct page *page, void page_add_file_rmap(struct page *page) { if (atomic_inc_and_test(&page->_mapcount)) - __inc_page_state(nr_mapped); + __inc_zone_page_state(page, NR_FILE_MAPPED); } /** @@ -531,7 +531,7 @@ void page_remove_rmap(struct page *page) */ if (page_test_and_clear_dirty(page)) set_page_dirty(page); - __dec_page_state(nr_mapped); + __dec_zone_page_state(page, NR_FILE_MAPPED); } } diff --git a/mm/vmscan.c b/mm/vmscan.c index eeacb0d695c..d2caf7471cf 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -990,7 +990,7 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask) } for (priority = DEF_PRIORITY; priority >= 0; priority--) { - sc.nr_mapped = read_page_state(nr_mapped); + sc.nr_mapped = global_page_state(NR_FILE_MAPPED); sc.nr_scanned = 0; if (!priority) disable_swap_token(); @@ -1075,7 +1075,7 @@ loop_again: total_scanned = 0; nr_reclaimed = 0; sc.may_writepage = !laptop_mode; - sc.nr_mapped = read_page_state(nr_mapped); + sc.nr_mapped = global_page_state(NR_FILE_MAPPED); inc_page_state(pageoutrun); @@ -1407,7 +1407,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages) for (prio = DEF_PRIORITY; prio >= 0; prio--) { unsigned long nr_to_scan = nr_pages - ret; - sc.nr_mapped = read_page_state(nr_mapped); + sc.nr_mapped = global_page_state(NR_FILE_MAPPED); sc.nr_scanned = 0; ret += shrink_all_zones(nr_to_scan, prio, pass, &sc); @@ -1548,7 +1548,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) struct scan_control sc = { .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP), - .nr_mapped = read_page_state(nr_mapped), + .nr_mapped = global_page_state(NR_FILE_MAPPED), .swap_cluster_max = max_t(unsigned long, nr_pages, SWAP_CLUSTER_MAX), .gfp_mask = gfp_mask, diff --git a/mm/vmstat.c b/mm/vmstat.c index 210f9bbbb04..4800091c129 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -401,13 +401,13 @@ struct seq_operations fragmentation_op = { static char *vmstat_text[] = { /* Zoned VM counters */ + "nr_mapped", /* Page state */ "nr_dirty", "nr_writeback", "nr_unstable", "nr_page_table_pages", - "nr_mapped", "nr_slab", "pgpgin", -- cgit v1.2.3-70-g09d2 From 
347ce434d57da80fd5809c0c836f206a50999c26 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 30 Jun 2006 01:55:35 -0700 Subject: [PATCH] zoned vm counters: conversion of nr_pagecache to per zone counter Currently a single atomic variable is used to establish the size of the page cache in the whole machine. The zoned VM counters have the same method of implementation as the nr_pagecache code but also allow the determination of the pagecache size per zone. Remove the special implementation for nr_pagecache and make it a zoned counter named NR_FILE_PAGES. Updates of the page cache counters are always performed with interrupts off. We can therefore use the __ variant here. Signed-off-by: Christoph Lameter Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/appldata/appldata_mem.c | 3 ++- arch/sparc/kernel/sys_sunos.c | 2 +- arch/sparc64/kernel/sys_sunos32.c | 2 +- drivers/base/node.c | 2 ++ fs/proc/proc_misc.c | 3 ++- include/linux/mmzone.h | 2 +- include/linux/pagemap.h | 45 --------------------------------------- mm/filemap.c | 4 ++-- mm/mmap.c | 2 +- mm/nommu.c | 2 +- mm/page_alloc.c | 5 ----- mm/swap_state.c | 4 ++-- mm/vmstat.c | 7 +----- 13 files changed, 16 insertions(+), 67 deletions(-) (limited to 'mm/vmstat.c') diff --git a/arch/s390/appldata/appldata_mem.c b/arch/s390/appldata/appldata_mem.c index 7915a197d96..180ba79a626 100644 --- a/arch/s390/appldata/appldata_mem.c +++ b/arch/s390/appldata/appldata_mem.c @@ -130,7 +130,8 @@ static void appldata_get_mem_data(void *data) mem_data->totalhigh = P2K(val.totalhigh); mem_data->freehigh = P2K(val.freehigh); mem_data->bufferram = P2K(val.bufferram); - mem_data->cached = P2K(atomic_read(&nr_pagecache) - val.bufferram); + mem_data->cached = P2K(global_page_state(NR_FILE_PAGES) + - val.bufferram); si_swapinfo(&val); mem_data->totalswap = P2K(val.totalswap); diff --git a/arch/sparc/kernel/sys_sunos.c b/arch/sparc/kernel/sys_sunos.c index 288de276d9f..aa0fb2efb61 100644 --- a/arch/sparc/kernel/sys_sunos.c +++ b/arch/sparc/kernel/sys_sunos.c @@ -196,7 +196,7 @@ asmlinkage int sunos_brk(unsigned long brk) * simple, it hopefully works in most obvious cases.. Easy to * fool it, but this should catch most mistakes. */ - freepages = get_page_cache_size(); + freepages = global_page_state(NR_FILE_PAGES); freepages >>= 1; freepages += nr_free_pages(); freepages += nr_swap_pages; diff --git a/arch/sparc64/kernel/sys_sunos32.c b/arch/sparc64/kernel/sys_sunos32.c index ae5b32f817f..87ebdf858a3 100644 --- a/arch/sparc64/kernel/sys_sunos32.c +++ b/arch/sparc64/kernel/sys_sunos32.c @@ -155,7 +155,7 @@ asmlinkage int sunos_brk(u32 baddr) * simple, it hopefully works in most obvious cases.. Easy to * fool it, but this should catch most mistakes. 
*/ - freepages = get_page_cache_size(); + freepages = global_page_state(NR_FILE_PAGES); freepages >>= 1; freepages += nr_free_pages(); freepages += nr_swap_pages; diff --git a/drivers/base/node.c b/drivers/base/node.c index 8b1232320a9..ae9e3fea4b3 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -69,6 +69,7 @@ static ssize_t node_read_meminfo(struct sys_device * dev, char * buf) "Node %d LowFree: %8lu kB\n" "Node %d Dirty: %8lu kB\n" "Node %d Writeback: %8lu kB\n" + "Node %d FilePages: %8lu kB\n" "Node %d Mapped: %8lu kB\n" "Node %d Slab: %8lu kB\n", nid, K(i.totalram), @@ -82,6 +83,7 @@ static ssize_t node_read_meminfo(struct sys_device * dev, char * buf) nid, K(i.freeram - i.freehigh), nid, K(ps.nr_dirty), nid, K(ps.nr_writeback), + nid, K(node_page_state(nid, NR_FILE_PAGES)), nid, K(node_page_state(nid, NR_FILE_MAPPED)), nid, K(ps.nr_slab)); n += hugetlb_report_node_meminfo(nid, buf + n); diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index bc7d9abca74..1af12fd77fe 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -142,7 +142,8 @@ static int meminfo_read_proc(char *page, char **start, off_t off, allowed = ((totalram_pages - hugetlb_total_pages()) * sysctl_overcommit_ratio / 100) + total_swap_pages; - cached = get_page_cache_size() - total_swapcache_pages - i.bufferram; + cached = global_page_state(NR_FILE_PAGES) - + total_swapcache_pages - i.bufferram; if (cached < 0) cached = 0; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index eb42c127702..08be91e6cec 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -49,7 +49,7 @@ struct zone_padding { enum zone_stat_item { NR_FILE_MAPPED, /* mapped into pagetables. only modified from process context */ - + NR_FILE_PAGES, NR_VM_ZONE_STAT_ITEMS }; struct per_cpu_pages { diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 1245df7141a..0a2f5d27f60 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -113,51 +113,6 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping, extern void remove_from_page_cache(struct page *page); extern void __remove_from_page_cache(struct page *page); -extern atomic_t nr_pagecache; - -#ifdef CONFIG_SMP - -#define PAGECACHE_ACCT_THRESHOLD max(16, NR_CPUS * 2) -DECLARE_PER_CPU(long, nr_pagecache_local); - -/* - * pagecache_acct implements approximate accounting for pagecache. - * vm_enough_memory() do not need high accuracy. Writers will keep - * an offset in their per-cpu arena and will spill that into the - * global count whenever the absolute value of the local count - * exceeds the counter's threshold. - * - * MUST be protected from preemption. - * current protection is mapping->page_lock. - */ -static inline void pagecache_acct(int count) -{ - long *local; - - local = &__get_cpu_var(nr_pagecache_local); - *local += count; - if (*local > PAGECACHE_ACCT_THRESHOLD || *local < -PAGECACHE_ACCT_THRESHOLD) { - atomic_add(*local, &nr_pagecache); - *local = 0; - } -} - -#else - -static inline void pagecache_acct(int count) -{ - atomic_add(count, &nr_pagecache); -} -#endif - -static inline unsigned long get_page_cache_size(void) -{ - int ret = atomic_read(&nr_pagecache); - if (unlikely(ret < 0)) - ret = 0; - return ret; -} - /* * Return byte-offset into filesystem object for page. 
*/ diff --git a/mm/filemap.c b/mm/filemap.c index 648f2c0c8e1..87d62c44c3f 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -120,7 +120,7 @@ void __remove_from_page_cache(struct page *page) radix_tree_delete(&mapping->page_tree, page->index); page->mapping = NULL; mapping->nrpages--; - pagecache_acct(-1); + __dec_zone_page_state(page, NR_FILE_PAGES); } void remove_from_page_cache(struct page *page) @@ -449,7 +449,7 @@ int add_to_page_cache(struct page *page, struct address_space *mapping, page->mapping = mapping; page->index = offset; mapping->nrpages++; - pagecache_acct(1); + __inc_zone_page_state(page, NR_FILE_PAGES); } write_unlock_irq(&mapping->tree_lock); radix_tree_preload_end(); diff --git a/mm/mmap.c b/mm/mmap.c index 6446c6134b0..c1868ecdbc5 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -96,7 +96,7 @@ int __vm_enough_memory(long pages, int cap_sys_admin) if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { unsigned long n; - free = get_page_cache_size(); + free = global_page_state(NR_FILE_PAGES); free += nr_swap_pages; /* diff --git a/mm/nommu.c b/mm/nommu.c index 029fadac0fb..5151c44a825 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1122,7 +1122,7 @@ int __vm_enough_memory(long pages, int cap_sys_admin) if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { unsigned long n; - free = get_page_cache_size(); + free = global_page_state(NR_FILE_PAGES); free += nr_swap_pages; /* diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 04dd2b01b2b..8350720f98a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2124,16 +2124,11 @@ static int page_alloc_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { int cpu = (unsigned long)hcpu; - long *count; unsigned long *src, *dest; if (action == CPU_DEAD) { int i; - /* Drain local pagecache count. 
 */
-		count = &per_cpu(nr_pagecache_local, cpu);
-		atomic_add(*count, &nr_pagecache);
-		*count = 0;
 		local_irq_disable();
 		__drain_pages(cpu);

diff --git a/mm/swap_state.c b/mm/swap_state.c
index 7535211bb49..fccbd9bba77 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -87,7 +87,7 @@ static int __add_to_swap_cache(struct page *page, swp_entry_t entry,
 		SetPageSwapCache(page);
 		set_page_private(page, entry.val);
 		total_swapcache_pages++;
-		pagecache_acct(1);
+		__inc_zone_page_state(page, NR_FILE_PAGES);
 	}
 	write_unlock_irq(&swapper_space.tree_lock);
 	radix_tree_preload_end();
@@ -132,7 +132,7 @@ void __delete_from_swap_cache(struct page *page)
 	set_page_private(page, 0);
 	ClearPageSwapCache(page);
 	total_swapcache_pages--;
-	pagecache_acct(-1);
+	__dec_zone_page_state(page, NR_FILE_PAGES);
 	INC_CACHE_INFO(del_total);
 }

diff --git a/mm/vmstat.c b/mm/vmstat.c
index 4800091c129..f16b33eb6d5 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -20,12 +20,6 @@
  */
 DEFINE_PER_CPU(struct page_state, page_states) = {0};
 
-atomic_t nr_pagecache = ATOMIC_INIT(0);
-EXPORT_SYMBOL(nr_pagecache);
-#ifdef CONFIG_SMP
-DEFINE_PER_CPU(long, nr_pagecache_local) = 0;
-#endif
-
 static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask)
 {
 	unsigned cpu;
@@ -402,6 +396,7 @@ struct seq_operations fragmentation_op = {
 static char *vmstat_text[] = {
 	/* Zoned VM counters */
 	"nr_mapped",
+	"nr_file_pages",
 
 	/* Page state */
 	"nr_dirty",

From f3dbd34460ff54962d3e3244b6bcb7f5295356e6 Mon Sep 17 00:00:00 2001
From: Christoph Lameter
Date: Fri, 30 Jun 2006 01:55:36 -0700
Subject: [PATCH] zoned vm counters: split NR_ANON_PAGES off from NR_FILE_MAPPED

The current NR_FILE_MAPPED is used by zone reclaim and the dirty load
calculation as the number of mapped pagecache pages.  However, that is
not true.  NR_FILE_MAPPED includes the mapped anonymous pages.  This
patch separates those and therefore allows an accurate tracking of the
anonymous pages per zone.

It then becomes possible to determine the number of unmapped pages per
zone and we can avoid scanning for unmapped pages if there are none.

Also it may now be possible to determine the mapped/unmapped ratio in
get_dirty_limit.  Isn't the number of anonymous pages irrelevant in that
calculation?

Note that this will change the meaning of the number of mapped pages
reported in /proc/vmstat, /proc/meminfo and in the per node statistics.
This may affect user space tools that monitor these counters!

NR_FILE_MAPPED works like NR_FILE_DIRTY.  It is only valid for pagecache
pages.
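After the split, "all mapped pages" is the sum of the two items, which
is exactly what the mm/page-writeback.c and mm/vmscan.c hunks in the
diff below compute.  A small standalone C illustration of that
arithmetic, with made-up page counts:

#include <stdio.h>

int main(void)
{
	unsigned long nr_file_mapped = 12000;	/* pagecache pages in pagetables */
	unsigned long nr_anon_pages  = 30000;	/* mapped anonymous pages        */
	unsigned long vm_total_pages = 262144;	/* hypothetical machine size    */

	/* Mirrors: mapped_ratio = ((NR_FILE_MAPPED + NR_ANON_PAGES) * 100)
	 *                         / vm_total_pages */
	unsigned long nr_mapped = nr_file_mapped + nr_anon_pages;
	unsigned long mapped_ratio = nr_mapped * 100 / vm_total_pages;

	printf("mapped: %lu pages (%lu%% of memory)\n", nr_mapped, mapped_ratio);
	return 0;
}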
Signed-off-by: Christoph Lameter Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/node.c | 2 ++ fs/proc/proc_misc.c | 2 ++ include/linux/mmzone.h | 3 ++- mm/page-writeback.c | 3 ++- mm/rmap.c | 5 +++-- mm/vmscan.c | 3 ++- mm/vmstat.c | 1 + 7 files changed, 14 insertions(+), 5 deletions(-) (limited to 'mm/vmstat.c') diff --git a/drivers/base/node.c b/drivers/base/node.c index ae9e3fea4b3..c3bf05158c6 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -71,6 +71,7 @@ static ssize_t node_read_meminfo(struct sys_device * dev, char * buf) "Node %d Writeback: %8lu kB\n" "Node %d FilePages: %8lu kB\n" "Node %d Mapped: %8lu kB\n" + "Node %d AnonPages: %8lu kB\n" "Node %d Slab: %8lu kB\n", nid, K(i.totalram), nid, K(i.freeram), @@ -85,6 +86,7 @@ static ssize_t node_read_meminfo(struct sys_device * dev, char * buf) nid, K(ps.nr_writeback), nid, K(node_page_state(nid, NR_FILE_PAGES)), nid, K(node_page_state(nid, NR_FILE_MAPPED)), + nid, K(node_page_state(nid, NR_ANON_PAGES)), nid, K(ps.nr_slab)); n += hugetlb_report_node_meminfo(nid, buf + n); return n; diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index 1af12fd77fe..ff809656ce3 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -168,6 +168,7 @@ static int meminfo_read_proc(char *page, char **start, off_t off, "SwapFree: %8lu kB\n" "Dirty: %8lu kB\n" "Writeback: %8lu kB\n" + "AnonPages: %8lu kB\n" "Mapped: %8lu kB\n" "Slab: %8lu kB\n" "CommitLimit: %8lu kB\n" @@ -191,6 +192,7 @@ static int meminfo_read_proc(char *page, char **start, off_t off, K(i.freeswap), K(ps.nr_dirty), K(ps.nr_writeback), + K(global_page_state(NR_ANON_PAGES)), K(global_page_state(NR_FILE_MAPPED)), K(ps.nr_slab), K(allowed), diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 08be91e6cec..4833abd4458 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -47,7 +47,8 @@ struct zone_padding { #endif enum zone_stat_item { - NR_FILE_MAPPED, /* mapped into pagetables. + NR_ANON_PAGES, /* Mapped anonymous pages */ + NR_FILE_MAPPED, /* pagecache pages mapped into pagetables. only modified from process context */ NR_FILE_PAGES, NR_VM_ZONE_STAT_ITEMS }; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 60c7244c42e..0faacfe1890 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -111,7 +111,8 @@ static void get_writeback_state(struct writeback_state *wbs) { wbs->nr_dirty = read_page_state(nr_dirty); wbs->nr_unstable = read_page_state(nr_unstable); - wbs->nr_mapped = global_page_state(NR_FILE_MAPPED); + wbs->nr_mapped = global_page_state(NR_FILE_MAPPED) + + global_page_state(NR_ANON_PAGES); wbs->nr_writeback = read_page_state(nr_writeback); } diff --git a/mm/rmap.c b/mm/rmap.c index af5e9808e65..40158b59729 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -455,7 +455,7 @@ static void __page_set_anon_rmap(struct page *page, * nr_mapped state can be updated without turning off * interrupts because it is not modified via interrupt. */ - __inc_zone_page_state(page, NR_FILE_MAPPED); + __inc_zone_page_state(page, NR_ANON_PAGES); } /** @@ -531,7 +531,8 @@ void page_remove_rmap(struct page *page) */ if (page_test_and_clear_dirty(page)) set_page_dirty(page); - __dec_zone_page_state(page, NR_FILE_MAPPED); + __dec_zone_page_state(page, + PageAnon(page) ? 
NR_ANON_PAGES : NR_FILE_MAPPED); } } diff --git a/mm/vmscan.c b/mm/vmscan.c index 08bc54e8086..2f0390161c0 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -742,7 +742,8 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, * how much memory * is mapped. */ - mapped_ratio = (global_page_state(NR_FILE_MAPPED) * 100) / + mapped_ratio = ((global_page_state(NR_FILE_MAPPED) + + global_page_state(NR_ANON_PAGES)) * 100) / vm_total_pages; /* diff --git a/mm/vmstat.c b/mm/vmstat.c index f16b33eb6d5..3baf4dffa62 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -395,6 +395,7 @@ struct seq_operations fragmentation_op = { static char *vmstat_text[] = { /* Zoned VM counters */ + "nr_anon_pages", "nr_mapped", "nr_file_pages", -- cgit v1.2.3-70-g09d2 From 9a865ffa34b6117a5e0b67640a084d8c2e198c93 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 30 Jun 2006 01:55:38 -0700 Subject: [PATCH] zoned vm counters: conversion of nr_slab to per zone counter - Allows reclaim to access counter without looping over processor counts. - Allows accurate statistics on how many pages are used in a zone by the slab. This may become useful to balance slab allocations over various zones. [akpm@osdl.org: bugfix] Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/mm/pgtable.c | 2 +- drivers/base/node.c | 4 +--- fs/proc/proc_misc.c | 2 +- include/linux/mmzone.h | 1 + include/linux/vmstat.h | 3 +-- mm/page_alloc.c | 2 +- mm/slab.c | 4 ++-- mm/vmscan.c | 2 +- mm/vmstat.c | 2 +- 9 files changed, 10 insertions(+), 12 deletions(-) (limited to 'mm/vmstat.c') diff --git a/arch/i386/mm/pgtable.c b/arch/i386/mm/pgtable.c index f85f1a40e5c..73ac3599a0e 100644 --- a/arch/i386/mm/pgtable.c +++ b/arch/i386/mm/pgtable.c @@ -62,7 +62,7 @@ void show_mem(void) printk(KERN_INFO "%lu pages dirty\n", ps.nr_dirty); printk(KERN_INFO "%lu pages writeback\n", ps.nr_writeback); printk(KERN_INFO "%lu pages mapped\n", global_page_state(NR_FILE_MAPPED)); - printk(KERN_INFO "%lu pages slab\n", ps.nr_slab); + printk(KERN_INFO "%lu pages slab\n", global_page_state(NR_SLAB)); printk(KERN_INFO "%lu pages pagetables\n", ps.nr_page_table_pages); } diff --git a/drivers/base/node.c b/drivers/base/node.c index c3bf05158c6..db116a8791c 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -54,8 +54,6 @@ static ssize_t node_read_meminfo(struct sys_device * dev, char * buf) ps.nr_dirty = 0; if ((long)ps.nr_writeback < 0) ps.nr_writeback = 0; - if ((long)ps.nr_slab < 0) - ps.nr_slab = 0; n = sprintf(buf, "\n" "Node %d MemTotal: %8lu kB\n" @@ -87,7 +85,7 @@ static ssize_t node_read_meminfo(struct sys_device * dev, char * buf) nid, K(node_page_state(nid, NR_FILE_PAGES)), nid, K(node_page_state(nid, NR_FILE_MAPPED)), nid, K(node_page_state(nid, NR_ANON_PAGES)), - nid, K(ps.nr_slab)); + nid, K(node_page_state(nid, NR_SLAB))); n += hugetlb_report_node_meminfo(nid, buf + n); return n; } diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index ff809656ce3..16aaf7187bb 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -194,7 +194,7 @@ static int meminfo_read_proc(char *page, char **start, off_t off, K(ps.nr_writeback), K(global_page_state(NR_ANON_PAGES)), K(global_page_state(NR_FILE_MAPPED)), - K(ps.nr_slab), + K(global_page_state(NR_SLAB)), K(allowed), K(committed), K(ps.nr_page_table_pages), diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 839e9a04fd4..67e03fc8533 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -51,6 +51,7 @@ enum 
zone_stat_item { NR_FILE_MAPPED, /* pagecache pages mapped into pagetables. only modified from process context */ NR_FILE_PAGES, + NR_SLAB, /* Pages used by slab allocator */ NR_VM_ZONE_STAT_ITEMS }; struct per_cpu_pages { diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 8ab8229523e..4b97381a293 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -26,8 +26,7 @@ struct page_state { unsigned long nr_writeback; /* Pages under writeback */ unsigned long nr_unstable; /* NFS unstable pages */ unsigned long nr_page_table_pages;/* Pages used for pagetables */ - unsigned long nr_slab; /* In slab */ -#define GET_PAGE_STATE_LAST nr_slab +#define GET_PAGE_STATE_LAST nr_page_table_pages /* * The below are zeroed by get_page_state(). Use get_full_page_state() diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8350720f98a..a38a11cfb48 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1318,7 +1318,7 @@ void show_free_areas(void) ps.nr_writeback, ps.nr_unstable, nr_free_pages(), - ps.nr_slab, + global_page_state(NR_SLAB), global_page_state(NR_FILE_MAPPED), ps.nr_page_table_pages); diff --git a/mm/slab.c b/mm/slab.c index 0c33820038c..5dcfb904480 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1507,7 +1507,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) nr_pages = (1 << cachep->gfporder); if (cachep->flags & SLAB_RECLAIM_ACCOUNT) atomic_add(nr_pages, &slab_reclaim_pages); - add_page_state(nr_slab, nr_pages); + add_zone_page_state(page_zone(page), NR_SLAB, nr_pages); for (i = 0; i < nr_pages; i++) __SetPageSlab(page + i); return page_address(page); @@ -1522,12 +1522,12 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr) struct page *page = virt_to_page(addr); const unsigned long nr_freed = i; + sub_zone_page_state(page_zone(page), NR_SLAB, nr_freed); while (i--) { BUG_ON(!PageSlab(page)); __ClearPageSlab(page); page++; } - sub_page_state(nr_slab, nr_freed); if (current->reclaim_state) current->reclaim_state->reclaimed_slab += nr_freed; free_pages((unsigned long)addr, cachep->gfporder); diff --git a/mm/vmscan.c b/mm/vmscan.c index 0960846d649..d6942436ac9 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1362,7 +1362,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages) for_each_zone(zone) lru_pages += zone->nr_active + zone->nr_inactive; - nr_slab = read_page_state(nr_slab); + nr_slab = global_page_state(NR_SLAB); /* If slab caches are huge, it's better to hit them first */ while (nr_slab >= lru_pages) { reclaim_state.reclaimed_slab = 0; diff --git a/mm/vmstat.c b/mm/vmstat.c index 3baf4dffa62..dc9e6920922 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -398,13 +398,13 @@ static char *vmstat_text[] = { "nr_anon_pages", "nr_mapped", "nr_file_pages", + "nr_slab", /* Page state */ "nr_dirty", "nr_writeback", "nr_unstable", "nr_page_table_pages", - "nr_slab", "pgpgin", "pgpgout", -- cgit v1.2.3-70-g09d2 From df849a1529c106f7460e51479ca78fe07b07dc8c Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 30 Jun 2006 01:55:38 -0700 Subject: [PATCH] zoned vm counters: conversion of nr_pagetables to per zone counter Conversion of nr_page_table_pages to a per zone counter [akpm@osdl.org: bugfix] Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mm/mm-armv.c | 2 +- arch/i386/mm/pgtable.c | 3 ++- arch/um/kernel/skas/mmu.c | 2 +- drivers/base/node.c | 2 ++ fs/proc/proc_misc.c | 4 ++-- include/linux/mmzone.h | 1 + include/linux/vmstat.h | 3 +-- mm/memory.c | 4 ++-- 
mm/page_alloc.c | 2 +- mm/vmstat.c | 2 +- 10 files changed, 14 insertions(+), 11 deletions(-) (limited to 'mm/vmstat.c') diff --git a/arch/arm/mm/mm-armv.c b/arch/arm/mm/mm-armv.c index 95273de4f77..931be179812 100644 --- a/arch/arm/mm/mm-armv.c +++ b/arch/arm/mm/mm-armv.c @@ -227,7 +227,7 @@ void free_pgd_slow(pgd_t *pgd) pte = pmd_page(*pmd); pmd_clear(pmd); - dec_page_state(nr_page_table_pages); + dec_zone_page_state(virt_to_page((unsigned long *)pgd), NR_PAGETABLE); pte_lock_deinit(pte); pte_free(pte); pmd_free(pmd); diff --git a/arch/i386/mm/pgtable.c b/arch/i386/mm/pgtable.c index 73ac3599a0e..0bb1e5c1344 100644 --- a/arch/i386/mm/pgtable.c +++ b/arch/i386/mm/pgtable.c @@ -63,7 +63,8 @@ void show_mem(void) printk(KERN_INFO "%lu pages writeback\n", ps.nr_writeback); printk(KERN_INFO "%lu pages mapped\n", global_page_state(NR_FILE_MAPPED)); printk(KERN_INFO "%lu pages slab\n", global_page_state(NR_SLAB)); - printk(KERN_INFO "%lu pages pagetables\n", ps.nr_page_table_pages); + printk(KERN_INFO "%lu pages pagetables\n", + global_page_state(NR_PAGETABLE)); } /* diff --git a/arch/um/kernel/skas/mmu.c b/arch/um/kernel/skas/mmu.c index c5c9885a829..624ca238d1f 100644 --- a/arch/um/kernel/skas/mmu.c +++ b/arch/um/kernel/skas/mmu.c @@ -152,7 +152,7 @@ void destroy_context_skas(struct mm_struct *mm) free_page(mmu->id.stack); pte_lock_deinit(virt_to_page(mmu->last_page_table)); pte_free_kernel((pte_t *) mmu->last_page_table); - dec_page_state(nr_page_table_pages); + dec_zone_page_state(virt_to_page(mmu->last_page_table), NR_PAGETABLE); #ifdef CONFIG_3_LEVEL_PGTABLES pmd_free((pmd_t *) mmu->last_pmd); #endif diff --git a/drivers/base/node.c b/drivers/base/node.c index db116a8791c..c22fb67ec50 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -70,6 +70,7 @@ static ssize_t node_read_meminfo(struct sys_device * dev, char * buf) "Node %d FilePages: %8lu kB\n" "Node %d Mapped: %8lu kB\n" "Node %d AnonPages: %8lu kB\n" + "Node %d PageTables: %8lu kB\n" "Node %d Slab: %8lu kB\n", nid, K(i.totalram), nid, K(i.freeram), @@ -85,6 +86,7 @@ static ssize_t node_read_meminfo(struct sys_device * dev, char * buf) nid, K(node_page_state(nid, NR_FILE_PAGES)), nid, K(node_page_state(nid, NR_FILE_MAPPED)), nid, K(node_page_state(nid, NR_ANON_PAGES)), + nid, K(node_page_state(nid, NR_PAGETABLE)), nid, K(node_page_state(nid, NR_SLAB))); n += hugetlb_report_node_meminfo(nid, buf + n); return n; diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index 16aaf7187bb..0eae68f8421 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -171,9 +171,9 @@ static int meminfo_read_proc(char *page, char **start, off_t off, "AnonPages: %8lu kB\n" "Mapped: %8lu kB\n" "Slab: %8lu kB\n" + "PageTables: %8lu kB\n" "CommitLimit: %8lu kB\n" "Committed_AS: %8lu kB\n" - "PageTables: %8lu kB\n" "VmallocTotal: %8lu kB\n" "VmallocUsed: %8lu kB\n" "VmallocChunk: %8lu kB\n", @@ -195,9 +195,9 @@ static int meminfo_read_proc(char *page, char **start, off_t off, K(global_page_state(NR_ANON_PAGES)), K(global_page_state(NR_FILE_MAPPED)), K(global_page_state(NR_SLAB)), + K(global_page_state(NR_PAGETABLE)), K(allowed), K(committed), - K(ps.nr_page_table_pages), (unsigned long)VMALLOC_TOTAL >> 10, vmi.used >> 10, vmi.largest_chunk >> 10 diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 67e03fc8533..15adb435f24 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -52,6 +52,7 @@ enum zone_stat_item { only modified from process context */ NR_FILE_PAGES, NR_SLAB, /* Pages used by slab allocator */ + 
NR_PAGETABLE, /* used for pagetables */ NR_VM_ZONE_STAT_ITEMS }; struct per_cpu_pages { diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 4b97381a293..56220441d7c 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -25,8 +25,7 @@ struct page_state { unsigned long nr_dirty; /* Dirty writeable pages */ unsigned long nr_writeback; /* Pages under writeback */ unsigned long nr_unstable; /* NFS unstable pages */ - unsigned long nr_page_table_pages;/* Pages used for pagetables */ -#define GET_PAGE_STATE_LAST nr_page_table_pages +#define GET_PAGE_STATE_LAST nr_unstable /* * The below are zeroed by get_page_state(). Use get_full_page_state() diff --git a/mm/memory.c b/mm/memory.c index 247b5c312b9..1a78791590f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -126,7 +126,7 @@ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd) pmd_clear(pmd); pte_lock_deinit(page); pte_free_tlb(tlb, page); - dec_page_state(nr_page_table_pages); + dec_zone_page_state(page, NR_PAGETABLE); tlb->mm->nr_ptes--; } @@ -311,7 +311,7 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) pte_free(new); } else { mm->nr_ptes++; - inc_page_state(nr_page_table_pages); + inc_zone_page_state(new, NR_PAGETABLE); pmd_populate(mm, pmd, new); } spin_unlock(&mm->page_table_lock); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a38a11cfb48..ed3f2a7b407 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1320,7 +1320,7 @@ void show_free_areas(void) nr_free_pages(), global_page_state(NR_SLAB), global_page_state(NR_FILE_MAPPED), - ps.nr_page_table_pages); + global_page_state(NR_PAGETABLE)); for_each_zone(zone) { int i; diff --git a/mm/vmstat.c b/mm/vmstat.c index dc9e6920922..292a35fe56c 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -399,12 +399,12 @@ static char *vmstat_text[] = { "nr_mapped", "nr_file_pages", "nr_slab", + "nr_page_table_pages", /* Page state */ "nr_dirty", "nr_writeback", "nr_unstable", - "nr_page_table_pages", "pgpgin", "pgpgout", -- cgit v1.2.3-70-g09d2 From b1e7a8fd854d2f895730e82137400012b509650e Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 30 Jun 2006 01:55:39 -0700 Subject: [PATCH] zoned vm counters: conversion of nr_dirty to per zone counter This makes nr_dirty a per zone counter. Looping over all processors is avoided during writeback state determination. The counter aggregation for nr_dirty had to be undone in the NFS layer since we summed up the page counts from multiple zones. Someone more familiar with NFS should probably review what I have done. 
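A minimal userspace model of the read-side change (hypothetical names, not the kernel API): the old page_state scheme summed a per cpu field across all processors on every read, while a zoned VM counter is folded into a global array as updates happen, so a read is a single load.

    #include <stdio.h>

    #define NCPUS 4

    /* Old style: one copy of the field per CPU, summed at read time. */
    static unsigned long percpu_nr_dirty[NCPUS];

    static unsigned long old_read_nr_dirty(void)
    {
        unsigned long sum = 0;
        int cpu;

        for (cpu = 0; cpu < NCPUS; cpu++)   /* O(NCPUS) per read */
            sum += percpu_nr_dirty[cpu];
        return sum;
    }

    /* New style: updates are folded into one global slot as they occur. */
    enum stat_item { NR_FILE_DIRTY_MODEL, NR_STAT_ITEMS };
    static unsigned long vm_stat_model[NR_STAT_ITEMS];

    static unsigned long new_read(enum stat_item item)
    {
        return vm_stat_model[item];         /* O(1) per read */
    }

    int main(void)
    {
        int cpu;

        for (cpu = 0; cpu < NCPUS; cpu++) {
            percpu_nr_dirty[cpu] = cpu + 1;
            vm_stat_model[NR_FILE_DIRTY_MODEL] += cpu + 1;
        }
        printf("old: %lu new: %lu\n",
               old_read_nr_dirty(), new_read(NR_FILE_DIRTY_MODEL));
        return 0;
    }

This is the reason get_writeback_state() below can read nr_dirty without touching every CPU.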
[akpm@osdl.org: bugfix] Signed-off-by: Christoph Lameter Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/mm/pgtable.c | 2 +- drivers/base/node.c | 4 +--- fs/buffer.c | 2 +- fs/fs-writeback.c | 2 +- fs/nfs/pagelist.c | 1 + fs/nfs/write.c | 3 +-- fs/proc/proc_misc.c | 2 +- include/linux/mmzone.h | 1 + include/linux/vmstat.h | 1 - mm/page-writeback.c | 11 ++++++----- mm/page_alloc.c | 2 +- mm/vmstat.c | 2 +- 12 files changed, 16 insertions(+), 17 deletions(-) (limited to 'mm/vmstat.c') diff --git a/arch/i386/mm/pgtable.c b/arch/i386/mm/pgtable.c index 0bb1e5c1344..aa211dcddb0 100644 --- a/arch/i386/mm/pgtable.c +++ b/arch/i386/mm/pgtable.c @@ -59,7 +59,7 @@ void show_mem(void) printk(KERN_INFO "%d pages swap cached\n", cached); get_page_state(&ps); - printk(KERN_INFO "%lu pages dirty\n", ps.nr_dirty); + printk(KERN_INFO "%lu pages dirty\n", global_page_state(NR_FILE_DIRTY)); printk(KERN_INFO "%lu pages writeback\n", ps.nr_writeback); printk(KERN_INFO "%lu pages mapped\n", global_page_state(NR_FILE_MAPPED)); printk(KERN_INFO "%lu pages slab\n", global_page_state(NR_SLAB)); diff --git a/drivers/base/node.c b/drivers/base/node.c index c22fb67ec50..6fed520d00f 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -50,8 +50,6 @@ static ssize_t node_read_meminfo(struct sys_device * dev, char * buf) __get_zone_counts(&active, &inactive, &free, NODE_DATA(nid)); /* Check for negative values in these approximate counters */ - if ((long)ps.nr_dirty < 0) - ps.nr_dirty = 0; if ((long)ps.nr_writeback < 0) ps.nr_writeback = 0; @@ -81,7 +79,7 @@ static ssize_t node_read_meminfo(struct sys_device * dev, char * buf) nid, K(i.freehigh), nid, K(i.totalram - i.totalhigh), nid, K(i.freeram - i.freehigh), - nid, K(ps.nr_dirty), + nid, K(node_page_state(nid, NR_FILE_DIRTY)), nid, K(ps.nr_writeback), nid, K(node_page_state(nid, NR_FILE_PAGES)), nid, K(node_page_state(nid, NR_FILE_MAPPED)), diff --git a/fs/buffer.c b/fs/buffer.c index e9994722f4a..90e52e67720 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -852,7 +852,7 @@ int __set_page_dirty_buffers(struct page *page) write_lock_irq(&mapping->tree_lock); if (page->mapping) { /* Race with truncate? 
*/ if (mapping_cap_account_dirty(mapping)) - inc_page_state(nr_dirty); + __inc_zone_page_state(page, NR_FILE_DIRTY); radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 031b27a4bc9..e5ad1075684 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -464,7 +464,7 @@ void sync_inodes_sb(struct super_block *sb, int wait) .range_start = 0, .range_end = LLONG_MAX, }; - unsigned long nr_dirty = read_page_state(nr_dirty); + unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY); unsigned long nr_unstable = read_page_state(nr_unstable); wbc.nr_to_write = nr_dirty + nr_unstable + diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index d89f6fb3b3a..26b1fe90937 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -315,6 +315,7 @@ nfs_scan_lock_dirty(struct nfs_inode *nfsi, struct list_head *dst, req->wb_index, NFS_PAGE_TAG_DIRTY); nfs_list_remove_request(req); nfs_list_add_request(req, dst); + dec_zone_page_state(req->wb_page, NR_FILE_DIRTY); res++; } } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 8fccb9cb173..a6d1ca513dd 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -497,7 +497,7 @@ nfs_mark_request_dirty(struct nfs_page *req) nfs_list_add_request(req, &nfsi->dirty); nfsi->ndirty++; spin_unlock(&nfsi->req_lock); - inc_page_state(nr_dirty); + inc_zone_page_state(req->wb_page, NR_FILE_DIRTY); mark_inode_dirty(inode); } @@ -609,7 +609,6 @@ nfs_scan_dirty(struct inode *inode, struct list_head *dst, unsigned long idx_sta if (nfsi->ndirty != 0) { res = nfs_scan_lock_dirty(nfsi, dst, idx_start, npages); nfsi->ndirty -= res; - sub_page_state(nr_dirty,res); if ((nfsi->ndirty == 0) != list_empty(&nfsi->dirty)) printk(KERN_ERR "NFS: desynchronized value of nfs_i.ndirty.\n"); } diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index 0eae68f8421..e23717dec3c 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -190,7 +190,7 @@ static int meminfo_read_proc(char *page, char **start, off_t off, K(i.freeram-i.freehigh), K(i.totalswap), K(i.freeswap), - K(ps.nr_dirty), + K(global_page_state(NR_FILE_DIRTY)), K(ps.nr_writeback), K(global_page_state(NR_ANON_PAGES)), K(global_page_state(NR_FILE_MAPPED)), diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 15adb435f24..1cc8412ac26 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -53,6 +53,7 @@ enum zone_stat_item { NR_FILE_PAGES, NR_SLAB, /* Pages used by slab allocator */ NR_PAGETABLE, /* used for pagetables */ + NR_FILE_DIRTY, NR_VM_ZONE_STAT_ITEMS }; struct per_cpu_pages { diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 56220441d7c..b323ea2c626 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -22,7 +22,6 @@ * commented here. 
*/ struct page_state { - unsigned long nr_dirty; /* Dirty writeable pages */ unsigned long nr_writeback; /* Pages under writeback */ unsigned long nr_unstable; /* NFS unstable pages */ #define GET_PAGE_STATE_LAST nr_unstable diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 0faacfe1890..da854783009 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -109,7 +109,7 @@ struct writeback_state static void get_writeback_state(struct writeback_state *wbs) { - wbs->nr_dirty = read_page_state(nr_dirty); + wbs->nr_dirty = global_page_state(NR_FILE_DIRTY); wbs->nr_unstable = read_page_state(nr_unstable); wbs->nr_mapped = global_page_state(NR_FILE_MAPPED) + global_page_state(NR_ANON_PAGES); @@ -641,7 +641,8 @@ int __set_page_dirty_nobuffers(struct page *page) if (mapping2) { /* Race with truncate? */ BUG_ON(mapping2 != mapping); if (mapping_cap_account_dirty(mapping)) - inc_page_state(nr_dirty); + __inc_zone_page_state(page, + NR_FILE_DIRTY); radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); } @@ -728,9 +729,9 @@ int test_clear_page_dirty(struct page *page) radix_tree_tag_clear(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); - write_unlock_irqrestore(&mapping->tree_lock, flags); if (mapping_cap_account_dirty(mapping)) - dec_page_state(nr_dirty); + __dec_zone_page_state(page, NR_FILE_DIRTY); + write_unlock_irqrestore(&mapping->tree_lock, flags); return 1; } write_unlock_irqrestore(&mapping->tree_lock, flags); @@ -761,7 +762,7 @@ int clear_page_dirty_for_io(struct page *page) if (mapping) { if (TestClearPageDirty(page)) { if (mapping_cap_account_dirty(mapping)) - dec_page_state(nr_dirty); + dec_zone_page_state(page, NR_FILE_DIRTY); return 1; } return 0; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ed3f2a7b407..c2b9aa4acc4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1314,7 +1314,7 @@ void show_free_areas(void) "unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n", active, inactive, - ps.nr_dirty, + global_page_state(NR_FILE_DIRTY), ps.nr_writeback, ps.nr_unstable, nr_free_pages(), diff --git a/mm/vmstat.c b/mm/vmstat.c index 292a35fe56c..1982fb533a4 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -400,9 +400,9 @@ static char *vmstat_text[] = { "nr_file_pages", "nr_slab", "nr_page_table_pages", + "nr_dirty", /* Page state */ - "nr_dirty", "nr_writeback", "nr_unstable", -- cgit v1.2.3-70-g09d2 From ce866b34ae1b7f1ce60234cf65855886ac7e7d30 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 30 Jun 2006 01:55:40 -0700 Subject: [PATCH] zoned vm counters: conversion of nr_writeback to per zone counter Conversion of nr_writeback to per zone counter. This removes the last page_state counter from arch/i386/mm/pgtable.c so we drop the page_state from there. 
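A sketch of the accounting rule the page-flags macros follow (userspace model with made-up types, not the kernel code): the per zone count may only move when the PG_writeback bit actually changes state, which is what keeps the increments and decrements balanced.

    #include <stdbool.h>
    #include <stdio.h>

    struct page_model {
        bool writeback;                 /* stands in for PG_writeback */
    };

    static unsigned long nr_writeback_model;

    /* Mirrors TestSetPageWriteback(): count only a 0 -> 1 transition. */
    static bool test_set_writeback(struct page_model *page)
    {
        bool was_set = page->writeback;

        page->writeback = true;
        if (!was_set)
            nr_writeback_model++;
        return was_set;
    }

    /* Mirrors TestClearPageWriteback(): count only a 1 -> 0 transition. */
    static bool test_clear_writeback(struct page_model *page)
    {
        bool was_set = page->writeback;

        page->writeback = false;
        if (was_set)
            nr_writeback_model--;
        return was_set;
    }

    int main(void)
    {
        struct page_model page = { false };

        test_set_writeback(&page);
        test_set_writeback(&page);      /* no-op: no double count */
        printf("under writeback: %lu\n", nr_writeback_model); /* 1 */
        test_clear_writeback(&page);
        printf("under writeback: %lu\n", nr_writeback_model); /* 0 */
        return 0;
    }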
[akpm@osdl.org: bugfix] Signed-off-by: Christoph Lameter Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/mm/pgtable.c | 5 ++--- drivers/base/node.c | 5 +---- fs/proc/proc_misc.c | 2 +- include/linux/mmzone.h | 1 + include/linux/page-flags.h | 8 ++++---- include/linux/vmstat.h | 1 - mm/page-writeback.c | 2 +- mm/page_alloc.c | 2 +- mm/vmstat.c | 2 +- 9 files changed, 12 insertions(+), 16 deletions(-) (limited to 'mm/vmstat.c') diff --git a/arch/i386/mm/pgtable.c b/arch/i386/mm/pgtable.c index aa211dcddb0..5e735ff90e8 100644 --- a/arch/i386/mm/pgtable.c +++ b/arch/i386/mm/pgtable.c @@ -30,7 +30,6 @@ void show_mem(void) struct page *page; pg_data_t *pgdat; unsigned long i; - struct page_state ps; unsigned long flags; printk(KERN_INFO "Mem-info:\n"); @@ -58,9 +57,9 @@ void show_mem(void) printk(KERN_INFO "%d pages shared\n", shared); printk(KERN_INFO "%d pages swap cached\n", cached); - get_page_state(&ps); printk(KERN_INFO "%lu pages dirty\n", global_page_state(NR_FILE_DIRTY)); - printk(KERN_INFO "%lu pages writeback\n", ps.nr_writeback); + printk(KERN_INFO "%lu pages writeback\n", + global_page_state(NR_WRITEBACK)); printk(KERN_INFO "%lu pages mapped\n", global_page_state(NR_FILE_MAPPED)); printk(KERN_INFO "%lu pages slab\n", global_page_state(NR_SLAB)); printk(KERN_INFO "%lu pages pagetables\n", diff --git a/drivers/base/node.c b/drivers/base/node.c index 6fed520d00f..a7b3dcbbdfc 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -49,9 +49,6 @@ static ssize_t node_read_meminfo(struct sys_device * dev, char * buf) get_page_state_node(&ps, nid); __get_zone_counts(&active, &inactive, &free, NODE_DATA(nid)); - /* Check for negative values in these approximate counters */ - if ((long)ps.nr_writeback < 0) - ps.nr_writeback = 0; n = sprintf(buf, "\n" "Node %d MemTotal: %8lu kB\n" @@ -80,7 +77,7 @@ static ssize_t node_read_meminfo(struct sys_device * dev, char * buf) nid, K(i.totalram - i.totalhigh), nid, K(i.freeram - i.freehigh), nid, K(node_page_state(nid, NR_FILE_DIRTY)), - nid, K(ps.nr_writeback), + nid, K(node_page_state(nid, NR_WRITEBACK)), nid, K(node_page_state(nid, NR_FILE_PAGES)), nid, K(node_page_state(nid, NR_FILE_MAPPED)), nid, K(node_page_state(nid, NR_ANON_PAGES)), diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index e23717dec3c..48cb3586244 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -191,7 +191,7 @@ static int meminfo_read_proc(char *page, char **start, off_t off, K(i.totalswap), K(i.freeswap), K(global_page_state(NR_FILE_DIRTY)), - K(ps.nr_writeback), + K(global_page_state(NR_WRITEBACK)), K(global_page_state(NR_ANON_PAGES)), K(global_page_state(NR_FILE_MAPPED)), K(global_page_state(NR_SLAB)), diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 1cc8412ac26..885cc972700 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -54,6 +54,7 @@ enum zone_stat_item { NR_SLAB, /* Pages used by slab allocator */ NR_PAGETABLE, /* used for pagetables */ NR_FILE_DIRTY, + NR_WRITEBACK, NR_VM_ZONE_STAT_ITEMS }; struct per_cpu_pages { diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index ff235c4b79e..5748642e9f3 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -181,7 +181,7 @@ do { \ if (!test_and_set_bit(PG_writeback, \ &(page)->flags)) \ - inc_page_state(nr_writeback); \ + inc_zone_page_state(page, NR_WRITEBACK); \ } while (0) #define TestSetPageWriteback(page) \ ({ \ @@ -189,14 +189,14 @@ ret = test_and_set_bit(PG_writeback, \ 
&(page)->flags); \ if (!ret) \ - inc_page_state(nr_writeback); \ + inc_zone_page_state(page, NR_WRITEBACK); \ ret; \ }) #define ClearPageWriteback(page) \ do { \ if (test_and_clear_bit(PG_writeback, \ &(page)->flags)) \ - dec_page_state(nr_writeback); \ + dec_zone_page_state(page, NR_WRITEBACK); \ } while (0) #define TestClearPageWriteback(page) \ ({ \ @@ -204,7 +204,7 @@ ret = test_and_clear_bit(PG_writeback, \ &(page)->flags); \ if (ret) \ - dec_page_state(nr_writeback); \ + dec_zone_page_state(page, NR_WRITEBACK); \ ret; \ }) diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index b323ea2c626..60c2e0382ce 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -22,7 +22,6 @@ * commented here. */ struct page_state { - unsigned long nr_writeback; /* Pages under writeback */ unsigned long nr_unstable; /* NFS unstable pages */ #define GET_PAGE_STATE_LAST nr_unstable diff --git a/mm/page-writeback.c b/mm/page-writeback.c index da854783009..3cfdff4b198 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -113,7 +113,7 @@ static void get_writeback_state(struct writeback_state *wbs) wbs->nr_unstable = read_page_state(nr_unstable); wbs->nr_mapped = global_page_state(NR_FILE_MAPPED) + global_page_state(NR_ANON_PAGES); - wbs->nr_writeback = read_page_state(nr_writeback); + wbs->nr_writeback = global_page_state(NR_WRITEBACK); } /* diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c2b9aa4acc4..a12e894a8bc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1315,7 +1315,7 @@ void show_free_areas(void) active, inactive, global_page_state(NR_FILE_DIRTY), - ps.nr_writeback, + global_page_state(NR_WRITEBACK), ps.nr_unstable, nr_free_pages(), global_page_state(NR_SLAB), diff --git a/mm/vmstat.c b/mm/vmstat.c index 1982fb533a4..e84c7e520a1 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -401,9 +401,9 @@ static char *vmstat_text[] = { "nr_slab", "nr_page_table_pages", "nr_dirty", + "nr_writeback", /* Page state */ - "nr_writeback", "nr_unstable", "pgpgin", -- cgit v1.2.3-70-g09d2 From fd39fc8561be33065306bdac0e30414e1e8ac8e1 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 30 Jun 2006 01:55:40 -0700 Subject: [PATCH] zoned vm counters: conversion of nr_unstable to per zone counter Conversion of nr_unstable to a per zone counter We need to do some special modifications to the nfs code since there are multiple cases of disposition and we need to have a page ref for proper accounting. This converts the last critical page state of the VM and therefore we need to remove several functions that were depending on GET_PAGE_STATE_LAST in order to make the kernel compile again. We are only left with event type counters in page state. 
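One way to see why the NFS batch update had to become per page (userspace sketch, made-up types): once the counter is kept per zone, a batch of completed commit requests may span zones, so each page has to be charged to the zone it actually sits in.

    #include <stdio.h>

    #define NR_ZONES 2

    /* Per zone counter, as after this conversion. */
    static unsigned long nr_unstable_model[NR_ZONES];

    struct req_model {
        int zone;                       /* zone of req->wb_page */
    };

    int main(void)
    {
        struct req_model reqs[3] = { {0}, {1}, {1} };
        int i;

        /* One inc per page, as in nfs_mark_request_commit(). */
        for (i = 0; i < 3; i++)
            nr_unstable_model[reqs[i].zone]++;

        /*
         * The old sub_page_state(nr_unstable, res) had no zone to
         * charge; the decrement must happen per page, as in
         * nfs_commit_done().
         */
        for (i = 0; i < 3; i++)
            nr_unstable_model[reqs[i].zone]--;

        printf("zone0=%lu zone1=%lu\n",
               nr_unstable_model[0], nr_unstable_model[1]);
        return 0;
    }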
[akpm@osdl.org: bugfixes] Signed-off-by: Christoph Lameter Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/node.c | 4 ++-- fs/fs-writeback.c | 2 +- fs/nfs/write.c | 6 ++---- fs/proc/proc_misc.c | 4 ++-- include/linux/mmzone.h | 1 + include/linux/vmstat.h | 9 --------- mm/page-writeback.c | 2 +- mm/page_alloc.c | 4 +--- mm/vmstat.c | 25 +------------------------ 9 files changed, 11 insertions(+), 46 deletions(-) (limited to 'mm/vmstat.c') diff --git a/drivers/base/node.c b/drivers/base/node.c index a7b3dcbbdfc..b73d869c9d4 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -40,13 +40,11 @@ static ssize_t node_read_meminfo(struct sys_device * dev, char * buf) int n; int nid = dev->id; struct sysinfo i; - struct page_state ps; unsigned long inactive; unsigned long active; unsigned long free; si_meminfo_node(&i, nid); - get_page_state_node(&ps, nid); __get_zone_counts(&active, &inactive, &free, NODE_DATA(nid)); @@ -66,6 +64,7 @@ static ssize_t node_read_meminfo(struct sys_device * dev, char * buf) "Node %d Mapped: %8lu kB\n" "Node %d AnonPages: %8lu kB\n" "Node %d PageTables: %8lu kB\n" + "Node %d NFS Unstable: %8lu kB\n" "Node %d Slab: %8lu kB\n", nid, K(i.totalram), nid, K(i.freeram), @@ -82,6 +81,7 @@ static ssize_t node_read_meminfo(struct sys_device * dev, char * buf) nid, K(node_page_state(nid, NR_FILE_MAPPED)), nid, K(node_page_state(nid, NR_ANON_PAGES)), nid, K(node_page_state(nid, NR_PAGETABLE)), + nid, K(node_page_state(nid, NR_UNSTABLE_NFS)), nid, K(node_page_state(nid, NR_SLAB))); n += hugetlb_report_node_meminfo(nid, buf + n); return n; diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index e5ad1075684..892643dc9af 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -465,7 +465,7 @@ void sync_inodes_sb(struct super_block *sb, int wait) .range_end = LLONG_MAX, }; unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY); - unsigned long nr_unstable = read_page_state(nr_unstable); + unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS); wbc.nr_to_write = nr_dirty + nr_unstable + (inodes_stat.nr_inodes - inodes_stat.nr_unused) + diff --git a/fs/nfs/write.c b/fs/nfs/write.c index a6d1ca513dd..f21e268705c 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -525,7 +525,7 @@ nfs_mark_request_commit(struct nfs_page *req) nfs_list_add_request(req, &nfsi->commit); nfsi->ncommit++; spin_unlock(&nfsi->req_lock); - inc_page_state(nr_unstable); + inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); mark_inode_dirty(inode); } #endif @@ -1393,7 +1393,6 @@ static void nfs_commit_done(struct rpc_task *task, void *calldata) { struct nfs_write_data *data = calldata; struct nfs_page *req; - int res = 0; dprintk("NFS: %4d nfs_commit_done (status %d)\n", task->tk_pid, task->tk_status); @@ -1405,6 +1404,7 @@ static void nfs_commit_done(struct rpc_task *task, void *calldata) while (!list_empty(&data->pages)) { req = nfs_list_entry(data->pages.next); nfs_list_remove_request(req); + dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); dprintk("NFS: commit (%s/%Ld %d@%Ld)", req->wb_context->dentry->d_inode->i_sb->s_id, @@ -1431,9 +1431,7 @@ static void nfs_commit_done(struct rpc_task *task, void *calldata) nfs_mark_request_dirty(req); next: nfs_clear_page_writeback(req); - res++; } - sub_page_state(nr_unstable,res); } static const struct rpc_call_ops nfs_commit_ops = { diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index 48cb3586244..eae66b41a1a 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -120,7 +120,6 @@ 
static int meminfo_read_proc(char *page, char **start, off_t off, { struct sysinfo i; int len; - struct page_state ps; unsigned long inactive; unsigned long active; unsigned long free; @@ -129,7 +128,6 @@ static int meminfo_read_proc(char *page, char **start, off_t off, struct vmalloc_info vmi; long cached; - get_page_state(&ps); get_zone_counts(&active, &inactive, &free); /* @@ -172,6 +170,7 @@ static int meminfo_read_proc(char *page, char **start, off_t off, "Mapped: %8lu kB\n" "Slab: %8lu kB\n" "PageTables: %8lu kB\n" + "NFS Unstable: %8lu kB\n" "CommitLimit: %8lu kB\n" "Committed_AS: %8lu kB\n" "VmallocTotal: %8lu kB\n" @@ -196,6 +195,7 @@ static int meminfo_read_proc(char *page, char **start, off_t off, K(global_page_state(NR_FILE_MAPPED)), K(global_page_state(NR_SLAB)), K(global_page_state(NR_PAGETABLE)), + K(global_page_state(NR_UNSTABLE_NFS)), K(allowed), K(committed), (unsigned long)VMALLOC_TOTAL >> 10, diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 885cc972700..e9d80697f55 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -55,6 +55,7 @@ enum zone_stat_item { NR_PAGETABLE, /* used for pagetables */ NR_FILE_DIRTY, NR_WRITEBACK, + NR_UNSTABLE_NFS, /* NFS unstable pages */ NR_VM_ZONE_STAT_ITEMS }; struct per_cpu_pages { diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 60c2e0382ce..9de2a41c885 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -22,13 +22,6 @@ * commented here. */ struct page_state { - unsigned long nr_unstable; /* NFS unstable pages */ -#define GET_PAGE_STATE_LAST nr_unstable - - /* - * The below are zeroed by get_page_state(). Use get_full_page_state() - * to add up all these. - */ unsigned long pgpgin; /* Disk reads */ unsigned long pgpgout; /* Disk writes */ unsigned long pswpin; /* swap reads */ @@ -77,8 +70,6 @@ struct page_state { unsigned long nr_bounce; /* pages for bounce buffers */ }; -extern void get_page_state(struct page_state *ret); -extern void get_page_state_node(struct page_state *ret, int node); extern void get_full_page_state(struct page_state *ret); extern unsigned long read_page_state_offset(unsigned long offset); extern void mod_page_state_offset(unsigned long offset, unsigned long delta); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 3cfdff4b198..de9836f43db 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -110,7 +110,7 @@ struct writeback_state static void get_writeback_state(struct writeback_state *wbs) { wbs->nr_dirty = global_page_state(NR_FILE_DIRTY); - wbs->nr_unstable = read_page_state(nr_unstable); + wbs->nr_unstable = global_page_state(NR_UNSTABLE_NFS); wbs->nr_mapped = global_page_state(NR_FILE_MAPPED) + global_page_state(NR_ANON_PAGES); wbs->nr_writeback = global_page_state(NR_WRITEBACK); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a12e894a8bc..6aa2c31f513 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1271,7 +1271,6 @@ void si_meminfo_node(struct sysinfo *val, int nid) */ void show_free_areas(void) { - struct page_state ps; int cpu, temperature; unsigned long active; unsigned long inactive; @@ -1303,7 +1302,6 @@ void show_free_areas(void) } } - get_page_state(&ps); get_zone_counts(&active, &inactive, &free); printk("Free pages: %11ukB (%ukB HighMem)\n", @@ -1316,7 +1314,7 @@ void show_free_areas(void) inactive, global_page_state(NR_FILE_DIRTY), global_page_state(NR_WRITEBACK), - ps.nr_unstable, + global_page_state(NR_UNSTABLE_NFS), nr_free_pages(), global_page_state(NR_SLAB), global_page_state(NR_FILE_MAPPED), diff 
--git a/mm/vmstat.c b/mm/vmstat.c index e84c7e520a1..9dc270aed5c 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -45,28 +45,6 @@ static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask) } } -void get_page_state_node(struct page_state *ret, int node) -{ - int nr; - cpumask_t mask = node_to_cpumask(node); - - nr = offsetof(struct page_state, GET_PAGE_STATE_LAST); - nr /= sizeof(unsigned long); - - __get_page_state(ret, nr+1, &mask); -} - -void get_page_state(struct page_state *ret) -{ - int nr; - cpumask_t mask = CPU_MASK_ALL; - - nr = offsetof(struct page_state, GET_PAGE_STATE_LAST); - nr /= sizeof(unsigned long); - - __get_page_state(ret, nr + 1, &mask); -} - void get_full_page_state(struct page_state *ret) { cpumask_t mask = CPU_MASK_ALL; @@ -402,10 +380,9 @@ static char *vmstat_text[] = { "nr_page_table_pages", "nr_dirty", "nr_writeback", - - /* Page state */ "nr_unstable", + /* Event counters */ "pgpgin", "pgpgout", "pswpin", -- cgit v1.2.3-70-g09d2 From d2c5e30c9a1420902262aa923794d2ae4e0bc391 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 30 Jun 2006 01:55:41 -0700 Subject: [PATCH] zoned vm counters: conversion of nr_bounce to per zone counter Conversion of nr_bounce to a per zone counter nr_bounce is only used for proc output. So it could be left as an event counter. However, the event counters may not be accurate and nr_bounce is categorizing types of pages in a zone. So we really need this to also be a per zone counter. [akpm@osdl.org: bugfix] Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/node.c | 2 ++ fs/proc/proc_misc.c | 2 ++ include/linux/mmzone.h | 1 + include/linux/vmstat.h | 1 - mm/highmem.c | 6 +++--- mm/vmstat.c | 2 +- 6 files changed, 9 insertions(+), 5 deletions(-) (limited to 'mm/vmstat.c') diff --git a/drivers/base/node.c b/drivers/base/node.c index b73d869c9d4..772eadac57a 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -65,6 +65,7 @@ static ssize_t node_read_meminfo(struct sys_device * dev, char * buf) "Node %d AnonPages: %8lu kB\n" "Node %d PageTables: %8lu kB\n" "Node %d NFS Unstable: %8lu kB\n" + "Node %d Bounce: %8lu kB\n" "Node %d Slab: %8lu kB\n", nid, K(i.totalram), nid, K(i.freeram), @@ -82,6 +83,7 @@ static ssize_t node_read_meminfo(struct sys_device * dev, char * buf) nid, K(node_page_state(nid, NR_ANON_PAGES)), nid, K(node_page_state(nid, NR_PAGETABLE)), nid, K(node_page_state(nid, NR_UNSTABLE_NFS)), + nid, K(node_page_state(nid, NR_BOUNCE)), nid, K(node_page_state(nid, NR_SLAB))); n += hugetlb_report_node_meminfo(nid, buf + n); return n; diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index eae66b41a1a..6aa2aa779a0 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -171,6 +171,7 @@ static int meminfo_read_proc(char *page, char **start, off_t off, "Slab: %8lu kB\n" "PageTables: %8lu kB\n" "NFS Unstable: %8lu kB\n" + "Bounce: %8lu kB\n" "CommitLimit: %8lu kB\n" "Committed_AS: %8lu kB\n" "VmallocTotal: %8lu kB\n" @@ -196,6 +197,7 @@ static int meminfo_read_proc(char *page, char **start, off_t off, K(global_page_state(NR_SLAB)), K(global_page_state(NR_PAGETABLE)), K(global_page_state(NR_UNSTABLE_NFS)), + K(global_page_state(NR_BOUNCE)), K(allowed), K(committed), (unsigned long)VMALLOC_TOTAL >> 10, diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index e9d80697f55..2dbeec1d287 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -56,6 +56,7 @@ enum zone_stat_item { NR_FILE_DIRTY, NR_WRITEBACK, 
NR_UNSTABLE_NFS, /* NFS unstable pages */ + NR_BOUNCE, NR_VM_ZONE_STAT_ITEMS }; struct per_cpu_pages { diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 9de2a41c885..5b5b96afc39 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -67,7 +67,6 @@ struct page_state { unsigned long allocstall; /* direct reclaim calls */ unsigned long pgrotated; /* pages rotated to tail of the LRU */ - unsigned long nr_bounce; /* pages for bounce buffers */ }; extern void get_full_page_state(struct page_state *ret); diff --git a/mm/highmem.c b/mm/highmem.c index 9b274fdf9d0..9b2a5403c44 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -315,8 +315,8 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool, int err) if (bvec->bv_page == org_vec->bv_page) continue; - mempool_free(bvec->bv_page, pool); - dec_page_state(nr_bounce); + dec_zone_page_state(bvec->bv_page, NR_BOUNCE); + mempool_free(bvec->bv_page, pool); } bio_endio(bio_orig, bio_orig->bi_size, err); @@ -397,7 +397,7 @@ static void __blk_queue_bounce(request_queue_t *q, struct bio **bio_orig, to->bv_page = mempool_alloc(pool, q->bounce_gfp); to->bv_len = from->bv_len; to->bv_offset = from->bv_offset; - inc_page_state(nr_bounce); + inc_zone_page_state(to->bv_page, NR_BOUNCE); if (rw == WRITE) { char *vto, *vfrom; diff --git a/mm/vmstat.c b/mm/vmstat.c index 9dc270aed5c..25e5ca7c174 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -381,6 +381,7 @@ static char *vmstat_text[] = { "nr_dirty", "nr_writeback", "nr_unstable", + "nr_bounce", /* Event counters */ "pgpgin", @@ -428,7 +429,6 @@ static char *vmstat_text[] = { "allocstall", "pgrotated", - "nr_bounce", }; /* -- cgit v1.2.3-70-g09d2 From bab1846a0582f627f5ec22aa2dc5f4f3e82e8176 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 30 Jun 2006 01:55:43 -0700 Subject: [PATCH] zoned-vm-counters: remove read_page_state() No callers. 
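For context, a userspace model of the accessor being removed (hypothetical names): read_page_state() located the same field in every CPU's page_state copy by byte offset and summed it, a walk that global_page_state() has made unnecessary.

    #include <stddef.h>
    #include <stdio.h>

    #define NCPUS 2

    struct page_state_model {
        unsigned long pgpgin;
        unsigned long pgpgout;
    };

    static struct page_state_model per_cpu_states[NCPUS];

    /* Model of read_page_state_offset(): sum one field across CPUs. */
    static unsigned long read_state_offset(size_t offset)
    {
        unsigned long sum = 0;
        int cpu;

        for (cpu = 0; cpu < NCPUS; cpu++) {
            char *base = (char *)&per_cpu_states[cpu];

            sum += *(unsigned long *)(base + offset);
        }
        return sum;
    }

    #define read_state(member) \
        read_state_offset(offsetof(struct page_state_model, member))

    int main(void)
    {
        per_cpu_states[0].pgpgout = 3;
        per_cpu_states[1].pgpgout = 4;
        printf("pgpgout = %lu\n", read_state(pgpgout)); /* 7 */
        return 0;
    }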
Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmstat.h | 4 ---- mm/vmstat.c | 14 -------------- 2 files changed, 18 deletions(-) (limited to 'mm/vmstat.c') diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 5b5b96afc39..5fad1613e7d 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -70,13 +70,9 @@ struct page_state { }; extern void get_full_page_state(struct page_state *ret); -extern unsigned long read_page_state_offset(unsigned long offset); extern void mod_page_state_offset(unsigned long offset, unsigned long delta); extern void __mod_page_state_offset(unsigned long offset, unsigned long delta); -#define read_page_state(member) \ - read_page_state_offset(offsetof(struct page_state, member)) - #define mod_page_state(member, delta) \ mod_page_state_offset(offsetof(struct page_state, member), (delta)) diff --git a/mm/vmstat.c b/mm/vmstat.c index 25e5ca7c174..06a6d105219 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -52,20 +52,6 @@ void get_full_page_state(struct page_state *ret) __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask); } -unsigned long read_page_state_offset(unsigned long offset) -{ - unsigned long ret = 0; - int cpu; - - for_each_online_cpu(cpu) { - unsigned long in; - - in = (unsigned long)&per_cpu(page_states, cpu) + offset; - ret += *((unsigned long *)in); - } - return ret; -} - void __mod_page_state_offset(unsigned long offset, unsigned long delta) { void *ptr; -- cgit v1.2.3-70-g09d2 From ca889e6c45e0b112cb2ca9d35afc66297519b5d5 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 30 Jun 2006 01:55:44 -0700 Subject: [PATCH] Use Zoned VM Counters for NUMA statistics The numa statistics are really event counters. But they are per node and so we have had special treatment for these counters through additional fields on the pcp structure. We can now use the per zone nature of the zoned VM counters to realize these. This will shrink the size of the pcp structure on NUMA systems. We will have some room to add additional per zone counters that will all still fit in the same cacheline.

Bits  Prior pcp size          Size after patch        We can add
------------------------------------------------------------------
64    128 bytes (16 words)    80 bytes (10 words)     48
32    76 bytes (19 words)     56 bytes (14 words)     8 (64 byte cacheline)
                                                      72 (128 byte)

Remove the special statistics for numa and replace them with zoned vm counters. This has the side effect that global sums of these events now show up in /proc/vmstat. Also take the opportunity to move the zone_statistics() function from page_alloc.c into vmstat.c.
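The bookkeeping zone_statistics() keeps can be summarized with a small userspace sketch (made-up array layout; the real function, added to mm/vmstat.c below, works on zones and pgdats): every allocation bumps exactly one of hit/miss and one of local/other, and a miss additionally charges foreign to the node that was preferred.

    #include <stdio.h>

    enum numa_item { HIT, MISS, FOREIGN, LOCAL, OTHER, NR_NUMA_ITEMS };

    #define NR_NODES 2
    static unsigned long numa_stats[NR_NODES][NR_NUMA_ITEMS];

    /*
     * preferred: first node of the zonelist; got: node that served
     * the allocation; running: node the allocating CPU sits on.
     */
    static void account_alloc(int preferred, int got, int running)
    {
        if (got == preferred) {
            numa_stats[got][HIT]++;
        } else {
            numa_stats[got][MISS]++;
            numa_stats[preferred][FOREIGN]++;
        }
        if (got == running)
            numa_stats[got][LOCAL]++;
        else
            numa_stats[got][OTHER]++;
    }

    int main(void)
    {
        account_alloc(0, 0, 0);   /* satisfied where intended */
        account_alloc(0, 1, 0);   /* fell back: miss + foreign */
        printf("node0: hit=%lu foreign=%lu\n",
               numa_stats[0][HIT], numa_stats[0][FOREIGN]);
        printf("node1: miss=%lu other=%lu\n",
               numa_stats[1][MISS], numa_stats[1][OTHER]);
        return 0;
    }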
Discussions: V2 http://marc.theaimsgroup.com/?t=115048227000002&r=1&w=2 Signed-off-by: Christoph Lameter Acked-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/node.c | 34 +++++------------------ include/linux/mmzone.h | 17 ++++++------ include/linux/vmstat.h | 10 ++++++- mm/mempolicy.c | 6 ++--- mm/page_alloc.c | 23 +--------------- mm/vmstat.c | 73 ++++++++++++++++++++++++++++++++------------------ 6 files changed, 73 insertions(+), 90 deletions(-) (limited to 'mm/vmstat.c') diff --git a/drivers/base/node.c b/drivers/base/node.c index 772eadac57a..d7de1753e09 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -94,28 +94,6 @@ static SYSDEV_ATTR(meminfo, S_IRUGO, node_read_meminfo, NULL); static ssize_t node_read_numastat(struct sys_device * dev, char * buf) { - unsigned long numa_hit, numa_miss, interleave_hit, numa_foreign; - unsigned long local_node, other_node; - int i, cpu; - pg_data_t *pg = NODE_DATA(dev->id); - numa_hit = 0; - numa_miss = 0; - interleave_hit = 0; - numa_foreign = 0; - local_node = 0; - other_node = 0; - for (i = 0; i < MAX_NR_ZONES; i++) { - struct zone *z = &pg->node_zones[i]; - for_each_online_cpu(cpu) { - struct per_cpu_pageset *ps = zone_pcp(z,cpu); - numa_hit += ps->numa_hit; - numa_miss += ps->numa_miss; - numa_foreign += ps->numa_foreign; - interleave_hit += ps->interleave_hit; - local_node += ps->local_node; - other_node += ps->other_node; - } - } return sprintf(buf, "numa_hit %lu\n" "numa_miss %lu\n" @@ -123,12 +101,12 @@ static ssize_t node_read_numastat(struct sys_device * dev, char * buf) "interleave_hit %lu\n" "local_node %lu\n" "other_node %lu\n", - numa_hit, - numa_miss, - numa_foreign, - interleave_hit, - local_node, - other_node); + node_page_state(dev->id, NUMA_HIT), + node_page_state(dev->id, NUMA_MISS), + node_page_state(dev->id, NUMA_FOREIGN), + node_page_state(dev->id, NUMA_INTERLEAVE_HIT), + node_page_state(dev->id, NUMA_LOCAL), + node_page_state(dev->id, NUMA_OTHER)); } static SYSDEV_ATTR(numastat, S_IRUGO, node_read_numastat, NULL); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 2dbeec1d287..27e748eb72b 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -57,6 +57,14 @@ enum zone_stat_item { NR_WRITEBACK, NR_UNSTABLE_NFS, /* NFS unstable pages */ NR_BOUNCE, +#ifdef CONFIG_NUMA + NUMA_HIT, /* allocated in intended node */ + NUMA_MISS, /* allocated in non intended node */ + NUMA_FOREIGN, /* was intended here, hit elsewhere */ + NUMA_INTERLEAVE_HIT, /* interleaver preferred this zone */ + NUMA_LOCAL, /* allocation from local node */ + NUMA_OTHER, /* allocation from other node */ +#endif NR_VM_ZONE_STAT_ITEMS }; struct per_cpu_pages { @@ -71,15 +79,6 @@ struct per_cpu_pageset { #ifdef CONFIG_SMP s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS]; #endif - -#ifdef CONFIG_NUMA - unsigned long numa_hit; /* allocated in intended node */ - unsigned long numa_miss; /* allocated in non intended node */ - unsigned long numa_foreign; /* was intended here, hit elsewhere */ - unsigned long interleave_hit; /* interleaver prefered this zone */ - unsigned long local_node; /* allocation from local node */ - unsigned long other_node; /* allocation from other node */ -#endif } ____cacheline_aligned_in_smp; #ifdef CONFIG_NUMA diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 5fad1613e7d..16173b63ee6 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -173,9 +173,15 @@ static inline unsigned long node_page_state(int node, #endif 
zone_page_state(&zones[ZONE_DMA], item); } + +extern void zone_statistics(struct zonelist *, struct zone *); + #else + #define node_page_state(node, item) global_page_state(item) -#endif +#define zone_statistics(_zl,_z) do { } while (0) + +#endif /* CONFIG_NUMA */ #define __add_zone_page_state(__z, __i, __d) \ __mod_zone_page_state(__z, __i, __d) @@ -190,6 +196,8 @@ static inline void zap_zone_vm_stats(struct zone *zone) memset(zone->vm_stat, 0, sizeof(zone->vm_stat)); } +extern void inc_zone_state(struct zone *, enum zone_stat_item); + #ifdef CONFIG_SMP void __mod_zone_page_state(struct zone *, enum zone_stat_item item, int); void __inc_zone_page_state(struct page *, enum zone_stat_item); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 6b9740bbf4c..e07e27e846a 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1209,10 +1209,8 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp); page = __alloc_pages(gfp, order, zl); - if (page && page_zone(page) == zl->zones[0]) { - zone_pcp(zl->zones[0],get_cpu())->interleave_hit++; - put_cpu(); - } + if (page && page_zone(page) == zl->zones[0]) + inc_zone_page_state(page, NUMA_INTERLEAVE_HIT); return page; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6aa2c31f513..d61671260f9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -709,27 +709,6 @@ void drain_local_pages(void) } #endif /* CONFIG_PM */ -static void zone_statistics(struct zonelist *zonelist, struct zone *z, int cpu) -{ -#ifdef CONFIG_NUMA - pg_data_t *pg = z->zone_pgdat; - pg_data_t *orig = zonelist->zones[0]->zone_pgdat; - struct per_cpu_pageset *p; - - p = zone_pcp(z, cpu); - if (pg == orig) { - p->numa_hit++; - } else { - p->numa_miss++; - zone_pcp(zonelist->zones[0], cpu)->numa_foreign++; - } - if (pg == NODE_DATA(numa_node_id())) - p->local_node++; - else - p->other_node++; -#endif -} - /* * Free a 0-order page */ @@ -827,7 +806,7 @@ again: } __mod_page_state_zone(zone, pgalloc, 1 << order); - zone_statistics(zonelist, zone, cpu); + zone_statistics(zonelist, zone); local_irq_restore(flags); put_cpu(); diff --git a/mm/vmstat.c b/mm/vmstat.c index 06a6d105219..ee7f8966625 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -185,9 +185,8 @@ EXPORT_SYMBOL(mod_zone_page_state); * in between and therefore the atomicity vs. interrupt cannot be exploited * in a useful way here. 
*/ -void __inc_zone_page_state(struct page *page, enum zone_stat_item item) +static void __inc_zone_state(struct zone *zone, enum zone_stat_item item) { - struct zone *zone = page_zone(page); s8 *p = diff_pointer(zone, item); (*p)++; @@ -197,6 +196,11 @@ *p = 0; } } + +void __inc_zone_page_state(struct page *page, enum zone_stat_item item) +{ + __inc_zone_state(page_zone(page), item); +} EXPORT_SYMBOL(__inc_zone_page_state); void __dec_zone_page_state(struct page *page, enum zone_stat_item item) @@ -213,22 +217,23 @@ } EXPORT_SYMBOL(__dec_zone_page_state); +void inc_zone_state(struct zone *zone, enum zone_stat_item item) +{ + unsigned long flags; + + local_irq_save(flags); + __inc_zone_state(zone, item); + local_irq_restore(flags); +} + void inc_zone_page_state(struct page *page, enum zone_stat_item item) { unsigned long flags; struct zone *zone; - s8 *p; zone = page_zone(page); local_irq_save(flags); - p = diff_pointer(zone, item); - - (*p)++; - - if (unlikely(*p > STAT_THRESHOLD)) { - zone_page_state_add(*p, zone, item); - *p = 0; - } + __inc_zone_state(zone, item); local_irq_restore(flags); } EXPORT_SYMBOL(inc_zone_page_state); @@ -297,6 +302,28 @@ EXPORT_SYMBOL(refresh_vm_stats); #endif +#ifdef CONFIG_NUMA +/* + * zonelist = the list of zones passed to the allocator + * z = the zone from which the allocation occurred. + * + * Must be called with interrupts disabled. + */ +void zone_statistics(struct zonelist *zonelist, struct zone *z) +{ + if (z->zone_pgdat == zonelist->zones[0]->zone_pgdat) { + __inc_zone_state(z, NUMA_HIT); + } else { + __inc_zone_state(z, NUMA_MISS); + __inc_zone_state(zonelist->zones[0], NUMA_FOREIGN); + } + if (z->zone_pgdat == NODE_DATA(numa_node_id())) + __inc_zone_state(z, NUMA_LOCAL); + else + __inc_zone_state(z, NUMA_OTHER); +} +#endif + #ifdef CONFIG_PROC_FS #include @@ -369,6 +396,15 @@ static char *vmstat_text[] = { "nr_unstable", "nr_bounce", +#ifdef CONFIG_NUMA + "numa_hit", + "numa_miss", + "numa_foreign", + "numa_interleave", + "numa_local", + "numa_other", +#endif + /* Event counters */ "pgpgin", "pgpgout", @@ -490,21 +526,6 @@ static int zoneinfo_show(struct seq_file *m, void *arg) pageset->pcp[j].high, pageset->pcp[j].batch); } -#ifdef CONFIG_NUMA - seq_printf(m, - "\n numa_hit: %lu" - "\n numa_miss: %lu" - "\n numa_foreign: %lu" - "\n interleave_hit: %lu" - "\n local_node: %lu" - "\n other_node: %lu", - pageset->numa_hit, - pageset->numa_miss, - pageset->numa_foreign, - pageset->interleave_hit, - pageset->local_node, - pageset->other_node); -#endif } seq_printf(m, "\n all_unreclaimable: %u" -- cgit v1.2.3-70-g09d2 From f8891e5e1f93a128c3900f82035e8541357896a7 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 30 Jun 2006 01:55:45 -0700 Subject: [PATCH] Light weight event counters The remaining counters in page_state after the zoned VM counter patches have been applied are all just for show in /proc/vmstat. They have no essential function for the VM. We use a simple increment of per cpu variables. In order to avoid the most severe races we disable preempt. Preempt does not prevent the race between an increment and an interrupt handler incrementing the same statistics counter. However, that race is exceedingly rare; we may only lose one increment or so, and there is no requirement (at least not in the kernel) that the vm event counters have to be accurate.
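As a userspace sketch of the tolerated race (hypothetical names): the counter++ is a load, an add and a store, and an interrupt that increments the same counter between the load and the store is simply overwritten, losing one event.

    #include <stdio.h>

    static unsigned long pgfault_events;    /* stands in for one vm_event */

    /* counter++ as the CPU executes it: load, add, store. */
    static void count_event_model(void)
    {
        unsigned long tmp = pgfault_events;     /* load  */

        tmp = tmp + 1;                          /* add   */
        /* an increment landing here would be overwritten */
        pgfault_events = tmp;                   /* store */
    }

    int main(void)
    {
        count_event_model();
        count_event_model();
        printf("events: %lu\n", pgfault_events); /* 2 */
        return 0;
    }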
In the non-preempt case this results in a simple increment for each counter. For many architectures this will be reduced by the compiler to a single instruction. This single instruction is atomic for i386 and x86_64, and therefore even the rare race condition in an interrupt is avoided for both architectures in most cases. The patchset also adds an off switch for embedded systems that allows building Linux kernels without these counters. The implementation of these counters is through inline code that hopefully results in only a single increment instruction being emitted (i386, x86_64) or in the increment being hidden through instruction concurrency (EPIC architectures such as ia64 can get that done). Benefits: - VM event counter operations usually reduce to a single inline instruction on i386 and x86_64. - No interrupt disable, only preempt disable for the preempt case. Preempt disable can also be avoided by moving the counter into a spinlock. - Handling is similar to zoned VM counters. - Simple and easily extendable. - Can be omitted to reduce memory use on embedded systems. References: RFC http://marc.theaimsgroup.com/?l=linux-kernel&m=113512330605497&w=2 RFC http://marc.theaimsgroup.com/?l=linux-kernel&m=114988082814934&w=2 local_t http://marc.theaimsgroup.com/?l=linux-kernel&m=114991748606690&w=2 V2 http://marc.theaimsgroup.com/?t=115014808400007&r=1&w=2 V3 http://marc.theaimsgroup.com/?l=linux-kernel&m=115024767022346&w=2 V4 http://marc.theaimsgroup.com/?l=linux-kernel&m=115047968808926&w=2 Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/appldata/appldata_base.c | 1 - arch/s390/appldata/appldata_mem.c | 20 ++--- block/ll_rw_blk.c | 4 +- drivers/parisc/led.c | 11 +-- fs/inode.c | 9 +- fs/ncpfs/mmap.c | 2 +- include/linux/vmstat.h | 170 ++++++++++++++---------------- init/Kconfig | 9 ++ mm/filemap.c | 4 +- mm/memory.c | 4 +- mm/page_alloc.c | 21 +---- mm/page_io.c | 4 +- mm/shmem.c | 4 +- mm/swap.c | 4 +- mm/vmscan.c | 23 +++-- mm/vmstat.c | 171 +++++++++++++++++++------------ 16 files changed, 212 insertions(+), 249 deletions(-) (limited to 'mm/vmstat.c') diff --git a/arch/s390/appldata/appldata_base.c b/arch/s390/appldata/appldata_base.c index 61bc44626c0..2476ca739c1 100644 --- a/arch/s390/appldata/appldata_base.c +++ b/arch/s390/appldata/appldata_base.c @@ -766,7 +766,6 @@ unsigned long nr_iowait(void) #endif /* MODULE */ EXPORT_SYMBOL_GPL(si_swapinfo); EXPORT_SYMBOL_GPL(nr_threads); -EXPORT_SYMBOL_GPL(get_full_page_state); EXPORT_SYMBOL_GPL(nr_running); EXPORT_SYMBOL_GPL(nr_iowait); //EXPORT_SYMBOL_GPL(nr_context_switches); diff --git a/arch/s390/appldata/appldata_mem.c b/arch/s390/appldata/appldata_mem.c index 180ba79a626..4811e2dac86 100644 --- a/arch/s390/appldata/appldata_mem.c +++ b/arch/s390/appldata/appldata_mem.c @@ -107,21 +107,21 @@ static void appldata_get_mem_data(void *data) * serialized through the appldata_ops_lock and can use static */ static struct sysinfo val; - static struct page_state ps; + unsigned long ev[NR_VM_EVENT_ITEMS]; struct appldata_mem_data *mem_data; mem_data = data; mem_data->sync_count_1++; - get_full_page_state(&ps); - mem_data->pgpgin = ps.pgpgin >> 1; - mem_data->pgpgout = ps.pgpgout >> 1; - mem_data->pswpin = ps.pswpin; - mem_data->pswpout = ps.pswpout; - mem_data->pgalloc = ps.pgalloc_high + ps.pgalloc_normal + - ps.pgalloc_dma; - mem_data->pgfault = ps.pgfault; - mem_data->pgmajfault = ps.pgmajfault; + all_vm_events(ev); + mem_data->pgpgin = ev[PGPGIN] >> 1; +
mem_data->pgpgout = ev[PGPGOUT] >> 1; + mem_data->pswpin = ev[PSWPIN]; + mem_data->pswpout = ev[PSWPOUT]; + mem_data->pgalloc = ev[PGALLOC_HIGH] + ev[PGALLOC_NORMAL] + + ev[PGALLOC_DMA]; + mem_data->pgfault = ev[PGFAULT]; + mem_data->pgmajfault = ev[PGMAJFAULT]; si_meminfo(&val); mem_data->sharedram = val.sharedram; diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c index eee03a3876a..fb83547f563 100644 --- a/block/ll_rw_blk.c +++ b/block/ll_rw_blk.c @@ -3117,9 +3117,9 @@ void submit_bio(int rw, struct bio *bio) BIO_BUG_ON(!bio->bi_io_vec); bio->bi_rw |= rw; if (rw & WRITE) - mod_page_state(pgpgout, count); + count_vm_events(PGPGOUT, count); else - mod_page_state(pgpgin, count); + count_vm_events(PGPGIN, count); if (unlikely(block_dump)) { char b[BDEVNAME_SIZE]; diff --git a/drivers/parisc/led.c b/drivers/parisc/led.c index 298f2ddb2c1..d7024c7483b 100644 --- a/drivers/parisc/led.c +++ b/drivers/parisc/led.c @@ -411,16 +411,17 @@ static __inline__ int led_get_net_activity(void) static __inline__ int led_get_diskio_activity(void) { static unsigned long last_pgpgin, last_pgpgout; - struct page_state pgstat; + unsigned long events[NR_VM_EVENT_ITEMS]; int changed; - get_full_page_state(&pgstat); /* get no of sectors in & out */ + all_vm_events(events); /* Just use a very simple calculation here. Do not care about overflow, since we only want to know if there was activity or not. */ - changed = (pgstat.pgpgin != last_pgpgin) || (pgstat.pgpgout != last_pgpgout); - last_pgpgin = pgstat.pgpgin; - last_pgpgout = pgstat.pgpgout; + changed = (events[PGPGIN] != last_pgpgin) || + (events[PGPGOUT] != last_pgpgout); + last_pgpgin = events[PGPGIN]; + last_pgpgout = events[PGPGOUT]; return (changed ? LED_DISK_IO : 0); } diff --git a/fs/inode.c b/fs/inode.c index f42961eb983..14a6c4147e4 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -452,15 +452,14 @@ static void prune_icache(int nr_to_scan) nr_pruned++; } inodes_stat.nr_unused -= nr_pruned; + if (current_is_kswapd()) + __count_vm_events(KSWAPD_INODESTEAL, reap); + else + __count_vm_events(PGINODESTEAL, reap); spin_unlock(&inode_lock); dispose_list(&freeable); mutex_unlock(&iprune_mutex); - - if (current_is_kswapd()) - mod_page_state(kswapd_inodesteal, reap); - else - mod_page_state(pginodesteal, reap); } /* diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c index 52d60c3d899..e7d5a3097fe 100644 --- a/fs/ncpfs/mmap.c +++ b/fs/ncpfs/mmap.c @@ -93,7 +93,7 @@ static struct page* ncp_file_mmap_nopage(struct vm_area_struct *area, */ if (type) *type = VM_FAULT_MAJOR; - inc_page_state(pgmajfault); + count_vm_event(PGMAJFAULT); return page; } diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 16173b63ee6..3e0daf54133 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -7,115 +7,77 @@ #include #include +#ifdef CONFIG_VM_EVENT_COUNTERS /* - * Global page accounting. One instance per CPU. Only unsigned longs are - * allowed. + * Light weight per cpu counter implementation. * - * - Fields can be modified with xxx_page_state and xxx_page_state_zone at - * any time safely (which protects the instance from modification by - * interrupt. - * - The __xxx_page_state variants can be used safely when interrupts are - * disabled. - * - The __xxx_page_state variants can be used if the field is only - * modified from process context and protected from preemption, or only - * modified from interrupt context. In this case, the field should be - * commented here. 
+ * Counters should only be incremented and no critical kernel component + * should rely on the counter values. + * + * Counters are handled completely inline. On many platforms the code + * generated will simply be the increment of a global address. */ -struct page_state { - unsigned long pgpgin; /* Disk reads */ - unsigned long pgpgout; /* Disk writes */ - unsigned long pswpin; /* swap reads */ - unsigned long pswpout; /* swap writes */ - - unsigned long pgalloc_high; /* page allocations */ - unsigned long pgalloc_normal; - unsigned long pgalloc_dma32; - unsigned long pgalloc_dma; - - unsigned long pgfree; /* page freeings */ - unsigned long pgactivate; /* pages moved inactive->active */ - unsigned long pgdeactivate; /* pages moved active->inactive */ - - unsigned long pgfault; /* faults (major+minor) */ - unsigned long pgmajfault; /* faults (major only) */ - - unsigned long pgrefill_high; /* inspected in refill_inactive_zone */ - unsigned long pgrefill_normal; - unsigned long pgrefill_dma32; - unsigned long pgrefill_dma; - - unsigned long pgsteal_high; /* total highmem pages reclaimed */ - unsigned long pgsteal_normal; - unsigned long pgsteal_dma32; - unsigned long pgsteal_dma; - - unsigned long pgscan_kswapd_high;/* total highmem pages scanned */ - unsigned long pgscan_kswapd_normal; - unsigned long pgscan_kswapd_dma32; - unsigned long pgscan_kswapd_dma; - - unsigned long pgscan_direct_high;/* total highmem pages scanned */ - unsigned long pgscan_direct_normal; - unsigned long pgscan_direct_dma32; - unsigned long pgscan_direct_dma; - - unsigned long pginodesteal; /* pages reclaimed via inode freeing */ - unsigned long slabs_scanned; /* slab objects scanned */ - unsigned long kswapd_steal; /* pages reclaimed by kswapd */ - unsigned long kswapd_inodesteal;/* reclaimed via kswapd inode freeing */ - unsigned long pageoutrun; /* kswapd's calls to page reclaim */ - unsigned long allocstall; /* direct reclaim calls */ - - unsigned long pgrotated; /* pages rotated to tail of the LRU */ + +#define FOR_ALL_ZONES(x) x##_DMA, x##_DMA32, x##_NORMAL, x##_HIGH + +enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, + FOR_ALL_ZONES(PGALLOC), + PGFREE, PGACTIVATE, PGDEACTIVATE, + PGFAULT, PGMAJFAULT, + FOR_ALL_ZONES(PGREFILL), + FOR_ALL_ZONES(PGSTEAL), + FOR_ALL_ZONES(PGSCAN_KSWAPD), + FOR_ALL_ZONES(PGSCAN_DIRECT), + PGINODESTEAL, SLABS_SCANNED, KSWAPD_STEAL, KSWAPD_INODESTEAL, + PAGEOUTRUN, ALLOCSTALL, PGROTATED, + NR_VM_EVENT_ITEMS +}; + +struct vm_event_state { + unsigned long event[NR_VM_EVENT_ITEMS]; }; -extern void get_full_page_state(struct page_state *ret); -extern void mod_page_state_offset(unsigned long offset, unsigned long delta); -extern void __mod_page_state_offset(unsigned long offset, unsigned long delta); - -#define mod_page_state(member, delta) \ - mod_page_state_offset(offsetof(struct page_state, member), (delta)) - -#define __mod_page_state(member, delta) \ - __mod_page_state_offset(offsetof(struct page_state, member), (delta)) - -#define inc_page_state(member) mod_page_state(member, 1UL) -#define dec_page_state(member) mod_page_state(member, 0UL - 1) -#define add_page_state(member,delta) mod_page_state(member, (delta)) -#define sub_page_state(member,delta) mod_page_state(member, 0UL - (delta)) - -#define __inc_page_state(member) __mod_page_state(member, 1UL) -#define __dec_page_state(member) __mod_page_state(member, 0UL - 1) -#define __add_page_state(member,delta) __mod_page_state(member, (delta)) -#define __sub_page_state(member,delta) __mod_page_state(member, 0UL - (delta)) - 
-#define page_state(member) (*__page_state(offsetof(struct page_state, member))) - -#define state_zone_offset(zone, member) \ -({ \ - unsigned offset; \ - if (is_highmem(zone)) \ - offset = offsetof(struct page_state, member##_high); \ - else if (is_normal(zone)) \ - offset = offsetof(struct page_state, member##_normal); \ - else if (is_dma32(zone)) \ - offset = offsetof(struct page_state, member##_dma32); \ - else \ - offset = offsetof(struct page_state, member##_dma); \ - offset; \ -}) - -#define __mod_page_state_zone(zone, member, delta) \ - do { \ - __mod_page_state_offset(state_zone_offset(zone, member), (delta)); \ - } while (0) - -#define mod_page_state_zone(zone, member, delta) \ - do { \ - mod_page_state_offset(state_zone_offset(zone, member), (delta)); \ - } while (0) - -DECLARE_PER_CPU(struct page_state, page_states); +DECLARE_PER_CPU(struct vm_event_state, vm_event_states); + +static inline void __count_vm_event(enum vm_event_item item) +{ + __get_cpu_var(vm_event_states.event[item])++; +} + +static inline void count_vm_event(enum vm_event_item item) +{ + get_cpu_var(vm_event_states.event[item])++; + put_cpu(); +} + +static inline void __count_vm_events(enum vm_event_item item, long delta) +{ + __get_cpu_var(vm_event_states.event[item]) += delta; +} + +static inline void count_vm_events(enum vm_event_item item, long delta) +{ + get_cpu_var(vm_event_states.event[item]) += delta; + put_cpu(); +} + +extern void all_vm_events(unsigned long *); +extern void vm_events_fold_cpu(int cpu); + +#else + +/* Disable counters */ +#define get_cpu_vm_events(e) 0L +#define count_vm_event(e) do { } while (0) +#define count_vm_events(e,d) do { } while (0) +#define __count_vm_event(e) do { } while (0) +#define __count_vm_events(e,d) do { } while (0) +#define vm_events_fold_cpu(x) do { } while (0) + +#endif /* CONFIG_VM_EVENT_COUNTERS */ + +#define __count_zone_vm_events(item, zone, delta) \ + __count_vm_events(item##_DMA + zone_idx(zone), delta) /* * Zone based page accounting with per cpu differentials. diff --git a/init/Kconfig b/init/Kconfig index f70f2fd273c..f515948889a 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -379,6 +379,15 @@ config SLAB SLOB is more space efficient but does not scale well and is more susceptible to fragmentation. +config VM_EVENT_COUNTERS + default y + bool "Enable VM event counters for /proc/vmstat" if EMBEDDED + help + VM event counters are only needed for event counts to be + shown. They have no function for the kernel itself. This + option allows the disabling of the VM event counters. + /proc/vmstat will only show page counts.
diff --git a/init/Kconfig b/init/Kconfig
index f70f2fd273c..f515948889a 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -379,6 +379,15 @@ config SLAB
 	  SLOB is more space efficient but does not scale well and is more
 	  susceptible to fragmentation.
 
+config VM_EVENT_COUNTERS
+	default y
+	bool "Enable VM event counters for /proc/vmstat" if EMBEDDED
+	help
+	  VM event counters are only needed for event counts to be
+	  shown. They have no function for the kernel itself. This
+	  option allows the disabling of the VM event counters.
+	  /proc/vmstat will only show page counts.
+
 endmenu # General setup
 
 config TINY_SHMEM
diff --git a/mm/filemap.c b/mm/filemap.c
index 87d62c44c3f..796a5471b49 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1416,7 +1416,7 @@ retry_find:
 		 */
 		if (!did_readaround) {
 			majmin = VM_FAULT_MAJOR;
-			inc_page_state(pgmajfault);
+			count_vm_event(PGMAJFAULT);
 		}
 		did_readaround = 1;
 		ra_pages = max_sane_readahead(file->f_ra.ra_pages);
@@ -1487,7 +1487,7 @@ no_cached_page:
 
 page_not_uptodate:
 	if (!did_readaround) {
 		majmin = VM_FAULT_MAJOR;
-		inc_page_state(pgmajfault);
+		count_vm_event(PGMAJFAULT);
 	}
 	lock_page(page);
diff --git a/mm/memory.c b/mm/memory.c
index 1a78791590f..7e2a4b1580e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1951,7 +1951,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
 		/* Had to read the page from swap area: Major fault */
 		ret = VM_FAULT_MAJOR;
-		inc_page_state(pgmajfault);
+		count_vm_event(PGMAJFAULT);
 		grab_swap_token();
 	}
 
@@ -2324,7 +2324,7 @@ int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	__set_current_state(TASK_RUNNING);
 
-	inc_page_state(pgfault);
+	count_vm_event(PGFAULT);
 
 	if (unlikely(is_vm_hugetlb_page(vma)))
 		return hugetlb_fault(mm, vma, address, write_access);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d61671260f9..30b0b97ad02 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -456,7 +456,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
 
 	kernel_map_pages(page, 1 << order, 0);
 	local_irq_save(flags);
-	__mod_page_state(pgfree, 1 << order);
+	__count_vm_events(PGFREE, 1 << order);
 	free_one_page(page_zone(page), page, order);
 	local_irq_restore(flags);
 }
@@ -729,7 +729,7 @@ static void fastcall free_hot_cold_page(struct page *page, int cold)
 
 	pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
 	local_irq_save(flags);
-	__inc_page_state(pgfree);
+	__count_vm_event(PGFREE);
 	list_add(&page->lru, &pcp->list);
 	pcp->count++;
 	if (pcp->count >= pcp->high) {
@@ -805,7 +805,7 @@ again:
 		goto failed;
 	}
 
-	__mod_page_state_zone(zone, pgalloc, 1 << order);
+	__count_zone_vm_events(PGALLOC, zone, 1 << order);
 	zone_statistics(zonelist, zone);
 	local_irq_restore(flags);
 	put_cpu();
@@ -2101,24 +2101,11 @@ static int page_alloc_cpu_notify(struct notifier_block *self,
 			unsigned long action, void *hcpu)
 {
 	int cpu = (unsigned long)hcpu;
-	unsigned long *src, *dest;
 
 	if (action == CPU_DEAD) {
-		int i;
-
 		local_irq_disable();
 		__drain_pages(cpu);
-
-		/* Add dead cpu's page_states to our own. */
-		dest = (unsigned long *)&__get_cpu_var(page_states);
-		src = (unsigned long *)&per_cpu(page_states, cpu);
-
-		for (i = 0; i < sizeof(struct page_state)/sizeof(unsigned long);
-				i++) {
-			dest[i] += src[i];
-			src[i] = 0;
-		}
-
+		vm_events_fold_cpu(cpu);
 		local_irq_enable();
 		refresh_cpu_vm_stats(cpu);
 	}
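Note that the allocator conversions count pages, not calls: freeing or allocating an order-N block adds 1 << N to PGFREE or PGALLOC. A tiny standalone illustration, with sketch_count_vm_events standing in for __count_vm_events():

    #include <stdio.h>

    static unsigned long pgfree; /* one event slot, per-cpu in the kernel */

    static void sketch_count_vm_events(unsigned long *ev, long delta)
    {
        *ev += delta;
    }

    int main(void)
    {
        unsigned int order = 3; /* an order-3 block is 2^3 = 8 pages */

        sketch_count_vm_events(&pgfree, 1L << order);
        printf("pgfree after one order-3 free: %lu\n", pgfree); /* 8 */
        return 0;
    }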
diff --git a/mm/page_io.c b/mm/page_io.c
index bb2b0d53889..88029948d00 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -101,7 +101,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
 	}
 	if (wbc->sync_mode == WB_SYNC_ALL)
 		rw |= (1 << BIO_RW_SYNC);
-	inc_page_state(pswpout);
+	count_vm_event(PSWPOUT);
 	set_page_writeback(page);
 	unlock_page(page);
 	submit_bio(rw, bio);
@@ -123,7 +123,7 @@ int swap_readpage(struct file *file, struct page *page)
 		ret = -ENOMEM;
 		goto out;
 	}
-	inc_page_state(pswpin);
+	count_vm_event(PSWPIN);
 	submit_bio(READ, bio);
 out:
 	return ret;
diff --git a/mm/shmem.c b/mm/shmem.c
index b14ff817d16..a9c09e0ba70 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1045,12 +1045,12 @@ repeat:
 		swappage = lookup_swap_cache(swap);
 		if (!swappage) {
 			shmem_swp_unmap(entry);
-			spin_unlock(&info->lock);
 			/* here we actually do the io */
 			if (type && *type == VM_FAULT_MINOR) {
-				inc_page_state(pgmajfault);
+				__count_vm_event(PGMAJFAULT);
 				*type = VM_FAULT_MAJOR;
 			}
+			spin_unlock(&info->lock);
 			swappage = shmem_swapin(info, swap, idx);
 			if (!swappage) {
 				spin_lock(&info->lock);
diff --git a/mm/swap.c b/mm/swap.c
index 990868afc1c..8fd095c4ae5 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -87,7 +87,7 @@ int rotate_reclaimable_page(struct page *page)
 	spin_lock_irqsave(&zone->lru_lock, flags);
 	if (PageLRU(page) && !PageActive(page)) {
 		list_move_tail(&page->lru, &zone->inactive_list);
-		inc_page_state(pgrotated);
+		__count_vm_event(PGROTATED);
 	}
 	if (!test_clear_page_writeback(page))
 		BUG();
@@ -107,7 +107,7 @@ void fastcall activate_page(struct page *page)
 		del_page_from_inactive_list(zone, page);
 		SetPageActive(page);
 		add_page_to_active_list(zone, page);
-		inc_page_state(pgactivate);
+		__count_vm_event(PGACTIVATE);
 	}
 	spin_unlock_irq(&zone->lru_lock);
 }
diff --git a/mm/vmscan.c b/mm/vmscan.c
index d6942436ac9..ff2ebe9458a 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -215,7 +215,7 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
 				break;
 			if (shrink_ret < nr_before)
 				ret += nr_before - shrink_ret;
-			mod_page_state(slabs_scanned, this_scan);
+			count_vm_events(SLABS_SCANNED, this_scan);
 			total_scan -= this_scan;
 
 			cond_resched();
@@ -569,7 +569,7 @@ keep:
 	list_splice(&ret_pages, page_list);
 	if (pagevec_count(&freed_pvec))
 		__pagevec_release_nonlru(&freed_pvec);
-	mod_page_state(pgactivate, pgactivate);
+	count_vm_events(PGACTIVATE, pgactivate);
 	return nr_reclaimed;
 }
 
@@ -659,11 +659,11 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
 		nr_reclaimed += nr_freed;
 		local_irq_disable();
 		if (current_is_kswapd()) {
-			__mod_page_state_zone(zone, pgscan_kswapd, nr_scan);
-			__mod_page_state(kswapd_steal, nr_freed);
+			__count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan);
+			__count_vm_events(KSWAPD_STEAL, nr_freed);
 		} else
-			__mod_page_state_zone(zone, pgscan_direct, nr_scan);
-		__mod_page_state_zone(zone, pgsteal, nr_freed);
+			__count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan);
+		__count_zone_vm_events(PGSTEAL, zone, nr_freed);
 
 		if (nr_taken == 0)
 			goto done;
@@ -841,11 +841,10 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 		}
 	}
 	zone->nr_active += pgmoved;
-	spin_unlock(&zone->lru_lock);
-	__mod_page_state_zone(zone, pgrefill, pgscanned);
-	__mod_page_state(pgdeactivate, pgdeactivate);
-	local_irq_enable();
+	__count_zone_vm_events(PGREFILL, zone, pgscanned);
+	__count_vm_events(PGDEACTIVATE, pgdeactivate);
+	spin_unlock_irq(&zone->lru_lock);
 
 	pagevec_release(&pvec);
 }
@@ -977,7 +976,7 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
 		.swappiness = vm_swappiness,
 	};
 
-	inc_page_state(allocstall);
+	count_vm_event(ALLOCSTALL);
 
 	for (i = 0; zones[i] != NULL; i++) {
 		struct zone *zone = zones[i];
@@ -1074,7 +1073,7 @@ loop_again:
 	total_scanned = 0;
 	nr_reclaimed = 0;
 	sc.may_writepage = !laptop_mode;
-	inc_page_state(pageoutrun);
+	count_vm_event(PAGEOUTRUN);
 
 	for (i = 0; i < pgdat->nr_zones; i++) {
 		struct zone *zone = pgdat->node_zones + i;
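The reclaim bookkeeping above credits scanned pages per zone to either the kswapd or the direct-reclaim scan counter, and credits freed pages per zone to the steal counter either way, with kswapd additionally tracking its own steal total. A plain-C model of that logic, zones reduced to an index and all names invented for the sketch:

    #include <stdbool.h>
    #include <stdio.h>

    enum { ZDMA, ZDMA32, ZNORMAL, ZHIGH, NZONES };

    static unsigned long pgscan_kswapd[NZONES];
    static unsigned long pgscan_direct[NZONES];
    static unsigned long pgsteal[NZONES];
    static unsigned long kswapd_steal;

    static void account_reclaim(int zone, unsigned long nr_scan,
                                unsigned long nr_freed, bool from_kswapd)
    {
        if (from_kswapd) {
            pgscan_kswapd[zone] += nr_scan;
            kswapd_steal += nr_freed;
        } else {
            pgscan_direct[zone] += nr_scan;
        }
        pgsteal[zone] += nr_freed; /* per-zone steal counted either way */
    }

    int main(void)
    {
        account_reclaim(ZNORMAL, 32, 20, true);  /* kswapd pass */
        account_reclaim(ZNORMAL, 32, 12, false); /* direct reclaim */
        printf("pgsteal_normal=%lu kswapd_steal=%lu\n",
               pgsteal[ZNORMAL], kswapd_steal); /* 32 and 20 */
        return 0;
    }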
diff --git a/mm/vmstat.c b/mm/vmstat.c
index ee7f8966625..73b83d67bab 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -13,66 +13,6 @@
 #include
 #include
 
-/*
- * Accumulate the page_state information across all CPUs.
- * The result is unavoidably approximate - it can change
- * during and after execution of this function.
- */
-DEFINE_PER_CPU(struct page_state, page_states) = {0};
-
-static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask)
-{
-	unsigned cpu;
-
-	memset(ret, 0, nr * sizeof(unsigned long));
-	cpus_and(*cpumask, *cpumask, cpu_online_map);
-
-	for_each_cpu_mask(cpu, *cpumask) {
-		unsigned long *in;
-		unsigned long *out;
-		unsigned off;
-		unsigned next_cpu;
-
-		in = (unsigned long *)&per_cpu(page_states, cpu);
-
-		next_cpu = next_cpu(cpu, *cpumask);
-		if (likely(next_cpu < NR_CPUS))
-			prefetch(&per_cpu(page_states, next_cpu));
-
-		out = (unsigned long *)ret;
-		for (off = 0; off < nr; off++)
-			*out++ += *in++;
-	}
-}
-
-void get_full_page_state(struct page_state *ret)
-{
-	cpumask_t mask = CPU_MASK_ALL;
-
-	__get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask);
-}
-
-void __mod_page_state_offset(unsigned long offset, unsigned long delta)
-{
-	void *ptr;
-
-	ptr = &__get_cpu_var(page_states);
-	*(unsigned long *)(ptr + offset) += delta;
-}
-EXPORT_SYMBOL(__mod_page_state_offset);
-
-void mod_page_state_offset(unsigned long offset, unsigned long delta)
-{
-	unsigned long flags;
-	void *ptr;
-
-	local_irq_save(flags);
-	ptr = &__get_cpu_var(page_states);
-	*(unsigned long *)(ptr + offset) += delta;
-	local_irq_restore(flags);
-}
-EXPORT_SYMBOL(mod_page_state_offset);
-
 void __get_zone_counts(unsigned long *active, unsigned long *inactive,
 			unsigned long *free, struct pglist_data *pgdat)
 {
@@ -106,6 +46,63 @@ void get_zone_counts(unsigned long *active,
 	}
 }
 
+#ifdef CONFIG_VM_EVENT_COUNTERS
+DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
+EXPORT_PER_CPU_SYMBOL(vm_event_states);
+
+static void sum_vm_events(unsigned long *ret, cpumask_t *cpumask)
+{
+	int cpu = 0;
+	int i;
+
+	memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));
+
+	cpu = first_cpu(*cpumask);
+	while (cpu < NR_CPUS) {
+		struct vm_event_state *this = &per_cpu(vm_event_states, cpu);
+
+		cpu = next_cpu(cpu, *cpumask);
+
+		if (cpu < NR_CPUS)
+			prefetch(&per_cpu(vm_event_states, cpu));
+
+		for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
+			ret[i] += this->event[i];
+	}
+}
+
+/*
+ * Accumulate the vm event counters across all CPUs.
+ * The result is unavoidably approximate - it can change
+ * during and after execution of this function.
+ */
+void all_vm_events(unsigned long *ret)
+{
+	sum_vm_events(ret, &cpu_online_map);
+}
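Since other cpus keep incrementing their slots while sum_vm_events() walks the mask, the totals are only approximate, which is acceptable for event counters by design. A userspace model of the fold; NCPUS and the sketch_ names are invented:

    #include <stdio.h>
    #include <string.h>

    #define NCPUS 4
    enum { PGPGIN, PGPGOUT, NR_EVENTS };

    static unsigned long event[NCPUS][NR_EVENTS];

    /* fold every cpu's slots into one array, like all_vm_events() */
    static void sketch_all_vm_events(unsigned long *ret)
    {
        memset(ret, 0, NR_EVENTS * sizeof(unsigned long));
        for (int cpu = 0; cpu < NCPUS; cpu++)
            for (int i = 0; i < NR_EVENTS; i++)
                ret[i] += event[cpu][i];
    }

    int main(void)
    {
        unsigned long total[NR_EVENTS];

        event[0][PGPGIN] = 10;
        event[3][PGPGIN] = 5;

        sketch_all_vm_events(total);
        printf("pgpgin=%lu\n", total[PGPGIN]); /* 15 */
        return 0;
    }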
+
+#ifdef CONFIG_HOTPLUG
+/*
+ * Fold the foreign cpu events into our own.
+ *
+ * This adds the foreign cpu's events to the current processor's
+ * counters, so the global totals stay constant.
+ */
+void vm_events_fold_cpu(int cpu)
+{
+	struct vm_event_state *fold_state = &per_cpu(vm_event_states, cpu);
+	int i;
+
+	for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
+		count_vm_events(i, fold_state->event[i]);
+		fold_state->event[i] = 0;
+	}
+}
+#endif /* CONFIG_HOTPLUG */
+
+#endif /* CONFIG_VM_EVENT_COUNTERS */
+
 /*
  * Manage combined zone based / global counters
 *
@@ -405,16 +402,16 @@ static char *vmstat_text[] = {
 	"numa_other",
 #endif
 
-	/* Event counters */
+#ifdef CONFIG_VM_EVENT_COUNTERS
 	"pgpgin",
 	"pgpgout",
 	"pswpin",
 	"pswpout",
 
-	"pgalloc_high",
-	"pgalloc_normal",
-	"pgalloc_dma32",
 	"pgalloc_dma",
+	"pgalloc_dma32",
+	"pgalloc_normal",
+	"pgalloc_high",
 
 	"pgfree",
 	"pgactivate",
@@ -423,25 +420,25 @@ static char *vmstat_text[] = {
 	"pgfault",
 	"pgmajfault",
 
-	"pgrefill_high",
-	"pgrefill_normal",
-	"pgrefill_dma32",
 	"pgrefill_dma",
+	"pgrefill_dma32",
+	"pgrefill_normal",
+	"pgrefill_high",
 
-	"pgsteal_high",
-	"pgsteal_normal",
-	"pgsteal_dma32",
 	"pgsteal_dma",
+	"pgsteal_dma32",
+	"pgsteal_normal",
+	"pgsteal_high",
 
-	"pgscan_kswapd_high",
-	"pgscan_kswapd_normal",
-	"pgscan_kswapd_dma32",
 	"pgscan_kswapd_dma",
+	"pgscan_kswapd_dma32",
+	"pgscan_kswapd_normal",
+	"pgscan_kswapd_high",
 
-	"pgscan_direct_high",
-	"pgscan_direct_normal",
-	"pgscan_direct_dma32",
 	"pgscan_direct_dma",
+	"pgscan_direct_dma32",
+	"pgscan_direct_normal",
+	"pgscan_direct_high",
 
 	"pginodesteal",
 	"slabs_scanned",
@@ -451,6 +448,7 @@ static char *vmstat_text[] = {
 	"allocstall",
 
 	"pgrotated",
+#endif
 };
 
 /*
@@ -553,23 +551,32 @@ struct seq_operations zoneinfo_op = {
 static void *vmstat_start(struct seq_file *m, loff_t *pos)
 {
 	unsigned long *v;
-	struct page_state *ps;
+#ifdef CONFIG_VM_EVENT_COUNTERS
+	unsigned long *e;
+#endif
 	int i;
 
 	if (*pos >= ARRAY_SIZE(vmstat_text))
 		return NULL;
 
+#ifdef CONFIG_VM_EVENT_COUNTERS
 	v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long)
-			+ sizeof(*ps), GFP_KERNEL);
+			+ sizeof(struct vm_event_state), GFP_KERNEL);
+#else
+	v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long),
+			GFP_KERNEL);
+#endif
 	m->private = v;
 	if (!v)
 		return ERR_PTR(-ENOMEM);
 	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
 		v[i] = global_page_state(i);
-	ps = (struct page_state *)(v + NR_VM_ZONE_STAT_ITEMS);
-	get_full_page_state(ps);
-	ps->pgpgin /= 2;		/* sectors -> kbytes */
-	ps->pgpgout /= 2;
+#ifdef CONFIG_VM_EVENT_COUNTERS
+	e = v + NR_VM_ZONE_STAT_ITEMS;
+	all_vm_events(e);
+	e[PGPGIN] /= 2;		/* sectors -> kbytes */
+	e[PGPGOUT] /= 2;
+#endif
 	return v + *pos;
 }
--
cgit v1.2.3-70-g09d2
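For reference, a userspace sketch of the buffer layout vmstat_start() builds for /proc/vmstat: the zone counters come first, the event counters are appended behind them, and pgpgin/pgpgout are halved to convert 512-byte sectors to kilobytes. The enum members and numbers are made up for the sketch:

    #include <stdio.h>

    enum { NR_FREE_PAGES, NR_ZONE_STATS };  /* zone counter indexes */
    enum { PGPGIN, PGPGOUT, NR_EVENTS };    /* event counter indexes */

    int main(void)
    {
        unsigned long v[NR_ZONE_STATS + NR_EVENTS];
        unsigned long *e = v + NR_ZONE_STATS; /* events follow zone stats */

        v[NR_FREE_PAGES] = 12345;
        e[PGPGIN] = 4096;  /* sectors read */
        e[PGPGOUT] = 2048; /* sectors written */

        e[PGPGIN] /= 2;    /* sectors -> kbytes */
        e[PGPGOUT] /= 2;

        printf("nr_free_pages %lu\npgpgin %lu\npgpgout %lu\n",
               v[NR_FREE_PAGES], e[PGPGIN], e[PGPGOUT]);
        return 0;
    }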