From 4008bab7b3969ad9f9dd1d02096a3f0aa5610bd2 Mon Sep 17 00:00:00 2001 From: Cody P Schafer Date: Wed, 3 Jul 2013 15:01:28 -0700 Subject: mm/page_alloc: factor out setting of pcp->high and pcp->batch "Problems" with the current code: 1: there is a lack of synchronization in setting ->high and ->batch in percpu_pagelist_fraction_sysctl_handler() 2: stop_machine() in zone_pcp_update() is unnecessary. 3: zone_pcp_update() does not consider the case where percpu_pagelist_fraction is non-zero To fix: 1: add memory barriers, a safe ->batch value, an update side mutex when updating ->high and ->batch, and use ACCESS_ONCE() for ->batch users that expect a stable value. 2: avoid draining pages in zone_pcp_update(), rely upon the memory barriers added to fix #1 3: factor out quite a few functions, and then call the appropriate one. Note that this results in a change to the behavior of zone_pcp_update(), which is used by memory_hotplug. I'm rather certain that I've discerned (and preserved) the essential behavior (changing ->high and ->batch), and only eliminated unneeded actions (draining the per cpu pages), but this may not be the case. Further note that the draining of pages that previously took place in zone_pcp_update() occurred after repeated draining when attempting to offline a page, and after the offline had "succeeded". It appears that the draining was added to zone_pcp_update() to avoid refactoring setup_pageset() into 2 functions. This patch: Creates pageset_set_batch() for use in setup_pageset(). pageset_set_batch() imitates the functionality of setup_pagelist_highmark(), but uses the boot time (percpu_pagelist_fraction == 0) calculations for determining ->high based on ->batch. Signed-off-by: Cody P Schafer Cc: Gilad Ben-Yossef Cc: KOSAKI Motohiro Cc: Mel Gorman Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c3edb624fcc..d4bcc20ab6f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4032,6 +4032,14 @@ static int __meminit zone_batchsize(struct zone *zone) #endif } +/* a companion to setup_pagelist_highmark() */ +static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch) +{ + struct per_cpu_pages *pcp = &p->pcp; + pcp->high = 6 * batch; + pcp->batch = max(1UL, 1 * batch); +} + static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) { struct per_cpu_pages *pcp; @@ -4041,8 +4049,7 @@ static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) pcp = &p->pcp; pcp->count = 0; - pcp->high = 6 * batch; - pcp->batch = max(1UL, 1 * batch); + pageset_set_batch(p, batch); for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) INIT_LIST_HEAD(&pcp->lists[migratetype]); } @@ -4051,7 +4058,6 @@ static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist * to the value high for the pageset p.
*/ - static void setup_pagelist_highmark(struct per_cpu_pageset *p, unsigned long high) { -- cgit v1.2.3-70-g09d2 From c8e251fadc6220261f6e0c6b8a4f1cdf27626165 Mon Sep 17 00:00:00 2001 From: Cody P Schafer Date: Wed, 3 Jul 2013 15:01:29 -0700 Subject: mm/page_alloc: prevent concurrent updaters of pcp ->batch and ->high Because we are going to rely upon a careful transition between old and new ->high and ->batch values using memory barriers and will remove stop_machine(), we need to prevent multiple updaters from interleaving their memory writes. Add a simple mutex to protect both update loops. Signed-off-by: Cody P Schafer Cc: Gilad Ben-Yossef Cc: KOSAKI Motohiro Cc: Mel Gorman Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d4bcc20ab6f..8d433577963 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -65,6 +65,9 @@ #include #include "internal.h" +/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ +static DEFINE_MUTEX(pcp_batch_high_lock); + #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID DEFINE_PER_CPU(int, numa_node); EXPORT_PER_CPU_SYMBOL(numa_node); @@ -5557,6 +5560,8 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, ret = proc_dointvec_minmax(table, write, buffer, length, ppos); if (!write || (ret < 0)) return ret; + + mutex_lock(&pcp_batch_high_lock); for_each_populated_zone(zone) { for_each_possible_cpu(cpu) { unsigned long high; @@ -5565,6 +5570,7 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, per_cpu_ptr(zone->pageset, cpu), high); } } + mutex_unlock(&pcp_batch_high_lock); return 0; } @@ -6078,7 +6084,9 @@ static int __meminit __zone_pcp_update(void *data) void __meminit zone_pcp_update(struct zone *zone) { + mutex_lock(&pcp_batch_high_lock); stop_machine(__zone_pcp_update, zone, NULL); + mutex_unlock(&pcp_batch_high_lock); } #endif -- cgit v1.2.3-70-g09d2 From 8d7a8fa97abeb4fd6b3975d32c9f859875157770 Mon Sep 17 00:00:00 2001 From: Cody P Schafer Date: Wed, 3 Jul 2013 15:01:31 -0700 Subject: mm/page_alloc: insert memory barriers to allow async update of pcp batch and high Introduce pageset_update() to perform a safe transition from one set of pcp->{batch,high} to a new set using memory barriers. This ensures that batch is always set to a safe value (1) prior to updating high, and ensures that high is fully updated before setting the real value of batch. It avoids ->batch ever rising above ->high. Suggested by Gilad Ben-Yossef in these threads: https://lkml.org/lkml/2013/4/9/23 https://lkml.org/lkml/2013/4/10/49 Also reproduces his proposed comment. Signed-off-by: Cody P Schafer Reviewed-by: Gilad Ben-Yossef Cc: KOSAKI Motohiro Cc: Mel Gorman Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 41 ++++++++++++++++++++++++++++++++--------- 1 file changed, 32 insertions(+), 9 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8d433577963..eaaef2a0942 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4035,12 +4035,37 @@ static int __meminit zone_batchsize(struct zone *zone) #endif } +/* + * pcp->high and pcp->batch values are related and dependent on one another: + * ->batch must never be higher than ->high. + * The following function updates them in a safe manner without read side + * locking.
+ * + * Any new users of pcp->batch and pcp->high should ensure they can cope with + * those fields changing asynchronously (according to the above rule). + * + * mutex_is_locked(&pcp_batch_high_lock) required when calling this function + * outside of boot time (or some other assurance that no concurrent updaters + * exist). + */ +static void pageset_update(struct per_cpu_pages *pcp, unsigned long high, + unsigned long batch) +{ + /* start with a fail safe value for batch */ + pcp->batch = 1; + smp_wmb(); + + /* Update high, then batch, in order */ + pcp->high = high; + smp_wmb(); + + pcp->batch = batch; +} + /* a companion to setup_pagelist_highmark() */ static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch) { - struct per_cpu_pages *pcp = &p->pcp; - pcp->high = 6 * batch; - pcp->batch = max(1UL, 1 * batch); + pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch)); } static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) @@ -4064,13 +4089,11 @@ static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) static void setup_pagelist_highmark(struct per_cpu_pageset *p, unsigned long high) { - struct per_cpu_pages *pcp; + unsigned long batch = max(1UL, high / 4); + if ((high / 4) > (PAGE_SHIFT * 8)) + batch = PAGE_SHIFT * 8; - pcp = &p->pcp; - pcp->high = high; - pcp->batch = max(1UL, high/4); - if ((high/4) > (PAGE_SHIFT * 8)) - pcp->batch = PAGE_SHIFT * 8; + pageset_update(&p->pcp, high, batch); } static void __meminit setup_zone_pageset(struct zone *zone) -- cgit v1.2.3-70-g09d2 From 998d39cb236fe464af86a3492a24d2f67ee1efc2 Mon Sep 17 00:00:00 2001 From: Cody P Schafer Date: Wed, 3 Jul 2013 15:01:32 -0700 Subject: mm/page_alloc: protect pcp->batch accesses with ACCESS_ONCE pcp->batch could change at any point, so avoid relying on it being a stable value. Signed-off-by: Cody P Schafer Cc: Gilad Ben-Yossef Cc: KOSAKI Motohiro Cc: Mel Gorman Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index eaaef2a0942..97b8f861e63 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1182,10 +1182,12 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) { unsigned long flags; int to_drain; + unsigned long batch; local_irq_save(flags); - if (pcp->count >= pcp->batch) - to_drain = pcp->batch; + batch = ACCESS_ONCE(pcp->batch); + if (pcp->count >= batch) + to_drain = batch; else to_drain = pcp->count; if (to_drain > 0) { @@ -1353,8 +1355,9 @@ void free_hot_cold_page(struct page *page, int cold) list_add(&page->lru, &pcp->lists[migratetype]); pcp->count++; if (pcp->count >= pcp->high) { - free_pcppages_bulk(zone, pcp->batch, pcp); - pcp->count -= pcp->batch; + unsigned long batch = ACCESS_ONCE(pcp->batch); + free_pcppages_bulk(zone, batch, pcp); + pcp->count -= batch; } out: -- cgit v1.2.3-70-g09d2 From 0a647f3811d6af56405a819341ceac23e31d4572 Mon Sep 17 00:00:00 2001 From: Cody P Schafer Date: Wed, 3 Jul 2013 15:01:33 -0700 Subject: mm/page_alloc: convert zone_pcp_update() to rely on memory barriers instead of stop_machine() zone_pcp_update()'s goal is to adjust the ->high and ->batch members of a percpu pageset based on a zone's ->managed_pages. We don't need to drain the entire percpu pageset just to modify these fields.
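The read-side rule stated in the comment above — take a single snapshot of ->batch and tolerate both fields changing asynchronously — is what drain_zone_pages() and free_hot_cold_page() follow via ACCESS_ONCE() in the preceding patch. A rough user-space sketch of that pattern, illustrative only: the struct fields mirror mm/page_alloc.c, SNAPSHOT() merely stands in for the kernel's ACCESS_ONCE(), and pages_to_drain() is a made-up helper rather than a kernel function.

/* Illustrative user-space sketch, not kernel code. */
#include <stdio.h>

struct per_cpu_pages {
	int count;	/* pages currently on the pcp list */
	int high;	/* flush the list above this watermark */
	int batch;	/* chunk size for adding/removing pages */
};

/* stand-in for the kernel's ACCESS_ONCE(): force a single, non-reloaded read */
#define SNAPSHOT(x) (*(volatile __typeof__(x) *)&(x))

/*
 * Decide how many pages to drain.  ->batch is read exactly once, so a
 * concurrent pageset_update() cannot hand us two different values within one
 * decision; the store ordering in pageset_update() is meant to keep any
 * observed ->batch from exceeding ->high.
 */
static int pages_to_drain(struct per_cpu_pages *pcp)
{
	int batch = SNAPSHOT(pcp->batch);

	return pcp->count >= batch ? batch : pcp->count;
}

int main(void)
{
	struct per_cpu_pages pcp = { .count = 90, .high = 186, .batch = 31 };

	printf("drain %d of %d pages\n", pages_to_drain(&pcp), pcp.count);
	return 0;
}

Snapshotting once is what lets the rest of this series drop stop_machine() from zone_pcp_update() and still leave readers with a consistent value for the duration of one drain decision.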
This lets us avoid calling setup_pageset() (and the draining required to call it) and instead allows simply setting the fields' values (with some attention paid to memory barriers to prevent the relationship between ->batch and ->high from being thrown off). This does change the behavior of zone_pcp_update() as the percpu pagesets will not be drained when zone_pcp_update() is called (they will end up being shrunk, not completely drained, later when a 0-order page is freed in free_hot_cold_page()). Signed-off-by: Cody P Schafer Cc: Gilad Ben-Yossef Cc: KOSAKI Motohiro Cc: Mel Gorman Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 33 +++++++++------------------------ 1 file changed, 9 insertions(+), 24 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 97b8f861e63..8125263be60 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6085,33 +6085,18 @@ void free_contig_range(unsigned long pfn, unsigned nr_pages) #endif #ifdef CONFIG_MEMORY_HOTPLUG -static int __meminit __zone_pcp_update(void *data) -{ - struct zone *zone = data; - int cpu; - unsigned long batch = zone_batchsize(zone), flags; - - for_each_possible_cpu(cpu) { - struct per_cpu_pageset *pset; - struct per_cpu_pages *pcp; - - pset = per_cpu_ptr(zone->pageset, cpu); - pcp = &pset->pcp; - - local_irq_save(flags); - if (pcp->count > 0) - free_pcppages_bulk(zone, pcp->count, pcp); - drain_zonestat(zone, pset); - setup_pageset(pset, batch); - local_irq_restore(flags); - } - return 0; -} - +/* + * The zone indicated has a new number of managed_pages; batch sizes and percpu + * page high values need to be recalulated. + */ void __meminit zone_pcp_update(struct zone *zone) { + unsigned cpu; + unsigned long batch; mutex_lock(&pcp_batch_high_lock); - stop_machine(__zone_pcp_update, zone, NULL); + batch = zone_batchsize(zone); + for_each_possible_cpu(cpu) + pageset_set_batch(per_cpu_ptr(zone->pageset, cpu), batch); mutex_unlock(&pcp_batch_high_lock); } #endif -- cgit v1.2.3-70-g09d2 From 22a7f12b1606327f0e11fcdf9043ae00bf9917df Mon Sep 17 00:00:00 2001 From: Cody P Schafer Date: Wed, 3 Jul 2013 15:01:34 -0700 Subject: mm/page_alloc: when handling percpu_pagelist_fraction, don't unneedly recalulate high Simply moves calculation of the new 'high' value outside the for_each_possible_cpu() loop, as it does not depend on the cpu. Signed-off-by: Cody P Schafer Cc: Gilad Ben-Yossef Cc: KOSAKI Motohiro Cc: Mel Gorman Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8125263be60..386de0f11be 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5575,7 +5575,6 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, * cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist * can have before it gets flushed back to buddy allocator. 
*/ - int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { @@ -5589,12 +5588,11 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, mutex_lock(&pcp_batch_high_lock); for_each_populated_zone(zone) { - for_each_possible_cpu(cpu) { - unsigned long high; - high = zone->managed_pages / percpu_pagelist_fraction; + unsigned long high; + high = zone->managed_pages / percpu_pagelist_fraction; + for_each_possible_cpu(cpu) setup_pagelist_highmark( - per_cpu_ptr(zone->pageset, cpu), high); - } + per_cpu_ptr(zone->pageset, cpu), high); } mutex_unlock(&pcp_batch_high_lock); return 0; -- cgit v1.2.3-70-g09d2 From 88c90dbccaaed35991b5336fec84294de1d23538 Mon Sep 17 00:00:00 2001 From: Cody P Schafer Date: Wed, 3 Jul 2013 15:01:35 -0700 Subject: mm/page_alloc: factor setup_pageset() into pageset_init() and pageset_set_batch() Signed-off-by: Cody P Schafer Cc: Gilad Ben-Yossef Cc: KOSAKI Motohiro Cc: Mel Gorman Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 386de0f11be..a235149d940 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4071,7 +4071,7 @@ static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch) pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch)); } -static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) +static void pageset_init(struct per_cpu_pageset *p) { struct per_cpu_pages *pcp; int migratetype; @@ -4080,11 +4080,16 @@ static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) pcp = &p->pcp; pcp->count = 0; - pageset_set_batch(p, batch); for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) INIT_LIST_HEAD(&pcp->lists[migratetype]); } +static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) +{ + pageset_init(p); + pageset_set_batch(p, batch); +} + /* * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist * to the value high for the pageset p. -- cgit v1.2.3-70-g09d2 From dd1895e2c5c9ed3a791d1d8eb4a6a3e241ec9d6e Mon Sep 17 00:00:00 2001 From: Cody P Schafer Date: Wed, 3 Jul 2013 15:01:36 -0700 Subject: mm/page_alloc: relocate comment to be directly above code it refers to. 
Signed-off-by: Cody P Schafer Cc: Gilad Ben-Yossef Cc: KOSAKI Motohiro Cc: Mel Gorman Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a235149d940..2793ce50f31 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3711,12 +3711,12 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) mminit_verify_zonelist(); cpuset_init_current_mems_allowed(); } else { - /* we have to stop all cpus to guarantee there is no user - of zonelist */ #ifdef CONFIG_MEMORY_HOTPLUG if (zone) setup_zone_pageset(zone); #endif + /* we have to stop all cpus to guarantee there is no user + of zonelist */ stop_machine(__build_all_zonelists, pgdat, NULL); /* cpuset refresh routine should be here */ } -- cgit v1.2.3-70-g09d2 From 56cef2b85c28d81efd39f2eeaddce28678756fe3 Mon Sep 17 00:00:00 2001 From: Cody P Schafer Date: Wed, 3 Jul 2013 15:01:38 -0700 Subject: mm/page_alloc: factor zone_pageset_init() out of setup_zone_pageset() Signed-off-by: Cody P Schafer Cc: Gilad Ben-Yossef Cc: KOSAKI Motohiro Cc: Mel Gorman Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2793ce50f31..ee6fe7faaba 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4104,22 +4104,25 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p, pageset_update(&p->pcp, high, batch); } +static void __meminit zone_pageset_init(struct zone *zone, int cpu) +{ + struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); + + pageset_init(pcp); + if (percpu_pagelist_fraction) + setup_pagelist_highmark(pcp, + (zone->managed_pages / + percpu_pagelist_fraction)); + else + pageset_set_batch(pcp, zone_batchsize(zone)); +} + static void __meminit setup_zone_pageset(struct zone *zone) { int cpu; - zone->pageset = alloc_percpu(struct per_cpu_pageset); - - for_each_possible_cpu(cpu) { - struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); - - setup_pageset(pcp, zone_batchsize(zone)); - - if (percpu_pagelist_fraction) - setup_pagelist_highmark(pcp, - (zone->managed_pages / - percpu_pagelist_fraction)); - } + for_each_possible_cpu(cpu) + zone_pageset_init(zone, cpu); } /* -- cgit v1.2.3-70-g09d2 From 737af4c0110fc69a81dc7464a74a4113f7645255 Mon Sep 17 00:00:00 2001 From: Cody P Schafer Date: Wed, 3 Jul 2013 15:01:39 -0700 Subject: mm/page_alloc: in zone_pcp_update(), uze zone_pageset_init() Previously, zone_pcp_update() called pageset_set_batch() directly, essentially assuming that percpu_pagelist_fraction == 0. Correct this by calling zone_pageset_init(), which chooses the appropriate ->batch and ->high calculations. 
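For reference, the two sizing policies that zone_pageset_init() selects between (shown in the hunks above) reduce to simple arithmetic. The sketch below is self-contained and illustrative only: the input numbers are made up, PAGE_SHIFT is assumed to be 12, and compute_boot_default()/compute_fraction_based() are hypothetical helper names, not kernel functions.

/* Illustrative user-space sketch of the two pcp sizing policies, not kernel code. */
#include <stdio.h>

#define PAGE_SHIFT 12	/* assumed 4K pages for this example */

static unsigned long max_ul(unsigned long a, unsigned long b)
{
	return a > b ? a : b;
}

/* percpu_pagelist_fraction == 0: pageset_set_batch() derives ->high from ->batch */
static void compute_boot_default(unsigned long batch,
				 unsigned long *high, unsigned long *out_batch)
{
	*high = 6 * batch;
	*out_batch = max_ul(1UL, batch);
}

/* percpu_pagelist_fraction != 0: setup_pagelist_highmark() derives ->batch from ->high */
static void compute_fraction_based(unsigned long high,
				   unsigned long *out_high, unsigned long *batch)
{
	*out_high = high;
	*batch = max_ul(1UL, high / 4);
	if (high / 4 > PAGE_SHIFT * 8)
		*batch = PAGE_SHIFT * 8;
}

int main(void)
{
	unsigned long high, batch;

	compute_boot_default(31, &high, &batch);	/* as if zone_batchsize() returned 31 */
	printf("boot default:   high=%lu batch=%lu\n", high, batch);

	compute_fraction_based(4096, &high, &batch);	/* as if managed_pages/fraction == 4096 */
	printf("fraction-based: high=%lu batch=%lu\n", high, batch);
	return 0;
}

Either way ->batch stays well below ->high, which is the invariant pageset_update() preserves while the values are switched over.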
Signed-off-by: Cody P Schafer Cc: Gilad Ben-Yossef Cc: KOSAKI Motohiro Cc: Mel Gorman Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ee6fe7faaba..c7344d17660 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6098,11 +6098,9 @@ void free_contig_range(unsigned long pfn, unsigned nr_pages) void __meminit zone_pcp_update(struct zone *zone) { unsigned cpu; - unsigned long batch; mutex_lock(&pcp_batch_high_lock); - batch = zone_batchsize(zone); for_each_possible_cpu(cpu) - pageset_set_batch(per_cpu_ptr(zone->pageset, cpu), batch); + zone_pageset_init(zone, cpu); mutex_unlock(&pcp_batch_high_lock); } #endif -- cgit v1.2.3-70-g09d2 From 3664033c56f211a3dcf28d9d68c604ed447d8d79 Mon Sep 17 00:00:00 2001 From: Cody P Schafer Date: Wed, 3 Jul 2013 15:01:40 -0700 Subject: mm/page_alloc: rename setup_pagelist_highmark() to match naming of pageset_set_batch() Signed-off-by: Cody P Schafer Cc: Gilad Ben-Yossef Cc: KOSAKI Motohiro Cc: Mel Gorman Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c7344d17660..03a3f943d98 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4065,7 +4065,7 @@ static void pageset_update(struct per_cpu_pages *pcp, unsigned long high, pcp->batch = batch; } -/* a companion to setup_pagelist_highmark() */ +/* a companion to pageset_set_high() */ static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch) { pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch)); @@ -4091,10 +4091,10 @@ static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) } /* - * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist + * pageset_set_high() sets the high water mark for hot per_cpu_pagelist * to the value high for the pageset p. */ -static void setup_pagelist_highmark(struct per_cpu_pageset *p, +static void pageset_set_high(struct per_cpu_pageset *p, unsigned long high) { unsigned long batch = max(1UL, high / 4); @@ -4110,7 +4110,7 @@ static void __meminit zone_pageset_init(struct zone *zone, int cpu) pageset_init(pcp); if (percpu_pagelist_fraction) - setup_pagelist_highmark(pcp, + pageset_set_high(pcp, (zone->managed_pages / percpu_pagelist_fraction)); else @@ -5599,8 +5599,8 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, unsigned long high; high = zone->managed_pages / percpu_pagelist_fraction; for_each_possible_cpu(cpu) - setup_pagelist_highmark( - per_cpu_ptr(zone->pageset, cpu), high); + pageset_set_high(per_cpu_ptr(zone->pageset, cpu), + high); } mutex_unlock(&pcp_batch_high_lock); return 0; -- cgit v1.2.3-70-g09d2 From 169f6c1999ca6d0c5e06e8d810817ed3d1ebf017 Mon Sep 17 00:00:00 2001 From: Cody P Schafer Date: Wed, 3 Jul 2013 15:01:41 -0700 Subject: mm/page_alloc: don't re-init pageset in zone_pcp_update() When memory hotplug is triggered, we call pageset_init() on per-cpu-pagesets which both contain pages and are in use, causing both the leakage of those pages and (potentially) bad behaviour if a page is allocated from a pageset while it is being cleared. 
Avoid this by factoring out pageset_set_high_and_batch() (which contains all needed logic too set a pageset's ->high and ->batch inrespective of system state) from zone_pageset_init() and using the new pageset_set_high_and_batch() instead of zone_pageset_init() in zone_pcp_update(). Signed-off-by: Cody P Schafer Cc: Valdis Kletnieks Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 03a3f943d98..fab9506273b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4104,11 +4104,9 @@ static void pageset_set_high(struct per_cpu_pageset *p, pageset_update(&p->pcp, high, batch); } -static void __meminit zone_pageset_init(struct zone *zone, int cpu) +static void __meminit pageset_set_high_and_batch(struct zone *zone, + struct per_cpu_pageset *pcp) { - struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); - - pageset_init(pcp); if (percpu_pagelist_fraction) pageset_set_high(pcp, (zone->managed_pages / @@ -4117,6 +4115,14 @@ static void __meminit zone_pageset_init(struct zone *zone, int cpu) pageset_set_batch(pcp, zone_batchsize(zone)); } +static void __meminit zone_pageset_init(struct zone *zone, int cpu) +{ + struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); + + pageset_init(pcp); + pageset_set_high_and_batch(zone, pcp); +} + static void __meminit setup_zone_pageset(struct zone *zone) { int cpu; @@ -6100,7 +6106,8 @@ void __meminit zone_pcp_update(struct zone *zone) unsigned cpu; mutex_lock(&pcp_batch_high_lock); for_each_possible_cpu(cpu) - zone_pageset_init(zone, cpu); + pageset_set_high_and_batch(zone, + per_cpu_ptr(zone->pageset, cpu)); mutex_unlock(&pcp_batch_high_lock); } #endif -- cgit v1.2.3-70-g09d2 From dacbde0963d62a4962d5e8a5cc38dfd1f016124b Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Wed, 3 Jul 2013 15:02:35 -0700 Subject: mm/page_alloc.c: add additional checking and return value for the 'table->data' - check the length of the procfs data before copying it into a fixed size array. - when __parse_numa_zonelist_order() fails, save the error code for return. - 'char*' --> 'char *' coding style fix Signed-off-by: Chen Gang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fab9506273b..a662c74a0f5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3256,18 +3256,25 @@ int numa_zonelist_order_handler(ctl_table *table, int write, static DEFINE_MUTEX(zl_order_mutex); mutex_lock(&zl_order_mutex); - if (write) - strcpy(saved_string, (char*)table->data); + if (write) { + if (strlen((char *)table->data) >= NUMA_ZONELIST_ORDER_LEN) { + ret = -EINVAL; + goto out; + } + strcpy(saved_string, (char *)table->data); + } ret = proc_dostring(table, write, buffer, length, ppos); if (ret) goto out; if (write) { int oldval = user_zonelist_order; - if (__parse_numa_zonelist_order((char*)table->data)) { + + ret = __parse_numa_zonelist_order((char *)table->data); + if (ret) { /* * bogus value. 
restore saved string */ - strncpy((char*)table->data, saved_string, + strncpy((char *)table->data, saved_string, NUMA_ZONELIST_ORDER_LEN); user_zonelist_order = oldval; } else if (oldval != user_zonelist_order) { -- cgit v1.2.3-70-g09d2 From cea27eb2a202959783f81254c48c250ddd80e129 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 3 Jul 2013 15:02:40 -0700 Subject: mm/memory-hotplug: fix lowmem count overflow when offline pages The logic for the memory-remove code fails to correctly account the Total High Memory when a memory block which contains High Memory is offlined as shown in the example below. The following patch fixes it. Before logic memory remove: MemTotal: 7603740 kB MemFree: 6329612 kB Buffers: 94352 kB Cached: 872008 kB SwapCached: 0 kB Active: 626932 kB Inactive: 519216 kB Active(anon): 180776 kB Inactive(anon): 222944 kB Active(file): 446156 kB Inactive(file): 296272 kB Unevictable: 0 kB Mlocked: 0 kB HighTotal: 7294672 kB HighFree: 5704696 kB LowTotal: 309068 kB LowFree: 624916 kB After logic memory remove: MemTotal: 7079452 kB MemFree: 5805976 kB Buffers: 94372 kB Cached: 872000 kB SwapCached: 0 kB Active: 626936 kB Inactive: 519236 kB Active(anon): 180780 kB Inactive(anon): 222944 kB Active(file): 446156 kB Inactive(file): 296292 kB Unevictable: 0 kB Mlocked: 0 kB HighTotal: 7294672 kB HighFree: 5181024 kB LowTotal: 4294752076 kB LowFree: 624952 kB [mhocko@suse.cz: fix CONFIG_HIGHMEM=n build] Signed-off-by: Wanpeng Li Reviewed-by: Michal Hocko Cc: KAMEZAWA Hiroyuki Cc: David Rientjes Cc: [2.6.24+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a662c74a0f5..d711dcdda36 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6185,6 +6185,10 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) list_del(&page->lru); rmv_page_order(page); zone->free_area[order].nr_free--; +#ifdef CONFIG_HIGHMEM + if (PageHighMem(page)) + totalhigh_pages -= 1 << order; +#endif for (i = 0; i < (1 << order); i++) SetPageReserved((page+i)); pfn += (1 << order); -- cgit v1.2.3-70-g09d2 From 11199692d83dd3fe1511203024fb9853d176ec4c Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Wed, 3 Jul 2013 15:02:48 -0700 Subject: mm: change signature of free_reserved_area() to fix building warnings Change signature of free_reserved_area() according to Russell King's suggestion to fix following build warnings: arch/arm/mm/init.c: In function 'mem_init': arch/arm/mm/init.c:603:2: warning: passing argument 1 of 'free_reserved_area' makes integer from pointer without a cast [enabled by default] free_reserved_area(__va(PHYS_PFN_OFFSET), swapper_pg_dir, 0, NULL); ^ In file included from include/linux/mman.h:4:0, from arch/arm/mm/init.c:15: include/linux/mm.h:1301:22: note: expected 'long unsigned int' but argument is of type 'void *' extern unsigned long free_reserved_area(unsigned long start, unsigned long end, mm/page_alloc.c: In function 'free_reserved_area': >> mm/page_alloc.c:5134:3: warning: passing argument 1 of 'virt_to_phys' makes pointer from integer without a cast [enabled by default] In file included from arch/mips/include/asm/page.h:49:0, from include/linux/mmzone.h:20, from include/linux/gfp.h:4, from include/linux/mm.h:8, from mm/page_alloc.c:18: arch/mips/include/asm/io.h:119:29: note: expected 'const volatile void *' but argument is of type 'long unsigned int' mm/page_alloc.c: In function 'free_area_init_nodes': 
mm/page_alloc.c:5030:34: warning: array subscript is below array bounds [-Warray-bounds] Also address some minor code review comments. Signed-off-by: Jiang Liu Reported-by: Arnd Bergmann Cc: "H. Peter Anvin" Cc: "Michael S. Tsirkin" Cc: Cc: Catalin Marinas Cc: Chris Metcalf Cc: David Howells Cc: Geert Uytterhoeven Cc: Ingo Molnar Cc: Jeremy Fitzhardinge Cc: Jianguo Wu Cc: Joonsoo Kim Cc: Kamezawa Hiroyuki Cc: Konrad Rzeszutek Wilk Cc: Marek Szyprowski Cc: Mel Gorman Cc: Michel Lespinasse Cc: Minchan Kim Cc: Rik van Riel Cc: Rusty Russell Cc: Tang Chen Cc: Tejun Heo Cc: Thomas Gleixner Cc: Wen Congyang Cc: Will Deacon Cc: Yasuaki Ishimatsu Cc: Yinghai Lu Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/kernel/sys_nautilus.c | 4 ++-- arch/alpha/mm/init.c | 2 +- arch/arc/mm/init.c | 2 +- arch/arm/mm/init.c | 2 +- arch/arm64/mm/init.c | 2 +- arch/avr32/mm/init.c | 2 +- arch/blackfin/mm/init.c | 2 +- arch/c6x/mm/init.c | 2 +- arch/frv/mm/init.c | 2 +- arch/h8300/mm/init.c | 2 +- arch/ia64/mm/init.c | 3 +-- arch/m32r/mm/init.c | 2 +- arch/m68k/mm/init.c | 2 +- arch/metag/mm/init.c | 3 ++- arch/microblaze/mm/init.c | 2 +- arch/mips/mm/init.c | 3 ++- arch/mn10300/mm/init.c | 3 ++- arch/openrisc/mm/init.c | 2 +- arch/parisc/mm/init.c | 3 ++- arch/powerpc/kernel/kvm.c | 2 +- arch/powerpc/mm/mem.c | 2 +- arch/s390/mm/init.c | 3 ++- arch/score/mm/init.c | 3 ++- arch/sh/mm/init.c | 2 +- arch/sparc/mm/init_32.c | 4 ++-- arch/sparc/mm/init_64.c | 4 ++-- arch/um/kernel/mem.c | 2 +- arch/unicore32/mm/init.c | 2 +- arch/xtensa/mm/init.c | 2 +- include/linux/mm.h | 5 ++--- mm/page_alloc.c | 19 ++++++++++--------- 31 files changed, 50 insertions(+), 45 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/arch/alpha/kernel/sys_nautilus.c b/arch/alpha/kernel/sys_nautilus.c index 1d4aabfcf9a..891bd274ccb 100644 --- a/arch/alpha/kernel/sys_nautilus.c +++ b/arch/alpha/kernel/sys_nautilus.c @@ -238,8 +238,8 @@ nautilus_init_pci(void) if (pci_mem < memtop) memtop = pci_mem; if (memtop > alpha_mv.min_mem_address) { - free_reserved_area((unsigned long)__va(alpha_mv.min_mem_address), - (unsigned long)__va(memtop), 0, NULL); + free_reserved_area(__va(alpha_mv.min_mem_address), + __va(memtop), 0, NULL); printk("nautilus_init_pci: %ldk freed\n", (memtop - alpha_mv.min_mem_address) >> 10); } diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c index 0ba85ee4a46..d54848d4e46 100644 --- a/arch/alpha/mm/init.c +++ b/arch/alpha/mm/init.c @@ -326,6 +326,6 @@ free_initmem(void) void free_initrd_mem(unsigned long start, unsigned long end) { - free_reserved_area(start, end, 0, "initrd"); + free_reserved_area((void *)start, (void *)end, 0, "initrd"); } #endif diff --git a/arch/arc/mm/init.c b/arch/arc/mm/init.c index 4a177365b2c..dce02e4716a 100644 --- a/arch/arc/mm/init.c +++ b/arch/arc/mm/init.c @@ -152,7 +152,7 @@ void __init_refok free_initmem(void) #ifdef CONFIG_BLK_DEV_INITRD void __init free_initrd_mem(unsigned long start, unsigned long end) { - free_reserved_area(start, end, 0, "initrd"); + free_reserved_area((void *)start, (void *)end, 0, "initrd"); } #endif diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index 2ffee02d1d5..7fae391caf8 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -745,7 +745,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) { if (!keep_initrd) { poison_init_mem((void *)start, PAGE_ALIGN(end) - start); - free_reserved_area(start, end, 0, "initrd"); + free_reserved_area((void *)start, (void *)end, 0, "initrd"); } } diff 
--git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index f497ca77925..6041e4008a8 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -398,7 +398,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) { if (!keep_initrd) { poison_init_mem((void *)start, PAGE_ALIGN(end) - start); - free_reserved_area(start, end, 0, "initrd"); + free_reserved_area((void *)start, (void *)end, 0, "initrd"); } } diff --git a/arch/avr32/mm/init.c b/arch/avr32/mm/init.c index e66e8406f99..5a79fa08cb3 100644 --- a/arch/avr32/mm/init.c +++ b/arch/avr32/mm/init.c @@ -154,6 +154,6 @@ void free_initmem(void) #ifdef CONFIG_BLK_DEV_INITRD void free_initrd_mem(unsigned long start, unsigned long end) { - free_reserved_area(start, end, 0, "initrd"); + free_reserved_area((void *)start, (void *)end, 0, "initrd"); } #endif diff --git a/arch/blackfin/mm/init.c b/arch/blackfin/mm/init.c index 82d01a71207..8e9eab27281 100644 --- a/arch/blackfin/mm/init.c +++ b/arch/blackfin/mm/init.c @@ -133,7 +133,7 @@ void __init mem_init(void) void __init free_initrd_mem(unsigned long start, unsigned long end) { #ifndef CONFIG_MPU - free_reserved_area(start, end, 0, "initrd"); + free_reserved_area((void *)start, (void *)end, 0, "initrd"); #endif } #endif diff --git a/arch/c6x/mm/init.c b/arch/c6x/mm/init.c index b74ccb5a769..07bfcc98a3b 100644 --- a/arch/c6x/mm/init.c +++ b/arch/c6x/mm/init.c @@ -78,7 +78,7 @@ void __init mem_init(void) #ifdef CONFIG_BLK_DEV_INITRD void __init free_initrd_mem(unsigned long start, unsigned long end) { - free_reserved_area(start, end, 0, "initrd"); + free_reserved_area((void *)start, (void *)end, 0, "initrd"); } #endif diff --git a/arch/frv/mm/init.c b/arch/frv/mm/init.c index dee354fa6b6..a67f3a5897b 100644 --- a/arch/frv/mm/init.c +++ b/arch/frv/mm/init.c @@ -173,6 +173,6 @@ void free_initmem(void) #ifdef CONFIG_BLK_DEV_INITRD void __init free_initrd_mem(unsigned long start, unsigned long end) { - free_reserved_area(start, end, 0, "initrd"); + free_reserved_area((void *)start, (void *)end, 0, "initrd"); } /* end free_initrd_mem() */ #endif diff --git a/arch/h8300/mm/init.c b/arch/h8300/mm/init.c index ff349d70a29..57e03c59861 100644 --- a/arch/h8300/mm/init.c +++ b/arch/h8300/mm/init.c @@ -161,7 +161,7 @@ void __init mem_init(void) #ifdef CONFIG_BLK_DEV_INITRD void free_initrd_mem(unsigned long start, unsigned long end) { - free_reserved_area(start, end, 0, "initrd"); + free_reserved_area((void *)start, (void *)end, 0, "initrd"); } #endif diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index d1fe4b40260..da568c2e839 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -154,8 +154,7 @@ ia64_init_addr_space (void) void free_initmem (void) { - free_reserved_area((unsigned long)ia64_imva(__init_begin), - (unsigned long)ia64_imva(__init_end), + free_reserved_area(ia64_imva(__init_begin), ia64_imva(__init_end), 0, "unused kernel"); } diff --git a/arch/m32r/mm/init.c b/arch/m32r/mm/init.c index ab4cbce91a9..d80412d0c14 100644 --- a/arch/m32r/mm/init.c +++ b/arch/m32r/mm/init.c @@ -191,6 +191,6 @@ void free_initmem(void) *======================================================================*/ void free_initrd_mem(unsigned long start, unsigned long end) { - free_reserved_area(start, end, 0, "initrd"); + free_reserved_area((void *)start, (void *)end, 0, "initrd"); } #endif diff --git a/arch/m68k/mm/init.c b/arch/m68k/mm/init.c index 1af2ca3411f..95de725534e 100644 --- a/arch/m68k/mm/init.c +++ b/arch/m68k/mm/init.c @@ -202,6 +202,6 @@ void __init mem_init(void) #ifdef 
CONFIG_BLK_DEV_INITRD void free_initrd_mem(unsigned long start, unsigned long end) { - free_reserved_area(start, end, 0, "initrd"); + free_reserved_area((void *)start, (void *)end, 0, "initrd"); } #endif diff --git a/arch/metag/mm/init.c b/arch/metag/mm/init.c index d05b8455c44..5e2238dd72e 100644 --- a/arch/metag/mm/init.c +++ b/arch/metag/mm/init.c @@ -414,7 +414,8 @@ void free_initmem(void) #ifdef CONFIG_BLK_DEV_INITRD void free_initrd_mem(unsigned long start, unsigned long end) { - free_reserved_area(start, end, POISON_FREE_INITMEM, "initrd"); + free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM, + "initrd"); } #endif diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c index b38ae3acfeb..d7b8ada9345 100644 --- a/arch/microblaze/mm/init.c +++ b/arch/microblaze/mm/init.c @@ -235,7 +235,7 @@ void __init setup_memory(void) #ifdef CONFIG_BLK_DEV_INITRD void free_initrd_mem(unsigned long start, unsigned long end) { - free_reserved_area(start, end, 0, "initrd"); + free_reserved_area((void *)start, (void *)end, 0, "initrd"); } #endif diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c index 9b973e0af9c..268f2a94031 100644 --- a/arch/mips/mm/init.c +++ b/arch/mips/mm/init.c @@ -440,7 +440,8 @@ void free_init_pages(const char *what, unsigned long begin, unsigned long end) #ifdef CONFIG_BLK_DEV_INITRD void free_initrd_mem(unsigned long start, unsigned long end) { - free_reserved_area(start, end, POISON_FREE_INITMEM, "initrd"); + free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM, + "initrd"); } #endif diff --git a/arch/mn10300/mm/init.c b/arch/mn10300/mm/init.c index 5a8ace63a6b..e19049d1f2b 100644 --- a/arch/mn10300/mm/init.c +++ b/arch/mn10300/mm/init.c @@ -152,6 +152,7 @@ void free_initmem(void) #ifdef CONFIG_BLK_DEV_INITRD void free_initrd_mem(unsigned long start, unsigned long end) { - free_reserved_area(start, end, POISON_FREE_INITMEM, "initrd"); + free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM, + "initrd"); } #endif diff --git a/arch/openrisc/mm/init.c b/arch/openrisc/mm/init.c index b3cbc670383..ab113325dc4 100644 --- a/arch/openrisc/mm/init.c +++ b/arch/openrisc/mm/init.c @@ -261,7 +261,7 @@ void __init mem_init(void) #ifdef CONFIG_BLK_DEV_INITRD void free_initrd_mem(unsigned long start, unsigned long end) { - free_reserved_area(start, end, 0, "initrd"); + free_reserved_area((void *)start, (void *)end, 0, "initrd"); } #endif diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c index 505b56c6b9b..3223d5e4a37 100644 --- a/arch/parisc/mm/init.c +++ b/arch/parisc/mm/init.c @@ -1101,6 +1101,7 @@ void flush_tlb_all(void) #ifdef CONFIG_BLK_DEV_INITRD void free_initrd_mem(unsigned long start, unsigned long end) { - num_physpages += free_reserved_area(start, end, 0, "initrd"); + num_physpages += free_reserved_area((void *)start, (void *)end, 0, + "initrd"); } #endif diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c index 6782221d49b..5e4830a33c0 100644 --- a/arch/powerpc/kernel/kvm.c +++ b/arch/powerpc/kernel/kvm.c @@ -756,7 +756,7 @@ static __init void kvm_free_tmp(void) end = (ulong)&kvm_tmp[ARRAY_SIZE(kvm_tmp)] & PAGE_MASK; /* Free the tmp space we don't need */ - free_reserved_area(start, end, 0, NULL); + free_reserved_area((void *)start, (void *)end, 0, NULL); } static int __init kvm_guest_init(void) diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 0988a26e041..347c5b1bbd6 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -407,7 +407,7 @@ void free_initmem(void) #ifdef 
CONFIG_BLK_DEV_INITRD void __init free_initrd_mem(unsigned long start, unsigned long end) { - free_reserved_area(start, end, 0, "initrd"); + free_reserved_area((void *)start, (void *)end, 0, "initrd"); } #endif diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 89ebae4008f..0878c89fe7d 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -172,7 +172,8 @@ void free_initmem(void) #ifdef CONFIG_BLK_DEV_INITRD void __init free_initrd_mem(unsigned long start, unsigned long end) { - free_reserved_area(start, end, POISON_FREE_INITMEM, "initrd"); + free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM, + "initrd"); } #endif diff --git a/arch/score/mm/init.c b/arch/score/mm/init.c index 0940682ab38..f5dd61eb454 100644 --- a/arch/score/mm/init.c +++ b/arch/score/mm/init.c @@ -108,7 +108,8 @@ void __init mem_init(void) #ifdef CONFIG_BLK_DEV_INITRD void free_initrd_mem(unsigned long start, unsigned long end) { - free_reserved_area(start, end, POISON_FREE_INITMEM, "initrd"); + free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM, + "initrd"); } #endif diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index 20f9ead650d..b892a9b7d7e 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -505,7 +505,7 @@ void free_initmem(void) #ifdef CONFIG_BLK_DEV_INITRD void free_initrd_mem(unsigned long start, unsigned long end) { - free_reserved_area(start, end, 0, "initrd"); + free_reserved_area((void *)start, (void *)end, 0, "initrd"); } #endif diff --git a/arch/sparc/mm/init_32.c b/arch/sparc/mm/init_32.c index af472cf7c69..d5f9c023826 100644 --- a/arch/sparc/mm/init_32.c +++ b/arch/sparc/mm/init_32.c @@ -372,8 +372,8 @@ void free_initmem (void) #ifdef CONFIG_BLK_DEV_INITRD void free_initrd_mem(unsigned long start, unsigned long end) { - num_physpages += free_reserved_area(start, end, POISON_FREE_INITMEM, - "initrd"); + num_physpages += free_reserved_area((void *)start, (void *)end, + POISON_FREE_INITMEM, "initrd"); } #endif diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 04fd55a6e46..8269deb84ed 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -2131,8 +2131,8 @@ void free_initmem(void) #ifdef CONFIG_BLK_DEV_INITRD void free_initrd_mem(unsigned long start, unsigned long end) { - num_physpages += free_reserved_area(start, end, POISON_FREE_INITMEM, - "initrd"); + num_physpages += free_reserved_area((void *)start, (void *)end, + POISON_FREE_INITMEM, "initrd"); } #endif diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c index 9df292b270a..2aa7a2448d5 100644 --- a/arch/um/kernel/mem.c +++ b/arch/um/kernel/mem.c @@ -244,7 +244,7 @@ void free_initmem(void) #ifdef CONFIG_BLK_DEV_INITRD void free_initrd_mem(unsigned long start, unsigned long end) { - free_reserved_area(start, end, 0, "initrd"); + free_reserved_area((void *)start, (void *)end, 0, "initrd"); } #endif diff --git a/arch/unicore32/mm/init.c b/arch/unicore32/mm/init.c index 63df12d71ce..220755cc970 100644 --- a/arch/unicore32/mm/init.c +++ b/arch/unicore32/mm/init.c @@ -486,7 +486,7 @@ static int keep_initrd; void free_initrd_mem(unsigned long start, unsigned long end) { if (!keep_initrd) - free_reserved_area(start, end, 0, "initrd"); + free_reserved_area((void *)start, (void *)end, 0, "initrd"); } static int __init keepinitrd_setup(char *__unused) diff --git a/arch/xtensa/mm/init.c b/arch/xtensa/mm/init.c index bba125b4bb0..4d658efc328 100644 --- a/arch/xtensa/mm/init.c +++ b/arch/xtensa/mm/init.c @@ -214,7 +214,7 @@ extern int initrd_is_mapped; void 
free_initrd_mem(unsigned long start, unsigned long end) { if (initrd_is_mapped) - free_reserved_area(start, end, 0, "initrd"); + free_reserved_area((void *)start, (void *)end, 0, "initrd"); } #endif diff --git a/include/linux/mm.h b/include/linux/mm.h index 949bd703589..be1b96ce065 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1311,7 +1311,7 @@ extern void free_initmem(void); * "poison" if it's non-zero. * Return pages freed into the buddy system. */ -extern unsigned long free_reserved_area(unsigned long start, unsigned long end, +extern unsigned long free_reserved_area(void *start, void *end, int poison, char *s); #ifdef CONFIG_HIGHMEM /* @@ -1355,8 +1355,7 @@ static inline unsigned long free_initmem_default(int poison) { extern char __init_begin[], __init_end[]; - return free_reserved_area(PAGE_ALIGN((unsigned long)&__init_begin) , - ((unsigned long)&__init_end) & PAGE_MASK, + return free_reserved_area(&__init_begin, &__init_end, poison, "unused kernel"); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d711dcdda36..be18ccd017b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5206,25 +5206,26 @@ early_param("movablecore", cmdline_parse_movablecore); #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ -unsigned long free_reserved_area(unsigned long start, unsigned long end, - int poison, char *s) +unsigned long free_reserved_area(void *start, void *end, int poison, char *s) { - unsigned long pages, pos; + void *pos; + unsigned long pages = 0; - pos = start = PAGE_ALIGN(start); - end &= PAGE_MASK; - for (pages = 0; pos < end; pos += PAGE_SIZE, pages++) { + start = (void *)PAGE_ALIGN((unsigned long)start); + end = (void *)((unsigned long)end & PAGE_MASK); + for (pos = start; pos < end; pos += PAGE_SIZE, pages++) { if (poison) - memset((void *)pos, poison, PAGE_SIZE); - free_reserved_page(virt_to_page((void *)pos)); + memset(pos, poison, PAGE_SIZE); + free_reserved_page(virt_to_page(pos)); } if (pages && s) - pr_info("Freeing %s memory: %ldK (%lx - %lx)\n", + pr_info("Freeing %s memory: %ldK (%p - %p)\n", s, pages << (PAGE_SHIFT - 10), start, end); return pages; } +EXPORT_SYMBOL(free_reserved_area); #ifdef CONFIG_HIGHMEM void free_highmem_page(struct page *page) -- cgit v1.2.3-70-g09d2 From dbe67df4ba78c79db547c7864e1120981c144c97 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Wed, 3 Jul 2013 15:02:51 -0700 Subject: mm: enhance free_reserved_area() to support poisoning memory with zero Address more review comments from last round of code review. 1) Enhance free_reserved_area() to support poisoning freed memory with pattern '0'. This could be used to get rid of poison_init_mem() on ARM64. 2) A previous patch has disabled memory poison for initmem on s390 by mistake, so restore to the original behavior. 3) Remove redundant PAGE_ALIGN() when calling free_reserved_area(). Signed-off-by: Jiang Liu Cc: Geert Uytterhoeven Cc: "H. Peter Anvin" Cc: "Michael S. 
Tsirkin" Cc: Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Chris Metcalf Cc: David Howells Cc: Ingo Molnar Cc: Jeremy Fitzhardinge Cc: Jianguo Wu Cc: Joonsoo Kim Cc: Kamezawa Hiroyuki Cc: Konrad Rzeszutek Wilk Cc: Marek Szyprowski Cc: Mel Gorman Cc: Michel Lespinasse Cc: Minchan Kim Cc: Rik van Riel Cc: Rusty Russell Cc: Tang Chen Cc: Tejun Heo Cc: Thomas Gleixner Cc: Wen Congyang Cc: Will Deacon Cc: Yasuaki Ishimatsu Cc: Yinghai Lu Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/kernel/sys_nautilus.c | 2 +- arch/alpha/mm/init.c | 4 ++-- arch/arc/mm/init.c | 4 ++-- arch/arm/mm/init.c | 8 ++++---- arch/arm64/mm/init.c | 4 ++-- arch/avr32/mm/init.c | 4 ++-- arch/blackfin/mm/init.c | 4 ++-- arch/c6x/mm/init.c | 4 ++-- arch/cris/mm/init.c | 2 +- arch/frv/mm/init.c | 4 ++-- arch/h8300/mm/init.c | 4 ++-- arch/ia64/mm/init.c | 2 +- arch/m32r/mm/init.c | 4 ++-- arch/m68k/mm/init.c | 4 ++-- arch/microblaze/mm/init.c | 4 ++-- arch/openrisc/mm/init.c | 4 ++-- arch/parisc/mm/init.c | 4 ++-- arch/powerpc/kernel/kvm.c | 9 ++------- arch/powerpc/mm/mem.c | 2 +- arch/s390/mm/init.c | 2 +- arch/sh/mm/init.c | 4 ++-- arch/um/kernel/mem.c | 2 +- arch/unicore32/mm/init.c | 4 ++-- arch/xtensa/mm/init.c | 4 ++-- include/linux/mm.h | 7 ++++--- mm/page_alloc.c | 2 +- 26 files changed, 49 insertions(+), 53 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/arch/alpha/kernel/sys_nautilus.c b/arch/alpha/kernel/sys_nautilus.c index 891bd274ccb..837c0fa5831 100644 --- a/arch/alpha/kernel/sys_nautilus.c +++ b/arch/alpha/kernel/sys_nautilus.c @@ -239,7 +239,7 @@ nautilus_init_pci(void) memtop = pci_mem; if (memtop > alpha_mv.min_mem_address) { free_reserved_area(__va(alpha_mv.min_mem_address), - __va(memtop), 0, NULL); + __va(memtop), -1, NULL); printk("nautilus_init_pci: %ldk freed\n", (memtop - alpha_mv.min_mem_address) >> 10); } diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c index d54848d4e46..218c29c14bb 100644 --- a/arch/alpha/mm/init.c +++ b/arch/alpha/mm/init.c @@ -319,13 +319,13 @@ mem_init(void) void free_initmem(void) { - free_initmem_default(0); + free_initmem_default(-1); } #ifdef CONFIG_BLK_DEV_INITRD void free_initrd_mem(unsigned long start, unsigned long end) { - free_reserved_area((void *)start, (void *)end, 0, "initrd"); + free_reserved_area((void *)start, (void *)end, -1, "initrd"); } #endif diff --git a/arch/arc/mm/init.c b/arch/arc/mm/init.c index dce02e4716a..f9c70771209 100644 --- a/arch/arc/mm/init.c +++ b/arch/arc/mm/init.c @@ -146,13 +146,13 @@ void __init mem_init(void) */ void __init_refok free_initmem(void) { - free_initmem_default(0); + free_initmem_default(-1); } #ifdef CONFIG_BLK_DEV_INITRD void __init free_initrd_mem(unsigned long start, unsigned long end) { - free_reserved_area((void *)start, (void *)end, 0, "initrd"); + free_reserved_area((void *)start, (void *)end, -1, "initrd"); } #endif diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index 7fae391caf8..2070651c1bb 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -601,7 +601,7 @@ void __init mem_init(void) #ifdef CONFIG_SA1111 /* now that our DMA memory is actually so designated, we can free it */ - free_reserved_area(__va(PHYS_PFN_OFFSET), swapper_pg_dir, 0, NULL); + free_reserved_area(__va(PHYS_PFN_OFFSET), swapper_pg_dir, -1, NULL); #endif free_highpages(); @@ -729,12 +729,12 @@ void free_initmem(void) extern char __tcm_start, __tcm_end; poison_init_mem(&__tcm_start, &__tcm_end - &__tcm_start); - free_reserved_area(&__tcm_start, &__tcm_end, 0, "TCM link"); + 
free_reserved_area(&__tcm_start, &__tcm_end, -1, "TCM link"); #endif poison_init_mem(__init_begin, __init_end - __init_begin); if (!machine_is_integrator() && !machine_is_cintegrator()) - free_initmem_default(0); + free_initmem_default(-1); } #ifdef CONFIG_BLK_DEV_INITRD @@ -745,7 +745,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) { if (!keep_initrd) { poison_init_mem((void *)start, PAGE_ALIGN(end) - start); - free_reserved_area((void *)start, (void *)end, 0, "initrd"); + free_reserved_area((void *)start, (void *)end, -1, "initrd"); } } diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 6041e4008a8..997c6345cdd 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -387,7 +387,7 @@ void __init mem_init(void) void free_initmem(void) { poison_init_mem(__init_begin, __init_end - __init_begin); - free_initmem_default(0); + free_initmem_default(-1); } #ifdef CONFIG_BLK_DEV_INITRD @@ -398,7 +398,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) { if (!keep_initrd) { poison_init_mem((void *)start, PAGE_ALIGN(end) - start); - free_reserved_area((void *)start, (void *)end, 0, "initrd"); + free_reserved_area((void *)start, (void *)end, -1, "initrd"); } } diff --git a/arch/avr32/mm/init.c b/arch/avr32/mm/init.c index 5a79fa08cb3..b079e04f695 100644 --- a/arch/avr32/mm/init.c +++ b/arch/avr32/mm/init.c @@ -148,12 +148,12 @@ void __init mem_init(void) void free_initmem(void) { - free_initmem_default(0); + free_initmem_default(-1); } #ifdef CONFIG_BLK_DEV_INITRD void free_initrd_mem(unsigned long start, unsigned long end) { - free_reserved_area((void *)start, (void *)end, 0, "initrd"); + free_reserved_area((void *)start, (void *)end, -1, "initrd"); } #endif diff --git a/arch/blackfin/mm/init.c b/arch/blackfin/mm/init.c index 8e9eab27281..fa241f5a7dc 100644 --- a/arch/blackfin/mm/init.c +++ b/arch/blackfin/mm/init.c @@ -133,7 +133,7 @@ void __init mem_init(void) void __init free_initrd_mem(unsigned long start, unsigned long end) { #ifndef CONFIG_MPU - free_reserved_area((void *)start, (void *)end, 0, "initrd"); + free_reserved_area((void *)start, (void *)end, -1, "initrd"); #endif } #endif @@ -141,7 +141,7 @@ void __init free_initrd_mem(unsigned long start, unsigned long end) void __init_refok free_initmem(void) { #if defined CONFIG_RAMKERNEL && !defined CONFIG_MPU - free_initmem_default(0); + free_initmem_default(-1); if (memory_start == (unsigned long)(&__init_end)) memory_start = (unsigned long)(&__init_begin); #endif diff --git a/arch/c6x/mm/init.c b/arch/c6x/mm/init.c index 07bfcc98a3b..3987a20fdee 100644 --- a/arch/c6x/mm/init.c +++ b/arch/c6x/mm/init.c @@ -78,11 +78,11 @@ void __init mem_init(void) #ifdef CONFIG_BLK_DEV_INITRD void __init free_initrd_mem(unsigned long start, unsigned long end) { - free_reserved_area((void *)start, (void *)end, 0, "initrd"); + free_reserved_area((void *)start, (void *)end, -1, "initrd"); } #endif void __init free_initmem(void) { - free_initmem_default(0); + free_initmem_default(-1); } diff --git a/arch/cris/mm/init.c b/arch/cris/mm/init.c index 9ac80946dad..8fec26392ae 100644 --- a/arch/cris/mm/init.c +++ b/arch/cris/mm/init.c @@ -65,5 +65,5 @@ mem_init(void) void free_initmem(void) { - free_initmem_default(0); + free_initmem_default(-1); } diff --git a/arch/frv/mm/init.c b/arch/frv/mm/init.c index a67f3a5897b..8ba9d22d0d9 100644 --- a/arch/frv/mm/init.c +++ b/arch/frv/mm/init.c @@ -162,7 +162,7 @@ void __init mem_init(void) void free_initmem(void) { #if defined(CONFIG_RAMKERNEL) && 
!defined(CONFIG_PROTECT_KERNEL) - free_initmem_default(0); + free_initmem_default(-1); #endif } /* end free_initmem() */ @@ -173,6 +173,6 @@ void free_initmem(void) #ifdef CONFIG_BLK_DEV_INITRD void __init free_initrd_mem(unsigned long start, unsigned long end) { - free_reserved_area((void *)start, (void *)end, 0, "initrd"); + free_reserved_area((void *)start, (void *)end, -1, "initrd"); } /* end free_initrd_mem() */ #endif diff --git a/arch/h8300/mm/init.c b/arch/h8300/mm/init.c index 57e03c59861..c831f1dba13 100644 --- a/arch/h8300/mm/init.c +++ b/arch/h8300/mm/init.c @@ -161,7 +161,7 @@ void __init mem_init(void) #ifdef CONFIG_BLK_DEV_INITRD void free_initrd_mem(unsigned long start, unsigned long end) { - free_reserved_area((void *)start, (void *)end, 0, "initrd"); + free_reserved_area((void *)start, (void *)end, -1, "initrd"); } #endif @@ -169,7 +169,7 @@ void free_initmem(void) { #ifdef CONFIG_RAMKERNEL - free_initmem_default(0); + free_initmem_default(-1); #endif } diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index da568c2e839..f8a4f38b0ad 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -155,7 +155,7 @@ void free_initmem (void) { free_reserved_area(ia64_imva(__init_begin), ia64_imva(__init_end), - 0, "unused kernel"); + -1, "unused kernel"); } void __init diff --git a/arch/m32r/mm/init.c b/arch/m32r/mm/init.c index d80412d0c14..cca87d91843 100644 --- a/arch/m32r/mm/init.c +++ b/arch/m32r/mm/init.c @@ -181,7 +181,7 @@ void __init mem_init(void) *======================================================================*/ void free_initmem(void) { - free_initmem_default(0); + free_initmem_default(-1); } #ifdef CONFIG_BLK_DEV_INITRD @@ -191,6 +191,6 @@ void free_initmem(void) *======================================================================*/ void free_initrd_mem(unsigned long start, unsigned long end) { - free_reserved_area((void *)start, (void *)end, 0, "initrd"); + free_reserved_area((void *)start, (void *)end, -1, "initrd"); } #endif diff --git a/arch/m68k/mm/init.c b/arch/m68k/mm/init.c index 95de725534e..ab0b54ca5d8 100644 --- a/arch/m68k/mm/init.c +++ b/arch/m68k/mm/init.c @@ -110,7 +110,7 @@ void __init paging_init(void) void free_initmem(void) { #ifndef CONFIG_MMU_SUN3 - free_initmem_default(0); + free_initmem_default(-1); #endif /* CONFIG_MMU_SUN3 */ } @@ -202,6 +202,6 @@ void __init mem_init(void) #ifdef CONFIG_BLK_DEV_INITRD void free_initrd_mem(unsigned long start, unsigned long end) { - free_reserved_area((void *)start, (void *)end, 0, "initrd"); + free_reserved_area((void *)start, (void *)end, -1, "initrd"); } #endif diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c index d7b8ada9345..d149e0ebb76 100644 --- a/arch/microblaze/mm/init.c +++ b/arch/microblaze/mm/init.c @@ -235,13 +235,13 @@ void __init setup_memory(void) #ifdef CONFIG_BLK_DEV_INITRD void free_initrd_mem(unsigned long start, unsigned long end) { - free_reserved_area((void *)start, (void *)end, 0, "initrd"); + free_reserved_area((void *)start, (void *)end, -1, "initrd"); } #endif void free_initmem(void) { - free_initmem_default(0); + free_initmem_default(-1); } void __init mem_init(void) diff --git a/arch/openrisc/mm/init.c b/arch/openrisc/mm/init.c index ab113325dc4..c371e4a0fca 100644 --- a/arch/openrisc/mm/init.c +++ b/arch/openrisc/mm/init.c @@ -261,11 +261,11 @@ void __init mem_init(void) #ifdef CONFIG_BLK_DEV_INITRD void free_initrd_mem(unsigned long start, unsigned long end) { - free_reserved_area((void *)start, (void *)end, 0, "initrd"); + 
free_reserved_area((void *)start, (void *)end, -1, "initrd"); } #endif void free_initmem(void) { - free_initmem_default(0); + free_initmem_default(-1); } diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c index 3223d5e4a37..ebac7bd76b5 100644 --- a/arch/parisc/mm/init.c +++ b/arch/parisc/mm/init.c @@ -532,7 +532,7 @@ void free_initmem(void) * pages are no-longer executable */ flush_icache_range(init_begin, init_end); - num_physpages += free_initmem_default(0); + num_physpages += free_initmem_default(-1); /* set up a new led state on systems shipped LED State panel */ pdc_chassis_send_status(PDC_CHASSIS_DIRECT_BCOMPLETE); @@ -1101,7 +1101,7 @@ void flush_tlb_all(void) #ifdef CONFIG_BLK_DEV_INITRD void free_initrd_mem(unsigned long start, unsigned long end) { - num_physpages += free_reserved_area((void *)start, (void *)end, 0, + num_physpages += free_reserved_area((void *)start, (void *)end, -1, "initrd"); } #endif diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c index 5e4830a33c0..db28032e320 100644 --- a/arch/powerpc/kernel/kvm.c +++ b/arch/powerpc/kernel/kvm.c @@ -750,13 +750,8 @@ EXPORT_SYMBOL_GPL(kvm_hypercall); static __init void kvm_free_tmp(void) { - unsigned long start, end; - - start = (ulong)&kvm_tmp[kvm_tmp_index + (PAGE_SIZE - 1)] & PAGE_MASK; - end = (ulong)&kvm_tmp[ARRAY_SIZE(kvm_tmp)] & PAGE_MASK; - - /* Free the tmp space we don't need */ - free_reserved_area((void *)start, (void *)end, 0, NULL); + free_reserved_area(&kvm_tmp[kvm_tmp_index], + &kvm_tmp[ARRAY_SIZE(kvm_tmp)], -1, NULL); } static int __init kvm_guest_init(void) diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 347c5b1bbd6..7f47a05f55a 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -407,7 +407,7 @@ void free_initmem(void) #ifdef CONFIG_BLK_DEV_INITRD void __init free_initrd_mem(unsigned long start, unsigned long end) { - free_reserved_area((void *)start, (void *)end, 0, "initrd"); + free_reserved_area((void *)start, (void *)end, -1, "initrd"); } #endif diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 0878c89fe7d..bf01d18422e 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -166,7 +166,7 @@ void __init mem_init(void) void free_initmem(void) { - free_initmem_default(0); + free_initmem_default(POISON_FREE_INITMEM); } #ifdef CONFIG_BLK_DEV_INITRD diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index b892a9b7d7e..d3af56b7a09 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -499,13 +499,13 @@ void __init mem_init(void) void free_initmem(void) { - free_initmem_default(0); + free_initmem_default(-1); } #ifdef CONFIG_BLK_DEV_INITRD void free_initrd_mem(unsigned long start, unsigned long end) { - free_reserved_area((void *)start, (void *)end, 0, "initrd"); + free_reserved_area((void *)start, (void *)end, -1, "initrd"); } #endif diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c index 2aa7a2448d5..8ff0b7ae8ec 100644 --- a/arch/um/kernel/mem.c +++ b/arch/um/kernel/mem.c @@ -244,7 +244,7 @@ void free_initmem(void) #ifdef CONFIG_BLK_DEV_INITRD void free_initrd_mem(unsigned long start, unsigned long end) { - free_reserved_area((void *)start, (void *)end, 0, "initrd"); + free_reserved_area((void *)start, (void *)end, -1, "initrd"); } #endif diff --git a/arch/unicore32/mm/init.c b/arch/unicore32/mm/init.c index 220755cc970..df9b8abcb6a 100644 --- a/arch/unicore32/mm/init.c +++ b/arch/unicore32/mm/init.c @@ -476,7 +476,7 @@ void __init mem_init(void) void free_initmem(void) { - free_initmem_default(0); + 
free_initmem_default(-1); } #ifdef CONFIG_BLK_DEV_INITRD @@ -486,7 +486,7 @@ static int keep_initrd; void free_initrd_mem(unsigned long start, unsigned long end) { if (!keep_initrd) - free_reserved_area((void *)start, (void *)end, 0, "initrd"); + free_reserved_area((void *)start, (void *)end, -1, "initrd"); } static int __init keepinitrd_setup(char *__unused) diff --git a/arch/xtensa/mm/init.c b/arch/xtensa/mm/init.c index 4d658efc328..026d29bee30 100644 --- a/arch/xtensa/mm/init.c +++ b/arch/xtensa/mm/init.c @@ -214,11 +214,11 @@ extern int initrd_is_mapped; void free_initrd_mem(unsigned long start, unsigned long end) { if (initrd_is_mapped) - free_reserved_area((void *)start, (void *)end, 0, "initrd"); + free_reserved_area((void *)start, (void *)end, -1, "initrd"); } #endif void free_initmem(void) { - free_initmem_default(0); + free_initmem_default(-1); } diff --git a/include/linux/mm.h b/include/linux/mm.h index be1b96ce065..083cc0ba238 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1308,7 +1308,7 @@ extern void free_initmem(void); /* * Free reserved pages within range [PAGE_ALIGN(start), end & PAGE_MASK) * into the buddy system. The freed pages will be poisoned with pattern - * "poison" if it's non-zero. + * "poison" if it's within range [0, UCHAR_MAX]. * Return pages freed into the buddy system. */ extern unsigned long free_reserved_area(void *start, void *end, @@ -1348,8 +1348,9 @@ static inline void mark_page_reserved(struct page *page) /* * Default method to free all the __init memory into the buddy system. - * The freed pages will be poisoned with pattern "poison" if it is - * non-zero. Return pages freed into the buddy system. + * The freed pages will be poisoned with pattern "poison" if it's within + * range [0, UCHAR_MAX]. + * Return pages freed into the buddy system. */ static inline unsigned long free_initmem_default(int poison) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index be18ccd017b..6780b2e18aa 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5214,7 +5214,7 @@ unsigned long free_reserved_area(void *start, void *end, int poison, char *s) start = (void *)PAGE_ALIGN((unsigned long)start); end = (void *)((unsigned long)end & PAGE_MASK); for (pos = start; pos < end; pos += PAGE_SIZE, pages++) { - if (poison) + if ((unsigned int)poison <= 0xFF) memset(pos, poison, PAGE_SIZE); free_reserved_page(virt_to_page(pos)); } -- cgit v1.2.3-70-g09d2 From 834405c3b6aebf6853663796401cdfe11aac6275 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Wed, 3 Jul 2013 15:03:04 -0700 Subject: mm: fix some trivial typos in comments Fix some trivial typos in comments. Signed-off-by: Jiang Liu Cc: Wen Congyang Cc: Tang Chen Cc: Yasuaki Ishimatsu Cc: Mel Gorman Cc: Minchan Kim Cc: Marek Szyprowski Cc: "H. Peter Anvin" Cc: "Michael S. 
Tsirkin" Cc: Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Chris Metcalf Cc: David Howells Cc: Geert Uytterhoeven Cc: Ingo Molnar Cc: Jeremy Fitzhardinge Cc: Jianguo Wu Cc: Joonsoo Kim Cc: Kamezawa Hiroyuki Cc: Konrad Rzeszutek Wilk Cc: Michel Lespinasse Cc: Rik van Riel Cc: Rusty Russell Cc: Tejun Heo Cc: Thomas Gleixner Cc: Will Deacon Cc: Yinghai Lu Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 2 +- mm/page_alloc.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index e3097f299f6..6096cb91873 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -309,7 +309,7 @@ static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2, /* can't move pfns which are higher than @z2 */ if (end_pfn > zone_end_pfn(z2)) goto out_fail; - /* the move out part mast at the left most of @z2 */ + /* the move out part must be at the left most of @z2 */ if (start_pfn > z2->zone_start_pfn) goto out_fail; /* must included/overlap */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6780b2e18aa..657daea88aa 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2845,7 +2845,7 @@ EXPORT_SYMBOL(free_pages_exact); * nr_free_zone_pages() counts the number of counts pages which are beyond the * high watermark within all zones at or below a given zone index. For each * zone, the number of pages is calculated as: - * present_pages - high_pages + * managed_pages - high_pages */ static unsigned long nr_free_zone_pages(int offset) { -- cgit v1.2.3-70-g09d2 From 4f9f47745e948eca18bb97c82dbb4d53f2380086 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Wed, 3 Jul 2013 15:03:07 -0700 Subject: mm: use managed_pages to calculate default zonelist order Use zone->managed_pages instead of zone->present_pages to calculate default zonelist order because managed_pages means allocatable pages. Signed-off-by: Jiang Liu Cc: Mel Gorman Cc: Minchan Kim Cc: Marek Szyprowski Cc: "H. Peter Anvin" Cc: "Michael S. Tsirkin" Cc: Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Chris Metcalf Cc: David Howells Cc: Geert Uytterhoeven Cc: Ingo Molnar Cc: Jeremy Fitzhardinge Cc: Jianguo Wu Cc: Joonsoo Kim Cc: Kamezawa Hiroyuki Cc: Konrad Rzeszutek Wilk Cc: Michel Lespinasse Cc: Rik van Riel Cc: Rusty Russell Cc: Tang Chen Cc: Tejun Heo Cc: Thomas Gleixner Cc: Wen Congyang Cc: Will Deacon Cc: Yasuaki Ishimatsu Cc: Yinghai Lu Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 657daea88aa..f22542f6dc1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3438,8 +3438,8 @@ static int default_zonelist_order(void) z = &NODE_DATA(nid)->node_zones[zone_type]; if (populated_zone(z)) { if (zone_type < ZONE_NORMAL) - low_kmem_size += z->present_pages; - total_size += z->present_pages; + low_kmem_size += z->managed_pages; + total_size += z->managed_pages; } else if (zone_type == ZONE_NORMAL) { /* * If any node has only lowmem, then node order -- cgit v1.2.3-70-g09d2 From 7b4b2a0d6c8500350784beb83a6a55e60ea3bea3 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Wed, 3 Jul 2013 15:03:11 -0700 Subject: mm: accurately calculate zone->managed_pages for highmem zones Commit "mm: introduce new field 'managed_pages' to struct zone" assumes that all highmem pages will be freed into the buddy system by function mem_init(). 
But that's not always true, some architectures may reserve some highmem pages during boot. For example PPC may allocate highmem pages for giagant HugeTLB pages, and several architectures have code to check PageReserved flag to exclude highmem pages allocated during boot when freeing highmem pages into the buddy system. So treat highmem pages in the same way as normal pages, that is to: 1) reset zone->managed_pages to zero in mem_init(). 2) recalculate managed_pages when freeing pages into the buddy system. Signed-off-by: Jiang Liu Cc: "H. Peter Anvin" Cc: Tejun Heo Cc: Joonsoo Kim Cc: Yinghai Lu Cc: Mel Gorman Cc: Minchan Kim Cc: Kamezawa Hiroyuki Cc: Marek Szyprowski Cc: "Michael S. Tsirkin" Cc: Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Chris Metcalf Cc: David Howells Cc: Geert Uytterhoeven Cc: Ingo Molnar Cc: Jeremy Fitzhardinge Cc: Jianguo Wu Cc: Konrad Rzeszutek Wilk Cc: Michel Lespinasse Cc: Rik van Riel Cc: Rusty Russell Cc: Tang Chen Cc: Thomas Gleixner Cc: Wen Congyang Cc: Will Deacon Cc: Yasuaki Ishimatsu Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/metag/mm/init.c | 6 ++++++ arch/x86/mm/highmem_32.c | 6 ++++++ include/linux/bootmem.h | 1 + mm/bootmem.c | 32 ++++++++++++++++++-------------- mm/nobootmem.c | 30 ++++++++++++++++-------------- mm/page_alloc.c | 1 + 6 files changed, 48 insertions(+), 28 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/arch/metag/mm/init.c b/arch/metag/mm/init.c index 5e2238dd72e..d7595f58fad 100644 --- a/arch/metag/mm/init.c +++ b/arch/metag/mm/init.c @@ -380,6 +380,12 @@ void __init mem_init(void) #ifdef CONFIG_HIGHMEM unsigned long tmp; + + /* + * Explicitly reset zone->managed_pages because highmem pages are + * freed before calling free_all_bootmem_node(); + */ + reset_all_zones_managed_pages(); for (tmp = highstart_pfn; tmp < highend_pfn; tmp++) free_highmem_page(pfn_to_page(tmp)); num_physpages += totalhigh_pages; diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 252b8f5489b..4500142bc4a 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -1,6 +1,7 @@ #include #include #include /* for totalram_pages */ +#include void *kmap(struct page *page) { @@ -121,6 +122,11 @@ void __init set_highmem_pages_init(void) struct zone *zone; int nid; + /* + * Explicitly reset zone->managed_pages because set_highmem_pages_init() + * is invoked before free_all_bootmem() + */ + reset_all_zones_managed_pages(); for_each_zone(zone) { unsigned long zone_start_pfn, zone_end_pfn; diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index 5f0b0e1f7c0..0e48c3221d8 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h @@ -46,6 +46,7 @@ extern unsigned long init_bootmem(unsigned long addr, unsigned long memend); extern unsigned long free_all_bootmem_node(pg_data_t *pgdat); extern unsigned long free_all_bootmem(void); +extern void reset_all_zones_managed_pages(void); extern void free_bootmem_node(pg_data_t *pgdat, unsigned long addr, diff --git a/mm/bootmem.c b/mm/bootmem.c index 2b0bcb019ec..eb792323187 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -241,20 +241,26 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) return count; } -static void reset_node_lowmem_managed_pages(pg_data_t *pgdat) +static int reset_managed_pages_done __initdata; + +static inline void __init reset_node_managed_pages(pg_data_t *pgdat) { struct zone *z; - /* - * In free_area_init_core(), highmem zone's managed_pages is set to - * present_pages, and bootmem 
allocator doesn't allocate from highmem - * zones. So there's no need to recalculate managed_pages because all - * highmem pages will be managed by the buddy system. Here highmem - * zone also includes highmem movable zone. - */ + if (reset_managed_pages_done) + return; + for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) - if (!is_highmem(z)) - z->managed_pages = 0; + z->managed_pages = 0; +} + +void __init reset_all_zones_managed_pages(void) +{ + struct pglist_data *pgdat; + + for_each_online_pgdat(pgdat) + reset_node_managed_pages(pgdat); + reset_managed_pages_done = 1; } /** @@ -266,7 +272,7 @@ static void reset_node_lowmem_managed_pages(pg_data_t *pgdat) unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) { register_page_bootmem_info_node(pgdat); - reset_node_lowmem_managed_pages(pgdat); + reset_node_managed_pages(pgdat); return free_all_bootmem_core(pgdat->bdata); } @@ -279,10 +285,8 @@ unsigned long __init free_all_bootmem(void) { unsigned long total_pages = 0; bootmem_data_t *bdata; - struct pglist_data *pgdat; - for_each_online_pgdat(pgdat) - reset_node_lowmem_managed_pages(pgdat); + reset_all_zones_managed_pages(); list_for_each_entry(bdata, &bdata_list, list) total_pages += free_all_bootmem_core(bdata); diff --git a/mm/nobootmem.c b/mm/nobootmem.c index bdd3fa2fc73..0ae8d91365a 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -137,20 +137,25 @@ static unsigned long __init free_low_memory_core_early(void) return count; } -static void reset_node_lowmem_managed_pages(pg_data_t *pgdat) +static int reset_managed_pages_done __initdata; + +static inline void __init reset_node_managed_pages(pg_data_t *pgdat) { struct zone *z; - /* - * In free_area_init_core(), highmem zone's managed_pages is set to - * present_pages, and bootmem allocator doesn't allocate from highmem - * zones. So there's no need to recalculate managed_pages because all - * highmem pages will be managed by the buddy system. Here highmem - * zone also includes highmem movable zone. - */ + if (reset_managed_pages_done) + return; for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) - if (!is_highmem(z)) - z->managed_pages = 0; + z->managed_pages = 0; +} + +void __init reset_all_zones_managed_pages(void) +{ + struct pglist_data *pgdat; + + for_each_online_pgdat(pgdat) + reset_node_managed_pages(pgdat); + reset_managed_pages_done = 1; } /** @@ -160,10 +165,7 @@ static void reset_node_lowmem_managed_pages(pg_data_t *pgdat) */ unsigned long __init free_all_bootmem(void) { - struct pglist_data *pgdat; - - for_each_online_pgdat(pgdat) - reset_node_lowmem_managed_pages(pgdat); + reset_all_zones_managed_pages(); /* * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f22542f6dc1..22438eba00b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5232,6 +5232,7 @@ void free_highmem_page(struct page *page) { __free_reserved_page(page); totalram_pages++; + page_zone(page)->managed_pages++; totalhigh_pages++; } #endif -- cgit v1.2.3-70-g09d2 From c3d5f5f0c2bc4eabeaf49f1a21e1aeb965246cd2 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Wed, 3 Jul 2013 15:03:14 -0700 Subject: mm: use a dedicated lock to protect totalram_pages and zone->managed_pages Currently lock_memory_hotplug()/unlock_memory_hotplug() are used to protect totalram_pages and zone->managed_pages. 
Other than the memory hotplug driver, totalram_pages and zone->managed_pages may also be modified at runtime by other drivers, such as Xen balloon, virtio_balloon etc. For those cases, memory hotplug lock is a little too heavy, so introduce a dedicated lock to protect totalram_pages and zone->managed_pages. Now we have a simplified locking rules totalram_pages and zone->managed_pages as: 1) no locking for read accesses because they are unsigned long. 2) no locking for write accesses at boot time in single-threaded context. 3) serialize write accesses at runtime by acquiring the dedicated managed_page_count_lock. Also adjust zone->managed_pages when freeing reserved pages into the buddy system, to keep totalram_pages and zone->managed_pages in consistence. [akpm@linux-foundation.org: don't export adjust_managed_page_count to modules (for now)] Signed-off-by: Jiang Liu Cc: Mel Gorman Cc: Michel Lespinasse Cc: Rik van Riel Cc: Minchan Kim Cc: "H. Peter Anvin" Cc: "Michael S. Tsirkin" Cc: Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Chris Metcalf Cc: David Howells Cc: Geert Uytterhoeven Cc: Ingo Molnar Cc: Jeremy Fitzhardinge Cc: Jianguo Wu Cc: Joonsoo Kim Cc: Kamezawa Hiroyuki Cc: Konrad Rzeszutek Wilk Cc: Marek Szyprowski Cc: Rusty Russell Cc: Tang Chen Cc: Tejun Heo Cc: Thomas Gleixner Cc: Wen Congyang Cc: Will Deacon Cc: Yasuaki Ishimatsu Cc: Yinghai Lu Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 6 ++---- include/linux/mmzone.h | 14 ++++++++++---- mm/page_alloc.c | 11 +++++++++++ 3 files changed, 23 insertions(+), 8 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/include/linux/mm.h b/include/linux/mm.h index 083cc0ba238..4310f80ce95 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1313,6 +1313,7 @@ extern void free_initmem(void); */ extern unsigned long free_reserved_area(void *start, void *end, int poison, char *s); + #ifdef CONFIG_HIGHMEM /* * Free a highmem page into the buddy system, adjusting totalhigh_pages @@ -1321,10 +1322,7 @@ extern unsigned long free_reserved_area(void *start, void *end, extern void free_highmem_page(struct page *page); #endif -static inline void adjust_managed_page_count(struct page *page, long count) -{ - totalram_pages += count; -} +extern void adjust_managed_page_count(struct page *page, long count); /* Free the reserved page into the buddy system, so it gets managed. */ static inline void __free_reserved_page(struct page *page) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index e511f9429f1..09d381b71fd 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -474,10 +474,16 @@ struct zone { * frequently read in proximity to zone->lock. It's good to * give them a chance of being in the same cacheline. * - * Write access to present_pages and managed_pages at runtime should - * be protected by lock_memory_hotplug()/unlock_memory_hotplug(). - * Any reader who can't tolerant drift of present_pages and - * managed_pages should hold memory hotplug lock to get a stable value. + * Write access to present_pages at runtime should be protected by + * lock_memory_hotplug()/unlock_memory_hotplug(). Any reader who can't + * tolerant drift of present_pages should hold memory hotplug lock to + * get a stable value. + * + * Read access to managed_pages should be safe because it's unsigned + * long. Write access to zone->managed_pages and totalram_pages are + * protected by managed_page_count_lock at runtime. 
Idealy only + * adjust_managed_page_count() should be used instead of directly + * touching zone->managed_pages and totalram_pages. */ unsigned long spanned_pages; unsigned long present_pages; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 22438eba00b..93f292a60cb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -103,6 +103,9 @@ nodemask_t node_states[NR_NODE_STATES] __read_mostly = { }; EXPORT_SYMBOL(node_states); +/* Protect totalram_pages and zone->managed_pages */ +static DEFINE_SPINLOCK(managed_page_count_lock); + unsigned long totalram_pages __read_mostly; unsigned long totalreserve_pages __read_mostly; /* @@ -5206,6 +5209,14 @@ early_param("movablecore", cmdline_parse_movablecore); #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ +void adjust_managed_page_count(struct page *page, long count) +{ + spin_lock(&managed_page_count_lock); + page_zone(page)->managed_pages += count; + totalram_pages += count; + spin_unlock(&managed_page_count_lock); +} + unsigned long free_reserved_area(void *start, void *end, int poison, char *s) { void *pos; -- cgit v1.2.3-70-g09d2 From 170a5a7eb2bf10161197e5490fbc29ca4561aedb Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Wed, 3 Jul 2013 15:03:17 -0700 Subject: mm: make __free_pages_bootmem() only available at boot time In order to simpilify management of totalram_pages and zone->managed_pages, make __free_pages_bootmem() only available at boot time. With this change applied, __free_pages_bootmem() will only be used by bootmem.c and nobootmem.c at boot time, so mark it as __init. Other callers of __free_pages_bootmem() have been converted to use free_reserved_page(), which handles totalram_pages and zone->managed_pages in a safer way. This patch also fix a bug in free_pagetable() for x86_64, which should increase zone->managed_pages instead of zone->present_pages when freeing reserved pages. And now we have managed_pages_count_lock to protect totalram_pages and zone->managed_pages, so remove the redundant ppb_lock lock in put_page_bootmem(). This greatly simplifies the locking rules. Signed-off-by: Jiang Liu Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Yinghai Lu Cc: Wen Congyang Cc: Tang Chen Cc: Yasuaki Ishimatsu Cc: Mel Gorman Cc: Minchan Kim Cc: "Michael S. 
Tsirkin" Cc: Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Chris Metcalf Cc: David Howells Cc: Geert Uytterhoeven Cc: Jeremy Fitzhardinge Cc: Jianguo Wu Cc: Joonsoo Kim Cc: Kamezawa Hiroyuki Cc: Konrad Rzeszutek Wilk Cc: Marek Szyprowski Cc: Michel Lespinasse Cc: Rik van Riel Cc: Rusty Russell Cc: Tejun Heo Cc: Will Deacon Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/init_64.c | 18 ++---------------- mm/memory_hotplug.c | 16 ++-------------- mm/page_alloc.c | 9 +-------- 3 files changed, 5 insertions(+), 38 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index b7bdf7bebf3..ec312a92b13 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -712,36 +712,22 @@ EXPORT_SYMBOL_GPL(arch_add_memory); static void __meminit free_pagetable(struct page *page, int order) { - struct zone *zone; - bool bootmem = false; unsigned long magic; unsigned int nr_pages = 1 << order; /* bootmem page has reserved flag */ if (PageReserved(page)) { __ClearPageReserved(page); - bootmem = true; magic = (unsigned long)page->lru.next; if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) { while (nr_pages--) put_page_bootmem(page++); } else - __free_pages_bootmem(page, order); + while (nr_pages--) + free_reserved_page(page++); } else free_pages((unsigned long)page_address(page), order); - - /* - * SECTION_INFO pages and MIX_SECTION_INFO pages - * are all allocated by bootmem. - */ - if (bootmem) { - zone = page_zone(page); - zone_span_writelock(zone); - zone->present_pages += nr_pages; - zone_span_writeunlock(zone); - totalram_pages += nr_pages; - } } static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 6096cb91873..814ecb2d262 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -101,12 +101,9 @@ void get_page_bootmem(unsigned long info, struct page *page, atomic_inc(&page->_count); } -/* reference to __meminit __free_pages_bootmem is valid - * so use __ref to tell modpost not to generate a warning */ -void __ref put_page_bootmem(struct page *page) +void put_page_bootmem(struct page *page) { unsigned long type; - static DEFINE_MUTEX(ppb_lock); type = (unsigned long) page->lru.next; BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || @@ -116,17 +113,8 @@ void __ref put_page_bootmem(struct page *page) ClearPagePrivate(page); set_page_private(page, 0); INIT_LIST_HEAD(&page->lru); - - /* - * Please refer to comment for __free_pages_bootmem() - * for why we serialize here. - */ - mutex_lock(&ppb_lock); - __free_pages_bootmem(page, 0); - mutex_unlock(&ppb_lock); - totalram_pages++; + free_reserved_page(page); } - } #ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 93f292a60cb..2437a7e17ab 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -745,14 +745,7 @@ static void __free_pages_ok(struct page *page, unsigned int order) local_irq_restore(flags); } -/* - * Read access to zone->managed_pages is safe because it's unsigned long, - * but we still need to serialize writers. Currently all callers of - * __free_pages_bootmem() except put_page_bootmem() should only be used - * at boot time. So for shorter boot time, we shift the burden to - * put_page_bootmem() to serialize writers. 
- */ -void __meminit __free_pages_bootmem(struct page *page, unsigned int order) +void __init __free_pages_bootmem(struct page *page, unsigned int order) { unsigned int nr_pages = 1 << order; unsigned int loop; -- cgit v1.2.3-70-g09d2 From 3dcc0571cd64816309765b7c7e4691a4cadf2ee7 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Wed, 3 Jul 2013 15:03:21 -0700 Subject: mm: correctly update zone->managed_pages Enhance adjust_managed_page_count() to adjust totalhigh_pages for highmem pages. And change code which directly adjusts totalram_pages to use adjust_managed_page_count() because it adjusts totalram_pages, totalhigh_pages and zone->managed_pages altogether in a safe way. Remove inc_totalhigh_pages() and dec_totalhigh_pages() from xen/balloon driver bacause adjust_managed_page_count() has already adjusted totalhigh_pages. This patch also fixes two bugs: 1) enhances virtio_balloon driver to adjust totalhigh_pages when reserve/unreserve pages. 2) enhance memory_hotplug.c to adjust totalhigh_pages when hot-removing memory. We still need to deal with modifications of totalram_pages in file arch/powerpc/platforms/pseries/cmm.c, but need help from PPC experts. [akpm@linux-foundation.org: remove ifdef, per Wanpeng Li, virtio_balloon.c cleanup, per Sergei] [akpm@linux-foundation.org: export adjust_managed_page_count() to modules, for drivers/virtio/virtio_balloon.c] Signed-off-by: Jiang Liu Cc: Chris Metcalf Cc: Rusty Russell Cc: "Michael S. Tsirkin" Cc: Konrad Rzeszutek Wilk Cc: Jeremy Fitzhardinge Cc: Wen Congyang Cc: Tang Chen Cc: Yasuaki Ishimatsu Cc: Mel Gorman Cc: Minchan Kim Cc: "H. Peter Anvin" Cc: Cc: Arnd Bergmann Cc: Catalin Marinas Cc: David Howells Cc: Geert Uytterhoeven Cc: Ingo Molnar Cc: Jianguo Wu Cc: Joonsoo Kim Cc: Kamezawa Hiroyuki Cc: Marek Szyprowski Cc: Michel Lespinasse Cc: Rik van Riel Cc: Tejun Heo Cc: Thomas Gleixner Cc: Will Deacon Cc: Yinghai Lu Cc: Russell King Cc: Sergei Shtylyov Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/virtio/virtio_balloon.c | 7 ++++--- drivers/xen/balloon.c | 23 +++++------------------ mm/hugetlb.c | 2 +- mm/memory_hotplug.c | 16 +++------------- mm/page_alloc.c | 11 ++++++----- 5 files changed, 19 insertions(+), 40 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index bd3ae324a1a..0098810df69 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -148,7 +148,7 @@ static void fill_balloon(struct virtio_balloon *vb, size_t num) } set_page_pfns(vb->pfns + vb->num_pfns, page); vb->num_pages += VIRTIO_BALLOON_PAGES_PER_PAGE; - totalram_pages--; + adjust_managed_page_count(page, -1); } /* Did we get any? */ @@ -163,8 +163,9 @@ static void release_pages_by_pfn(const u32 pfns[], unsigned int num) /* Find pfns pointing at start of each page, get pages and free them. 
*/ for (i = 0; i < num; i += VIRTIO_BALLOON_PAGES_PER_PAGE) { - balloon_page_free(balloon_pfn_to_page(pfns[i])); - totalram_pages++; + struct page *page = balloon_pfn_to_page(pfns[i]); + balloon_page_free(page); + adjust_managed_page_count(page, 1); } } diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index 930fb681790..c8aab4e9783 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -89,14 +89,6 @@ EXPORT_SYMBOL_GPL(balloon_stats); /* We increase/decrease in batches which fit in a page */ static xen_pfn_t frame_list[PAGE_SIZE / sizeof(unsigned long)]; -#ifdef CONFIG_HIGHMEM -#define inc_totalhigh_pages() (totalhigh_pages++) -#define dec_totalhigh_pages() (totalhigh_pages--) -#else -#define inc_totalhigh_pages() do {} while (0) -#define dec_totalhigh_pages() do {} while (0) -#endif - /* List of ballooned pages, threaded through the mem_map array. */ static LIST_HEAD(ballooned_pages); @@ -132,9 +124,7 @@ static void __balloon_append(struct page *page) static void balloon_append(struct page *page) { __balloon_append(page); - if (PageHighMem(page)) - dec_totalhigh_pages(); - totalram_pages--; + adjust_managed_page_count(page, -1); } /* balloon_retrieve: rescue a page from the balloon, if it is not empty. */ @@ -151,13 +141,12 @@ static struct page *balloon_retrieve(bool prefer_highmem) page = list_entry(ballooned_pages.next, struct page, lru); list_del(&page->lru); - if (PageHighMem(page)) { + if (PageHighMem(page)) balloon_stats.balloon_high--; - inc_totalhigh_pages(); - } else + else balloon_stats.balloon_low--; - totalram_pages++; + adjust_managed_page_count(page, 1); return page; } @@ -372,9 +361,7 @@ static enum bp_state increase_reservation(unsigned long nr_pages) #endif /* Relinquish the page back to the allocator. */ - ClearPageReserved(page); - init_page_count(page); - __free_page(page); + __free_reserved_page(page); } balloon_stats.current_pages += rc; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index fe095158859..83aff0a4d09 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1263,7 +1263,7 @@ static void __init gather_bootmem_prealloc(void) * side-effects, like CommitLimit going negative. 
*/ if (h->order > (MAX_ORDER - 1)) - totalram_pages += 1 << h->order; + adjust_managed_page_count(page, 1 << h->order); } } diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 814ecb2d262..5e34922124a 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -772,20 +772,13 @@ EXPORT_SYMBOL_GPL(__online_page_set_limits); void __online_page_increment_counters(struct page *page) { - totalram_pages++; - -#ifdef CONFIG_HIGHMEM - if (PageHighMem(page)) - totalhigh_pages++; -#endif + adjust_managed_page_count(page, 1); } EXPORT_SYMBOL_GPL(__online_page_increment_counters); void __online_page_free(struct page *page) { - ClearPageReserved(page); - init_page_count(page); - __free_page(page); + __free_reserved_page(page); } EXPORT_SYMBOL_GPL(__online_page_free); @@ -983,7 +976,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ return ret; } - zone->managed_pages += onlined_pages; zone->present_pages += onlined_pages; pgdat_resize_lock(zone->zone_pgdat, &flags); @@ -1572,15 +1564,13 @@ repeat: /* reset pagetype flags and makes migrate type to be MOVABLE */ undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); /* removal success */ - zone->managed_pages -= offlined_pages; + adjust_managed_page_count(pfn_to_page(start_pfn), -offlined_pages); zone->present_pages -= offlined_pages; pgdat_resize_lock(zone->zone_pgdat, &flags); zone->zone_pgdat->node_present_pages -= offlined_pages; pgdat_resize_unlock(zone->zone_pgdat, &flags); - totalram_pages -= offlined_pages; - init_per_zone_wmark_min(); if (!populated_zone(zone)) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2437a7e17ab..1481439ee2e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -780,11 +780,7 @@ void __init init_cma_reserved_pageblock(struct page *page) set_page_refcounted(page); set_pageblock_migratetype(page, MIGRATE_CMA); __free_pages(page, pageblock_order); - totalram_pages += pageblock_nr_pages; -#ifdef CONFIG_HIGHMEM - if (PageHighMem(page)) - totalhigh_pages += pageblock_nr_pages; -#endif + adjust_managed_page_count(page, pageblock_nr_pages); } #endif @@ -5207,8 +5203,13 @@ void adjust_managed_page_count(struct page *page, long count) spin_lock(&managed_page_count_lock); page_zone(page)->managed_pages += count; totalram_pages += count; +#ifdef CONFIG_HIGHMEM + if (PageHighMem(page)) + totalhigh_pages += count; +#endif spin_unlock(&managed_page_count_lock); } +EXPORT_SYMBOL(adjust_managed_page_count); unsigned long free_reserved_area(void *start, void *end, int poison, char *s) { -- cgit v1.2.3-70-g09d2 From cdd91a77043ba81585236ef61f65c18222b212e6 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Wed, 3 Jul 2013 15:03:27 -0700 Subject: mm: report available pages as "MemTotal" for each NUMA node As reported by https://bugzilla.kernel.org/show_bug.cgi?id=53501, "MemTotal" from /proc/meminfo means memory pages managed by the buddy system (managed_pages), but "MemTotal" from /sys/.../node/nodex/meminfo means physical pages present (present_pages) within the NUMA node. There's a difference between managed_pages and present_pages due to bootmem allocator and reserved pages. And Documentation/filesystems/proc.txt says MemTotal: Total usable ram (i.e. physical ram minus a few reserved bits and the kernel binary code) So change /sys/.../node/nodex/meminfo to report available pages within the node as "MemTotal". Signed-off-by: Jiang Liu Reported-by: Cc: Mel Gorman Cc: Minchan Kim Cc: "H. Peter Anvin" Cc: "Michael S. 
Tsirkin" Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Chris Metcalf Cc: David Howells Cc: Geert Uytterhoeven Cc: Ingo Molnar Cc: Jeremy Fitzhardinge Cc: Jianguo Wu Cc: Joonsoo Kim Cc: Kamezawa Hiroyuki Cc: Konrad Rzeszutek Wilk Cc: Marek Szyprowski Cc: Michel Lespinasse Cc: Rik van Riel Cc: Rusty Russell Cc: Tang Chen Cc: Tejun Heo Cc: Thomas Gleixner Cc: Wen Congyang Cc: Will Deacon Cc: Yasuaki Ishimatsu Cc: Yinghai Lu Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1481439ee2e..d9445c4f5fd 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2904,9 +2904,13 @@ EXPORT_SYMBOL(si_meminfo); #ifdef CONFIG_NUMA void si_meminfo_node(struct sysinfo *val, int nid) { + int zone_type; /* needs to be signed */ + unsigned long managed_pages = 0; pg_data_t *pgdat = NODE_DATA(nid); - val->totalram = pgdat->node_present_pages; + for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) + managed_pages += pgdat->node_zones[zone_type].managed_pages; + val->totalram = managed_pages; val->freeram = node_page_state(nid, NR_FREE_PAGES); #ifdef CONFIG_HIGHMEM val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages; -- cgit v1.2.3-70-g09d2 From 7ee3d4e8cd560500192d80ca84d7f15d6dee0807 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Wed, 3 Jul 2013 15:03:41 -0700 Subject: mm: introduce helper function mem_init_print_info() to simplify mem_init() Introduce helper function mem_init_print_info() to simplify mem_init() across different architectures, which also unifies the format and information printed. Function mem_init_print_info() calculates memory statistics information without walking each page, so it should be a little faster on some architectures. Also introduce another helper get_num_physpages() to kill the global variable num_physpages. Signed-off-by: Jiang Liu Cc: Mel Gorman Cc: Michel Lespinasse Cc: Rik van Riel Cc: Minchan Kim Cc: Marek Szyprowski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 12 ++++++++++++ mm/page_alloc.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+) (limited to 'mm/page_alloc.c') diff --git a/include/linux/mm.h b/include/linux/mm.h index 4310f80ce95..09c235301db 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1323,6 +1323,7 @@ extern void free_highmem_page(struct page *page); #endif extern void adjust_managed_page_count(struct page *page, long count); +extern void mem_init_print_info(const char *str); /* Free the reserved page into the buddy system, so it gets managed. 
*/ static inline void __free_reserved_page(struct page *page) @@ -1358,6 +1359,17 @@ static inline unsigned long free_initmem_default(int poison) poison, "unused kernel"); } +static inline unsigned long get_num_physpages(void) +{ + int nid; + unsigned long phys_pages = 0; + + for_each_online_node(nid) + phys_pages += node_present_pages(nid); + + return phys_pages; +} + #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP /* * With CONFIG_HAVE_MEMBLOCK_NODE_MAP set, an architecture may initialise its diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d9445c4f5fd..327516b7aee 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -61,6 +61,7 @@ #include #include +#include #include #include #include "internal.h" @@ -5246,6 +5247,57 @@ void free_highmem_page(struct page *page) } #endif + +void __init mem_init_print_info(const char *str) +{ + unsigned long physpages, codesize, datasize, rosize, bss_size; + unsigned long init_code_size, init_data_size; + + physpages = get_num_physpages(); + codesize = _etext - _stext; + datasize = _edata - _sdata; + rosize = __end_rodata - __start_rodata; + bss_size = __bss_stop - __bss_start; + init_data_size = __init_end - __init_begin; + init_code_size = _einittext - _sinittext; + + /* + * Detect special cases and adjust section sizes accordingly: + * 1) .init.* may be embedded into .data sections + * 2) .init.text.* may be out of [__init_begin, __init_end], + * please refer to arch/tile/kernel/vmlinux.lds.S. + * 3) .rodata.* may be embedded into .text or .data sections. + */ +#define adj_init_size(start, end, size, pos, adj) \ + if (start <= pos && pos < end && size > adj) \ + size -= adj; + + adj_init_size(__init_begin, __init_end, init_data_size, + _sinittext, init_code_size); + adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size); + adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size); + adj_init_size(_stext, _etext, codesize, __start_rodata, rosize); + adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize); + +#undef adj_init_size + + printk("Memory: %luK/%luK available " + "(%luK kernel code, %luK rwdata, %luK rodata, " + "%luK init, %luK bss, %luK reserved" +#ifdef CONFIG_HIGHMEM + ", %luK highmem" +#endif + "%s%s)\n", + nr_free_pages() << (PAGE_SHIFT-10), physpages << (PAGE_SHIFT-10), + codesize >> 10, datasize >> 10, rosize >> 10, + (init_data_size + init_code_size) >> 10, bss_size >> 10, + (physpages - totalram_pages) << (PAGE_SHIFT-10), +#ifdef CONFIG_HIGHMEM + totalhigh_pages << (PAGE_SHIFT-10), +#endif + str ? ", " : "", str ? str : ""); +} + /** * set_dma_reserve - set the specified number of pages reserved in the first zone * @new_dma_reserve: The number of pages to mark reserved -- cgit v1.2.3-70-g09d2 From 7960aedde8cfa72e4caf488806ea7ea7d2fa8dba Mon Sep 17 00:00:00 2001 From: Zhang Yanfei Date: Mon, 8 Jul 2013 15:59:52 -0700 Subject: mm: remove duplicated call of get_pfn_range_for_nid When calculating pages in a node, for each zone in that node, we will have zone_spanned_pages_in_node --> get_pfn_range_for_nid zone_absent_pages_in_node --> get_pfn_range_for_nid That is to say, we call the get_pfn_range_for_nid to get start_pfn and end_pfn of the node for MAX_NR_ZONES * 2 times. And this is totally unnecessary if we call the get_pfn_range_for_nid before zone_*_pages_in_node add two extra arguments node_start_pfn and node_end_pfn for zone_*_pages_in_node, then we can remove the get_pfn_range_in_node in zone_*_pages_in_node. 
[akpm@linux-foundation.org: make definitions more readable] Signed-off-by: Zhang Yanfei Cc: Michal Hocko Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 43 ++++++++++++++++++++++++++++++++----------- 1 file changed, 32 insertions(+), 11 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 327516b7aee..7d5e40fe0c2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4421,13 +4421,13 @@ static void __meminit adjust_zone_range_for_zone_movable(int nid, */ static unsigned long __meminit zone_spanned_pages_in_node(int nid, unsigned long zone_type, + unsigned long node_start_pfn, + unsigned long node_end_pfn, unsigned long *ignored) { - unsigned long node_start_pfn, node_end_pfn; unsigned long zone_start_pfn, zone_end_pfn; - /* Get the start and end of the node and zone */ - get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); + /* Get the start and end of the zone */ zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; adjust_zone_range_for_zone_movable(nid, zone_type, @@ -4482,14 +4482,14 @@ unsigned long __init absent_pages_in_range(unsigned long start_pfn, /* Return the number of page frames in holes in a zone on a node */ static unsigned long __meminit zone_absent_pages_in_node(int nid, unsigned long zone_type, + unsigned long node_start_pfn, + unsigned long node_end_pfn, unsigned long *ignored) { unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; - unsigned long node_start_pfn, node_end_pfn; unsigned long zone_start_pfn, zone_end_pfn; - get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high); @@ -4502,6 +4502,8 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid, #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, unsigned long zone_type, + unsigned long node_start_pfn, + unsigned long node_end_pfn, unsigned long *zones_size) { return zones_size[zone_type]; @@ -4509,6 +4511,8 @@ static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, static inline unsigned long __meminit zone_absent_pages_in_node(int nid, unsigned long zone_type, + unsigned long node_start_pfn, + unsigned long node_end_pfn, unsigned long *zholes_size) { if (!zholes_size) @@ -4520,21 +4524,27 @@ static inline unsigned long __meminit zone_absent_pages_in_node(int nid, #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, - unsigned long *zones_size, unsigned long *zholes_size) + unsigned long node_start_pfn, + unsigned long node_end_pfn, + unsigned long *zones_size, + unsigned long *zholes_size) { unsigned long realtotalpages, totalpages = 0; enum zone_type i; for (i = 0; i < MAX_NR_ZONES; i++) totalpages += zone_spanned_pages_in_node(pgdat->node_id, i, - zones_size); + node_start_pfn, + node_end_pfn, + zones_size); pgdat->node_spanned_pages = totalpages; realtotalpages = totalpages; for (i = 0; i < MAX_NR_ZONES; i++) realtotalpages -= zone_absent_pages_in_node(pgdat->node_id, i, - zholes_size); + node_start_pfn, node_end_pfn, + zholes_size); pgdat->node_present_pages = realtotalpages; printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages); @@ -4643,6 +4653,7 @@ static 
unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages, * NOTE: pgdat should get zeroed by caller. */ static void __paginginit free_area_init_core(struct pglist_data *pgdat, + unsigned long node_start_pfn, unsigned long node_end_pfn, unsigned long *zones_size, unsigned long *zholes_size) { enum zone_type j; @@ -4664,8 +4675,11 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, struct zone *zone = pgdat->node_zones + j; unsigned long size, realsize, freesize, memmap_pages; - size = zone_spanned_pages_in_node(nid, j, zones_size); + size = zone_spanned_pages_in_node(nid, j, node_start_pfn, + node_end_pfn, zones_size); realsize = freesize = size - zone_absent_pages_in_node(nid, j, + node_start_pfn, + node_end_pfn, zholes_size); /* @@ -4779,6 +4793,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, unsigned long node_start_pfn, unsigned long *zholes_size) { pg_data_t *pgdat = NODE_DATA(nid); + unsigned long start_pfn = 0; + unsigned long end_pfn = 0; /* pg_data_t should be reset to zero when it's allocated */ WARN_ON(pgdat->nr_zones || pgdat->classzone_idx); @@ -4786,7 +4802,11 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, pgdat->node_id = nid; pgdat->node_start_pfn = node_start_pfn; init_zone_allows_reclaim(nid); - calculate_node_totalpages(pgdat, zones_size, zholes_size); +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP + get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); +#endif + calculate_node_totalpages(pgdat, start_pfn, end_pfn, + zones_size, zholes_size); alloc_node_mem_map(pgdat); #ifdef CONFIG_FLAT_NODE_MEM_MAP @@ -4795,7 +4815,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, (unsigned long)pgdat->node_mem_map); #endif - free_area_init_core(pgdat, zones_size, zholes_size); + free_area_init_core(pgdat, start_pfn, end_pfn, + zones_size, zholes_size); } #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP -- cgit v1.2.3-70-g09d2 From bc732f1d55cf41627ee4c64078812b2fa592b394 Mon Sep 17 00:00:00 2001 From: Zhang Yanfei Date: Mon, 8 Jul 2013 16:00:06 -0700 Subject: mm/page_alloc.c: remove zone_type argument of build_zonelists_node The callers of build_zonelists_node always pass MAX_NR_ZONES -1 as the zone_type argument, so we can directly use the value in build_zonelists_node and remove zone_type argument. Signed-off-by: Zhang Yanfei Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7d5e40fe0c2..27f9d4beac9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3153,12 +3153,10 @@ static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) * Add all populated zones of a node to the zonelist. 
*/ static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, - int nr_zones, enum zone_type zone_type) + int nr_zones) { struct zone *zone; - - BUG_ON(zone_type >= MAX_NR_ZONES); - zone_type++; + enum zone_type zone_type = MAX_NR_ZONES; do { zone_type--; @@ -3168,8 +3166,8 @@ static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, &zonelist->_zonerefs[nr_zones++]); check_highest_zone(zone_type); } - } while (zone_type); + return nr_zones; } @@ -3363,8 +3361,7 @@ static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) zonelist = &pgdat->node_zonelists[0]; for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++) ; - j = build_zonelists_node(NODE_DATA(node), zonelist, j, - MAX_NR_ZONES - 1); + j = build_zonelists_node(NODE_DATA(node), zonelist, j); zonelist->_zonerefs[j].zone = NULL; zonelist->_zonerefs[j].zone_idx = 0; } @@ -3378,7 +3375,7 @@ static void build_thisnode_zonelists(pg_data_t *pgdat) struct zonelist *zonelist; zonelist = &pgdat->node_zonelists[1]; - j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1); + j = build_zonelists_node(pgdat, zonelist, 0); zonelist->_zonerefs[j].zone = NULL; zonelist->_zonerefs[j].zone_idx = 0; } @@ -3586,7 +3583,7 @@ static void build_zonelists(pg_data_t *pgdat) local_node = pgdat->node_id; zonelist = &pgdat->node_zonelists[0]; - j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1); + j = build_zonelists_node(pgdat, zonelist, 0); /* * Now we build the zonelist so that it contains the zones @@ -3599,14 +3596,12 @@ static void build_zonelists(pg_data_t *pgdat) for (node = local_node + 1; node < MAX_NUMNODES; node++) { if (!node_online(node)) continue; - j = build_zonelists_node(NODE_DATA(node), zonelist, j, - MAX_NR_ZONES - 1); + j = build_zonelists_node(NODE_DATA(node), zonelist, j); } for (node = 0; node < local_node; node++) { if (!node_online(node)) continue; - j = build_zonelists_node(NODE_DATA(node), zonelist, j, - MAX_NR_ZONES - 1); + j = build_zonelists_node(NODE_DATA(node), zonelist, j); } zonelist->_zonerefs[j].zone = NULL; -- cgit v1.2.3-70-g09d2 From 345606d42971fc4ed164fbabac118708d51b8e0a Mon Sep 17 00:00:00 2001 From: Zhang Yanfei Date: Mon, 8 Jul 2013 16:00:08 -0700 Subject: mm/page_alloc.c: remove unlikely() from the current_order test In __rmqueue_fallback(), current_order loops down from MAX_ORDER - 1 to the order passed. MAX_ORDER is typically 11 and pageblock_order is typically 9 on x86. Integer division truncates, so pageblock_order / 2 is 4. For the first eight iterations, it's guaranteed that current_order >= pageblock_order / 2 if it even gets that far! So just remove the unlikely(), it's completely bogus. Signed-off-by: Zhang Yanfei Suggested-by: David Rientjes Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 27f9d4beac9..b5855e545ee 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1046,7 +1046,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) * MIGRATE_CMA areas. 
*/ if (!is_migrate_cma(migratetype) && - (unlikely(current_order >= pageblock_order / 2) || + (current_order >= pageblock_order / 2 || start_migratetype == MIGRATE_RECLAIMABLE || page_group_by_mobility_disabled)) { int pages; -- cgit v1.2.3-70-g09d2 From 5f12733e9d976132e6cbbae9d08f71406fdacdfb Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 8 Jul 2013 16:00:40 -0700 Subject: mm: honor min_free_kbytes set by user min_free_kbytes is updated during memory hotplug (by init_per_zone_wmark_min) currently which is right thing to do in most cases but this could be unexpected if admin increased the value to prevent from allocation failures and the new min_free_kbytes would be decreased as a result of memory hotadd. This patch saves the user defined value and allows updating min_free_kbytes only if it is higher than the saved one. A warning is printed when the new value is ignored. Signed-off-by: Michal Hocko Cc: Mel Gorman Acked-by: Zhang Yanfei Acked-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b5855e545ee..b100255dedd 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -204,6 +204,7 @@ static char * const zone_names[MAX_NR_ZONES] = { }; int min_free_kbytes = 1024; +int user_min_free_kbytes; static unsigned long __meminitdata nr_kernel_pages; static unsigned long __meminitdata nr_all_pages; @@ -5589,14 +5590,21 @@ static void __meminit setup_per_zone_inactive_ratio(void) int __meminit init_per_zone_wmark_min(void) { unsigned long lowmem_kbytes; + int new_min_free_kbytes; lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); - - min_free_kbytes = int_sqrt(lowmem_kbytes * 16); - if (min_free_kbytes < 128) - min_free_kbytes = 128; - if (min_free_kbytes > 65536) - min_free_kbytes = 65536; + new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16); + + if (new_min_free_kbytes > user_min_free_kbytes) { + min_free_kbytes = new_min_free_kbytes; + if (min_free_kbytes < 128) + min_free_kbytes = 128; + if (min_free_kbytes > 65536) + min_free_kbytes = 65536; + } else { + pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n", + new_min_free_kbytes, user_min_free_kbytes); + } setup_per_zone_wmarks(); refresh_zone_stat_thresholds(); setup_per_zone_lowmem_reserve(); @@ -5614,8 +5622,10 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { proc_dointvec(table, write, buffer, length, ppos); - if (write) + if (write) { + user_min_free_kbytes = min_free_kbytes; setup_per_zone_wmarks(); + } return 0; } -- cgit v1.2.3-70-g09d2
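As a rough, self-contained illustration of the policy introduced by the last patch above ("mm: honor min_free_kbytes set by user"), the sketch below models init_per_zone_wmark_min()'s decision in ordinary user-space C: the auto-tuned value int_sqrt(lowmem_kbytes * 16), clamped to [128, 65536] when applied, only replaces min_free_kbytes if it is larger than whatever the administrator last wrote through the sysctl (tracked in user_min_free_kbytes). This is a hedged sketch, not the kernel code itself: the helper name recalc_min_free_kbytes() and the sample lowmem size are invented for the example, and only min_free_kbytes and user_min_free_kbytes correspond to the variables the patch touches.

/*
 * Stand-alone model of the min_free_kbytes policy after
 * "mm: honor min_free_kbytes set by user". Illustrative only; the real
 * logic lives in init_per_zone_wmark_min() and
 * min_free_kbytes_sysctl_handler() in mm/page_alloc.c.
 */
#include <math.h>
#include <stdio.h>

static long min_free_kbytes = 1024;	/* current effective watermark base */
static long user_min_free_kbytes;	/* last value written via the sysctl, 0 = never written */

static void recalc_min_free_kbytes(unsigned long lowmem_kbytes)
{
	/* same formula as the kernel: int_sqrt(lowmem_kbytes * 16) */
	long new_min_free_kbytes = (long)sqrt((double)lowmem_kbytes * 16);

	if (new_min_free_kbytes > user_min_free_kbytes) {
		min_free_kbytes = new_min_free_kbytes;
		if (min_free_kbytes < 128)
			min_free_kbytes = 128;
		if (min_free_kbytes > 65536)
			min_free_kbytes = 65536;
	} else {
		/* hot-add recalculation must not undercut the admin's choice */
		printf("min_free_kbytes is not updated to %ld because user defined value %ld is preferred\n",
		       new_min_free_kbytes, user_min_free_kbytes);
	}
}

int main(void)
{
	/* boot or memory hot-add with ~4 GB of lowmem: auto value (8192) wins */
	recalc_min_free_kbytes(4UL << 20);
	printf("auto-tuned: min_free_kbytes = %ld\n", min_free_kbytes);

	/* administrator writes 65536 to /proc/sys/vm/min_free_kbytes */
	min_free_kbytes = user_min_free_kbytes = 65536;

	/* a later hot-add recalculation no longer lowers the value */
	recalc_min_free_kbytes(4UL << 20);
	printf("after user override: min_free_kbytes = %ld\n", min_free_kbytes);
	return 0;
}

Built with "cc wmark.c -lm" (under the stated assumptions), the first call applies the computed 8192, while the second call, made after the simulated sysctl write, leaves the administrator's 65536 in place and only reports why the recalculated value was ignored, mirroring the pr_warn() added by the patch.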