From 89689ae7f95995723fbcd5c116c47933a3bb8b13 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 6 Dec 2006 20:31:45 -0800 Subject: [PATCH] Get rid of zone_table[] The zone table is mostly not needed. If we have a node in the page flags then we can get to the zone via NODE_DATA(), which is much more likely to already be in the cpu cache. In the case of SMP and UP, NODE_DATA() is a constant pointer, which allows us to access an exact replica of the zone table in the node_zones field. In all of the above cases there will be no need at all for the zone table. The only remaining case is if in a NUMA system the node numbers do not fit into the page flags. In that case we make sparsemem generate a table that maps sections to nodes and use that table to figure out the node number. This table is sized to fit in a single cache line for the known 32 bit NUMA platform, which makes it very likely that the information can be obtained without a cache miss. For sparsemem the zone table seems to have been fairly large, based on the maximum possible number of sections and the number of zones per node. There is some memory saving by removing zone_table. The main benefit is to reduce the cache footprint of the VM from the frequent lookups of zones. Plus it simplifies the page allocator. [akpm@osdl.org: build fix] Signed-off-by: Christoph Lameter Cc: Dave Hansen Cc: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 57 ++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 36 insertions(+), 21 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index d538de90196..ab6e4974f37 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -396,7 +396,9 @@ void split_page(struct page *page, unsigned int order); * We are going to use the flags for the page to node mapping if its in * there. This includes the case where there is no node, so it is implicit. */ -#define FLAGS_HAS_NODE (NODES_WIDTH > 0 || NODES_SHIFT == 0) +#if !(NODES_WIDTH > 0 || NODES_SHIFT == 0) +#define NODE_NOT_IN_PAGE_FLAGS +#endif #ifndef PFN_SECTION_SHIFT #define PFN_SECTION_SHIFT 0 @@ -411,13 +413,18 @@ void split_page(struct page *page, unsigned int order); #define NODES_PGSHIFT (NODES_PGOFF * (NODES_WIDTH != 0)) #define ZONES_PGSHIFT (ZONES_PGOFF * (ZONES_WIDTH != 0)) -/* NODE:ZONE or SECTION:ZONE is used to lookup the zone from a page. 
*/ -#if FLAGS_HAS_NODE -#define ZONETABLE_SHIFT (NODES_SHIFT + ZONES_SHIFT) +/* NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allocator */ +#ifdef NODE_NOT_IN_PAGE_FLAGS +#define ZONEID_SHIFT (SECTIONS_SHIFT + ZONES_SHIFT) #else -#define ZONETABLE_SHIFT (SECTIONS_SHIFT + ZONES_SHIFT) +#define ZONEID_SHIFT (NODES_SHIFT + ZONES_SHIFT) +#endif + +#if ZONES_WIDTH > 0 +#define ZONEID_PGSHIFT ZONES_PGSHIFT +#else +#define ZONEID_PGSHIFT NODES_PGOFF #endif -#define ZONETABLE_PGSHIFT ZONES_PGSHIFT #if SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH > FLAGS_RESERVED #error SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH > FLAGS_RESERVED @@ -426,23 +433,25 @@ void split_page(struct page *page, unsigned int order); #define ZONES_MASK ((1UL << ZONES_WIDTH) - 1) #define NODES_MASK ((1UL << NODES_WIDTH) - 1) #define SECTIONS_MASK ((1UL << SECTIONS_WIDTH) - 1) -#define ZONETABLE_MASK ((1UL << ZONETABLE_SHIFT) - 1) +#define ZONEID_MASK ((1UL << ZONEID_SHIFT) - 1) static inline enum zone_type page_zonenum(struct page *page) { return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK; } -struct zone; -extern struct zone *zone_table[]; - +/* + * The identification function is only used by the buddy allocator for + * determining if two pages could be buddies. We are not really + * identifying a zone since we could be using the section number + * id if we do not have node id available in page flags. + * We guarantee only that it will return the same value for two + * combinable pages in a zone. + */ static inline int page_zone_id(struct page *page) { - return (page->flags >> ZONETABLE_PGSHIFT) & ZONETABLE_MASK; -} -static inline struct zone *page_zone(struct page *page) -{ - return zone_table[page_zone_id(page)]; + BUILD_BUG_ON(ZONEID_PGSHIFT == 0 && ZONEID_MASK); + return (page->flags >> ZONEID_PGSHIFT) & ZONEID_MASK; } static inline unsigned long zone_to_nid(struct zone *zone) @@ -454,13 +463,20 @@ static inline unsigned long zone_to_nid(struct zone *zone) #endif } +#ifdef NODE_NOT_IN_PAGE_FLAGS +extern unsigned long page_to_nid(struct page *page); +#else static inline unsigned long page_to_nid(struct page *page) { - if (FLAGS_HAS_NODE) - return (page->flags >> NODES_PGSHIFT) & NODES_MASK; - else - return zone_to_nid(page_zone(page)); + return (page->flags >> NODES_PGSHIFT) & NODES_MASK; } +#endif + +static inline struct zone *page_zone(struct page *page) +{ + return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)]; +} + static inline unsigned long page_to_section(struct page *page) { return (page->flags >> SECTIONS_PGSHIFT) & SECTIONS_MASK; } @@ -477,6 +493,7 @@ static inline void set_page_node(struct page *page, unsigned long node) page->flags &= ~(NODES_MASK << NODES_PGSHIFT); page->flags |= (node & NODES_MASK) << NODES_PGSHIFT; } + static inline void set_page_section(struct page *page, unsigned long section) { page->flags &= ~(SECTIONS_MASK << SECTIONS_PGSHIFT); page->flags |= (section & SECTIONS_MASK) << SECTIONS_PGSHIFT; } @@ -947,8 +964,6 @@ extern void mem_init(void); extern void show_mem(void); extern void si_meminfo(struct sysinfo * val); extern void si_meminfo_node(struct sysinfo *val, int nid); -extern void zonetable_add(struct zone *zone, int nid, enum zone_type zid, - unsigned long pfn, unsigned long size); #ifdef CONFIG_NUMA extern void setup_per_cpu_pageset(void); -- cgit v1.2.3-70-g09d2
From c43692e85f306667545b91194c748a6e46c1f8b4 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 6 Dec 2006 20:32:48 -0800 Subject: [PATCH] Move vm_area_cachep to include/mm.h vm_area_cachep is used to store vm_area_structs, so move it to mm.h.
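For reference, a minimal sketch of how the cache is consumed once the declaration is visible through mm.h; alloc_vma_example() is a hypothetical helper for illustration, not something this patch adds:

	/* Sketch only: allocate a VMA from the dedicated slab cache. */
	static struct vm_area_struct *alloc_vma_example(void)
	{
		struct vm_area_struct *vma;

		vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
		if (!vma)
			return NULL;
		memset(vma, 0, sizeof(*vma));	/* callers typically zero it */
		return vma;
	}

	/* ...and on teardown: kmem_cache_free(vm_area_cachep, vma); */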
Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 ++ include/linux/slab.h | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index ab6e4974f37..840303769c1 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -114,6 +114,8 @@ struct vm_area_struct { #endif }; +extern struct kmem_cache *vm_area_cachep; + /* * This struct defines the per-mm list of VMAs for uClinux. If CONFIG_MMU is * disabled, then there's a single shared list of VMAs maintained by the diff --git a/include/linux/slab.h b/include/linux/slab.h index 467b297d2c6..a30a4028a92 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -309,7 +309,6 @@ static inline void *kcalloc(size_t n, size_t size, gfp_t flags) #endif /* CONFIG_SLOB */ /* System wide caches */ -extern kmem_cache_t *vm_area_cachep; extern kmem_cache_t *names_cachep; extern kmem_cache_t *files_cachep; extern kmem_cache_t *filp_cachep; -- cgit v1.2.3-70-g09d2
From 25ba77c141dbcd2602dd0171824d0d72aa023a01 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Wed, 6 Dec 2006 20:33:03 -0800 Subject: [PATCH] numa node ids are int, page_to_nid and zone_to_nid should return int NUMA node ids are passed as either int or unsigned int almost exclusively, yet page_to_nid and zone_to_nid both return unsigned long. This is a throwback to when page_to_nid was a #define and was thus exposing the real type of the page flags field. In addition to fixing up the definitions of page_to_nid and zone_to_nid I audited the users of these functions, identifying the following incorrect uses: 1) mm/page_alloc.c show_node() -- printk dumping the node id, 2) include/asm-ia64/pgalloc.h pgtable_quicklist_free() -- comparison against numa_node_id(), which returns an int from cpu_to_node(), and 3) mm/mempolicy.c check_pte_range() -- used as an index in node_isset, which uses test_bit and in generic code takes an int.
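To make the mismatch concrete, a sketch of a hypothetical caller (not taken from the patch), shown as two alternative fragments before and after the type change:

	/* Before: the unsigned long return type forced mixed-sign
	 * comparisons and %ld formats for what is logically an int. */
	unsigned long nid = page_to_nid(page);
	if (nid != numa_node_id())	/* unsigned long vs int */
		printk("Node %ld ", nid);

	/* After: node ids are plain ints end to end. */
	int nid = page_to_nid(page);
	if (nid != numa_node_id())	/* int vs int */
		printk("Node %d ", nid);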
Signed-off-by: Andy Whitcroft Cc: Christoph Lameter Cc: "Luck, Tony" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-ia64/pgalloc.h | 2 +- include/linux/mm.h | 6 +++--- mm/mempolicy.c | 2 +- mm/page_alloc.c | 2 +- mm/sparse.c | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/asm-ia64/pgalloc.h b/include/asm-ia64/pgalloc.h index 9cb68e9b377..393e04c42a2 100644 --- a/include/asm-ia64/pgalloc.h +++ b/include/asm-ia64/pgalloc.h @@ -60,7 +60,7 @@ static inline void *pgtable_quicklist_alloc(void) static inline void pgtable_quicklist_free(void *pgtable_entry) { #ifdef CONFIG_NUMA - unsigned long nid = page_to_nid(virt_to_page(pgtable_entry)); + int nid = page_to_nid(virt_to_page(pgtable_entry)); if (unlikely(nid != numa_node_id())) { free_page((unsigned long)pgtable_entry); diff --git a/include/linux/mm.h b/include/linux/mm.h index 840303769c1..0e266fe1b4c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -456,7 +456,7 @@ static inline int page_zone_id(struct page *page) return (page->flags >> ZONEID_PGSHIFT) & ZONEID_MASK; } -static inline unsigned long zone_to_nid(struct zone *zone) +static inline int zone_to_nid(struct zone *zone) { #ifdef CONFIG_NUMA return zone->node; @@ -466,9 +466,9 @@ static inline unsigned long zone_to_nid(struct zone *zone) } #ifdef NODE_NOT_IN_PAGE_FLAGS -extern unsigned long page_to_nid(struct page *page); +extern int page_to_nid(struct page *page); #else -static inline unsigned long page_to_nid(struct page *page) +static inline int page_to_nid(struct page *page) { return (page->flags >> NODES_PGSHIFT) & NODES_MASK; } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index fb907236bbd..e7b69c90cfd 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -221,7 +221,7 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); do { struct page *page; - unsigned int nid; + int nid; if (!pte_present(*pte)) continue; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 86f2984f8b7..614d427854a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1407,7 +1407,7 @@ unsigned int nr_free_pagecache_pages(void) static inline void show_node(struct zone *zone) { if (NUMA_BUILD) - printk("Node %ld ", zone_to_nid(zone)); + printk("Node %d ", zone_to_nid(zone)); } void si_meminfo(struct sysinfo *val) diff --git a/mm/sparse.c b/mm/sparse.c index 158d6a2a526..ac26eb0d73c 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -36,7 +36,7 @@ static u8 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned; static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned; #endif -unsigned long page_to_nid(struct page *page) +int page_to_nid(struct page *page) { return section_to_node_table[page_to_section(page)]; } -- cgit v1.2.3-70-g09d2
From 33f2ef89f8e181486b63fdbdc97c6afa6ca9f34b Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Wed, 6 Dec 2006 20:33:32 -0800 Subject: [PATCH] mm: make compound page destructor handling explicit Currently we use the lru head link of the second page of a compound page to hold its destructor. This was ok when it was purely an internal implementation detail. However, hugetlbfs overrides this destructor, violating the layering. Abstract this out as explicit calls, and introduce a type for the callback function, allowing it to be type checked. For each callback we pre-declare the function, causing a type error on definition rather than on use elsewhere.
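To illustrate the type-checking benefit, a short sketch; free_huge_page is the real hugetlbfs destructor, while the mismatched variant in the trailing comment is hypothetical:

	/* The typedef pins the expected destructor signature... */
	typedef void compound_page_dtor(struct page *);

	/* ...and pre-declaring through it makes the compiler check the
	 * definition itself: */
	compound_page_dtor free_huge_page;

	void free_huge_page(struct page *page)	/* OK: matches the typedef */
	{
		/* return the page to the huge page pool */
	}

	/* A definition such as
	 *	void free_huge_page(struct page *page, int order) { ... }
	 * would now fail at the definition site rather than at a cast. */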
[akpm@osdl.org: cleanups] Signed-off-by: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 18 ++++++++++++++++++ mm/hugetlb.c | 2 +- mm/page_alloc.c | 2 +- mm/swap.c | 4 ++-- 4 files changed, 22 insertions(+), 4 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 0e266fe1b4c..a17b147c61e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -295,6 +295,24 @@ void put_pages_list(struct list_head *pages); void split_page(struct page *page, unsigned int order); +/* + * Compound pages have a destructor function. Provide a + * prototype for that function and accessor functions. + * These are _only_ valid on the head of a PG_compound page. + */ +typedef void compound_page_dtor(struct page *); + +static inline void set_compound_page_dtor(struct page *page, + compound_page_dtor *dtor) +{ + page[1].lru.next = (void *)dtor; +} + +static inline compound_page_dtor *get_compound_page_dtor(struct page *page) +{ + return (compound_page_dtor *)page[1].lru.next; +} + /* * Multiple processes may "see" the same page. E.g. for untouched * mappings of /dev/null, all processes see the same page full of diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 2911a364481..0ccc7f23025 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -109,7 +109,7 @@ static int alloc_fresh_huge_page(void) if (nid == MAX_NUMNODES) nid = first_node(node_online_map); if (page) { - page[1].lru.next = (void *)free_huge_page; /* dtor */ + set_compound_page_dtor(page, free_huge_page); spin_lock(&hugetlb_lock); nr_huge_pages++; nr_huge_pages_node[page_to_nid(page)]++; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index dc8753bdd47..d539f83c62b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -230,7 +230,7 @@ static void prep_compound_page(struct page *page, unsigned long order) int i; int nr_pages = 1 << order; - page[1].lru.next = (void *)free_compound_page; /* set dtor */ + set_compound_page_dtor(page, free_compound_page); page[1].lru.prev = (void *)order; for (i = 0; i < nr_pages; i++) { struct page *p = page + i; diff --git a/mm/swap.c b/mm/swap.c index d9a3770d8f3..017e72ca9bb 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -57,9 +57,9 @@ static void put_compound_page(struct page *page) { page = (struct page *)page_private(page); if (put_page_testzero(page)) { - void (*dtor)(struct page *page); + compound_page_dtor *dtor; - dtor = (void (*)(struct page *))page[1].lru.next; + dtor = get_compound_page_dtor(page); (*dtor)(page); } } -- cgit v1.2.3-70-g09d2
From a2f3aa02576632cdb60bd3de1f4bf55e9ac65604 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 10 Jan 2007 23:15:30 -0800 Subject: [PATCH] Fix sparsemem on Cell Fix an oops experienced on the Cell architecture when init-time functions, early_*(), are called at runtime. It alters the call paths to make sure that the callers explicitly say whether the call is being made on behalf of a hotplug event or is happening at boot time. It has been compile tested on ppc64, ia64, s390, i386 and x86_64.
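In outline, the fix threads an explicit context from every caller down to memmap_init_zone(), so the early_pfn_*() hole checks apply only at boot; a condensed sketch of the resulting shape (arguments abridged, full signatures in the diff below):

	enum memmap_context {
		MEMMAP_EARLY,	/* boot: mem_map[] may contain holes */
		MEMMAP_HOTPLUG,	/* hotplug: early_pfn_*() checks must not run */
	};

	/* boot-time path (free_area_init_core) */
	init_currently_empty_zone(zone, zone_start_pfn, size, MEMMAP_EARLY);

	/* hotplug path (__add_zone) */
	init_currently_empty_zone(zone, phys_start_pfn, nr_pages, MEMMAP_HOTPLUG);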
Acked-by: Arnd Bergmann Signed-off-by: Dave Hansen Cc: Yasunori Goto Acked-by: Andy Whitcroft Cc: Christoph Lameter Cc: Martin Schwidefsky Acked-by: Heiko Carstens Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/mm/init.c | 5 +++-- arch/s390/mm/vmem.c | 3 ++- include/linux/mm.h | 3 ++- include/linux/mmzone.h | 8 ++++++-- mm/memory_hotplug.c | 6 ++++-- mm/page_alloc.c | 25 +++++++++++++++++-------- 6 files changed, 34 insertions(+), 16 deletions(-) (limited to 'include/linux/mm.h') diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 1a3d8a2feb9..1373fae7657 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -543,7 +543,8 @@ virtual_memmap_init (u64 start, u64 end, void *arg) if (map_start < map_end) memmap_init_zone((unsigned long)(map_end - map_start), - args->nid, args->zone, page_to_pfn(map_start)); + args->nid, args->zone, page_to_pfn(map_start), + MEMMAP_EARLY); return 0; } @@ -552,7 +553,7 @@ memmap_init (unsigned long size, int nid, unsigned long zone, unsigned long start_pfn) { if (!vmem_map) - memmap_init_zone(size, nid, zone, start_pfn); + memmap_init_zone(size, nid, zone, start_pfn, MEMMAP_EARLY); else { struct page *start; struct memmap_init_callback_data args; diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index 7f2944d3ec2..cd3d93e8c21 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -61,7 +61,8 @@ void memmap_init(unsigned long size, int nid, unsigned long zone, if (map_start < map_end) memmap_init_zone((unsigned long)(map_end - map_start), - nid, zone, page_to_pfn(map_start)); + nid, zone, page_to_pfn(map_start), + MEMMAP_EARLY); } } diff --git a/include/linux/mm.h b/include/linux/mm.h index a17b147c61e..76912231af4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -978,7 +978,8 @@ extern int early_pfn_to_nid(unsigned long pfn); #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ extern void set_dma_reserve(unsigned long new_dma_reserve); -extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long); +extern void memmap_init_zone(unsigned long, int, unsigned long, + unsigned long, enum memmap_context); extern void setup_per_zone_pages_min(void); extern void mem_init(void); extern void show_mem(void); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index e339a7345f2..b262f47961f 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -450,9 +450,13 @@ void build_all_zonelists(void); void wakeup_kswapd(struct zone *zone, int order); int zone_watermark_ok(struct zone *z, int order, unsigned long mark, int classzone_idx, int alloc_flags); - +enum memmap_context { + MEMMAP_EARLY, + MEMMAP_HOTPLUG, +}; extern int init_currently_empty_zone(struct zone *zone, unsigned long start_pfn, - unsigned long size); + unsigned long size, + enum memmap_context context); #ifdef CONFIG_HAVE_MEMORY_PRESENT void memory_present(int nid, unsigned long start, unsigned long end); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 0c055a090f4..84279127fcd 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -67,11 +67,13 @@ static int __add_zone(struct zone *zone, unsigned long phys_start_pfn) zone_type = zone - pgdat->node_zones; if (!populated_zone(zone)) { int ret = 0; - ret = init_currently_empty_zone(zone, phys_start_pfn, nr_pages); + ret = init_currently_empty_zone(zone, phys_start_pfn, + nr_pages, MEMMAP_HOTPLUG); if (ret < 0) return ret; } - memmap_init_zone(nr_pages, nid, 
zone_type, phys_start_pfn); + memmap_init_zone(nr_pages, nid, zone_type, + phys_start_pfn, MEMMAP_HOTPLUG); return 0; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a49f96b7ea4..fc5b5442e94 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1956,17 +1956,24 @@ static inline unsigned long wait_table_bits(unsigned long size) * done. Non-atomic initialization, single-pass. */ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, - unsigned long start_pfn) + unsigned long start_pfn, enum memmap_context context) { struct page *page; unsigned long end_pfn = start_pfn + size; unsigned long pfn; for (pfn = start_pfn; pfn < end_pfn; pfn++) { - if (!early_pfn_valid(pfn)) - continue; - if (!early_pfn_in_nid(pfn, nid)) - continue; + /* + * There can be holes in boot-time mem_map[]s + * handed to this function. They do not + * exist on hotplugged memory. + */ + if (context == MEMMAP_EARLY) { + if (!early_pfn_valid(pfn)) + continue; + if (!early_pfn_in_nid(pfn, nid)) + continue; + } page = pfn_to_page(pfn); set_page_links(page, zone, nid, pfn); init_page_count(page); @@ -1993,7 +2000,7 @@ void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone, #ifndef __HAVE_ARCH_MEMMAP_INIT #define memmap_init(size, nid, zone, start_pfn) \ - memmap_init_zone((size), (nid), (zone), (start_pfn)) + memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) #endif static int __cpuinit zone_batchsize(struct zone *zone) @@ -2239,7 +2246,8 @@ static __meminit void zone_pcp_init(struct zone *zone) __meminit int init_currently_empty_zone(struct zone *zone, unsigned long zone_start_pfn, - unsigned long size) + unsigned long size, + enum memmap_context context) { struct pglist_data *pgdat = zone->zone_pgdat; int ret; @@ -2683,7 +2691,8 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat, if (!size) continue; - ret = init_currently_empty_zone(zone, zone_start_pfn, size); + ret = init_currently_empty_zone(zone, zone_start_pfn, + size, MEMMAP_EARLY); BUG_ON(ret); zone_start_pfn += size; } -- cgit v1.2.3-70-g09d2