[IA64] Percpu quicklist for combined allocator for pgd/pmd/pte.

This patch introduces the use of quicklists for the pgd, pmd, and pte levels by combining the alloc and free functions into a common set of routines. This greatly simplifies the reading of this header file. This patch is simple but necessary for large NUMA configurations. It simply ensures that only pages from the local node are added to a CPU's quicklist. This prevents pages from being trapped on a remote node's quicklist, which can happen by starting a process, touching a large number of pages to fill pmd and pte entries, migrating to another node, and then unmapping or exiting. With those conditions, the pages get trapped and if the machine has more than 100 nodes of the same size, the calculation of the pgtable high water mark will be larger than any single node so page table cache flushing will never occur. I ran lmbench lat_proc fork and lat_proc exec on a zx1 with and without this patch and did not notice any change. On an sn2 machine, there was a slight improvement which is possibly due to pages from other nodes trapped on the test node before starting the run. I did not investigate further. This patch shrinks the quicklist based upon free memory on the node instead of the high/low water marks. I have written it to enable preemption periodically and recalculate the amount to shrink every time we have freed enough pages that the quicklist size should have grown. I rescan the node's zones each pass because other processes may be draining node memory at the same time as we are adding. Signed-off-by: Robin Holt <holt@sgi.com> Signed-off-by: Tony Luck <tony.luck@intel.com>
author: Robin Holt <holt@sgi.com> 2005-04-25 13:13:16 -0700
committer: Tony Luck <tony.luck@intel.com> 2005-04-25 13:13:16 -0700
commit: fde740e4dd4a05ca8957490d468fa9b2770f5bd6 (patch)
tree: 04bc0221bc6c59379a17f3631fc4bd3c886e1d61
parent: ff3eb55ed97db3f12964beeffe3d34602d295367 (diff)
5 files changed, 112 insertions, 115 deletions
diff --git a/arch/ia64/mm/contig.c b/arch/ia64/mm/contig.c
index 6daf15ac894..91a055f5731 100644
--- a/arch/ia64/mm/contig.c
+++ b/arch/ia64/mm/contig.c
@@ -61,7 +61,8 @@ show_mem (void)
 	printk("%d reserved pages\n", reserved);
 	printk("%d pages shared\n", shared);
 	printk("%d pages swap cached\n", cached);
-	printk("%ld pages in page table cache\n", pgtable_cache_size);
+	printk("%ld pages in page table cache\n",
+		pgtable_quicklist_total_size());
 }
 
 /* physical address where the bootmem map is located */
diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c
index 3456a9b6971..c0071092939 100644
--- a/arch/ia64/mm/discontig.c
+++ b/arch/ia64/mm/discontig.c
@@ -582,7 +582,8 @@ void show_mem(void)
 	printk("%d reserved pages\n", total_reserved);
 	printk("%d pages shared\n", total_shared);
 	printk("%d pages swap cached\n", total_cached);
-	printk("Total of %ld pages in page table cache\n", pgtable_cache_size);
+	printk("Total of %ld pages in page table cache\n",
+		pgtable_quicklist_total_size());
 	printk("%d free buffer pages\n", nr_free_buffer_pages());
 }
 
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index 65cf839573e..4892be53e22 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -39,6 +39,9 @@
 
 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
 
+DEFINE_PER_CPU(unsigned long *, __pgtable_quicklist);
+DEFINE_PER_CPU(long, __pgtable_quicklist_size);
+
 extern void ia64_tlb_init (void);
 
 unsigned long MAX_DMA_ADDRESS = PAGE_OFFSET + 0x100000000UL;
@@ -50,27 +53,53 @@ struct page *vmem_map;
 EXPORT_SYMBOL(vmem_map);
 #endif
 
-static int pgt_cache_water[2] = { 25, 50 };
-
-struct page *zero_page_memmap_ptr;		/* map entry for zero page */
+struct page *zero_page_memmap_ptr;	/* map entry for zero page */
 EXPORT_SYMBOL(zero_page_memmap_ptr);
 
+#define MIN_PGT_PAGES			25UL
+#define MAX_PGT_FREES_PER_PASS		16L	/* long, to match min() operand type */
+#define PGT_FRACTION_OF_NODE_MEM	16
+
+static inline long
+max_pgt_pages(void)
+{
+	u64 node_free_pages, max_pgt_pages;
+
+#ifndef	CONFIG_NUMA
+	node_free_pages = nr_free_pages();
+#else
+	node_free_pages = nr_free_pages_pgdat(NODE_DATA(numa_node_id()));
+#endif
+	max_pgt_pages = node_free_pages / PGT_FRACTION_OF_NODE_MEM;
+	max_pgt_pages = max(max_pgt_pages, MIN_PGT_PAGES);
+	return max_pgt_pages;
+}
+
+static inline long
+min_pages_to_free(void)
+{
+	long pages_to_free;
+
+	pages_to_free = pgtable_quicklist_size - max_pgt_pages();
+	pages_to_free = min(pages_to_free, MAX_PGT_FREES_PER_PASS);
+	return pages_to_free;
+}
+
 void
-check_pgt_cache (void)
+check_pgt_cache(void)
 {
-	int low, high;
+	long pages_to_free;
 
-	low = pgt_cache_water[0];
-	high = pgt_cache_water[1];
+	if (unlikely(pgtable_quicklist_size <= MIN_PGT_PAGES))
+		return;
 
 	preempt_disable();
-	if (pgtable_cache_size > (u64) high) {
-		do {
-			if (pgd_quicklist)
-				free_page((unsigned long)pgd_alloc_one_fast(NULL));
-			if (pmd_quicklist)
-				free_page((unsigned long)pmd_alloc_one_fast(NULL, 0));
-		} while (pgtable_cache_size > (u64) low);
+	while (unlikely((pages_to_free = min_pages_to_free()) > 0)) {
+		while (pages_to_free--) {
+			free_page((unsigned long)pgtable_quicklist_alloc());
+		}
+		preempt_enable();
+		preempt_disable();
 	}
 	preempt_enable();
 }
@@ -523,11 +552,14 @@ void
 mem_init (void)
 {
 	long reserved_pages, codesize, datasize, initsize;
-	unsigned long num_pgt_pages;
 	pg_data_t *pgdat;
 	int i;
 	static struct kcore_list kcore_mem, kcore_vmem, kcore_kernel;
 
+	BUG_ON(PTRS_PER_PGD * sizeof(pgd_t) != PAGE_SIZE);
+	BUG_ON(PTRS_PER_PMD * sizeof(pmd_t) != PAGE_SIZE);
+	BUG_ON(PTRS_PER_PTE * sizeof(pte_t) != PAGE_SIZE);
+
 #ifdef CONFIG_PCI
 	/*
 	 * This needs to be called _after_ the command line has been parsed but _before_
@@ -564,18 +596,6 @@ mem_init (void)
 	       num_physpages << (PAGE_SHIFT - 10), codesize >> 10,
 	       reserved_pages << (PAGE_SHIFT - 10), datasize >> 10, initsize >> 10);
 
-	/*
-	 * Allow for enough (cached) page table pages so that we can map the entire memory
-	 * at least once.  Each task also needs a couple of page tables pages, so add in a
-	 * fudge factor for that (don't use "threads-max" here; that would be wrong!).
-	 * Don't allow the cache to be more than 10% of total memory, though.
-	 */
-#	define NUM_TASKS	500	/* typical number of tasks */
-	num_pgt_pages = nr_free_pages() / PTRS_PER_PGD + NUM_TASKS;
-	if (num_pgt_pages > nr_free_pages() / 10)
-		num_pgt_pages = nr_free_pages() / 10;
-	if (num_pgt_pages > (u64) pgt_cache_water[1])
-		pgt_cache_water[1] = num_pgt_pages;
 
 	/*
 	 * For fsyscall entrpoints with no light-weight handler, use the ordinary
diff --git a/include/asm-ia64/pgalloc.h b/include/asm-ia64/pgalloc.h
index 0f05dc8bd46..e86a8c331ee 100644
--- a/include/asm-ia64/pgalloc.h
+++ b/include/asm-ia64/pgalloc.h
@@ -22,146 +22,124 @@
 
 #include <asm/mmu_context.h>
 
-/*
- * Very stupidly, we used to get new pgd's and pmd's, init their contents
- * to point to the NULL versions of the next level page table, later on
- * completely re-init them the same way, then free them up.  This wasted
- * a lot of work and caused unnecessary memory traffic.  How broken...
- * We fix this by caching them.
- */
-#define pgd_quicklist		(local_cpu_data->pgd_quick)
-#define pmd_quicklist		(local_cpu_data->pmd_quick)
-#define pgtable_cache_size	(local_cpu_data->pgtable_cache_sz)
+DECLARE_PER_CPU(unsigned long *, __pgtable_quicklist);
+#define pgtable_quicklist __ia64_per_cpu_var(__pgtable_quicklist)
+DECLARE_PER_CPU(long, __pgtable_quicklist_size);
+#define pgtable_quicklist_size __ia64_per_cpu_var(__pgtable_quicklist_size)
 
-static inline pgd_t*
-pgd_alloc_one_fast (struct mm_struct *mm)
+static inline long pgtable_quicklist_total_size(void)
+{
+	long ql_size = 0;	/* running sum over online CPUs; must start at 0 */
+	int cpuid;
+
+	for_each_online_cpu(cpuid) {
+		ql_size += per_cpu(__pgtable_quicklist_size, cpuid);
+	}
+	return ql_size;
+}
+
+static inline void *pgtable_quicklist_alloc(void)
 {
 	unsigned long *ret = NULL;
 
 	preempt_disable();
 
-	ret = pgd_quicklist;
+	ret = pgtable_quicklist;
 	if (likely(ret != NULL)) {
-		pgd_quicklist = (unsigned long *)(*ret);
+		pgtable_quicklist = (unsigned long *)(*ret);
 		ret[0] = 0;
-		--pgtable_cache_size;
-	} else
-		ret = NULL;
+		--pgtable_quicklist_size;
+		preempt_enable();
+	} else {
+		preempt_enable();	/* GFP_KERNEL may sleep: not with preempt off */
+		ret = (unsigned long *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+	}
 
-	preempt_enable();
-
-	return (pgd_t *) ret;
+	return ret;
 }
 
-static inline pgd_t*
-pgd_alloc (struct mm_struct *mm)
+static inline void pgtable_quicklist_free(void *pgtable_entry)
 {
-	/* the VM system never calls pgd_alloc_one_fast(), so we do it here. */
-	pgd_t *pgd = pgd_alloc_one_fast(mm);
+#ifdef CONFIG_NUMA
+	unsigned long nid = page_to_nid(virt_to_page(pgtable_entry));
 
-	if (unlikely(pgd == NULL)) {
-		pgd = (pgd_t *)__get_free_page(GFP_KERNEL|__GFP_ZERO);
+	if (unlikely(nid != numa_node_id())) {
+		free_page((unsigned long)pgtable_entry);
+		return;
 	}
-	return pgd;
-}
+#endif
 
-static inline void
-pgd_free (pgd_t *pgd)
-{
 	preempt_disable();
-	*(unsigned long *)pgd = (unsigned long) pgd_quicklist;
-	pgd_quicklist = (unsigned long *) pgd;
-	++pgtable_cache_size;
+	*(unsigned long *)pgtable_entry = (unsigned long)pgtable_quicklist;
+	pgtable_quicklist = (unsigned long *)pgtable_entry;
+	++pgtable_quicklist_size;
 	preempt_enable();
 }
 
-static inline void
-pud_populate (struct mm_struct *mm, pud_t *pud_entry, pmd_t *pmd)
+static inline pgd_t *pgd_alloc(struct mm_struct *mm)
 {
-	pud_val(*pud_entry) = __pa(pmd);
+	return pgtable_quicklist_alloc();
 }
 
-static inline pmd_t*
-pmd_alloc_one_fast (struct mm_struct *mm, unsigned long addr)
+static inline void pgd_free(pgd_t * pgd)
 {
-	unsigned long *ret = NULL;
-
-	preempt_disable();
-
-	ret = (unsigned long *)pmd_quicklist;
-	if (likely(ret != NULL)) {
-		pmd_quicklist = (unsigned long *)(*ret);
-		ret[0] = 0;
-		--pgtable_cache_size;
-	}
-
-	preempt_enable();
-
-	return (pmd_t *)ret;
+	pgtable_quicklist_free(pgd);
 }
 
-static inline pmd_t*
-pmd_alloc_one (struct mm_struct *mm, unsigned long addr)
+static inline void
+pud_populate(struct mm_struct *mm, pud_t * pud_entry, pmd_t * pmd)
 {
-	pmd_t *pmd = (pmd_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
+	pud_val(*pud_entry) = __pa(pmd);
+}
 
-	return pmd;
+static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
+{
+	return pgtable_quicklist_alloc();
 }
 
-static inline void
-pmd_free (pmd_t *pmd)
+static inline void pmd_free(pmd_t * pmd)
 {
-	preempt_disable();
-	*(unsigned long *)pmd = (unsigned long) pmd_quicklist;
-	pmd_quicklist = (unsigned long *) pmd;
-	++pgtable_cache_size;
-	preempt_enable();
+	pgtable_quicklist_free(pmd);
 }
 
 #define __pmd_free_tlb(tlb, pmd)	pmd_free(pmd)
 
 static inline void
-pmd_populate (struct mm_struct *mm, pmd_t *pmd_entry, struct page *pte)
+pmd_populate(struct mm_struct *mm, pmd_t * pmd_entry, struct page *pte)
 {
 	pmd_val(*pmd_entry) = page_to_phys(pte);
 }
 
 static inline void
-pmd_populate_kernel (struct mm_struct *mm, pmd_t *pmd_entry, pte_t *pte)
+pmd_populate_kernel(struct mm_struct *mm, pmd_t * pmd_entry, pte_t * pte)
 {
 	pmd_val(*pmd_entry) = __pa(pte);
 }
 
-static inline struct page *
-pte_alloc_one (struct mm_struct *mm, unsigned long addr)
+static inline struct page *pte_alloc_one(struct mm_struct *mm,
+					 unsigned long addr)
 {
-	struct page *pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
-
-	return pte;
+	return virt_to_page(pgtable_quicklist_alloc());
 }
 
-static inline pte_t *
-pte_alloc_one_kernel (struct mm_struct *mm, unsigned long addr)
+static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm,
+					  unsigned long addr)
 {
-	pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
-
-	return pte;
+	return pgtable_quicklist_alloc();
 }
 
-static inline void
-pte_free (struct page *pte)
+static inline void pte_free(struct page *pte)
 {
-	__free_page(pte);
+	pgtable_quicklist_free(page_address(pte));
 }
 
-static inline void
-pte_free_kernel (pte_t *pte)
+static inline void pte_free_kernel(pte_t * pte)
 {
-	free_page((unsigned long) pte);
+	pgtable_quicklist_free(pte);
 }
 
-#define __pte_free_tlb(tlb, pte)	tlb_remove_page((tlb), (pte))
+#define __pte_free_tlb(tlb, pte)	pte_free(pte)
 
-extern void check_pgt_cache (void);
+extern void check_pgt_cache(void);
 
-#endif /* _ASM_IA64_PGALLOC_H */
+#endif				/* _ASM_IA64_PGALLOC_H */
diff --git a/include/asm-ia64/processor.h b/include/asm-ia64/processor.h
index 2807f8d766d..983798ec179 100644
--- a/include/asm-ia64/processor.h
+++ b/include/asm-ia64/processor.h
@@ -137,9 +137,6 @@ struct cpuinfo_ia64 {
 	__u64 nsec_per_cyc;	/* (1000000000<<IA64_NSEC_PER_CYC_SHIFT)/itc_freq */
 	__u64 unimpl_va_mask;	/* mask of unimplemented virtual address bits (from PAL) */
 	__u64 unimpl_pa_mask;	/* mask of unimplemented physical address bits (from PAL) */
-	__u64 *pgd_quick;
-	__u64 *pmd_quick;
-	__u64 pgtable_cache_sz;
 	__u64 itc_freq;		/* frequency of ITC counter */
 	__u64 proc_freq;	/* frequency of processor */
 	__u64 cyc_per_usec;	/* itc_freq/1000000 */
author	Robin Holt <holt@sgi.com>	2005-04-25 13:13:16 -0700
committer	Tony Luck <tony.luck@intel.com>	2005-04-25 13:13:16 -0700
commit	fde740e4dd4a05ca8957490d468fa9b2770f5bd6 (patch)
tree	04bc0221bc6c59379a17f3631fc4bd3c886e1d61
parent	ff3eb55ed97db3f12964beeffe3d34602d295367 (diff)