From 63551ae0feaaa23807ebea60de1901564bbef32e Mon Sep 17 00:00:00 2001
From: David Gibson
Date: Tue, 21 Jun 2005 17:14:44 -0700
Subject: [PATCH] Hugepage consolidation

A lot of the code in arch/*/mm/hugetlbpage.c is quite similar.  This patch
attempts to consolidate a lot of the code across the arches, putting the
combined version in mm/hugetlb.c.  There are a couple of uglyish hacks in
order to convert all the hugepage arches, but the result is a very large
reduction in the total amount of code.  It also means things like hugepage
lazy allocation could be implemented in one place, instead of six.

Tested, at least a little, on ppc64, i386 and x86_64.

Notes:
 - this patch changes the meaning of set_huge_pte() to be more analogous
   to set_pte()
 - does SH4 need a special huge_ptep_get_and_clear()?

Acked-by: William Lee Irwin
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 arch/ppc64/mm/hugetlbpage.c | 180 +-------------------------------------------
 1 file changed, 2 insertions(+), 178 deletions(-)

(limited to 'arch/ppc64/mm')

diff --git a/arch/ppc64/mm/hugetlbpage.c b/arch/ppc64/mm/hugetlbpage.c
index d3bf86a5c1a..b4ab766f598 100644
--- a/arch/ppc64/mm/hugetlbpage.c
+++ b/arch/ppc64/mm/hugetlbpage.c
@@ -121,7 +121,7 @@ static pte_t *hugepte_alloc(struct mm_struct *mm, pud_t *dir, unsigned long addr
 	return hugepte_offset(dir, addr);
 }
 
-static pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
+pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 {
 	pud_t *pud;
 
@@ -134,7 +134,7 @@ static pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 	return hugepte_offset(pud, addr);
 }
 
-static pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
+pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
 {
 	pud_t *pud;
 
@@ -147,25 +147,6 @@ static pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
 	return hugepte_alloc(mm, pud, addr);
 }
 
-static void set_huge_pte(struct mm_struct *mm, struct vm_area_struct *vma,
-			 unsigned long addr, struct page *page,
-			 pte_t *ptep, int write_access)
-{
-	pte_t entry;
-
-	add_mm_counter(mm, rss, HPAGE_SIZE / PAGE_SIZE);
-	if (write_access) {
-		entry =
-		    pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
-	} else {
-		entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
-	}
-	entry = pte_mkyoung(entry);
-	entry = pte_mkhuge(entry);
-
-	set_pte_at(mm, addr, ptep, entry);
-}
-
 /*
  * This function checks for proper alignment of input addr and len parameters.
  */
@@ -259,80 +240,6 @@ int prepare_hugepage_range(unsigned long addr, unsigned long len)
 	return -EINVAL;
 }
 
-int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
-			    struct vm_area_struct *vma)
-{
-	pte_t *src_pte, *dst_pte, entry;
-	struct page *ptepage;
-	unsigned long addr = vma->vm_start;
-	unsigned long end = vma->vm_end;
-	int err = -ENOMEM;
-
-	while (addr < end) {
-		dst_pte = huge_pte_alloc(dst, addr);
-		if (!dst_pte)
-			goto out;
-
-		src_pte = huge_pte_offset(src, addr);
-		entry = *src_pte;
-
-		ptepage = pte_page(entry);
-		get_page(ptepage);
-		add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE);
-		set_pte_at(dst, addr, dst_pte, entry);
-
-		addr += HPAGE_SIZE;
-	}
-
-	err = 0;
- out:
-	return err;
-}
-
-int
-follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
-		    struct page **pages, struct vm_area_struct **vmas,
-		    unsigned long *position, int *length, int i)
-{
-	unsigned long vpfn, vaddr = *position;
-	int remainder = *length;
-
-	WARN_ON(!is_vm_hugetlb_page(vma));
-
-	vpfn = vaddr/PAGE_SIZE;
-	while (vaddr < vma->vm_end && remainder) {
-		if (pages) {
-			pte_t *pte;
-			struct page *page;
-
-			pte = huge_pte_offset(mm, vaddr);
-
-			/* hugetlb should be locked, and hence, prefaulted */
-			WARN_ON(!pte || pte_none(*pte));
-
-			page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
-
-			WARN_ON(!PageCompound(page));
-
-			get_page(page);
-			pages[i] = page;
-		}
-
-		if (vmas)
-			vmas[i] = vma;
-
-		vaddr += PAGE_SIZE;
-		++vpfn;
-		--remainder;
-		++i;
-	}
-
-	*length = remainder;
-	*position = vaddr;
-
-	return i;
-}
-
 struct page *
 follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
 {
@@ -363,89 +270,6 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 	return NULL;
 }
 
-void unmap_hugepage_range(struct vm_area_struct *vma,
-			  unsigned long start, unsigned long end)
-{
-	struct mm_struct *mm = vma->vm_mm;
-	unsigned long addr;
-	pte_t *ptep;
-	struct page *page;
-
-	WARN_ON(!is_vm_hugetlb_page(vma));
-	BUG_ON((start % HPAGE_SIZE) != 0);
-	BUG_ON((end % HPAGE_SIZE) != 0);
-
-	for (addr = start; addr < end; addr += HPAGE_SIZE) {
-		pte_t pte;
-
-		ptep = huge_pte_offset(mm, addr);
-		if (!ptep || pte_none(*ptep))
-			continue;
-
-		pte = *ptep;
-		page = pte_page(pte);
-		pte_clear(mm, addr, ptep);
-
-		put_page(page);
-	}
-	add_mm_counter(mm, rss, -((end - start) >> PAGE_SHIFT));
-	flush_tlb_pending();
-}
-
-int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
-{
-	struct mm_struct *mm = current->mm;
-	unsigned long addr;
-	int ret = 0;
-
-	WARN_ON(!is_vm_hugetlb_page(vma));
-	BUG_ON((vma->vm_start % HPAGE_SIZE) != 0);
-	BUG_ON((vma->vm_end % HPAGE_SIZE) != 0);
-
-	spin_lock(&mm->page_table_lock);
-	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
-		unsigned long idx;
-		pte_t *pte = huge_pte_alloc(mm, addr);
-		struct page *page;
-
-		if (!pte) {
-			ret = -ENOMEM;
-			goto out;
-		}
-		if (! pte_none(*pte))
-			continue;
-
-		idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
-			+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
-		page = find_get_page(mapping, idx);
-		if (!page) {
-			/* charge the fs quota first */
-			if (hugetlb_get_quota(mapping)) {
-				ret = -ENOMEM;
-				goto out;
-			}
-			page = alloc_huge_page();
-			if (!page) {
-				hugetlb_put_quota(mapping);
-				ret = -ENOMEM;
-				goto out;
-			}
-			ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC);
-			if (! ret) {
-				unlock_page(page);
-			} else {
-				hugetlb_put_quota(mapping);
-				free_huge_page(page);
-				goto out;
-			}
-		}
-		set_huge_pte(mm, vma, addr, page, pte, vma->vm_flags & VM_WRITE);
-	}
-out:
-	spin_unlock(&mm->page_table_lock);
-	return ret;
-}
-
 /* Because we have an exclusive hugepage region which lies within the
  * normal user address space, we have to take special measures to make
  * non-huge mmap()s evade the hugepage reserved regions. */
--
cgit v1.2.3-70-g09d2

From 1363c3cd8603a913a27e2995dccbd70d5312d8e6 Mon Sep 17 00:00:00 2001
From: Wolfgang Wander
Date: Tue, 21 Jun 2005 17:14:49 -0700
Subject: [PATCH] Avoiding mmap fragmentation

Ingo recently introduced a great speedup for allocating new mmaps using the
free_area_cache pointer which boosts the specweb SSL benchmark by 4-5% and
causes huge performance increases in thread creation.

The downside of this patch is that it does lead to fragmentation in the
mmap-ed areas (visible via /proc/self/maps), such that some applications
that work fine under 2.4 kernels quickly run out of memory on any 2.6
kernel.

The problem is twofold:

1) the free_area_cache is used to continue a search for memory where
   the last search ended.  Before the change new areas were always
   searched from the base address on.

   So now new small areas are cluttering holes of all sizes
   throughout the whole mmap-able region whereas before small holes
   tended to close holes near the base leaving holes far from the base
   large and available for larger requests.

2) the free_area_cache also is set to the location of the last
   munmap-ed area so in scenarios where we allocate e.g. five regions of
   1K each, then free regions 4 2 3 in this order the next request for
   1K will be placed in the position of the old region 3, whereas before
   we appended it to the still active region 1, placing it at the
   location of the old region 2.  Before we had 1 free region of 2K, now
   we only get two free regions of 1K -> fragmentation.

The patch addresses these issues by introducing yet another cache descriptor
cached_hole_size that contains the largest known hole size below the current
free_area_cache.  If a new request comes in the size is compared against the
cached_hole_size and if the request can be filled with a hole below
free_area_cache the search is started from the base instead.

The results look promising: whereas 2.6.12-rc4 fragments quickly and my
(earlier posted) leakme.c test program terminates after 50000+ iterations
with 96 distinct and fragmented maps in /proc/self/maps, it performs nicely
(as expected) with thread creation: Ingo's test_str02 with 20000 threads
requires 0.7s system time.

Taking out Ingo's patch (un-patch available per request) by basically
deleting all mentions of free_area_cache from the kernel and starting the
search for new memory always at the respective bases we observe: leakme
terminates successfully with 11 distinct, hardly fragmented areas in
/proc/self/maps but thread creation is grindingly slow: 30+s(!) system time
for Ingo's test_str02 with 20000 threads.

Now - drumroll ;-) the appended patch works fine with leakme: it ends with
only 7 distinct areas in /proc/self/maps and also thread creation seems
sufficiently fast with 0.71s for 20000 threads.
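
In outline, the bottom-up search ends up looking like this.  This is a
condensed sketch of the arch_get_unmapped_area() change to mm/mmap.c in the
diff below, not the verbatim kernel code; declarations, alignment and the
-ENOMEM fallback are omitted:

	/* Decide where to start scanning for a free range of len bytes. */
	if (len > mm->cached_hole_size) {
		/* No hole below free_area_cache can fit this request, so
		 * resume the previous search instead of rescanning
		 * known-full address space. */
		start_addr = addr = mm->free_area_cache;
	} else {
		/* A hole below free_area_cache may fit: restart from the
		 * base so small requests fill holes near the base first. */
		start_addr = addr = TASK_UNMAPPED_BASE;
		mm->cached_hole_size = 0;
	}

	for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
		if (!vma || addr + len <= vma->vm_start) {
			/* Found a fit: cache where the next search starts. */
			mm->free_area_cache = addr + len;
			return addr;
		}
		/* Remember the largest hole skipped over during this scan. */
		if (addr + mm->cached_hole_size < vma->vm_start)
			mm->cached_hole_size = vma->vm_start - addr;
		addr = vma->vm_end;
	}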
Signed-off-by: Wolfgang Wander
Credit-to: "Richard Purdie"
Signed-off-by: Ken Chen
Acked-by: Ingo Molnar (partly)
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 arch/arm/mm/mmap.c              | 10 +++++++-
 arch/i386/mm/hugetlbpage.c      | 34 ++++++++++++++++++++++----
 arch/ppc64/mm/hugetlbpage.c     | 34 ++++++++++++++++++++++----
 arch/sh/kernel/sys_sh.c         |  8 +++++++
 arch/sparc64/kernel/sys_sparc.c |  8 +++++++
 arch/x86_64/ia32/ia32_aout.c    |  1 +
 arch/x86_64/kernel/sys_x86_64.c |  9 +++++++
 fs/binfmt_aout.c                |  1 +
 fs/binfmt_elf.c                 |  1 +
 fs/hugetlbfs/inode.c            |  3 +++
 include/linux/sched.h           | 11 +++++----
 kernel/fork.c                   |  2 ++
 mm/mmap.c                       | 53 +++++++++++++++++++++++++++++++----------
 mm/nommu.c                      |  2 +-
 14 files changed, 147 insertions(+), 30 deletions(-)

(limited to 'arch/ppc64/mm')

diff --git a/arch/arm/mm/mmap.c b/arch/arm/mm/mmap.c
index 32c4b0e35b3..3de7f84b53c 100644
--- a/arch/arm/mm/mmap.c
+++ b/arch/arm/mm/mmap.c
@@ -73,7 +73,12 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
 	    (!vma || addr + len <= vma->vm_start))
 		return addr;
 	}
-	start_addr = addr = mm->free_area_cache;
+	if (len > mm->cached_hole_size) {
+		start_addr = addr = mm->free_area_cache;
+	} else {
+		start_addr = addr = TASK_UNMAPPED_BASE;
+		mm->cached_hole_size = 0;
+	}
 
 full_search:
 	if (do_align)
@@ -90,6 +95,7 @@ full_search:
 		 */
 		if (start_addr != TASK_UNMAPPED_BASE) {
 			start_addr = addr = TASK_UNMAPPED_BASE;
+			mm->cached_hole_size = 0;
 			goto full_search;
 		}
 		return -ENOMEM;
@@ -101,6 +107,8 @@ full_search:
 			mm->free_area_cache = addr + len;
 			return addr;
 		}
+		if (addr + mm->cached_hole_size < vma->vm_start)
+			mm->cached_hole_size = vma->vm_start - addr;
 		addr = vma->vm_end;
 		if (do_align)
 			addr = COLOUR_ALIGN(addr, pgoff);
diff --git a/arch/i386/mm/hugetlbpage.c b/arch/i386/mm/hugetlbpage.c
index 5aa06001a4b..3b099f32b94 100644
--- a/arch/i386/mm/hugetlbpage.c
+++ b/arch/i386/mm/hugetlbpage.c
@@ -140,7 +140,12 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
 	struct vm_area_struct *vma;
 	unsigned long start_addr;
 
-	start_addr = mm->free_area_cache;
+	if (len > mm->cached_hole_size) {
+		start_addr = mm->free_area_cache;
+	} else {
+		start_addr = TASK_UNMAPPED_BASE;
+		mm->cached_hole_size = 0;
+	}
 
 full_search:
 	addr = ALIGN(start_addr, HPAGE_SIZE);
@@ -154,6 +159,7 @@ full_search:
 		 */
 		if (start_addr != TASK_UNMAPPED_BASE) {
 			start_addr = TASK_UNMAPPED_BASE;
+			mm->cached_hole_size = 0;
 			goto full_search;
 		}
 		return -ENOMEM;
@@ -162,6 +168,8 @@ full_search:
 			mm->free_area_cache = addr + len;
 			return addr;
 		}
+		if (addr + mm->cached_hole_size < vma->vm_start)
+			mm->cached_hole_size = vma->vm_start - addr;
 		addr = ALIGN(vma->vm_end, HPAGE_SIZE);
 	}
 }
@@ -173,12 +181,17 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma, *prev_vma;
 	unsigned long base = mm->mmap_base, addr = addr0;
+	unsigned long largest_hole = mm->cached_hole_size;
 	int first_time = 1;
 
 	/* don't allow allocations above current base */
 	if (mm->free_area_cache > base)
 		mm->free_area_cache = base;
 
+	if (len <= largest_hole) {
+		largest_hole = 0;
+		mm->free_area_cache = base;
+	}
 try_again:
 	/* make sure it can fit in the remaining address space */
 	if (mm->free_area_cache < len)
@@ -199,13 +212,21 @@ try_again:
 	 * vma->vm_start, use it:
 	 */
 	if (addr + len <= vma->vm_start &&
-			(!prev_vma || (addr >= prev_vma->vm_end)))
+			(!prev_vma || (addr >= prev_vma->vm_end))) {
 		/* remember the address as a hint for next time */
-		return (mm->free_area_cache = addr);
-	else
+		mm->cached_hole_size = largest_hole;
+		return (mm->free_area_cache = addr);
+	} else {
 		/* pull free_area_cache down to the first hole */
-		if (mm->free_area_cache == vma->vm_end)
+		if (mm->free_area_cache == vma->vm_end) {
 			mm->free_area_cache = vma->vm_start;
+			mm->cached_hole_size = largest_hole;
+		}
+	}
+
+	/* remember the largest hole we saw so far */
+	if (addr + largest_hole < vma->vm_start)
+		largest_hole = vma->vm_start - addr;
 
 	/* try just below the current vma->vm_start */
 	addr = (vma->vm_start - len) & HPAGE_MASK;
@@ -218,6 +239,7 @@ fail:
 	 */
 	if (first_time) {
 		mm->free_area_cache = base;
+		largest_hole = 0;
 		first_time = 0;
 		goto try_again;
 	}
@@ -228,6 +250,7 @@ fail:
 	 * allocations.
 	 */
 	mm->free_area_cache = TASK_UNMAPPED_BASE;
+	mm->cached_hole_size = ~0UL;
 	addr = hugetlb_get_unmapped_area_bottomup(file, addr0,
 			len, pgoff, flags);
 
 	/*
 	 * Restore the topdown base:
 	 */
 	mm->free_area_cache = base;
+	mm->cached_hole_size = ~0UL;
 
 	return addr;
 }
diff --git a/arch/ppc64/mm/hugetlbpage.c b/arch/ppc64/mm/hugetlbpage.c
index b4ab766f598..fdcfe97c75c 100644
--- a/arch/ppc64/mm/hugetlbpage.c
+++ b/arch/ppc64/mm/hugetlbpage.c
@@ -292,7 +292,12 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr,
 		    && !is_hugepage_only_range(mm, addr,len))
 			return addr;
 	}
-	start_addr = addr = mm->free_area_cache;
+	if (len > mm->cached_hole_size) {
+		start_addr = addr = mm->free_area_cache;
+	} else {
+		start_addr = addr = TASK_UNMAPPED_BASE;
+		mm->cached_hole_size = 0;
+	}
 
 full_search:
 	vma = find_vma(mm, addr);
@@ -316,6 +321,8 @@ full_search:
 			mm->free_area_cache = addr + len;
 			return addr;
 		}
+		if (addr + mm->cached_hole_size < vma->vm_start)
+			mm->cached_hole_size = vma->vm_start - addr;
 		addr = vma->vm_end;
 		vma = vma->vm_next;
 	}
@@ -323,6 +330,7 @@ full_search:
 	/* Make sure we didn't miss any holes */
 	if (start_addr != TASK_UNMAPPED_BASE) {
 		start_addr = addr = TASK_UNMAPPED_BASE;
+		mm->cached_hole_size = 0;
 		goto full_search;
 	}
 	return -ENOMEM;
@@ -344,6 +352,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
 	struct vm_area_struct *vma, *prev_vma;
 	struct mm_struct *mm = current->mm;
 	unsigned long base = mm->mmap_base, addr = addr0;
+	unsigned long largest_hole = mm->cached_hole_size;
 	int first_time = 1;
 
 	/* requested length too big for entire address space */
@@ -364,6 +373,10 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
 		return addr;
 	}
 
+	if (len <= largest_hole) {
+		largest_hole = 0;
+		mm->free_area_cache = base;
+	}
 try_again:
 	/* make sure it can fit in the remaining address space */
 	if (mm->free_area_cache < len)
@@ -392,13 +405,21 @@ hugepage_recheck:
 	 * vma->vm_start, use it:
 	 */
 	if (addr+len <= vma->vm_start &&
-		(!prev_vma || (addr >= prev_vma->vm_end)))
+		(!prev_vma || (addr >= prev_vma->vm_end))) {
 		/* remember the address as a hint for next time */
-		return (mm->free_area_cache = addr);
-	else
+		mm->cached_hole_size = largest_hole;
+		return (mm->free_area_cache = addr);
+	} else {
 		/* pull free_area_cache down to the first hole */
-		if (mm->free_area_cache == vma->vm_end)
+		if (mm->free_area_cache == vma->vm_end) {
 			mm->free_area_cache = vma->vm_start;
+			mm->cached_hole_size = largest_hole;
+		}
+	}
+
+	/* remember the largest hole we saw so far */
+	if (addr + largest_hole < vma->vm_start)
+		largest_hole = vma->vm_start - addr;
 
 	/* try just below the current vma->vm_start */
 	addr = vma->vm_start-len;
@@ -411,6 +432,7 @@ fail:
 	 */
 	if (first_time) {
 		mm->free_area_cache = base;
+		largest_hole = 0;
 		first_time = 0;
 		goto try_again;
 	}
@@ -421,11 +443,13 @@ fail:
 	 * allocations.
 	 */
 	mm->free_area_cache = TASK_UNMAPPED_BASE;
+	mm->cached_hole_size = ~0UL;
 	addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
 
 	/*
 	 * Restore the topdown base:
 	 */
 	mm->free_area_cache = base;
+	mm->cached_hole_size = ~0UL;
 
 	return addr;
 }
diff --git a/arch/sh/kernel/sys_sh.c b/arch/sh/kernel/sys_sh.c
index df5ac294c37..917b2f32f26 100644
--- a/arch/sh/kernel/sys_sh.c
+++ b/arch/sh/kernel/sys_sh.c
@@ -79,6 +79,10 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr,
 	    (!vma || addr + len <= vma->vm_start))
 		return addr;
 	}
+	if (len <= mm->cached_hole_size) {
+		mm->cached_hole_size = 0;
+		mm->free_area_cache = TASK_UNMAPPED_BASE;
+	}
 	if (flags & MAP_PRIVATE)
 		addr = PAGE_ALIGN(mm->free_area_cache);
 	else
@@ -95,6 +99,7 @@ full_search:
 		 */
 		if (start_addr != TASK_UNMAPPED_BASE) {
 			start_addr = addr = TASK_UNMAPPED_BASE;
+			mm->cached_hole_size = 0;
 			goto full_search;
 		}
 		return -ENOMEM;
@@ -106,6 +111,9 @@ full_search:
 			mm->free_area_cache = addr + len;
 			return addr;
 		}
+		if (addr + mm->cached_hole_size < vma->vm_start)
+			mm->cached_hole_size = vma->vm_start - addr;
+
 		addr = vma->vm_end;
 		if (!(flags & MAP_PRIVATE))
 			addr = COLOUR_ALIGN(addr);
diff --git a/arch/sparc64/kernel/sys_sparc.c b/arch/sparc64/kernel/sys_sparc.c
index 0077f02f4b3..5f8c822a2b4 100644
--- a/arch/sparc64/kernel/sys_sparc.c
+++ b/arch/sparc64/kernel/sys_sparc.c
@@ -84,6 +84,10 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsi
 		return addr;
 	}
 
+	if (len <= mm->cached_hole_size) {
+		mm->cached_hole_size = 0;
+		mm->free_area_cache = TASK_UNMAPPED_BASE;
+	}
 	start_addr = addr = mm->free_area_cache;
 
 	task_size -= len;
@@ -103,6 +107,7 @@ full_search:
 		if (task_size < addr) {
 			if (start_addr != TASK_UNMAPPED_BASE) {
 				start_addr = addr = TASK_UNMAPPED_BASE;
+				mm->cached_hole_size = 0;
 				goto full_search;
 			}
 			return -ENOMEM;
@@ -114,6 +119,9 @@ full_search:
 			mm->free_area_cache = addr + len;
 			return addr;
 		}
+		if (addr + mm->cached_hole_size < vma->vm_start)
+			mm->cached_hole_size = vma->vm_start - addr;
+
 		addr = vma->vm_end;
 		if (do_color_align)
 			addr = COLOUR_ALIGN(addr, pgoff);
diff --git a/arch/x86_64/ia32/ia32_aout.c b/arch/x86_64/ia32/ia32_aout.c
index 1965efc974d..c12edf5d97f 100644
--- a/arch/x86_64/ia32/ia32_aout.c
+++ b/arch/x86_64/ia32/ia32_aout.c
@@ -312,6 +312,7 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	current->mm->brk = ex.a_bss +
 		(current->mm->start_brk = N_BSSADDR(ex));
 	current->mm->free_area_cache = TASK_UNMAPPED_BASE;
+	current->mm->cached_hole_size = 0;
 
 	set_mm_counter(current->mm, rss, 0);
 	current->mm->mmap = NULL;
diff --git a/arch/x86_64/kernel/sys_x86_64.c b/arch/x86_64/kernel/sys_x86_64.c
index d9798dd433f..cc7821c6885 100644
--- a/arch/x86_64/kernel/sys_x86_64.c
+++ b/arch/x86_64/kernel/sys_x86_64.c
@@ -105,6 +105,11 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
 	    (!vma || addr + len <= vma->vm_start))
 		return addr;
 	}
+	if (((flags & MAP_32BIT) || test_thread_flag(TIF_IA32))
+	    && len <= mm->cached_hole_size) {
+		mm->cached_hole_size = 0;
+		mm->free_area_cache = begin;
+	}
 	addr = mm->free_area_cache;
 	if (addr < begin)
 		addr = begin;
@@ -120,6 +125,7 @@ full_search:
 		 */
 		if (start_addr != begin) {
 			start_addr = addr = begin;
+			mm->cached_hole_size = 0;
 			goto full_search;
 		}
 		return -ENOMEM;
@@ -131,6 +137,9 @@ full_search:
 			mm->free_area_cache = addr + len;
 			return addr;
 		}
+		if (addr + mm->cached_hole_size < vma->vm_start)
+			mm->cached_hole_size = vma->vm_start - addr;
+
 		addr = vma->vm_end;
 	}
 }
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index 009b8920c1f..dd9baabaf01 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -316,6 +316,7 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	current->mm->brk = ex.a_bss +
 		(current->mm->start_brk = N_BSSADDR(ex));
 	current->mm->free_area_cache = current->mm->mmap_base;
+	current->mm->cached_hole_size = 0;
 
 	set_mm_counter(current->mm, rss, 0);
 	current->mm->mmap = NULL;
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index f8f6b6b7617..7976a238f0a 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -775,6 +775,7 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	   change some of these later */
 	set_mm_counter(current->mm, rss, 0);
 	current->mm->free_area_cache = current->mm->mmap_base;
+	current->mm->cached_hole_size = 0;
 	retval = setup_arg_pages(bprm, randomize_stack_top(STACK_TOP),
 				 executable_stack);
 	if (retval < 0) {
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 2af3338f891..3a9b6d179cb 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -122,6 +122,9 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 
 	start_addr = mm->free_area_cache;
 
+	if (len <= mm->cached_hole_size)
+		start_addr = TASK_UNMAPPED_BASE;
+
 full_search:
 	addr = ALIGN(start_addr, HPAGE_SIZE);
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4dbb109022f..b58afd97a18 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -201,8 +201,8 @@ extern unsigned long
 arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
 			  unsigned long len, unsigned long pgoff,
 			  unsigned long flags);
-extern void arch_unmap_area(struct vm_area_struct *area);
-extern void arch_unmap_area_topdown(struct vm_area_struct *area);
+extern void arch_unmap_area(struct mm_struct *, unsigned long);
+extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);
 
 #define set_mm_counter(mm, member, value) (mm)->_##member = (value)
 #define get_mm_counter(mm, member) ((mm)->_##member)
@@ -218,9 +218,10 @@ struct mm_struct {
 	unsigned long (*get_unmapped_area) (struct file *filp,
 				unsigned long addr, unsigned long len,
 				unsigned long pgoff, unsigned long flags);
-	void (*unmap_area) (struct vm_area_struct *area);
-	unsigned long mmap_base;	/* base of mmap area */
-	unsigned long free_area_cache;	/* first hole */
+	void (*unmap_area) (struct mm_struct *mm, unsigned long addr);
+	unsigned long mmap_base;	/* base of mmap area */
+	unsigned long cached_hole_size;	/* if non-zero, the largest hole below free_area_cache */
+	unsigned long free_area_cache;	/* first hole of size cached_hole_size or larger */
 	pgd_t * pgd;
 	atomic_t mm_users;	/* How many users with user space? */
 	atomic_t mm_count;	/* How many references to "struct mm_struct" (users count as 1) */
diff --git a/kernel/fork.c b/kernel/fork.c
index f42a17f8869..876b31cd822 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -194,6 +194,7 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm)
 	mm->mmap = NULL;
 	mm->mmap_cache = NULL;
 	mm->free_area_cache = oldmm->mmap_base;
+	mm->cached_hole_size = ~0UL;
 	mm->map_count = 0;
 	set_mm_counter(mm, rss, 0);
 	set_mm_counter(mm, anon_rss, 0);
@@ -322,6 +323,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm)
 	mm->ioctx_list = NULL;
 	mm->default_kioctx = (struct kioctx)INIT_KIOCTX(mm->default_kioctx, *mm);
 	mm->free_area_cache = TASK_UNMAPPED_BASE;
+	mm->cached_hole_size = ~0UL;
 
 	if (likely(!mm_alloc_pgd(mm))) {
 		mm->def_flags = 0;
diff --git a/mm/mmap.c b/mm/mmap.c
index de54acd9942..9da23c1ef9d 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1175,7 +1175,12 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
 	    (!vma || addr + len <= vma->vm_start))
 		return addr;
 	}
-	start_addr = addr = mm->free_area_cache;
+	if (len > mm->cached_hole_size) {
+		start_addr = addr = mm->free_area_cache;
+	} else {
+		start_addr = addr = TASK_UNMAPPED_BASE;
+		mm->cached_hole_size = 0;
+	}
 
 full_search:
 	for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
@@ -1186,7 +1191,9 @@ full_search:
 			 * some holes.
 			 */
 			if (start_addr != TASK_UNMAPPED_BASE) {
-				start_addr = addr = TASK_UNMAPPED_BASE;
+				addr = TASK_UNMAPPED_BASE;
+				start_addr = addr;
+				mm->cached_hole_size = 0;
 				goto full_search;
 			}
 			return -ENOMEM;
@@ -1198,19 +1205,22 @@ full_search:
 			mm->free_area_cache = addr + len;
 			return addr;
 		}
+		if (addr + mm->cached_hole_size < vma->vm_start)
+			mm->cached_hole_size = vma->vm_start - addr;
 		addr = vma->vm_end;
 	}
 }
 #endif
 
-void arch_unmap_area(struct vm_area_struct *area)
+void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
 {
 	/*
 	 * Is this a new hole at the lowest possible address?
 	 */
-	if (area->vm_start >= TASK_UNMAPPED_BASE &&
-			area->vm_start < area->vm_mm->free_area_cache)
-		area->vm_mm->free_area_cache = area->vm_start;
+	if (addr >= TASK_UNMAPPED_BASE && addr < mm->free_area_cache) {
+		mm->free_area_cache = addr;
+		mm->cached_hole_size = ~0UL;
+	}
 }
 
 /*
@@ -1240,6 +1250,12 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
 		return addr;
 	}
 
+	/* check if free_area_cache is useful for us */
+	if (len <= mm->cached_hole_size) {
+		mm->cached_hole_size = 0;
+		mm->free_area_cache = mm->mmap_base;
+	}
+
 	/* either no address requested or can't fit in requested address hole */
 	addr = mm->free_area_cache;
 
@@ -1264,6 +1280,10 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
 			/* remember the address as a hint for next time */
 			return (mm->free_area_cache = addr);
 
+		/* remember the largest hole we saw so far */
+		if (addr + mm->cached_hole_size < vma->vm_start)
+			mm->cached_hole_size = vma->vm_start - addr;
+
 		/* try just below the current vma->vm_start */
 		addr = vma->vm_start-len;
 	} while (len < vma->vm_start);
@@ -1274,28 +1294,30 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
 	 * can happen with large stack limits and large mmap()
 	 * allocations.
 	 */
-	mm->free_area_cache = TASK_UNMAPPED_BASE;
+	mm->cached_hole_size = ~0UL;
+	mm->free_area_cache = TASK_UNMAPPED_BASE;
 	addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
 
 	/*
 	 * Restore the topdown base:
 	 */
 	mm->free_area_cache = mm->mmap_base;
+	mm->cached_hole_size = ~0UL;
 
 	return addr;
 }
 #endif
 
-void arch_unmap_area_topdown(struct vm_area_struct *area)
+void arch_unmap_area_topdown(struct mm_struct *mm, unsigned long addr)
 {
 	/*
 	 * Is this a new hole at the highest possible address?
 	 */
-	if (area->vm_end > area->vm_mm->free_area_cache)
-		area->vm_mm->free_area_cache = area->vm_end;
+	if (addr > mm->free_area_cache)
+		mm->free_area_cache = addr;
 
 	/* dont allow allocations above current base */
-	if (area->vm_mm->free_area_cache > area->vm_mm->mmap_base)
-		area->vm_mm->free_area_cache = area->vm_mm->mmap_base;
+	if (mm->free_area_cache > mm->mmap_base)
+		mm->free_area_cache = mm->mmap_base;
 }
 
 unsigned long
@@ -1595,7 +1617,6 @@ static void unmap_vma(struct mm_struct *mm, struct vm_area_struct *area)
 	if (area->vm_flags & VM_LOCKED)
 		area->vm_mm->locked_vm -= len >> PAGE_SHIFT;
 	vm_stat_unaccount(area);
-	area->vm_mm->unmap_area(area);
 	remove_vm_struct(area);
 }
 
@@ -1649,6 +1670,7 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
 {
 	struct vm_area_struct **insertion_point;
 	struct vm_area_struct *tail_vma = NULL;
+	unsigned long addr;
 
 	insertion_point = (prev ? &prev->vm_next : &mm->mmap);
 	do {
@@ -1659,6 +1681,11 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
 	} while (vma && vma->vm_start < end);
 	*insertion_point = vma;
 	tail_vma->vm_next = NULL;
+	if (mm->unmap_area == arch_unmap_area)
+		addr = prev ? prev->vm_end : mm->mmap_base;
+	else
+		addr = vma ? vma->vm_start : mm->mmap_base;
+	mm->unmap_area(mm, addr);
 	mm->mmap_cache = NULL;		/* Kill the cache. */
 }
 
diff --git a/mm/nommu.c b/mm/nommu.c
index c53e9c8f6b4..ce74452c02d 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1067,7 +1067,7 @@ unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr,
 	return -ENOMEM;
 }
 
-void arch_unmap_area(struct vm_area_struct *area)
+void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
 {
 }
--
cgit v1.2.3-70-g09d2

From 20cee16ced631f70a62c97bdebae08a1c9470448 Mon Sep 17 00:00:00 2001
From: David Gibson
Date: Tue, 21 Jun 2005 17:15:31 -0700
Subject: [PATCH] ppc64: Abolish ioremap_mm

Currently ppc64 has two mm_structs for the kernel, init_mm and also
ioremap_mm.  The latter really isn't necessary: this patch abolishes it,
instead restricting vmallocs to the lower 1TB of the init_mm's range and
placing io mappings in the upper 1TB.  This simplifies the code in a
number of places and eliminates an unnecessary set of pagetables.  It
also tweaks the unmap/free path a little, allowing us to remove the
unmap_im_area() set of page table walkers, replacing them with
unmap_vm_area().
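
The resulting split of the 0xD region is worth spelling out.  This is a
sketch assembled from the header changes in the diff below, not new code
(EADDR_MASK is the pre-existing region-size mask):

	/* All of region 0xD is now owned by init_mm. */
	#define VMALLOC_START	0xD000000000000000ul
	#define VMALLOC_SIZE	0x10000000000UL		/* lower 1TB: vmalloc */
	#define VMALLOC_END	(VMALLOC_START + VMALLOC_SIZE)

	/* io mappings live above the 1TB mark, in the rest of the region. */
	#define PHBS_IO_BASE	VMALLOC_END
	#define IMALLOC_BASE	(PHBS_IO_BASE + 0x80000000ul)	/* 2GB for PHBs */
	#define IMALLOC_END	(VMALLOC_START + EADDR_MASK)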
Signed-off-by: David Gibson
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 arch/ppc64/kernel/eeh.c       |  2 +-
 arch/ppc64/kernel/head.S      |  4 --
 arch/ppc64/kernel/process.c   |  8 ----
 arch/ppc64/mm/hash_utils.c    |  4 --
 arch/ppc64/mm/imalloc.c       | 20 ++++++----
 arch/ppc64/mm/init.c          | 93 +++++--------------------------------------
 include/asm-ppc64/imalloc.h   | 12 +++---
 include/asm-ppc64/page.h      |  2 -
 include/asm-ppc64/pgtable.h   |  9 +----
 include/asm-ppc64/processor.h | 10 -----
 10 files changed, 31 insertions(+), 133 deletions(-)

(limited to 'arch/ppc64/mm')

diff --git a/arch/ppc64/kernel/eeh.c b/arch/ppc64/kernel/eeh.c
index d63d41f3eec..af5272fedad 100644
--- a/arch/ppc64/kernel/eeh.c
+++ b/arch/ppc64/kernel/eeh.c
@@ -505,7 +505,7 @@ static inline unsigned long eeh_token_to_phys(unsigned long token)
 	pte_t *ptep;
 	unsigned long pa;
 
-	ptep = find_linux_pte(ioremap_mm.pgd, token);
+	ptep = find_linux_pte(init_mm.pgd, token);
 	if (!ptep)
 		return token;
 	pa = pte_pfn(*ptep) << PAGE_SHIFT;
diff --git a/arch/ppc64/kernel/head.S b/arch/ppc64/kernel/head.S
index 346dbf606b5..02c8f4e3e4b 100644
--- a/arch/ppc64/kernel/head.S
+++ b/arch/ppc64/kernel/head.S
@@ -2121,10 +2121,6 @@ empty_zero_page:
 swapper_pg_dir:
 	.space	4096
 
-	.globl	ioremap_dir
-ioremap_dir:
-	.space	4096
-
 #ifdef CONFIG_SMP
 /* 1 page segment table per cpu (max 48, cpu0 allocated at STAB0_PHYS_ADDR) */
 	.globl	stab_array
diff --git a/arch/ppc64/kernel/process.c b/arch/ppc64/kernel/process.c
index cdfecbeb331..aba89554d89 100644
--- a/arch/ppc64/kernel/process.c
+++ b/arch/ppc64/kernel/process.c
@@ -58,14 +58,6 @@ struct task_struct *last_task_used_math = NULL;
 struct task_struct *last_task_used_altivec = NULL;
 #endif
 
-struct mm_struct ioremap_mm = {
-	.pgd		= ioremap_dir,
-	.mm_users	= ATOMIC_INIT(2),
-	.mm_count	= ATOMIC_INIT(1),
-	.cpu_vm_mask	= CPU_MASK_ALL,
-	.page_table_lock = SPIN_LOCK_UNLOCKED,
-};
-
 /*
  * Make sure the floating-point register state in the
  * the thread_struct is up to date for task tsk.
diff --git a/arch/ppc64/mm/hash_utils.c b/arch/ppc64/mm/hash_utils.c
index 0a0f97008d0..87d0525f56f 100644
--- a/arch/ppc64/mm/hash_utils.c
+++ b/arch/ppc64/mm/hash_utils.c
@@ -310,10 +310,6 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
 		vsid = get_vsid(mm->context.id, ea);
 		break;
 
-	case IO_REGION_ID:
-		mm = &ioremap_mm;
-		vsid = get_kernel_vsid(ea);
-		break;
 	case VMALLOC_REGION_ID:
 		mm = &init_mm;
 		vsid = get_kernel_vsid(ea);
diff --git a/arch/ppc64/mm/imalloc.c b/arch/ppc64/mm/imalloc.c
index cb8727f3267..b6e75b891ac 100644
--- a/arch/ppc64/mm/imalloc.c
+++ b/arch/ppc64/mm/imalloc.c
@@ -15,6 +15,7 @@
 #include 
 #include 
 #include 
+#include 
 
 static DECLARE_MUTEX(imlist_sem);
 struct vm_struct * imlist = NULL;
@@ -285,29 +286,32 @@ struct vm_struct * im_get_area(unsigned long v_addr, unsigned long size,
 	return area;
 }
 
-unsigned long im_free(void * addr)
+void im_free(void * addr)
 {
 	struct vm_struct **p, *tmp;
-	unsigned long ret_size = 0;
 
 	if (!addr)
-		return ret_size;
-	if ((PAGE_SIZE-1) & (unsigned long) addr) {
+		return;
+	if ((unsigned long) addr & ~PAGE_MASK) {
 		printk(KERN_ERR "Trying to %s bad address (%p)\n", __FUNCTION__, addr);
-		return ret_size;
+		return;
 	}
 	down(&imlist_sem);
 	for (p = &imlist ; (tmp = *p) ; p = &tmp->next) {
 		if (tmp->addr == addr) {
-			ret_size = tmp->size;
 			*p = tmp->next;
+
+			/* XXX: do we need the lock? */
+			spin_lock(&init_mm.page_table_lock);
+			unmap_vm_area(tmp);
+			spin_unlock(&init_mm.page_table_lock);
+
 			kfree(tmp);
 			up(&imlist_sem);
-			return ret_size;
+			return;
 		}
 	}
 	up(&imlist_sem);
 	printk(KERN_ERR "Trying to %s nonexistent area (%p)\n", __FUNCTION__,
 			addr);
-	return ret_size;
 }
diff --git a/arch/ppc64/mm/init.c b/arch/ppc64/mm/init.c
index 4b42aff74d7..6fa1e6490b5 100644
--- a/arch/ppc64/mm/init.c
+++ b/arch/ppc64/mm/init.c
@@ -73,9 +73,6 @@ static unsigned long phbs_io_bot = PHBS_IO_BASE;
 extern pgd_t swapper_pg_dir[];
 extern struct task_struct *current_set[NR_CPUS];
 
-extern pgd_t ioremap_dir[];
-pgd_t * ioremap_pgd = (pgd_t *)&ioremap_dir;
-
 unsigned long klimit = (unsigned long)_end;
 
 unsigned long _SDR1=0;
@@ -137,69 +134,6 @@ void iounmap(volatile void __iomem *addr)
 
 #else
 
-static void unmap_im_area_pte(pmd_t *pmd, unsigned long addr,
-			      unsigned long end)
-{
-	pte_t *pte;
-
-	pte = pte_offset_kernel(pmd, addr);
-	do {
-		pte_t ptent = ptep_get_and_clear(&ioremap_mm, addr, pte);
-		WARN_ON(!pte_none(ptent) && !pte_present(ptent));
-	} while (pte++, addr += PAGE_SIZE, addr != end);
-}
-
-static inline void unmap_im_area_pmd(pud_t *pud, unsigned long addr,
-				     unsigned long end)
-{
-	pmd_t *pmd;
-	unsigned long next;
-
-	pmd = pmd_offset(pud, addr);
-	do {
-		next = pmd_addr_end(addr, end);
-		if (pmd_none_or_clear_bad(pmd))
-			continue;
-		unmap_im_area_pte(pmd, addr, next);
-	} while (pmd++, addr = next, addr != end);
-}
-
-static inline void unmap_im_area_pud(pgd_t *pgd, unsigned long addr,
-				     unsigned long end)
-{
-	pud_t *pud;
-	unsigned long next;
-
-	pud = pud_offset(pgd, addr);
-	do {
-		next = pud_addr_end(addr, end);
-		if (pud_none_or_clear_bad(pud))
-			continue;
-		unmap_im_area_pmd(pud, addr, next);
-	} while (pud++, addr = next, addr != end);
-}
-
-static void unmap_im_area(unsigned long addr, unsigned long end)
-{
-	struct mm_struct *mm = &ioremap_mm;
-	unsigned long next;
-	pgd_t *pgd;
-
-	spin_lock(&mm->page_table_lock);
-
-	pgd = pgd_offset_i(addr);
-	flush_cache_vunmap(addr, end);
-	do {
-		next = pgd_addr_end(addr, end);
-		if (pgd_none_or_clear_bad(pgd))
-			continue;
-		unmap_im_area_pud(pgd, addr, next);
-	} while (pgd++, addr = next, addr != end);
-	flush_tlb_kernel_range(start, end);
-
-	spin_unlock(&mm->page_table_lock);
-}
-
 /*
  * map_io_page currently only called by __ioremap
  * map_io_page adds an entry to the ioremap page table
@@ -214,21 +148,21 @@ static int map_io_page(unsigned long ea, unsigned long pa, int flags)
 	unsigned long vsid;
 
 	if (mem_init_done) {
-		spin_lock(&ioremap_mm.page_table_lock);
-		pgdp = pgd_offset_i(ea);
-		pudp = pud_alloc(&ioremap_mm, pgdp, ea);
+		spin_lock(&init_mm.page_table_lock);
+		pgdp = pgd_offset_k(ea);
+		pudp = pud_alloc(&init_mm, pgdp, ea);
 		if (!pudp)
 			return -ENOMEM;
-		pmdp = pmd_alloc(&ioremap_mm, pudp, ea);
+		pmdp = pmd_alloc(&init_mm, pudp, ea);
 		if (!pmdp)
 			return -ENOMEM;
-		ptep = pte_alloc_kernel(&ioremap_mm, pmdp, ea);
+		ptep = pte_alloc_kernel(&init_mm, pmdp, ea);
 		if (!ptep)
 			return -ENOMEM;
 
 		pa = abs_to_phys(pa);
-		set_pte_at(&ioremap_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT,
+		set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT,
 							  __pgprot(flags)));
-		spin_unlock(&ioremap_mm.page_table_lock);
+		spin_unlock(&init_mm.page_table_lock);
 	} else {
 		unsigned long va, vpn, hash, hpteg;
 
@@ -267,13 +201,9 @@ static void __iomem * __ioremap_com(unsigned long addr, unsigned long pa,
 
 	for (i = 0; i < size; i += PAGE_SIZE)
 		if (map_io_page(ea+i, pa+i, flags))
-			goto failure;
+			return NULL;
 
 	return (void __iomem *) (ea + (addr & ~PAGE_MASK));
- failure:
-	if (mem_init_done)
-		unmap_im_area(ea, ea + size);
-	return NULL;
 }
 
 
@@ -381,19 +311,14 @@ int __ioremap_explicit(unsigned long pa, unsigned long ea,
  */
 void iounmap(volatile void __iomem *token)
 {
-	unsigned long address, size;
 	void *addr;
 
 	if (!mem_init_done)
 		return;
 
 	addr = (void *) ((unsigned long __force) token & PAGE_MASK);
-
-	if ((size = im_free(addr)) == 0)
-		return;
 
-	address = (unsigned long)addr;
-	unmap_im_area(address, address + size);
+	im_free(addr);
 }
 
 static int iounmap_subset_regions(unsigned long addr, unsigned long size)
diff --git a/include/asm-ppc64/imalloc.h b/include/asm-ppc64/imalloc.h
index 3a45e918bf1..e46ff68a6e4 100644
--- a/include/asm-ppc64/imalloc.h
+++ b/include/asm-ppc64/imalloc.h
@@ -4,9 +4,9 @@
 /*
  * Define the address range of the imalloc VM area.
  */
-#define PHBS_IO_BASE	IOREGIONBASE
-#define IMALLOC_BASE	(IOREGIONBASE + 0x80000000ul)	/* Reserve 2 gigs for PHBs */
-#define IMALLOC_END	(IOREGIONBASE + EADDR_MASK)
+#define PHBS_IO_BASE	VMALLOC_END
+#define IMALLOC_BASE	(PHBS_IO_BASE + 0x80000000ul)	/* Reserve 2 gigs for PHBs */
+#define IMALLOC_END	(VMALLOC_START + EADDR_MASK)
 
 
 /* imalloc region types */
@@ -18,7 +18,9 @@ extern struct vm_struct * im_get_free_area(unsigned long size);
 extern struct vm_struct * im_get_area(unsigned long v_addr, unsigned long size,
-		int region_type);
-unsigned long im_free(void *addr);
+			int region_type);
+extern void im_free(void *addr);
+
+extern unsigned long ioremap_bot;
 
 #endif /* _PPC64_IMALLOC_H */
diff --git a/include/asm-ppc64/page.h b/include/asm-ppc64/page.h
index bcd21789d3b..257d87eb7c3 100644
--- a/include/asm-ppc64/page.h
+++ b/include/asm-ppc64/page.h
@@ -202,9 +202,7 @@ extern u64 ppc64_pft_size;	/* Log 2 of page table size */
 #define PAGE_OFFSET     ASM_CONST(0xC000000000000000)
 #define KERNELBASE      PAGE_OFFSET
 #define VMALLOCBASE     ASM_CONST(0xD000000000000000)
-#define IOREGIONBASE    ASM_CONST(0xE000000000000000)
 
-#define IO_REGION_ID       (IOREGIONBASE >> REGION_SHIFT)
 #define VMALLOC_REGION_ID  (VMALLOCBASE >> REGION_SHIFT)
 #define KERNEL_REGION_ID   (KERNELBASE >> REGION_SHIFT)
 #define USER_REGION_ID     (0UL)
diff --git a/include/asm-ppc64/pgtable.h b/include/asm-ppc64/pgtable.h
index 264c4f7993b..46cf61c2ff6 100644
--- a/include/asm-ppc64/pgtable.h
+++ b/include/asm-ppc64/pgtable.h
@@ -53,7 +53,8 @@
  * Define the address range of the vmalloc VM area.
  */
 #define VMALLOC_START (0xD000000000000000ul)
-#define VMALLOC_END   (VMALLOC_START + EADDR_MASK)
+#define VMALLOC_SIZE  (0x10000000000UL)
+#define VMALLOC_END   (VMALLOC_START + VMALLOC_SIZE)
 
 /*
  * Bits in a linux-style PTE.  These match the bits in the
@@ -239,9 +240,6 @@ static inline pte_t pfn_pte(unsigned long pfn, pgprot_t pgprot)
 /* This now only contains the vmalloc pages */
 #define pgd_offset_k(address) pgd_offset(&init_mm, address)
 
-/* to find an entry in the ioremap page-table-directory */
-#define pgd_offset_i(address) (ioremap_pgd + pgd_index(address))
-
 /*
  * The following only work if pte_present() is true.
  * Undefined behaviour if not..
@@ -459,15 +457,12 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long addr,
 #define __HAVE_ARCH_PTE_SAME
 #define pte_same(A,B)	(((pte_val(A) ^ pte_val(B)) & ~_PAGE_HPTEFLAGS) == 0)
 
-extern unsigned long ioremap_bot, ioremap_base;
-
 #define pmd_ERROR(e) \
	printk("%s:%d: bad pmd %08x.\n", __FILE__, __LINE__, pmd_val(e))
 #define pgd_ERROR(e) \
	printk("%s:%d: bad pgd %08x.\n", __FILE__, __LINE__, pgd_val(e))
 
 extern pgd_t swapper_pg_dir[];
-extern pgd_t ioremap_dir[];
 
 extern void paging_init(void);
diff --git a/include/asm-ppc64/processor.h b/include/asm-ppc64/processor.h
index 809c634ba1d..3084099086a 100644
--- a/include/asm-ppc64/processor.h
+++ b/include/asm-ppc64/processor.h
@@ -429,16 +429,6 @@ struct thread_struct {
 	.fpexc_mode	= MSR_FE0|MSR_FE1, \
 }
 
-/*
- * Note: the vm_start and vm_end fields here should *not*
- * be in kernel space.  (Could vm_end == vm_start perhaps?)
- */
-#define IOREMAP_MMAP { &ioremap_mm, 0, 0x1000, NULL, \
-		       PAGE_SHARED, VM_READ | VM_WRITE | VM_EXEC, \
-		       1, NULL, NULL }
-
-extern struct mm_struct ioremap_mm;
-
 /*
  * Return saved PC of a blocked thread. For now, this is the "user" PC
 */
--
cgit v1.2.3-70-g09d2

From 515bae9cdc6a78eda0879e1f158056d73ec808b7 Mon Sep 17 00:00:00 2001
From: Anton Blanchard
Date: Tue, 21 Jun 2005 17:15:55 -0700
Subject: [PATCH] ppc64: Mark kernel hptes dirty

We don't use the hardware referenced and changed bits, and setting them
early avoids a store to memory.  We already do this for userspace hptes
but not kernel ones.  Do it.

Signed-off-by: Anton Blanchard
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 arch/ppc64/mm/hash_utils.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/ppc64/mm')

diff --git a/arch/ppc64/mm/hash_utils.c b/arch/ppc64/mm/hash_utils.c
index 87d0525f56f..1647b1c6f28 100644
--- a/arch/ppc64/mm/hash_utils.c
+++ b/arch/ppc64/mm/hash_utils.c
@@ -195,7 +195,7 @@ void __init htab_initialize(void)
 		memset((void *)table, 0, htab_size_bytes);
 	}
 
-	mode_rw = _PAGE_ACCESSED | _PAGE_COHERENT | PP_RWXX;
+	mode_rw = _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_COHERENT | PP_RWXX;
 
 	/* On U3 based machines, we need to reserve the DART area and
 	 * _NOT_ map it to avoid cache paradoxes as it's remapped non
--
cgit v1.2.3-70-g09d2
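
A note on why the one-liner above helps: on the ppc64 hash MMU, a hash PTE
created without the referenced (R) and changed (C) bits forces the hardware
to update the HPTE in memory the first time the mapping is read or written.
The kernel never inspects those bits for its own mappings, so presetting
them is free.  Restated from the hunk above:

	/* before: hardware must set R on first access, and C on first store */
	mode_rw = _PAGE_ACCESSED | _PAGE_COHERENT | PP_RWXX;

	/* after: R and C preset, so no hardware HPTE update is needed */
	mode_rw = _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_COHERENT | PP_RWXX;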