From 6daa0e28627abf362138244a620a821a9027d816 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 21 Oct 2005 03:18:50 -0400 Subject: [PATCH] gfp_t: mm/* (easy parts) Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- mm/filemap.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index b5346576e58..1c31b2fd2ca 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -377,7 +377,7 @@ int filemap_write_and_wait_range(struct address_space *mapping, * This function does not add the page to the LRU. The caller must do that. */ int add_to_page_cache(struct page *page, struct address_space *mapping, - pgoff_t offset, int gfp_mask) + pgoff_t offset, gfp_t gfp_mask) { int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); @@ -401,7 +401,7 @@ int add_to_page_cache(struct page *page, struct address_space *mapping, EXPORT_SYMBOL(add_to_page_cache); int add_to_page_cache_lru(struct page *page, struct address_space *mapping, - pgoff_t offset, int gfp_mask) + pgoff_t offset, gfp_t gfp_mask) { int ret = add_to_page_cache(page, mapping, offset, gfp_mask); if (ret == 0) @@ -591,7 +591,7 @@ EXPORT_SYMBOL(find_lock_page); * memory exhaustion. */ struct page *find_or_create_page(struct address_space *mapping, - unsigned long index, unsigned int gfp_mask) + unsigned long index, gfp_t gfp_mask) { struct page *page, *cached_page = NULL; int err; @@ -683,7 +683,7 @@ struct page * grab_cache_page_nowait(struct address_space *mapping, unsigned long index) { struct page *page = find_get_page(mapping, index); - unsigned int gfp_mask; + gfp_t gfp_mask; if (page) { if (!TestSetPageLocked(page)) -- cgit v1.2.3-70-g09d2 From 65500d234e74fc4e8f18e1a429bc24e51e75de4a Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:15:59 -0700 Subject: [PATCH] mm: page fault handlers tidyup Impose a little more consistency on the page fault handlers do_wp_page, do_swap_page, do_anonymous_page, do_no_page, do_file_page: why not pass their arguments in the same order, called the same names? break_cow is all very well, but what it did was inlined elsewhere: easier to compare if it's brought back into do_wp_page. do_file_page's fallback to do_no_page dates from a time when we were testing pte_file by using it wherever possible: currently it's peculiar to nonlinear vmas, so just check that. BUG_ON if not? Better not, it's probably page table corruption, so just show the pte: hmm, there's a pte_ERROR macro, let's use that for do_wp_page's invalid pfn too. Hah! Someone in the ppc64 world noticed pte_ERROR was unused so removed it: restored (and say "pud" not "pmd" in its pud_ERROR). 
Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-ppc64/pgtable.h | 4 +- mm/filemap.c | 2 +- mm/memory.c | 220 +++++++++++++++++++------------------------- mm/shmem.c | 2 +- 4 files changed, 102 insertions(+), 126 deletions(-) (limited to 'mm/filemap.c') diff --git a/include/asm-ppc64/pgtable.h b/include/asm-ppc64/pgtable.h index c83679c9d2b..2eb1778a3a1 100644 --- a/include/asm-ppc64/pgtable.h +++ b/include/asm-ppc64/pgtable.h @@ -478,10 +478,12 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long addr, #define __HAVE_ARCH_PTE_SAME #define pte_same(A,B) (((pte_val(A) ^ pte_val(B)) & ~_PAGE_HPTEFLAGS) == 0) +#define pte_ERROR(e) \ + printk("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__, pte_val(e)) #define pmd_ERROR(e) \ printk("%s:%d: bad pmd %08lx.\n", __FILE__, __LINE__, pmd_val(e)) #define pud_ERROR(e) \ - printk("%s:%d: bad pmd %08lx.\n", __FILE__, __LINE__, pud_val(e)) + printk("%s:%d: bad pud %08lx.\n", __FILE__, __LINE__, pud_val(e)) #define pgd_ERROR(e) \ printk("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e)) diff --git a/mm/filemap.c b/mm/filemap.c index 1c31b2fd2ca..8aa344e8848 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1520,7 +1520,7 @@ repeat: page_cache_release(page); return err; } - } else { + } else if (vma->vm_flags & VM_NONLINEAR) { /* No page was found just because we can't read it in now (being * here implies nonblock != 0), but the page may exist, so set * the PTE to fault it in later. */ diff --git a/mm/memory.c b/mm/memory.c index 13667681cd1..eaf79031f57 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1212,29 +1212,11 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) return pte; } -/* - * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock - */ -static inline void break_cow(struct vm_area_struct * vma, struct page * new_page, unsigned long address, - pte_t *page_table) -{ - pte_t entry; - - entry = maybe_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot)), - vma); - ptep_establish(vma, address, page_table, entry); - update_mmu_cache(vma, address, entry); - lazy_mmu_prot_update(entry); -} - /* * This routine handles present pages, when users try to write * to a shared page. It is done by copying the page to a new address * and decrementing the shared-page counter for the old page. * - * Goto-purists beware: the only reason for goto's here is that it results - * in better assembly code.. The "default" path will see no jumps at all. - * * Note that this routine assumes that the protection checks have been * done by the caller (the low-level page fault routine in most cases). * Thus we can safely just mark it writable once we've done any necessary @@ -1247,25 +1229,22 @@ static inline void break_cow(struct vm_area_struct * vma, struct page * new_page * We hold the mm semaphore and the page_table_lock on entry and exit * with the page_table_lock released. 
*/ -static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma, - unsigned long address, pte_t *page_table, pmd_t *pmd, pte_t pte) +static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pte_t *page_table, pmd_t *pmd, + pte_t orig_pte) { struct page *old_page, *new_page; - unsigned long pfn = pte_pfn(pte); + unsigned long pfn = pte_pfn(orig_pte); pte_t entry; - int ret; + int ret = VM_FAULT_MINOR; if (unlikely(!pfn_valid(pfn))) { /* - * This should really halt the system so it can be debugged or - * at least the kernel stops what it's doing before it corrupts - * data, but for the moment just pretend this is OOM. + * Page table corrupted: show pte and kill process. */ - pte_unmap(page_table); - printk(KERN_ERR "do_wp_page: bogus page at address %08lx\n", - address); - spin_unlock(&mm->page_table_lock); - return VM_FAULT_OOM; + pte_ERROR(orig_pte); + ret = VM_FAULT_OOM; + goto unlock; } old_page = pfn_to_page(pfn); @@ -1274,52 +1253,57 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma, unlock_page(old_page); if (reuse) { flush_cache_page(vma, address, pfn); - entry = maybe_mkwrite(pte_mkyoung(pte_mkdirty(pte)), - vma); + entry = pte_mkyoung(orig_pte); + entry = maybe_mkwrite(pte_mkdirty(entry), vma); ptep_set_access_flags(vma, address, page_table, entry, 1); update_mmu_cache(vma, address, entry); lazy_mmu_prot_update(entry); - pte_unmap(page_table); - spin_unlock(&mm->page_table_lock); - return VM_FAULT_MINOR|VM_FAULT_WRITE; + ret |= VM_FAULT_WRITE; + goto unlock; } } - pte_unmap(page_table); /* * Ok, we need to copy. Oh, well.. */ if (!PageReserved(old_page)) page_cache_get(old_page); + pte_unmap(page_table); spin_unlock(&mm->page_table_lock); if (unlikely(anon_vma_prepare(vma))) - goto no_new_page; + goto oom; if (old_page == ZERO_PAGE(address)) { new_page = alloc_zeroed_user_highpage(vma, address); if (!new_page) - goto no_new_page; + goto oom; } else { new_page = alloc_page_vma(GFP_HIGHUSER, vma, address); if (!new_page) - goto no_new_page; + goto oom; copy_user_highpage(new_page, old_page, address); } + /* * Re-check the pte - we dropped the lock */ - ret = VM_FAULT_MINOR; spin_lock(&mm->page_table_lock); page_table = pte_offset_map(pmd, address); - if (likely(pte_same(*page_table, pte))) { + if (likely(pte_same(*page_table, orig_pte))) { if (PageAnon(old_page)) dec_mm_counter(mm, anon_rss); if (PageReserved(old_page)) inc_mm_counter(mm, rss); else page_remove_rmap(old_page); + flush_cache_page(vma, address, pfn); - break_cow(vma, new_page, address, page_table); + entry = mk_pte(new_page, vma->vm_page_prot); + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + ptep_establish(vma, address, page_table, entry); + update_mmu_cache(vma, address, entry); + lazy_mmu_prot_update(entry); + lru_cache_add_active(new_page); page_add_anon_rmap(new_page, vma, address); @@ -1327,13 +1311,13 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma, new_page = old_page; ret |= VM_FAULT_WRITE; } - pte_unmap(page_table); page_cache_release(new_page); page_cache_release(old_page); +unlock: + pte_unmap(page_table); spin_unlock(&mm->page_table_lock); return ret; - -no_new_page: +oom: page_cache_release(old_page); return VM_FAULT_OOM; } @@ -1661,17 +1645,19 @@ void swapin_readahead(swp_entry_t entry, unsigned long addr,struct vm_area_struc * We hold the mm semaphore and the page_table_lock on entry and * should release the pagetable lock on exit.. 
*/ -static int do_swap_page(struct mm_struct * mm, - struct vm_area_struct * vma, unsigned long address, - pte_t *page_table, pmd_t *pmd, pte_t orig_pte, int write_access) +static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pte_t *page_table, pmd_t *pmd, + int write_access, pte_t orig_pte) { struct page *page; - swp_entry_t entry = pte_to_swp_entry(orig_pte); + swp_entry_t entry; pte_t pte; int ret = VM_FAULT_MINOR; pte_unmap(page_table); spin_unlock(&mm->page_table_lock); + + entry = pte_to_swp_entry(orig_pte); page = lookup_swap_cache(entry); if (!page) { swapin_readahead(entry, address, vma); @@ -1685,11 +1671,7 @@ static int do_swap_page(struct mm_struct * mm, page_table = pte_offset_map(pmd, address); if (likely(pte_same(*page_table, orig_pte))) ret = VM_FAULT_OOM; - else - ret = VM_FAULT_MINOR; - pte_unmap(page_table); - spin_unlock(&mm->page_table_lock); - goto out; + goto unlock; } /* Had to read the page from swap area: Major fault */ @@ -1745,6 +1727,7 @@ static int do_swap_page(struct mm_struct * mm, /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, address, pte); lazy_mmu_prot_update(pte); +unlock: pte_unmap(page_table); spin_unlock(&mm->page_table_lock); out: @@ -1754,7 +1737,7 @@ out_nomap: spin_unlock(&mm->page_table_lock); unlock_page(page); page_cache_release(page); - goto out; + return ret; } /* @@ -1762,17 +1745,15 @@ out_nomap: * spinlock held to protect against concurrent faults in * multithreaded programs. */ -static int -do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, - pte_t *page_table, pmd_t *pmd, int write_access, - unsigned long addr) +static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pte_t *page_table, pmd_t *pmd, + int write_access) { pte_t entry; /* Mapping of ZERO_PAGE - vm_page_prot is readonly */ entry = mk_pte(ZERO_PAGE(addr), vma->vm_page_prot); - /* ..except if it's a write access */ if (write_access) { struct page *page; @@ -1781,39 +1762,36 @@ do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, spin_unlock(&mm->page_table_lock); if (unlikely(anon_vma_prepare(vma))) - goto no_mem; - page = alloc_zeroed_user_highpage(vma, addr); + goto oom; + page = alloc_zeroed_user_highpage(vma, address); if (!page) - goto no_mem; + goto oom; spin_lock(&mm->page_table_lock); - page_table = pte_offset_map(pmd, addr); + page_table = pte_offset_map(pmd, address); if (!pte_none(*page_table)) { - pte_unmap(page_table); page_cache_release(page); - spin_unlock(&mm->page_table_lock); - goto out; + goto unlock; } inc_mm_counter(mm, rss); - entry = maybe_mkwrite(pte_mkdirty(mk_pte(page, - vma->vm_page_prot)), - vma); + entry = mk_pte(page, vma->vm_page_prot); + entry = maybe_mkwrite(pte_mkdirty(entry), vma); lru_cache_add_active(page); SetPageReferenced(page); - page_add_anon_rmap(page, vma, addr); + page_add_anon_rmap(page, vma, address); } - set_pte_at(mm, addr, page_table, entry); - pte_unmap(page_table); + set_pte_at(mm, address, page_table, entry); /* No need to invalidate - it was non-present before */ - update_mmu_cache(vma, addr, entry); + update_mmu_cache(vma, address, entry); lazy_mmu_prot_update(entry); +unlock: + pte_unmap(page_table); spin_unlock(&mm->page_table_lock); -out: return VM_FAULT_MINOR; -no_mem: +oom: return VM_FAULT_OOM; } @@ -1829,20 +1807,17 @@ no_mem: * This is called with the MM semaphore held and the page table * spinlock held. Exit with the spinlock released. 
*/ -static int -do_no_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, int write_access, pte_t *page_table, pmd_t *pmd) +static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pte_t *page_table, pmd_t *pmd, + int write_access) { - struct page * new_page; + struct page *new_page; struct address_space *mapping = NULL; pte_t entry; unsigned int sequence = 0; int ret = VM_FAULT_MINOR; int anon = 0; - if (!vma->vm_ops || !vma->vm_ops->nopage) - return do_anonymous_page(mm, vma, page_table, - pmd, write_access, address); pte_unmap(page_table); spin_unlock(&mm->page_table_lock); @@ -1852,7 +1827,6 @@ do_no_page(struct mm_struct *mm, struct vm_area_struct *vma, smp_rmb(); /* serializes i_size against truncate_count */ } retry: - cond_resched(); new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret); /* * No smp_rmb is needed here as long as there's a full @@ -1892,9 +1866,11 @@ retry: * retry getting the page. */ if (mapping && unlikely(sequence != mapping->truncate_count)) { - sequence = mapping->truncate_count; spin_unlock(&mm->page_table_lock); page_cache_release(new_page); + cond_resched(); + sequence = mapping->truncate_count; + smp_rmb(); goto retry; } page_table = pte_offset_map(pmd, address); @@ -1924,25 +1900,22 @@ retry: page_add_anon_rmap(new_page, vma, address); } else page_add_file_rmap(new_page); - pte_unmap(page_table); } else { /* One of our sibling threads was faster, back out. */ - pte_unmap(page_table); page_cache_release(new_page); - spin_unlock(&mm->page_table_lock); - goto out; + goto unlock; } /* no need to invalidate: a not-present page shouldn't be cached */ update_mmu_cache(vma, address, entry); lazy_mmu_prot_update(entry); +unlock: + pte_unmap(page_table); spin_unlock(&mm->page_table_lock); -out: return ret; oom: page_cache_release(new_page); - ret = VM_FAULT_OOM; - goto out; + return VM_FAULT_OOM; } /* @@ -1950,29 +1923,28 @@ oom: * from the encoded file_pte if possible. This enables swappable * nonlinear vmas. */ -static int do_file_page(struct mm_struct * mm, struct vm_area_struct * vma, - unsigned long address, int write_access, pte_t *pte, pmd_t *pmd) +static int do_file_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pte_t *page_table, pmd_t *pmd, + int write_access, pte_t orig_pte) { - unsigned long pgoff; + pgoff_t pgoff; int err; - BUG_ON(!vma->vm_ops || !vma->vm_ops->nopage); - /* - * Fall back to the linear mapping if the fs does not support - * ->populate: - */ - if (!vma->vm_ops->populate || - (write_access && !(vma->vm_flags & VM_SHARED))) { - pte_clear(mm, address, pte); - return do_no_page(mm, vma, address, write_access, pte, pmd); - } - - pgoff = pte_to_pgoff(*pte); - - pte_unmap(pte); + pte_unmap(page_table); spin_unlock(&mm->page_table_lock); - err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE, vma->vm_page_prot, pgoff, 0); + if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) { + /* + * Page table corrupted: show pte and kill process. + */ + pte_ERROR(orig_pte); + return VM_FAULT_OOM; + } + /* We can then assume vm->vm_ops && vma->vm_ops->populate */ + + pgoff = pte_to_pgoff(orig_pte); + err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE, + vma->vm_page_prot, pgoff, 0); if (err == -ENOMEM) return VM_FAULT_OOM; if (err) @@ -2002,23 +1974,25 @@ static int do_file_page(struct mm_struct * mm, struct vm_area_struct * vma, * release it when done. 
*/ static inline int handle_pte_fault(struct mm_struct *mm, - struct vm_area_struct * vma, unsigned long address, - int write_access, pte_t *pte, pmd_t *pmd) + struct vm_area_struct *vma, unsigned long address, + pte_t *pte, pmd_t *pmd, int write_access) { pte_t entry; entry = *pte; if (!pte_present(entry)) { - /* - * If it truly wasn't present, we know that kswapd - * and the PTE updates will not touch it later. So - * drop the lock. - */ - if (pte_none(entry)) - return do_no_page(mm, vma, address, write_access, pte, pmd); + if (pte_none(entry)) { + if (!vma->vm_ops || !vma->vm_ops->nopage) + return do_anonymous_page(mm, vma, address, + pte, pmd, write_access); + return do_no_page(mm, vma, address, + pte, pmd, write_access); + } if (pte_file(entry)) - return do_file_page(mm, vma, address, write_access, pte, pmd); - return do_swap_page(mm, vma, address, pte, pmd, entry, write_access); + return do_file_page(mm, vma, address, + pte, pmd, write_access, entry); + return do_swap_page(mm, vma, address, + pte, pmd, write_access, entry); } if (write_access) { @@ -2038,7 +2012,7 @@ static inline int handle_pte_fault(struct mm_struct *mm, /* * By the time we get here, we already hold the mm semaphore */ -int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma, +int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, int write_access) { pgd_t *pgd; @@ -2072,7 +2046,7 @@ int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma, if (!pte) goto oom; - return handle_pte_fault(mm, vma, address, write_access, pte, pmd); + return handle_pte_fault(mm, vma, address, pte, pmd, write_access); oom: spin_unlock(&mm->page_table_lock); diff --git a/mm/shmem.c b/mm/shmem.c index 55e04a0734c..6796311a23e 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1201,7 +1201,7 @@ static int shmem_populate(struct vm_area_struct *vma, page_cache_release(page); return err; } - } else { + } else if (vma->vm_flags & VM_NONLINEAR) { /* No page was found just because we can't read it in * now (being here implies nonblock != 0), but the page * may exist, so set the PTE to fault it in later. */ -- cgit v1.2.3-70-g09d2 From 4c21e2f2441dc5fbb957b030333f5a3f2d02dea7 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:40 -0700 Subject: [PATCH] mm: split page table lock Christoph Lameter demonstrated very poor scalability on the SGI 512-way, with a many-threaded application which concurrently initializes different parts of a large anonymous area. This patch corrects that, by using a separate spinlock per page table page, to guard the page table entries in that page, instead of using the mm's single page_table_lock. (But even then, page_table_lock is still used to guard page table allocation, and anon_vma allocation.) In this implementation, the spinlock is tucked inside the struct page of the page table page: with a BUILD_BUG_ON in case it overflows - which it would in the case of 32-bit PA-RISC with spinlock debugging enabled. Splitting the lock is not quite for free: another cacheline access. Ideally, I suppose we would use split ptlock only for multi-threaded processes on multi-cpu machines; but deciding that dynamically would have its own costs. So for now enable it by config, at some number of cpus - since the Kconfig language doesn't support inequalities, let preprocessor compare that with NR_CPUS. 
But I don't think it's worth being user-configurable: for good testing of both split and unsplit configs, split now at 4 cpus, and perhaps change that to 8 later. There is a benefit even for singly threaded processes: kswapd can be attacking one part of the mm while another part is busy faulting. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mm/mm-armv.c | 1 + arch/frv/mm/pgalloc.c | 4 ++-- arch/i386/mm/pgtable.c | 8 ++++---- arch/um/kernel/skas/mmu.c | 1 + fs/afs/file.c | 4 ++-- fs/buffer.c | 2 +- fs/jfs/jfs_metapage.c | 12 ++++++------ fs/xfs/linux-2.6/xfs_buf.c | 7 ++++--- include/linux/buffer_head.h | 6 +++--- include/linux/mm.h | 46 +++++++++++++++++++++++++++++++++++++-------- kernel/kexec.c | 4 ++-- mm/Kconfig | 13 +++++++++++++ mm/filemap.c | 2 +- mm/memory.c | 24 +++++++++++++---------- mm/mremap.c | 11 ++++++++++- mm/page_alloc.c | 16 ++++++++-------- mm/page_io.c | 6 ++++-- mm/rmap.c | 4 ++-- mm/shmem.c | 22 ++++++++++------------ mm/swap.c | 2 +- mm/swap_state.c | 8 ++++---- mm/swapfile.c | 12 ++++++------ mm/vmscan.c | 2 +- 23 files changed, 138 insertions(+), 79 deletions(-) (limited to 'mm/filemap.c') diff --git a/arch/arm/mm/mm-armv.c b/arch/arm/mm/mm-armv.c index 60f3e039bac..1221fdde176 100644 --- a/arch/arm/mm/mm-armv.c +++ b/arch/arm/mm/mm-armv.c @@ -229,6 +229,7 @@ void free_pgd_slow(pgd_t *pgd) pte = pmd_page(*pmd); pmd_clear(pmd); dec_page_state(nr_page_table_pages); + pte_lock_deinit(pte); pte_free(pte); pmd_free(pmd); free: diff --git a/arch/frv/mm/pgalloc.c b/arch/frv/mm/pgalloc.c index 4eaec0f3525..2c67dfe5a6b 100644 --- a/arch/frv/mm/pgalloc.c +++ b/arch/frv/mm/pgalloc.c @@ -87,14 +87,14 @@ static inline void pgd_list_add(pgd_t *pgd) if (pgd_list) pgd_list->private = (unsigned long) &page->index; pgd_list = page; - page->private = (unsigned long) &pgd_list; + set_page_private(page, (unsigned long)&pgd_list); } static inline void pgd_list_del(pgd_t *pgd) { struct page *next, **pprev, *page = virt_to_page(pgd); next = (struct page *) page->index; - pprev = (struct page **) page->private; + pprev = (struct page **)page_private(page); *pprev = next; if (next) next->private = (unsigned long) pprev; diff --git a/arch/i386/mm/pgtable.c b/arch/i386/mm/pgtable.c index dcdce2c6c53..39c099f15b5 100644 --- a/arch/i386/mm/pgtable.c +++ b/arch/i386/mm/pgtable.c @@ -188,19 +188,19 @@ static inline void pgd_list_add(pgd_t *pgd) struct page *page = virt_to_page(pgd); page->index = (unsigned long)pgd_list; if (pgd_list) - pgd_list->private = (unsigned long)&page->index; + set_page_private(pgd_list, (unsigned long)&page->index); pgd_list = page; - page->private = (unsigned long)&pgd_list; + set_page_private(page, (unsigned long)&pgd_list); } static inline void pgd_list_del(pgd_t *pgd) { struct page *next, **pprev, *page = virt_to_page(pgd); next = (struct page *)page->index; - pprev = (struct page **)page->private; + pprev = (struct page **)page_private(page); *pprev = next; if (next) - next->private = (unsigned long)pprev; + set_page_private(next, (unsigned long)pprev); } void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused) diff --git a/arch/um/kernel/skas/mmu.c b/arch/um/kernel/skas/mmu.c index 02cf36e0331..9e5e39cea82 100644 --- a/arch/um/kernel/skas/mmu.c +++ b/arch/um/kernel/skas/mmu.c @@ -144,6 +144,7 @@ void destroy_context_skas(struct mm_struct *mm) if(!proc_mm || !ptrace_faultinfo){ free_page(mmu->id.stack); + pte_lock_deinit(virt_to_page(mmu->last_page_table)); pte_free_kernel((pte_t *) 
mmu->last_page_table); dec_page_state(nr_page_table_pages); #ifdef CONFIG_3_LEVEL_PGTABLES diff --git a/fs/afs/file.c b/fs/afs/file.c index 0d576987ec6..4975c9c193d 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -291,8 +291,8 @@ static int afs_file_releasepage(struct page *page, gfp_t gfp_flags) cachefs_uncache_page(vnode->cache, page); #endif - pageio = (struct cachefs_page *) page->private; - page->private = 0; + pageio = (struct cachefs_page *) page_private(page); + set_page_private(page, 0); ClearPagePrivate(page); if (pageio) diff --git a/fs/buffer.c b/fs/buffer.c index b1667986442..2066e4cb700 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -96,7 +96,7 @@ static void __clear_page_buffers(struct page *page) { ClearPagePrivate(page); - page->private = 0; + set_page_private(page, 0); page_cache_release(page); } diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index 26091a5f88d..8a53981f9f2 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -86,7 +86,7 @@ struct meta_anchor { atomic_t io_count; struct metapage *mp[MPS_PER_PAGE]; }; -#define mp_anchor(page) ((struct meta_anchor *)page->private) +#define mp_anchor(page) ((struct meta_anchor *)page_private(page)) static inline struct metapage *page_to_mp(struct page *page, uint offset) { @@ -108,7 +108,7 @@ static inline int insert_metapage(struct page *page, struct metapage *mp) if (!a) return -ENOMEM; memset(a, 0, sizeof(struct meta_anchor)); - page->private = (unsigned long)a; + set_page_private(page, (unsigned long)a); SetPagePrivate(page); kmap(page); } @@ -136,7 +136,7 @@ static inline void remove_metapage(struct page *page, struct metapage *mp) a->mp[index] = NULL; if (--a->mp_count == 0) { kfree(a); - page->private = 0; + set_page_private(page, 0); ClearPagePrivate(page); kunmap(page); } @@ -156,13 +156,13 @@ static inline void dec_io(struct page *page, void (*handler) (struct page *)) #else static inline struct metapage *page_to_mp(struct page *page, uint offset) { - return PagePrivate(page) ? (struct metapage *)page->private : NULL; + return PagePrivate(page) ? 
(struct metapage *)page_private(page) : NULL; } static inline int insert_metapage(struct page *page, struct metapage *mp) { if (mp) { - page->private = (unsigned long)mp; + set_page_private(page, (unsigned long)mp); SetPagePrivate(page); kmap(page); } @@ -171,7 +171,7 @@ static inline int insert_metapage(struct page *page, struct metapage *mp) static inline void remove_metapage(struct page *page, struct metapage *mp) { - page->private = 0; + set_page_private(page, 0); ClearPagePrivate(page); kunmap(page); } diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index ba4767c04ad..4cd46abe843 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -181,8 +181,9 @@ set_page_region( size_t offset, size_t length) { - page->private |= page_region_mask(offset, length); - if (page->private == ~0UL) + set_page_private(page, + page_private(page) | page_region_mask(offset, length)); + if (page_private(page) == ~0UL) SetPageUptodate(page); } @@ -194,7 +195,7 @@ test_page_region( { unsigned long mask = page_region_mask(offset, length); - return (mask && (page->private & mask) == mask); + return (mask && (page_private(page) & mask) == mask); } /* diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 88af42f5e04..c937d6e6550 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -126,8 +126,8 @@ BUFFER_FNS(Eopnotsupp, eopnotsupp) /* If we *know* page->private refers to buffer_heads */ #define page_buffers(page) \ ({ \ - BUG_ON(!PagePrivate(page)); \ - ((struct buffer_head *)(page)->private); \ + BUG_ON(!PagePrivate(page)); \ + ((struct buffer_head *)page_private(page)); \ }) #define page_has_buffers(page) PagePrivate(page) @@ -219,7 +219,7 @@ static inline void attach_page_buffers(struct page *page, { page_cache_get(page); SetPagePrivate(page); - page->private = (unsigned long)head; + set_page_private(page, (unsigned long)head); } static inline void get_bh(struct buffer_head *bh) diff --git a/include/linux/mm.h b/include/linux/mm.h index e8d1424153b..8a514eca40d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -226,13 +226,18 @@ struct page { * to show when page is mapped * & limit reverse map searches. */ - unsigned long private; /* Mapping-private opaque data: + union { + unsigned long private; /* Mapping-private opaque data: * usually used for buffer_heads * if PagePrivate set; used for * swp_entry_t if PageSwapCache * When page is free, this indicates * order in the buddy system. */ +#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS + spinlock_t ptl; +#endif + } u; struct address_space *mapping; /* If low bit clear, points to * inode address_space, or NULL. 
* If page mapped as anonymous @@ -260,6 +265,9 @@ struct page { #endif /* WANT_PAGE_VIRTUAL */ }; +#define page_private(page) ((page)->u.private) +#define set_page_private(page, v) ((page)->u.private = (v)) + /* * FIXME: take this include out, include page-flags.h in * files which need it (119 of them) @@ -311,17 +319,17 @@ extern void FASTCALL(__page_cache_release(struct page *)); #ifdef CONFIG_HUGETLB_PAGE -static inline int page_count(struct page *p) +static inline int page_count(struct page *page) { - if (PageCompound(p)) - p = (struct page *)p->private; - return atomic_read(&(p)->_count) + 1; + if (PageCompound(page)) + page = (struct page *)page_private(page); + return atomic_read(&page->_count) + 1; } static inline void get_page(struct page *page) { if (unlikely(PageCompound(page))) - page = (struct page *)page->private; + page = (struct page *)page_private(page); atomic_inc(&page->_count); } @@ -587,7 +595,7 @@ static inline int PageAnon(struct page *page) static inline pgoff_t page_index(struct page *page) { if (unlikely(PageSwapCache(page))) - return page->private; + return page_private(page); return page->index; } @@ -779,9 +787,31 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a } #endif /* CONFIG_MMU && !__ARCH_HAS_4LEVEL_HACK */ +#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS +/* + * We tuck a spinlock to guard each pagetable page into its struct page, + * at page->private, with BUILD_BUG_ON to make sure that this will not + * overflow into the next struct page (as it might with DEBUG_SPINLOCK). + * When freeing, reset page->mapping so free_pages_check won't complain. + */ +#define __pte_lockptr(page) &((page)->u.ptl) +#define pte_lock_init(_page) do { \ + spin_lock_init(__pte_lockptr(_page)); \ +} while (0) +#define pte_lock_deinit(page) ((page)->mapping = NULL) +#define pte_lockptr(mm, pmd) ({(void)(mm); __pte_lockptr(pmd_page(*(pmd)));}) +#else +/* + * We use mm->page_table_lock to guard all pagetable pages of the mm. + */ +#define pte_lock_init(page) do {} while (0) +#define pte_lock_deinit(page) do {} while (0) +#define pte_lockptr(mm, pmd) ({(void)(pmd); &(mm)->page_table_lock;}) +#endif /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */ + #define pte_offset_map_lock(mm, pmd, address, ptlp) \ ({ \ - spinlock_t *__ptl = &(mm)->page_table_lock; \ + spinlock_t *__ptl = pte_lockptr(mm, pmd); \ pte_t *__pte = pte_offset_map(pmd, address); \ *(ptlp) = __ptl; \ spin_lock(__ptl); \ diff --git a/kernel/kexec.c b/kernel/kexec.c index 36c5d9cd4cc..2c95848fbce 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -334,7 +334,7 @@ static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order) if (pages) { unsigned int count, i; pages->mapping = NULL; - pages->private = order; + set_page_private(pages, order); count = 1 << order; for (i = 0; i < count; i++) SetPageReserved(pages + i); @@ -347,7 +347,7 @@ static void kimage_free_pages(struct page *page) { unsigned int order, count, i; - order = page->private; + order = page_private(page); count = 1 << order; for (i = 0; i < count; i++) ClearPageReserved(page + i); diff --git a/mm/Kconfig b/mm/Kconfig index 391ffc54d13..f35a550ba4b 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -111,3 +111,16 @@ config SPARSEMEM_STATIC config SPARSEMEM_EXTREME def_bool y depends on SPARSEMEM && !SPARSEMEM_STATIC + +# Heavily threaded applications may benefit from splitting the mm-wide +# page_table_lock, so that faults on different parts of the user address +# space can be handled with less contention: split it at this NR_CPUS. 
+# Default to 4 for wider testing, though 8 might be more appropriate. +# ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock. +# PA-RISC's debug spinlock_t is too large for the 32-bit struct page. +# +config SPLIT_PTLOCK_CPUS + int + default "4096" if ARM && !CPU_CACHE_VIPT + default "4096" if PARISC && DEBUG_SPINLOCK && !64BIT + default "4" diff --git a/mm/filemap.c b/mm/filemap.c index 8aa344e8848..f560b41c8f6 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -152,7 +152,7 @@ static int sync_page(void *word) * in the ->sync_page() methods make essential use of the * page_mapping(), merely passing the page down to the backing * device's unplug functions when it's non-NULL, which in turn - * ignore it for all cases but swap, where only page->private is + * ignore it for all cases but swap, where only page_private(page) is * of interest. When page_mapping() does go NULL, the entire * call stack gracefully ignores the page and returns. * -- wli diff --git a/mm/memory.c b/mm/memory.c index 8461e2dd91d..e9ef599498b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -114,6 +114,7 @@ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd) { struct page *page = pmd_page(*pmd); pmd_clear(pmd); + pte_lock_deinit(page); pte_free_tlb(tlb, page); dec_page_state(nr_page_table_pages); tlb->mm->nr_ptes--; @@ -294,10 +295,12 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) if (!new) return -ENOMEM; + pte_lock_init(new); spin_lock(&mm->page_table_lock); - if (pmd_present(*pmd)) /* Another has populated it */ + if (pmd_present(*pmd)) { /* Another has populated it */ + pte_lock_deinit(new); pte_free(new); - else { + } else { mm->nr_ptes++; inc_page_state(nr_page_table_pages); pmd_populate(mm, pmd, new); @@ -432,7 +435,7 @@ again: if (!dst_pte) return -ENOMEM; src_pte = pte_offset_map_nested(src_pmd, addr); - src_ptl = &src_mm->page_table_lock; + src_ptl = pte_lockptr(src_mm, src_pmd); spin_lock(src_ptl); do { @@ -1194,15 +1197,16 @@ EXPORT_SYMBOL(remap_pfn_range); * (but do_wp_page is only called after already making such a check; * and do_anonymous_page and do_no_page can safely check later on). 
*/ -static inline int pte_unmap_same(struct mm_struct *mm, +static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, pte_t *page_table, pte_t orig_pte) { int same = 1; #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT) if (sizeof(pte_t) > sizeof(unsigned long)) { - spin_lock(&mm->page_table_lock); + spinlock_t *ptl = pte_lockptr(mm, pmd); + spin_lock(ptl); same = pte_same(*page_table, orig_pte); - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); } #endif pte_unmap(page_table); @@ -1655,7 +1659,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, pte_t pte; int ret = VM_FAULT_MINOR; - if (!pte_unmap_same(mm, page_table, orig_pte)) + if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) goto out; entry = pte_to_swp_entry(orig_pte); @@ -1773,7 +1777,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, page_cache_get(page); entry = mk_pte(page, vma->vm_page_prot); - ptl = &mm->page_table_lock; + ptl = pte_lockptr(mm, pmd); spin_lock(ptl); if (!pte_none(*page_table)) goto release; @@ -1934,7 +1938,7 @@ static int do_file_page(struct mm_struct *mm, struct vm_area_struct *vma, pgoff_t pgoff; int err; - if (!pte_unmap_same(mm, page_table, orig_pte)) + if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) return VM_FAULT_MINOR; if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) { @@ -1992,7 +1996,7 @@ static inline int handle_pte_fault(struct mm_struct *mm, pte, pmd, write_access, entry); } - ptl = &mm->page_table_lock; + ptl = pte_lockptr(mm, pmd); spin_lock(ptl); if (unlikely(!pte_same(*pte, entry))) goto unlock; diff --git a/mm/mremap.c b/mm/mremap.c index 8de77b632a2..b535438c363 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -72,7 +72,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, struct address_space *mapping = NULL; struct mm_struct *mm = vma->vm_mm; pte_t *old_pte, *new_pte, pte; - spinlock_t *old_ptl; + spinlock_t *old_ptl, *new_ptl; if (vma->vm_file) { /* @@ -88,8 +88,15 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, new_vma->vm_truncate_count = 0; } + /* + * We don't have to worry about the ordering of src and dst + * pte locks because exclusive mmap_sem prevents deadlock. 
+ */ old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl); new_pte = pte_offset_map_nested(new_pmd, new_addr); + new_ptl = pte_lockptr(mm, new_pmd); + if (new_ptl != old_ptl) + spin_lock(new_ptl); for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE, new_pte++, new_addr += PAGE_SIZE) { @@ -101,6 +108,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, set_pte_at(mm, new_addr, new_pte, pte); } + if (new_ptl != old_ptl) + spin_unlock(new_ptl); pte_unmap_nested(new_pte - 1); pte_unmap_unlock(old_pte - 1, old_ptl); if (mapping) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0541288ebf4..a2995a5d012 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -154,7 +154,7 @@ static void prep_compound_page(struct page *page, unsigned long order) struct page *p = page + i; SetPageCompound(p); - p->private = (unsigned long)page; + set_page_private(p, (unsigned long)page); } } @@ -174,7 +174,7 @@ static void destroy_compound_page(struct page *page, unsigned long order) if (!PageCompound(p)) bad_page(__FUNCTION__, page); - if (p->private != (unsigned long)page) + if (page_private(p) != (unsigned long)page) bad_page(__FUNCTION__, page); ClearPageCompound(p); } @@ -187,18 +187,18 @@ static void destroy_compound_page(struct page *page, unsigned long order) * So, we don't need atomic page->flags operations here. */ static inline unsigned long page_order(struct page *page) { - return page->private; + return page_private(page); } static inline void set_page_order(struct page *page, int order) { - page->private = order; + set_page_private(page, order); __SetPagePrivate(page); } static inline void rmv_page_order(struct page *page) { __ClearPagePrivate(page); - page->private = 0; + set_page_private(page, 0); } /* @@ -238,7 +238,7 @@ __find_combined_index(unsigned long page_idx, unsigned int order) * (a) the buddy is free && * (b) the buddy is on the buddy system && * (c) a page and its buddy have the same order. - * for recording page's order, we use page->private and PG_private. + * for recording page's order, we use page_private(page) and PG_private. * */ static inline int page_is_buddy(struct page *page, int order) @@ -264,7 +264,7 @@ static inline int page_is_buddy(struct page *page, int order) * parts of the VM system. * At each level, we keep a list of pages, which are heads of continuous * free pages of length of (1 << order) and marked with PG_Private.Page's - * order is recorded in page->private field. + * order is recorded in page_private(page) field. * So when we are allocating or freeing one, we can derive the state of the * other. That is, if we allocate a small block, and both were * free, the remainder of the region must be split into blocks. 
@@ -463,7 +463,7 @@ static void prep_new_page(struct page *page, int order) page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_referenced | 1 << PG_arch_1 | 1 << PG_checked | 1 << PG_mappedtodisk); - page->private = 0; + set_page_private(page, 0); set_page_refs(page, order); kernel_map_pages(page, 1 << order, 1); } diff --git a/mm/page_io.c b/mm/page_io.c index 330e00d6db0..bb2b0d53889 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -91,7 +91,8 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) unlock_page(page); goto out; } - bio = get_swap_bio(GFP_NOIO, page->private, page, end_swap_bio_write); + bio = get_swap_bio(GFP_NOIO, page_private(page), page, + end_swap_bio_write); if (bio == NULL) { set_page_dirty(page); unlock_page(page); @@ -115,7 +116,8 @@ int swap_readpage(struct file *file, struct page *page) BUG_ON(!PageLocked(page)); ClearPageUptodate(page); - bio = get_swap_bio(GFP_KERNEL, page->private, page, end_swap_bio_read); + bio = get_swap_bio(GFP_KERNEL, page_private(page), page, + end_swap_bio_read); if (bio == NULL) { unlock_page(page); ret = -ENOMEM; diff --git a/mm/rmap.c b/mm/rmap.c index a84bdfe582c..a33e779d1bd 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -274,7 +274,7 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm, return NULL; } - ptl = &mm->page_table_lock; + ptl = pte_lockptr(mm, pmd); spin_lock(ptl); if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) { *ptlp = ptl; @@ -550,7 +550,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma) update_hiwater_rss(mm); if (PageAnon(page)) { - swp_entry_t entry = { .val = page->private }; + swp_entry_t entry = { .val = page_private(page) }; /* * Store the swap location in the pte. * See handle_pte_fault() ... 
diff --git a/mm/shmem.c b/mm/shmem.c index 37777f4c11f..dc25565a61e 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -71,9 +71,6 @@ /* Pretend that each entry is of this size in directory's i_size */ #define BOGO_DIRENT_SIZE 20 -/* Keep swapped page count in private field of indirect struct page */ -#define nr_swapped private - /* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */ enum sgp_type { SGP_QUICK, /* don't try more than file page cache lookup */ @@ -324,8 +321,10 @@ static void shmem_swp_set(struct shmem_inode_info *info, swp_entry_t *entry, uns entry->val = value; info->swapped += incdec; - if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT) - kmap_atomic_to_page(entry)->nr_swapped += incdec; + if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT) { + struct page *page = kmap_atomic_to_page(entry); + set_page_private(page, page_private(page) + incdec); + } } /* @@ -368,9 +367,8 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long spin_unlock(&info->lock); page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping) | __GFP_ZERO); - if (page) { - page->nr_swapped = 0; - } + if (page) + set_page_private(page, 0); spin_lock(&info->lock); if (!page) { @@ -561,7 +559,7 @@ static void shmem_truncate(struct inode *inode) diroff = 0; } subdir = dir[diroff]; - if (subdir && subdir->nr_swapped) { + if (subdir && page_private(subdir)) { size = limit - idx; if (size > ENTRIES_PER_PAGE) size = ENTRIES_PER_PAGE; @@ -572,10 +570,10 @@ static void shmem_truncate(struct inode *inode) nr_swaps_freed += freed; if (offset) spin_lock(&info->lock); - subdir->nr_swapped -= freed; + set_page_private(subdir, page_private(subdir) - freed); if (offset) spin_unlock(&info->lock); - BUG_ON(subdir->nr_swapped > offset); + BUG_ON(page_private(subdir) > offset); } if (offset) offset = 0; @@ -743,7 +741,7 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s dir = shmem_dir_map(subdir); } subdir = *dir; - if (subdir && subdir->nr_swapped) { + if (subdir && page_private(subdir)) { ptr = shmem_swp_map(subdir); size = limit - idx; if (size > ENTRIES_PER_PAGE) diff --git a/mm/swap.c b/mm/swap.c index 21d15f99805..b89512877ec 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -39,7 +39,7 @@ int page_cluster; void put_page(struct page *page) { if (unlikely(PageCompound(page))) { - page = (struct page *)page->private; + page = (struct page *)page_private(page); if (put_page_testzero(page)) { void (*dtor)(struct page *page); diff --git a/mm/swap_state.c b/mm/swap_state.c index 132164f7d0a..cafc1edcbeb 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -83,7 +83,7 @@ static int __add_to_swap_cache(struct page *page, swp_entry_t entry, page_cache_get(page); SetPageLocked(page); SetPageSwapCache(page); - page->private = entry.val; + set_page_private(page, entry.val); total_swapcache_pages++; pagecache_acct(1); } @@ -126,8 +126,8 @@ void __delete_from_swap_cache(struct page *page) BUG_ON(PageWriteback(page)); BUG_ON(PagePrivate(page)); - radix_tree_delete(&swapper_space.page_tree, page->private); - page->private = 0; + radix_tree_delete(&swapper_space.page_tree, page_private(page)); + set_page_private(page, 0); ClearPageSwapCache(page); total_swapcache_pages--; pagecache_acct(-1); @@ -197,7 +197,7 @@ void delete_from_swap_cache(struct page *page) { swp_entry_t entry; - entry.val = page->private; + entry.val = page_private(page); write_lock_irq(&swapper_space.tree_lock); __delete_from_swap_cache(page); diff --git a/mm/swapfile.c 
b/mm/swapfile.c index 510f0039b00..8970c0b7419 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -61,7 +61,7 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page) swp_entry_t entry; down_read(&swap_unplug_sem); - entry.val = page->private; + entry.val = page_private(page); if (PageSwapCache(page)) { struct block_device *bdev = swap_info[swp_type(entry)].bdev; struct backing_dev_info *bdi; @@ -69,8 +69,8 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page) /* * If the page is removed from swapcache from under us (with a * racy try_to_unuse/swapoff) we need an additional reference - * count to avoid reading garbage from page->private above. If - * the WARN_ON triggers during a swapoff it maybe the race + * count to avoid reading garbage from page_private(page) above. + * If the WARN_ON triggers during a swapoff it maybe the race * condition and it's harmless. However if it triggers without * swapoff it signals a problem. */ @@ -294,7 +294,7 @@ static inline int page_swapcount(struct page *page) struct swap_info_struct *p; swp_entry_t entry; - entry.val = page->private; + entry.val = page_private(page); p = swap_info_get(entry); if (p) { /* Subtract the 1 for the swap cache itself */ @@ -339,7 +339,7 @@ int remove_exclusive_swap_page(struct page *page) if (page_count(page) != 2) /* 2: us + cache */ return 0; - entry.val = page->private; + entry.val = page_private(page); p = swap_info_get(entry); if (!p) return 0; @@ -1042,7 +1042,7 @@ int page_queue_congested(struct page *page) BUG_ON(!PageLocked(page)); /* It pins the swap_info_struct */ if (PageSwapCache(page)) { - swp_entry_t entry = { .val = page->private }; + swp_entry_t entry = { .val = page_private(page) }; struct swap_info_struct *sis; sis = get_swap_info_struct(swp_type(entry)); diff --git a/mm/vmscan.c b/mm/vmscan.c index 41d1064aabf..135bf8ca96e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -521,7 +521,7 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc) #ifdef CONFIG_SWAP if (PageSwapCache(page)) { - swp_entry_t swap = { .val = page->private }; + swp_entry_t swap = { .val = page_private(page) }; __delete_from_swap_cache(page); write_unlock_irq(&mapping->tree_lock); swap_free(swap); -- cgit v1.2.3-70-g09d2 From b8072f099b7829a6ff3eba618e1d079a81f753f8 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:41 -0700 Subject: [PATCH] mm: update comments to pte lock Updated several references to page_table_lock in common code comments. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/pgtable.h | 2 +- include/linux/mempolicy.h | 3 +-- mm/filemap.c | 6 +++--- mm/rmap.c | 10 +++++----- mm/swap_state.c | 3 +-- 5 files changed, 11 insertions(+), 13 deletions(-) (limited to 'mm/filemap.c') diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index ff28c8b31f5..7dca30a26c5 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -8,7 +8,7 @@ * - update the page tables * - inform the TLB about the new one * - * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock. + * We hold the mm semaphore for reading, and the pte lock. * * Note: the old pte is known to not be writable, so we don't need to * worry about dirty bits etc getting lost. 
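(The "pte lock" these comments now refer to is the lock returned by pte_lockptr(), taken and dropped through the pte_offset_map_lock()/pte_unmap_unlock() helpers from the split page table lock patch above. A minimal sketch of that usage pattern, assuming those helpers; the function name is illustrative and not part of any patch here:)

#include <linux/mm.h>
#include <linux/errno.h>

static int example_touch_pte(struct mm_struct *mm, pmd_t *pmd,
                             unsigned long address)
{
        spinlock_t *ptl;
        pte_t *pte;

        /* Map the pte and take pte_lockptr(mm, pmd) in one step. */
        pte = pte_offset_map_lock(mm, pmd, address, &ptl);
        if (!pte_present(*pte)) {
                pte_unmap_unlock(pte, ptl);
                return -EFAULT;
        }
        /* ... inspect or update *pte while holding the pte lock ... */
        pte_unmap_unlock(pte, ptl);
        return 0;
}

(When CONFIG_SPLIT_PTLOCK_CPUS exceeds NR_CPUS, pte_lockptr() simply falls back to mm->page_table_lock, so the same pattern covers both configurations.)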
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 38e60a09939..7af8cb836e7 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -47,8 +47,7 @@ struct vm_area_struct; * Locking policy for interlave: * In process context there is no locking because only the process accesses * its own state. All vma manipulation is somewhat protected by a down_read on - * mmap_sem. For allocating in the interleave policy the page_table_lock - * must be also aquired to protect il_next. + * mmap_sem. * * Freeing policy: * When policy is MPOL_BIND v.zonelist is kmalloc'ed and must be kfree'd. diff --git a/mm/filemap.c b/mm/filemap.c index f560b41c8f6..036599d1177 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -66,7 +66,7 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, * * ->mmap_sem * ->i_mmap_lock - * ->page_table_lock (various places, mainly in mmap.c) + * ->page_table_lock or pte_lock (various, mainly in memory.c) * ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock) * * ->mmap_sem @@ -86,9 +86,9 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, * ->anon_vma.lock (vma_adjust) * * ->anon_vma.lock - * ->page_table_lock (anon_vma_prepare and various) + * ->page_table_lock or pte_lock (anon_vma_prepare and various) * - * ->page_table_lock + * ->page_table_lock or pte_lock * ->swap_lock (try_to_unmap_one) * ->private_lock (try_to_unmap_one) * ->tree_lock (try_to_unmap_one) diff --git a/mm/rmap.c b/mm/rmap.c index a7427bbf57e..914d04b98be 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -32,7 +32,7 @@ * page->flags PG_locked (lock_page) * mapping->i_mmap_lock * anon_vma->lock - * mm->page_table_lock + * mm->page_table_lock or pte_lock * zone->lru_lock (in mark_page_accessed) * swap_lock (in swap_duplicate, swap_info_get) * mmlist_lock (in mmput, drain_mmlist and others) @@ -244,7 +244,7 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) /* * Check that @page is mapped at @address into @mm. * - * On success returns with mapped pte and locked mm->page_table_lock. + * On success returns with pte mapped and locked. */ pte_t *page_check_address(struct page *page, struct mm_struct *mm, unsigned long address, spinlock_t **ptlp) @@ -445,7 +445,7 @@ int page_referenced(struct page *page, int is_locked, int ignore_token) * @vma: the vm area in which the mapping is added * @address: the user virtual address mapped * - * The caller needs to hold the mm->page_table_lock. + * The caller needs to hold the pte lock. */ void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address) @@ -468,7 +468,7 @@ void page_add_anon_rmap(struct page *page, * page_add_file_rmap - add pte mapping to a file page * @page: the page to add the mapping to * - * The caller needs to hold the mm->page_table_lock. + * The caller needs to hold the pte lock. */ void page_add_file_rmap(struct page *page) { @@ -483,7 +483,7 @@ void page_add_file_rmap(struct page *page) * page_remove_rmap - take down pte mapping from a page * @page: page to remove mapping from * - * Caller needs to hold the mm->page_table_lock. + * The caller needs to hold the pte lock. 
*/ void page_remove_rmap(struct page *page) { diff --git a/mm/swap_state.c index cafc1edcbeb..dfd9a46755b 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -259,8 +259,7 @@ static inline void free_swap_cache(struct page *page) /* * Perform a free_page(), also freeing any swap cache associated with - * this page if it is the last user of the page. Can not do a lock_page, - * as we are holding the page_table_lock spinlock. + * this page if it is the last user of the page. */ void free_page_and_swap_cache(struct page *page) { -- cgit v1.2.3-70-g09d2 From b1459461f1e0abd5c28317d6bff6f2ca612a719d Mon Sep 17 00:00:00 2001 From: Nikita Danilov Date: Sat, 29 Oct 2005 18:17:02 -0700 Subject: [PATCH] mm/filemap.c:filemap_populate(): move export. Move EXPORT_SYMBOL(filemap_populate) to the proper place: just after the function itself; otherwise it is easy to miss that the function is exported. Signed-off-by: Nikita Danilov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index 036599d1177..768687f1d46 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1537,6 +1537,7 @@ repeat: return 0; } +EXPORT_SYMBOL(filemap_populate); struct vm_operations_struct generic_file_vm_ops = { .nopage = filemap_nopage, @@ -1555,7 +1556,6 @@ int generic_file_mmap(struct file * file, struct vm_area_struct * vma) vma->vm_ops = &generic_file_vm_ops; return 0; } -EXPORT_SYMBOL(filemap_populate); /* * This is for filesystems which do not implement ->writepage. -- cgit v1.2.3-70-g09d2 From 39e88ca2c9d0f6d1e9f34ea2a6e86a652bb69a7a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 30 Oct 2005 15:02:40 -0800 Subject: [PATCH] fs: error case fix in __generic_file_aio_read When __generic_file_aio_read() hits an error during reading, it reports the error iff nothing has successfully been read yet, and otherwise silently continues with the remaining segments. The desired convention is: when an error occurs, if nothing has been read/written, report the error code; otherwise, report the number of bytes successfully transferred up to that point. This corner case can be exposed by performing readv(2) with the following iov. iov[0] = len0 @ ptr0 iov[1] = len1 @ NULL (or any other invalid pointer) iov[2] = len2 @ ptr2 When the file is large enough, performing the above readv(2) results in len0 bytes from file_pos @ ptr0 len2 bytes from file_pos + len0 @ ptr2 And the return value is len0 + len2. A test program is attached below. This patch makes __generic_file_aio_read()'s error handling identical to other functions.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/uio.h>
#include <fcntl.h>

int main(int argc, char **argv)
{
        const char *path;
        struct stat stbuf;
        size_t len0, len1;
        void *buf0, *buf1;
        struct iovec iov[3];
        int fd, i;
        ssize_t ret;

        if (argc < 2) {
                fprintf(stderr, "Usage: testreadv path (better be a "
                        "small text file)\n");
                return 1;
        }
        path = argv[1];

        if (stat(path, &stbuf) < 0) {
                perror("stat");
                return 1;
        }

        len0 = stbuf.st_size / 2;
        len1 = stbuf.st_size - len0;
        if (!len0 || !len1) {
                fprintf(stderr, "Dude, file is too small\n");
                return 1;
        }

        if ((fd = open(path, O_RDONLY)) < 0) {
                perror("open");
                return 1;
        }

        if (!(buf0 = malloc(len0)) || !(buf1 = malloc(len1))) {
                perror("malloc");
                return 1;
        }

        memset(buf0, 0, len0);
        memset(buf1, 0, len1);

        iov[0].iov_base = buf0;
        iov[0].iov_len = len0;
        iov[1].iov_base = NULL;
        iov[1].iov_len = len1;
        iov[2].iov_base = buf1;
        iov[2].iov_len = len1;

        printf("vector ");
        for (i = 0; i < 3; i++)
                printf("%p:%zu ", iov[i].iov_base, iov[i].iov_len);
        printf("\n");

        ret = readv(fd, iov, 3);
        if (ret < 0)
                perror("readv");

        printf("readv returned %zd\nbuf0 = [%s]\nbuf1 = [%s]\n",
               ret, (char *)buf0, (char *)buf1);

        return 0;
}

Signed-off-by: Tejun Heo
Cc: Benjamin LaHaise
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/filemap.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
(limited to 'mm/filemap.c')

diff --git a/mm/filemap.c b/mm/filemap.c
index 768687f1d46..5d6e4c2000d 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1030,8 +1030,8 @@ __generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
                        desc.error = 0;
                        do_generic_file_read(filp,ppos,&desc,file_read_actor);
                        retval += desc.written;
-                       if (!retval) {
-                               retval = desc.error;
+                       if (desc.error) {
+                               retval = retval ?: desc.error;
                                break;
                        }
                }
-- cgit v1.2.3-70-g09d2
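One note on the final hunk: "retval = retval ?: desc.error;" uses the GNU C conditional with the middle operand omitted, so the expression evaluates to retval when retval is nonzero and to desc.error otherwise, evaluating retval only once. A small userspace sketch of the convention the patch adopts, with illustrative names rather than kernel code:

#include <stdio.h>

/*
 * Convention adopted by the patch: if some bytes were transferred before
 * the error, report that count and stop; only when nothing was read does
 * the (negative errno) error code reach the caller.  "a ?: b" is the GNU C
 * shorthand for "a ? a : b" with "a" evaluated once.
 */
static long report_read_result(long bytes_done, long error)
{
        return bytes_done ? bytes_done : error;
}

int main(void)
{
        printf("%ld\n", report_read_result(4096, -14)); /* 4096: partial read wins */
        printf("%ld\n", report_read_result(0, -14));    /* -14: nothing was read */
        return 0;
}

The first call reports the partial byte count and the second reports the error code, matching the convention described in the changelog above.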