diff options
Diffstat (limited to 'mm/rmap.c')
-rw-r--r-- | mm/rmap.c | 301 |
1 files changed, 226 insertions, 75 deletions
diff --git a/mm/rmap.c b/mm/rmap.c index 8da044a1db0..27dfd3b82b0 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -24,8 +24,8 @@ * inode->i_alloc_sem (vmtruncate_range) * mm->mmap_sem * page->flags PG_locked (lock_page) - * mapping->i_mmap_lock - * anon_vma->lock + * mapping->i_mmap_mutex + * anon_vma->mutex * mm->page_table_lock or pte_lock * zone->lru_lock (in mark_page_accessed, isolate_lru_page) * swap_lock (in swap_duplicate, swap_info_get) @@ -40,7 +40,7 @@ * * (code doesn't rely on that order so it could be switched around) * ->tasklist_lock - * anon_vma->lock (memory_failure, collect_procs_anon) + * anon_vma->mutex (memory_failure, collect_procs_anon) * pte map lock */ @@ -86,12 +86,35 @@ static inline struct anon_vma *anon_vma_alloc(void) static inline void anon_vma_free(struct anon_vma *anon_vma) { VM_BUG_ON(atomic_read(&anon_vma->refcount)); + + /* + * Synchronize against page_lock_anon_vma() such that + * we can safely hold the lock without the anon_vma getting + * freed. + * + * Relies on the full mb implied by the atomic_dec_and_test() from + * put_anon_vma() against the acquire barrier implied by + * mutex_trylock() from page_lock_anon_vma(). This orders: + * + * page_lock_anon_vma() VS put_anon_vma() + * mutex_trylock() atomic_dec_and_test() + * LOCK MB + * atomic_read() mutex_is_locked() + * + * LOCK should suffice since the actual taking of the lock must + * happen _before_ what follows. + */ + if (mutex_is_locked(&anon_vma->root->mutex)) { + anon_vma_lock(anon_vma); + anon_vma_unlock(anon_vma); + } + kmem_cache_free(anon_vma_cachep, anon_vma); } -static inline struct anon_vma_chain *anon_vma_chain_alloc(void) +static inline struct anon_vma_chain *anon_vma_chain_alloc(gfp_t gfp) { - return kmem_cache_alloc(anon_vma_chain_cachep, GFP_KERNEL); + return kmem_cache_alloc(anon_vma_chain_cachep, gfp); } static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain) @@ -136,7 +159,7 @@ int anon_vma_prepare(struct vm_area_struct *vma) struct mm_struct *mm = vma->vm_mm; struct anon_vma *allocated; - avc = anon_vma_chain_alloc(); + avc = anon_vma_chain_alloc(GFP_KERNEL); if (!avc) goto out_enomem; @@ -177,6 +200,32 @@ int anon_vma_prepare(struct vm_area_struct *vma) return -ENOMEM; } +/* + * This is a useful helper function for locking the anon_vma root as + * we traverse the vma->anon_vma_chain, looping over anon_vma's that + * have the same vma. + * + * Such anon_vma's should have the same root, so you'd expect to see + * just a single mutex_lock for the whole traversal. + */ +static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct anon_vma *anon_vma) +{ + struct anon_vma *new_root = anon_vma->root; + if (new_root != root) { + if (WARN_ON_ONCE(root)) + mutex_unlock(&root->mutex); + root = new_root; + mutex_lock(&root->mutex); + } + return root; +} + +static inline void unlock_anon_vma_root(struct anon_vma *root) +{ + if (root) + mutex_unlock(&root->mutex); +} + static void anon_vma_chain_link(struct vm_area_struct *vma, struct anon_vma_chain *avc, struct anon_vma *anon_vma) @@ -185,13 +234,11 @@ static void anon_vma_chain_link(struct vm_area_struct *vma, avc->anon_vma = anon_vma; list_add(&avc->same_vma, &vma->anon_vma_chain); - anon_vma_lock(anon_vma); /* * It's critical to add new vmas to the tail of the anon_vma, * see comment in huge_memory.c:__split_huge_page(). */ list_add_tail(&avc->same_anon_vma, &anon_vma->head); - anon_vma_unlock(anon_vma); } /* @@ -201,13 +248,24 @@ static void anon_vma_chain_link(struct vm_area_struct *vma, int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) { struct anon_vma_chain *avc, *pavc; + struct anon_vma *root = NULL; list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) { - avc = anon_vma_chain_alloc(); - if (!avc) - goto enomem_failure; - anon_vma_chain_link(dst, avc, pavc->anon_vma); + struct anon_vma *anon_vma; + + avc = anon_vma_chain_alloc(GFP_NOWAIT | __GFP_NOWARN); + if (unlikely(!avc)) { + unlock_anon_vma_root(root); + root = NULL; + avc = anon_vma_chain_alloc(GFP_KERNEL); + if (!avc) + goto enomem_failure; + } + anon_vma = pavc->anon_vma; + root = lock_anon_vma_root(root, anon_vma); + anon_vma_chain_link(dst, avc, anon_vma); } + unlock_anon_vma_root(root); return 0; enomem_failure: @@ -240,7 +298,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) anon_vma = anon_vma_alloc(); if (!anon_vma) goto out_error; - avc = anon_vma_chain_alloc(); + avc = anon_vma_chain_alloc(GFP_KERNEL); if (!avc) goto out_error_free_anon_vma; @@ -257,7 +315,9 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) get_anon_vma(anon_vma->root); /* Mark this anon_vma as the one where our new (COWed) pages go. */ vma->anon_vma = anon_vma; + anon_vma_lock(anon_vma); anon_vma_chain_link(vma, avc, anon_vma); + anon_vma_unlock(anon_vma); return 0; @@ -268,36 +328,43 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) return -ENOMEM; } -static void anon_vma_unlink(struct anon_vma_chain *anon_vma_chain) -{ - struct anon_vma *anon_vma = anon_vma_chain->anon_vma; - int empty; - - /* If anon_vma_fork fails, we can get an empty anon_vma_chain. */ - if (!anon_vma) - return; - - anon_vma_lock(anon_vma); - list_del(&anon_vma_chain->same_anon_vma); - - /* We must garbage collect the anon_vma if it's empty */ - empty = list_empty(&anon_vma->head); - anon_vma_unlock(anon_vma); - - if (empty) - put_anon_vma(anon_vma); -} - void unlink_anon_vmas(struct vm_area_struct *vma) { struct anon_vma_chain *avc, *next; + struct anon_vma *root = NULL; /* * Unlink each anon_vma chained to the VMA. This list is ordered * from newest to oldest, ensuring the root anon_vma gets freed last. */ list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { - anon_vma_unlink(avc); + struct anon_vma *anon_vma = avc->anon_vma; + + root = lock_anon_vma_root(root, anon_vma); + list_del(&avc->same_anon_vma); + + /* + * Leave empty anon_vmas on the list - we'll need + * to free them outside the lock. + */ + if (list_empty(&anon_vma->head)) + continue; + + list_del(&avc->same_vma); + anon_vma_chain_free(avc); + } + unlock_anon_vma_root(root); + + /* + * Iterate the list once more, it now only contains empty and unlinked + * anon_vmas, destroy them. Could not do before due to __put_anon_vma() + * needing to acquire the anon_vma->root->mutex. + */ + list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { + struct anon_vma *anon_vma = avc->anon_vma; + + put_anon_vma(anon_vma); + list_del(&avc->same_vma); anon_vma_chain_free(avc); } @@ -307,7 +374,7 @@ static void anon_vma_ctor(void *data) { struct anon_vma *anon_vma = data; - spin_lock_init(&anon_vma->lock); + mutex_init(&anon_vma->mutex); atomic_set(&anon_vma->refcount, 0); INIT_LIST_HEAD(&anon_vma->head); } @@ -320,12 +387,31 @@ void __init anon_vma_init(void) } /* - * Getting a lock on a stable anon_vma from a page off the LRU is - * tricky: page_lock_anon_vma rely on RCU to guard against the races. + * Getting a lock on a stable anon_vma from a page off the LRU is tricky! + * + * Since there is no serialization what so ever against page_remove_rmap() + * the best this function can do is return a locked anon_vma that might + * have been relevant to this page. + * + * The page might have been remapped to a different anon_vma or the anon_vma + * returned may already be freed (and even reused). + * + * In case it was remapped to a different anon_vma, the new anon_vma will be a + * child of the old anon_vma, and the anon_vma lifetime rules will therefore + * ensure that any anon_vma obtained from the page will still be valid for as + * long as we observe page_mapped() [ hence all those page_mapped() tests ]. + * + * All users of this function must be very careful when walking the anon_vma + * chain and verify that the page in question is indeed mapped in it + * [ something equivalent to page_mapped_in_vma() ]. + * + * Since anon_vma's slab is DESTROY_BY_RCU and we know from page_remove_rmap() + * that the anon_vma pointer from page->mapping is valid if there is a + * mapcount, we can dereference the anon_vma after observing those. */ -struct anon_vma *__page_lock_anon_vma(struct page *page) +struct anon_vma *page_get_anon_vma(struct page *page) { - struct anon_vma *anon_vma, *root_anon_vma; + struct anon_vma *anon_vma = NULL; unsigned long anon_mapping; rcu_read_lock(); @@ -336,32 +422,100 @@ struct anon_vma *__page_lock_anon_vma(struct page *page) goto out; anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); - root_anon_vma = ACCESS_ONCE(anon_vma->root); - spin_lock(&root_anon_vma->lock); + if (!atomic_inc_not_zero(&anon_vma->refcount)) { + anon_vma = NULL; + goto out; + } /* * If this page is still mapped, then its anon_vma cannot have been - * freed. But if it has been unmapped, we have no security against - * the anon_vma structure being freed and reused (for another anon_vma: - * SLAB_DESTROY_BY_RCU guarantees that - so the spin_lock above cannot - * corrupt): with anon_vma_prepare() or anon_vma_fork() redirecting - * anon_vma->root before page_unlock_anon_vma() is called to unlock. + * freed. But if it has been unmapped, we have no security against the + * anon_vma structure being freed and reused (for another anon_vma: + * SLAB_DESTROY_BY_RCU guarantees that - so the atomic_inc_not_zero() + * above cannot corrupt). */ - if (page_mapped(page)) - return anon_vma; + if (!page_mapped(page)) { + put_anon_vma(anon_vma); + anon_vma = NULL; + } +out: + rcu_read_unlock(); + + return anon_vma; +} + +/* + * Similar to page_get_anon_vma() except it locks the anon_vma. + * + * Its a little more complex as it tries to keep the fast path to a single + * atomic op -- the trylock. If we fail the trylock, we fall back to getting a + * reference like with page_get_anon_vma() and then block on the mutex. + */ +struct anon_vma *page_lock_anon_vma(struct page *page) +{ + struct anon_vma *anon_vma = NULL; + struct anon_vma *root_anon_vma; + unsigned long anon_mapping; + + rcu_read_lock(); + anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping); + if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) + goto out; + if (!page_mapped(page)) + goto out; + + anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); + root_anon_vma = ACCESS_ONCE(anon_vma->root); + if (mutex_trylock(&root_anon_vma->mutex)) { + /* + * If the page is still mapped, then this anon_vma is still + * its anon_vma, and holding the mutex ensures that it will + * not go away, see anon_vma_free(). + */ + if (!page_mapped(page)) { + mutex_unlock(&root_anon_vma->mutex); + anon_vma = NULL; + } + goto out; + } + + /* trylock failed, we got to sleep */ + if (!atomic_inc_not_zero(&anon_vma->refcount)) { + anon_vma = NULL; + goto out; + } + + if (!page_mapped(page)) { + put_anon_vma(anon_vma); + anon_vma = NULL; + goto out; + } + + /* we pinned the anon_vma, its safe to sleep */ + rcu_read_unlock(); + anon_vma_lock(anon_vma); + + if (atomic_dec_and_test(&anon_vma->refcount)) { + /* + * Oops, we held the last refcount, release the lock + * and bail -- can't simply use put_anon_vma() because + * we'll deadlock on the anon_vma_lock() recursion. + */ + anon_vma_unlock(anon_vma); + __put_anon_vma(anon_vma); + anon_vma = NULL; + } + + return anon_vma; - spin_unlock(&root_anon_vma->lock); out: rcu_read_unlock(); - return NULL; + return anon_vma; } void page_unlock_anon_vma(struct anon_vma *anon_vma) - __releases(&anon_vma->root->lock) - __releases(RCU) { anon_vma_unlock(anon_vma); - rcu_read_unlock(); } /* @@ -646,14 +800,14 @@ static int page_referenced_file(struct page *page, * The page lock not only makes sure that page->mapping cannot * suddenly be NULLified by truncation, it makes sure that the * structure at mapping cannot be freed and reused yet, - * so we can safely take mapping->i_mmap_lock. + * so we can safely take mapping->i_mmap_mutex. */ BUG_ON(!PageLocked(page)); - spin_lock(&mapping->i_mmap_lock); + mutex_lock(&mapping->i_mmap_mutex); /* - * i_mmap_lock does not stabilize mapcount at all, but mapcount + * i_mmap_mutex does not stabilize mapcount at all, but mapcount * is more likely to be accurate if we note it after spinning. */ mapcount = page_mapcount(page); @@ -675,7 +829,7 @@ static int page_referenced_file(struct page *page, break; } - spin_unlock(&mapping->i_mmap_lock); + mutex_unlock(&mapping->i_mmap_mutex); return referenced; } @@ -719,7 +873,7 @@ int page_referenced(struct page *page, unlock_page(page); } out: - if (page_test_and_clear_young(page)) + if (page_test_and_clear_young(page_to_pfn(page))) referenced++; return referenced; @@ -762,7 +916,7 @@ static int page_mkclean_file(struct address_space *mapping, struct page *page) BUG_ON(PageAnon(page)); - spin_lock(&mapping->i_mmap_lock); + mutex_lock(&mapping->i_mmap_mutex); vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { if (vma->vm_flags & VM_SHARED) { unsigned long address = vma_address(page, vma); @@ -771,7 +925,7 @@ static int page_mkclean_file(struct address_space *mapping, struct page *page) ret += page_mkclean_one(page, vma, address); } } - spin_unlock(&mapping->i_mmap_lock); + mutex_unlock(&mapping->i_mmap_mutex); return ret; } @@ -785,10 +939,8 @@ int page_mkclean(struct page *page) struct address_space *mapping = page_mapping(page); if (mapping) { ret = page_mkclean_file(mapping, page); - if (page_test_dirty(page)) { - page_clear_dirty(page, 1); + if (page_test_and_clear_dirty(page_to_pfn(page), 1)) ret = 1; - } } } @@ -914,7 +1066,7 @@ void do_page_add_anon_rmap(struct page *page, return; VM_BUG_ON(!PageLocked(page)); - VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); + /* address might be in next vma when migration races vma_adjust */ if (first) __page_set_anon_rmap(page, vma, address, exclusive); else @@ -981,10 +1133,9 @@ void page_remove_rmap(struct page *page) * not if it's in swapcache - there might be another pte slot * containing the swap entry, but page not yet written to swap. */ - if ((!PageAnon(page) || PageSwapCache(page)) && page_test_dirty(page)) { - page_clear_dirty(page, 1); + if ((!PageAnon(page) || PageSwapCache(page)) && + page_test_and_clear_dirty(page_to_pfn(page), 1)) set_page_dirty(page); - } /* * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED * and not charged by memcg for now. @@ -1122,7 +1273,7 @@ out_mlock: /* * We need mmap_sem locking, Otherwise VM_LOCKED check makes * unstable result and race. Plus, We can't wait here because - * we now hold anon_vma->lock or mapping->i_mmap_lock. + * we now hold anon_vma->mutex or mapping->i_mmap_mutex. * if trylock failed, the page remain in evictable lru and later * vmscan could retry to move the page to unevictable lru if the * page is actually mlocked. @@ -1348,7 +1499,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) unsigned long max_nl_size = 0; unsigned int mapcount; - spin_lock(&mapping->i_mmap_lock); + mutex_lock(&mapping->i_mmap_mutex); vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { unsigned long address = vma_address(page, vma); if (address == -EFAULT) @@ -1394,7 +1545,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) mapcount = page_mapcount(page); if (!mapcount) goto out; - cond_resched_lock(&mapping->i_mmap_lock); + cond_resched(); max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK; if (max_nl_cursor == 0) @@ -1416,7 +1567,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) } vma->vm_private_data = (void *) max_nl_cursor; } - cond_resched_lock(&mapping->i_mmap_lock); + cond_resched(); max_nl_cursor += CLUSTER_SIZE; } while (max_nl_cursor <= max_nl_size); @@ -1428,7 +1579,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) vma->vm_private_data = NULL; out: - spin_unlock(&mapping->i_mmap_lock); + mutex_unlock(&mapping->i_mmap_mutex); return ret; } @@ -1547,7 +1698,7 @@ static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *, if (!mapping) return ret; - spin_lock(&mapping->i_mmap_lock); + mutex_lock(&mapping->i_mmap_mutex); vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { unsigned long address = vma_address(page, vma); if (address == -EFAULT) @@ -1561,7 +1712,7 @@ static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *, * never contain migration ptes. Decide what to do about this * limitation to linear when we need rmap_walk() on nonlinear. */ - spin_unlock(&mapping->i_mmap_lock); + mutex_unlock(&mapping->i_mmap_mutex); return ret; } @@ -1610,7 +1761,7 @@ void hugepage_add_anon_rmap(struct page *page, BUG_ON(!PageLocked(page)); BUG_ON(!anon_vma); - BUG_ON(address < vma->vm_start || address >= vma->vm_end); + /* address might be in next vma when migration races vma_adjust */ first = atomic_inc_and_test(&page->_mapcount); if (first) __hugepage_set_anon_rmap(page, vma, address, 0); |