From 6328650bb4d854a7dc1498d1c0048b838b0d340c Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Wed, 3 Aug 2011 16:21:18 -0700
Subject: radix_tree: exceptional entries and indices

A patchset to extend tmpfs to MAX_LFS_FILESIZE by abandoning its
peculiar swap vector, instead keeping a file's swap entries in the
same radix tree as its struct page pointers: thus saving memory, and
simplifying its code and locking.

This patch:

The radix_tree is used by several subsystems for different purposes.
A major use is to store the struct page pointers of a file's pagecache
for memory management.  But what if mm wanted to store something other
than page pointers there too?

The low bit of a radix_tree entry is already used to denote an
indirect pointer, for internal use, and the unlikely
radix_tree_deref_retry() case.  Define the next bit as denoting an
exceptional entry, and supply inline functions radix_tree_exception()
to return non-0 in either unlikely case, and
radix_tree_exceptional_entry() to return non-0 in the second case.

If a subsystem already uses radix_tree with that bit set, no problem:
it does not affect internal workings at all, but is defined for the
convenience of those storing well-aligned pointers in the radix_tree.

The radix_tree_gang_lookups have an implicit assumption that the
caller can deduce the offset of each entry returned e.g. by the
page->index of a struct page.  But that may not be feasible for some
kinds of item to be stored there.

radix_tree_gang_lookup_slot() now allows for an optional indices
argument, an output array in which to return those offsets.  The same
could be added to other radix_tree_gang_lookups, but for now keep it
to the only one for which we need it.

Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/filemap.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'mm/filemap.c')

diff --git a/mm/filemap.c b/mm/filemap.c
index 867d40222ec..b83aebfd0a0 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -840,7 +840,7 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
 	rcu_read_lock();
 restart:
 	nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
-				(void ***)pages, start, nr_pages);
+				(void ***)pages, NULL, start, nr_pages);
 	ret = 0;
 	for (i = 0; i < nr_found; i++) {
 		struct page *page;
@@ -903,7 +903,7 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
 	rcu_read_lock();
 restart:
 	nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
-				(void ***)pages, index, nr_pages);
+				(void ***)pages, NULL, index, nr_pages);
 	ret = 0;
 	for (i = 0; i < nr_found; i++) {
 		struct page *page;
--
cgit v1.2.3-70-g09d2

From a2c16d6cb0e478812829ca84aeabd02e36af35eb Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Wed, 3 Aug 2011 16:21:19 -0700
Subject: mm: let swap use exceptional entries

If swap entries are to be stored along with struct page pointers in a
radix tree, they need to be distinguished as exceptional entries.

Most of the handling of swap entries in the radix tree will be
contained in shmem.c, but a few functions in filemap.c's common code
need to check for their appearance: find_get_page(), find_lock_page(),
find_get_pages() and find_get_pages_contig().

So as not to slow their fast paths, tuck those checks inside the
existing checks for unlikely radix_tree_deref_slot(); except for
find_lock_page(), where it is an added test.  And make it a BUG in
find_get_pages_tag(), which is not applied to tmpfs files.
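[Editor's aside: the radix-tree.h half of the first patch falls
outside this mm/filemap.c-limited view, so the new inlines themselves
are not shown above.  Going by the changelog's description - the low
bit already taken by indirect pointers, the next bit newly defined for
exceptional entries - they look roughly like the sketch below; treat
the exact definitions as a reconstruction, not a quote of the
committed header.]

#define RADIX_TREE_INDIRECT_PTR		1	/* low bit: internal node */
#define RADIX_TREE_EXCEPTIONAL_ENTRY	2	/* next bit: exceptional */
#define RADIX_TREE_EXCEPTIONAL_SHIFT	2	/* bits a stored value skips */

static inline int radix_tree_exception(void *arg)
{
	/* non-0 in either unlikely case: deref_retry or exceptional */
	return unlikely((unsigned long)arg &
		(RADIX_TREE_INDIRECT_PTR | RADIX_TREE_EXCEPTIONAL_ENTRY));
}

static inline int radix_tree_exceptional_entry(void *arg)
{
	/* non-0 only in the exceptional case */
	return (unsigned long)arg & RADIX_TREE_EXCEPTIONAL_ENTRY;
}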
A part of the reason for eliminating shmem_readpage() earlier was to
minimize the places where common code would need to allow for swap
entries.

The swp_entry_t known to swapfile.c must be massaged into a slightly
different form when stored in the radix tree, just as it gets massaged
into a pte_t when stored in page tables.

In an i386 kernel this limits its information (type and page offset)
to 30 bits: given 32 "types" of swapfile and 4kB pagesize, that's a
maximum swapfile size of 128GB.  That is less than the 512GB we
previously allowed with X86_PAE (where the swap entry can occupy the
entire upper 32 bits of a pte_t), but it is not a new limitation on
32-bit without PAE; and there's no new limitation on 64-bit (where
swap filesize is already limited to 16TB by a 32-bit page offset).
Thirty areas of 128GB is probably still enough swap for a 64GB 32-bit
machine.

Provide swp_to_radix_entry() and radix_to_swp_entry() conversions, and
enforce the filesize limit in read_swap_header(), just as for ptes.

Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/swapops.h | 23 +++++++++++++++++++++++
 mm/filemap.c            | 49 +++++++++++++++++++++++++++++++------------------
 mm/swapfile.c           | 20 ++++++++++++--------
 3 files changed, 66 insertions(+), 26 deletions(-)

(limited to 'mm/filemap.c')

diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index cd42e30b7c6..2189d3ffc85 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -1,3 +1,8 @@
+#ifndef _LINUX_SWAPOPS_H
+#define _LINUX_SWAPOPS_H
+
+#include <linux/radix-tree.h>
+
 /*
  * swapcache pages are stored in the swapper_space radix tree.  We want to
  * get good packing density in that tree, so the index should be dense in
@@ -76,6 +81,22 @@ static inline pte_t swp_entry_to_pte(swp_entry_t entry)
 	return __swp_entry_to_pte(arch_entry);
 }
 
+static inline swp_entry_t radix_to_swp_entry(void *arg)
+{
+	swp_entry_t entry;
+
+	entry.val = (unsigned long)arg >> RADIX_TREE_EXCEPTIONAL_SHIFT;
+	return entry;
+}
+
+static inline void *swp_to_radix_entry(swp_entry_t entry)
+{
+	unsigned long value;
+
+	value = entry.val << RADIX_TREE_EXCEPTIONAL_SHIFT;
+	return (void *)(value | RADIX_TREE_EXCEPTIONAL_ENTRY);
+}
+
 #ifdef CONFIG_MIGRATION
 static inline swp_entry_t make_migration_entry(struct page *page, int write)
 {
@@ -169,3 +190,5 @@ static inline int non_swap_entry(swp_entry_t entry)
 	return 0;
 }
 #endif
+
+#endif /* _LINUX_SWAPOPS_H */
diff --git a/mm/filemap.c b/mm/filemap.c
index b83aebfd0a0..76bfb6460f5 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -714,9 +714,12 @@ repeat:
 	page = radix_tree_deref_slot(pagep);
 	if (unlikely(!page))
 		goto out;
-	if (radix_tree_deref_retry(page))
+	if (radix_tree_exception(page)) {
+		if (radix_tree_exceptional_entry(page))
+			goto out;
+		/* radix_tree_deref_retry(page) */
 		goto repeat;
-
+	}
 	if (!page_cache_get_speculative(page))
 		goto repeat;
 
@@ -753,7 +756,7 @@ struct page *find_lock_page(struct address_space *mapping, pgoff_t offset)
 
 repeat:
 	page = find_get_page(mapping, offset);
-	if (page) {
+	if (page && !radix_tree_exception(page)) {
 		lock_page(page);
 		/* Has the page been truncated? */
 		if (unlikely(page->mapping != mapping)) {
@@ -849,11 +852,14 @@ repeat:
 		if (unlikely(!page))
 			continue;
 
-		/*
-		 * This can only trigger when the entry at index 0 moves out
-		 * of or back to the root: none yet gotten, safe to restart.
-		 */
-		if (radix_tree_deref_retry(page)) {
+		if (radix_tree_exception(page)) {
+			if (radix_tree_exceptional_entry(page))
+				continue;
+			/*
+			 * radix_tree_deref_retry(page):
+			 * can only trigger when entry at index 0 moves out of
+			 * or back to root: none yet gotten, safe to restart.
+			 */
 			WARN_ON(start | i);
 			goto restart;
 		}
@@ -912,12 +918,16 @@ repeat:
 		if (unlikely(!page))
 			continue;
 
-		/*
-		 * This can only trigger when the entry at index 0 moves out
-		 * of or back to the root: none yet gotten, safe to restart.
-		 */
-		if (radix_tree_deref_retry(page))
+		if (radix_tree_exception(page)) {
+			if (radix_tree_exceptional_entry(page))
+				break;
+			/*
+			 * radix_tree_deref_retry(page):
+			 * can only trigger when entry at index 0 moves out of
+			 * or back to root: none yet gotten, safe to restart.
+			 */
 			goto restart;
+		}
 
 		if (!page_cache_get_speculative(page))
 			goto repeat;
@@ -977,12 +987,15 @@ repeat:
 		if (unlikely(!page))
 			continue;
 
-		/*
-		 * This can only trigger when the entry at index 0 moves out
-		 * of or back to the root: none yet gotten, safe to restart.
-		 */
-		if (radix_tree_deref_retry(page))
+		if (radix_tree_exception(page)) {
+			BUG_ON(radix_tree_exceptional_entry(page));
+			/*
+			 * radix_tree_deref_retry(page):
+			 * can only trigger when entry at index 0 moves out of
+			 * or back to root: none yet gotten, safe to restart.
+			 */
 			goto restart;
+		}
 
 		if (!page_cache_get_speculative(page))
 			goto repeat;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 1b8c3390724..17bc224bce6 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1924,20 +1924,24 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
 
 	/*
 	 * Find out how many pages are allowed for a single swap
-	 * device. There are two limiting factors: 1) the number of
-	 * bits for the swap offset in the swp_entry_t type and
-	 * 2) the number of bits in the a swap pte as defined by
-	 * the different architectures. In order to find the
-	 * largest possible bit mask a swap entry with swap type 0
+	 * device. There are three limiting factors: 1) the number
+	 * of bits for the swap offset in the swp_entry_t type, and
+	 * 2) the number of bits in the swap pte as defined by
+	 * the different architectures, and 3) the number of free bits
+	 * in an exceptional radix_tree entry. In order to find the
+	 * largest possible bit mask, a swap entry with swap type 0
 	 * and swap offset ~0UL is created, encoded to a swap pte,
-	 * decoded to a swp_entry_t again and finally the swap
+	 * decoded to a swp_entry_t again, and finally the swap
 	 * offset is extracted. This will mask all the bits from
 	 * the initial ~0UL mask that can't be encoded in either
 	 * the swp_entry_t or the architecture definition of a
-	 * swap pte.
+	 * swap pte. Then the same is done for a radix_tree entry.
 	 */
 	maxpages = swp_offset(pte_to_swp_entry(
-			swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
+			swp_entry_to_pte(swp_entry(0, ~0UL))));
+	maxpages = swp_offset(radix_to_swp_entry(
+			swp_to_radix_entry(swp_entry(0, maxpages)))) + 1;
+
 	if (maxpages > swap_header->info.last_page) {
 		maxpages = swap_header->info.last_page + 1;
 		/* p->max is an unsigned int: don't overflow it */
--
cgit v1.2.3-70-g09d2

From 31475dd611209413bace21651a400afb91d0bd9d Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Wed, 3 Aug 2011 16:21:27 -0700
Subject: mm: a few small updates for radix-swap

Remove PageSwapBacked (!page_is_file_cache) cases from
add_to_page_cache_locked() and add_to_page_cache_lru(): those pages
now go through shmem_add_to_page_cache().
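[Editor's aside: to make the previous patch's size arithmetic
concrete, here is a small stand-alone demo - hypothetical user-space
code, not part of the series.  Shifting the swp_entry_t value up by
RADIX_TREE_EXCEPTIONAL_SHIFT leaves 30 bits of a 32-bit radix_tree
entry; 5 bits of swap type (the 32 "types") leave 25 bits of page
offset, and 2^25 pages of 4kB is the 128GB quoted above.]

#include <stdio.h>

int main(void)
{
	unsigned long long page_size = 4096;	/* 4kB pages */
	unsigned entry_bits = 32 - 2;	/* 32-bit entry minus exceptional shift */
	unsigned type_bits = 5;		/* 32 "types" of swapfile */
	unsigned offset_bits = entry_bits - type_bits;	/* 25 */

	printf("max swapfile size: %lluGB\n",
		((1ULL << offset_bits) * page_size) >> 30);	/* prints 128 */
	return 0;
}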
Remove a comment on maximum tmpfs size from fsstack_copy_inode_size(),
and add a comment on swap entries to invalidate_mapping_pages().

And mincore_page() uses find_get_page() on what might be a shmem or
tmpfs file: allow for a radix_tree_exceptional_entry(), and proceed to
find_get_page() on swapper_space if so (oh, swapper_space needs
#ifdef).

Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/stack.c    |  5 +----
 mm/filemap.c  | 21 +++------------------
 mm/mincore.c  | 10 ++++++----
 mm/truncate.c |  8 ++++++++
 4 files changed, 18 insertions(+), 26 deletions(-)

(limited to 'mm/filemap.c')

diff --git a/fs/stack.c b/fs/stack.c
index 4a6f7f44065..b4f2ab48a61 100644
--- a/fs/stack.c
+++ b/fs/stack.c
@@ -29,10 +29,7 @@ void fsstack_copy_inode_size(struct inode *dst, struct inode *src)
 	 *
 	 * We don't actually know what locking is used at the lower level;
 	 * but if it's a filesystem that supports quotas, it will be using
-	 * i_lock as in inode_add_bytes(). tmpfs uses other locking, and
-	 * its 32-bit is (just) able to exceed 2TB i_size with the aid of
-	 * holes; but its i_blocks cannot carry into the upper long without
-	 * almost 2TB swap - let's ignore that case.
+	 * i_lock as in inode_add_bytes().
 	 */
 	if (sizeof(i_blocks) > sizeof(long))
 		spin_lock(&src->i_lock);
diff --git a/mm/filemap.c b/mm/filemap.c
index 76bfb6460f5..96778faf82d 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -33,7 +33,6 @@
 #include <linux/cpuset.h>
 #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
 #include <linux/memcontrol.h>
-#include <linux/mm_inline.h> /* for page_is_file_cache() */
 #include <linux/cleancache.h>
 #include "internal.h"
 
@@ -462,6 +461,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
 	int error;
 
 	VM_BUG_ON(!PageLocked(page));
+	VM_BUG_ON(PageSwapBacked(page));
 
 	error = mem_cgroup_cache_charge(page, current->mm,
 					gfp_mask & GFP_RECLAIM_MASK);
@@ -479,8 +479,6 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
 	if (likely(!error)) {
 		mapping->nrpages++;
 		__inc_zone_page_state(page, NR_FILE_PAGES);
-		if (PageSwapBacked(page))
-			__inc_zone_page_state(page, NR_SHMEM);
 		spin_unlock_irq(&mapping->tree_lock);
 	} else {
 		page->mapping = NULL;
@@ -502,22 +500,9 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
 {
 	int ret;
 
-	/*
-	 * Splice_read and readahead add shmem/tmpfs pages into the page cache
-	 * before shmem_readpage has a chance to mark them as SwapBacked: they
-	 * need to go on the anon lru below, and mem_cgroup_cache_charge
-	 * (called in add_to_page_cache) needs to know where they're going too.
-	 */
-	if (mapping_cap_swap_backed(mapping))
-		SetPageSwapBacked(page);
-
 	ret = add_to_page_cache(page, mapping, offset, gfp_mask);
-	if (ret == 0) {
-		if (page_is_file_cache(page))
-			lru_cache_add_file(page);
-		else
-			lru_cache_add_anon(page);
-	}
+	if (ret == 0)
+		lru_cache_add_file(page);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(add_to_page_cache_lru);
diff --git a/mm/mincore.c b/mm/mincore.c
index a4e6b9d75c7..733f1829b0d 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -69,12 +69,14 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
 	 * file will not get a swp_entry_t in its pte, but rather it is like
 	 * any other file mapping (ie. marked !present and faulted in with
 	 * tmpfs's .fault). So swapped out tmpfs mappings are tested here.
-	 *
-	 * However when tmpfs moves the page from pagecache and into swapcache,
-	 * it is still in core, but the find_get_page below won't find it.
-	 * No big deal, but make a note of it.
 	 */
 	page = find_get_page(mapping, pgoff);
+#ifdef CONFIG_SWAP
+	if (radix_tree_exceptional_entry(page)) {
+		swp_entry_t swap = radix_to_swp_entry(page);
+		page = find_get_page(&swapper_space, swap.val);
+	}
+#endif
 	if (page) {
 		present = PageUptodate(page);
 		page_cache_release(page);
diff --git a/mm/truncate.c b/mm/truncate.c
index 232eb2736a7..b40ac6d4e86 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -336,6 +336,14 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
 	unsigned long count = 0;
 	int i;
 
+	/*
+	 * Note: this function may get called on a shmem/tmpfs mapping:
+	 * pagevec_lookup() might then return 0 prematurely (because it
+	 * got a gangful of swap entries); but it's hardly worth worrying
+	 * about - it can rarely have anything to free from such a mapping
+	 * (most pages are dirty), and already skips over any difficulties.
+	 */
+
 	pagevec_init(&pvec, 0);
 	while (index <= end && pagevec_lookup(&pvec, mapping, index,
 			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
--
cgit v1.2.3-70-g09d2

From 8079b1c859c44f27d63da4951f5038a16589a563 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Wed, 3 Aug 2011 16:21:28 -0700
Subject: mm: clarify the radix_tree exceptional cases

Make the radix_tree exceptional cases, mostly in filemap.c, clearer.

It's hard to devise a suitable snappy name that illuminates the use
by shmem/tmpfs for swap, while keeping filemap/pagecache/radix_tree
generality.  And akpm points out that /* radix_tree_deref_retry(page) */
comments look like calls that have been commented out for some unknown
reason.

Skirt the naming difficulty by rearranging these blocks to handle the
transient radix_tree_deref_retry(page) case first; then just explain
the remaining shmem/tmpfs swap case in a comment.

Signed-off-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/filemap.c | 66 ++++++++++++++++++++++++++++++++++++++++--------------------
 mm/mincore.c |  1 +
 mm/shmem.c   | 12 +++++++----
 3 files changed, 53 insertions(+), 26 deletions(-)

(limited to 'mm/filemap.c')

diff --git a/mm/filemap.c b/mm/filemap.c
index 96778faf82d..645a080ba4d 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -700,10 +700,14 @@ repeat:
 	if (unlikely(!page))
 		goto out;
 	if (radix_tree_exception(page)) {
-		if (radix_tree_exceptional_entry(page))
-			goto out;
-		/* radix_tree_deref_retry(page) */
-		goto repeat;
+		if (radix_tree_deref_retry(page))
+			goto repeat;
+		/*
+		 * Otherwise, shmem/tmpfs must be storing a swap entry
+		 * here as an exceptional entry: so return it without
+		 * attempting to raise page count.
+		 */
+		goto out;
 	}
 	if (!page_cache_get_speculative(page))
 		goto repeat;
@@ -838,15 +842,21 @@ repeat:
 		if (unlikely(!page))
 			continue;
 
 		if (radix_tree_exception(page)) {
-			if (radix_tree_exceptional_entry(page))
-				continue;
+			if (radix_tree_deref_retry(page)) {
+				/*
+				 * Transient condition which can only trigger
+				 * when entry at index 0 moves out of or back
+				 * to root: none yet gotten, safe to restart.
+				 */
+				WARN_ON(start | i);
+				goto restart;
+			}
 			/*
-			 * radix_tree_deref_retry(page):
-			 * can only trigger when entry at index 0 moves out of
-			 * or back to root: none yet gotten, safe to restart.
+			 * Otherwise, shmem/tmpfs must be storing a swap entry
+			 * here as an exceptional entry: so skip over it -
+			 * we only reach this from invalidate_mapping_pages().
 			 */
-			WARN_ON(start | i);
-			goto restart;
+			continue;
 		}
 
 		if (!page_cache_get_speculative(page))
@@ -904,14 +914,20 @@ repeat:
 		if (unlikely(!page))
 			continue;
 
 		if (radix_tree_exception(page)) {
-			if (radix_tree_exceptional_entry(page))
-				break;
+			if (radix_tree_deref_retry(page)) {
+				/*
+				 * Transient condition which can only trigger
+				 * when entry at index 0 moves out of or back
+				 * to root: none yet gotten, safe to restart.
+				 */
+				goto restart;
+			}
 			/*
-			 * radix_tree_deref_retry(page):
-			 * can only trigger when entry at index 0 moves out of
-			 * or back to root: none yet gotten, safe to restart.
+			 * Otherwise, shmem/tmpfs must be storing a swap entry
+			 * here as an exceptional entry: so stop looking for
+			 * contiguous pages.
 			 */
-			goto restart;
+			break;
 		}
 
 		if (!page_cache_get_speculative(page))
@@ -973,13 +989,19 @@ repeat:
 		if (unlikely(!page))
 			continue;
 
 		if (radix_tree_exception(page)) {
-			BUG_ON(radix_tree_exceptional_entry(page));
+			if (radix_tree_deref_retry(page)) {
+				/*
+				 * Transient condition which can only trigger
+				 * when entry at index 0 moves out of or back
+				 * to root: none yet gotten, safe to restart.
+				 */
+				goto restart;
+			}
 			/*
-			 * radix_tree_deref_retry(page):
-			 * can only trigger when entry at index 0 moves out of
-			 * or back to root: none yet gotten, safe to restart.
+			 * This function is never used on a shmem/tmpfs
+			 * mapping, so a swap entry won't be found here.
 			 */
-			goto restart;
+			BUG();
 		}
 
 		if (!page_cache_get_speculative(page))
diff --git a/mm/mincore.c b/mm/mincore.c
index 733f1829b0d..636a86876ff 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -72,6 +72,7 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
 	 */
 	page = find_get_page(mapping, pgoff);
 #ifdef CONFIG_SWAP
+	/* shmem/tmpfs may return swap: account for swapcache page too. */
 	if (radix_tree_exceptional_entry(page)) {
 		swp_entry_t swap = radix_to_swp_entry(page);
 		page = find_get_page(&swapper_space, swap.val);
diff --git a/mm/shmem.c b/mm/shmem.c
index 1c702f6f124..32f6763f16f 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -332,10 +332,14 @@ repeat:
 		if (unlikely(!page))
 			continue;
 		if (radix_tree_exception(page)) {
-			if (radix_tree_exceptional_entry(page))
-				goto export;
-			/* radix_tree_deref_retry(page) */
-			goto restart;
+			if (radix_tree_deref_retry(page))
+				goto restart;
+			/*
+			 * Otherwise, we must be storing a swap entry
+			 * here as an exceptional entry: so return it
+			 * without attempting to raise page count.
+			 */
+			goto export;
 		}
 		if (!page_cache_get_speculative(page))
 			goto repeat;
--
cgit v1.2.3-70-g09d2
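[Editor's aside: after the whole series, every radix_tree lookup in
mm/filemap.c follows a single pattern.  The sketch below pulls that
pattern out of the diffs above into one hypothetical helper - the
wrapper function is illustrative, not from the series, and the final
slot recheck comes from the surrounding filemap.c code rather than
from these hunks.]

/* Hypothetical helper summarizing the lookup pattern shown above;
 * caller holds rcu_read_lock(). */
static struct page *deref_pagecache_slot(void **slot)
{
	struct page *page;
repeat:
	page = radix_tree_deref_slot(slot);
	if (unlikely(!page))
		return NULL;			/* empty slot */
	if (radix_tree_exception(page)) {
		if (radix_tree_deref_retry(page))
			goto repeat;		/* transient: slot moved, retry */
		return page;			/* shmem/tmpfs swap entry */
	}
	if (!page_cache_get_speculative(page))
		goto repeat;			/* page was being freed: retry */
	/* the page may have moved after the reference was taken: recheck */
	if (unlikely(page != *slot)) {
		page_cache_release(page);
		goto repeat;
	}
	return page;
}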