From 6328650bb4d854a7dc1498d1c0048b838b0d340c Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 3 Aug 2011 16:21:18 -0700 Subject: radix_tree: exceptional entries and indices A patchset to extend tmpfs to MAX_LFS_FILESIZE by abandoning its peculiar swap vector, instead keeping a file's swap entries in the same radix tree as its struct page pointers: thus saving memory, and simplifying its code and locking. This patch: The radix_tree is used by several subsystems for different purposes. A major use is to store the struct page pointers of a file's pagecache for memory management. But what if mm wanted to store something other than page pointers there too? The low bit of a radix_tree entry is already used to denote an indirect pointer, for internal use, and the unlikely radix_tree_deref_retry() case. Define the next bit as denoting an exceptional entry, and supply inline functions radix_tree_exception() to return non-0 in either unlikely case, and radix_tree_exceptional_entry() to return non-0 in the second case. If a subsystem already uses radix_tree with that bit set, no problem: it does not affect internal workings at all, but is defined for the convenience of those storing well-aligned pointers in the radix_tree. The radix_tree_gang_lookups have an implicit assumption that the caller can deduce the offset of each entry returned e.g. by the page->index of a struct page. But that may not be feasible for some kinds of item to be stored there. radix_tree_gang_lookup_slot() allow for an optional indices argument, output array in which to return those offsets. The same could be added to other radix_tree_gang_lookups, but for now keep it to the only one for which we need it. Signed-off-by: Hugh Dickins Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/radix-tree.c | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) (limited to 'lib/radix-tree.c') diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 7ea2e033d71..348eaefbed7 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -823,8 +823,8 @@ unsigned long radix_tree_prev_hole(struct radix_tree_root *root, EXPORT_SYMBOL(radix_tree_prev_hole); static unsigned int -__lookup(struct radix_tree_node *slot, void ***results, unsigned long index, - unsigned int max_items, unsigned long *next_index) +__lookup(struct radix_tree_node *slot, void ***results, unsigned long *indices, + unsigned long index, unsigned int max_items, unsigned long *next_index) { unsigned int nr_found = 0; unsigned int shift, height; @@ -857,12 +857,16 @@ __lookup(struct radix_tree_node *slot, void ***results, unsigned long index, /* Bottom level: grab some items */ for (i = index & RADIX_TREE_MAP_MASK; i < RADIX_TREE_MAP_SIZE; i++) { - index++; if (slot->slots[i]) { - results[nr_found++] = &(slot->slots[i]); - if (nr_found == max_items) + results[nr_found] = &(slot->slots[i]); + if (indices) + indices[nr_found] = index; + if (++nr_found == max_items) { + index++; goto out; + } } + index++; } out: *next_index = index; @@ -918,8 +922,8 @@ radix_tree_gang_lookup(struct radix_tree_root *root, void **results, if (cur_index > max_index) break; - slots_found = __lookup(node, (void ***)results + ret, cur_index, - max_items - ret, &next_index); + slots_found = __lookup(node, (void ***)results + ret, NULL, + cur_index, max_items - ret, &next_index); nr_found = 0; for (i = 0; i < slots_found; i++) { struct radix_tree_node *slot; @@ -944,6 +948,7 @@ EXPORT_SYMBOL(radix_tree_gang_lookup); * radix_tree_gang_lookup_slot - perform multiple slot lookup on radix tree * @root: radix tree root * @results: where the results of the lookup are placed + * @indices: where their indices should be placed (but usually NULL) * @first_index: start the lookup from this key * @max_items: place up to this many items at *results * @@ -958,7 +963,8 @@ EXPORT_SYMBOL(radix_tree_gang_lookup); * protection, radix_tree_deref_slot may fail requiring a retry. */ unsigned int -radix_tree_gang_lookup_slot(struct radix_tree_root *root, void ***results, +radix_tree_gang_lookup_slot(struct radix_tree_root *root, + void ***results, unsigned long *indices, unsigned long first_index, unsigned int max_items) { unsigned long max_index; @@ -974,6 +980,8 @@ radix_tree_gang_lookup_slot(struct radix_tree_root *root, void ***results, if (first_index > 0) return 0; results[0] = (void **)&root->rnode; + if (indices) + indices[0] = 0; return 1; } node = indirect_to_ptr(node); @@ -987,8 +995,9 @@ radix_tree_gang_lookup_slot(struct radix_tree_root *root, void ***results, if (cur_index > max_index) break; - slots_found = __lookup(node, results + ret, cur_index, - max_items - ret, &next_index); + slots_found = __lookup(node, results + ret, + indices ? indices + ret : NULL, + cur_index, max_items - ret, &next_index); ret += slots_found; if (next_index == 0) break; -- cgit v1.2.3-70-g09d2 From e504f3fdd63d486d45b18009e5a65f2e329acb0a Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 3 Aug 2011 16:21:27 -0700 Subject: tmpfs radix_tree: locate_item to speed up swapoff We have already acknowledged that swapoff of a tmpfs file is slower than it was before conversion to the generic radix_tree: a little slower there will be acceptable, if the hotter paths are faster. But it was a shock to find swapoff of a 500MB file 20 times slower on my laptop, taking 10 minutes; and at that rate it significantly slows down my testing. Now, most of that turned out to be overhead from PROVE_LOCKING and PROVE_RCU: without those it was only 4 times slower than before; and more realistic tests on other machines don't fare as badly. I've tried a number of things to improve it, including tagging the swap entries, then doing lookup by tag: I'd expected that to halve the time, but in practice it's erratic, and often counter-productive. The only change I've so far found to make a consistent improvement, is to short-circuit the way we go back and forth, gang lookup packing entries into the array supplied, then shmem scanning that array for the target entry. Scanning in place doubles the speed, so it's now only twice as slow as before (or three times slower when the PROVEs are on). So, add radix_tree_locate_item() as an expedient, once-off, single-caller hack to do the lookup directly in place. #ifdef it on CONFIG_SHMEM and CONFIG_SWAP, as much to document its limited applicability as save space in other configurations. And, sadly, #include sched.h for cond_resched(). Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 1 + lib/radix-tree.c | 92 ++++++++++++++++++++++++++++++++++++++++++++++ mm/shmem.c | 38 +------------------ 3 files changed, 94 insertions(+), 37 deletions(-) (limited to 'lib/radix-tree.c') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index b7edf825145..9d4539c52e5 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -252,6 +252,7 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root, unsigned long nr_to_tag, unsigned int fromtag, unsigned int totag); int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag); +unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item); static inline void radix_tree_preload_end(void) { diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 348eaefbed7..a2f9da59c19 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -1203,6 +1203,98 @@ radix_tree_gang_lookup_tag_slot(struct radix_tree_root *root, void ***results, } EXPORT_SYMBOL(radix_tree_gang_lookup_tag_slot); +#if defined(CONFIG_SHMEM) && defined(CONFIG_SWAP) +#include /* for cond_resched() */ + +/* + * This linear search is at present only useful to shmem_unuse_inode(). + */ +static unsigned long __locate(struct radix_tree_node *slot, void *item, + unsigned long index, unsigned long *found_index) +{ + unsigned int shift, height; + unsigned long i; + + height = slot->height; + shift = (height-1) * RADIX_TREE_MAP_SHIFT; + + for ( ; height > 1; height--) { + i = (index >> shift) & RADIX_TREE_MAP_MASK; + for (;;) { + if (slot->slots[i] != NULL) + break; + index &= ~((1UL << shift) - 1); + index += 1UL << shift; + if (index == 0) + goto out; /* 32-bit wraparound */ + i++; + if (i == RADIX_TREE_MAP_SIZE) + goto out; + } + + shift -= RADIX_TREE_MAP_SHIFT; + slot = rcu_dereference_raw(slot->slots[i]); + if (slot == NULL) + goto out; + } + + /* Bottom level: check items */ + for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) { + if (slot->slots[i] == item) { + *found_index = index + i; + index = 0; + goto out; + } + } + index += RADIX_TREE_MAP_SIZE; +out: + return index; +} + +/** + * radix_tree_locate_item - search through radix tree for item + * @root: radix tree root + * @item: item to be found + * + * Returns index where item was found, or -1 if not found. + * Caller must hold no lock (since this time-consuming function needs + * to be preemptible), and must check afterwards if item is still there. + */ +unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item) +{ + struct radix_tree_node *node; + unsigned long max_index; + unsigned long cur_index = 0; + unsigned long found_index = -1; + + do { + rcu_read_lock(); + node = rcu_dereference_raw(root->rnode); + if (!radix_tree_is_indirect_ptr(node)) { + rcu_read_unlock(); + if (node == item) + found_index = 0; + break; + } + + node = indirect_to_ptr(node); + max_index = radix_tree_maxindex(node->height); + if (cur_index > max_index) + break; + + cur_index = __locate(node, item, cur_index, &found_index); + rcu_read_unlock(); + cond_resched(); + } while (cur_index != 0 && cur_index <= max_index); + + return found_index; +} +#else +unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item) +{ + return -1; +} +#endif /* CONFIG_SHMEM && CONFIG_SWAP */ /** * radix_tree_shrink - shrink height of a radix tree to minimal diff --git a/mm/shmem.c b/mm/shmem.c index 3a5be0feb6a..1c702f6f124 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -356,42 +356,6 @@ export: return ret; } -/* - * Lockless lookup of swap entry in radix tree, avoiding refcount on pages. - */ -static pgoff_t shmem_find_swap(struct address_space *mapping, void *radswap) -{ - void **slots[PAGEVEC_SIZE]; - pgoff_t indices[PAGEVEC_SIZE]; - unsigned int nr_found; - -restart: - nr_found = 1; - indices[0] = -1; - while (nr_found) { - pgoff_t index = indices[nr_found - 1] + 1; - unsigned int i; - - rcu_read_lock(); - nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, - slots, indices, index, PAGEVEC_SIZE); - for (i = 0; i < nr_found; i++) { - void *item = radix_tree_deref_slot(slots[i]); - if (radix_tree_deref_retry(item)) { - rcu_read_unlock(); - goto restart; - } - if (item == radswap) { - rcu_read_unlock(); - return indices[i]; - } - } - rcu_read_unlock(); - cond_resched(); - } - return -1; -} - /* * Remove swap entry from radix tree, free the swap and its page cache. */ @@ -612,7 +576,7 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, int error; radswap = swp_to_radix_entry(swap); - index = shmem_find_swap(mapping, radswap); + index = radix_tree_locate_item(&mapping->page_tree, radswap); if (index == -1) return 0; -- cgit v1.2.3-70-g09d2