From 643b52b9c0b4e959436b4b551ebf4060d06d5ae8 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Thu, 12 Jun 2008 15:21:52 -0700 Subject: radix-tree: fix small lockless radix-tree bug We shrink a radix tree when its root node has only one child, in the left most slot. The child becomes the new root node. To perform this operation in a manner compatible with concurrent lockless lookups, we atomically switch the root pointer from the parent to its child. However a concurrent lockless lookup may now have loaded a pointer to the parent (and is presently deciding what to do next). For this reason, we also have to keep the parent node in a valid state after shrinking the tree, until the next RCU grace period -- otherwise this lookup with the parent pointer may not do the right thing. Notably, we need to keep the child in the left most slot there in case that is requested by the lookup. This is all pretty standard RCU stuff. It is worth repeating because in my eagerness to obey the radix tree node constructor scheme, I had broken it by zeroing the radix tree node before the grace period. What could happen is that a lookup can load the parent pointer, then decide it wants to follow the left most child slot, only to find the slot contained NULL due to the concurrent shrinker having zeroed the parent node before waiting for a grace period. The lookup would return a false negative as a result. Fix it by doing that clearing in the RCU callback. I would normally want to rip out the constructor entirely, but radix tree nodes are one of those places where they make sense (only few cachelines will be touched soon after allocation). This was never actually found in any lockless pagecache testing or by the test harness, but by seeing the odd problem with my scalable vmap rewrite. I have not tickled the test harness into reproducing it yet, but I'll keep working at it. Fortunately, it is not a problem anywhere lockless pagecache is used in mainline kernels (pagecache probe is not a guarantee, and brd does not have concurrent lookups and deletes). Signed-off-by: Nick Piggin Acked-by: Peter Zijlstra Cc: "Paul E. McKenney" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/radix-tree.c | 120 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 62 insertions(+), 58 deletions(-) (limited to 'lib/radix-tree.c') diff --git a/lib/radix-tree.c b/lib/radix-tree.c index bd521716ab1..169a2f8dabc 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -88,6 +88,57 @@ static inline gfp_t root_gfp_mask(struct radix_tree_root *root) return root->gfp_mask & __GFP_BITS_MASK; } +static inline void tag_set(struct radix_tree_node *node, unsigned int tag, + int offset) +{ + __set_bit(offset, node->tags[tag]); +} + +static inline void tag_clear(struct radix_tree_node *node, unsigned int tag, + int offset) +{ + __clear_bit(offset, node->tags[tag]); +} + +static inline int tag_get(struct radix_tree_node *node, unsigned int tag, + int offset) +{ + return test_bit(offset, node->tags[tag]); +} + +static inline void root_tag_set(struct radix_tree_root *root, unsigned int tag) +{ + root->gfp_mask |= (__force gfp_t)(1 << (tag + __GFP_BITS_SHIFT)); +} + +static inline void root_tag_clear(struct radix_tree_root *root, unsigned int tag) +{ + root->gfp_mask &= (__force gfp_t)~(1 << (tag + __GFP_BITS_SHIFT)); +} + +static inline void root_tag_clear_all(struct radix_tree_root *root) +{ + root->gfp_mask &= __GFP_BITS_MASK; +} + +static inline int root_tag_get(struct radix_tree_root *root, unsigned int tag) +{ + return (__force unsigned)root->gfp_mask & (1 << (tag + __GFP_BITS_SHIFT)); +} + +/* + * Returns 1 if any slot in the node has this tag set. + * Otherwise returns 0. + */ +static inline int any_tag_set(struct radix_tree_node *node, unsigned int tag) +{ + int idx; + for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) { + if (node->tags[tag][idx]) + return 1; + } + return 0; +} /* * This assumes that the caller has performed appropriate preallocation, and * that the caller has pinned this thread of control to the current CPU. @@ -124,6 +175,17 @@ static void radix_tree_node_rcu_free(struct rcu_head *head) { struct radix_tree_node *node = container_of(head, struct radix_tree_node, rcu_head); + + /* + * must only free zeroed nodes into the slab. radix_tree_shrink + * can leave us with a non-NULL entry in the first slot, so clear + * that here to make sure. + */ + tag_clear(node, 0, 0); + tag_clear(node, 1, 0); + node->slots[0] = NULL; + node->count = 0; + kmem_cache_free(radix_tree_node_cachep, node); } @@ -165,59 +227,6 @@ out: } EXPORT_SYMBOL(radix_tree_preload); -static inline void tag_set(struct radix_tree_node *node, unsigned int tag, - int offset) -{ - __set_bit(offset, node->tags[tag]); -} - -static inline void tag_clear(struct radix_tree_node *node, unsigned int tag, - int offset) -{ - __clear_bit(offset, node->tags[tag]); -} - -static inline int tag_get(struct radix_tree_node *node, unsigned int tag, - int offset) -{ - return test_bit(offset, node->tags[tag]); -} - -static inline void root_tag_set(struct radix_tree_root *root, unsigned int tag) -{ - root->gfp_mask |= (__force gfp_t)(1 << (tag + __GFP_BITS_SHIFT)); -} - - -static inline void root_tag_clear(struct radix_tree_root *root, unsigned int tag) -{ - root->gfp_mask &= (__force gfp_t)~(1 << (tag + __GFP_BITS_SHIFT)); -} - -static inline void root_tag_clear_all(struct radix_tree_root *root) -{ - root->gfp_mask &= __GFP_BITS_MASK; -} - -static inline int root_tag_get(struct radix_tree_root *root, unsigned int tag) -{ - return (__force unsigned)root->gfp_mask & (1 << (tag + __GFP_BITS_SHIFT)); -} - -/* - * Returns 1 if any slot in the node has this tag set. - * Otherwise returns 0. - */ -static inline int any_tag_set(struct radix_tree_node *node, unsigned int tag) -{ - int idx; - for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) { - if (node->tags[tag][idx]) - return 1; - } - return 0; -} - /* * Return the maximum key which can be store into a * radix tree with height HEIGHT. @@ -930,11 +939,6 @@ static inline void radix_tree_shrink(struct radix_tree_root *root) newptr = radix_tree_ptr_to_indirect(newptr); root->rnode = newptr; root->height--; - /* must only free zeroed nodes into the slab */ - tag_clear(to_free, 0, 0); - tag_clear(to_free, 1, 0); - to_free->slots[0] = NULL; - to_free->count = 0; radix_tree_node_free(to_free); } } -- cgit v1.2.3-70-g09d2 From cde53535991fbb5c34a1566f25955297c1487b8d Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 4 Jul 2008 09:59:22 -0700 Subject: Christoph has moved Remove all clameter@sgi.com addresses from the kernel tree since they will become invalid on June 27th. Change my maintainer email address for the slab allocators to cl@linux-foundation.org (which will be the new email address for the future). Signed-off-by: Christoph Lameter Signed-off-by: Christoph Lameter Cc: Pekka Enberg Cc: Stephen Rothwell Cc: Matt Mackall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/slabinfo.c | 4 ++-- Documentation/vm/slub.txt | 2 +- MAINTAINERS | 2 +- include/asm-generic/atomic.h | 2 +- include/linux/slab.h | 2 +- include/linux/slub_def.h | 2 +- kernel/workqueue.c | 2 +- lib/radix-tree.c | 2 +- mm/allocpercpu.c | 2 +- mm/migrate.c | 2 +- mm/slub.c | 2 +- mm/sparse-vmemmap.c | 2 +- 12 files changed, 13 insertions(+), 13 deletions(-) (limited to 'lib/radix-tree.c') diff --git a/Documentation/vm/slabinfo.c b/Documentation/vm/slabinfo.c index e4230ed16ee..df3227605d5 100644 --- a/Documentation/vm/slabinfo.c +++ b/Documentation/vm/slabinfo.c @@ -1,7 +1,7 @@ /* * Slabinfo: Tool to get reports about slabs * - * (C) 2007 sgi, Christoph Lameter + * (C) 2007 sgi, Christoph Lameter * * Compile by: * @@ -99,7 +99,7 @@ void fatal(const char *x, ...) void usage(void) { - printf("slabinfo 5/7/2007. (c) 2007 sgi. clameter@sgi.com\n\n" + printf("slabinfo 5/7/2007. (c) 2007 sgi.\n\n" "slabinfo [-ahnpvtsz] [-d debugopts] [slab-regexp]\n" "-a|--aliases Show aliases\n" "-A|--activity Most active slabs first\n" diff --git a/Documentation/vm/slub.txt b/Documentation/vm/slub.txt index 7c13f22a0c9..bb1f5c6e28b 100644 --- a/Documentation/vm/slub.txt +++ b/Documentation/vm/slub.txt @@ -266,4 +266,4 @@ of other objects. slub_debug=FZ,dentry -Christoph Lameter, , May 30, 2007 +Christoph Lameter, May 30, 2007 diff --git a/MAINTAINERS b/MAINTAINERS index 460e699fd28..13b7b19692e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3672,7 +3672,7 @@ S: Maintained SLAB ALLOCATOR P: Christoph Lameter -M: clameter@sgi.com +M: cl@linux-foundation.org P: Pekka Enberg M: penberg@cs.helsinki.fi P: Matt Mackall diff --git a/include/asm-generic/atomic.h b/include/asm-generic/atomic.h index 85fd0aa27a8..4ec0a296bde 100644 --- a/include/asm-generic/atomic.h +++ b/include/asm-generic/atomic.h @@ -2,7 +2,7 @@ #define _ASM_GENERIC_ATOMIC_H /* * Copyright (C) 2005 Silicon Graphics, Inc. - * Christoph Lameter + * Christoph Lameter * * Allows to provide arch independent atomic definitions without the need to * edit all arch specific atomic.h files. diff --git a/include/linux/slab.h b/include/linux/slab.h index c2ad3501659..9aa90a6f20e 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -1,7 +1,7 @@ /* * Written by Mark Hemment, 1996 (markhe@nextd.demon.co.uk). * - * (C) SGI 2006, Christoph Lameter + * (C) SGI 2006, Christoph Lameter * Cleaned up and restructured to ease the addition of alternative * implementations of SLAB allocators. */ diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index cef6f8fddd7..d117ea2825a 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -4,7 +4,7 @@ /* * SLUB : A Slab allocator without object queues. * - * (C) 2007 SGI, Christoph Lameter + * (C) 2007 SGI, Christoph Lameter */ #include #include diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 29fc39f1029..ce7799540c9 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -13,7 +13,7 @@ * Kai Petzke * Theodore Ts'o * - * Made to use alloc_percpu by Christoph Lameter . + * Made to use alloc_percpu by Christoph Lameter. */ #include diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 169a2f8dabc..56ec21a7f73 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -1,7 +1,7 @@ /* * Copyright (C) 2001 Momchil Velikov * Portions Copyright (C) 2001 Christoph Hellwig - * Copyright (C) 2005 SGI, Christoph Lameter + * Copyright (C) 2005 SGI, Christoph Lameter * Copyright (C) 2006 Nick Piggin * * This program is free software; you can redistribute it and/or diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c index f4026bae6ee..05f2b4009cc 100644 --- a/mm/allocpercpu.c +++ b/mm/allocpercpu.c @@ -1,7 +1,7 @@ /* * linux/mm/allocpercpu.c * - * Separated from slab.c August 11, 2006 Christoph Lameter + * Separated from slab.c August 11, 2006 Christoph Lameter */ #include #include diff --git a/mm/migrate.c b/mm/migrate.c index 112bcaeaa10..55bd355d170 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -9,7 +9,7 @@ * IWAMOTO Toshihiro * Hirokazu Takahashi * Dave Hansen - * Christoph Lameter + * Christoph Lameter */ #include diff --git a/mm/slub.c b/mm/slub.c index 2c9a62d1f42..1a427c0ae83 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5,7 +5,7 @@ * The allocator synchronizes using per slab locks and only * uses a centralized lock to manage a pool of partial slabs. * - * (C) 2007 SGI, Christoph Lameter + * (C) 2007 SGI, Christoph Lameter */ #include diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 99c4f36eb8a..a91b5f8fcaf 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -1,7 +1,7 @@ /* * Virtual Memory Map support * - * (C) 2007 sgi. Christoph Lameter . + * (C) 2007 sgi. Christoph Lameter. * * Virtual memory maps allow VM primitives pfn_to_page, page_to_pfn, * virt_to_page, page_address() to be implemented as a base offset -- cgit v1.2.3-70-g09d2