diff options
Diffstat (limited to 'lib')
-rw-r--r-- | lib/Kconfig | 21 | ||||
-rw-r--r-- | lib/Makefile | 9 | ||||
-rw-r--r-- | lib/assoc_array.c | 1746 | ||||
-rw-r--r-- | lib/kfifo.c | 4 | ||||
-rw-r--r-- | lib/llist.c | 22 | ||||
-rw-r--r-- | lib/lockref.c | 2 | ||||
-rw-r--r-- | lib/mpi/mpiutil.c | 3 | ||||
-rw-r--r-- | lib/percpu-rwsem.c | 165 | ||||
-rw-r--r-- | lib/percpu_counter.c | 15 | ||||
-rw-r--r-- | lib/percpu_ida.c | 94 | ||||
-rw-r--r-- | lib/random32.c | 12 | ||||
-rw-r--r-- | lib/rwsem-spinlock.c | 296 | ||||
-rw-r--r-- | lib/rwsem.c | 293 | ||||
-rw-r--r-- | lib/spinlock_debug.c | 302 | ||||
-rw-r--r-- | lib/swiotlb.c | 6 | ||||
-rw-r--r-- | lib/vsprintf.c | 20 |
16 files changed, 1897 insertions, 1113 deletions
diff --git a/lib/Kconfig b/lib/Kconfig index 75485e163ca..991c98bc4a3 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -51,13 +51,6 @@ config PERCPU_RWSEM config ARCH_USE_CMPXCHG_LOCKREF bool -config CMPXCHG_LOCKREF - def_bool y if ARCH_USE_CMPXCHG_LOCKREF - depends on SMP - depends on !GENERIC_LOCKBREAK - depends on !DEBUG_SPINLOCK - depends on !DEBUG_LOCK_ALLOC - config CRC_CCITT tristate "CRC-CCITT functions" help @@ -329,6 +322,20 @@ config TEXTSEARCH_FSM config BTREE boolean +config ASSOCIATIVE_ARRAY + bool + help + Generic associative array. Can be searched and iterated over whilst + it is being modified. It is also reasonably quick to search and + modify. The algorithms are non-recursive, and the trees are highly + capacious. + + See: + + Documentation/assoc_array.txt + + for more information. + config HAS_IOMEM boolean depends on !NO_IOMEM diff --git a/lib/Makefile b/lib/Makefile index bb016e116ba..a459c31e8c6 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -13,7 +13,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \ sha1.o md5.o irq_regs.o reciprocal_div.o argv_split.o \ proportions.o flex_proportions.o prio_heap.o ratelimit.o show_mem.o \ is_single_threaded.o plist.o decompress.o kobject_uevent.o \ - earlycpio.o percpu-refcount.o percpu_ida.o + earlycpio.o obj-$(CONFIG_ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS) += usercopy.o lib-$(CONFIG_MMU) += ioremap.o @@ -26,7 +26,7 @@ obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \ bust_spinlocks.o hexdump.o kasprintf.o bitmap.o scatterlist.o \ gcd.o lcm.o list_sort.o uuid.o flex_array.o iovec.o clz_ctz.o \ bsearch.o find_last_bit.o find_next_bit.o llist.o memweight.o kfifo.o \ - percpu_ida.o + percpu-refcount.o percpu_ida.o obj-y += string_helpers.o obj-$(CONFIG_TEST_STRING_HELPERS) += test-string_helpers.o obj-y += kstrtox.o @@ -42,15 +42,12 @@ obj-$(CONFIG_GENERIC_PCI_IOMAP) += pci_iomap.o obj-$(CONFIG_HAS_IOMEM) += iomap_copy.o devres.o obj-$(CONFIG_CHECK_SIGNATURE) += check_signature.o obj-$(CONFIG_DEBUG_LOCKING_API_SELFTESTS) += locking-selftest.o -obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o -lib-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o -lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o -lib-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o CFLAGS_hweight.o = $(subst $(quote),,$(CONFIG_ARCH_HWEIGHT_CFLAGS)) obj-$(CONFIG_GENERIC_HWEIGHT) += hweight.o obj-$(CONFIG_BTREE) += btree.o +obj-$(CONFIG_ASSOCIATIVE_ARRAY) += assoc_array.o obj-$(CONFIG_DEBUG_PREEMPT) += smp_processor_id.o obj-$(CONFIG_DEBUG_LIST) += list_debug.o obj-$(CONFIG_DEBUG_OBJECTS) += debugobjects.o diff --git a/lib/assoc_array.c b/lib/assoc_array.c new file mode 100644 index 00000000000..17edeaf1918 --- /dev/null +++ b/lib/assoc_array.c @@ -0,0 +1,1746 @@ +/* Generic associative array implementation. + * + * See Documentation/assoc_array.txt for information. + * + * Copyright (C) 2013 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ +//#define DEBUG +#include <linux/slab.h> +#include <linux/err.h> +#include <linux/assoc_array_priv.h> + +/* + * Iterate over an associative array. The caller must hold the RCU read lock + * or better. + */ +static int assoc_array_subtree_iterate(const struct assoc_array_ptr *root, + const struct assoc_array_ptr *stop, + int (*iterator)(const void *leaf, + void *iterator_data), + void *iterator_data) +{ + const struct assoc_array_shortcut *shortcut; + const struct assoc_array_node *node; + const struct assoc_array_ptr *cursor, *ptr, *parent; + unsigned long has_meta; + int slot, ret; + + cursor = root; + +begin_node: + if (assoc_array_ptr_is_shortcut(cursor)) { + /* Descend through a shortcut */ + shortcut = assoc_array_ptr_to_shortcut(cursor); + smp_read_barrier_depends(); + cursor = ACCESS_ONCE(shortcut->next_node); + } + + node = assoc_array_ptr_to_node(cursor); + smp_read_barrier_depends(); + slot = 0; + + /* We perform two passes of each node. + * + * The first pass does all the leaves in this node. This means we + * don't miss any leaves if the node is split up by insertion whilst + * we're iterating over the branches rooted here (we may, however, see + * some leaves twice). + */ + has_meta = 0; + for (; slot < ASSOC_ARRAY_FAN_OUT; slot++) { + ptr = ACCESS_ONCE(node->slots[slot]); + has_meta |= (unsigned long)ptr; + if (ptr && assoc_array_ptr_is_leaf(ptr)) { + /* We need a barrier between the read of the pointer + * and dereferencing the pointer - but only if we are + * actually going to dereference it. + */ + smp_read_barrier_depends(); + + /* Invoke the callback */ + ret = iterator(assoc_array_ptr_to_leaf(ptr), + iterator_data); + if (ret) + return ret; + } + } + + /* The second pass attends to all the metadata pointers. If we follow + * one of these we may find that we don't come back here, but rather go + * back to a replacement node with the leaves in a different layout. + * + * We are guaranteed to make progress, however, as the slot number for + * a particular portion of the key space cannot change - and we + * continue at the back pointer + 1. + */ + if (!(has_meta & ASSOC_ARRAY_PTR_META_TYPE)) + goto finished_node; + slot = 0; + +continue_node: + node = assoc_array_ptr_to_node(cursor); + smp_read_barrier_depends(); + + for (; slot < ASSOC_ARRAY_FAN_OUT; slot++) { + ptr = ACCESS_ONCE(node->slots[slot]); + if (assoc_array_ptr_is_meta(ptr)) { + cursor = ptr; + goto begin_node; + } + } + +finished_node: + /* Move up to the parent (may need to skip back over a shortcut) */ + parent = ACCESS_ONCE(node->back_pointer); + slot = node->parent_slot; + if (parent == stop) + return 0; + + if (assoc_array_ptr_is_shortcut(parent)) { + shortcut = assoc_array_ptr_to_shortcut(parent); + smp_read_barrier_depends(); + cursor = parent; + parent = ACCESS_ONCE(shortcut->back_pointer); + slot = shortcut->parent_slot; + if (parent == stop) + return 0; + } + + /* Ascend to next slot in parent node */ + cursor = parent; + slot++; + goto continue_node; +} + +/** + * assoc_array_iterate - Pass all objects in the array to a callback + * @array: The array to iterate over. + * @iterator: The callback function. + * @iterator_data: Private data for the callback function. + * + * Iterate over all the objects in an associative array. Each one will be + * presented to the iterator function. + * + * If the array is being modified concurrently with the iteration then it is + * possible that some objects in the array will be passed to the iterator + * callback more than once - though every object should be passed at least + * once. If this is undesirable then the caller must lock against modification + * for the duration of this function. + * + * The function will return 0 if no objects were in the array or else it will + * return the result of the last iterator function called. Iteration stops + * immediately if any call to the iteration function results in a non-zero + * return. + * + * The caller should hold the RCU read lock or better if concurrent + * modification is possible. + */ +int assoc_array_iterate(const struct assoc_array *array, + int (*iterator)(const void *object, + void *iterator_data), + void *iterator_data) +{ + struct assoc_array_ptr *root = ACCESS_ONCE(array->root); + + if (!root) + return 0; + return assoc_array_subtree_iterate(root, NULL, iterator, iterator_data); +} + +enum assoc_array_walk_status { + assoc_array_walk_tree_empty, + assoc_array_walk_found_terminal_node, + assoc_array_walk_found_wrong_shortcut, +} status; + +struct assoc_array_walk_result { + struct { + struct assoc_array_node *node; /* Node in which leaf might be found */ + int level; + int slot; + } terminal_node; + struct { + struct assoc_array_shortcut *shortcut; + int level; + int sc_level; + unsigned long sc_segments; + unsigned long dissimilarity; + } wrong_shortcut; +}; + +/* + * Navigate through the internal tree looking for the closest node to the key. + */ +static enum assoc_array_walk_status +assoc_array_walk(const struct assoc_array *array, + const struct assoc_array_ops *ops, + const void *index_key, + struct assoc_array_walk_result *result) +{ + struct assoc_array_shortcut *shortcut; + struct assoc_array_node *node; + struct assoc_array_ptr *cursor, *ptr; + unsigned long sc_segments, dissimilarity; + unsigned long segments; + int level, sc_level, next_sc_level; + int slot; + + pr_devel("-->%s()\n", __func__); + + cursor = ACCESS_ONCE(array->root); + if (!cursor) + return assoc_array_walk_tree_empty; + + level = 0; + + /* Use segments from the key for the new leaf to navigate through the + * internal tree, skipping through nodes and shortcuts that are on + * route to the destination. Eventually we'll come to a slot that is + * either empty or contains a leaf at which point we've found a node in + * which the leaf we're looking for might be found or into which it + * should be inserted. + */ +jumped: + segments = ops->get_key_chunk(index_key, level); + pr_devel("segments[%d]: %lx\n", level, segments); + + if (assoc_array_ptr_is_shortcut(cursor)) + goto follow_shortcut; + +consider_node: + node = assoc_array_ptr_to_node(cursor); + smp_read_barrier_depends(); + + slot = segments >> (level & ASSOC_ARRAY_KEY_CHUNK_MASK); + slot &= ASSOC_ARRAY_FAN_MASK; + ptr = ACCESS_ONCE(node->slots[slot]); + + pr_devel("consider slot %x [ix=%d type=%lu]\n", + slot, level, (unsigned long)ptr & 3); + + if (!assoc_array_ptr_is_meta(ptr)) { + /* The node doesn't have a node/shortcut pointer in the slot + * corresponding to the index key that we have to follow. + */ + result->terminal_node.node = node; + result->terminal_node.level = level; + result->terminal_node.slot = slot; + pr_devel("<--%s() = terminal_node\n", __func__); + return assoc_array_walk_found_terminal_node; + } + + if (assoc_array_ptr_is_node(ptr)) { + /* There is a pointer to a node in the slot corresponding to + * this index key segment, so we need to follow it. + */ + cursor = ptr; + level += ASSOC_ARRAY_LEVEL_STEP; + if ((level & ASSOC_ARRAY_KEY_CHUNK_MASK) != 0) + goto consider_node; + goto jumped; + } + + /* There is a shortcut in the slot corresponding to the index key + * segment. We follow the shortcut if its partial index key matches + * this leaf's. Otherwise we need to split the shortcut. + */ + cursor = ptr; +follow_shortcut: + shortcut = assoc_array_ptr_to_shortcut(cursor); + smp_read_barrier_depends(); + pr_devel("shortcut to %d\n", shortcut->skip_to_level); + sc_level = level + ASSOC_ARRAY_LEVEL_STEP; + BUG_ON(sc_level > shortcut->skip_to_level); + + do { + /* Check the leaf against the shortcut's index key a word at a + * time, trimming the final word (the shortcut stores the index + * key completely from the root to the shortcut's target). + */ + if ((sc_level & ASSOC_ARRAY_KEY_CHUNK_MASK) == 0) + segments = ops->get_key_chunk(index_key, sc_level); + + sc_segments = shortcut->index_key[sc_level >> ASSOC_ARRAY_KEY_CHUNK_SHIFT]; + dissimilarity = segments ^ sc_segments; + + if (round_up(sc_level, ASSOC_ARRAY_KEY_CHUNK_SIZE) > shortcut->skip_to_level) { + /* Trim segments that are beyond the shortcut */ + int shift = shortcut->skip_to_level & ASSOC_ARRAY_KEY_CHUNK_MASK; + dissimilarity &= ~(ULONG_MAX << shift); + next_sc_level = shortcut->skip_to_level; + } else { + next_sc_level = sc_level + ASSOC_ARRAY_KEY_CHUNK_SIZE; + next_sc_level = round_down(next_sc_level, ASSOC_ARRAY_KEY_CHUNK_SIZE); + } + + if (dissimilarity != 0) { + /* This shortcut points elsewhere */ + result->wrong_shortcut.shortcut = shortcut; + result->wrong_shortcut.level = level; + result->wrong_shortcut.sc_level = sc_level; + result->wrong_shortcut.sc_segments = sc_segments; + result->wrong_shortcut.dissimilarity = dissimilarity; + return assoc_array_walk_found_wrong_shortcut; + } + + sc_level = next_sc_level; + } while (sc_level < shortcut->skip_to_level); + + /* The shortcut matches the leaf's index to this point. */ + cursor = ACCESS_ONCE(shortcut->next_node); + if (((level ^ sc_level) & ~ASSOC_ARRAY_KEY_CHUNK_MASK) != 0) { + level = sc_level; + goto jumped; + } else { + level = sc_level; + goto consider_node; + } +} + +/** + * assoc_array_find - Find an object by index key + * @array: The associative array to search. + * @ops: The operations to use. + * @index_key: The key to the object. + * + * Find an object in an associative array by walking through the internal tree + * to the node that should contain the object and then searching the leaves + * there. NULL is returned if the requested object was not found in the array. + * + * The caller must hold the RCU read lock or better. + */ +void *assoc_array_find(const struct assoc_array *array, + const struct assoc_array_ops *ops, + const void *index_key) +{ + struct assoc_array_walk_result result; + const struct assoc_array_node *node; + const struct assoc_array_ptr *ptr; + const void *leaf; + int slot; + + if (assoc_array_walk(array, ops, index_key, &result) != + assoc_array_walk_found_terminal_node) + return NULL; + + node = result.terminal_node.node; + smp_read_barrier_depends(); + + /* If the target key is available to us, it's has to be pointed to by + * the terminal node. + */ + for (slot = 0; slot < ASSOC_ARRAY_FAN_OUT; slot++) { + ptr = ACCESS_ONCE(node->slots[slot]); + if (ptr && assoc_array_ptr_is_leaf(ptr)) { + /* We need a barrier between the read of the pointer + * and dereferencing the pointer - but only if we are + * actually going to dereference it. + */ + leaf = assoc_array_ptr_to_leaf(ptr); + smp_read_barrier_depends(); + if (ops->compare_object(leaf, index_key)) + return (void *)leaf; + } + } + + return NULL; +} + +/* + * Destructively iterate over an associative array. The caller must prevent + * other simultaneous accesses. + */ +static void assoc_array_destroy_subtree(struct assoc_array_ptr *root, + const struct assoc_array_ops *ops) +{ + struct assoc_array_shortcut *shortcut; + struct assoc_array_node *node; + struct assoc_array_ptr *cursor, *parent = NULL; + int slot = -1; + + pr_devel("-->%s()\n", __func__); + + cursor = root; + if (!cursor) { + pr_devel("empty\n"); + return; + } + +move_to_meta: + if (assoc_array_ptr_is_shortcut(cursor)) { + /* Descend through a shortcut */ + pr_devel("[%d] shortcut\n", slot); + BUG_ON(!assoc_array_ptr_is_shortcut(cursor)); + shortcut = assoc_array_ptr_to_shortcut(cursor); + BUG_ON(shortcut->back_pointer != parent); + BUG_ON(slot != -1 && shortcut->parent_slot != slot); + parent = cursor; + cursor = shortcut->next_node; + slot = -1; + BUG_ON(!assoc_array_ptr_is_node(cursor)); + } + + pr_devel("[%d] node\n", slot); + node = assoc_array_ptr_to_node(cursor); + BUG_ON(node->back_pointer != parent); + BUG_ON(slot != -1 && node->parent_slot != slot); + slot = 0; + +continue_node: + pr_devel("Node %p [back=%p]\n", node, node->back_pointer); + for (; slot < ASSOC_ARRAY_FAN_OUT; slot++) { + struct assoc_array_ptr *ptr = node->slots[slot]; + if (!ptr) + continue; + if (assoc_array_ptr_is_meta(ptr)) { + parent = cursor; + cursor = ptr; + goto move_to_meta; + } + + if (ops) { + pr_devel("[%d] free leaf\n", slot); + ops->free_object(assoc_array_ptr_to_leaf(ptr)); + } + } + + parent = node->back_pointer; + slot = node->parent_slot; + pr_devel("free node\n"); + kfree(node); + if (!parent) + return; /* Done */ + + /* Move back up to the parent (may need to free a shortcut on + * the way up) */ + if (assoc_array_ptr_is_shortcut(parent)) { + shortcut = assoc_array_ptr_to_shortcut(parent); + BUG_ON(shortcut->next_node != cursor); + cursor = parent; + parent = shortcut->back_pointer; + slot = shortcut->parent_slot; + pr_devel("free shortcut\n"); + kfree(shortcut); + if (!parent) + return; + + BUG_ON(!assoc_array_ptr_is_node(parent)); + } + + /* Ascend to next slot in parent node */ + pr_devel("ascend to %p[%d]\n", parent, slot); + cursor = parent; + node = assoc_array_ptr_to_node(cursor); + slot++; + goto continue_node; +} + +/** + * assoc_array_destroy - Destroy an associative array + * @array: The array to destroy. + * @ops: The operations to use. + * + * Discard all metadata and free all objects in an associative array. The + * array will be empty and ready to use again upon completion. This function + * cannot fail. + * + * The caller must prevent all other accesses whilst this takes place as no + * attempt is made to adjust pointers gracefully to permit RCU readlock-holding + * accesses to continue. On the other hand, no memory allocation is required. + */ +void assoc_array_destroy(struct assoc_array *array, + const struct assoc_array_ops *ops) +{ + assoc_array_destroy_subtree(array->root, ops); + array->root = NULL; +} + +/* + * Handle insertion into an empty tree. + */ +static bool assoc_array_insert_in_empty_tree(struct assoc_array_edit *edit) +{ + struct assoc_array_node *new_n0; + + pr_devel("-->%s()\n", __func__); + + new_n0 = kzalloc(sizeof(struct assoc_array_node), GFP_KERNEL); + if (!new_n0) + return false; + + edit->new_meta[0] = assoc_array_node_to_ptr(new_n0); + edit->leaf_p = &new_n0->slots[0]; + edit->adjust_count_on = new_n0; + edit->set[0].ptr = &edit->array->root; + edit->set[0].to = assoc_array_node_to_ptr(new_n0); + + pr_devel("<--%s() = ok [no root]\n", __func__); + return true; +} + +/* + * Handle insertion into a terminal node. + */ +static bool assoc_array_insert_into_terminal_node(struct assoc_array_edit *edit, + const struct assoc_array_ops *ops, + const void *index_key, + struct assoc_array_walk_result *result) +{ + struct assoc_array_shortcut *shortcut, *new_s0; + struct assoc_array_node *node, *new_n0, *new_n1, *side; + struct assoc_array_ptr *ptr; + unsigned long dissimilarity, base_seg, blank; + size_t keylen; + bool have_meta; + int level, diff; + int slot, next_slot, free_slot, i, j; + + node = result->terminal_node.node; + level = result->terminal_node.level; + edit->segment_cache[ASSOC_ARRAY_FAN_OUT] = result->terminal_node.slot; + + pr_devel("-->%s()\n", __func__); + + /* We arrived at a node which doesn't have an onward node or shortcut + * pointer that we have to follow. This means that (a) the leaf we + * want must go here (either by insertion or replacement) or (b) we + * need to split this node and insert in one of the fragments. + */ + free_slot = -1; + + /* Firstly, we have to check the leaves in this node to see if there's + * a matching one we should replace in place. + */ + for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) { + ptr = node->slots[i]; + if (!ptr) { + free_slot = i; + continue; + } + if (ops->compare_object(assoc_array_ptr_to_leaf(ptr), index_key)) { + pr_devel("replace in slot %d\n", i); + edit->leaf_p = &node->slots[i]; + edit->dead_leaf = node->slots[i]; + pr_devel("<--%s() = ok [replace]\n", __func__); + return true; + } + } + + /* If there is a free slot in this node then we can just insert the + * leaf here. + */ + if (free_slot >= 0) { + pr_devel("insert in free slot %d\n", free_slot); + edit->leaf_p = &node->slots[free_slot]; + edit->adjust_count_on = node; + pr_devel("<--%s() = ok [insert]\n", __func__); + return true; + } + + /* The node has no spare slots - so we're either going to have to split + * it or insert another node before it. + * + * Whatever, we're going to need at least two new nodes - so allocate + * those now. We may also need a new shortcut, but we deal with that + * when we need it. + */ + new_n0 = kzalloc(sizeof(struct assoc_array_node), GFP_KERNEL); + if (!new_n0) + return false; + edit->new_meta[0] = assoc_array_node_to_ptr(new_n0); + new_n1 = kzalloc(sizeof(struct assoc_array_node), GFP_KERNEL); + if (!new_n1) + return false; + edit->new_meta[1] = assoc_array_node_to_ptr(new_n1); + + /* We need to find out how similar the leaves are. */ + pr_devel("no spare slots\n"); + have_meta = false; + for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) { + ptr = node->slots[i]; + if (assoc_array_ptr_is_meta(ptr)) { + edit->segment_cache[i] = 0xff; + have_meta = true; + continue; + } + base_seg = ops->get_object_key_chunk( + assoc_array_ptr_to_leaf(ptr), level); + base_seg >>= level & ASSOC_ARRAY_KEY_CHUNK_MASK; + edit->segment_cache[i] = base_seg & ASSOC_ARRAY_FAN_MASK; + } + + if (have_meta) { + pr_devel("have meta\n"); + goto split_node; + } + + /* The node contains only leaves */ + dissimilarity = 0; + base_seg = edit->segment_cache[0]; + for (i = 1; i < ASSOC_ARRAY_FAN_OUT; i++) + dissimilarity |= edit->segment_cache[i] ^ base_seg; + + pr_devel("only leaves; dissimilarity=%lx\n", dissimilarity); + + if ((dissimilarity & ASSOC_ARRAY_FAN_MASK) == 0) { + /* The old leaves all cluster in the same slot. We will need + * to insert a shortcut if the new node wants to cluster with them. + */ + if ((edit->segment_cache[ASSOC_ARRAY_FAN_OUT] ^ base_seg) == 0) + goto all_leaves_cluster_together; + + /* Otherwise we can just insert a new node ahead of the old + * one. + */ + goto present_leaves_cluster_but_not_new_leaf; + } + +split_node: + pr_devel("split node\n"); + + /* We need to split the current node; we know that the node doesn't + * simply contain a full set of leaves that cluster together (it + * contains meta pointers and/or non-clustering leaves). + * + * We need to expel at least two leaves out of a set consisting of the + * leaves in the node and the new leaf. + * + * We need a new node (n0) to replace the current one and a new node to + * take the expelled nodes (n1). + */ + edit->set[0].to = assoc_array_node_to_ptr(new_n0); + new_n0->back_pointer = node->back_pointer; + new_n0->parent_slot = node->parent_slot; + new_n1->back_pointer = assoc_array_node_to_ptr(new_n0); + new_n1->parent_slot = -1; /* Need to calculate this */ + +do_split_node: + pr_devel("do_split_node\n"); + + new_n0->nr_leaves_on_branch = node->nr_leaves_on_branch; + new_n1->nr_leaves_on_branch = 0; + + /* Begin by finding two matching leaves. There have to be at least two + * that match - even if there are meta pointers - because any leaf that + * would match a slot with a meta pointer in it must be somewhere + * behind that meta pointer and cannot be here. Further, given N + * remaining leaf slots, we now have N+1 leaves to go in them. + */ + for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) { + slot = edit->segment_cache[i]; + if (slot != 0xff) + for (j = i + 1; j < ASSOC_ARRAY_FAN_OUT + 1; j++) + if (edit->segment_cache[j] == slot) + goto found_slot_for_multiple_occupancy; + } +found_slot_for_multiple_occupancy: + pr_devel("same slot: %x %x [%02x]\n", i, j, slot); + BUG_ON(i >= ASSOC_ARRAY_FAN_OUT); + BUG_ON(j >= ASSOC_ARRAY_FAN_OUT + 1); + BUG_ON(slot >= ASSOC_ARRAY_FAN_OUT); + + new_n1->parent_slot = slot; + + /* Metadata pointers cannot change slot */ + for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) + if (assoc_array_ptr_is_meta(node->slots[i])) + new_n0->slots[i] = node->slots[i]; + else + new_n0->slots[i] = NULL; + BUG_ON(new_n0->slots[slot] != NULL); + new_n0->slots[slot] = assoc_array_node_to_ptr(new_n1); + + /* Filter the leaf pointers between the new nodes */ + free_slot = -1; + next_slot = 0; + for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) { + if (assoc_array_ptr_is_meta(node->slots[i])) + continue; + if (edit->segment_cache[i] == slot) { + new_n1->slots[next_slot++] = node->slots[i]; + new_n1->nr_leaves_on_branch++; + } else { + do { + free_slot++; + } while (new_n0->slots[free_slot] != NULL); + new_n0->slots[free_slot] = node->slots[i]; + } + } + + pr_devel("filtered: f=%x n=%x\n", free_slot, next_slot); + + if (edit->segment_cache[ASSOC_ARRAY_FAN_OUT] != slot) { + do { + free_slot++; + } while (new_n0->slots[free_slot] != NULL); + edit->leaf_p = &new_n0->slots[free_slot]; + edit->adjust_count_on = new_n0; + } else { + edit->leaf_p = &new_n1->slots[next_slot++]; + edit->adjust_count_on = new_n1; + } + + BUG_ON(next_slot <= 1); + + edit->set_backpointers_to = assoc_array_node_to_ptr(new_n0); + for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) { + if (edit->segment_cache[i] == 0xff) { + ptr = node->slots[i]; + BUG_ON(assoc_array_ptr_is_leaf(ptr)); + if (assoc_array_ptr_is_node(ptr)) { + side = assoc_array_ptr_to_node(ptr); + edit->set_backpointers[i] = &side->back_pointer; + } else { + shortcut = assoc_array_ptr_to_shortcut(ptr); + edit->set_backpointers[i] = &shortcut->back_pointer; + } + } + } + + ptr = node->back_pointer; + if (!ptr) + edit->set[0].ptr = &edit->array->root; + else if (assoc_array_ptr_is_node(ptr)) + edit->set[0].ptr = &assoc_array_ptr_to_node(ptr)->slots[node->parent_slot]; + else + edit->set[0].ptr = &assoc_array_ptr_to_shortcut(ptr)->next_node; + edit->excised_meta[0] = assoc_array_node_to_ptr(node); + pr_devel("<--%s() = ok [split node]\n", __func__); + return true; + +present_leaves_cluster_but_not_new_leaf: + /* All the old leaves cluster in the same slot, but the new leaf wants + * to go into a different slot, so we create a new node to hold the new + * leaf and a pointer to a new node holding all the old leaves. + */ + pr_devel("present leaves cluster but not new leaf\n"); + + new_n0->back_pointer = node->back_pointer; + new_n0->parent_slot = node->parent_slot; + new_n0->nr_leaves_on_branch = node->nr_leaves_on_branch; + new_n1->back_pointer = assoc_array_node_to_ptr(new_n0); + new_n1->parent_slot = edit->segment_cache[0]; + new_n1->nr_leaves_on_branch = node->nr_leaves_on_branch; + edit->adjust_count_on = new_n0; + + for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) + new_n1->slots[i] = node->slots[i]; + + new_n0->slots[edit->segment_cache[0]] = assoc_array_node_to_ptr(new_n0); + edit->leaf_p = &new_n0->slots[edit->segment_cache[ASSOC_ARRAY_FAN_OUT]]; + + edit->set[0].ptr = &assoc_array_ptr_to_node(node->back_pointer)->slots[node->parent_slot]; + edit->set[0].to = assoc_array_node_to_ptr(new_n0); + edit->excised_meta[0] = assoc_array_node_to_ptr(node); + pr_devel("<--%s() = ok [insert node before]\n", __func__); + return true; + +all_leaves_cluster_together: + /* All the leaves, new and old, want to cluster together in this node + * in the same slot, so we have to replace this node with a shortcut to + * skip over the identical parts of the key and then place a pair of + * nodes, one inside the other, at the end of the shortcut and + * distribute the keys between them. + * + * Firstly we need to work out where the leaves start diverging as a + * bit position into their keys so that we know how big the shortcut + * needs to be. + * + * We only need to make a single pass of N of the N+1 leaves because if + * any keys differ between themselves at bit X then at least one of + * them must also differ with the base key at bit X or before. + */ + pr_devel("all leaves cluster together\n"); + diff = INT_MAX; + for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) { + int x = ops->diff_objects(assoc_array_ptr_to_leaf(edit->leaf), + assoc_array_ptr_to_leaf(node->slots[i])); + if (x < diff) { + BUG_ON(x < 0); + diff = x; + } + } + BUG_ON(diff == INT_MAX); + BUG_ON(diff < level + ASSOC_ARRAY_LEVEL_STEP); + + keylen = round_up(diff, ASSOC_ARRAY_KEY_CHUNK_SIZE); + keylen >>= ASSOC_ARRAY_KEY_CHUNK_SHIFT; + + new_s0 = kzalloc(sizeof(struct assoc_array_shortcut) + + keylen * sizeof(unsigned long), GFP_KERNEL); + if (!new_s0) + return false; + edit->new_meta[2] = assoc_array_shortcut_to_ptr(new_s0); + + edit->set[0].to = assoc_array_shortcut_to_ptr(new_s0); + new_s0->back_pointer = node->back_pointer; + new_s0->parent_slot = node->parent_slot; + new_s0->next_node = assoc_array_node_to_ptr(new_n0); + new_n0->back_pointer = assoc_array_shortcut_to_ptr(new_s0); + new_n0->parent_slot = 0; + new_n1->back_pointer = assoc_array_node_to_ptr(new_n0); + new_n1->parent_slot = -1; /* Need to calculate this */ + + new_s0->skip_to_level = level = diff & ~ASSOC_ARRAY_LEVEL_STEP_MASK; + pr_devel("skip_to_level = %d [diff %d]\n", level, diff); + BUG_ON(level <= 0); + + for (i = 0; i < keylen; i++) + new_s0->index_key[i] = + ops->get_key_chunk(index_key, i * ASSOC_ARRAY_KEY_CHUNK_SIZE); + + blank = ULONG_MAX << (level & ASSOC_ARRAY_KEY_CHUNK_MASK); + pr_devel("blank off [%zu] %d: %lx\n", keylen - 1, level, blank); + new_s0->index_key[keylen - 1] &= ~blank; + + /* This now reduces to a node splitting exercise for which we'll need + * to regenerate the disparity table. + */ + for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) { + ptr = node->slots[i]; + base_seg = ops->get_object_key_chunk(assoc_array_ptr_to_leaf(ptr), + level); + base_seg >>= level & ASSOC_ARRAY_KEY_CHUNK_MASK; + edit->segment_cache[i] = base_seg & ASSOC_ARRAY_FAN_MASK; + } + + base_seg = ops->get_key_chunk(index_key, level); + base_seg >>= level & ASSOC_ARRAY_KEY_CHUNK_MASK; + edit->segment_cache[ASSOC_ARRAY_FAN_OUT] = base_seg & ASSOC_ARRAY_FAN_MASK; + goto do_split_node; +} + +/* + * Handle insertion into the middle of a shortcut. + */ +static bool assoc_array_insert_mid_shortcut(struct assoc_array_edit *edit, + const struct assoc_array_ops *ops, + struct assoc_array_walk_result *result) +{ + struct assoc_array_shortcut *shortcut, *new_s0, *new_s1; + struct assoc_array_node *node, *new_n0, *side; + unsigned long sc_segments, dissimilarity, blank; + size_t keylen; + int level, sc_level, diff; + int sc_slot; + + shortcut = result->wrong_shortcut.shortcut; + level = result->wrong_shortcut.level; + sc_level = result->wrong_shortcut.sc_level; + sc_segments = result->wrong_shortcut.sc_segments; + dissimilarity = result->wrong_shortcut.dissimilarity; + + pr_devel("-->%s(ix=%d dis=%lx scix=%d)\n", + __func__, level, dissimilarity, sc_level); + + /* We need to split a shortcut and insert a node between the two + * pieces. Zero-length pieces will be dispensed with entirely. + * + * First of all, we need to find out in which level the first + * difference was. + */ + diff = __ffs(dissimilarity); + diff &= ~ASSOC_ARRAY_LEVEL_STEP_MASK; + diff += sc_level & ~ASSOC_ARRAY_KEY_CHUNK_MASK; + pr_devel("diff=%d\n", diff); + + if (!shortcut->back_pointer) { + edit->set[0].ptr = &edit->array->root; + } else if (assoc_array_ptr_is_node(shortcut->back_pointer)) { + node = assoc_array_ptr_to_node(shortcut->back_pointer); + edit->set[0].ptr = &node->slots[shortcut->parent_slot]; + } else { + BUG(); + } + + edit->excised_meta[0] = assoc_array_shortcut_to_ptr(shortcut); + + /* Create a new node now since we're going to need it anyway */ + new_n0 = kzalloc(sizeof(struct assoc_array_node), GFP_KERNEL); + if (!new_n0) + return false; + edit->new_meta[0] = assoc_array_node_to_ptr(new_n0); + edit->adjust_count_on = new_n0; + + /* Insert a new shortcut before the new node if this segment isn't of + * zero length - otherwise we just connect the new node directly to the + * parent. + */ + level += ASSOC_ARRAY_LEVEL_STEP; + if (diff > level) { + pr_devel("pre-shortcut %d...%d\n", level, diff); + keylen = round_up(diff, ASSOC_ARRAY_KEY_CHUNK_SIZE); + keylen >>= ASSOC_ARRAY_KEY_CHUNK_SHIFT; + + new_s0 = kzalloc(sizeof(struct assoc_array_shortcut) + + keylen * sizeof(unsigned long), GFP_KERNEL); + if (!new_s0) + return false; + edit->new_meta[1] = assoc_array_shortcut_to_ptr(new_s0); + edit->set[0].to = assoc_array_shortcut_to_ptr(new_s0); + new_s0->back_pointer = shortcut->back_pointer; + new_s0->parent_slot = shortcut->parent_slot; + new_s0->next_node = assoc_array_node_to_ptr(new_n0); + new_s0->skip_to_level = diff; + + new_n0->back_pointer = assoc_array_shortcut_to_ptr(new_s0); + new_n0->parent_slot = 0; + + memcpy(new_s0->index_key, shortcut->index_key, + keylen * sizeof(unsigned long)); + + blank = ULONG_MAX << (diff & ASSOC_ARRAY_KEY_CHUNK_MASK); + pr_devel("blank off [%zu] %d: %lx\n", keylen - 1, diff, blank); + new_s0->index_key[keylen - 1] &= ~blank; + } else { + pr_devel("no pre-shortcut\n"); + edit->set[0].to = assoc_array_node_to_ptr(new_n0); + new_n0->back_pointer = shortcut->back_pointer; + new_n0->parent_slot = shortcut->parent_slot; + } + + side = assoc_array_ptr_to_node(shortcut->next_node); + new_n0->nr_leaves_on_branch = side->nr_leaves_on_branch; + + /* We need to know which slot in the new node is going to take a + * metadata pointer. + */ + sc_slot = sc_segments >> (diff & ASSOC_ARRAY_KEY_CHUNK_MASK); + sc_slot &= ASSOC_ARRAY_FAN_MASK; + + pr_devel("new slot %lx >> %d -> %d\n", + sc_segments, diff & ASSOC_ARRAY_KEY_CHUNK_MASK, sc_slot); + + /* Determine whether we need to follow the new node with a replacement + * for the current shortcut. We could in theory reuse the current + * shortcut if its parent slot number doesn't change - but that's a + * 1-in-16 chance so not worth expending the code upon. + */ + level = diff + ASSOC_ARRAY_LEVEL_STEP; + if (level < shortcut->skip_to_level) { + pr_devel("post-shortcut %d...%d\n", level, shortcut->skip_to_level); + keylen = round_up(shortcut->skip_to_level, ASSOC_ARRAY_KEY_CHUNK_SIZE); + keylen >>= ASSOC_ARRAY_KEY_CHUNK_SHIFT; + + new_s1 = kzalloc(sizeof(struct assoc_array_shortcut) + + keylen * sizeof(unsigned long), GFP_KERNEL); + if (!new_s1) + return false; + edit->new_meta[2] = assoc_array_shortcut_to_ptr(new_s1); + + new_s1->back_pointer = assoc_array_node_to_ptr(new_n0); + new_s1->parent_slot = sc_slot; + new_s1->next_node = shortcut->next_node; + new_s1->skip_to_level = shortcut->skip_to_level; + + new_n0->slots[sc_slot] = assoc_array_shortcut_to_ptr(new_s1); + + memcpy(new_s1->index_key, shortcut->index_key, + keylen * sizeof(unsigned long)); + + edit->set[1].ptr = &side->back_pointer; + edit->set[1].to = assoc_array_shortcut_to_ptr(new_s1); + } else { + pr_devel("no post-shortcut\n"); + + /* We don't have to replace the pointed-to node as long as we + * use memory barriers to make sure the parent slot number is + * changed before the back pointer (the parent slot number is + * irrelevant to the old parent shortcut). + */ + new_n0->slots[sc_slot] = shortcut->next_node; + edit->set_parent_slot[0].p = &side->parent_slot; + edit->set_parent_slot[0].to = sc_slot; + edit->set[1].ptr = &side->back_pointer; + edit->set[1].to = assoc_array_node_to_ptr(new_n0); + } + + /* Install the new leaf in a spare slot in the new node. */ + if (sc_slot == 0) + edit->leaf_p = &new_n0->slots[1]; + else + edit->leaf_p = &new_n0->slots[0]; + + pr_devel("<--%s() = ok [split shortcut]\n", __func__); + return edit; +} + +/** + * assoc_array_insert - Script insertion of an object into an associative array + * @array: The array to insert into. + * @ops: The operations to use. + * @index_key: The key to insert at. + * @object: The object to insert. + * + * Precalculate and preallocate a script for the insertion or replacement of an + * object in an associative array. This results in an edit script that can + * either be applied or cancelled. + * + * The function returns a pointer to an edit script or -ENOMEM. + * + * The caller should lock against other modifications and must continue to hold + * the lock until assoc_array_apply_edit() has been called. + * + * Accesses to the tree may take place concurrently with this function, + * provided they hold the RCU read lock. + */ +struct assoc_array_edit *assoc_array_insert(struct assoc_array *array, + const struct assoc_array_ops *ops, + const void *index_key, + void *object) +{ + struct assoc_array_walk_result result; + struct assoc_array_edit *edit; + + pr_devel("-->%s()\n", __func__); + + /* The leaf pointer we're given must not have the bottom bit set as we + * use those for type-marking the pointer. NULL pointers are also not + * allowed as they indicate an empty slot but we have to allow them + * here as they can be updated later. + */ + BUG_ON(assoc_array_ptr_is_meta(object)); + + edit = kzalloc(sizeof(struct assoc_array_edit), GFP_KERNEL); + if (!edit) + return ERR_PTR(-ENOMEM); + edit->array = array; + edit->ops = ops; + edit->leaf = assoc_array_leaf_to_ptr(object); + edit->adjust_count_by = 1; + + switch (assoc_array_walk(array, ops, index_key, &result)) { + case assoc_array_walk_tree_empty: + /* Allocate a root node if there isn't one yet */ + if (!assoc_array_insert_in_empty_tree(edit)) + goto enomem; + return edit; + + case assoc_array_walk_found_terminal_node: + /* We found a node that doesn't have a node/shortcut pointer in + * the slot corresponding to the index key that we have to + * follow. + */ + if (!assoc_array_insert_into_terminal_node(edit, ops, index_key, + &result)) + goto enomem; + return edit; + + case assoc_array_walk_found_wrong_shortcut: + /* We found a shortcut that didn't match our key in a slot we + * needed to follow. + */ + if (!assoc_array_insert_mid_shortcut(edit, ops, &result)) + goto enomem; + return edit; + } + +enomem: + /* Clean up after an out of memory error */ + pr_devel("enomem\n"); + assoc_array_cancel_edit(edit); + return ERR_PTR(-ENOMEM); +} + +/** + * assoc_array_insert_set_object - Set the new object pointer in an edit script + * @edit: The edit script to modify. + * @object: The object pointer to set. + * + * Change the object to be inserted in an edit script. The object pointed to + * by the old object is not freed. This must be done prior to applying the + * script. + */ +void assoc_array_insert_set_object(struct assoc_array_edit *edit, void *object) +{ + BUG_ON(!object); + edit->leaf = assoc_array_leaf_to_ptr(object); +} + +struct assoc_array_delete_collapse_context { + struct assoc_array_node *node; + const void *skip_leaf; + int slot; +}; + +/* + * Subtree collapse to node iterator. + */ +static int assoc_array_delete_collapse_iterator(const void *leaf, + void *iterator_data) +{ + struct assoc_array_delete_collapse_context *collapse = iterator_data; + + if (leaf == collapse->skip_leaf) + return 0; + + BUG_ON(collapse->slot >= ASSOC_ARRAY_FAN_OUT); + + collapse->node->slots[collapse->slot++] = assoc_array_leaf_to_ptr(leaf); + return 0; +} + +/** + * assoc_array_delete - Script deletion of an object from an associative array + * @array: The array to search. + * @ops: The operations to use. + * @index_key: The key to the object. + * + * Precalculate and preallocate a script for the deletion of an object from an + * associative array. This results in an edit script that can either be + * applied or cancelled. + * + * The function returns a pointer to an edit script if the object was found, + * NULL if the object was not found or -ENOMEM. + * + * The caller should lock against other modifications and must continue to hold + * the lock until assoc_array_apply_edit() has been called. + * + * Accesses to the tree may take place concurrently with this function, + * provided they hold the RCU read lock. + */ +struct assoc_array_edit *assoc_array_delete(struct assoc_array *array, + const struct assoc_array_ops *ops, + const void *index_key) +{ + struct assoc_array_delete_collapse_context collapse; + struct assoc_array_walk_result result; + struct assoc_array_node *node, *new_n0; + struct assoc_array_edit *edit; + struct assoc_array_ptr *ptr; + bool has_meta; + int slot, i; + + pr_devel("-->%s()\n", __func__); + + edit = kzalloc(sizeof(struct assoc_array_edit), GFP_KERNEL); + if (!edit) + return ERR_PTR(-ENOMEM); + edit->array = array; + edit->ops = ops; + edit->adjust_count_by = -1; + + switch (assoc_array_walk(array, ops, index_key, &result)) { + case assoc_array_walk_found_terminal_node: + /* We found a node that should contain the leaf we've been + * asked to remove - *if* it's in the tree. + */ + pr_devel("terminal_node\n"); + node = result.terminal_node.node; + + for (slot = 0; slot < ASSOC_ARRAY_FAN_OUT; slot++) { + ptr = node->slots[slot]; + if (ptr && + assoc_array_ptr_is_leaf(ptr) && + ops->compare_object(assoc_array_ptr_to_leaf(ptr), + index_key)) + goto found_leaf; + } + case assoc_array_walk_tree_empty: + case assoc_array_walk_found_wrong_shortcut: + default: + assoc_array_cancel_edit(edit); + pr_devel("not found\n"); + return NULL; + } + +found_leaf: + BUG_ON(array->nr_leaves_on_tree <= 0); + + /* In the simplest form of deletion we just clear the slot and release + * the leaf after a suitable interval. + */ + edit->dead_leaf = node->slots[slot]; + edit->set[0].ptr = &node->slots[slot]; + edit->set[0].to = NULL; + edit->adjust_count_on = node; + + /* If that concludes erasure of the last leaf, then delete the entire + * internal array. + */ + if (array->nr_leaves_on_tree == 1) { + edit->set[1].ptr = &array->root; + edit->set[1].to = NULL; + edit->adjust_count_on = NULL; + edit->excised_subtree = array->root; + pr_devel("all gone\n"); + return edit; + } + + /* However, we'd also like to clear up some metadata blocks if we + * possibly can. + * + * We go for a simple algorithm of: if this node has FAN_OUT or fewer + * leaves in it, then attempt to collapse it - and attempt to + * recursively collapse up the tree. + * + * We could also try and collapse in partially filled subtrees to take + * up space in this node. + */ + if (node->nr_leaves_on_branch <= ASSOC_ARRAY_FAN_OUT + 1) { + struct assoc_array_node *parent, *grandparent; + struct assoc_array_ptr *ptr; + + /* First of all, we need to know if this node has metadata so + * that we don't try collapsing if all the leaves are already + * here. + */ + has_meta = false; + for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) { + ptr = node->slots[i]; + if (assoc_array_ptr_is_meta(ptr)) { + has_meta = true; + break; + } + } + + pr_devel("leaves: %ld [m=%d]\n", + node->nr_leaves_on_branch - 1, has_meta); + + /* Look further up the tree to see if we can collapse this node + * into a more proximal node too. + */ + parent = node; + collapse_up: + pr_devel("collapse subtree: %ld\n", parent->nr_leaves_on_branch); + + ptr = parent->back_pointer; + if (!ptr) + goto do_collapse; + if (assoc_array_ptr_is_shortcut(ptr)) { + struct assoc_array_shortcut *s = assoc_array_ptr_to_shortcut(ptr); + ptr = s->back_pointer; + if (!ptr) + goto do_collapse; + } + + grandparent = assoc_array_ptr_to_node(ptr); + if (grandparent->nr_leaves_on_branch <= ASSOC_ARRAY_FAN_OUT + 1) { + parent = grandparent; + goto collapse_up; + } + + do_collapse: + /* There's no point collapsing if the original node has no meta + * pointers to discard and if we didn't merge into one of that + * node's ancestry. + */ + if (has_meta || parent != node) { + node = parent; + + /* Create a new node to collapse into */ + new_n0 = kzalloc(sizeof(struct assoc_array_node), GFP_KERNEL); + if (!new_n0) + goto enomem; + edit->new_meta[0] = assoc_array_node_to_ptr(new_n0); + + new_n0->back_pointer = node->back_pointer; + new_n0->parent_slot = node->parent_slot; + new_n0->nr_leaves_on_branch = node->nr_leaves_on_branch; + edit->adjust_count_on = new_n0; + + collapse.node = new_n0; + collapse.skip_leaf = assoc_array_ptr_to_leaf(edit->dead_leaf); + collapse.slot = 0; + assoc_array_subtree_iterate(assoc_array_node_to_ptr(node), + node->back_pointer, + assoc_array_delete_collapse_iterator, + &collapse); + pr_devel("collapsed %d,%lu\n", collapse.slot, new_n0->nr_leaves_on_branch); + BUG_ON(collapse.slot != new_n0->nr_leaves_on_branch - 1); + + if (!node->back_pointer) { + edit->set[1].ptr = &array->root; + } else if (assoc_array_ptr_is_leaf(node->back_pointer)) { + BUG(); + } else if (assoc_array_ptr_is_node(node->back_pointer)) { + struct assoc_array_node *p = + assoc_array_ptr_to_node(node->back_pointer); + edit->set[1].ptr = &p->slots[node->parent_slot]; + } else if (assoc_array_ptr_is_shortcut(node->back_pointer)) { + struct assoc_array_shortcut *s = + assoc_array_ptr_to_shortcut(node->back_pointer); + edit->set[1].ptr = &s->next_node; + } + edit->set[1].to = assoc_array_node_to_ptr(new_n0); + edit->excised_subtree = assoc_array_node_to_ptr(node); + } + } + + return edit; + +enomem: + /* Clean up after an out of memory error */ + pr_devel("enomem\n"); + assoc_array_cancel_edit(edit); + return ERR_PTR(-ENOMEM); +} + +/** + * assoc_array_clear - Script deletion of all objects from an associative array + * @array: The array to clear. + * @ops: The operations to use. + * + * Precalculate and preallocate a script for the deletion of all the objects + * from an associative array. This results in an edit script that can either + * be applied or cancelled. + * + * The function returns a pointer to an edit script if there are objects to be + * deleted, NULL if there are no objects in the array or -ENOMEM. + * + * The caller should lock against other modifications and must continue to hold + * the lock until assoc_array_apply_edit() has been called. + * + * Accesses to the tree may take place concurrently with this function, + * provided they hold the RCU read lock. + */ +struct assoc_array_edit *assoc_array_clear(struct assoc_array *array, + const struct assoc_array_ops *ops) +{ + struct assoc_array_edit *edit; + + pr_devel("-->%s()\n", __func__); + + if (!array->root) + return NULL; + + edit = kzalloc(sizeof(struct assoc_array_edit), GFP_KERNEL); + if (!edit) + return ERR_PTR(-ENOMEM); + edit->array = array; + edit->ops = ops; + edit->set[1].ptr = &array->root; + edit->set[1].to = NULL; + edit->excised_subtree = array->root; + edit->ops_for_excised_subtree = ops; + pr_devel("all gone\n"); + return edit; +} + +/* + * Handle the deferred destruction after an applied edit. + */ +static void assoc_array_rcu_cleanup(struct rcu_head *head) +{ + struct assoc_array_edit *edit = + container_of(head, struct assoc_array_edit, rcu); + int i; + + pr_devel("-->%s()\n", __func__); + + if (edit->dead_leaf) + edit->ops->free_object(assoc_array_ptr_to_leaf(edit->dead_leaf)); + for (i = 0; i < ARRAY_SIZE(edit->excised_meta); i++) + if (edit->excised_meta[i]) + kfree(assoc_array_ptr_to_node(edit->excised_meta[i])); + + if (edit->excised_subtree) { + BUG_ON(assoc_array_ptr_is_leaf(edit->excised_subtree)); + if (assoc_array_ptr_is_node(edit->excised_subtree)) { + struct assoc_array_node *n = + assoc_array_ptr_to_node(edit->excised_subtree); + n->back_pointer = NULL; + } else { + struct assoc_array_shortcut *s = + assoc_array_ptr_to_shortcut(edit->excised_subtree); + s->back_pointer = NULL; + } + assoc_array_destroy_subtree(edit->excised_subtree, + edit->ops_for_excised_subtree); + } + + kfree(edit); +} + +/** + * assoc_array_apply_edit - Apply an edit script to an associative array + * @edit: The script to apply. + * + * Apply an edit script to an associative array to effect an insertion, + * deletion or clearance. As the edit script includes preallocated memory, + * this is guaranteed not to fail. + * + * The edit script, dead objects and dead metadata will be scheduled for + * destruction after an RCU grace period to permit those doing read-only + * accesses on the array to continue to do so under the RCU read lock whilst + * the edit is taking place. + */ +void assoc_array_apply_edit(struct assoc_array_edit *edit) +{ + struct assoc_array_shortcut *shortcut; + struct assoc_array_node *node; + struct assoc_array_ptr *ptr; + int i; + + pr_devel("-->%s()\n", __func__); + + smp_wmb(); + if (edit->leaf_p) + *edit->leaf_p = edit->leaf; + + smp_wmb(); + for (i = 0; i < ARRAY_SIZE(edit->set_parent_slot); i++) + if (edit->set_parent_slot[i].p) + *edit->set_parent_slot[i].p = edit->set_parent_slot[i].to; + + smp_wmb(); + for (i = 0; i < ARRAY_SIZE(edit->set_backpointers); i++) + if (edit->set_backpointers[i]) + *edit->set_backpointers[i] = edit->set_backpointers_to; + + smp_wmb(); + for (i = 0; i < ARRAY_SIZE(edit->set); i++) + if (edit->set[i].ptr) + *edit->set[i].ptr = edit->set[i].to; + + if (edit->array->root == NULL) { + edit->array->nr_leaves_on_tree = 0; + } else if (edit->adjust_count_on) { + node = edit->adjust_count_on; + for (;;) { + node->nr_leaves_on_branch += edit->adjust_count_by; + + ptr = node->back_pointer; + if (!ptr) + break; + if (assoc_array_ptr_is_shortcut(ptr)) { + shortcut = assoc_array_ptr_to_shortcut(ptr); + ptr = shortcut->back_pointer; + if (!ptr) + break; + } + BUG_ON(!assoc_array_ptr_is_node(ptr)); + node = assoc_array_ptr_to_node(ptr); + } + + edit->array->nr_leaves_on_tree += edit->adjust_count_by; + } + + call_rcu(&edit->rcu, assoc_array_rcu_cleanup); +} + +/** + * assoc_array_cancel_edit - Discard an edit script. + * @edit: The script to discard. + * + * Free an edit script and all the preallocated data it holds without making + * any changes to the associative array it was intended for. + * + * NOTE! In the case of an insertion script, this does _not_ release the leaf + * that was to be inserted. That is left to the caller. + */ +void assoc_array_cancel_edit(struct assoc_array_edit *edit) +{ + struct assoc_array_ptr *ptr; + int i; + + pr_devel("-->%s()\n", __func__); + + /* Clean up after an out of memory error */ + for (i = 0; i < ARRAY_SIZE(edit->new_meta); i++) { + ptr = edit->new_meta[i]; + if (ptr) { + if (assoc_array_ptr_is_node(ptr)) + kfree(assoc_array_ptr_to_node(ptr)); + else + kfree(assoc_array_ptr_to_shortcut(ptr)); + } + } + kfree(edit); +} + +/** + * assoc_array_gc - Garbage collect an associative array. + * @array: The array to clean. + * @ops: The operations to use. + * @iterator: A callback function to pass judgement on each object. + * @iterator_data: Private data for the callback function. + * + * Collect garbage from an associative array and pack down the internal tree to + * save memory. + * + * The iterator function is asked to pass judgement upon each object in the + * array. If it returns false, the object is discard and if it returns true, + * the object is kept. If it returns true, it must increment the object's + * usage count (or whatever it needs to do to retain it) before returning. + * + * This function returns 0 if successful or -ENOMEM if out of memory. In the + * latter case, the array is not changed. + * + * The caller should lock against other modifications and must continue to hold + * the lock until assoc_array_apply_edit() has been called. + * + * Accesses to the tree may take place concurrently with this function, + * provided they hold the RCU read lock. + */ +int assoc_array_gc(struct assoc_array *array, + const struct assoc_array_ops *ops, + bool (*iterator)(void *object, void *iterator_data), + void *iterator_data) +{ + struct assoc_array_shortcut *shortcut, *new_s; + struct assoc_array_node *node, *new_n; + struct assoc_array_edit *edit; + struct assoc_array_ptr *cursor, *ptr; + struct assoc_array_ptr *new_root, *new_parent, **new_ptr_pp; + unsigned long nr_leaves_on_tree; + int keylen, slot, nr_free, next_slot, i; + + pr_devel("-->%s()\n", __func__); + + if (!array->root) + return 0; + + edit = kzalloc(sizeof(struct assoc_array_edit), GFP_KERNEL); + if (!edit) + return -ENOMEM; + edit->array = array; + edit->ops = ops; + edit->ops_for_excised_subtree = ops; + edit->set[0].ptr = &array->root; + edit->excised_subtree = array->root; + + new_root = new_parent = NULL; + new_ptr_pp = &new_root; + cursor = array->root; + +descend: + /* If this point is a shortcut, then we need to duplicate it and + * advance the target cursor. + */ + if (assoc_array_ptr_is_shortcut(cursor)) { + shortcut = assoc_array_ptr_to_shortcut(cursor); + keylen = round_up(shortcut->skip_to_level, ASSOC_ARRAY_KEY_CHUNK_SIZE); + keylen >>= ASSOC_ARRAY_KEY_CHUNK_SHIFT; + new_s = kmalloc(sizeof(struct assoc_array_shortcut) + + keylen * sizeof(unsigned long), GFP_KERNEL); + if (!new_s) + goto enomem; + pr_devel("dup shortcut %p -> %p\n", shortcut, new_s); + memcpy(new_s, shortcut, (sizeof(struct assoc_array_shortcut) + + keylen * sizeof(unsigned long))); + new_s->back_pointer = new_parent; + new_s->parent_slot = shortcut->parent_slot; + *new_ptr_pp = new_parent = assoc_array_shortcut_to_ptr(new_s); + new_ptr_pp = &new_s->next_node; + cursor = shortcut->next_node; + } + + /* Duplicate the node at this position */ + node = assoc_array_ptr_to_node(cursor); + new_n = kzalloc(sizeof(struct assoc_array_node), GFP_KERNEL); + if (!new_n) + goto enomem; + pr_devel("dup node %p -> %p\n", node, new_n); + new_n->back_pointer = new_parent; + new_n->parent_slot = node->parent_slot; + *new_ptr_pp = new_parent = assoc_array_node_to_ptr(new_n); + new_ptr_pp = NULL; + slot = 0; + +continue_node: + /* Filter across any leaves and gc any subtrees */ + for (; slot < ASSOC_ARRAY_FAN_OUT; slot++) { + ptr = node->slots[slot]; + if (!ptr) + continue; + + if (assoc_array_ptr_is_leaf(ptr)) { + if (iterator(assoc_array_ptr_to_leaf(ptr), + iterator_data)) + /* The iterator will have done any reference + * counting on the object for us. + */ + new_n->slots[slot] = ptr; + continue; + } + + new_ptr_pp = &new_n->slots[slot]; + cursor = ptr; + goto descend; + } + + pr_devel("-- compress node %p --\n", new_n); + + /* Count up the number of empty slots in this node and work out the + * subtree leaf count. + */ + new_n->nr_leaves_on_branch = 0; + nr_free = 0; + for (slot = 0; slot < ASSOC_ARRAY_FAN_OUT; slot++) { + ptr = new_n->slots[slot]; + if (!ptr) + nr_free++; + else if (assoc_array_ptr_is_leaf(ptr)) + new_n->nr_leaves_on_branch++; + } + pr_devel("free=%d, leaves=%lu\n", nr_free, new_n->nr_leaves_on_branch); + + /* See what we can fold in */ + next_slot = 0; + for (slot = 0; slot < ASSOC_ARRAY_FAN_OUT; slot++) { + struct assoc_array_shortcut *s; + struct assoc_array_node *child; + + ptr = new_n->slots[slot]; + if (!ptr || assoc_array_ptr_is_leaf(ptr)) + continue; + + s = NULL; + if (assoc_array_ptr_is_shortcut(ptr)) { + s = assoc_array_ptr_to_shortcut(ptr); + ptr = s->next_node; + } + + child = assoc_array_ptr_to_node(ptr); + new_n->nr_leaves_on_branch += child->nr_leaves_on_branch; + + if (child->nr_leaves_on_branch <= nr_free + 1) { + /* Fold the child node into this one */ + pr_devel("[%d] fold node %lu/%d [nx %d]\n", + slot, child->nr_leaves_on_branch, nr_free + 1, + next_slot); + + /* We would already have reaped an intervening shortcut + * on the way back up the tree. + */ + BUG_ON(s); + + new_n->slots[slot] = NULL; + nr_free++; + if (slot < next_slot) + next_slot = slot; + for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) { + struct assoc_array_ptr *p = child->slots[i]; + if (!p) + continue; + BUG_ON(assoc_array_ptr_is_meta(p)); + while (new_n->slots[next_slot]) + next_slot++; + BUG_ON(next_slot >= ASSOC_ARRAY_FAN_OUT); + new_n->slots[next_slot++] = p; + nr_free--; + } + kfree(child); + } else { + pr_devel("[%d] retain node %lu/%d [nx %d]\n", + slot, child->nr_leaves_on_branch, nr_free + 1, + next_slot); + } + } + + pr_devel("after: %lu\n", new_n->nr_leaves_on_branch); + + nr_leaves_on_tree = new_n->nr_leaves_on_branch; + + /* Excise this node if it is singly occupied by a shortcut */ + if (nr_free == ASSOC_ARRAY_FAN_OUT - 1) { + for (slot = 0; slot < ASSOC_ARRAY_FAN_OUT; slot++) + if ((ptr = new_n->slots[slot])) + break; + + if (assoc_array_ptr_is_meta(ptr) && + assoc_array_ptr_is_shortcut(ptr)) { + pr_devel("excise node %p with 1 shortcut\n", new_n); + new_s = assoc_array_ptr_to_shortcut(ptr); + new_parent = new_n->back_pointer; + slot = new_n->parent_slot; + kfree(new_n); + if (!new_parent) { + new_s->back_pointer = NULL; + new_s->parent_slot = 0; + new_root = ptr; + goto gc_complete; + } + + if (assoc_array_ptr_is_shortcut(new_parent)) { + /* We can discard any preceding shortcut also */ + struct assoc_array_shortcut *s = + assoc_array_ptr_to_shortcut(new_parent); + + pr_devel("excise preceding shortcut\n"); + + new_parent = new_s->back_pointer = s->back_pointer; + slot = new_s->parent_slot = s->parent_slot; + kfree(s); + if (!new_parent) { + new_s->back_pointer = NULL; + new_s->parent_slot = 0; + new_root = ptr; + goto gc_complete; + } + } + + new_s->back_pointer = new_parent; + new_s->parent_slot = slot; + new_n = assoc_array_ptr_to_node(new_parent); + new_n->slots[slot] = ptr; + goto ascend_old_tree; + } + } + + /* Excise any shortcuts we might encounter that point to nodes that + * only contain leaves. + */ + ptr = new_n->back_pointer; + if (!ptr) + goto gc_complete; + + if (assoc_array_ptr_is_shortcut(ptr)) { + new_s = assoc_array_ptr_to_shortcut(ptr); + new_parent = new_s->back_pointer; + slot = new_s->parent_slot; + + if (new_n->nr_leaves_on_branch <= ASSOC_ARRAY_FAN_OUT) { + struct assoc_array_node *n; + + pr_devel("excise shortcut\n"); + new_n->back_pointer = new_parent; + new_n->parent_slot = slot; + kfree(new_s); + if (!new_parent) { + new_root = assoc_array_node_to_ptr(new_n); + goto gc_complete; + } + + n = assoc_array_ptr_to_node(new_parent); + n->slots[slot] = assoc_array_node_to_ptr(new_n); + } + } else { + new_parent = ptr; + } + new_n = assoc_array_ptr_to_node(new_parent); + +ascend_old_tree: + ptr = node->back_pointer; + if (assoc_array_ptr_is_shortcut(ptr)) { + shortcut = assoc_array_ptr_to_shortcut(ptr); + slot = shortcut->parent_slot; + cursor = shortcut->back_pointer; + } else { + slot = node->parent_slot; + cursor = ptr; + } + BUG_ON(!ptr); + node = assoc_array_ptr_to_node(cursor); + slot++; + goto continue_node; + +gc_complete: + edit->set[0].to = new_root; + assoc_array_apply_edit(edit); + edit->array->nr_leaves_on_tree = nr_leaves_on_tree; + return 0; + +enomem: + pr_devel("enomem\n"); + assoc_array_destroy_subtree(new_root, edit->ops); + kfree(edit); + return -ENOMEM; +} diff --git a/lib/kfifo.c b/lib/kfifo.c index 7b7f83027b7..d79b9d22206 100644 --- a/lib/kfifo.c +++ b/lib/kfifo.c @@ -215,7 +215,7 @@ static unsigned long kfifo_copy_from_user(struct __kfifo *fifo, * incrementing the fifo->in index counter */ smp_wmb(); - *copied = len - ret; + *copied = len - ret * esize; /* return the number of elements which are not copied */ return ret; } @@ -275,7 +275,7 @@ static unsigned long kfifo_copy_to_user(struct __kfifo *fifo, void __user *to, * incrementing the fifo->out index counter */ smp_wmb(); - *copied = len - ret; + *copied = len - ret * esize; /* return the number of elements which are not copied */ return ret; } diff --git a/lib/llist.c b/lib/llist.c index 4a70d120138..f76196d0740 100644 --- a/lib/llist.c +++ b/lib/llist.c @@ -81,3 +81,25 @@ struct llist_node *llist_del_first(struct llist_head *head) return entry; } EXPORT_SYMBOL_GPL(llist_del_first); + +/** + * llist_reverse_order - reverse order of a llist chain + * @head: first item of the list to be reversed + * + * Reverse the order of a chain of llist entries and return the + * new first entry. + */ +struct llist_node *llist_reverse_order(struct llist_node *head) +{ + struct llist_node *new_head = NULL; + + while (head) { + struct llist_node *tmp = head; + head = head->next; + tmp->next = new_head; + new_head = tmp; + } + + return new_head; +} +EXPORT_SYMBOL_GPL(llist_reverse_order); diff --git a/lib/lockref.c b/lib/lockref.c index af6e95d0bed..d2b123f8456 100644 --- a/lib/lockref.c +++ b/lib/lockref.c @@ -1,7 +1,7 @@ #include <linux/export.h> #include <linux/lockref.h> -#ifdef CONFIG_CMPXCHG_LOCKREF +#if USE_CMPXCHG_LOCKREF /* * Allow weakly-ordered memory architectures to provide barrier-less diff --git a/lib/mpi/mpiutil.c b/lib/mpi/mpiutil.c index 657979f71be..bf076d281d4 100644 --- a/lib/mpi/mpiutil.c +++ b/lib/mpi/mpiutil.c @@ -121,3 +121,6 @@ void mpi_free(MPI a) kfree(a); } EXPORT_SYMBOL_GPL(mpi_free); + +MODULE_DESCRIPTION("Multiprecision maths library"); +MODULE_LICENSE("GPL"); diff --git a/lib/percpu-rwsem.c b/lib/percpu-rwsem.c deleted file mode 100644 index 652a8ee8efe..00000000000 --- a/lib/percpu-rwsem.c +++ /dev/null @@ -1,165 +0,0 @@ -#include <linux/atomic.h> -#include <linux/rwsem.h> -#include <linux/percpu.h> -#include <linux/wait.h> -#include <linux/lockdep.h> -#include <linux/percpu-rwsem.h> -#include <linux/rcupdate.h> -#include <linux/sched.h> -#include <linux/errno.h> - -int __percpu_init_rwsem(struct percpu_rw_semaphore *brw, - const char *name, struct lock_class_key *rwsem_key) -{ - brw->fast_read_ctr = alloc_percpu(int); - if (unlikely(!brw->fast_read_ctr)) - return -ENOMEM; - - /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */ - __init_rwsem(&brw->rw_sem, name, rwsem_key); - atomic_set(&brw->write_ctr, 0); - atomic_set(&brw->slow_read_ctr, 0); - init_waitqueue_head(&brw->write_waitq); - return 0; -} - -void percpu_free_rwsem(struct percpu_rw_semaphore *brw) -{ - free_percpu(brw->fast_read_ctr); - brw->fast_read_ctr = NULL; /* catch use after free bugs */ -} - -/* - * This is the fast-path for down_read/up_read, it only needs to ensure - * there is no pending writer (atomic_read(write_ctr) == 0) and inc/dec the - * fast per-cpu counter. The writer uses synchronize_sched_expedited() to - * serialize with the preempt-disabled section below. - * - * The nontrivial part is that we should guarantee acquire/release semantics - * in case when - * - * R_W: down_write() comes after up_read(), the writer should see all - * changes done by the reader - * or - * W_R: down_read() comes after up_write(), the reader should see all - * changes done by the writer - * - * If this helper fails the callers rely on the normal rw_semaphore and - * atomic_dec_and_test(), so in this case we have the necessary barriers. - * - * But if it succeeds we do not have any barriers, atomic_read(write_ctr) or - * __this_cpu_add() below can be reordered with any LOAD/STORE done by the - * reader inside the critical section. See the comments in down_write and - * up_write below. - */ -static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val) -{ - bool success = false; - - preempt_disable(); - if (likely(!atomic_read(&brw->write_ctr))) { - __this_cpu_add(*brw->fast_read_ctr, val); - success = true; - } - preempt_enable(); - - return success; -} - -/* - * Like the normal down_read() this is not recursive, the writer can - * come after the first percpu_down_read() and create the deadlock. - * - * Note: returns with lock_is_held(brw->rw_sem) == T for lockdep, - * percpu_up_read() does rwsem_release(). This pairs with the usage - * of ->rw_sem in percpu_down/up_write(). - */ -void percpu_down_read(struct percpu_rw_semaphore *brw) -{ - might_sleep(); - if (likely(update_fast_ctr(brw, +1))) { - rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 0, _RET_IP_); - return; - } - - down_read(&brw->rw_sem); - atomic_inc(&brw->slow_read_ctr); - /* avoid up_read()->rwsem_release() */ - __up_read(&brw->rw_sem); -} - -void percpu_up_read(struct percpu_rw_semaphore *brw) -{ - rwsem_release(&brw->rw_sem.dep_map, 1, _RET_IP_); - - if (likely(update_fast_ctr(brw, -1))) - return; - - /* false-positive is possible but harmless */ - if (atomic_dec_and_test(&brw->slow_read_ctr)) - wake_up_all(&brw->write_waitq); -} - -static int clear_fast_ctr(struct percpu_rw_semaphore *brw) -{ - unsigned int sum = 0; - int cpu; - - for_each_possible_cpu(cpu) { - sum += per_cpu(*brw->fast_read_ctr, cpu); - per_cpu(*brw->fast_read_ctr, cpu) = 0; - } - - return sum; -} - -/* - * A writer increments ->write_ctr to force the readers to switch to the - * slow mode, note the atomic_read() check in update_fast_ctr(). - * - * After that the readers can only inc/dec the slow ->slow_read_ctr counter, - * ->fast_read_ctr is stable. Once the writer moves its sum into the slow - * counter it represents the number of active readers. - * - * Finally the writer takes ->rw_sem for writing and blocks the new readers, - * then waits until the slow counter becomes zero. - */ -void percpu_down_write(struct percpu_rw_semaphore *brw) -{ - /* tell update_fast_ctr() there is a pending writer */ - atomic_inc(&brw->write_ctr); - /* - * 1. Ensures that write_ctr != 0 is visible to any down_read/up_read - * so that update_fast_ctr() can't succeed. - * - * 2. Ensures we see the result of every previous this_cpu_add() in - * update_fast_ctr(). - * - * 3. Ensures that if any reader has exited its critical section via - * fast-path, it executes a full memory barrier before we return. - * See R_W case in the comment above update_fast_ctr(). - */ - synchronize_sched_expedited(); - - /* exclude other writers, and block the new readers completely */ - down_write(&brw->rw_sem); - - /* nobody can use fast_read_ctr, move its sum into slow_read_ctr */ - atomic_add(clear_fast_ctr(brw), &brw->slow_read_ctr); - - /* wait for all readers to complete their percpu_up_read() */ - wait_event(brw->write_waitq, !atomic_read(&brw->slow_read_ctr)); -} - -void percpu_up_write(struct percpu_rw_semaphore *brw) -{ - /* release the lock, but the readers can't use the fast-path */ - up_write(&brw->rw_sem); - /* - * Insert the barrier before the next fast-path in down_read, - * see W_R case in the comment above update_fast_ctr(). - */ - synchronize_sched_expedited(); - /* the last writer unblocks update_fast_ctr() */ - atomic_dec(&brw->write_ctr); -} diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c index 93c5d5ecff4..7473ee3b4ee 100644 --- a/lib/percpu_counter.c +++ b/lib/percpu_counter.c @@ -60,14 +60,15 @@ static inline void debug_percpu_counter_deactivate(struct percpu_counter *fbc) void percpu_counter_set(struct percpu_counter *fbc, s64 amount) { int cpu; + unsigned long flags; - raw_spin_lock(&fbc->lock); + raw_spin_lock_irqsave(&fbc->lock, flags); for_each_possible_cpu(cpu) { s32 *pcount = per_cpu_ptr(fbc->counters, cpu); *pcount = 0; } fbc->count = amount; - raw_spin_unlock(&fbc->lock); + raw_spin_unlock_irqrestore(&fbc->lock, flags); } EXPORT_SYMBOL(percpu_counter_set); @@ -78,9 +79,10 @@ void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch) preempt_disable(); count = __this_cpu_read(*fbc->counters) + amount; if (count >= batch || count <= -batch) { - raw_spin_lock(&fbc->lock); + unsigned long flags; + raw_spin_lock_irqsave(&fbc->lock, flags); fbc->count += count; - raw_spin_unlock(&fbc->lock); + raw_spin_unlock_irqrestore(&fbc->lock, flags); __this_cpu_write(*fbc->counters, 0); } else { __this_cpu_write(*fbc->counters, count); @@ -97,14 +99,15 @@ s64 __percpu_counter_sum(struct percpu_counter *fbc) { s64 ret; int cpu; + unsigned long flags; - raw_spin_lock(&fbc->lock); + raw_spin_lock_irqsave(&fbc->lock, flags); ret = fbc->count; for_each_online_cpu(cpu) { s32 *pcount = per_cpu_ptr(fbc->counters, cpu); ret += *pcount; } - raw_spin_unlock(&fbc->lock); + raw_spin_unlock_irqrestore(&fbc->lock, flags); return ret; } EXPORT_SYMBOL(__percpu_counter_sum); diff --git a/lib/percpu_ida.c b/lib/percpu_ida.c index bab1ba2a4c7..9d054bf91d0 100644 --- a/lib/percpu_ida.c +++ b/lib/percpu_ida.c @@ -30,15 +30,6 @@ #include <linux/spinlock.h> #include <linux/percpu_ida.h> -/* - * Number of tags we move between the percpu freelist and the global freelist at - * a time - */ -#define IDA_PCPU_BATCH_MOVE 32U - -/* Max size of percpu freelist, */ -#define IDA_PCPU_SIZE ((IDA_PCPU_BATCH_MOVE * 3) / 2) - struct percpu_ida_cpu { /* * Even though this is percpu, we need a lock for tag stealing by remote @@ -78,7 +69,7 @@ static inline void steal_tags(struct percpu_ida *pool, struct percpu_ida_cpu *remote; for (cpus_have_tags = cpumask_weight(&pool->cpus_have_tags); - cpus_have_tags * IDA_PCPU_SIZE > pool->nr_tags / 2; + cpus_have_tags * pool->percpu_max_size > pool->nr_tags / 2; cpus_have_tags--) { cpu = cpumask_next(cpu, &pool->cpus_have_tags); @@ -123,11 +114,10 @@ static inline void alloc_global_tags(struct percpu_ida *pool, { move_tags(tags->freelist, &tags->nr_free, pool->freelist, &pool->nr_free, - min(pool->nr_free, IDA_PCPU_BATCH_MOVE)); + min(pool->nr_free, pool->percpu_batch_size)); } -static inline unsigned alloc_local_tag(struct percpu_ida *pool, - struct percpu_ida_cpu *tags) +static inline unsigned alloc_local_tag(struct percpu_ida_cpu *tags) { int tag = -ENOSPC; @@ -168,7 +158,7 @@ int percpu_ida_alloc(struct percpu_ida *pool, gfp_t gfp) tags = this_cpu_ptr(pool->tag_cpu); /* Fastpath */ - tag = alloc_local_tag(pool, tags); + tag = alloc_local_tag(tags); if (likely(tag >= 0)) { local_irq_restore(flags); return tag; @@ -245,17 +235,17 @@ void percpu_ida_free(struct percpu_ida *pool, unsigned tag) wake_up(&pool->wait); } - if (nr_free == IDA_PCPU_SIZE) { + if (nr_free == pool->percpu_max_size) { spin_lock(&pool->lock); /* * Global lock held and irqs disabled, don't need percpu * lock */ - if (tags->nr_free == IDA_PCPU_SIZE) { + if (tags->nr_free == pool->percpu_max_size) { move_tags(pool->freelist, &pool->nr_free, tags->freelist, &tags->nr_free, - IDA_PCPU_BATCH_MOVE); + pool->percpu_batch_size); wake_up(&pool->wait); } @@ -292,7 +282,8 @@ EXPORT_SYMBOL_GPL(percpu_ida_destroy); * Allocation is percpu, but sharding is limited by nr_tags - for best * performance, the workload should not span more cpus than nr_tags / 128. */ -int percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags) +int __percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags, + unsigned long max_size, unsigned long batch_size) { unsigned i, cpu, order; @@ -301,6 +292,8 @@ int percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags) init_waitqueue_head(&pool->wait); spin_lock_init(&pool->lock); pool->nr_tags = nr_tags; + pool->percpu_max_size = max_size; + pool->percpu_batch_size = batch_size; /* Guard against overflow */ if (nr_tags > (unsigned) INT_MAX + 1) { @@ -319,7 +312,7 @@ int percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags) pool->nr_free = nr_tags; pool->tag_cpu = __alloc_percpu(sizeof(struct percpu_ida_cpu) + - IDA_PCPU_SIZE * sizeof(unsigned), + pool->percpu_max_size * sizeof(unsigned), sizeof(unsigned)); if (!pool->tag_cpu) goto err; @@ -332,4 +325,65 @@ err: percpu_ida_destroy(pool); return -ENOMEM; } -EXPORT_SYMBOL_GPL(percpu_ida_init); +EXPORT_SYMBOL_GPL(__percpu_ida_init); + +/** + * percpu_ida_for_each_free - iterate free ids of a pool + * @pool: pool to iterate + * @fn: interate callback function + * @data: parameter for @fn + * + * Note, this doesn't guarantee to iterate all free ids restrictly. Some free + * ids might be missed, some might be iterated duplicated, and some might + * be iterated and not free soon. + */ +int percpu_ida_for_each_free(struct percpu_ida *pool, percpu_ida_cb fn, + void *data) +{ + unsigned long flags; + struct percpu_ida_cpu *remote; + unsigned cpu, i, err = 0; + + local_irq_save(flags); + for_each_possible_cpu(cpu) { + remote = per_cpu_ptr(pool->tag_cpu, cpu); + spin_lock(&remote->lock); + for (i = 0; i < remote->nr_free; i++) { + err = fn(remote->freelist[i], data); + if (err) + break; + } + spin_unlock(&remote->lock); + if (err) + goto out; + } + + spin_lock(&pool->lock); + for (i = 0; i < pool->nr_free; i++) { + err = fn(pool->freelist[i], data); + if (err) + break; + } + spin_unlock(&pool->lock); +out: + local_irq_restore(flags); + return err; +} +EXPORT_SYMBOL_GPL(percpu_ida_for_each_free); + +/** + * percpu_ida_free_tags - return free tags number of a specific cpu or global pool + * @pool: pool related + * @cpu: specific cpu or global pool if @cpu == nr_cpu_ids + * + * Note: this just returns a snapshot of free tags number. + */ +unsigned percpu_ida_free_tags(struct percpu_ida *pool, int cpu) +{ + struct percpu_ida_cpu *remote; + if (cpu == nr_cpu_ids) + return pool->nr_free; + remote = per_cpu_ptr(pool->tag_cpu, cpu); + return remote->nr_free; +} +EXPORT_SYMBOL_GPL(percpu_ida_free_tags); diff --git a/lib/random32.c b/lib/random32.c index 82da4f4c348..1e5b2df4429 100644 --- a/lib/random32.c +++ b/lib/random32.c @@ -214,18 +214,22 @@ static DEFINE_TIMER(seed_timer, __prandom_timer, 0, 0); static void __prandom_timer(unsigned long dontcare) { u32 entropy; + unsigned long expires; get_random_bytes(&entropy, sizeof(entropy)); prandom_seed(entropy); + /* reseed every ~60 seconds, in [40 .. 80) interval with slack */ - seed_timer.expires = jiffies + (40 * HZ + (prandom_u32() % (40 * HZ))); + expires = 40 + (prandom_u32() % 40); + seed_timer.expires = jiffies + msecs_to_jiffies(expires * MSEC_PER_SEC); + add_timer(&seed_timer); } -static void prandom_start_seed_timer(void) +static void __init __prandom_start_seed_timer(void) { set_timer_slack(&seed_timer, HZ); - seed_timer.expires = jiffies + 40 * HZ; + seed_timer.expires = jiffies + msecs_to_jiffies(40 * MSEC_PER_SEC); add_timer(&seed_timer); } @@ -270,7 +274,7 @@ void prandom_reseed_late(void) static int __init prandom_reseed(void) { __prandom_reseed(false); - prandom_start_seed_timer(); + __prandom_start_seed_timer(); return 0; } late_initcall(prandom_reseed); diff --git a/lib/rwsem-spinlock.c b/lib/rwsem-spinlock.c deleted file mode 100644 index 9be8a914497..00000000000 --- a/lib/rwsem-spinlock.c +++ /dev/null @@ -1,296 +0,0 @@ -/* rwsem-spinlock.c: R/W semaphores: contention handling functions for - * generic spinlock implementation - * - * Copyright (c) 2001 David Howells (dhowells@redhat.com). - * - Derived partially from idea by Andrea Arcangeli <andrea@suse.de> - * - Derived also from comments by Linus - */ -#include <linux/rwsem.h> -#include <linux/sched.h> -#include <linux/export.h> - -enum rwsem_waiter_type { - RWSEM_WAITING_FOR_WRITE, - RWSEM_WAITING_FOR_READ -}; - -struct rwsem_waiter { - struct list_head list; - struct task_struct *task; - enum rwsem_waiter_type type; -}; - -int rwsem_is_locked(struct rw_semaphore *sem) -{ - int ret = 1; - unsigned long flags; - - if (raw_spin_trylock_irqsave(&sem->wait_lock, flags)) { - ret = (sem->activity != 0); - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - } - return ret; -} -EXPORT_SYMBOL(rwsem_is_locked); - -/* - * initialise the semaphore - */ -void __init_rwsem(struct rw_semaphore *sem, const char *name, - struct lock_class_key *key) -{ -#ifdef CONFIG_DEBUG_LOCK_ALLOC - /* - * Make sure we are not reinitializing a held semaphore: - */ - debug_check_no_locks_freed((void *)sem, sizeof(*sem)); - lockdep_init_map(&sem->dep_map, name, key, 0); -#endif - sem->activity = 0; - raw_spin_lock_init(&sem->wait_lock); - INIT_LIST_HEAD(&sem->wait_list); -} -EXPORT_SYMBOL(__init_rwsem); - -/* - * handle the lock release when processes blocked on it that can now run - * - if we come here, then: - * - the 'active count' _reached_ zero - * - the 'waiting count' is non-zero - * - the spinlock must be held by the caller - * - woken process blocks are discarded from the list after having task zeroed - * - writers are only woken if wakewrite is non-zero - */ -static inline struct rw_semaphore * -__rwsem_do_wake(struct rw_semaphore *sem, int wakewrite) -{ - struct rwsem_waiter *waiter; - struct task_struct *tsk; - int woken; - - waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); - - if (waiter->type == RWSEM_WAITING_FOR_WRITE) { - if (wakewrite) - /* Wake up a writer. Note that we do not grant it the - * lock - it will have to acquire it when it runs. */ - wake_up_process(waiter->task); - goto out; - } - - /* grant an infinite number of read locks to the front of the queue */ - woken = 0; - do { - struct list_head *next = waiter->list.next; - - list_del(&waiter->list); - tsk = waiter->task; - smp_mb(); - waiter->task = NULL; - wake_up_process(tsk); - put_task_struct(tsk); - woken++; - if (next == &sem->wait_list) - break; - waiter = list_entry(next, struct rwsem_waiter, list); - } while (waiter->type != RWSEM_WAITING_FOR_WRITE); - - sem->activity += woken; - - out: - return sem; -} - -/* - * wake a single writer - */ -static inline struct rw_semaphore * -__rwsem_wake_one_writer(struct rw_semaphore *sem) -{ - struct rwsem_waiter *waiter; - - waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); - wake_up_process(waiter->task); - - return sem; -} - -/* - * get a read lock on the semaphore - */ -void __sched __down_read(struct rw_semaphore *sem) -{ - struct rwsem_waiter waiter; - struct task_struct *tsk; - unsigned long flags; - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - if (sem->activity >= 0 && list_empty(&sem->wait_list)) { - /* granted */ - sem->activity++; - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - goto out; - } - - tsk = current; - set_task_state(tsk, TASK_UNINTERRUPTIBLE); - - /* set up my own style of waitqueue */ - waiter.task = tsk; - waiter.type = RWSEM_WAITING_FOR_READ; - get_task_struct(tsk); - - list_add_tail(&waiter.list, &sem->wait_list); - - /* we don't need to touch the semaphore struct anymore */ - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - - /* wait to be given the lock */ - for (;;) { - if (!waiter.task) - break; - schedule(); - set_task_state(tsk, TASK_UNINTERRUPTIBLE); - } - - tsk->state = TASK_RUNNING; - out: - ; -} - -/* - * trylock for reading -- returns 1 if successful, 0 if contention - */ -int __down_read_trylock(struct rw_semaphore *sem) -{ - unsigned long flags; - int ret = 0; - - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - if (sem->activity >= 0 && list_empty(&sem->wait_list)) { - /* granted */ - sem->activity++; - ret = 1; - } - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - - return ret; -} - -/* - * get a write lock on the semaphore - */ -void __sched __down_write_nested(struct rw_semaphore *sem, int subclass) -{ - struct rwsem_waiter waiter; - struct task_struct *tsk; - unsigned long flags; - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - /* set up my own style of waitqueue */ - tsk = current; - waiter.task = tsk; - waiter.type = RWSEM_WAITING_FOR_WRITE; - list_add_tail(&waiter.list, &sem->wait_list); - - /* wait for someone to release the lock */ - for (;;) { - /* - * That is the key to support write lock stealing: allows the - * task already on CPU to get the lock soon rather than put - * itself into sleep and waiting for system woke it or someone - * else in the head of the wait list up. - */ - if (sem->activity == 0) - break; - set_task_state(tsk, TASK_UNINTERRUPTIBLE); - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - schedule(); - raw_spin_lock_irqsave(&sem->wait_lock, flags); - } - /* got the lock */ - sem->activity = -1; - list_del(&waiter.list); - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); -} - -void __sched __down_write(struct rw_semaphore *sem) -{ - __down_write_nested(sem, 0); -} - -/* - * trylock for writing -- returns 1 if successful, 0 if contention - */ -int __down_write_trylock(struct rw_semaphore *sem) -{ - unsigned long flags; - int ret = 0; - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - if (sem->activity == 0) { - /* got the lock */ - sem->activity = -1; - ret = 1; - } - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - - return ret; -} - -/* - * release a read lock on the semaphore - */ -void __up_read(struct rw_semaphore *sem) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - if (--sem->activity == 0 && !list_empty(&sem->wait_list)) - sem = __rwsem_wake_one_writer(sem); - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); -} - -/* - * release a write lock on the semaphore - */ -void __up_write(struct rw_semaphore *sem) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - sem->activity = 0; - if (!list_empty(&sem->wait_list)) - sem = __rwsem_do_wake(sem, 1); - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); -} - -/* - * downgrade a write lock into a read lock - * - just wake up any readers at the front of the queue - */ -void __downgrade_write(struct rw_semaphore *sem) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - sem->activity = 1; - if (!list_empty(&sem->wait_list)) - sem = __rwsem_do_wake(sem, 0); - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); -} - diff --git a/lib/rwsem.c b/lib/rwsem.c deleted file mode 100644 index 19c5fa95e0b..00000000000 --- a/lib/rwsem.c +++ /dev/null @@ -1,293 +0,0 @@ -/* rwsem.c: R/W semaphores: contention handling functions - * - * Written by David Howells (dhowells@redhat.com). - * Derived from arch/i386/kernel/semaphore.c - * - * Writer lock-stealing by Alex Shi <alex.shi@intel.com> - * and Michel Lespinasse <walken@google.com> - */ -#include <linux/rwsem.h> -#include <linux/sched.h> -#include <linux/init.h> -#include <linux/export.h> - -/* - * Initialize an rwsem: - */ -void __init_rwsem(struct rw_semaphore *sem, const char *name, - struct lock_class_key *key) -{ -#ifdef CONFIG_DEBUG_LOCK_ALLOC - /* - * Make sure we are not reinitializing a held semaphore: - */ - debug_check_no_locks_freed((void *)sem, sizeof(*sem)); - lockdep_init_map(&sem->dep_map, name, key, 0); -#endif - sem->count = RWSEM_UNLOCKED_VALUE; - raw_spin_lock_init(&sem->wait_lock); - INIT_LIST_HEAD(&sem->wait_list); -} - -EXPORT_SYMBOL(__init_rwsem); - -enum rwsem_waiter_type { - RWSEM_WAITING_FOR_WRITE, - RWSEM_WAITING_FOR_READ -}; - -struct rwsem_waiter { - struct list_head list; - struct task_struct *task; - enum rwsem_waiter_type type; -}; - -enum rwsem_wake_type { - RWSEM_WAKE_ANY, /* Wake whatever's at head of wait list */ - RWSEM_WAKE_READERS, /* Wake readers only */ - RWSEM_WAKE_READ_OWNED /* Waker thread holds the read lock */ -}; - -/* - * handle the lock release when processes blocked on it that can now run - * - if we come here from up_xxxx(), then: - * - the 'active part' of count (&0x0000ffff) reached 0 (but may have changed) - * - the 'waiting part' of count (&0xffff0000) is -ve (and will still be so) - * - there must be someone on the queue - * - the spinlock must be held by the caller - * - woken process blocks are discarded from the list after having task zeroed - * - writers are only woken if downgrading is false - */ -static struct rw_semaphore * -__rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) -{ - struct rwsem_waiter *waiter; - struct task_struct *tsk; - struct list_head *next; - long oldcount, woken, loop, adjustment; - - waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); - if (waiter->type == RWSEM_WAITING_FOR_WRITE) { - if (wake_type == RWSEM_WAKE_ANY) - /* Wake writer at the front of the queue, but do not - * grant it the lock yet as we want other writers - * to be able to steal it. Readers, on the other hand, - * will block as they will notice the queued writer. - */ - wake_up_process(waiter->task); - goto out; - } - - /* Writers might steal the lock before we grant it to the next reader. - * We prefer to do the first reader grant before counting readers - * so we can bail out early if a writer stole the lock. - */ - adjustment = 0; - if (wake_type != RWSEM_WAKE_READ_OWNED) { - adjustment = RWSEM_ACTIVE_READ_BIAS; - try_reader_grant: - oldcount = rwsem_atomic_update(adjustment, sem) - adjustment; - if (unlikely(oldcount < RWSEM_WAITING_BIAS)) { - /* A writer stole the lock. Undo our reader grant. */ - if (rwsem_atomic_update(-adjustment, sem) & - RWSEM_ACTIVE_MASK) - goto out; - /* Last active locker left. Retry waking readers. */ - goto try_reader_grant; - } - } - - /* Grant an infinite number of read locks to the readers at the front - * of the queue. Note we increment the 'active part' of the count by - * the number of readers before waking any processes up. - */ - woken = 0; - do { - woken++; - - if (waiter->list.next == &sem->wait_list) - break; - - waiter = list_entry(waiter->list.next, - struct rwsem_waiter, list); - - } while (waiter->type != RWSEM_WAITING_FOR_WRITE); - - adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment; - if (waiter->type != RWSEM_WAITING_FOR_WRITE) - /* hit end of list above */ - adjustment -= RWSEM_WAITING_BIAS; - - if (adjustment) - rwsem_atomic_add(adjustment, sem); - - next = sem->wait_list.next; - loop = woken; - do { - waiter = list_entry(next, struct rwsem_waiter, list); - next = waiter->list.next; - tsk = waiter->task; - smp_mb(); - waiter->task = NULL; - wake_up_process(tsk); - put_task_struct(tsk); - } while (--loop); - - sem->wait_list.next = next; - next->prev = &sem->wait_list; - - out: - return sem; -} - -/* - * wait for the read lock to be granted - */ -struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) -{ - long count, adjustment = -RWSEM_ACTIVE_READ_BIAS; - struct rwsem_waiter waiter; - struct task_struct *tsk = current; - - /* set up my own style of waitqueue */ - waiter.task = tsk; - waiter.type = RWSEM_WAITING_FOR_READ; - get_task_struct(tsk); - - raw_spin_lock_irq(&sem->wait_lock); - if (list_empty(&sem->wait_list)) - adjustment += RWSEM_WAITING_BIAS; - list_add_tail(&waiter.list, &sem->wait_list); - - /* we're now waiting on the lock, but no longer actively locking */ - count = rwsem_atomic_update(adjustment, sem); - - /* If there are no active locks, wake the front queued process(es). - * - * If there are no writers and we are first in the queue, - * wake our own waiter to join the existing active readers ! - */ - if (count == RWSEM_WAITING_BIAS || - (count > RWSEM_WAITING_BIAS && - adjustment != -RWSEM_ACTIVE_READ_BIAS)) - sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY); - - raw_spin_unlock_irq(&sem->wait_lock); - - /* wait to be given the lock */ - while (true) { - set_task_state(tsk, TASK_UNINTERRUPTIBLE); - if (!waiter.task) - break; - schedule(); - } - - tsk->state = TASK_RUNNING; - - return sem; -} - -/* - * wait until we successfully acquire the write lock - */ -struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) -{ - long count, adjustment = -RWSEM_ACTIVE_WRITE_BIAS; - struct rwsem_waiter waiter; - struct task_struct *tsk = current; - - /* set up my own style of waitqueue */ - waiter.task = tsk; - waiter.type = RWSEM_WAITING_FOR_WRITE; - - raw_spin_lock_irq(&sem->wait_lock); - if (list_empty(&sem->wait_list)) - adjustment += RWSEM_WAITING_BIAS; - list_add_tail(&waiter.list, &sem->wait_list); - - /* we're now waiting on the lock, but no longer actively locking */ - count = rwsem_atomic_update(adjustment, sem); - - /* If there were already threads queued before us and there are no - * active writers, the lock must be read owned; so we try to wake - * any read locks that were queued ahead of us. */ - if (count > RWSEM_WAITING_BIAS && - adjustment == -RWSEM_ACTIVE_WRITE_BIAS) - sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS); - - /* wait until we successfully acquire the lock */ - set_task_state(tsk, TASK_UNINTERRUPTIBLE); - while (true) { - if (!(count & RWSEM_ACTIVE_MASK)) { - /* Try acquiring the write lock. */ - count = RWSEM_ACTIVE_WRITE_BIAS; - if (!list_is_singular(&sem->wait_list)) - count += RWSEM_WAITING_BIAS; - - if (sem->count == RWSEM_WAITING_BIAS && - cmpxchg(&sem->count, RWSEM_WAITING_BIAS, count) == - RWSEM_WAITING_BIAS) - break; - } - - raw_spin_unlock_irq(&sem->wait_lock); - - /* Block until there are no active lockers. */ - do { - schedule(); - set_task_state(tsk, TASK_UNINTERRUPTIBLE); - } while ((count = sem->count) & RWSEM_ACTIVE_MASK); - - raw_spin_lock_irq(&sem->wait_lock); - } - - list_del(&waiter.list); - raw_spin_unlock_irq(&sem->wait_lock); - tsk->state = TASK_RUNNING; - - return sem; -} - -/* - * handle waking up a waiter on the semaphore - * - up_read/up_write has decremented the active part of count if we come here - */ -struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - /* do nothing if list empty */ - if (!list_empty(&sem->wait_list)) - sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY); - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - - return sem; -} - -/* - * downgrade a write lock into a read lock - * - caller incremented waiting part of count and discovered it still negative - * - just wake up any readers at the front of the queue - */ -struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - /* do nothing if list empty */ - if (!list_empty(&sem->wait_list)) - sem = __rwsem_do_wake(sem, RWSEM_WAKE_READ_OWNED); - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - - return sem; -} - -EXPORT_SYMBOL(rwsem_down_read_failed); -EXPORT_SYMBOL(rwsem_down_write_failed); -EXPORT_SYMBOL(rwsem_wake); -EXPORT_SYMBOL(rwsem_downgrade_wake); diff --git a/lib/spinlock_debug.c b/lib/spinlock_debug.c deleted file mode 100644 index 0374a596cff..00000000000 --- a/lib/spinlock_debug.c +++ /dev/null @@ -1,302 +0,0 @@ -/* - * Copyright 2005, Red Hat, Inc., Ingo Molnar - * Released under the General Public License (GPL). - * - * This file contains the spinlock/rwlock implementations for - * DEBUG_SPINLOCK. - */ - -#include <linux/spinlock.h> -#include <linux/nmi.h> -#include <linux/interrupt.h> -#include <linux/debug_locks.h> -#include <linux/delay.h> -#include <linux/export.h> - -void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name, - struct lock_class_key *key) -{ -#ifdef CONFIG_DEBUG_LOCK_ALLOC - /* - * Make sure we are not reinitializing a held lock: - */ - debug_check_no_locks_freed((void *)lock, sizeof(*lock)); - lockdep_init_map(&lock->dep_map, name, key, 0); -#endif - lock->raw_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; - lock->magic = SPINLOCK_MAGIC; - lock->owner = SPINLOCK_OWNER_INIT; - lock->owner_cpu = -1; -} - -EXPORT_SYMBOL(__raw_spin_lock_init); - -void __rwlock_init(rwlock_t *lock, const char *name, - struct lock_class_key *key) -{ -#ifdef CONFIG_DEBUG_LOCK_ALLOC - /* - * Make sure we are not reinitializing a held lock: - */ - debug_check_no_locks_freed((void *)lock, sizeof(*lock)); - lockdep_init_map(&lock->dep_map, name, key, 0); -#endif - lock->raw_lock = (arch_rwlock_t) __ARCH_RW_LOCK_UNLOCKED; - lock->magic = RWLOCK_MAGIC; - lock->owner = SPINLOCK_OWNER_INIT; - lock->owner_cpu = -1; -} - -EXPORT_SYMBOL(__rwlock_init); - -static void spin_dump(raw_spinlock_t *lock, const char *msg) -{ - struct task_struct *owner = NULL; - - if (lock->owner && lock->owner != SPINLOCK_OWNER_INIT) - owner = lock->owner; - printk(KERN_EMERG "BUG: spinlock %s on CPU#%d, %s/%d\n", - msg, raw_smp_processor_id(), - current->comm, task_pid_nr(current)); - printk(KERN_EMERG " lock: %pS, .magic: %08x, .owner: %s/%d, " - ".owner_cpu: %d\n", - lock, lock->magic, - owner ? owner->comm : "<none>", - owner ? task_pid_nr(owner) : -1, - lock->owner_cpu); - dump_stack(); -} - -static void spin_bug(raw_spinlock_t *lock, const char *msg) -{ - if (!debug_locks_off()) - return; - - spin_dump(lock, msg); -} - -#define SPIN_BUG_ON(cond, lock, msg) if (unlikely(cond)) spin_bug(lock, msg) - -static inline void -debug_spin_lock_before(raw_spinlock_t *lock) -{ - SPIN_BUG_ON(lock->magic != SPINLOCK_MAGIC, lock, "bad magic"); - SPIN_BUG_ON(lock->owner == current, lock, "recursion"); - SPIN_BUG_ON(lock->owner_cpu == raw_smp_processor_id(), - lock, "cpu recursion"); -} - -static inline void debug_spin_lock_after(raw_spinlock_t *lock) -{ - lock->owner_cpu = raw_smp_processor_id(); - lock->owner = current; -} - -static inline void debug_spin_unlock(raw_spinlock_t *lock) -{ - SPIN_BUG_ON(lock->magic != SPINLOCK_MAGIC, lock, "bad magic"); - SPIN_BUG_ON(!raw_spin_is_locked(lock), lock, "already unlocked"); - SPIN_BUG_ON(lock->owner != current, lock, "wrong owner"); - SPIN_BUG_ON(lock->owner_cpu != raw_smp_processor_id(), - lock, "wrong CPU"); - lock->owner = SPINLOCK_OWNER_INIT; - lock->owner_cpu = -1; -} - -static void __spin_lock_debug(raw_spinlock_t *lock) -{ - u64 i; - u64 loops = loops_per_jiffy * HZ; - - for (i = 0; i < loops; i++) { - if (arch_spin_trylock(&lock->raw_lock)) - return; - __delay(1); - } - /* lockup suspected: */ - spin_dump(lock, "lockup suspected"); -#ifdef CONFIG_SMP - trigger_all_cpu_backtrace(); -#endif - - /* - * The trylock above was causing a livelock. Give the lower level arch - * specific lock code a chance to acquire the lock. We have already - * printed a warning/backtrace at this point. The non-debug arch - * specific code might actually succeed in acquiring the lock. If it is - * not successful, the end-result is the same - there is no forward - * progress. - */ - arch_spin_lock(&lock->raw_lock); -} - -void do_raw_spin_lock(raw_spinlock_t *lock) -{ - debug_spin_lock_before(lock); - if (unlikely(!arch_spin_trylock(&lock->raw_lock))) - __spin_lock_debug(lock); - debug_spin_lock_after(lock); -} - -int do_raw_spin_trylock(raw_spinlock_t *lock) -{ - int ret = arch_spin_trylock(&lock->raw_lock); - - if (ret) - debug_spin_lock_after(lock); -#ifndef CONFIG_SMP - /* - * Must not happen on UP: - */ - SPIN_BUG_ON(!ret, lock, "trylock failure on UP"); -#endif - return ret; -} - -void do_raw_spin_unlock(raw_spinlock_t *lock) -{ - debug_spin_unlock(lock); - arch_spin_unlock(&lock->raw_lock); -} - -static void rwlock_bug(rwlock_t *lock, const char *msg) -{ - if (!debug_locks_off()) - return; - - printk(KERN_EMERG "BUG: rwlock %s on CPU#%d, %s/%d, %p\n", - msg, raw_smp_processor_id(), current->comm, - task_pid_nr(current), lock); - dump_stack(); -} - -#define RWLOCK_BUG_ON(cond, lock, msg) if (unlikely(cond)) rwlock_bug(lock, msg) - -#if 0 /* __write_lock_debug() can lock up - maybe this can too? */ -static void __read_lock_debug(rwlock_t *lock) -{ - u64 i; - u64 loops = loops_per_jiffy * HZ; - int print_once = 1; - - for (;;) { - for (i = 0; i < loops; i++) { - if (arch_read_trylock(&lock->raw_lock)) - return; - __delay(1); - } - /* lockup suspected: */ - if (print_once) { - print_once = 0; - printk(KERN_EMERG "BUG: read-lock lockup on CPU#%d, " - "%s/%d, %p\n", - raw_smp_processor_id(), current->comm, - current->pid, lock); - dump_stack(); - } - } -} -#endif - -void do_raw_read_lock(rwlock_t *lock) -{ - RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic"); - arch_read_lock(&lock->raw_lock); -} - -int do_raw_read_trylock(rwlock_t *lock) -{ - int ret = arch_read_trylock(&lock->raw_lock); - -#ifndef CONFIG_SMP - /* - * Must not happen on UP: - */ - RWLOCK_BUG_ON(!ret, lock, "trylock failure on UP"); -#endif - return ret; -} - -void do_raw_read_unlock(rwlock_t *lock) -{ - RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic"); - arch_read_unlock(&lock->raw_lock); -} - -static inline void debug_write_lock_before(rwlock_t *lock) -{ - RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic"); - RWLOCK_BUG_ON(lock->owner == current, lock, "recursion"); - RWLOCK_BUG_ON(lock->owner_cpu == raw_smp_processor_id(), - lock, "cpu recursion"); -} - -static inline void debug_write_lock_after(rwlock_t *lock) -{ - lock->owner_cpu = raw_smp_processor_id(); - lock->owner = current; -} - -static inline void debug_write_unlock(rwlock_t *lock) -{ - RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic"); - RWLOCK_BUG_ON(lock->owner != current, lock, "wrong owner"); - RWLOCK_BUG_ON(lock->owner_cpu != raw_smp_processor_id(), - lock, "wrong CPU"); - lock->owner = SPINLOCK_OWNER_INIT; - lock->owner_cpu = -1; -} - -#if 0 /* This can cause lockups */ -static void __write_lock_debug(rwlock_t *lock) -{ - u64 i; - u64 loops = loops_per_jiffy * HZ; - int print_once = 1; - - for (;;) { - for (i = 0; i < loops; i++) { - if (arch_write_trylock(&lock->raw_lock)) - return; - __delay(1); - } - /* lockup suspected: */ - if (print_once) { - print_once = 0; - printk(KERN_EMERG "BUG: write-lock lockup on CPU#%d, " - "%s/%d, %p\n", - raw_smp_processor_id(), current->comm, - current->pid, lock); - dump_stack(); - } - } -} -#endif - -void do_raw_write_lock(rwlock_t *lock) -{ - debug_write_lock_before(lock); - arch_write_lock(&lock->raw_lock); - debug_write_lock_after(lock); -} - -int do_raw_write_trylock(rwlock_t *lock) -{ - int ret = arch_write_trylock(&lock->raw_lock); - - if (ret) - debug_write_lock_after(lock); -#ifndef CONFIG_SMP - /* - * Must not happen on UP: - */ - RWLOCK_BUG_ON(!ret, lock, "trylock failure on UP"); -#endif - return ret; -} - -void do_raw_write_unlock(rwlock_t *lock) -{ - debug_write_unlock(lock); - arch_write_unlock(&lock->raw_lock); -} diff --git a/lib/swiotlb.c b/lib/swiotlb.c index 4e8686c7e5a..e4399fa65ad 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -38,6 +38,9 @@ #include <linux/bootmem.h> #include <linux/iommu-helper.h> +#define CREATE_TRACE_POINTS +#include <trace/events/swiotlb.h> + #define OFFSET(val,align) ((unsigned long) \ ( (val) & ( (align) - 1))) @@ -502,6 +505,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, not_found: spin_unlock_irqrestore(&io_tlb_lock, flags); + dev_warn(hwdev, "swiotlb buffer is full\n"); return SWIOTLB_MAP_ERROR; found: spin_unlock_irqrestore(&io_tlb_lock, flags); @@ -726,6 +730,8 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, if (dma_capable(dev, dev_addr, size) && !swiotlb_force) return dev_addr; + trace_swiotlb_bounced(dev, dev_addr, size, swiotlb_force); + /* Oh well, have to allocate and map a bounce buffer. */ map = map_single(dev, phys, size, dir); if (map == SWIOTLB_MAP_ERROR) { diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 48586ac3a62..10909c57149 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -1712,18 +1712,16 @@ int vsnprintf(char *buf, size_t size, const char *fmt, va_list args) break; case FORMAT_TYPE_NRCHARS: { - u8 qualifier = spec.qualifier; + /* + * Since %n poses a greater security risk than + * utility, ignore %n and skip its argument. + */ + void *skip_arg; - if (qualifier == 'l') { - long *ip = va_arg(args, long *); - *ip = (str - buf); - } else if (_tolower(qualifier) == 'z') { - size_t *ip = va_arg(args, size_t *); - *ip = (str - buf); - } else { - int *ip = va_arg(args, int *); - *ip = (str - buf); - } + WARN_ONCE(1, "Please remove ignored %%n in '%s'\n", + old_fmt); + + skip_arg = va_arg(args, void *); break; } |