9 files changed, 268 insertions, 87 deletions
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 79e0dff1cdc..9e06b7f5ecf 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -410,6 +410,13 @@ config DEBUG_KMEMLEAK_TEST
 
 	  If unsure, say N.
 
+config DEBUG_KMEMLEAK_DEFAULT_OFF
+	bool "Default kmemleak to off"
+	depends on DEBUG_KMEMLEAK
+	help
+	  Say Y here to disable kmemleak by default. It can then be enabled
+	  on the command line via kmemleak=on.
+
 config DEBUG_PREEMPT
 	bool "Debug preemptible kernel"
 	depends on DEBUG_KERNEL && PREEMPT && TRACE_IRQFLAGS_SUPPORT
diff --git a/lib/flex_array.c b/lib/flex_array.c
index 41b1804fa72..77a6fea7481 100644
--- a/lib/flex_array.c
+++ b/lib/flex_array.c
@@ -171,6 +171,8 @@ __fa_get_part(struct flex_array *fa, int part_nr, gfp_t flags)
  * Note that this *copies* the contents of @src into
  * the array.  If you are trying to store an array of
  * pointers, make sure to pass in &ptr instead of ptr.
+ * You may instead wish to use the flex_array_put_ptr()
+ * helper function.
  *
  * Locking must be provided by the caller.
  */
@@ -265,7 +267,8 @@ int flex_array_prealloc(struct flex_array *fa, unsigned int start,
  *
  * Returns a pointer to the data at index @element_nr.  Note
  * that this is a copy of the data that was passed in.  If you
- * are using this to store pointers, you'll get back &ptr.
+ * are using this to store pointers, you'll get back &ptr.  You
+ * may instead wish to use the flex_array_get_ptr helper.
  *
  * Locking must be provided by the caller.
  */
@@ -286,6 +289,26 @@ void *flex_array_get(struct flex_array *fa, unsigned int element_nr)
 	return &part->elements[index_inside_part(fa, element_nr)];
 }
 
+/**
+ * flex_array_get_ptr - pull a ptr back out of the array
+ * @fa:		the flex array from which to extract data
+ * @element_nr:	index of the element to fetch from the array
+ *
+ * Returns the pointer placed in the flex array at element_nr using
+ * flex_array_put_ptr().  This function should not be called if the
+ * element in question was not set using the _put_ptr() helper.
+ */
+void *flex_array_get_ptr(struct flex_array *fa, unsigned int element_nr)
+{
+	void **tmp;
+
+	tmp = flex_array_get(fa, element_nr);
+	if (!tmp)
+		return NULL;
+
+	return *tmp;
+}
+
 static int part_is_free(struct flex_array_part *part)
 {
 	int i;
diff --git a/lib/iommu-helper.c b/lib/iommu-helper.c
index c0251f4ad08..da053313ee5 100644
--- a/lib/iommu-helper.c
+++ b/lib/iommu-helper.c
@@ -38,12 +38,3 @@ again:
 	return -1;
 }
 EXPORT_SYMBOL(iommu_area_alloc);
-
-unsigned long iommu_num_pages(unsigned long addr, unsigned long len,
-			      unsigned long io_page_size)
-{
-	unsigned long size = (addr & (io_page_size - 1)) + len;
-
-	return DIV_ROUND_UP(size, io_page_size);
-}
-EXPORT_SYMBOL(iommu_num_pages);
diff --git a/lib/list_debug.c b/lib/list_debug.c
index 1a39f4e3ae1..344c710d16c 100644
--- a/lib/list_debug.c
+++ b/lib/list_debug.c
@@ -43,6 +43,12 @@ EXPORT_SYMBOL(__list_add);
  */
 void list_del(struct list_head *entry)
 {
+	WARN(entry->next == LIST_POISON1,
+		"list_del corruption, next is LIST_POISON1 (%p)\n",
+		LIST_POISON1);
+	WARN(entry->next != LIST_POISON1 && entry->prev == LIST_POISON2,
+		"list_del corruption, prev is LIST_POISON2 (%p)\n",
+		LIST_POISON2);
 	WARN(entry->prev->next != entry,
 		"list_del corruption. prev->next should be %p, "
 		"but was %p\n", entry, entry->prev->next);
diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
index aeaa6d73444..ec9048e74f4 100644
--- a/lib/percpu_counter.c
+++ b/lib/percpu_counter.c
@@ -137,6 +137,33 @@ static int __cpuinit percpu_counter_hotcpu_callback(struct notifier_block *nb,
 	return NOTIFY_OK;
 }
 
+/*
+ * Compare counter against given value.
+ * Return 1 if greater, 0 if equal and -1 if less
+ */
+int percpu_counter_compare(struct percpu_counter *fbc, s64 rhs)
+{
+	s64	count;
+
+	count = percpu_counter_read(fbc);
+	/* Check to see if rough count will be sufficient for comparison */
+	if (abs(count - rhs) > (percpu_counter_batch*num_online_cpus())) {
+		if (count > rhs)
+			return 1;
+		else
+			return -1;
+	}
+	/* Need to use precise count */
+	count = percpu_counter_sum(fbc);
+	if (count > rhs)
+		return 1;
+	else if (count < rhs)
+		return -1;
+	else
+		return 0;
+}
+EXPORT_SYMBOL(percpu_counter_compare);
+
 static int __init percpu_counter_startup(void)
 {
 	compute_batch_value();
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 05da38bcc29..e907858498a 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -609,6 +609,100 @@ int radix_tree_tag_get(struct radix_tree_root *root,
 EXPORT_SYMBOL(radix_tree_tag_get);
 
 /**
+ * radix_tree_range_tag_if_tagged - for each item in given range set given
+ *				   tag if item has another tag set
+ * @root:		radix tree root
+ * @first_indexp:	pointer to a starting index of a range to scan
+ * @last_index:		last index of a range to scan
+ * @nr_to_tag:		maximum number items to tag
+ * @iftag:		tag index to test
+ * @settag:		tag index to set if tested tag is set
+ *
+ * This function scans range of radix tree from first_index to last_index
+ * (inclusive).  For each item in the range if iftag is set, the function sets
+ * also settag. The function stops either after tagging nr_to_tag items or
+ * after reaching last_index.
+ *
+ * The function returns number of leaves where the tag was set and sets
+ * *first_indexp to the first unscanned index.
+ */
+unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root,
+		unsigned long *first_indexp, unsigned long last_index,
+		unsigned long nr_to_tag,
+		unsigned int iftag, unsigned int settag)
+{
+	unsigned int height = root->height, shift;
+	unsigned long tagged = 0, index = *first_indexp;
+	struct radix_tree_node *open_slots[height], *slot;
+
+	last_index = min(last_index, radix_tree_maxindex(height));
+	if (index > last_index)
+		return 0;
+	if (!nr_to_tag)
+		return 0;
+	if (!root_tag_get(root, iftag)) {
+		*first_indexp = last_index + 1;
+		return 0;
+	}
+	if (height == 0) {
+		*first_indexp = last_index + 1;
+		root_tag_set(root, settag);
+		return 1;
+	}
+
+	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
+	slot = radix_tree_indirect_to_ptr(root->rnode);
+
+	for (;;) {
+		int offset;
+
+		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
+		if (!slot->slots[offset])
+			goto next;
+		if (!tag_get(slot, iftag, offset))
+			goto next;
+		tag_set(slot, settag, offset);
+		if (height == 1) {
+			tagged++;
+			goto next;
+		}
+		/* Go down one level */
+		height--;
+		shift -= RADIX_TREE_MAP_SHIFT;
+		open_slots[height] = slot;
+		slot = slot->slots[offset];
+		continue;
+next:
+		/* Go to next item at level determined by 'shift' */
+		index = ((index >> shift) + 1) << shift;
+		if (index > last_index)
+			break;
+		if (tagged >= nr_to_tag)
+			break;
+		while (((index >> shift) & RADIX_TREE_MAP_MASK) == 0) {
+			/*
+			 * We've fully scanned this node. Go up. Because
+			 * last_index is guaranteed to be in the tree, what
+			 * we do below cannot wander astray.
+			 */
+			slot = open_slots[height];
+			height++;
+			shift += RADIX_TREE_MAP_SHIFT;
+		}
+	}
+	/*
+	 * The iftag must have been set somewhere because otherwise
+	 * we would return immediated at the beginning of the function
+	 */
+	root_tag_set(root, settag);
+	*first_indexp = index;
+
+	return tagged;
+}
+EXPORT_SYMBOL(radix_tree_range_tag_if_tagged);
+
+
+/**
  *	radix_tree_next_hole    -    find the next hole (not-present entry)
  *	@root:		tree root
  *	@index:		index key
diff --git a/lib/rwsem.c b/lib/rwsem.c
index ceba8e28807..f236d7cd5cf 100644
--- a/lib/rwsem.c
+++ b/lib/rwsem.c
@@ -36,45 +36,56 @@ struct rwsem_waiter {
 #define RWSEM_WAITING_FOR_WRITE	0x00000002
 };
 
+/* Wake types for __rwsem_do_wake().  Note that RWSEM_WAKE_NO_ACTIVE and
+ * RWSEM_WAKE_READ_OWNED imply that the spinlock must have been kept held
+ * since the rwsem value was observed.
+ */
+#define RWSEM_WAKE_ANY        0 /* Wake whatever's at head of wait list */
+#define RWSEM_WAKE_NO_ACTIVE  1 /* rwsem was observed with no active thread */
+#define RWSEM_WAKE_READ_OWNED 2 /* rwsem was observed to be read owned */
+
 /*
  * handle the lock release when processes blocked on it that can now run
  * - if we come here from up_xxxx(), then:
  *   - the 'active part' of count (&0x0000ffff) reached 0 (but may have changed)
  *   - the 'waiting part' of count (&0xffff0000) is -ve (and will still be so)
- *   - there must be someone on the queue
+ * - there must be someone on the queue
  * - the spinlock must be held by the caller
  * - woken process blocks are discarded from the list after having task zeroed
  * - writers are only woken if downgrading is false
  */
-static inline struct rw_semaphore *
-__rwsem_do_wake(struct rw_semaphore *sem, int downgrading)
+static struct rw_semaphore *
+__rwsem_do_wake(struct rw_semaphore *sem, int wake_type)
 {
 	struct rwsem_waiter *waiter;
 	struct task_struct *tsk;
 	struct list_head *next;
-	signed long oldcount, woken, loop;
-
-	if (downgrading)
-		goto dont_wake_writers;
-
-	/* if we came through an up_xxxx() call, we only only wake someone up
-	 * if we can transition the active part of the count from 0 -> 1
-	 */
- try_again:
-	oldcount = rwsem_atomic_update(RWSEM_ACTIVE_BIAS, sem)
-						- RWSEM_ACTIVE_BIAS;
-	if (oldcount & RWSEM_ACTIVE_MASK)
-		goto undo;
+	signed long oldcount, woken, loop, adjustment;
 
 	waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
-
-	/* try to grant a single write lock if there's a writer at the front
-	 * of the queue - note we leave the 'active part' of the count
-	 * incremented by 1 and the waiting part incremented by 0x00010000
-	 */
 	if (!(waiter->flags & RWSEM_WAITING_FOR_WRITE))
 		goto readers_only;
 
+	if (wake_type == RWSEM_WAKE_READ_OWNED)
+		/* Another active reader was observed, so wakeup is not
+		 * likely to succeed. Save the atomic op.
+		 */
+		goto out;
+
+	/* There's a writer at the front of the queue - try to grant it the
+	 * write lock.  However, we only wake this writer if we can transition
+	 * the active part of the count from 0 -> 1
+	 */
+	adjustment = RWSEM_ACTIVE_WRITE_BIAS;
+	if (waiter->list.next == &sem->wait_list)
+		adjustment -= RWSEM_WAITING_BIAS;
+
+ try_again_write:
+	oldcount = rwsem_atomic_update(adjustment, sem) - adjustment;
+	if (oldcount & RWSEM_ACTIVE_MASK)
+		/* Someone grabbed the sem already */
+		goto undo_write;
+
 	/* We must be careful not to touch 'waiter' after we set ->task = NULL.
 	 * It is an allocated on the waiter's stack and may become invalid at
 	 * any time after that point (due to a wakeup from another source).
@@ -87,18 +98,30 @@ __rwsem_do_wake(struct rw_semaphore *sem, int downgrading)
 	put_task_struct(tsk);
 	goto out;
 
-	/* don't want to wake any writers */
- dont_wake_writers:
-	waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
-	if (waiter->flags & RWSEM_WAITING_FOR_WRITE)
+ readers_only:
+	/* If we come here from up_xxxx(), another thread might have reached
+	 * rwsem_down_failed_common() before we acquired the spinlock and
+	 * woken up a waiter, making it now active.  We prefer to check for
+	 * this first in order to not spend too much time with the spinlock
+	 * held if we're not going to be able to wake up readers in the end.
+	 *
+	 * Note that we do not need to update the rwsem count: any writer
+	 * trying to acquire rwsem will run rwsem_down_write_failed() due
+	 * to the waiting threads and block trying to acquire the spinlock.
+	 *
+	 * We use a dummy atomic update in order to acquire the cache line
+	 * exclusively since we expect to succeed and run the final rwsem
+	 * count adjustment pretty soon.
+	 */
+	if (wake_type == RWSEM_WAKE_ANY &&
+	    rwsem_atomic_update(0, sem) < RWSEM_WAITING_BIAS)
+		/* Someone grabbed the sem for write already */
 		goto out;
 
-	/* grant an infinite number of read locks to the readers at the front
-	 * of the queue
-	 * - note we increment the 'active part' of the count by the number of
-	 *   readers before waking any processes up
+	/* Grant an infinite number of read locks to the readers at the front
+	 * of the queue.  Note we increment the 'active part' of the count by
+	 * the number of readers before waking any processes up.
 	 */
- readers_only:
 	woken = 0;
 	do {
 		woken++;
@@ -111,16 +134,15 @@ __rwsem_do_wake(struct rw_semaphore *sem, int downgrading)
 
 	} while (waiter->flags & RWSEM_WAITING_FOR_READ);
 
-	loop = woken;
-	woken *= RWSEM_ACTIVE_BIAS - RWSEM_WAITING_BIAS;
-	if (!downgrading)
-		/* we'd already done one increment earlier */
-		woken -= RWSEM_ACTIVE_BIAS;
+	adjustment = woken * RWSEM_ACTIVE_READ_BIAS;
+	if (waiter->flags & RWSEM_WAITING_FOR_READ)
+		/* hit end of list above */
+		adjustment -= RWSEM_WAITING_BIAS;
 
-	rwsem_atomic_add(woken, sem);
+	rwsem_atomic_add(adjustment, sem);
 
 	next = sem->wait_list.next;
-	for (; loop > 0; loop--) {
+	for (loop = woken; loop > 0; loop--) {
 		waiter = list_entry(next, struct rwsem_waiter, list);
 		next = waiter->list.next;
 		tsk = waiter->task;
@@ -138,10 +160,10 @@ __rwsem_do_wake(struct rw_semaphore *sem, int downgrading)
 
 	/* undo the change to the active count, but check for a transition
 	 * 1->0 */
- undo:
-	if (rwsem_atomic_update(-RWSEM_ACTIVE_BIAS, sem) & RWSEM_ACTIVE_MASK)
+ undo_write:
+	if (rwsem_atomic_update(-adjustment, sem) & RWSEM_ACTIVE_MASK)
 		goto out;
-	goto try_again;
+	goto try_again_write;
 }
 
 /*
@@ -149,8 +171,9 @@ __rwsem_do_wake(struct rw_semaphore *sem, int downgrading)
  */
 static struct rw_semaphore __sched *
 rwsem_down_failed_common(struct rw_semaphore *sem,
-			struct rwsem_waiter *waiter, signed long adjustment)
+			 unsigned int flags, signed long adjustment)
 {
+	struct rwsem_waiter waiter;
 	struct task_struct *tsk = current;
 	signed long count;
 
@@ -158,23 +181,34 @@ rwsem_down_failed_common(struct rw_semaphore *sem,
 
 	/* set up my own style of waitqueue */
 	spin_lock_irq(&sem->wait_lock);
-	waiter->task = tsk;
+	waiter.task = tsk;
+	waiter.flags = flags;
 	get_task_struct(tsk);
 
-	list_add_tail(&waiter->list, &sem->wait_list);
+	if (list_empty(&sem->wait_list))
+		adjustment += RWSEM_WAITING_BIAS;
+	list_add_tail(&waiter.list, &sem->wait_list);
 
-	/* we're now waiting on the lock, but no longer actively read-locking */
+	/* we're now waiting on the lock, but no longer actively locking */
 	count = rwsem_atomic_update(adjustment, sem);
 
-	/* if there are no active locks, wake the front queued process(es) up */
-	if (!(count & RWSEM_ACTIVE_MASK))
-		sem = __rwsem_do_wake(sem, 0);
+	/* If there are no active locks, wake the front queued process(es) up.
+	 *
+	 * Alternatively, if we're called from a failed down_write(), there
+	 * were already threads queued before us and there are no active
+	 * writers, the lock must be read owned; so we try to wake any read
+	 * locks that were queued ahead of us. */
+	if (count == RWSEM_WAITING_BIAS)
+		sem = __rwsem_do_wake(sem, RWSEM_WAKE_NO_ACTIVE);
+	else if (count > RWSEM_WAITING_BIAS &&
+		 adjustment == -RWSEM_ACTIVE_WRITE_BIAS)
+		sem = __rwsem_do_wake(sem, RWSEM_WAKE_READ_OWNED);
 
 	spin_unlock_irq(&sem->wait_lock);
 
 	/* wait to be given the lock */
 	for (;;) {
-		if (!waiter->task)
+		if (!waiter.task)
 			break;
 		schedule();
 		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
@@ -191,12 +225,8 @@ rwsem_down_failed_common(struct rw_semaphore *sem,
 asmregparm struct rw_semaphore __sched *
 rwsem_down_read_failed(struct rw_semaphore *sem)
 {
-	struct rwsem_waiter waiter;
-
-	waiter.flags = RWSEM_WAITING_FOR_READ;
-	rwsem_down_failed_common(sem, &waiter,
-				RWSEM_WAITING_BIAS - RWSEM_ACTIVE_BIAS);
-	return sem;
+	return rwsem_down_failed_common(sem, RWSEM_WAITING_FOR_READ,
+					-RWSEM_ACTIVE_READ_BIAS);
 }
 
 /*
@@ -205,12 +235,8 @@ rwsem_down_read_failed(struct rw_semaphore *sem)
 asmregparm struct rw_semaphore __sched *
 rwsem_down_write_failed(struct rw_semaphore *sem)
 {
-	struct rwsem_waiter waiter;
-
-	waiter.flags = RWSEM_WAITING_FOR_WRITE;
-	rwsem_down_failed_common(sem, &waiter, -RWSEM_ACTIVE_BIAS);
-
-	return sem;
+	return rwsem_down_failed_common(sem, RWSEM_WAITING_FOR_WRITE,
+					-RWSEM_ACTIVE_WRITE_BIAS);
 }
 
 /*
@@ -225,7 +251,7 @@ asmregparm struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
 
 	/* do nothing if list empty */
 	if (!list_empty(&sem->wait_list))
-		sem = __rwsem_do_wake(sem, 0);
+		sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY);
 
 	spin_unlock_irqrestore(&sem->wait_lock, flags);
 
@@ -245,7 +271,7 @@ asmregparm struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
 
 	/* do nothing if list empty */
 	if (!list_empty(&sem->wait_list))
-		sem = __rwsem_do_wake(sem, 1);
+		sem = __rwsem_do_wake(sem, RWSEM_WAKE_READ_OWNED);
 
 	spin_unlock_irqrestore(&sem->wait_lock, flags);
 
diff --git a/lib/scatterlist.c b/lib/scatterlist.c
index 9afa25b52a8..a5ec42868f9 100644
--- a/lib/scatterlist.c
+++ b/lib/scatterlist.c
@@ -10,6 +10,7 @@
 #include <linux/slab.h>
 #include <linux/scatterlist.h>
 #include <linux/highmem.h>
+#include <linux/kmemleak.h>
 
 /**
  * sg_next - return the next scatterlist entry in a list
@@ -115,17 +116,29 @@ EXPORT_SYMBOL(sg_init_one);
  */
 static struct scatterlist *sg_kmalloc(unsigned int nents, gfp_t gfp_mask)
 {
-	if (nents == SG_MAX_SINGLE_ALLOC)
-		return (struct scatterlist *) __get_free_page(gfp_mask);
-	else
+	if (nents == SG_MAX_SINGLE_ALLOC) {
+		/*
+		 * Kmemleak doesn't track page allocations as they are not
+		 * commonly used (in a raw form) for kernel data structures.
+		 * As we chain together a list of pages and then a normal
+		 * kmalloc (tracked by kmemleak), in order to for that last
+		 * allocation not to become decoupled (and thus a
+		 * false-positive) we need to inform kmemleak of all the
+		 * intermediate allocations.
+		 */
+		void *ptr = (void *) __get_free_page(gfp_mask);
+		kmemleak_alloc(ptr, PAGE_SIZE, 1, gfp_mask);
+		return ptr;
+	} else
 		return kmalloc(nents * sizeof(struct scatterlist), gfp_mask);
 }
 
 static void sg_kfree(struct scatterlist *sg, unsigned int nents)
 {
-	if (nents == SG_MAX_SINGLE_ALLOC)
+	if (nents == SG_MAX_SINGLE_ALLOC) {
+		kmemleak_free(sg);
 		free_page((unsigned long) sg);
-	else
+	} else
 		kfree(sg);
 }
 
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index 4ee19d0d391..7af9d841c43 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -146,19 +146,16 @@ int strict_strtoul(const char *cp, unsigned int base, unsigned long *res)
 {
 	char *tail;
 	unsigned long val;
-	size_t len;
 
 	*res = 0;
-	len = strlen(cp);
-	if (len == 0)
+	if (!*cp)
 		return -EINVAL;
 
 	val = simple_strtoul(cp, &tail, base);
 	if (tail == cp)
 		return -EINVAL;
 
-	if ((*tail == '\0') ||
-		((len == (size_t)(tail - cp) + 1) && (*tail == '\n'))) {
+	if ((tail[0] == '\0') || (tail[0] == '\n' && tail[1] == '\0')) {
 		*res = val;
 		return 0;
 	}
@@ -220,18 +217,15 @@ int strict_strtoull(const char *cp, unsigned int base, unsigned long long *res)
 {
 	char *tail;
 	unsigned long long val;
-	size_t len;
 
 	*res = 0;
-	len = strlen(cp);
-	if (len == 0)
+	if (!*cp)
 		return -EINVAL;
 
 	val = simple_strtoull(cp, &tail, base);
 	if (tail == cp)
 		return -EINVAL;
-	if ((*tail == '\0') ||
-		((len == (size_t)(tail - cp) + 1) && (*tail == '\n'))) {
+	if ((tail[0] == '\0') || (tail[0] == '\n' && tail[1] == '\0')) {
 		*res = val;
 		return 0;
 	}