From 285b2c4fdd69ea73b4762785d8c6be83b6c074a6 Mon Sep 17 00:00:00 2001
From: Hugh Dickins
Date: Wed, 3 Aug 2011 16:21:20 -0700
Subject: tmpfs: demolish old swap vector support

The maximum size of a shmem/tmpfs file has been limited by the maximum
size of its triple-indirect swap vector.  With 4kB page size, maximum
filesize was just over 2TB on a 32-bit kernel, but sadly one eighth of
that on a 64-bit kernel.  (With 8kB page size, maximum filesize was
just over 4TB on a 64-bit kernel, but 16TB on a 32-bit kernel,
MAX_LFS_FILESIZE being then more restrictive than swap vector layout.)

It's a shame that tmpfs should be more restrictive than ramfs, and this
limitation has now been noticed.  Add another level to the swap vector?
No, it became obscure and hard to maintain, once I complicated it to
make use of highmem pages nine years ago: better choose another way.

Surely, if 2.4 had had the radix tree pagecache introduced in 2.5, then
tmpfs would never have invented its own peculiar radix tree: we would
have fitted swap entries into the common radix tree instead, in much
the same way as we fit swap entries into page tables.

And why should each file have a separate radix tree for its pages and
for its swap entries?  The swap entries are required precisely where
and when the pages are not.  We want to put them together in a single
radix tree: which can then avoid much of the locking which was needed
to prevent them from being exchanged underneath us.

This also avoids the waste of memory devoted to swap vectors, first in
the shmem_inode itself, then at least two more pages once a file grew
beyond 16 data pages (pages accounted by df and du, but not by memcg).
Allocated upfront, to avoid allocation when under swapping pressure,
but pure waste when CONFIG_SWAP is not set - I have never spattered
around the ifdefs to prevent that, preferring this move to sharing the
common radix tree instead.

There are three downsides to sharing the radix tree.  One, that it
binds tmpfs more tightly to the rest of mm, either requiring knowledge
of swap entries in radix tree there, or duplication of its code here in
shmem.c.  I believe that the simplifications and memory savings (and
probable higher performance, not yet measured) justify that.

Two, that on HIGHMEM systems with SWAP enabled, it's the lowmem radix
nodes that cannot be freed under memory pressure - whereas before it
was the less precious highmem swap vector pages that could not be
freed.  I'm hoping that 64-bit has now been accessible for long enough,
that the highmem argument has grown much less persuasive.

Three, that swapoff is slower than it used to be on tmpfs files, since
it's using a simple generic mechanism not tailored to it: I find this
noticeable, and shall want to improve, but maybe nobody else will
notice.

So...  now remove most of the old swap vector code from shmem.c.  But,
for the moment, keep the simple i_direct vector of 16 pages, with
simple accessors shmem_put_swap() and shmem_get_swap(), as a toy
implementation to help mark where swap needs to be handled in
subsequent patches.
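An aside on the mechanism, not part of the changelog: fitting swap
entries into the common radix tree relies on struct page pointers being
at least 4-byte aligned, so a slot whose low bits are nonzero can be
recognized as "not a page".  A minimal sketch of the packing follows,
assuming the RADIX_TREE_EXCEPTIONAL_ENTRY and
RADIX_TREE_EXCEPTIONAL_SHIFT constants that a companion patch in this
series adds to <linux/radix-tree.h>; the series' real helpers are
swp_to_radix_entry() and radix_to_swp_entry(), and the *_sketch names
below are illustrative only:

	/* Sketch: tag a radix tree slot as a swap entry, not a page. */
	#include <linux/radix-tree.h>
	#include <linux/swap.h>

	static inline void *swp_to_radix_sketch(swp_entry_t entry)
	{
		/* setting bit 1 marks an exceptional (non-page) slot */
		return (void *)((entry.val << RADIX_TREE_EXCEPTIONAL_SHIFT) |
				RADIX_TREE_EXCEPTIONAL_ENTRY);
	}

	static inline swp_entry_t radix_to_swp_sketch(void *arg)
	{
		swp_entry_t entry;

		/* recover the value stored above the tag bits */
		entry.val = (unsigned long)arg >> RADIX_TREE_EXCEPTIONAL_SHIFT;
		return entry;
	}

Lookups then test the low bits (radix_tree_exceptional_entry()) to
decide whether a slot holds a page pointer or a packed swap entry, as
the truncation and lookup loops later in this series do.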
Signed-off-by: Hugh Dickins
Acked-by: Rik van Riel
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/shmem.c | 782 +++++++------------------------------------------------------
 1 file changed, 84 insertions(+), 698 deletions(-)
(limited to 'mm/shmem.c')

diff --git a/mm/shmem.c b/mm/shmem.c
index 5cc21f8b4cd..5574b00ca77 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -66,37 +66,9 @@ static struct vfsmount *shm_mnt;
 #include
 #include
-/*
- * The maximum size of a shmem/tmpfs file is limited by the maximum size of
- * its triple-indirect swap vector - see illustration at shmem_swp_entry().
- *
- * With 4kB page size, maximum file size is just over 2TB on a 32-bit kernel,
- * but one eighth of that on a 64-bit kernel. With 8kB page size, maximum
- * file size is just over 4TB on a 64-bit kernel, but 16TB on a 32-bit kernel,
- * MAX_LFS_FILESIZE being then more restrictive than swap vector layout.
- *
- * We use / and * instead of shifts in the definitions below, so that the swap
- * vector can be tested with small even values (e.g. 20) for ENTRIES_PER_PAGE.
- */
-#define ENTRIES_PER_PAGE (PAGE_CACHE_SIZE/sizeof(unsigned long))
-#define ENTRIES_PER_PAGEPAGE ((unsigned long long)ENTRIES_PER_PAGE*ENTRIES_PER_PAGE)
-
-#define SHMSWP_MAX_INDEX (SHMEM_NR_DIRECT + (ENTRIES_PER_PAGEPAGE/2) * (ENTRIES_PER_PAGE+1))
-#define SHMSWP_MAX_BYTES (SHMSWP_MAX_INDEX << PAGE_CACHE_SHIFT)
-
-#define SHMEM_MAX_BYTES min_t(unsigned long long, SHMSWP_MAX_BYTES, MAX_LFS_FILESIZE)
-#define SHMEM_MAX_INDEX ((unsigned long)((SHMEM_MAX_BYTES+1) >> PAGE_CACHE_SHIFT))
-
 #define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512)
 #define VM_ACCT(size) (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT)
-/* info->flags needs VM_flags to handle pagein/truncate races efficiently */
-#define SHMEM_PAGEIN VM_READ
-#define SHMEM_TRUNCATE VM_WRITE
-
-/* Definition to limit shmem_truncate's steps between cond_rescheds */
-#define LATENCY_LIMIT 64
-
 /* Pretend that each entry is of this size in directory's i_size */
 #define BOGO_DIRENT_SIZE 20
@@ -107,7 +79,7 @@ struct shmem_xattr {
 char value[0];
 };
-/* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */
+/* Flag allocation requirements to shmem_getpage */
 enum sgp_type {
 SGP_READ, /* don't exceed i_size, don't allocate page */
 SGP_CACHE, /* don't exceed i_size, may allocate page */
@@ -137,56 +109,6 @@ static inline int shmem_getpage(struct inode *inode, pgoff_t index,
 mapping_gfp_mask(inode->i_mapping), fault_type);
 }
-static inline struct page *shmem_dir_alloc(gfp_t gfp_mask)
-{
- /*
- * The above definition of ENTRIES_PER_PAGE, and the use of
- * BLOCKS_PER_PAGE on indirect pages, assume PAGE_CACHE_SIZE:
- * might be reconsidered if it ever diverges from PAGE_SIZE.
- * - * Mobility flags are masked out as swap vectors cannot move - */ - return alloc_pages((gfp_mask & ~GFP_MOVABLE_MASK) | __GFP_ZERO, - PAGE_CACHE_SHIFT-PAGE_SHIFT); -} - -static inline void shmem_dir_free(struct page *page) -{ - __free_pages(page, PAGE_CACHE_SHIFT-PAGE_SHIFT); -} - -static struct page **shmem_dir_map(struct page *page) -{ - return (struct page **)kmap_atomic(page, KM_USER0); -} - -static inline void shmem_dir_unmap(struct page **dir) -{ - kunmap_atomic(dir, KM_USER0); -} - -static swp_entry_t *shmem_swp_map(struct page *page) -{ - return (swp_entry_t *)kmap_atomic(page, KM_USER1); -} - -static inline void shmem_swp_balance_unmap(void) -{ - /* - * When passing a pointer to an i_direct entry, to code which - * also handles indirect entries and so will shmem_swp_unmap, - * we must arrange for the preempt count to remain in balance. - * What kmap_atomic of a lowmem page does depends on config - * and architecture, so pretend to kmap_atomic some lowmem page. - */ - (void) kmap_atomic(ZERO_PAGE(0), KM_USER1); -} - -static inline void shmem_swp_unmap(swp_entry_t *entry) -{ - kunmap_atomic(entry, KM_USER1); -} - static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) { return sb->s_fs_info; @@ -303,468 +225,56 @@ static void shmem_recalc_inode(struct inode *inode) } } -/** - * shmem_swp_entry - find the swap vector position in the info structure - * @info: info structure for the inode - * @index: index of the page to find - * @page: optional page to add to the structure. Has to be preset to - * all zeros - * - * If there is no space allocated yet it will return NULL when - * page is NULL, else it will use the page for the needed block, - * setting it to NULL on return to indicate that it has been used. - * - * The swap vector is organized the following way: - * - * There are SHMEM_NR_DIRECT entries directly stored in the - * shmem_inode_info structure. So small files do not need an addional - * allocation. 
- * - * For pages with index > SHMEM_NR_DIRECT there is the pointer - * i_indirect which points to a page which holds in the first half - * doubly indirect blocks, in the second half triple indirect blocks: - * - * For an artificial ENTRIES_PER_PAGE = 4 this would lead to the - * following layout (for SHMEM_NR_DIRECT == 16): - * - * i_indirect -> dir --> 16-19 - * | +-> 20-23 - * | - * +-->dir2 --> 24-27 - * | +-> 28-31 - * | +-> 32-35 - * | +-> 36-39 - * | - * +-->dir3 --> 40-43 - * +-> 44-47 - * +-> 48-51 - * +-> 52-55 - */ -static swp_entry_t *shmem_swp_entry(struct shmem_inode_info *info, unsigned long index, struct page **page) -{ - unsigned long offset; - struct page **dir; - struct page *subdir; - - if (index < SHMEM_NR_DIRECT) { - shmem_swp_balance_unmap(); - return info->i_direct+index; - } - if (!info->i_indirect) { - if (page) { - info->i_indirect = *page; - *page = NULL; - } - return NULL; /* need another page */ - } - - index -= SHMEM_NR_DIRECT; - offset = index % ENTRIES_PER_PAGE; - index /= ENTRIES_PER_PAGE; - dir = shmem_dir_map(info->i_indirect); - - if (index >= ENTRIES_PER_PAGE/2) { - index -= ENTRIES_PER_PAGE/2; - dir += ENTRIES_PER_PAGE/2 + index/ENTRIES_PER_PAGE; - index %= ENTRIES_PER_PAGE; - subdir = *dir; - if (!subdir) { - if (page) { - *dir = *page; - *page = NULL; - } - shmem_dir_unmap(dir); - return NULL; /* need another page */ - } - shmem_dir_unmap(dir); - dir = shmem_dir_map(subdir); - } - - dir += index; - subdir = *dir; - if (!subdir) { - if (!page || !(subdir = *page)) { - shmem_dir_unmap(dir); - return NULL; /* need a page */ - } - *dir = subdir; - *page = NULL; - } - shmem_dir_unmap(dir); - return shmem_swp_map(subdir) + offset; -} - -static void shmem_swp_set(struct shmem_inode_info *info, swp_entry_t *entry, unsigned long value) +static void shmem_put_swap(struct shmem_inode_info *info, pgoff_t index, + swp_entry_t swap) { - long incdec = value? 1: -1; - - entry->val = value; - info->swapped += incdec; - if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT) { - struct page *page = kmap_atomic_to_page(entry); - set_page_private(page, page_private(page) + incdec); - } -} - -/** - * shmem_swp_alloc - get the position of the swap entry for the page. - * @info: info structure for the inode - * @index: index of the page to find - * @sgp: check and recheck i_size? skip allocation? - * @gfp: gfp mask to use for any page allocation - * - * If the entry does not exist, allocate it. - */ -static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, - unsigned long index, enum sgp_type sgp, gfp_t gfp) -{ - struct inode *inode = &info->vfs_inode; - struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); - struct page *page = NULL; - swp_entry_t *entry; - - if (sgp != SGP_WRITE && - ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) - return ERR_PTR(-EINVAL); - - while (!(entry = shmem_swp_entry(info, index, &page))) { - if (sgp == SGP_READ) - return shmem_swp_map(ZERO_PAGE(0)); - /* - * Test used_blocks against 1 less max_blocks, since we have 1 data - * page (and perhaps indirect index pages) yet to allocate: - * a waste to allocate index if we cannot allocate data. 
- */ - if (sbinfo->max_blocks) { - if (percpu_counter_compare(&sbinfo->used_blocks, - sbinfo->max_blocks - 1) >= 0) - return ERR_PTR(-ENOSPC); - percpu_counter_inc(&sbinfo->used_blocks); - inode->i_blocks += BLOCKS_PER_PAGE; - } - - spin_unlock(&info->lock); - page = shmem_dir_alloc(gfp); - spin_lock(&info->lock); - - if (!page) { - shmem_free_blocks(inode, 1); - return ERR_PTR(-ENOMEM); - } - if (sgp != SGP_WRITE && - ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { - entry = ERR_PTR(-EINVAL); - break; - } - if (info->next_index <= index) - info->next_index = index + 1; - } - if (page) { - /* another task gave its page, or truncated the file */ - shmem_free_blocks(inode, 1); - shmem_dir_free(page); - } - if (info->next_index <= index && !IS_ERR(entry)) - info->next_index = index + 1; - return entry; -} - -/** - * shmem_free_swp - free some swap entries in a directory - * @dir: pointer to the directory - * @edir: pointer after last entry of the directory - * @punch_lock: pointer to spinlock when needed for the holepunch case - */ -static int shmem_free_swp(swp_entry_t *dir, swp_entry_t *edir, - spinlock_t *punch_lock) -{ - spinlock_t *punch_unlock = NULL; - swp_entry_t *ptr; - int freed = 0; - - for (ptr = dir; ptr < edir; ptr++) { - if (ptr->val) { - if (unlikely(punch_lock)) { - punch_unlock = punch_lock; - punch_lock = NULL; - spin_lock(punch_unlock); - if (!ptr->val) - continue; - } - free_swap_and_cache(*ptr); - *ptr = (swp_entry_t){0}; - freed++; - } - } - if (punch_unlock) - spin_unlock(punch_unlock); - return freed; -} - -static int shmem_map_and_free_swp(struct page *subdir, int offset, - int limit, struct page ***dir, spinlock_t *punch_lock) -{ - swp_entry_t *ptr; - int freed = 0; - - ptr = shmem_swp_map(subdir); - for (; offset < limit; offset += LATENCY_LIMIT) { - int size = limit - offset; - if (size > LATENCY_LIMIT) - size = LATENCY_LIMIT; - freed += shmem_free_swp(ptr+offset, ptr+offset+size, - punch_lock); - if (need_resched()) { - shmem_swp_unmap(ptr); - if (*dir) { - shmem_dir_unmap(*dir); - *dir = NULL; - } - cond_resched(); - ptr = shmem_swp_map(subdir); - } - } - shmem_swp_unmap(ptr); - return freed; + if (index < SHMEM_NR_DIRECT) + info->i_direct[index] = swap; } -static void shmem_free_pages(struct list_head *next) +static swp_entry_t shmem_get_swap(struct shmem_inode_info *info, pgoff_t index) { - struct page *page; - int freed = 0; - - do { - page = container_of(next, struct page, lru); - next = next->next; - shmem_dir_free(page); - freed++; - if (freed >= LATENCY_LIMIT) { - cond_resched(); - freed = 0; - } - } while (next); + return (index < SHMEM_NR_DIRECT) ? 
+ info->i_direct[index] : (swp_entry_t){0}; } -void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end) +void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) { + struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); - unsigned long idx; - unsigned long size; - unsigned long limit; - unsigned long stage; - unsigned long diroff; - struct page **dir; - struct page *topdir; - struct page *middir; - struct page *subdir; - swp_entry_t *ptr; - LIST_HEAD(pages_to_free); - long nr_pages_to_free = 0; - long nr_swaps_freed = 0; - int offset; - int freed; - int punch_hole; - spinlock_t *needs_lock; - spinlock_t *punch_lock; - unsigned long upper_limit; + pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + pgoff_t end = (lend >> PAGE_CACHE_SHIFT); + pgoff_t index; + swp_entry_t swap; - truncate_inode_pages_range(inode->i_mapping, start, end); + truncate_inode_pages_range(mapping, lstart, lend); - inode->i_ctime = inode->i_mtime = CURRENT_TIME; - idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - if (idx >= info->next_index) - return; + if (end > SHMEM_NR_DIRECT) + end = SHMEM_NR_DIRECT; spin_lock(&info->lock); - info->flags |= SHMEM_TRUNCATE; - if (likely(end == (loff_t) -1)) { - limit = info->next_index; - upper_limit = SHMEM_MAX_INDEX; - info->next_index = idx; - needs_lock = NULL; - punch_hole = 0; - } else { - if (end + 1 >= inode->i_size) { /* we may free a little more */ - limit = (inode->i_size + PAGE_CACHE_SIZE - 1) >> - PAGE_CACHE_SHIFT; - upper_limit = SHMEM_MAX_INDEX; - } else { - limit = (end + 1) >> PAGE_CACHE_SHIFT; - upper_limit = limit; - } - needs_lock = &info->lock; - punch_hole = 1; - } - - topdir = info->i_indirect; - if (topdir && idx <= SHMEM_NR_DIRECT && !punch_hole) { - info->i_indirect = NULL; - nr_pages_to_free++; - list_add(&topdir->lru, &pages_to_free); - } - spin_unlock(&info->lock); - - if (info->swapped && idx < SHMEM_NR_DIRECT) { - ptr = info->i_direct; - size = limit; - if (size > SHMEM_NR_DIRECT) - size = SHMEM_NR_DIRECT; - nr_swaps_freed = shmem_free_swp(ptr+idx, ptr+size, needs_lock); - } - - /* - * If there are no indirect blocks or we are punching a hole - * below indirect blocks, nothing to be done. - */ - if (!topdir || limit <= SHMEM_NR_DIRECT) - goto done2; - - /* - * The truncation case has already dropped info->lock, and we're safe - * because i_size and next_index have already been lowered, preventing - * access beyond. But in the punch_hole case, we still need to take - * the lock when updating the swap directory, because there might be - * racing accesses by shmem_getpage(SGP_CACHE), shmem_unuse_inode or - * shmem_writepage. However, whenever we find we can remove a whole - * directory page (not at the misaligned start or end of the range), - * we first NULLify its pointer in the level above, and then have no - * need to take the lock when updating its contents: needs_lock and - * punch_lock (either pointing to info->lock or NULL) manage this. - */ - - upper_limit -= SHMEM_NR_DIRECT; - limit -= SHMEM_NR_DIRECT; - idx = (idx > SHMEM_NR_DIRECT)? 
(idx - SHMEM_NR_DIRECT): 0; - offset = idx % ENTRIES_PER_PAGE; - idx -= offset; - - dir = shmem_dir_map(topdir); - stage = ENTRIES_PER_PAGEPAGE/2; - if (idx < ENTRIES_PER_PAGEPAGE/2) { - middir = topdir; - diroff = idx/ENTRIES_PER_PAGE; - } else { - dir += ENTRIES_PER_PAGE/2; - dir += (idx - ENTRIES_PER_PAGEPAGE/2)/ENTRIES_PER_PAGEPAGE; - while (stage <= idx) - stage += ENTRIES_PER_PAGEPAGE; - middir = *dir; - if (*dir) { - diroff = ((idx - ENTRIES_PER_PAGEPAGE/2) % - ENTRIES_PER_PAGEPAGE) / ENTRIES_PER_PAGE; - if (!diroff && !offset && upper_limit >= stage) { - if (needs_lock) { - spin_lock(needs_lock); - *dir = NULL; - spin_unlock(needs_lock); - needs_lock = NULL; - } else - *dir = NULL; - nr_pages_to_free++; - list_add(&middir->lru, &pages_to_free); - } - shmem_dir_unmap(dir); - dir = shmem_dir_map(middir); - } else { - diroff = 0; - offset = 0; - idx = stage; + for (index = start; index < end; index++) { + swap = shmem_get_swap(info, index); + if (swap.val) { + free_swap_and_cache(swap); + shmem_put_swap(info, index, (swp_entry_t){0}); + info->swapped--; } } - for (; idx < limit; idx += ENTRIES_PER_PAGE, diroff++) { - if (unlikely(idx == stage)) { - shmem_dir_unmap(dir); - dir = shmem_dir_map(topdir) + - ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE; - while (!*dir) { - dir++; - idx += ENTRIES_PER_PAGEPAGE; - if (idx >= limit) - goto done1; - } - stage = idx + ENTRIES_PER_PAGEPAGE; - middir = *dir; - if (punch_hole) - needs_lock = &info->lock; - if (upper_limit >= stage) { - if (needs_lock) { - spin_lock(needs_lock); - *dir = NULL; - spin_unlock(needs_lock); - needs_lock = NULL; - } else - *dir = NULL; - nr_pages_to_free++; - list_add(&middir->lru, &pages_to_free); - } - shmem_dir_unmap(dir); - cond_resched(); - dir = shmem_dir_map(middir); - diroff = 0; - } - punch_lock = needs_lock; - subdir = dir[diroff]; - if (subdir && !offset && upper_limit-idx >= ENTRIES_PER_PAGE) { - if (needs_lock) { - spin_lock(needs_lock); - dir[diroff] = NULL; - spin_unlock(needs_lock); - punch_lock = NULL; - } else - dir[diroff] = NULL; - nr_pages_to_free++; - list_add(&subdir->lru, &pages_to_free); - } - if (subdir && page_private(subdir) /* has swap entries */) { - size = limit - idx; - if (size > ENTRIES_PER_PAGE) - size = ENTRIES_PER_PAGE; - freed = shmem_map_and_free_swp(subdir, - offset, size, &dir, punch_lock); - if (!dir) - dir = shmem_dir_map(middir); - nr_swaps_freed += freed; - if (offset || punch_lock) { - spin_lock(&info->lock); - set_page_private(subdir, - page_private(subdir) - freed); - spin_unlock(&info->lock); - } else - BUG_ON(page_private(subdir) != freed); - } - offset = 0; - } -done1: - shmem_dir_unmap(dir); -done2: - if (inode->i_mapping->nrpages && (info->flags & SHMEM_PAGEIN)) { + if (mapping->nrpages) { + spin_unlock(&info->lock); /* - * Call truncate_inode_pages again: racing shmem_unuse_inode - * may have swizzled a page in from swap since - * truncate_pagecache or generic_delete_inode did it, before we - * lowered next_index. Also, though shmem_getpage checks - * i_size before adding to cache, no recheck after: so fix the - * narrow window there too. + * A page may have meanwhile sneaked in from swap. 
*/ - truncate_inode_pages_range(inode->i_mapping, start, end); + truncate_inode_pages_range(mapping, lstart, lend); + spin_lock(&info->lock); } - spin_lock(&info->lock); - info->flags &= ~SHMEM_TRUNCATE; - info->swapped -= nr_swaps_freed; - if (nr_pages_to_free) - shmem_free_blocks(inode, nr_pages_to_free); shmem_recalc_inode(inode); spin_unlock(&info->lock); - /* - * Empty swap vector directory pages to be freed? - */ - if (!list_empty(&pages_to_free)) { - pages_to_free.prev->next = NULL; - shmem_free_pages(pages_to_free.next); - } + inode->i_ctime = inode->i_mtime = CURRENT_TIME; } EXPORT_SYMBOL_GPL(shmem_truncate_range); @@ -797,19 +307,6 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr) if (page) unlock_page(page); } - /* - * Reset SHMEM_PAGEIN flag so that shmem_truncate can - * detect if any pages might have been added to cache - * after truncate_inode_pages. But we needn't bother - * if it's being fully truncated to zero-length: the - * nrpages check is efficient enough in that case. - */ - if (newsize) { - struct shmem_inode_info *info = SHMEM_I(inode); - spin_lock(&info->lock); - info->flags &= ~SHMEM_PAGEIN; - spin_unlock(&info->lock); - } } if (newsize != oldsize) { i_size_write(inode, newsize); @@ -859,106 +356,28 @@ static void shmem_evict_inode(struct inode *inode) end_writeback(inode); } -static inline int shmem_find_swp(swp_entry_t entry, swp_entry_t *dir, swp_entry_t *edir) -{ - swp_entry_t *ptr; - - for (ptr = dir; ptr < edir; ptr++) { - if (ptr->val == entry.val) - return ptr - dir; - } - return -1; -} - static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, struct page *page) { - struct address_space *mapping; + struct address_space *mapping = info->vfs_inode.i_mapping; unsigned long idx; - unsigned long size; - unsigned long limit; - unsigned long stage; - struct page **dir; - struct page *subdir; - swp_entry_t *ptr; - int offset; int error; - idx = 0; - ptr = info->i_direct; - spin_lock(&info->lock); - if (!info->swapped) { - list_del_init(&info->swaplist); - goto lost2; - } - limit = info->next_index; - size = limit; - if (size > SHMEM_NR_DIRECT) - size = SHMEM_NR_DIRECT; - offset = shmem_find_swp(entry, ptr, ptr+size); - if (offset >= 0) { - shmem_swp_balance_unmap(); - goto found; - } - if (!info->i_indirect) - goto lost2; - - dir = shmem_dir_map(info->i_indirect); - stage = SHMEM_NR_DIRECT + ENTRIES_PER_PAGEPAGE/2; - - for (idx = SHMEM_NR_DIRECT; idx < limit; idx += ENTRIES_PER_PAGE, dir++) { - if (unlikely(idx == stage)) { - shmem_dir_unmap(dir-1); - if (cond_resched_lock(&info->lock)) { - /* check it has not been truncated */ - if (limit > info->next_index) { - limit = info->next_index; - if (idx >= limit) - goto lost2; - } - } - dir = shmem_dir_map(info->i_indirect) + - ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE; - while (!*dir) { - dir++; - idx += ENTRIES_PER_PAGEPAGE; - if (idx >= limit) - goto lost1; - } - stage = idx + ENTRIES_PER_PAGEPAGE; - subdir = *dir; - shmem_dir_unmap(dir); - dir = shmem_dir_map(subdir); - } - subdir = *dir; - if (subdir && page_private(subdir)) { - ptr = shmem_swp_map(subdir); - size = limit - idx; - if (size > ENTRIES_PER_PAGE) - size = ENTRIES_PER_PAGE; - offset = shmem_find_swp(entry, ptr, ptr+size); - shmem_swp_unmap(ptr); - if (offset >= 0) { - shmem_dir_unmap(dir); - ptr = shmem_swp_map(subdir); - goto found; - } - } - } -lost1: - shmem_dir_unmap(dir-1); -lost2: - spin_unlock(&info->lock); + for (idx = 0; idx < SHMEM_NR_DIRECT; idx++) + if (shmem_get_swap(info, idx).val == 
entry.val) + goto found; return 0; found: - idx += offset; - ptr += offset; + spin_lock(&info->lock); + if (shmem_get_swap(info, idx).val != entry.val) { + spin_unlock(&info->lock); + return 0; + } /* * Move _head_ to start search for next from here. * But be careful: shmem_evict_inode checks list_empty without taking * mutex, and there's an instant in list_move_tail when info->swaplist - * would appear empty, if it were the only one on shmem_swaplist. We - * could avoid doing it if inode NULL; or use this minor optimization. + * would appear empty, if it were the only one on shmem_swaplist. */ if (shmem_swaplist.next != &info->swaplist) list_move_tail(&shmem_swaplist, &info->swaplist); @@ -968,19 +387,17 @@ found: * but also to hold up shmem_evict_inode(): so inode cannot be freed * beneath us (pagelock doesn't help until the page is in pagecache). */ - mapping = info->vfs_inode.i_mapping; error = add_to_page_cache_locked(page, mapping, idx, GFP_NOWAIT); /* which does mem_cgroup_uncharge_cache_page on error */ if (error != -ENOMEM) { delete_from_swap_cache(page); set_page_dirty(page); - info->flags |= SHMEM_PAGEIN; - shmem_swp_set(info, ptr, 0); + shmem_put_swap(info, idx, (swp_entry_t){0}); + info->swapped--; swap_free(entry); error = 1; /* not an error, but entry was found */ } - shmem_swp_unmap(ptr); spin_unlock(&info->lock); return error; } @@ -1017,7 +434,14 @@ int shmem_unuse(swp_entry_t entry, struct page *page) mutex_lock(&shmem_swaplist_mutex); list_for_each_safe(p, next, &shmem_swaplist) { info = list_entry(p, struct shmem_inode_info, swaplist); - found = shmem_unuse_inode(info, entry, page); + if (!info->swapped) { + spin_lock(&info->lock); + if (!info->swapped) + list_del_init(&info->swaplist); + spin_unlock(&info->lock); + } + if (info->swapped) + found = shmem_unuse_inode(info, entry, page); cond_resched(); if (found) break; @@ -1041,7 +465,7 @@ out: static int shmem_writepage(struct page *page, struct writeback_control *wbc) { struct shmem_inode_info *info; - swp_entry_t *entry, swap; + swp_entry_t swap, oswap; struct address_space *mapping; unsigned long index; struct inode *inode; @@ -1067,6 +491,15 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) WARN_ON_ONCE(1); /* Still happens? Tell us about it! */ goto redirty; } + + /* + * Just for this patch, we have a toy implementation, + * which can swap out only the first SHMEM_NR_DIRECT pages: + * for simple demonstration of where we need to think about swap. + */ + if (index >= SHMEM_NR_DIRECT) + goto redirty; + swap = get_swap_page(); if (!swap.val) goto redirty; @@ -1087,22 +520,19 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) spin_lock(&info->lock); mutex_unlock(&shmem_swaplist_mutex); - if (index >= info->next_index) { - BUG_ON(!(info->flags & SHMEM_TRUNCATE)); - goto unlock; - } - entry = shmem_swp_entry(info, index, NULL); - if (entry->val) { + oswap = shmem_get_swap(info, index); + if (oswap.val) { WARN_ON_ONCE(1); /* Still happens? Tell us about it! 
*/ - free_swap_and_cache(*entry); - shmem_swp_set(info, entry, 0); + free_swap_and_cache(oswap); + shmem_put_swap(info, index, (swp_entry_t){0}); + info->swapped--; } shmem_recalc_inode(inode); if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) { delete_from_page_cache(page); - shmem_swp_set(info, entry, swap.val); - shmem_swp_unmap(entry); + shmem_put_swap(info, index, swap); + info->swapped++; swap_shmem_alloc(swap); spin_unlock(&info->lock); BUG_ON(page_mapped(page)); @@ -1110,13 +540,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) return 0; } - shmem_swp_unmap(entry); -unlock: spin_unlock(&info->lock); - /* - * add_to_swap_cache() doesn't return -EEXIST, so we can safely - * clear SWAP_HAS_CACHE flag. - */ swapcache_free(swap, NULL); redirty: set_page_dirty(page); @@ -1230,12 +654,10 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t idx, struct shmem_sb_info *sbinfo; struct page *page; struct page *prealloc_page = NULL; - swp_entry_t *entry; swp_entry_t swap; int error; - int ret; - if (idx >= SHMEM_MAX_INDEX) + if (idx > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT)) return -EFBIG; repeat: page = find_lock_page(mapping, idx); @@ -1272,37 +694,22 @@ repeat: spin_lock(&info->lock); shmem_recalc_inode(inode); - entry = shmem_swp_alloc(info, idx, sgp, gfp); - if (IS_ERR(entry)) { - spin_unlock(&info->lock); - error = PTR_ERR(entry); - goto out; - } - swap = *entry; - + swap = shmem_get_swap(info, idx); if (swap.val) { /* Look it up and read it in.. */ page = lookup_swap_cache(swap); if (!page) { - shmem_swp_unmap(entry); spin_unlock(&info->lock); /* here we actually do the io */ if (fault_type) *fault_type |= VM_FAULT_MAJOR; page = shmem_swapin(swap, gfp, info, idx); if (!page) { - spin_lock(&info->lock); - entry = shmem_swp_alloc(info, idx, sgp, gfp); - if (IS_ERR(entry)) - error = PTR_ERR(entry); - else { - if (entry->val == swap.val) - error = -ENOMEM; - shmem_swp_unmap(entry); - } - spin_unlock(&info->lock); - if (error) + swp_entry_t nswap = shmem_get_swap(info, idx); + if (nswap.val == swap.val) { + error = -ENOMEM; goto out; + } goto repeat; } wait_on_page_locked(page); @@ -1312,14 +719,12 @@ repeat: /* We have to do this with page locked to prevent races */ if (!trylock_page(page)) { - shmem_swp_unmap(entry); spin_unlock(&info->lock); wait_on_page_locked(page); page_cache_release(page); goto repeat; } if (PageWriteback(page)) { - shmem_swp_unmap(entry); spin_unlock(&info->lock); wait_on_page_writeback(page); unlock_page(page); @@ -1327,7 +732,6 @@ repeat: goto repeat; } if (!PageUptodate(page)) { - shmem_swp_unmap(entry); spin_unlock(&info->lock); unlock_page(page); page_cache_release(page); @@ -1338,7 +742,6 @@ repeat: error = add_to_page_cache_locked(page, mapping, idx, GFP_NOWAIT); if (error) { - shmem_swp_unmap(entry); spin_unlock(&info->lock); if (error == -ENOMEM) { /* @@ -1358,16 +761,14 @@ repeat: goto repeat; } - info->flags |= SHMEM_PAGEIN; - shmem_swp_set(info, entry, 0); - shmem_swp_unmap(entry); delete_from_swap_cache(page); + shmem_put_swap(info, idx, (swp_entry_t){0}); + info->swapped--; spin_unlock(&info->lock); set_page_dirty(page); swap_free(swap); } else if (sgp == SGP_READ) { - shmem_swp_unmap(entry); page = find_get_page(mapping, idx); if (page && !trylock_page(page)) { spin_unlock(&info->lock); @@ -1378,7 +779,6 @@ repeat: spin_unlock(&info->lock); } else if (prealloc_page) { - shmem_swp_unmap(entry); sbinfo = SHMEM_SB(inode->i_sb); if (sbinfo->max_blocks) { if (percpu_counter_compare(&sbinfo->used_blocks, @@ -1393,34 
+793,24 @@ repeat: page = prealloc_page; prealloc_page = NULL; - entry = shmem_swp_alloc(info, idx, sgp, gfp); - if (IS_ERR(entry)) - error = PTR_ERR(entry); - else { - swap = *entry; - shmem_swp_unmap(entry); - } - ret = error || swap.val; - if (ret) + swap = shmem_get_swap(info, idx); + if (swap.val) mem_cgroup_uncharge_cache_page(page); else - ret = add_to_page_cache_lru(page, mapping, + error = add_to_page_cache_lru(page, mapping, idx, GFP_NOWAIT); /* * At add_to_page_cache_lru() failure, * uncharge will be done automatically. */ - if (ret) { + if (swap.val || error) { shmem_unacct_blocks(info->flags, 1); shmem_free_blocks(inode, 1); spin_unlock(&info->lock); page_cache_release(page); - if (error) - goto out; goto repeat; } - info->flags |= SHMEM_PAGEIN; info->alloced++; spin_unlock(&info->lock); clear_highpage(page); @@ -2627,7 +2017,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent) goto failed; sbinfo->free_inodes = sbinfo->max_inodes; - sb->s_maxbytes = SHMEM_MAX_BYTES; + sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_blocksize = PAGE_CACHE_SIZE; sb->s_blocksize_bits = PAGE_CACHE_SHIFT; sb->s_magic = TMPFS_MAGIC; @@ -2863,7 +2253,7 @@ out4: void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff, struct page **pagep, swp_entry_t *ent) { - swp_entry_t entry = { .val = 0 }, *ptr; + swp_entry_t entry = { .val = 0 }; struct page *page = NULL; struct shmem_inode_info *info = SHMEM_I(inode); @@ -2871,16 +2261,13 @@ void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff, goto out; spin_lock(&info->lock); - ptr = shmem_swp_entry(info, pgoff, NULL); #ifdef CONFIG_SWAP - if (ptr && ptr->val) { - entry.val = ptr->val; + entry = shmem_get_swap(info, pgoff); + if (entry.val) page = find_get_page(&swapper_space, entry.val); - } else + else #endif page = find_get_page(inode->i_mapping, pgoff); - if (ptr) - shmem_swp_unmap(ptr); spin_unlock(&info->lock); out: *pagep = page; @@ -2963,7 +2350,6 @@ out: #define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev) #define shmem_acct_size(flags, size) 0 #define shmem_unacct_size(flags, size) do {} while (0) -#define SHMEM_MAX_BYTES MAX_LFS_FILESIZE #endif /* CONFIG_SHMEM */ @@ -2987,7 +2373,7 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags if (IS_ERR(shm_mnt)) return (void *)shm_mnt; - if (size < 0 || size > SHMEM_MAX_BYTES) + if (size < 0 || size > MAX_LFS_FILESIZE) return ERR_PTR(-EINVAL); if (shmem_acct_size(flags, size)) -- cgit v1.2.3-70-g09d2 From 41ffe5d5ceef7f7ff2ff18e320d88ca6d629efaf Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 3 Aug 2011 16:21:21 -0700 Subject: tmpfs: miscellaneous trivial cleanups While it's at its least, make a number of boring nitpicky cleanups to shmem.c, mostly for consistency of variable naming. Things like "swap" instead of "entry", "pgoff_t index" instead of "unsigned long idx". And since everything else here is prefixed "shmem_", better change init_tmpfs() to shmem_init(). 
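A hedged aside on the types involved, not part of the changelog:
pgoff_t is the kernel's page-offset type, a typedef of unsigned long in
<linux/types.h>, so renames like these change what a signature
documents rather than how it compiles.  Illustrative declarations only
(the _old/_new names are not in the patch):

	#include <linux/types.h>	/* typedef unsigned long pgoff_t; */

	/* Before the cleanup: the integer's role is left to the reader. */
	static swp_entry_t shmem_get_swap_old(struct shmem_inode_info *info,
					      unsigned long idx);

	/* After: pgoff_t announces that this is a page-cache offset. */
	static swp_entry_t shmem_get_swap_new(struct shmem_inode_info *info,
					      pgoff_t index);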
Signed-off-by: Hugh Dickins Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/shmem_fs.h | 2 +- init/main.c | 2 +- mm/shmem.c | 216 +++++++++++++++++++++++------------------------ 3 files changed, 109 insertions(+), 111 deletions(-) (limited to 'mm/shmem.c') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 80b695213fd..3f05795dcf7 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -47,7 +47,7 @@ static inline struct shmem_inode_info *SHMEM_I(struct inode *inode) /* * Functions in mm/shmem.c called directly from elsewhere: */ -extern int init_tmpfs(void); +extern int shmem_init(void); extern int shmem_fill_super(struct super_block *sb, void *data, int silent); extern struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags); diff --git a/init/main.c b/init/main.c index d7211faed2a..1952d37e4ec 100644 --- a/init/main.c +++ b/init/main.c @@ -715,7 +715,7 @@ static void __init do_basic_setup(void) { cpuset_init_smp(); usermodehelper_init(); - init_tmpfs(); + shmem_init(); driver_init(); init_irq_proc(); do_ctors(); diff --git a/mm/shmem.c b/mm/shmem.c index 5574b00ca77..24e95ac1605 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -28,7 +28,6 @@ #include #include #include -#include #include static struct vfsmount *shm_mnt; @@ -51,6 +50,7 @@ static struct vfsmount *shm_mnt; #include #include #include +#include #include #include #include @@ -63,7 +63,6 @@ static struct vfsmount *shm_mnt; #include #include -#include #include #define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512) @@ -201,7 +200,7 @@ static void shmem_free_inode(struct super_block *sb) } /** - * shmem_recalc_inode - recalculate the size of an inode + * shmem_recalc_inode - recalculate the block usage of an inode * @inode: inode to recalc * * We have to calculate the free blocks since the mm can drop @@ -356,19 +355,20 @@ static void shmem_evict_inode(struct inode *inode) end_writeback(inode); } -static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, struct page *page) +static int shmem_unuse_inode(struct shmem_inode_info *info, + swp_entry_t swap, struct page *page) { struct address_space *mapping = info->vfs_inode.i_mapping; - unsigned long idx; + pgoff_t index; int error; - for (idx = 0; idx < SHMEM_NR_DIRECT; idx++) - if (shmem_get_swap(info, idx).val == entry.val) + for (index = 0; index < SHMEM_NR_DIRECT; index++) + if (shmem_get_swap(info, index).val == swap.val) goto found; return 0; found: spin_lock(&info->lock); - if (shmem_get_swap(info, idx).val != entry.val) { + if (shmem_get_swap(info, index).val != swap.val) { spin_unlock(&info->lock); return 0; } @@ -387,15 +387,15 @@ found: * but also to hold up shmem_evict_inode(): so inode cannot be freed * beneath us (pagelock doesn't help until the page is in pagecache). */ - error = add_to_page_cache_locked(page, mapping, idx, GFP_NOWAIT); + error = add_to_page_cache_locked(page, mapping, index, GFP_NOWAIT); /* which does mem_cgroup_uncharge_cache_page on error */ if (error != -ENOMEM) { delete_from_swap_cache(page); set_page_dirty(page); - shmem_put_swap(info, idx, (swp_entry_t){0}); + shmem_put_swap(info, index, (swp_entry_t){0}); info->swapped--; - swap_free(entry); + swap_free(swap); error = 1; /* not an error, but entry was found */ } spin_unlock(&info->lock); @@ -405,9 +405,9 @@ found: /* * shmem_unuse() search for an eventually swapped out shmem page. 
*/ -int shmem_unuse(swp_entry_t entry, struct page *page) +int shmem_unuse(swp_entry_t swap, struct page *page) { - struct list_head *p, *next; + struct list_head *this, *next; struct shmem_inode_info *info; int found = 0; int error; @@ -432,8 +432,8 @@ int shmem_unuse(swp_entry_t entry, struct page *page) radix_tree_preload_end(); mutex_lock(&shmem_swaplist_mutex); - list_for_each_safe(p, next, &shmem_swaplist) { - info = list_entry(p, struct shmem_inode_info, swaplist); + list_for_each_safe(this, next, &shmem_swaplist) { + info = list_entry(this, struct shmem_inode_info, swaplist); if (!info->swapped) { spin_lock(&info->lock); if (!info->swapped) @@ -441,7 +441,7 @@ int shmem_unuse(swp_entry_t entry, struct page *page) spin_unlock(&info->lock); } if (info->swapped) - found = shmem_unuse_inode(info, entry, page); + found = shmem_unuse_inode(info, swap, page); cond_resched(); if (found) break; @@ -467,7 +467,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) struct shmem_inode_info *info; swp_entry_t swap, oswap; struct address_space *mapping; - unsigned long index; + pgoff_t index; struct inode *inode; BUG_ON(!PageLocked(page)); @@ -577,35 +577,33 @@ static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) } #endif /* CONFIG_TMPFS */ -static struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp, - struct shmem_inode_info *info, unsigned long idx) +static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, + struct shmem_inode_info *info, pgoff_t index) { struct mempolicy mpol, *spol; struct vm_area_struct pvma; - struct page *page; spol = mpol_cond_copy(&mpol, - mpol_shared_policy_lookup(&info->policy, idx)); + mpol_shared_policy_lookup(&info->policy, index)); /* Create a pseudo vma that just contains the policy */ pvma.vm_start = 0; - pvma.vm_pgoff = idx; + pvma.vm_pgoff = index; pvma.vm_ops = NULL; pvma.vm_policy = spol; - page = swapin_readahead(entry, gfp, &pvma, 0); - return page; + return swapin_readahead(swap, gfp, &pvma, 0); } static struct page *shmem_alloc_page(gfp_t gfp, - struct shmem_inode_info *info, unsigned long idx) + struct shmem_inode_info *info, pgoff_t index) { struct vm_area_struct pvma; /* Create a pseudo vma that just contains the policy */ pvma.vm_start = 0; - pvma.vm_pgoff = idx; + pvma.vm_pgoff = index; pvma.vm_ops = NULL; - pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx); + pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index); /* * alloc_page_vma() will drop the shared policy reference @@ -614,19 +612,19 @@ static struct page *shmem_alloc_page(gfp_t gfp, } #else /* !CONFIG_NUMA */ #ifdef CONFIG_TMPFS -static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *p) +static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) { } #endif /* CONFIG_TMPFS */ -static inline struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp, - struct shmem_inode_info *info, unsigned long idx) +static inline struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, + struct shmem_inode_info *info, pgoff_t index) { - return swapin_readahead(entry, gfp, NULL, 0); + return swapin_readahead(swap, gfp, NULL, 0); } static inline struct page *shmem_alloc_page(gfp_t gfp, - struct shmem_inode_info *info, unsigned long idx) + struct shmem_inode_info *info, pgoff_t index) { return alloc_page(gfp); } @@ -646,7 +644,7 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) * vm. 
If we swap it in we mark it dirty since we also free the swap * entry since a page cannot live in both the swap and page cache */ -static int shmem_getpage_gfp(struct inode *inode, pgoff_t idx, +static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type) { struct address_space *mapping = inode->i_mapping; @@ -657,10 +655,10 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t idx, swp_entry_t swap; int error; - if (idx > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT)) + if (index > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT)) return -EFBIG; repeat: - page = find_lock_page(mapping, idx); + page = find_lock_page(mapping, index); if (page) { /* * Once we can get the page lock, it must be uptodate: @@ -681,7 +679,7 @@ repeat: radix_tree_preload_end(); if (sgp != SGP_READ && !prealloc_page) { - prealloc_page = shmem_alloc_page(gfp, info, idx); + prealloc_page = shmem_alloc_page(gfp, info, index); if (prealloc_page) { SetPageSwapBacked(prealloc_page); if (mem_cgroup_cache_charge(prealloc_page, @@ -694,7 +692,7 @@ repeat: spin_lock(&info->lock); shmem_recalc_inode(inode); - swap = shmem_get_swap(info, idx); + swap = shmem_get_swap(info, index); if (swap.val) { /* Look it up and read it in.. */ page = lookup_swap_cache(swap); @@ -703,9 +701,9 @@ repeat: /* here we actually do the io */ if (fault_type) *fault_type |= VM_FAULT_MAJOR; - page = shmem_swapin(swap, gfp, info, idx); + page = shmem_swapin(swap, gfp, info, index); if (!page) { - swp_entry_t nswap = shmem_get_swap(info, idx); + swp_entry_t nswap = shmem_get_swap(info, index); if (nswap.val == swap.val) { error = -ENOMEM; goto out; @@ -740,7 +738,7 @@ repeat: } error = add_to_page_cache_locked(page, mapping, - idx, GFP_NOWAIT); + index, GFP_NOWAIT); if (error) { spin_unlock(&info->lock); if (error == -ENOMEM) { @@ -762,14 +760,14 @@ repeat: } delete_from_swap_cache(page); - shmem_put_swap(info, idx, (swp_entry_t){0}); + shmem_put_swap(info, index, (swp_entry_t){0}); info->swapped--; spin_unlock(&info->lock); set_page_dirty(page); swap_free(swap); } else if (sgp == SGP_READ) { - page = find_get_page(mapping, idx); + page = find_get_page(mapping, index); if (page && !trylock_page(page)) { spin_unlock(&info->lock); wait_on_page_locked(page); @@ -793,12 +791,12 @@ repeat: page = prealloc_page; prealloc_page = NULL; - swap = shmem_get_swap(info, idx); + swap = shmem_get_swap(info, index); if (swap.val) mem_cgroup_uncharge_cache_page(page); else error = add_to_page_cache_lru(page, mapping, - idx, GFP_NOWAIT); + index, GFP_NOWAIT); /* * At add_to_page_cache_lru() failure, * uncharge will be done automatically. @@ -841,7 +839,7 @@ nospace: * but must also avoid reporting a spurious ENOSPC while working on a * full tmpfs. 
*/ - page = find_get_page(mapping, idx); + page = find_get_page(mapping, index); spin_unlock(&info->lock); if (page) { page_cache_release(page); @@ -872,20 +870,20 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } #ifdef CONFIG_NUMA -static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new) +static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol) { - struct inode *i = vma->vm_file->f_path.dentry->d_inode; - return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new); + struct inode *inode = vma->vm_file->f_path.dentry->d_inode; + return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol); } static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, unsigned long addr) { - struct inode *i = vma->vm_file->f_path.dentry->d_inode; - unsigned long idx; + struct inode *inode = vma->vm_file->f_path.dentry->d_inode; + pgoff_t index; - idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; - return mpol_shared_policy_lookup(&SHMEM_I(i)->policy, idx); + index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index); } #endif @@ -1016,7 +1014,8 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_ { struct inode *inode = filp->f_path.dentry->d_inode; struct address_space *mapping = inode->i_mapping; - unsigned long index, offset; + pgoff_t index; + unsigned long offset; enum sgp_type sgp = SGP_READ; /* @@ -1032,7 +1031,8 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_ for (;;) { struct page *page = NULL; - unsigned long end_index, nr, ret; + pgoff_t end_index; + unsigned long nr, ret; loff_t i_size = i_size_read(inode); end_index = i_size >> PAGE_CACHE_SHIFT; @@ -1270,8 +1270,9 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_namelen = NAME_MAX; if (sbinfo->max_blocks) { buf->f_blocks = sbinfo->max_blocks; - buf->f_bavail = buf->f_bfree = - sbinfo->max_blocks - percpu_counter_sum(&sbinfo->used_blocks); + buf->f_bavail = + buf->f_bfree = sbinfo->max_blocks - + percpu_counter_sum(&sbinfo->used_blocks); } if (sbinfo->max_inodes) { buf->f_files = sbinfo->max_inodes; @@ -1480,8 +1481,8 @@ static void *shmem_follow_link_inline(struct dentry *dentry, struct nameidata *n static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd) { struct page *page = NULL; - int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL); - nd_set_link(nd, res ? ERR_PTR(res) : kmap(page)); + int error = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL); + nd_set_link(nd, error ? 
ERR_PTR(error) : kmap(page)); if (page) unlock_page(page); return page; @@ -1592,7 +1593,6 @@ out: return err; } - static const struct xattr_handler *shmem_xattr_handlers[] = { #ifdef CONFIG_TMPFS_POSIX_ACL &generic_acl_access_handler, @@ -2052,14 +2052,14 @@ static struct kmem_cache *shmem_inode_cachep; static struct inode *shmem_alloc_inode(struct super_block *sb) { - struct shmem_inode_info *p; - p = (struct shmem_inode_info *)kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL); - if (!p) + struct shmem_inode_info *info; + info = kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL); + if (!info) return NULL; - return &p->vfs_inode; + return &info->vfs_inode; } -static void shmem_i_callback(struct rcu_head *head) +static void shmem_destroy_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); INIT_LIST_HEAD(&inode->i_dentry); @@ -2072,25 +2072,24 @@ static void shmem_destroy_inode(struct inode *inode) /* only struct inode is valid if it's an inline symlink */ mpol_free_shared_policy(&SHMEM_I(inode)->policy); } - call_rcu(&inode->i_rcu, shmem_i_callback); + call_rcu(&inode->i_rcu, shmem_destroy_callback); } -static void init_once(void *foo) +static void shmem_init_inode(void *foo) { - struct shmem_inode_info *p = (struct shmem_inode_info *) foo; - - inode_init_once(&p->vfs_inode); + struct shmem_inode_info *info = foo; + inode_init_once(&info->vfs_inode); } -static int init_inodecache(void) +static int shmem_init_inodecache(void) { shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", sizeof(struct shmem_inode_info), - 0, SLAB_PANIC, init_once); + 0, SLAB_PANIC, shmem_init_inode); return 0; } -static void destroy_inodecache(void) +static void shmem_destroy_inodecache(void) { kmem_cache_destroy(shmem_inode_cachep); } @@ -2187,21 +2186,20 @@ static const struct vm_operations_struct shmem_vm_ops = { #endif }; - static struct dentry *shmem_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { return mount_nodev(fs_type, flags, data, shmem_fill_super); } -static struct file_system_type tmpfs_fs_type = { +static struct file_system_type shmem_fs_type = { .owner = THIS_MODULE, .name = "tmpfs", .mount = shmem_mount, .kill_sb = kill_litter_super, }; -int __init init_tmpfs(void) +int __init shmem_init(void) { int error; @@ -2209,18 +2207,18 @@ int __init init_tmpfs(void) if (error) goto out4; - error = init_inodecache(); + error = shmem_init_inodecache(); if (error) goto out3; - error = register_filesystem(&tmpfs_fs_type); + error = register_filesystem(&shmem_fs_type); if (error) { printk(KERN_ERR "Could not register tmpfs\n"); goto out2; } - shm_mnt = vfs_kern_mount(&tmpfs_fs_type, MS_NOUSER, - tmpfs_fs_type.name, NULL); + shm_mnt = vfs_kern_mount(&shmem_fs_type, MS_NOUSER, + shmem_fs_type.name, NULL); if (IS_ERR(shm_mnt)) { error = PTR_ERR(shm_mnt); printk(KERN_ERR "Could not kern_mount tmpfs\n"); @@ -2229,9 +2227,9 @@ int __init init_tmpfs(void) return 0; out1: - unregister_filesystem(&tmpfs_fs_type); + unregister_filesystem(&shmem_fs_type); out2: - destroy_inodecache(); + shmem_destroy_inodecache(); out3: bdi_destroy(&shmem_backing_dev_info); out4: @@ -2241,37 +2239,37 @@ out4: #ifdef CONFIG_CGROUP_MEM_RES_CTLR /** - * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file + * mem_cgroup_get_shmem_target - find page or swap assigned to the shmem file * @inode: the inode to be searched - * @pgoff: the offset to be searched + * @index: the page offset to be searched * @pagep: the pointer for the found 
page to be stored - * @ent: the pointer for the found swap entry to be stored + * @swapp: the pointer for the found swap entry to be stored * * If a page is found, refcount of it is incremented. Callers should handle * these refcount. */ -void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff, - struct page **pagep, swp_entry_t *ent) +void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t index, + struct page **pagep, swp_entry_t *swapp) { - swp_entry_t entry = { .val = 0 }; - struct page *page = NULL; struct shmem_inode_info *info = SHMEM_I(inode); + struct page *page = NULL; + swp_entry_t swap = {0}; - if ((pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode)) + if ((index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) goto out; spin_lock(&info->lock); #ifdef CONFIG_SWAP - entry = shmem_get_swap(info, pgoff); - if (entry.val) - page = find_get_page(&swapper_space, entry.val); + swap = shmem_get_swap(info, index); + if (swap.val) + page = find_get_page(&swapper_space, swap.val); else #endif - page = find_get_page(inode->i_mapping, pgoff); + page = find_get_page(inode->i_mapping, index); spin_unlock(&info->lock); out: *pagep = page; - *ent = entry; + *swapp = swap; } #endif @@ -2288,23 +2286,23 @@ out: #include -static struct file_system_type tmpfs_fs_type = { +static struct file_system_type shmem_fs_type = { .name = "tmpfs", .mount = ramfs_mount, .kill_sb = kill_litter_super, }; -int __init init_tmpfs(void) +int __init shmem_init(void) { - BUG_ON(register_filesystem(&tmpfs_fs_type) != 0); + BUG_ON(register_filesystem(&shmem_fs_type) != 0); - shm_mnt = kern_mount(&tmpfs_fs_type); + shm_mnt = kern_mount(&shmem_fs_type); BUG_ON(IS_ERR(shm_mnt)); return 0; } -int shmem_unuse(swp_entry_t entry, struct page *page) +int shmem_unuse(swp_entry_t swap, struct page *page) { return 0; } @@ -2314,34 +2312,34 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user) return 0; } -void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end) +void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) { - truncate_inode_pages_range(inode->i_mapping, start, end); + truncate_inode_pages_range(inode->i_mapping, lstart, lend); } EXPORT_SYMBOL_GPL(shmem_truncate_range); #ifdef CONFIG_CGROUP_MEM_RES_CTLR /** - * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file + * mem_cgroup_get_shmem_target - find page or swap assigned to the shmem file * @inode: the inode to be searched - * @pgoff: the offset to be searched + * @index: the page offset to be searched * @pagep: the pointer for the found page to be stored - * @ent: the pointer for the found swap entry to be stored + * @swapp: the pointer for the found swap entry to be stored * * If a page is found, refcount of it is incremented. Callers should handle * these refcount. 
*/ -void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff, - struct page **pagep, swp_entry_t *ent) +void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t index, + struct page **pagep, swp_entry_t *swapp) { struct page *page = NULL; - if ((pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode)) + if ((index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) goto out; - page = find_get_page(inode->i_mapping, pgoff); + page = find_get_page(inode->i_mapping, index); out: *pagep = page; - *ent = (swp_entry_t){ .val = 0 }; + *swapp = (swp_entry_t){0}; } #endif -- cgit v1.2.3-70-g09d2 From bda97eab0cc9c6385b9f26abdda6459f630f4513 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 3 Aug 2011 16:21:21 -0700 Subject: tmpfs: copy truncate_inode_pages_range Bring truncate.c's code for truncate_inode_pages_range() inline into shmem_truncate_range(), replacing its first call (there's a followup call below, but leave that one, it will disappear next). Don't play with it yet, apart from leaving out the cleancache flush, and (importantly) the nrpages == 0 skip, and moving shmem_setattr()'s partial page preparation into its partial page handling. Signed-off-by: Hugh Dickins Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 99 +++++++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 79 insertions(+), 20 deletions(-) (limited to 'mm/shmem.c') diff --git a/mm/shmem.c b/mm/shmem.c index 24e95ac1605..e101c211ed1 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -50,6 +50,7 @@ static struct vfsmount *shm_mnt; #include #include #include +#include #include #include #include @@ -242,11 +243,88 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); pgoff_t end = (lend >> PAGE_CACHE_SHIFT); + struct pagevec pvec; pgoff_t index; swp_entry_t swap; + int i; + + BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1)); + + pagevec_init(&pvec, 0); + index = start; + while (index <= end && pagevec_lookup(&pvec, mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { + mem_cgroup_uncharge_start(); + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i]; + + /* We rely upon deletion not changing page->index */ + index = page->index; + if (index > end) + break; + + if (!trylock_page(page)) + continue; + WARN_ON(page->index != index); + if (PageWriteback(page)) { + unlock_page(page); + continue; + } + truncate_inode_page(mapping, page); + unlock_page(page); + } + pagevec_release(&pvec); + mem_cgroup_uncharge_end(); + cond_resched(); + index++; + } + + if (partial) { + struct page *page = NULL; + shmem_getpage(inode, start - 1, &page, SGP_READ, NULL); + if (page) { + zero_user_segment(page, partial, PAGE_CACHE_SIZE); + set_page_dirty(page); + unlock_page(page); + page_cache_release(page); + } + } + + index = start; + for ( ; ; ) { + cond_resched(); + if (!pagevec_lookup(&pvec, mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { + if (index == start) + break; + index = start; + continue; + } + if (index == start && pvec.pages[0]->index > end) { + pagevec_release(&pvec); + break; + } + mem_cgroup_uncharge_start(); + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i]; - truncate_inode_pages_range(mapping, lstart, lend); + 
/* We rely upon deletion not changing page->index */ + index = page->index; + if (index > end) + break; + + lock_page(page); + WARN_ON(page->index != index); + wait_on_page_writeback(page); + truncate_inode_page(mapping, page); + unlock_page(page); + } + pagevec_release(&pvec); + mem_cgroup_uncharge_end(); + index++; + } if (end > SHMEM_NR_DIRECT) end = SHMEM_NR_DIRECT; @@ -289,24 +367,7 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr) if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { loff_t oldsize = inode->i_size; loff_t newsize = attr->ia_size; - struct page *page = NULL; - if (newsize < oldsize) { - /* - * If truncating down to a partial page, then - * if that page is already allocated, hold it - * in memory until the truncation is over, so - * truncate_partial_page cannot miss it were - * it assigned to swap. - */ - if (newsize & (PAGE_CACHE_SIZE-1)) { - (void) shmem_getpage(inode, - newsize >> PAGE_CACHE_SHIFT, - &page, SGP_READ, NULL); - if (page) - unlock_page(page); - } - } if (newsize != oldsize) { i_size_write(inode, newsize); inode->i_ctime = inode->i_mtime = CURRENT_TIME; @@ -318,8 +379,6 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr) /* unmap again to remove racily COWed private pages */ unmap_mapping_range(inode->i_mapping, holebegin, 0, 1); } - if (page) - page_cache_release(page); } setattr_copy(inode, attr); -- cgit v1.2.3-70-g09d2 From 7a5d0fbb29936fad7f17b1cb001b0c33a5f13328 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 3 Aug 2011 16:21:22 -0700 Subject: tmpfs: convert shmem_truncate_range to radix-swap Disable the toy swapping implementation in shmem_writepage() - it's hard to support two schemes at once - and convert shmem_truncate_range() to a lockless gang lookup of swap entries along with pages, freeing both. Since the second loop tightens its noose until all entries of either kind have been squeezed out (and we shall make sure that there's not an instant when neither is visible), there is no longer a need for yet another pass below. shmem_radix_tree_replace() compensates for the lockless lookup by checking that the expected entry is in place, under lock, before replacing it. Here it just deletes, but will be used in later patches to substitute swap entry for page or page for swap entry. Signed-off-by: Hugh Dickins Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 192 ++++++++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 146 insertions(+), 46 deletions(-) (limited to 'mm/shmem.c') diff --git a/mm/shmem.c b/mm/shmem.c index e101c211ed1..4439b7d5581 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -238,6 +238,111 @@ static swp_entry_t shmem_get_swap(struct shmem_inode_info *info, pgoff_t index) info->i_direct[index] : (swp_entry_t){0}; } +/* + * Replace item expected in radix tree by a new item, while holding tree lock. + */ +static int shmem_radix_tree_replace(struct address_space *mapping, + pgoff_t index, void *expected, void *replacement) +{ + void **pslot; + void *item = NULL; + + VM_BUG_ON(!expected); + pslot = radix_tree_lookup_slot(&mapping->page_tree, index); + if (pslot) + item = radix_tree_deref_slot_protected(pslot, + &mapping->tree_lock); + if (item != expected) + return -ENOENT; + if (replacement) + radix_tree_replace_slot(pslot, replacement); + else + radix_tree_delete(&mapping->page_tree, index); + return 0; +} + +/* + * Like find_get_pages, but collecting swap entries as well as pages. 
+ */ +static unsigned shmem_find_get_pages_and_swap(struct address_space *mapping, + pgoff_t start, unsigned int nr_pages, + struct page **pages, pgoff_t *indices) +{ + unsigned int i; + unsigned int ret; + unsigned int nr_found; + + rcu_read_lock(); +restart: + nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, + (void ***)pages, indices, start, nr_pages); + ret = 0; + for (i = 0; i < nr_found; i++) { + struct page *page; +repeat: + page = radix_tree_deref_slot((void **)pages[i]); + if (unlikely(!page)) + continue; + if (radix_tree_exception(page)) { + if (radix_tree_exceptional_entry(page)) + goto export; + /* radix_tree_deref_retry(page) */ + goto restart; + } + if (!page_cache_get_speculative(page)) + goto repeat; + + /* Has the page moved? */ + if (unlikely(page != *((void **)pages[i]))) { + page_cache_release(page); + goto repeat; + } +export: + indices[ret] = indices[i]; + pages[ret] = page; + ret++; + } + if (unlikely(!ret && nr_found)) + goto restart; + rcu_read_unlock(); + return ret; +} + +/* + * Remove swap entry from radix tree, free the swap and its page cache. + */ +static int shmem_free_swap(struct address_space *mapping, + pgoff_t index, void *radswap) +{ + int error; + + spin_lock_irq(&mapping->tree_lock); + error = shmem_radix_tree_replace(mapping, index, radswap, NULL); + spin_unlock_irq(&mapping->tree_lock); + if (!error) + free_swap_and_cache(radix_to_swp_entry(radswap)); + return error; +} + +/* + * Pagevec may contain swap entries, so shuffle up pages before releasing. + */ +static void shmem_pagevec_release(struct pagevec *pvec) +{ + int i, j; + + for (i = 0, j = 0; i < pagevec_count(pvec); i++) { + struct page *page = pvec->pages[i]; + if (!radix_tree_exceptional_entry(page)) + pvec->pages[j++] = page; + } + pvec->nr = j; + pagevec_release(pvec); +} + +/* + * Remove range of pages and swap entries from radix tree, and free them. 
+ */ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) { struct address_space *mapping = inode->i_mapping; @@ -246,36 +351,44 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); pgoff_t end = (lend >> PAGE_CACHE_SHIFT); struct pagevec pvec; + pgoff_t indices[PAGEVEC_SIZE]; + long nr_swaps_freed = 0; pgoff_t index; - swp_entry_t swap; int i; BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1)); pagevec_init(&pvec, 0); index = start; - while (index <= end && pagevec_lookup(&pvec, mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { + while (index <= end) { + pvec.nr = shmem_find_get_pages_and_swap(mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, + pvec.pages, indices); + if (!pvec.nr) + break; mem_cgroup_uncharge_start(); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; - /* We rely upon deletion not changing page->index */ - index = page->index; + index = indices[i]; if (index > end) break; - if (!trylock_page(page)) + if (radix_tree_exceptional_entry(page)) { + nr_swaps_freed += !shmem_free_swap(mapping, + index, page); continue; - WARN_ON(page->index != index); - if (PageWriteback(page)) { - unlock_page(page); + } + + if (!trylock_page(page)) continue; + if (page->mapping == mapping) { + VM_BUG_ON(PageWriteback(page)); + truncate_inode_page(mapping, page); } - truncate_inode_page(mapping, page); unlock_page(page); } - pagevec_release(&pvec); + shmem_pagevec_release(&pvec); mem_cgroup_uncharge_end(); cond_resched(); index++; @@ -295,59 +408,47 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) index = start; for ( ; ; ) { cond_resched(); - if (!pagevec_lookup(&pvec, mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { + pvec.nr = shmem_find_get_pages_and_swap(mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, + pvec.pages, indices); + if (!pvec.nr) { if (index == start) break; index = start; continue; } - if (index == start && pvec.pages[0]->index > end) { - pagevec_release(&pvec); + if (index == start && indices[0] > end) { + shmem_pagevec_release(&pvec); break; } mem_cgroup_uncharge_start(); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; - /* We rely upon deletion not changing page->index */ - index = page->index; + index = indices[i]; if (index > end) break; + if (radix_tree_exceptional_entry(page)) { + nr_swaps_freed += !shmem_free_swap(mapping, + index, page); + continue; + } + lock_page(page); - WARN_ON(page->index != index); - wait_on_page_writeback(page); - truncate_inode_page(mapping, page); + if (page->mapping == mapping) { + VM_BUG_ON(PageWriteback(page)); + truncate_inode_page(mapping, page); + } unlock_page(page); } - pagevec_release(&pvec); + shmem_pagevec_release(&pvec); mem_cgroup_uncharge_end(); index++; } - if (end > SHMEM_NR_DIRECT) - end = SHMEM_NR_DIRECT; - spin_lock(&info->lock); - for (index = start; index < end; index++) { - swap = shmem_get_swap(info, index); - if (swap.val) { - free_swap_and_cache(swap); - shmem_put_swap(info, index, (swp_entry_t){0}); - info->swapped--; - } - } - - if (mapping->nrpages) { - spin_unlock(&info->lock); - /* - * A page may have meanwhile sneaked in from swap. 
- */ - truncate_inode_pages_range(mapping, lstart, lend); - spin_lock(&info->lock); - } - + info->swapped -= nr_swaps_freed; shmem_recalc_inode(inode); spin_unlock(&info->lock); @@ -552,11 +653,10 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) } /* - * Just for this patch, we have a toy implementation, - * which can swap out only the first SHMEM_NR_DIRECT pages: - * for simple demonstration of where we need to think about swap. + * Disable even the toy swapping implementation, while we convert + * functions one by one to having swap entries in the radix tree. */ - if (index >= SHMEM_NR_DIRECT) + if (index < ULONG_MAX) goto redirty; swap = get_swap_page(); -- cgit v1.2.3-70-g09d2 From 46f65ec15c6878a2b4a49f6e01b20b201b46a9e4 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 3 Aug 2011 16:21:23 -0700 Subject: tmpfs: convert shmem_unuse_inode to radix-swap Convert shmem_unuse_inode() to use a lockless gang lookup of the radix tree, searching for matching swap. This is somewhat slower than the old method: because of repeated radix tree descents, because of copying entries up, but probably most because the old method noted and skipped once a vector page was cleared of swap. Perhaps we can devise a use of radix tree tagging to achieve that later. shmem_add_to_page_cache() uses shmem_radix_tree_replace() to compensate for the lockless lookup by checking that the expected entry is in place, under lock. It is not very satisfactory to be copying this much from add_to_page_cache_locked(), but I think easier to sell than insisting that every caller of add_to_page_cache*() go through the extras. Signed-off-by: Hugh Dickins Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 133 +++++++++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 107 insertions(+), 26 deletions(-) (limited to 'mm/shmem.c') diff --git a/mm/shmem.c b/mm/shmem.c index 4439b7d5581..174f97188e8 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -261,6 +261,55 @@ static int shmem_radix_tree_replace(struct address_space *mapping, return 0; } +/* + * Like add_to_page_cache_locked, but error if expected item has gone. + */ +static int shmem_add_to_page_cache(struct page *page, + struct address_space *mapping, + pgoff_t index, gfp_t gfp, void *expected) +{ + int error; + + VM_BUG_ON(!PageLocked(page)); + VM_BUG_ON(!PageSwapBacked(page)); + + error = mem_cgroup_cache_charge(page, current->mm, + gfp & GFP_RECLAIM_MASK); + if (error) + goto out; + if (!expected) + error = radix_tree_preload(gfp & GFP_RECLAIM_MASK); + if (!error) { + page_cache_get(page); + page->mapping = mapping; + page->index = index; + + spin_lock_irq(&mapping->tree_lock); + if (!expected) + error = radix_tree_insert(&mapping->page_tree, + index, page); + else + error = shmem_radix_tree_replace(mapping, index, + expected, page); + if (!error) { + mapping->nrpages++; + __inc_zone_page_state(page, NR_FILE_PAGES); + __inc_zone_page_state(page, NR_SHMEM); + spin_unlock_irq(&mapping->tree_lock); + } else { + page->mapping = NULL; + spin_unlock_irq(&mapping->tree_lock); + page_cache_release(page); + } + if (!expected) + radix_tree_preload_end(); + } + if (error) + mem_cgroup_uncharge_cache_page(page); +out: + return error; +} + /* * Like find_get_pages, but collecting swap entries as well as pages. */ @@ -308,6 +357,42 @@ export: return ret; } +/* + * Lockless lookup of swap entry in radix tree, avoiding refcount on pages. 
+ */ +static pgoff_t shmem_find_swap(struct address_space *mapping, void *radswap) +{ + void **slots[PAGEVEC_SIZE]; + pgoff_t indices[PAGEVEC_SIZE]; + unsigned int nr_found; + +restart: + nr_found = 1; + indices[0] = -1; + while (nr_found) { + pgoff_t index = indices[nr_found - 1] + 1; + unsigned int i; + + rcu_read_lock(); + nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, + slots, indices, index, PAGEVEC_SIZE); + for (i = 0; i < nr_found; i++) { + void *item = radix_tree_deref_slot(slots[i]); + if (radix_tree_deref_retry(item)) { + rcu_read_unlock(); + goto restart; + } + if (item == radswap) { + rcu_read_unlock(); + return indices[i]; + } + } + rcu_read_unlock(); + cond_resched(); + } + return -1; +} + /* * Remove swap entry from radix tree, free the swap and its page cache. */ @@ -515,23 +600,21 @@ static void shmem_evict_inode(struct inode *inode) end_writeback(inode); } +/* + * If swap found in inode, free it and move page from swapcache to filecache. + */ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t swap, struct page *page) { struct address_space *mapping = info->vfs_inode.i_mapping; + void *radswap; pgoff_t index; int error; - for (index = 0; index < SHMEM_NR_DIRECT; index++) - if (shmem_get_swap(info, index).val == swap.val) - goto found; - return 0; -found: - spin_lock(&info->lock); - if (shmem_get_swap(info, index).val != swap.val) { - spin_unlock(&info->lock); + radswap = swp_to_radix_entry(swap); + index = shmem_find_swap(mapping, radswap); + if (index == -1) return 0; - } /* * Move _head_ to start search for next from here. @@ -547,23 +630,30 @@ found: * but also to hold up shmem_evict_inode(): so inode cannot be freed * beneath us (pagelock doesn't help until the page is in pagecache). */ - error = add_to_page_cache_locked(page, mapping, index, GFP_NOWAIT); + error = shmem_add_to_page_cache(page, mapping, index, + GFP_NOWAIT, radswap); /* which does mem_cgroup_uncharge_cache_page on error */ if (error != -ENOMEM) { + /* + * Truncation and eviction use free_swap_and_cache(), which + * only does trylock page: if we raced, best clean up here. + */ delete_from_swap_cache(page); set_page_dirty(page); - shmem_put_swap(info, index, (swp_entry_t){0}); - info->swapped--; - swap_free(swap); + if (!error) { + spin_lock(&info->lock); + info->swapped--; + spin_unlock(&info->lock); + swap_free(swap); + } error = 1; /* not an error, but entry was found */ } - spin_unlock(&info->lock); return error; } /* - * shmem_unuse() search for an eventually swapped out shmem page. + * Search through swapped inodes to find and replace swap by page. */ int shmem_unuse(swp_entry_t swap, struct page *page) { @@ -576,20 +666,12 @@ int shmem_unuse(swp_entry_t swap, struct page *page) * Charge page using GFP_KERNEL while we can wait, before taking * the shmem_swaplist_mutex which might hold up shmem_writepage(). * Charged back to the user (not to caller) when swap account is used. - * add_to_page_cache() will be called with GFP_NOWAIT. + * shmem_add_to_page_cache() will be called with GFP_NOWAIT. */ error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL); if (error) goto out; - /* - * Try to preload while we can wait, to not make a habit of - * draining atomic reserves; but don't latch on to this cpu, - * it's okay if sometimes we get rescheduled after this. 
- */ - error = radix_tree_preload(GFP_KERNEL); - if (error) - goto uncharge; - radix_tree_preload_end(); + /* No radix_tree_preload: swap entry keeps a place for page in tree */ mutex_lock(&shmem_swaplist_mutex); list_for_each_safe(this, next, &shmem_swaplist) { @@ -608,7 +690,6 @@ int shmem_unuse(swp_entry_t swap, struct page *page) } mutex_unlock(&shmem_swaplist_mutex); -uncharge: if (!found) mem_cgroup_uncharge_cache_page(page); if (found < 0) -- cgit v1.2.3-70-g09d2 From 54af60421822bb9cb664dd5cd7aac46c01ccfcf8 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 3 Aug 2011 16:21:24 -0700 Subject: tmpfs: convert shmem_getpage_gfp to radix-swap Convert shmem_getpage_gfp(), the engine-room of shmem, to expect page or swap entry returned from radix tree by find_lock_page(). Whereas the repetitive old method proceeded mainly under info->lock, dropping and repeating whenever one of the conditions needed was not met, now we can proceed without it, leaving shmem_add_to_page_cache() to check for a race. This way there is no need to preallocate a page, no need for an early radix_tree_preload(), no need for mem_cgroup_shmem_charge_fallback(). Move the error unwinding down to the bottom instead of repeating it throughout. ENOSPC handling is a little different from before: there is no longer any race between find_lock_page() and finding swap, but we can arrive at ENOSPC before calling shmem_recalc_inode(), which might occasionally discover freed space. Be stricter to check i_size before returning. info->lock is used for little but alloced, swapped, i_blocks updates. Move i_blocks updates out from under the max_blocks check, so even an unlimited size=0 mount can show accurate du. Signed-off-by: Hugh Dickins Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 259 ++++++++++++++++++++++++++----------------------------------- 1 file changed, 112 insertions(+), 147 deletions(-) (limited to 'mm/shmem.c') diff --git a/mm/shmem.c b/mm/shmem.c index 174f97188e8..92f01d7cc15 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -166,15 +166,6 @@ static struct backing_dev_info shmem_backing_dev_info __read_mostly = { static LIST_HEAD(shmem_swaplist); static DEFINE_MUTEX(shmem_swaplist_mutex); -static void shmem_free_blocks(struct inode *inode, long pages) -{ - struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); - if (sbinfo->max_blocks) { - percpu_counter_add(&sbinfo->used_blocks, -pages); - inode->i_blocks -= pages*BLOCKS_PER_PAGE; - } -} - static int shmem_reserve_inode(struct super_block *sb) { struct shmem_sb_info *sbinfo = SHMEM_SB(sb); @@ -219,9 +210,12 @@ static void shmem_recalc_inode(struct inode *inode) freed = info->alloced - info->swapped - inode->i_mapping->nrpages; if (freed > 0) { + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + if (sbinfo->max_blocks) + percpu_counter_add(&sbinfo->used_blocks, -freed); info->alloced -= freed; + inode->i_blocks -= freed * BLOCKS_PER_PAGE; shmem_unacct_blocks(info->flags, freed); - shmem_free_blocks(inode, freed); } } @@ -888,205 +882,180 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type) { struct address_space *mapping = inode->i_mapping; - struct shmem_inode_info *info = SHMEM_I(inode); + struct shmem_inode_info *info; struct shmem_sb_info *sbinfo; struct page *page; - struct page *prealloc_page = NULL; swp_entry_t swap; int error; + int once = 0; if (index > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT)) return -EFBIG; repeat: + 
swap.val = 0; page = find_lock_page(mapping, index); - if (page) { + if (radix_tree_exceptional_entry(page)) { + swap = radix_to_swp_entry(page); + page = NULL; + } + + if (sgp != SGP_WRITE && + ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { + error = -EINVAL; + goto failed; + } + + if (page || (sgp == SGP_READ && !swap.val)) { /* * Once we can get the page lock, it must be uptodate: * if there were an error in reading back from swap, * the page would not be inserted into the filecache. */ - BUG_ON(!PageUptodate(page)); - goto done; + BUG_ON(page && !PageUptodate(page)); + *pagep = page; + return 0; } /* - * Try to preload while we can wait, to not make a habit of - * draining atomic reserves; but don't latch on to this cpu. + * Fast cache lookup did not find it: + * bring it back from swap or allocate. */ - error = radix_tree_preload(gfp & GFP_RECLAIM_MASK); - if (error) - goto out; - radix_tree_preload_end(); - - if (sgp != SGP_READ && !prealloc_page) { - prealloc_page = shmem_alloc_page(gfp, info, index); - if (prealloc_page) { - SetPageSwapBacked(prealloc_page); - if (mem_cgroup_cache_charge(prealloc_page, - current->mm, GFP_KERNEL)) { - page_cache_release(prealloc_page); - prealloc_page = NULL; - } - } - } + info = SHMEM_I(inode); + sbinfo = SHMEM_SB(inode->i_sb); - spin_lock(&info->lock); - shmem_recalc_inode(inode); - swap = shmem_get_swap(info, index); if (swap.val) { /* Look it up and read it in.. */ page = lookup_swap_cache(swap); if (!page) { - spin_unlock(&info->lock); /* here we actually do the io */ if (fault_type) *fault_type |= VM_FAULT_MAJOR; page = shmem_swapin(swap, gfp, info, index); if (!page) { - swp_entry_t nswap = shmem_get_swap(info, index); - if (nswap.val == swap.val) { - error = -ENOMEM; - goto out; - } - goto repeat; + error = -ENOMEM; + goto failed; } - wait_on_page_locked(page); - page_cache_release(page); - goto repeat; } /* We have to do this with page locked to prevent races */ - if (!trylock_page(page)) { - spin_unlock(&info->lock); - wait_on_page_locked(page); - page_cache_release(page); - goto repeat; - } - if (PageWriteback(page)) { - spin_unlock(&info->lock); - wait_on_page_writeback(page); - unlock_page(page); - page_cache_release(page); - goto repeat; - } + lock_page(page); if (!PageUptodate(page)) { - spin_unlock(&info->lock); - unlock_page(page); - page_cache_release(page); error = -EIO; - goto out; + goto failed; } - - error = add_to_page_cache_locked(page, mapping, - index, GFP_NOWAIT); - if (error) { - spin_unlock(&info->lock); - if (error == -ENOMEM) { - /* - * reclaim from proper memory cgroup and - * call memcg's OOM if needed. 
- */ - error = mem_cgroup_shmem_charge_fallback( - page, current->mm, gfp); - if (error) { - unlock_page(page); - page_cache_release(page); - goto out; - } - } - unlock_page(page); - page_cache_release(page); - goto repeat; + wait_on_page_writeback(page); + + /* Someone may have already done it for us */ + if (page->mapping) { + if (page->mapping == mapping && + page->index == index) + goto done; + error = -EEXIST; + goto failed; } - delete_from_swap_cache(page); - shmem_put_swap(info, index, (swp_entry_t){0}); + error = shmem_add_to_page_cache(page, mapping, index, + gfp, swp_to_radix_entry(swap)); + if (error) + goto failed; + + spin_lock(&info->lock); info->swapped--; + shmem_recalc_inode(inode); spin_unlock(&info->lock); + + delete_from_swap_cache(page); set_page_dirty(page); swap_free(swap); - } else if (sgp == SGP_READ) { - page = find_get_page(mapping, index); - if (page && !trylock_page(page)) { - spin_unlock(&info->lock); - wait_on_page_locked(page); - page_cache_release(page); - goto repeat; + } else { + if (shmem_acct_block(info->flags)) { + error = -ENOSPC; + goto failed; } - spin_unlock(&info->lock); - - } else if (prealloc_page) { - sbinfo = SHMEM_SB(inode->i_sb); if (sbinfo->max_blocks) { if (percpu_counter_compare(&sbinfo->used_blocks, - sbinfo->max_blocks) >= 0 || - shmem_acct_block(info->flags)) - goto nospace; + sbinfo->max_blocks) >= 0) { + error = -ENOSPC; + goto unacct; + } percpu_counter_inc(&sbinfo->used_blocks); - inode->i_blocks += BLOCKS_PER_PAGE; - } else if (shmem_acct_block(info->flags)) - goto nospace; - - page = prealloc_page; - prealloc_page = NULL; + } - swap = shmem_get_swap(info, index); - if (swap.val) - mem_cgroup_uncharge_cache_page(page); - else - error = add_to_page_cache_lru(page, mapping, - index, GFP_NOWAIT); - /* - * At add_to_page_cache_lru() failure, - * uncharge will be done automatically. - */ - if (swap.val || error) { - shmem_unacct_blocks(info->flags, 1); - shmem_free_blocks(inode, 1); - spin_unlock(&info->lock); - page_cache_release(page); - goto repeat; + page = shmem_alloc_page(gfp, info, index); + if (!page) { + error = -ENOMEM; + goto decused; } + SetPageSwapBacked(page); + __set_page_locked(page); + error = shmem_add_to_page_cache(page, mapping, index, + gfp, NULL); + if (error) + goto decused; + lru_cache_add_anon(page); + + spin_lock(&info->lock); info->alloced++; + inode->i_blocks += BLOCKS_PER_PAGE; + shmem_recalc_inode(inode); spin_unlock(&info->lock); + clear_highpage(page); flush_dcache_page(page); SetPageUptodate(page); if (sgp == SGP_DIRTY) set_page_dirty(page); - - } else { - spin_unlock(&info->lock); - error = -ENOMEM; - goto out; } done: - *pagep = page; - error = 0; -out: - if (prealloc_page) { - mem_cgroup_uncharge_cache_page(prealloc_page); - page_cache_release(prealloc_page); + /* Perhaps the file has been truncated since we checked */ + if (sgp != SGP_WRITE && + ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { + error = -EINVAL; + goto trunc; } - return error; + *pagep = page; + return 0; -nospace: /* - * Perhaps the page was brought in from swap between find_lock_page - * and taking info->lock? We allow for that at add_to_page_cache_lru, - * but must also avoid reporting a spurious ENOSPC while working on a - * full tmpfs. + * Error recovery. 
*/ - page = find_get_page(mapping, index); +trunc: + ClearPageDirty(page); + delete_from_page_cache(page); + spin_lock(&info->lock); + info->alloced--; + inode->i_blocks -= BLOCKS_PER_PAGE; spin_unlock(&info->lock); +decused: + if (sbinfo->max_blocks) + percpu_counter_add(&sbinfo->used_blocks, -1); +unacct: + shmem_unacct_blocks(info->flags, 1); +failed: + if (swap.val && error != -EINVAL) { + struct page *test = find_get_page(mapping, index); + if (test && !radix_tree_exceptional_entry(test)) + page_cache_release(test); + /* Have another try if the entry has changed */ + if (test != swp_to_radix_entry(swap)) + error = -EEXIST; + } if (page) { + unlock_page(page); page_cache_release(page); + } + if (error == -ENOSPC && !once++) { + info = SHMEM_I(inode); + spin_lock(&info->lock); + shmem_recalc_inode(inode); + spin_unlock(&info->lock); goto repeat; } - error = -ENOSPC; - goto out; + if (error == -EEXIST) + goto repeat; + return error; } static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) @@ -1095,9 +1064,6 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) int error; int ret = VM_FAULT_LOCKED; - if (((loff_t)vmf->pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode)) - return VM_FAULT_SIGBUS; - error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret); if (error) return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); @@ -2164,8 +2130,7 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data) if (config.max_inodes < inodes) goto out; /* - * Those tests also disallow limited->unlimited while any are in - * use, so i_blocks will always be zero when max_blocks is zero; + * Those tests disallow limited->unlimited while any are in use; * but we must separately disallow unlimited->limited, because * in that case we have no record of how much is already in use. */ -- cgit v1.2.3-70-g09d2 From aa3b189551ad8e5cc1d9c663735c131650238278 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 3 Aug 2011 16:21:24 -0700 Subject: tmpfs: convert mem_cgroup shmem to radix-swap Remove mem_cgroup_shmem_charge_fallback(): it was only required when we had to move swappage to filecache with GFP_NOWAIT. Remove the GFP_NOWAIT special case from mem_cgroup_cache_charge(), by moving its call out from shmem_add_to_page_cache() to two of thats three callers. But leave it doing mem_cgroup_uncharge_cache_page() on error: although asymmetrical, it's easier for all 3 callers to handle. These two changes would also be appropriate if anyone were to start using shmem_read_mapping_page_gfp() with GFP_NOWAIT. Remove mem_cgroup_get_shmem_target(): mc_handle_file_pte() can test radix_tree_exceptional_entry() to get what it needs for itself. 
Signed-off-by: Hugh Dickins Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 8 ----- include/linux/shmem_fs.h | 2 -- mm/memcontrol.c | 66 +++++------------------------------- mm/shmem.c | 83 ++++++---------------------------------------- 4 files changed, 20 insertions(+), 139 deletions(-) (limited to 'mm/shmem.c') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index b9660078691..3b535db00a9 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -86,8 +86,6 @@ extern void mem_cgroup_uncharge_end(void); extern void mem_cgroup_uncharge_page(struct page *page); extern void mem_cgroup_uncharge_cache_page(struct page *page); -extern int mem_cgroup_shmem_charge_fallback(struct page *page, - struct mm_struct *mm, gfp_t gfp_mask); extern void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask); int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem); @@ -225,12 +223,6 @@ static inline void mem_cgroup_uncharge_cache_page(struct page *page) { } -static inline int mem_cgroup_shmem_charge_fallback(struct page *page, - struct mm_struct *mm, gfp_t gfp_mask) -{ - return 0; -} - static inline void mem_cgroup_add_lru_list(struct page *page, int lru) { } diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 3f05795dcf7..0c8e952df59 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -57,8 +57,6 @@ extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp_mask); extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end); extern int shmem_unuse(swp_entry_t entry, struct page *page); -extern void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff, - struct page **pagep, swp_entry_t *ent); static inline struct page *shmem_read_mapping_page( struct address_space *mapping, pgoff_t index) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5f84d2351dd..f4ec4e7ca4c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -35,7 +35,6 @@ #include #include #include -#include #include #include #include @@ -2873,30 +2872,6 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, return 0; if (PageCompound(page)) return 0; - /* - * Corner case handling. This is called from add_to_page_cache() - * in usual. But some FS (shmem) precharges this page before calling it - * and call add_to_page_cache() with GFP_NOWAIT. - * - * For GFP_NOWAIT case, the page may be pre-charged before calling - * add_to_page_cache(). (See shmem.c) check it here and avoid to call - * charge twice. (It works but has to pay a bit larger cost.) - * And when the page is SwapCache, it should take swap information - * into account. This is under lock_page() now. - */ - if (!(gfp_mask & __GFP_WAIT)) { - struct page_cgroup *pc; - - pc = lookup_page_cgroup(page); - if (!pc) - return 0; - lock_page_cgroup(pc); - if (PageCgroupUsed(pc)) { - unlock_page_cgroup(pc); - return 0; - } - unlock_page_cgroup(pc); - } if (unlikely(!mm)) mm = &init_mm; @@ -3486,31 +3461,6 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem, cgroup_release_and_wakeup_rmdir(&mem->css); } -/* - * A call to try to shrink memory usage on charge failure at shmem's swapin. - * Calling hierarchical_reclaim is not enough because we should update - * last_oom_jiffies to prevent pagefault_out_of_memory from invoking global OOM. 
- * Moreover considering hierarchy, we should reclaim from the mem_over_limit, - * not from the memcg which this page would be charged to. - * try_charge_swapin does all of these works properly. - */ -int mem_cgroup_shmem_charge_fallback(struct page *page, - struct mm_struct *mm, - gfp_t gfp_mask) -{ - struct mem_cgroup *mem; - int ret; - - if (mem_cgroup_disabled()) - return 0; - - ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); - if (!ret) - mem_cgroup_cancel_charge_swapin(mem); /* it does !mem check */ - - return ret; -} - #ifdef CONFIG_DEBUG_VM static struct page_cgroup *lookup_page_cgroup_used(struct page *page) { @@ -5330,15 +5280,17 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, pgoff = pte_to_pgoff(ptent); /* page is moved even if it's not RSS of this task(page-faulted). */ - if (!mapping_cap_swap_backed(mapping)) { /* normal file */ - page = find_get_page(mapping, pgoff); - } else { /* shmem/tmpfs file. we should take account of swap too. */ - swp_entry_t ent; - mem_cgroup_get_shmem_target(inode, pgoff, &page, &ent); + page = find_get_page(mapping, pgoff); + +#ifdef CONFIG_SWAP + /* shmem/tmpfs may report page out on swap: account for that too. */ + if (radix_tree_exceptional_entry(page)) { + swp_entry_t swap = radix_to_swp_entry(page); if (do_swap_account) - entry->val = ent.val; + *entry = swap; + page = find_get_page(&swapper_space, swap.val); } - +#endif return page; } diff --git a/mm/shmem.c b/mm/shmem.c index 92f01d7cc15..13ef2d7e912 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -262,15 +262,11 @@ static int shmem_add_to_page_cache(struct page *page, struct address_space *mapping, pgoff_t index, gfp_t gfp, void *expected) { - int error; + int error = 0; VM_BUG_ON(!PageLocked(page)); VM_BUG_ON(!PageSwapBacked(page)); - error = mem_cgroup_cache_charge(page, current->mm, - gfp & GFP_RECLAIM_MASK); - if (error) - goto out; if (!expected) error = radix_tree_preload(gfp & GFP_RECLAIM_MASK); if (!error) { @@ -300,7 +296,6 @@ static int shmem_add_to_page_cache(struct page *page, } if (error) mem_cgroup_uncharge_cache_page(page); -out: return error; } @@ -660,7 +655,6 @@ int shmem_unuse(swp_entry_t swap, struct page *page) * Charge page using GFP_KERNEL while we can wait, before taking * the shmem_swaplist_mutex which might hold up shmem_writepage(). * Charged back to the user (not to caller) when swap account is used. - * shmem_add_to_page_cache() will be called with GFP_NOWAIT. 
*/ error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL); if (error) @@ -954,8 +948,11 @@ repeat: goto failed; } - error = shmem_add_to_page_cache(page, mapping, index, - gfp, swp_to_radix_entry(swap)); + error = mem_cgroup_cache_charge(page, current->mm, + gfp & GFP_RECLAIM_MASK); + if (!error) + error = shmem_add_to_page_cache(page, mapping, index, + gfp, swp_to_radix_entry(swap)); if (error) goto failed; @@ -990,8 +987,11 @@ repeat: SetPageSwapBacked(page); __set_page_locked(page); - error = shmem_add_to_page_cache(page, mapping, index, - gfp, NULL); + error = mem_cgroup_cache_charge(page, current->mm, + gfp & GFP_RECLAIM_MASK); + if (!error) + error = shmem_add_to_page_cache(page, mapping, index, + gfp, NULL); if (error) goto decused; lru_cache_add_anon(page); @@ -2442,42 +2442,6 @@ out4: return error; } -#ifdef CONFIG_CGROUP_MEM_RES_CTLR -/** - * mem_cgroup_get_shmem_target - find page or swap assigned to the shmem file - * @inode: the inode to be searched - * @index: the page offset to be searched - * @pagep: the pointer for the found page to be stored - * @swapp: the pointer for the found swap entry to be stored - * - * If a page is found, refcount of it is incremented. Callers should handle - * these refcount. - */ -void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t index, - struct page **pagep, swp_entry_t *swapp) -{ - struct shmem_inode_info *info = SHMEM_I(inode); - struct page *page = NULL; - swp_entry_t swap = {0}; - - if ((index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) - goto out; - - spin_lock(&info->lock); -#ifdef CONFIG_SWAP - swap = shmem_get_swap(info, index); - if (swap.val) - page = find_get_page(&swapper_space, swap.val); - else -#endif - page = find_get_page(inode->i_mapping, index); - spin_unlock(&info->lock); -out: - *pagep = page; - *swapp = swap; -} -#endif - #else /* !CONFIG_SHMEM */ /* @@ -2523,31 +2487,6 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) } EXPORT_SYMBOL_GPL(shmem_truncate_range); -#ifdef CONFIG_CGROUP_MEM_RES_CTLR -/** - * mem_cgroup_get_shmem_target - find page or swap assigned to the shmem file - * @inode: the inode to be searched - * @index: the page offset to be searched - * @pagep: the pointer for the found page to be stored - * @swapp: the pointer for the found swap entry to be stored - * - * If a page is found, refcount of it is incremented. Callers should handle - * these refcount. - */ -void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t index, - struct page **pagep, swp_entry_t *swapp) -{ - struct page *page = NULL; - - if ((index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) - goto out; - page = find_get_page(inode->i_mapping, index); -out: - *pagep = page; - *swapp = (swp_entry_t){0}; -} -#endif - #define shmem_vm_ops generic_file_vm_ops #define shmem_file_operations ramfs_file_operations #define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev) -- cgit v1.2.3-70-g09d2 From 6922c0c7abd387374255801f7739624867e8acad Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 3 Aug 2011 16:21:25 -0700 Subject: tmpfs: convert shmem_writepage and enable swap Convert shmem_writepage() to use shmem_delete_from_page_cache() to use shmem_radix_tree_replace() to substitute swap entry for page pointer atomically in the radix tree. As with shmem_add_to_page_cache(), it's not entirely satisfactory to be copying such code from delete_from_swap_cache, but again judged easier to sell than making its other callers go through the extras. 
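For illustration (a sketch distilled from shmem_delete_from_page_cache() in the diff below, not extra patch content): the substitution is a single tree_lock critical section, so a concurrent lookup sees either the page or the swap entry, never an empty slot:

	spin_lock_irq(&mapping->tree_lock);
	/* the swap entry takes over the very slot that held the page */
	error = shmem_radix_tree_replace(mapping, page->index, page,
					 swp_to_radix_entry(swap));
	page->mapping = NULL;
	mapping->nrpages--;
	__dec_zone_page_state(page, NR_FILE_PAGES);
	__dec_zone_page_state(page, NR_SHMEM);
	spin_unlock_irq(&mapping->tree_lock);
	page_cache_release(page);
	BUG_ON(error);
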
Remove the toy implementation's shmem_put_swap() and shmem_get_swap(), now unreferenced, and the hack to disable swap: it's now good to go. The way things have worked out, info->lock no longer helps to guard the shmem_swaplist: we increment swapped under shmem_swaplist_mutex only. That global mutex exclusion between shmem_writepage() and shmem_unuse() is not pretty, and we ought to find another way; but it's been forced on us by recent race discoveries, not a consequence of this patchset. And what has become of the WARN_ON_ONCE(1) free_swap_and_cache() if a swap entry was found already present? That's no longer possible, the (unknown) one inserting this page into filecache would hit the swap entry occupying that slot. Signed-off-by: Hugh Dickins Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 88 ++++++++++++++++++++++++++------------------------------------ 1 file changed, 37 insertions(+), 51 deletions(-) (limited to 'mm/shmem.c') diff --git a/mm/shmem.c b/mm/shmem.c index 13ef2d7e912..0f094a25852 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -6,7 +6,8 @@ * 2000-2001 Christoph Rohland * 2000-2001 SAP AG * 2002 Red Hat Inc. - * Copyright (C) 2002-2005 Hugh Dickins. + * Copyright (C) 2002-2011 Hugh Dickins. + * Copyright (C) 2011 Google Inc. * Copyright (C) 2002-2005 VERITAS Software Corporation. * Copyright (C) 2004 Andi Kleen, SuSE Labs * @@ -219,19 +220,6 @@ static void shmem_recalc_inode(struct inode *inode) } } -static void shmem_put_swap(struct shmem_inode_info *info, pgoff_t index, - swp_entry_t swap) -{ - if (index < SHMEM_NR_DIRECT) - info->i_direct[index] = swap; -} - -static swp_entry_t shmem_get_swap(struct shmem_inode_info *info, pgoff_t index) -{ - return (index < SHMEM_NR_DIRECT) ? - info->i_direct[index] : (swp_entry_t){0}; -} - /* * Replace item expected in radix tree by a new item, while holding tree lock. */ @@ -299,6 +287,25 @@ static int shmem_add_to_page_cache(struct page *page, return error; } +/* + * Like delete_from_page_cache, but substitutes swap for page. + */ +static void shmem_delete_from_page_cache(struct page *page, void *radswap) +{ + struct address_space *mapping = page->mapping; + int error; + + spin_lock_irq(&mapping->tree_lock); + error = shmem_radix_tree_replace(mapping, page->index, page, radswap); + page->mapping = NULL; + mapping->nrpages--; + __dec_zone_page_state(page, NR_FILE_PAGES); + __dec_zone_page_state(page, NR_SHMEM); + spin_unlock_irq(&mapping->tree_lock); + page_cache_release(page); + BUG_ON(error); +} + /* * Like find_get_pages, but collecting swap entries as well as pages. 
*/ @@ -664,14 +671,10 @@ int shmem_unuse(swp_entry_t swap, struct page *page) mutex_lock(&shmem_swaplist_mutex); list_for_each_safe(this, next, &shmem_swaplist) { info = list_entry(this, struct shmem_inode_info, swaplist); - if (!info->swapped) { - spin_lock(&info->lock); - if (!info->swapped) - list_del_init(&info->swaplist); - spin_unlock(&info->lock); - } if (info->swapped) found = shmem_unuse_inode(info, swap, page); + else + list_del_init(&info->swaplist); cond_resched(); if (found) break; @@ -694,10 +697,10 @@ out: static int shmem_writepage(struct page *page, struct writeback_control *wbc) { struct shmem_inode_info *info; - swp_entry_t swap, oswap; struct address_space *mapping; - pgoff_t index; struct inode *inode; + swp_entry_t swap; + pgoff_t index; BUG_ON(!PageLocked(page)); mapping = page->mapping; @@ -720,55 +723,38 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) WARN_ON_ONCE(1); /* Still happens? Tell us about it! */ goto redirty; } - - /* - * Disable even the toy swapping implementation, while we convert - * functions one by one to having swap entries in the radix tree. - */ - if (index < ULONG_MAX) - goto redirty; - swap = get_swap_page(); if (!swap.val) goto redirty; /* * Add inode to shmem_unuse()'s list of swapped-out inodes, - * if it's not already there. Do it now because we cannot take - * mutex while holding spinlock, and must do so before the page - * is moved to swap cache, when its pagelock no longer protects + * if it's not already there. Do it now before the page is + * moved to swap cache, when its pagelock no longer protects * the inode from eviction. But don't unlock the mutex until - * we've taken the spinlock, because shmem_unuse_inode() will - * prune a !swapped inode from the swaplist under both locks. + * we've incremented swapped, because shmem_unuse_inode() will + * prune a !swapped inode from the swaplist under this mutex. */ mutex_lock(&shmem_swaplist_mutex); if (list_empty(&info->swaplist)) list_add_tail(&info->swaplist, &shmem_swaplist); - spin_lock(&info->lock); - mutex_unlock(&shmem_swaplist_mutex); - - oswap = shmem_get_swap(info, index); - if (oswap.val) { - WARN_ON_ONCE(1); /* Still happens? Tell us about it! */ - free_swap_and_cache(oswap); - shmem_put_swap(info, index, (swp_entry_t){0}); - info->swapped--; - } - shmem_recalc_inode(inode); - if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) { - delete_from_page_cache(page); - shmem_put_swap(info, index, swap); - info->swapped++; swap_shmem_alloc(swap); + shmem_delete_from_page_cache(page, swp_to_radix_entry(swap)); + + spin_lock(&info->lock); + info->swapped++; + shmem_recalc_inode(inode); spin_unlock(&info->lock); + + mutex_unlock(&shmem_swaplist_mutex); BUG_ON(page_mapped(page)); swap_writepage(page, wbc); return 0; } - spin_unlock(&info->lock); + mutex_unlock(&shmem_swaplist_mutex); swapcache_free(swap, NULL); redirty: set_page_dirty(page); -- cgit v1.2.3-70-g09d2 From 69f07ec938712b58755add82dd3d0b35f01317cc Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 3 Aug 2011 16:21:26 -0700 Subject: tmpfs: use kmemdup for short symlinks But we've not yet removed the old swp_entry_t i_direct[16] from shmem_inode_info. That's because it was still being shared with the inline symlink. Remove it now (saving 64 or 128 bytes from shmem inode size), and use kmemdup() for short symlinks, say, those up to 128 bytes. I wonder why mpol_free_shared_policy() is done in shmem_destroy_inode() rather than shmem_evict_inode(), where we usually do such freeing? 
I guess it doesn't matter, and I'm not into NUMA mpol testing right now. Signed-off-by: Hugh Dickins Acked-by: Rik van Riel Reviewed-by: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/shmem_fs.h | 11 +++-------- mm/shmem.c | 31 ++++++++++++++++++------------- 2 files changed, 21 insertions(+), 21 deletions(-) (limited to 'mm/shmem.c') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 0c8e952df59..9291ac3cc62 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -8,20 +8,15 @@ /* inode in-kernel data */ -#define SHMEM_NR_DIRECT 16 - -#define SHMEM_SYMLINK_INLINE_LEN (SHMEM_NR_DIRECT * sizeof(swp_entry_t)) - struct shmem_inode_info { spinlock_t lock; unsigned long flags; unsigned long alloced; /* data pages alloced to file */ - unsigned long swapped; /* subtotal assigned to swap */ - struct shared_policy policy; /* NUMA memory alloc policy */ union { - swp_entry_t i_direct[SHMEM_NR_DIRECT]; /* first blocks */ - char inline_symlink[SHMEM_SYMLINK_INLINE_LEN]; + unsigned long swapped; /* subtotal assigned to swap */ + char *symlink; /* unswappable short symlink */ }; + struct shared_policy policy; /* NUMA memory alloc policy */ struct list_head swaplist; /* chain of maybes on swap */ struct list_head xattr_list; /* list of shmem_xattr */ struct inode vfs_inode; diff --git a/mm/shmem.c b/mm/shmem.c index 0f094a25852..3a5be0feb6a 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -73,6 +73,9 @@ static struct vfsmount *shm_mnt; /* Pretend that each entry is of this size in directory's i_size */ #define BOGO_DIRENT_SIZE 20 +/* Symlink up to this size is kmalloc'ed instead of using a swappable page */ +#define SHORT_SYMLINK_LEN 128 + struct shmem_xattr { struct list_head list; /* anchored by shmem_inode_info->xattr_list */ char *name; /* xattr name */ @@ -585,7 +588,8 @@ static void shmem_evict_inode(struct inode *inode) list_del_init(&info->swaplist); mutex_unlock(&shmem_swaplist_mutex); } - } + } else + kfree(info->symlink); list_for_each_entry_safe(xattr, nxattr, &info->xattr_list, list) { kfree(xattr->name); @@ -1173,7 +1177,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode #ifdef CONFIG_TMPFS static const struct inode_operations shmem_symlink_inode_operations; -static const struct inode_operations shmem_symlink_inline_operations; +static const struct inode_operations shmem_short_symlink_operations; static int shmem_write_begin(struct file *file, struct address_space *mapping, @@ -1638,10 +1642,13 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s info = SHMEM_I(inode); inode->i_size = len-1; - if (len <= SHMEM_SYMLINK_INLINE_LEN) { - /* do it inline */ - memcpy(info->inline_symlink, symname, len); - inode->i_op = &shmem_symlink_inline_operations; + if (len <= SHORT_SYMLINK_LEN) { + info->symlink = kmemdup(symname, len, GFP_KERNEL); + if (!info->symlink) { + iput(inode); + return -ENOMEM; + } + inode->i_op = &shmem_short_symlink_operations; } else { error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL); if (error) { @@ -1664,9 +1671,9 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s return 0; } -static void *shmem_follow_link_inline(struct dentry *dentry, struct nameidata *nd) +static void *shmem_follow_short_symlink(struct dentry *dentry, struct nameidata *nd) { - nd_set_link(nd, SHMEM_I(dentry->d_inode)->inline_symlink); + nd_set_link(nd, SHMEM_I(dentry->d_inode)->symlink); return NULL; } @@ -1914,9 +1921,9 @@ static 
ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size) } #endif /* CONFIG_TMPFS_XATTR */ -static const struct inode_operations shmem_symlink_inline_operations = { +static const struct inode_operations shmem_short_symlink_operations = { .readlink = generic_readlink, - .follow_link = shmem_follow_link_inline, + .follow_link = shmem_follow_short_symlink, #ifdef CONFIG_TMPFS_XATTR .setxattr = shmem_setxattr, .getxattr = shmem_getxattr, @@ -2259,10 +2266,8 @@ static void shmem_destroy_callback(struct rcu_head *head) static void shmem_destroy_inode(struct inode *inode) { - if ((inode->i_mode & S_IFMT) == S_IFREG) { - /* only struct inode is valid if it's an inline symlink */ + if ((inode->i_mode & S_IFMT) == S_IFREG) mpol_free_shared_policy(&SHMEM_I(inode)->policy); - } call_rcu(&inode->i_rcu, shmem_destroy_callback); } -- cgit v1.2.3-70-g09d2 From e504f3fdd63d486d45b18009e5a65f2e329acb0a Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 3 Aug 2011 16:21:27 -0700 Subject: tmpfs radix_tree: locate_item to speed up swapoff We have already acknowledged that swapoff of a tmpfs file is slower than it was before conversion to the generic radix_tree: a little slower there will be acceptable, if the hotter paths are faster. But it was a shock to find swapoff of a 500MB file 20 times slower on my laptop, taking 10 minutes; and at that rate it significantly slows down my testing. Now, most of that turned out to be overhead from PROVE_LOCKING and PROVE_RCU: without those it was only 4 times slower than before; and more realistic tests on other machines don't fare as badly. I've tried a number of things to improve it, including tagging the swap entries, then doing lookup by tag: I'd expected that to halve the time, but in practice it's erratic, and often counter-productive. The only change I've so far found to make a consistent improvement, is to short-circuit the way we go back and forth, gang lookup packing entries into the array supplied, then shmem scanning that array for the target entry. Scanning in place doubles the speed, so it's now only twice as slow as before (or three times slower when the PROVEs are on). So, add radix_tree_locate_item() as an expedient, once-off, single-caller hack to do the lookup directly in place. #ifdef it on CONFIG_SHMEM and CONFIG_SWAP, as much to document its limited applicability as save space in other configurations. And, sadly, #include sched.h for cond_resched(). 
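A usage sketch (hypothetical fragment, mirroring shmem_unuse_inode() in the diff below; -1 is the not-found return): the caller holds no lock across the call, and must recheck the slot under lock afterwards, since the tree may change while we search:

	void *radswap = swp_to_radix_entry(swap);
	pgoff_t index;

	/* linear search over the whole tree; cond_resched()s internally */
	index = radix_tree_locate_item(&mapping->page_tree, radswap);
	if (index == -1)
		return 0;	/* swap entry not (or no longer) there */
	/* revalidate: e.g. shmem_radix_tree_replace() rechecks the slot */
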
Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 1 + lib/radix-tree.c | 92 ++++++++++++++++++++++++++++++++++++++++++++++ mm/shmem.c | 38 +------------------ 3 files changed, 94 insertions(+), 37 deletions(-) (limited to 'mm/shmem.c') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index b7edf825145..9d4539c52e5 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -252,6 +252,7 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root, unsigned long nr_to_tag, unsigned int fromtag, unsigned int totag); int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag); +unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item); static inline void radix_tree_preload_end(void) { diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 348eaefbed7..a2f9da59c19 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -1203,6 +1203,98 @@ radix_tree_gang_lookup_tag_slot(struct radix_tree_root *root, void ***results, } EXPORT_SYMBOL(radix_tree_gang_lookup_tag_slot); +#if defined(CONFIG_SHMEM) && defined(CONFIG_SWAP) +#include /* for cond_resched() */ + +/* + * This linear search is at present only useful to shmem_unuse_inode(). + */ +static unsigned long __locate(struct radix_tree_node *slot, void *item, + unsigned long index, unsigned long *found_index) +{ + unsigned int shift, height; + unsigned long i; + + height = slot->height; + shift = (height-1) * RADIX_TREE_MAP_SHIFT; + + for ( ; height > 1; height--) { + i = (index >> shift) & RADIX_TREE_MAP_MASK; + for (;;) { + if (slot->slots[i] != NULL) + break; + index &= ~((1UL << shift) - 1); + index += 1UL << shift; + if (index == 0) + goto out; /* 32-bit wraparound */ + i++; + if (i == RADIX_TREE_MAP_SIZE) + goto out; + } + + shift -= RADIX_TREE_MAP_SHIFT; + slot = rcu_dereference_raw(slot->slots[i]); + if (slot == NULL) + goto out; + } + + /* Bottom level: check items */ + for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) { + if (slot->slots[i] == item) { + *found_index = index + i; + index = 0; + goto out; + } + } + index += RADIX_TREE_MAP_SIZE; +out: + return index; +} + +/** + * radix_tree_locate_item - search through radix tree for item + * @root: radix tree root + * @item: item to be found + * + * Returns index where item was found, or -1 if not found. + * Caller must hold no lock (since this time-consuming function needs + * to be preemptible), and must check afterwards if item is still there. 
+ */ +unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item) +{ + struct radix_tree_node *node; + unsigned long max_index; + unsigned long cur_index = 0; + unsigned long found_index = -1; + + do { + rcu_read_lock(); + node = rcu_dereference_raw(root->rnode); + if (!radix_tree_is_indirect_ptr(node)) { + rcu_read_unlock(); + if (node == item) + found_index = 0; + break; + } + + node = indirect_to_ptr(node); + max_index = radix_tree_maxindex(node->height); + if (cur_index > max_index) + break; + + cur_index = __locate(node, item, cur_index, &found_index); + rcu_read_unlock(); + cond_resched(); + } while (cur_index != 0 && cur_index <= max_index); + + return found_index; +} +#else +unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item) +{ + return -1; +} +#endif /* CONFIG_SHMEM && CONFIG_SWAP */ /** * radix_tree_shrink - shrink height of a radix tree to minimal diff --git a/mm/shmem.c b/mm/shmem.c index 3a5be0feb6a..1c702f6f124 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -356,42 +356,6 @@ export: return ret; } -/* - * Lockless lookup of swap entry in radix tree, avoiding refcount on pages. - */ -static pgoff_t shmem_find_swap(struct address_space *mapping, void *radswap) -{ - void **slots[PAGEVEC_SIZE]; - pgoff_t indices[PAGEVEC_SIZE]; - unsigned int nr_found; - -restart: - nr_found = 1; - indices[0] = -1; - while (nr_found) { - pgoff_t index = indices[nr_found - 1] + 1; - unsigned int i; - - rcu_read_lock(); - nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, - slots, indices, index, PAGEVEC_SIZE); - for (i = 0; i < nr_found; i++) { - void *item = radix_tree_deref_slot(slots[i]); - if (radix_tree_deref_retry(item)) { - rcu_read_unlock(); - goto restart; - } - if (item == radswap) { - rcu_read_unlock(); - return indices[i]; - } - } - rcu_read_unlock(); - cond_resched(); - } - return -1; -} - /* * Remove swap entry from radix tree, free the swap and its page cache. */ @@ -612,7 +576,7 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, int error; radswap = swp_to_radix_entry(swap); - index = shmem_find_swap(mapping, radswap); + index = radix_tree_locate_item(&mapping->page_tree, radswap); if (index == -1) return 0; -- cgit v1.2.3-70-g09d2 From 8079b1c859c44f27d63da4951f5038a16589a563 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 3 Aug 2011 16:21:28 -0700 Subject: mm: clarify the radix_tree exceptional cases Make the radix_tree exceptional cases, mostly in filemap.c, clearer. It's hard to devise a suitable snappy name that illuminates the use by shmem/tmpfs for swap, while keeping filemap/pagecache/radix_tree generality. And akpm points out that /* radix_tree_deref_retry(page) */ comments look like calls that have been commented out for unknown reason. Skirt the naming difficulty by rearranging these blocks to handle the transient radix_tree_deref_retry(page) case first; then just explain the remaining shmem/tmpfs swap case in a comment. 
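Distilled, the shape that every such lookup now follows (sketch only, not literal patch code; pslot stands for the caller's slot pointer):

	struct page *page;

	rcu_read_lock();
repeat:
	page = radix_tree_deref_slot(pslot);
	if (radix_tree_exception(page)) {
		if (radix_tree_deref_retry(page))
			goto repeat;	/* transient: entry is moving */
		/*
		 * A genuine exceptional entry: shmem/tmpfs swap.  Each
		 * caller decides whether to export, skip or stop here.
		 */
	}
	rcu_read_unlock();
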
Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 66 ++++++++++++++++++++++++++++++++++++++++-------------------- mm/mincore.c | 1 + mm/shmem.c | 12 +++++++---- 3 files changed, 53 insertions(+), 26 deletions(-) (limited to 'mm/shmem.c') diff --git a/mm/filemap.c b/mm/filemap.c index 96778faf82d..645a080ba4d 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -700,10 +700,14 @@ repeat: if (unlikely(!page)) goto out; if (radix_tree_exception(page)) { - if (radix_tree_exceptional_entry(page)) - goto out; - /* radix_tree_deref_retry(page) */ - goto repeat; + if (radix_tree_deref_retry(page)) + goto repeat; + /* + * Otherwise, shmem/tmpfs must be storing a swap entry + * here as an exceptional entry: so return it without + * attempting to raise page count. + */ + goto out; } if (!page_cache_get_speculative(page)) goto repeat; @@ -838,15 +842,21 @@ repeat: continue; if (radix_tree_exception(page)) { - if (radix_tree_exceptional_entry(page)) - continue; + if (radix_tree_deref_retry(page)) { + /* + * Transient condition which can only trigger + * when entry at index 0 moves out of or back + * to root: none yet gotten, safe to restart. + */ + WARN_ON(start | i); + goto restart; + } /* - * radix_tree_deref_retry(page): - * can only trigger when entry at index 0 moves out of - * or back to root: none yet gotten, safe to restart. + * Otherwise, shmem/tmpfs must be storing a swap entry + * here as an exceptional entry: so skip over it - + * we only reach this from invalidate_mapping_pages(). */ - WARN_ON(start | i); - goto restart; + continue; } if (!page_cache_get_speculative(page)) @@ -904,14 +914,20 @@ repeat: continue; if (radix_tree_exception(page)) { - if (radix_tree_exceptional_entry(page)) - break; + if (radix_tree_deref_retry(page)) { + /* + * Transient condition which can only trigger + * when entry at index 0 moves out of or back + * to root: none yet gotten, safe to restart. + */ + goto restart; + } /* - * radix_tree_deref_retry(page): - * can only trigger when entry at index 0 moves out of - * or back to root: none yet gotten, safe to restart. + * Otherwise, shmem/tmpfs must be storing a swap entry + * here as an exceptional entry: so stop looking for + * contiguous pages. */ - goto restart; + break; } if (!page_cache_get_speculative(page)) @@ -973,13 +989,19 @@ repeat: continue; if (radix_tree_exception(page)) { - BUG_ON(radix_tree_exceptional_entry(page)); + if (radix_tree_deref_retry(page)) { + /* + * Transient condition which can only trigger + * when entry at index 0 moves out of or back + * to root: none yet gotten, safe to restart. + */ + goto restart; + } /* - * radix_tree_deref_retry(page): - * can only trigger when entry at index 0 moves out of - * or back to root: none yet gotten, safe to restart. + * This function is never used on a shmem/tmpfs + * mapping, so a swap entry won't be found here. */ - goto restart; + BUG(); } if (!page_cache_get_speculative(page)) diff --git a/mm/mincore.c b/mm/mincore.c index 733f1829b0d..636a86876ff 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -72,6 +72,7 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff) */ page = find_get_page(mapping, pgoff); #ifdef CONFIG_SWAP + /* shmem/tmpfs may return swap: account for swapcache page too. 
*/ if (radix_tree_exceptional_entry(page)) { swp_entry_t swap = radix_to_swp_entry(page); page = find_get_page(&swapper_space, swap.val); diff --git a/mm/shmem.c b/mm/shmem.c index 1c702f6f124..32f6763f16f 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -332,10 +332,14 @@ repeat: if (unlikely(!page)) continue; if (radix_tree_exception(page)) { - if (radix_tree_exceptional_entry(page)) - goto export; - /* radix_tree_deref_retry(page) */ - goto restart; + if (radix_tree_deref_retry(page)) + goto restart; + /* + * Otherwise, we must be storing a swap entry + * here as an exceptional entry: so return it + * without attempting to raise page count. + */ + goto export; } if (!page_cache_get_speculative(page)) goto repeat; -- cgit v1.2.3-70-g09d2
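For reference, the swp_entry_t to radix_tree conversions used throughout come from a companion patch in the same series ("mm: let swap use exceptional entries"), not shown in the diffs above; to the best of my reading they amount to tagging the swap value with the exceptional bit:

	#define RADIX_TREE_EXCEPTIONAL_ENTRY	2
	#define RADIX_TREE_EXCEPTIONAL_SHIFT	2

	static inline void *swp_to_radix_entry(swp_entry_t entry)
	{
		unsigned long value = entry.val << RADIX_TREE_EXCEPTIONAL_SHIFT;

		return (void *)(value | RADIX_TREE_EXCEPTIONAL_ENTRY);
	}

	static inline swp_entry_t radix_to_swp_entry(void *arg)
	{
		swp_entry_t entry;

		entry.val = (unsigned long)arg >> RADIX_TREE_EXCEPTIONAL_SHIFT;
		return entry;
	}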