From 285b2c4fdd69ea73b4762785d8c6be83b6c074a6 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 3 Aug 2011 16:21:20 -0700 Subject: tmpfs: demolish old swap vector support The maximum size of a shmem/tmpfs file has been limited by the maximum size of its triple-indirect swap vector. With 4kB page size, maximum filesize was just over 2TB on a 32-bit kernel, but sadly one eighth of that on a 64-bit kernel. (With 8kB page size, maximum filesize was just over 4TB on a 64-bit kernel, but 16TB on a 32-bit kernel, MAX_LFS_FILESIZE being then more restrictive than swap vector layout.) It's a shame that tmpfs should be more restrictive than ramfs, and this limitation has now been noticed. Add another level to the swap vector? No, it became obscure and hard to maintain, once I complicated it to make use of highmem pages nine years ago: better choose another way. Surely, if 2.4 had had the radix tree pagecache introduced in 2.5, then tmpfs would never have invented its own peculiar radix tree: we would have fitted swap entries into the common radix tree instead, in much the same way as we fit swap entries into page tables. And why should each file have a separate radix tree for its pages and for its swap entries? The swap entries are required precisely where and when the pages are not. We want to put them together in a single radix tree: which can then avoid much of the locking which was needed to prevent them from being exchanged underneath us. This also avoids the waste of memory devoted to swap vectors, first in the shmem_inode itself, then at least two more pages once a file grew beyond 16 data pages (pages accounted by df and du, but not by memcg). Allocated upfront, to avoid allocation when under swapping pressure, but pure waste when CONFIG_SWAP is not set - I have never spattered around the ifdefs to prevent that, preferring this move to sharing the common radix tree instead. There are three downsides to sharing the radix tree. One, that it binds tmpfs more tightly to the rest of mm, either requiring knowledge of swap entries in radix tree there, or duplication of its code here in shmem.c. I believe that the simplications and memory savings (and probable higher performance, not yet measured) justify that. Two, that on HIGHMEM systems with SWAP enabled, it's the lowmem radix nodes that cannot be freed under memory pressure - whereas before it was the less precious highmem swap vector pages that could not be freed. I'm hoping that 64-bit has now been accessible for long enough, that the highmem argument has grown much less persuasive. Three, that swapoff is slower than it used to be on tmpfs files, since it's using a simple generic mechanism not tailored to it: I find this noticeable, and shall want to improve, but maybe nobody else will notice. So... now remove most of the old swap vector code from shmem.c. But, for the moment, keep the simple i_direct vector of 16 pages, with simple accessors shmem_put_swap() and shmem_get_swap(), as a toy implementation to help mark where swap needs to be handled in subsequent patches. Signed-off-by: Hugh Dickins Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/shmem_fs.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux/shmem_fs.h') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index aa08fa8fd79..80b695213fd 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -17,9 +17,7 @@ struct shmem_inode_info { unsigned long flags; unsigned long alloced; /* data pages alloced to file */ unsigned long swapped; /* subtotal assigned to swap */ - unsigned long next_index; /* highest alloced index + 1 */ struct shared_policy policy; /* NUMA memory alloc policy */ - struct page *i_indirect; /* top indirect blocks page */ union { swp_entry_t i_direct[SHMEM_NR_DIRECT]; /* first blocks */ char inline_symlink[SHMEM_SYMLINK_INLINE_LEN]; -- cgit v1.2.3-70-g09d2 From 41ffe5d5ceef7f7ff2ff18e320d88ca6d629efaf Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 3 Aug 2011 16:21:21 -0700 Subject: tmpfs: miscellaneous trivial cleanups While it's at its least, make a number of boring nitpicky cleanups to shmem.c, mostly for consistency of variable naming. Things like "swap" instead of "entry", "pgoff_t index" instead of "unsigned long idx". And since everything else here is prefixed "shmem_", better change init_tmpfs() to shmem_init(). Signed-off-by: Hugh Dickins Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/shmem_fs.h | 2 +- init/main.c | 2 +- mm/shmem.c | 216 +++++++++++++++++++++++------------------------ 3 files changed, 109 insertions(+), 111 deletions(-) (limited to 'include/linux/shmem_fs.h') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 80b695213fd..3f05795dcf7 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -47,7 +47,7 @@ static inline struct shmem_inode_info *SHMEM_I(struct inode *inode) /* * Functions in mm/shmem.c called directly from elsewhere: */ -extern int init_tmpfs(void); +extern int shmem_init(void); extern int shmem_fill_super(struct super_block *sb, void *data, int silent); extern struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags); diff --git a/init/main.c b/init/main.c index d7211faed2a..1952d37e4ec 100644 --- a/init/main.c +++ b/init/main.c @@ -715,7 +715,7 @@ static void __init do_basic_setup(void) { cpuset_init_smp(); usermodehelper_init(); - init_tmpfs(); + shmem_init(); driver_init(); init_irq_proc(); do_ctors(); diff --git a/mm/shmem.c b/mm/shmem.c index 5574b00ca77..24e95ac1605 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -28,7 +28,6 @@ #include #include #include -#include #include static struct vfsmount *shm_mnt; @@ -51,6 +50,7 @@ static struct vfsmount *shm_mnt; #include #include #include +#include #include #include #include @@ -63,7 +63,6 @@ static struct vfsmount *shm_mnt; #include #include -#include #include #define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512) @@ -201,7 +200,7 @@ static void shmem_free_inode(struct super_block *sb) } /** - * shmem_recalc_inode - recalculate the size of an inode + * shmem_recalc_inode - recalculate the block usage of an inode * @inode: inode to recalc * * We have to calculate the free blocks since the mm can drop @@ -356,19 +355,20 @@ static void shmem_evict_inode(struct inode *inode) end_writeback(inode); } -static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, struct page *page) +static int shmem_unuse_inode(struct shmem_inode_info *info, + swp_entry_t swap, struct page *page) { struct address_space *mapping = info->vfs_inode.i_mapping; - unsigned long idx; + pgoff_t index; int error; - for (idx = 0; idx < SHMEM_NR_DIRECT; idx++) - if (shmem_get_swap(info, idx).val == entry.val) + for (index = 0; index < SHMEM_NR_DIRECT; index++) + if (shmem_get_swap(info, index).val == swap.val) goto found; return 0; found: spin_lock(&info->lock); - if (shmem_get_swap(info, idx).val != entry.val) { + if (shmem_get_swap(info, index).val != swap.val) { spin_unlock(&info->lock); return 0; } @@ -387,15 +387,15 @@ found: * but also to hold up shmem_evict_inode(): so inode cannot be freed * beneath us (pagelock doesn't help until the page is in pagecache). */ - error = add_to_page_cache_locked(page, mapping, idx, GFP_NOWAIT); + error = add_to_page_cache_locked(page, mapping, index, GFP_NOWAIT); /* which does mem_cgroup_uncharge_cache_page on error */ if (error != -ENOMEM) { delete_from_swap_cache(page); set_page_dirty(page); - shmem_put_swap(info, idx, (swp_entry_t){0}); + shmem_put_swap(info, index, (swp_entry_t){0}); info->swapped--; - swap_free(entry); + swap_free(swap); error = 1; /* not an error, but entry was found */ } spin_unlock(&info->lock); @@ -405,9 +405,9 @@ found: /* * shmem_unuse() search for an eventually swapped out shmem page. */ -int shmem_unuse(swp_entry_t entry, struct page *page) +int shmem_unuse(swp_entry_t swap, struct page *page) { - struct list_head *p, *next; + struct list_head *this, *next; struct shmem_inode_info *info; int found = 0; int error; @@ -432,8 +432,8 @@ int shmem_unuse(swp_entry_t entry, struct page *page) radix_tree_preload_end(); mutex_lock(&shmem_swaplist_mutex); - list_for_each_safe(p, next, &shmem_swaplist) { - info = list_entry(p, struct shmem_inode_info, swaplist); + list_for_each_safe(this, next, &shmem_swaplist) { + info = list_entry(this, struct shmem_inode_info, swaplist); if (!info->swapped) { spin_lock(&info->lock); if (!info->swapped) @@ -441,7 +441,7 @@ int shmem_unuse(swp_entry_t entry, struct page *page) spin_unlock(&info->lock); } if (info->swapped) - found = shmem_unuse_inode(info, entry, page); + found = shmem_unuse_inode(info, swap, page); cond_resched(); if (found) break; @@ -467,7 +467,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) struct shmem_inode_info *info; swp_entry_t swap, oswap; struct address_space *mapping; - unsigned long index; + pgoff_t index; struct inode *inode; BUG_ON(!PageLocked(page)); @@ -577,35 +577,33 @@ static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) } #endif /* CONFIG_TMPFS */ -static struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp, - struct shmem_inode_info *info, unsigned long idx) +static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, + struct shmem_inode_info *info, pgoff_t index) { struct mempolicy mpol, *spol; struct vm_area_struct pvma; - struct page *page; spol = mpol_cond_copy(&mpol, - mpol_shared_policy_lookup(&info->policy, idx)); + mpol_shared_policy_lookup(&info->policy, index)); /* Create a pseudo vma that just contains the policy */ pvma.vm_start = 0; - pvma.vm_pgoff = idx; + pvma.vm_pgoff = index; pvma.vm_ops = NULL; pvma.vm_policy = spol; - page = swapin_readahead(entry, gfp, &pvma, 0); - return page; + return swapin_readahead(swap, gfp, &pvma, 0); } static struct page *shmem_alloc_page(gfp_t gfp, - struct shmem_inode_info *info, unsigned long idx) + struct shmem_inode_info *info, pgoff_t index) { struct vm_area_struct pvma; /* Create a pseudo vma that just contains the policy */ pvma.vm_start = 0; - pvma.vm_pgoff = idx; + pvma.vm_pgoff = index; pvma.vm_ops = NULL; - pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx); + pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index); /* * alloc_page_vma() will drop the shared policy reference @@ -614,19 +612,19 @@ static struct page *shmem_alloc_page(gfp_t gfp, } #else /* !CONFIG_NUMA */ #ifdef CONFIG_TMPFS -static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *p) +static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) { } #endif /* CONFIG_TMPFS */ -static inline struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp, - struct shmem_inode_info *info, unsigned long idx) +static inline struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, + struct shmem_inode_info *info, pgoff_t index) { - return swapin_readahead(entry, gfp, NULL, 0); + return swapin_readahead(swap, gfp, NULL, 0); } static inline struct page *shmem_alloc_page(gfp_t gfp, - struct shmem_inode_info *info, unsigned long idx) + struct shmem_inode_info *info, pgoff_t index) { return alloc_page(gfp); } @@ -646,7 +644,7 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) * vm. If we swap it in we mark it dirty since we also free the swap * entry since a page cannot live in both the swap and page cache */ -static int shmem_getpage_gfp(struct inode *inode, pgoff_t idx, +static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type) { struct address_space *mapping = inode->i_mapping; @@ -657,10 +655,10 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t idx, swp_entry_t swap; int error; - if (idx > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT)) + if (index > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT)) return -EFBIG; repeat: - page = find_lock_page(mapping, idx); + page = find_lock_page(mapping, index); if (page) { /* * Once we can get the page lock, it must be uptodate: @@ -681,7 +679,7 @@ repeat: radix_tree_preload_end(); if (sgp != SGP_READ && !prealloc_page) { - prealloc_page = shmem_alloc_page(gfp, info, idx); + prealloc_page = shmem_alloc_page(gfp, info, index); if (prealloc_page) { SetPageSwapBacked(prealloc_page); if (mem_cgroup_cache_charge(prealloc_page, @@ -694,7 +692,7 @@ repeat: spin_lock(&info->lock); shmem_recalc_inode(inode); - swap = shmem_get_swap(info, idx); + swap = shmem_get_swap(info, index); if (swap.val) { /* Look it up and read it in.. */ page = lookup_swap_cache(swap); @@ -703,9 +701,9 @@ repeat: /* here we actually do the io */ if (fault_type) *fault_type |= VM_FAULT_MAJOR; - page = shmem_swapin(swap, gfp, info, idx); + page = shmem_swapin(swap, gfp, info, index); if (!page) { - swp_entry_t nswap = shmem_get_swap(info, idx); + swp_entry_t nswap = shmem_get_swap(info, index); if (nswap.val == swap.val) { error = -ENOMEM; goto out; @@ -740,7 +738,7 @@ repeat: } error = add_to_page_cache_locked(page, mapping, - idx, GFP_NOWAIT); + index, GFP_NOWAIT); if (error) { spin_unlock(&info->lock); if (error == -ENOMEM) { @@ -762,14 +760,14 @@ repeat: } delete_from_swap_cache(page); - shmem_put_swap(info, idx, (swp_entry_t){0}); + shmem_put_swap(info, index, (swp_entry_t){0}); info->swapped--; spin_unlock(&info->lock); set_page_dirty(page); swap_free(swap); } else if (sgp == SGP_READ) { - page = find_get_page(mapping, idx); + page = find_get_page(mapping, index); if (page && !trylock_page(page)) { spin_unlock(&info->lock); wait_on_page_locked(page); @@ -793,12 +791,12 @@ repeat: page = prealloc_page; prealloc_page = NULL; - swap = shmem_get_swap(info, idx); + swap = shmem_get_swap(info, index); if (swap.val) mem_cgroup_uncharge_cache_page(page); else error = add_to_page_cache_lru(page, mapping, - idx, GFP_NOWAIT); + index, GFP_NOWAIT); /* * At add_to_page_cache_lru() failure, * uncharge will be done automatically. @@ -841,7 +839,7 @@ nospace: * but must also avoid reporting a spurious ENOSPC while working on a * full tmpfs. */ - page = find_get_page(mapping, idx); + page = find_get_page(mapping, index); spin_unlock(&info->lock); if (page) { page_cache_release(page); @@ -872,20 +870,20 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } #ifdef CONFIG_NUMA -static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new) +static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol) { - struct inode *i = vma->vm_file->f_path.dentry->d_inode; - return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new); + struct inode *inode = vma->vm_file->f_path.dentry->d_inode; + return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol); } static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, unsigned long addr) { - struct inode *i = vma->vm_file->f_path.dentry->d_inode; - unsigned long idx; + struct inode *inode = vma->vm_file->f_path.dentry->d_inode; + pgoff_t index; - idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; - return mpol_shared_policy_lookup(&SHMEM_I(i)->policy, idx); + index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index); } #endif @@ -1016,7 +1014,8 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_ { struct inode *inode = filp->f_path.dentry->d_inode; struct address_space *mapping = inode->i_mapping; - unsigned long index, offset; + pgoff_t index; + unsigned long offset; enum sgp_type sgp = SGP_READ; /* @@ -1032,7 +1031,8 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_ for (;;) { struct page *page = NULL; - unsigned long end_index, nr, ret; + pgoff_t end_index; + unsigned long nr, ret; loff_t i_size = i_size_read(inode); end_index = i_size >> PAGE_CACHE_SHIFT; @@ -1270,8 +1270,9 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_namelen = NAME_MAX; if (sbinfo->max_blocks) { buf->f_blocks = sbinfo->max_blocks; - buf->f_bavail = buf->f_bfree = - sbinfo->max_blocks - percpu_counter_sum(&sbinfo->used_blocks); + buf->f_bavail = + buf->f_bfree = sbinfo->max_blocks - + percpu_counter_sum(&sbinfo->used_blocks); } if (sbinfo->max_inodes) { buf->f_files = sbinfo->max_inodes; @@ -1480,8 +1481,8 @@ static void *shmem_follow_link_inline(struct dentry *dentry, struct nameidata *n static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd) { struct page *page = NULL; - int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL); - nd_set_link(nd, res ? ERR_PTR(res) : kmap(page)); + int error = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL); + nd_set_link(nd, error ? ERR_PTR(error) : kmap(page)); if (page) unlock_page(page); return page; @@ -1592,7 +1593,6 @@ out: return err; } - static const struct xattr_handler *shmem_xattr_handlers[] = { #ifdef CONFIG_TMPFS_POSIX_ACL &generic_acl_access_handler, @@ -2052,14 +2052,14 @@ static struct kmem_cache *shmem_inode_cachep; static struct inode *shmem_alloc_inode(struct super_block *sb) { - struct shmem_inode_info *p; - p = (struct shmem_inode_info *)kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL); - if (!p) + struct shmem_inode_info *info; + info = kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL); + if (!info) return NULL; - return &p->vfs_inode; + return &info->vfs_inode; } -static void shmem_i_callback(struct rcu_head *head) +static void shmem_destroy_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); INIT_LIST_HEAD(&inode->i_dentry); @@ -2072,25 +2072,24 @@ static void shmem_destroy_inode(struct inode *inode) /* only struct inode is valid if it's an inline symlink */ mpol_free_shared_policy(&SHMEM_I(inode)->policy); } - call_rcu(&inode->i_rcu, shmem_i_callback); + call_rcu(&inode->i_rcu, shmem_destroy_callback); } -static void init_once(void *foo) +static void shmem_init_inode(void *foo) { - struct shmem_inode_info *p = (struct shmem_inode_info *) foo; - - inode_init_once(&p->vfs_inode); + struct shmem_inode_info *info = foo; + inode_init_once(&info->vfs_inode); } -static int init_inodecache(void) +static int shmem_init_inodecache(void) { shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", sizeof(struct shmem_inode_info), - 0, SLAB_PANIC, init_once); + 0, SLAB_PANIC, shmem_init_inode); return 0; } -static void destroy_inodecache(void) +static void shmem_destroy_inodecache(void) { kmem_cache_destroy(shmem_inode_cachep); } @@ -2187,21 +2186,20 @@ static const struct vm_operations_struct shmem_vm_ops = { #endif }; - static struct dentry *shmem_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { return mount_nodev(fs_type, flags, data, shmem_fill_super); } -static struct file_system_type tmpfs_fs_type = { +static struct file_system_type shmem_fs_type = { .owner = THIS_MODULE, .name = "tmpfs", .mount = shmem_mount, .kill_sb = kill_litter_super, }; -int __init init_tmpfs(void) +int __init shmem_init(void) { int error; @@ -2209,18 +2207,18 @@ int __init init_tmpfs(void) if (error) goto out4; - error = init_inodecache(); + error = shmem_init_inodecache(); if (error) goto out3; - error = register_filesystem(&tmpfs_fs_type); + error = register_filesystem(&shmem_fs_type); if (error) { printk(KERN_ERR "Could not register tmpfs\n"); goto out2; } - shm_mnt = vfs_kern_mount(&tmpfs_fs_type, MS_NOUSER, - tmpfs_fs_type.name, NULL); + shm_mnt = vfs_kern_mount(&shmem_fs_type, MS_NOUSER, + shmem_fs_type.name, NULL); if (IS_ERR(shm_mnt)) { error = PTR_ERR(shm_mnt); printk(KERN_ERR "Could not kern_mount tmpfs\n"); @@ -2229,9 +2227,9 @@ int __init init_tmpfs(void) return 0; out1: - unregister_filesystem(&tmpfs_fs_type); + unregister_filesystem(&shmem_fs_type); out2: - destroy_inodecache(); + shmem_destroy_inodecache(); out3: bdi_destroy(&shmem_backing_dev_info); out4: @@ -2241,37 +2239,37 @@ out4: #ifdef CONFIG_CGROUP_MEM_RES_CTLR /** - * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file + * mem_cgroup_get_shmem_target - find page or swap assigned to the shmem file * @inode: the inode to be searched - * @pgoff: the offset to be searched + * @index: the page offset to be searched * @pagep: the pointer for the found page to be stored - * @ent: the pointer for the found swap entry to be stored + * @swapp: the pointer for the found swap entry to be stored * * If a page is found, refcount of it is incremented. Callers should handle * these refcount. */ -void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff, - struct page **pagep, swp_entry_t *ent) +void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t index, + struct page **pagep, swp_entry_t *swapp) { - swp_entry_t entry = { .val = 0 }; - struct page *page = NULL; struct shmem_inode_info *info = SHMEM_I(inode); + struct page *page = NULL; + swp_entry_t swap = {0}; - if ((pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode)) + if ((index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) goto out; spin_lock(&info->lock); #ifdef CONFIG_SWAP - entry = shmem_get_swap(info, pgoff); - if (entry.val) - page = find_get_page(&swapper_space, entry.val); + swap = shmem_get_swap(info, index); + if (swap.val) + page = find_get_page(&swapper_space, swap.val); else #endif - page = find_get_page(inode->i_mapping, pgoff); + page = find_get_page(inode->i_mapping, index); spin_unlock(&info->lock); out: *pagep = page; - *ent = entry; + *swapp = swap; } #endif @@ -2288,23 +2286,23 @@ out: #include -static struct file_system_type tmpfs_fs_type = { +static struct file_system_type shmem_fs_type = { .name = "tmpfs", .mount = ramfs_mount, .kill_sb = kill_litter_super, }; -int __init init_tmpfs(void) +int __init shmem_init(void) { - BUG_ON(register_filesystem(&tmpfs_fs_type) != 0); + BUG_ON(register_filesystem(&shmem_fs_type) != 0); - shm_mnt = kern_mount(&tmpfs_fs_type); + shm_mnt = kern_mount(&shmem_fs_type); BUG_ON(IS_ERR(shm_mnt)); return 0; } -int shmem_unuse(swp_entry_t entry, struct page *page) +int shmem_unuse(swp_entry_t swap, struct page *page) { return 0; } @@ -2314,34 +2312,34 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user) return 0; } -void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end) +void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) { - truncate_inode_pages_range(inode->i_mapping, start, end); + truncate_inode_pages_range(inode->i_mapping, lstart, lend); } EXPORT_SYMBOL_GPL(shmem_truncate_range); #ifdef CONFIG_CGROUP_MEM_RES_CTLR /** - * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file + * mem_cgroup_get_shmem_target - find page or swap assigned to the shmem file * @inode: the inode to be searched - * @pgoff: the offset to be searched + * @index: the page offset to be searched * @pagep: the pointer for the found page to be stored - * @ent: the pointer for the found swap entry to be stored + * @swapp: the pointer for the found swap entry to be stored * * If a page is found, refcount of it is incremented. Callers should handle * these refcount. */ -void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff, - struct page **pagep, swp_entry_t *ent) +void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t index, + struct page **pagep, swp_entry_t *swapp) { struct page *page = NULL; - if ((pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode)) + if ((index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) goto out; - page = find_get_page(inode->i_mapping, pgoff); + page = find_get_page(inode->i_mapping, index); out: *pagep = page; - *ent = (swp_entry_t){ .val = 0 }; + *swapp = (swp_entry_t){0}; } #endif -- cgit v1.2.3-70-g09d2 From aa3b189551ad8e5cc1d9c663735c131650238278 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 3 Aug 2011 16:21:24 -0700 Subject: tmpfs: convert mem_cgroup shmem to radix-swap Remove mem_cgroup_shmem_charge_fallback(): it was only required when we had to move swappage to filecache with GFP_NOWAIT. Remove the GFP_NOWAIT special case from mem_cgroup_cache_charge(), by moving its call out from shmem_add_to_page_cache() to two of thats three callers. But leave it doing mem_cgroup_uncharge_cache_page() on error: although asymmetrical, it's easier for all 3 callers to handle. These two changes would also be appropriate if anyone were to start using shmem_read_mapping_page_gfp() with GFP_NOWAIT. Remove mem_cgroup_get_shmem_target(): mc_handle_file_pte() can test radix_tree_exceptional_entry() to get what it needs for itself. Signed-off-by: Hugh Dickins Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 8 ----- include/linux/shmem_fs.h | 2 -- mm/memcontrol.c | 66 +++++------------------------------- mm/shmem.c | 83 ++++++---------------------------------------- 4 files changed, 20 insertions(+), 139 deletions(-) (limited to 'include/linux/shmem_fs.h') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index b9660078691..3b535db00a9 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -86,8 +86,6 @@ extern void mem_cgroup_uncharge_end(void); extern void mem_cgroup_uncharge_page(struct page *page); extern void mem_cgroup_uncharge_cache_page(struct page *page); -extern int mem_cgroup_shmem_charge_fallback(struct page *page, - struct mm_struct *mm, gfp_t gfp_mask); extern void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask); int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem); @@ -225,12 +223,6 @@ static inline void mem_cgroup_uncharge_cache_page(struct page *page) { } -static inline int mem_cgroup_shmem_charge_fallback(struct page *page, - struct mm_struct *mm, gfp_t gfp_mask) -{ - return 0; -} - static inline void mem_cgroup_add_lru_list(struct page *page, int lru) { } diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 3f05795dcf7..0c8e952df59 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -57,8 +57,6 @@ extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp_mask); extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end); extern int shmem_unuse(swp_entry_t entry, struct page *page); -extern void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff, - struct page **pagep, swp_entry_t *ent); static inline struct page *shmem_read_mapping_page( struct address_space *mapping, pgoff_t index) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5f84d2351dd..f4ec4e7ca4c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -35,7 +35,6 @@ #include #include #include -#include #include #include #include @@ -2873,30 +2872,6 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, return 0; if (PageCompound(page)) return 0; - /* - * Corner case handling. This is called from add_to_page_cache() - * in usual. But some FS (shmem) precharges this page before calling it - * and call add_to_page_cache() with GFP_NOWAIT. - * - * For GFP_NOWAIT case, the page may be pre-charged before calling - * add_to_page_cache(). (See shmem.c) check it here and avoid to call - * charge twice. (It works but has to pay a bit larger cost.) - * And when the page is SwapCache, it should take swap information - * into account. This is under lock_page() now. - */ - if (!(gfp_mask & __GFP_WAIT)) { - struct page_cgroup *pc; - - pc = lookup_page_cgroup(page); - if (!pc) - return 0; - lock_page_cgroup(pc); - if (PageCgroupUsed(pc)) { - unlock_page_cgroup(pc); - return 0; - } - unlock_page_cgroup(pc); - } if (unlikely(!mm)) mm = &init_mm; @@ -3486,31 +3461,6 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem, cgroup_release_and_wakeup_rmdir(&mem->css); } -/* - * A call to try to shrink memory usage on charge failure at shmem's swapin. - * Calling hierarchical_reclaim is not enough because we should update - * last_oom_jiffies to prevent pagefault_out_of_memory from invoking global OOM. - * Moreover considering hierarchy, we should reclaim from the mem_over_limit, - * not from the memcg which this page would be charged to. - * try_charge_swapin does all of these works properly. - */ -int mem_cgroup_shmem_charge_fallback(struct page *page, - struct mm_struct *mm, - gfp_t gfp_mask) -{ - struct mem_cgroup *mem; - int ret; - - if (mem_cgroup_disabled()) - return 0; - - ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); - if (!ret) - mem_cgroup_cancel_charge_swapin(mem); /* it does !mem check */ - - return ret; -} - #ifdef CONFIG_DEBUG_VM static struct page_cgroup *lookup_page_cgroup_used(struct page *page) { @@ -5330,15 +5280,17 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, pgoff = pte_to_pgoff(ptent); /* page is moved even if it's not RSS of this task(page-faulted). */ - if (!mapping_cap_swap_backed(mapping)) { /* normal file */ - page = find_get_page(mapping, pgoff); - } else { /* shmem/tmpfs file. we should take account of swap too. */ - swp_entry_t ent; - mem_cgroup_get_shmem_target(inode, pgoff, &page, &ent); + page = find_get_page(mapping, pgoff); + +#ifdef CONFIG_SWAP + /* shmem/tmpfs may report page out on swap: account for that too. */ + if (radix_tree_exceptional_entry(page)) { + swp_entry_t swap = radix_to_swp_entry(page); if (do_swap_account) - entry->val = ent.val; + *entry = swap; + page = find_get_page(&swapper_space, swap.val); } - +#endif return page; } diff --git a/mm/shmem.c b/mm/shmem.c index 92f01d7cc15..13ef2d7e912 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -262,15 +262,11 @@ static int shmem_add_to_page_cache(struct page *page, struct address_space *mapping, pgoff_t index, gfp_t gfp, void *expected) { - int error; + int error = 0; VM_BUG_ON(!PageLocked(page)); VM_BUG_ON(!PageSwapBacked(page)); - error = mem_cgroup_cache_charge(page, current->mm, - gfp & GFP_RECLAIM_MASK); - if (error) - goto out; if (!expected) error = radix_tree_preload(gfp & GFP_RECLAIM_MASK); if (!error) { @@ -300,7 +296,6 @@ static int shmem_add_to_page_cache(struct page *page, } if (error) mem_cgroup_uncharge_cache_page(page); -out: return error; } @@ -660,7 +655,6 @@ int shmem_unuse(swp_entry_t swap, struct page *page) * Charge page using GFP_KERNEL while we can wait, before taking * the shmem_swaplist_mutex which might hold up shmem_writepage(). * Charged back to the user (not to caller) when swap account is used. - * shmem_add_to_page_cache() will be called with GFP_NOWAIT. */ error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL); if (error) @@ -954,8 +948,11 @@ repeat: goto failed; } - error = shmem_add_to_page_cache(page, mapping, index, - gfp, swp_to_radix_entry(swap)); + error = mem_cgroup_cache_charge(page, current->mm, + gfp & GFP_RECLAIM_MASK); + if (!error) + error = shmem_add_to_page_cache(page, mapping, index, + gfp, swp_to_radix_entry(swap)); if (error) goto failed; @@ -990,8 +987,11 @@ repeat: SetPageSwapBacked(page); __set_page_locked(page); - error = shmem_add_to_page_cache(page, mapping, index, - gfp, NULL); + error = mem_cgroup_cache_charge(page, current->mm, + gfp & GFP_RECLAIM_MASK); + if (!error) + error = shmem_add_to_page_cache(page, mapping, index, + gfp, NULL); if (error) goto decused; lru_cache_add_anon(page); @@ -2442,42 +2442,6 @@ out4: return error; } -#ifdef CONFIG_CGROUP_MEM_RES_CTLR -/** - * mem_cgroup_get_shmem_target - find page or swap assigned to the shmem file - * @inode: the inode to be searched - * @index: the page offset to be searched - * @pagep: the pointer for the found page to be stored - * @swapp: the pointer for the found swap entry to be stored - * - * If a page is found, refcount of it is incremented. Callers should handle - * these refcount. - */ -void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t index, - struct page **pagep, swp_entry_t *swapp) -{ - struct shmem_inode_info *info = SHMEM_I(inode); - struct page *page = NULL; - swp_entry_t swap = {0}; - - if ((index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) - goto out; - - spin_lock(&info->lock); -#ifdef CONFIG_SWAP - swap = shmem_get_swap(info, index); - if (swap.val) - page = find_get_page(&swapper_space, swap.val); - else -#endif - page = find_get_page(inode->i_mapping, index); - spin_unlock(&info->lock); -out: - *pagep = page; - *swapp = swap; -} -#endif - #else /* !CONFIG_SHMEM */ /* @@ -2523,31 +2487,6 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) } EXPORT_SYMBOL_GPL(shmem_truncate_range); -#ifdef CONFIG_CGROUP_MEM_RES_CTLR -/** - * mem_cgroup_get_shmem_target - find page or swap assigned to the shmem file - * @inode: the inode to be searched - * @index: the page offset to be searched - * @pagep: the pointer for the found page to be stored - * @swapp: the pointer for the found swap entry to be stored - * - * If a page is found, refcount of it is incremented. Callers should handle - * these refcount. - */ -void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t index, - struct page **pagep, swp_entry_t *swapp) -{ - struct page *page = NULL; - - if ((index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) - goto out; - page = find_get_page(inode->i_mapping, index); -out: - *pagep = page; - *swapp = (swp_entry_t){0}; -} -#endif - #define shmem_vm_ops generic_file_vm_ops #define shmem_file_operations ramfs_file_operations #define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev) -- cgit v1.2.3-70-g09d2 From 69f07ec938712b58755add82dd3d0b35f01317cc Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 3 Aug 2011 16:21:26 -0700 Subject: tmpfs: use kmemdup for short symlinks But we've not yet removed the old swp_entry_t i_direct[16] from shmem_inode_info. That's because it was still being shared with the inline symlink. Remove it now (saving 64 or 128 bytes from shmem inode size), and use kmemdup() for short symlinks, say, those up to 128 bytes. I wonder why mpol_free_shared_policy() is done in shmem_destroy_inode() rather than shmem_evict_inode(), where we usually do such freeing? I guess it doesn't matter, and I'm not into NUMA mpol testing right now. Signed-off-by: Hugh Dickins Acked-by: Rik van Riel Reviewed-by: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/shmem_fs.h | 11 +++-------- mm/shmem.c | 31 ++++++++++++++++++------------- 2 files changed, 21 insertions(+), 21 deletions(-) (limited to 'include/linux/shmem_fs.h') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 0c8e952df59..9291ac3cc62 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -8,20 +8,15 @@ /* inode in-kernel data */ -#define SHMEM_NR_DIRECT 16 - -#define SHMEM_SYMLINK_INLINE_LEN (SHMEM_NR_DIRECT * sizeof(swp_entry_t)) - struct shmem_inode_info { spinlock_t lock; unsigned long flags; unsigned long alloced; /* data pages alloced to file */ - unsigned long swapped; /* subtotal assigned to swap */ - struct shared_policy policy; /* NUMA memory alloc policy */ union { - swp_entry_t i_direct[SHMEM_NR_DIRECT]; /* first blocks */ - char inline_symlink[SHMEM_SYMLINK_INLINE_LEN]; + unsigned long swapped; /* subtotal assigned to swap */ + char *symlink; /* unswappable short symlink */ }; + struct shared_policy policy; /* NUMA memory alloc policy */ struct list_head swaplist; /* chain of maybes on swap */ struct list_head xattr_list; /* list of shmem_xattr */ struct inode vfs_inode; diff --git a/mm/shmem.c b/mm/shmem.c index 0f094a25852..3a5be0feb6a 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -73,6 +73,9 @@ static struct vfsmount *shm_mnt; /* Pretend that each entry is of this size in directory's i_size */ #define BOGO_DIRENT_SIZE 20 +/* Symlink up to this size is kmalloc'ed instead of using a swappable page */ +#define SHORT_SYMLINK_LEN 128 + struct shmem_xattr { struct list_head list; /* anchored by shmem_inode_info->xattr_list */ char *name; /* xattr name */ @@ -585,7 +588,8 @@ static void shmem_evict_inode(struct inode *inode) list_del_init(&info->swaplist); mutex_unlock(&shmem_swaplist_mutex); } - } + } else + kfree(info->symlink); list_for_each_entry_safe(xattr, nxattr, &info->xattr_list, list) { kfree(xattr->name); @@ -1173,7 +1177,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode #ifdef CONFIG_TMPFS static const struct inode_operations shmem_symlink_inode_operations; -static const struct inode_operations shmem_symlink_inline_operations; +static const struct inode_operations shmem_short_symlink_operations; static int shmem_write_begin(struct file *file, struct address_space *mapping, @@ -1638,10 +1642,13 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s info = SHMEM_I(inode); inode->i_size = len-1; - if (len <= SHMEM_SYMLINK_INLINE_LEN) { - /* do it inline */ - memcpy(info->inline_symlink, symname, len); - inode->i_op = &shmem_symlink_inline_operations; + if (len <= SHORT_SYMLINK_LEN) { + info->symlink = kmemdup(symname, len, GFP_KERNEL); + if (!info->symlink) { + iput(inode); + return -ENOMEM; + } + inode->i_op = &shmem_short_symlink_operations; } else { error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL); if (error) { @@ -1664,9 +1671,9 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s return 0; } -static void *shmem_follow_link_inline(struct dentry *dentry, struct nameidata *nd) +static void *shmem_follow_short_symlink(struct dentry *dentry, struct nameidata *nd) { - nd_set_link(nd, SHMEM_I(dentry->d_inode)->inline_symlink); + nd_set_link(nd, SHMEM_I(dentry->d_inode)->symlink); return NULL; } @@ -1914,9 +1921,9 @@ static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size) } #endif /* CONFIG_TMPFS_XATTR */ -static const struct inode_operations shmem_symlink_inline_operations = { +static const struct inode_operations shmem_short_symlink_operations = { .readlink = generic_readlink, - .follow_link = shmem_follow_link_inline, + .follow_link = shmem_follow_short_symlink, #ifdef CONFIG_TMPFS_XATTR .setxattr = shmem_setxattr, .getxattr = shmem_getxattr, @@ -2259,10 +2266,8 @@ static void shmem_destroy_callback(struct rcu_head *head) static void shmem_destroy_inode(struct inode *inode) { - if ((inode->i_mode & S_IFMT) == S_IFREG) { - /* only struct inode is valid if it's an inline symlink */ + if ((inode->i_mode & S_IFMT) == S_IFREG) mpol_free_shared_policy(&SHMEM_I(inode)->policy); - } call_rcu(&inode->i_rcu, shmem_destroy_callback); } -- cgit v1.2.3-70-g09d2