From f758eeabeb96f878c860e8f110f94ec8820822a9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 21 Apr 2011 18:19:44 -0600 Subject: writeback: split inode_wb_list_lock into bdi_writeback.list_lock Split the global inode_wb_list_lock into a per-bdi_writeback list_lock, as it's currently the most contended lock in the system for metadata heavy workloads. It won't help for single-filesystem workloads for which we'll need the I/O-less balance_dirty_pages, but at least we can dedicate a cpu to spinning on each bdi now for larger systems. Based on earlier patches from Nick Piggin and Dave Chinner. It reduces lock contentions to 1/4 in this test case: 10 HDD JBOD, 100 dd on each disk, XFS, 6GB ram lock_stat version 0.3 ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- class name con-bounces contentions waittime-min waittime-max waittime-total acq-bounces acquisitions holdtime-min holdtime-max holdtime-total ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- vanilla 2.6.39-rc3: inode_wb_list_lock: 42590 44433 0.12 147.74 144127.35 252274 886792 0.08 121.34 917211.23 ------------------ inode_wb_list_lock 2 [] bdev_inode_switch_bdi+0x29/0x85 inode_wb_list_lock 34 [] inode_wb_list_del+0x22/0x49 inode_wb_list_lock 12893 [] __mark_inode_dirty+0x170/0x1d0 inode_wb_list_lock 10702 [] writeback_single_inode+0x16d/0x20a ------------------ inode_wb_list_lock 2 [] bdev_inode_switch_bdi+0x29/0x85 inode_wb_list_lock 19 [] inode_wb_list_del+0x22/0x49 inode_wb_list_lock 5550 [] __mark_inode_dirty+0x170/0x1d0 inode_wb_list_lock 8511 [] writeback_sb_inodes+0x10f/0x157 2.6.39-rc3 + patch: &(&wb->list_lock)->rlock: 11383 11657 0.14 151.69 40429.51 90825 527918 0.11 145.90 556843.37 ------------------------ &(&wb->list_lock)->rlock 10 [] inode_wb_list_del+0x5f/0x86 &(&wb->list_lock)->rlock 1493 [] writeback_inodes_wb+0x3d/0x150 &(&wb->list_lock)->rlock 3652 [] writeback_sb_inodes+0x123/0x16f &(&wb->list_lock)->rlock 1412 [] writeback_single_inode+0x17f/0x223 ------------------------ &(&wb->list_lock)->rlock 3 [] bdi_lock_two+0x46/0x4b &(&wb->list_lock)->rlock 6 [] inode_wb_list_del+0x5f/0x86 &(&wb->list_lock)->rlock 2061 [] __mark_inode_dirty+0x173/0x1cf &(&wb->list_lock)->rlock 2629 [] writeback_sb_inodes+0x123/0x16f hughd@google.com: fix recursive lock when bdi_lock_two() is called with new the same as old akpm@linux-foundation.org: cleanup bdev_inode_switch_bdi() comment Signed-off-by: Christoph Hellwig Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Wu Fengguang --- fs/inode.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs/inode.c') diff --git a/fs/inode.c b/fs/inode.c index 0f7e88a7803..4be128cbc75 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -37,7 +37,7 @@ * inode_lru, inode->i_lru * inode_sb_list_lock protects: * sb->s_inodes, inode->i_sb_list - * inode_wb_list_lock protects: + * bdi->wb.list_lock protects: * bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list * inode_hash_lock protects: * inode_hashtable, inode->i_hash @@ -48,7 +48,7 @@ * inode->i_lock * inode_lru_lock * - * inode_wb_list_lock + * bdi->wb.list_lock * inode->i_lock * * inode_hash_lock @@ -68,7 +68,6 @@ static LIST_HEAD(inode_lru); static DEFINE_SPINLOCK(inode_lru_lock); __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock); -__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_wb_list_lock); /* * iprune_sem provides exclusion between the icache shrinking and the -- cgit v1.2.3-70-g09d2 From a209dfc7b0d94bd6fa94553c097836a2e6d0f0ba Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 26 Jul 2011 11:36:34 +0200 Subject: vfs: dont chain pipe/anon/socket on superblock s_inodes list Workloads using pipes and sockets hit inode_sb_list_lock contention. superblock s_inodes list is needed for quota, dirty, pagecache and fsnotify management. pipe/anon/socket fs are clearly not candidates for these. Signed-off-by: Eric Dumazet Reviewed-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/anon_inodes.c | 2 +- fs/inode.c | 39 ++++++++++++++++++++++++++++++--------- fs/pipe.c | 2 +- include/linux/fs.h | 3 ++- net/socket.c | 2 +- 5 files changed, 35 insertions(+), 13 deletions(-) (limited to 'fs/inode.c') diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index 4d433d34736..f11e43ed907 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -187,7 +187,7 @@ EXPORT_SYMBOL_GPL(anon_inode_getfd); */ static struct inode *anon_inode_mkinode(void) { - struct inode *inode = new_inode(anon_inode_mnt->mnt_sb); + struct inode *inode = new_inode_pseudo(anon_inode_mnt->mnt_sb); if (!inode) return ERR_PTR(-ENOMEM); diff --git a/fs/inode.c b/fs/inode.c index 96c77b81167..319b93b5557 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -362,9 +362,11 @@ EXPORT_SYMBOL_GPL(inode_sb_list_add); static inline void inode_sb_list_del(struct inode *inode) { - spin_lock(&inode_sb_list_lock); - list_del_init(&inode->i_sb_list); - spin_unlock(&inode_sb_list_lock); + if (!list_empty(&inode->i_sb_list)) { + spin_lock(&inode_sb_list_lock); + list_del_init(&inode->i_sb_list); + spin_unlock(&inode_sb_list_lock); + } } static unsigned long hash(struct super_block *sb, unsigned long hashval) @@ -796,6 +798,29 @@ unsigned int get_next_ino(void) } EXPORT_SYMBOL(get_next_ino); +/** + * new_inode_pseudo - obtain an inode + * @sb: superblock + * + * Allocates a new inode for given superblock. + * Inode wont be chained in superblock s_inodes list + * This means : + * - fs can't be unmount + * - quotas, fsnotify, writeback can't work + */ +struct inode *new_inode_pseudo(struct super_block *sb) +{ + struct inode *inode = alloc_inode(sb); + + if (inode) { + spin_lock(&inode->i_lock); + inode->i_state = 0; + spin_unlock(&inode->i_lock); + INIT_LIST_HEAD(&inode->i_sb_list); + } + return inode; +} + /** * new_inode - obtain an inode * @sb: superblock @@ -814,13 +839,9 @@ struct inode *new_inode(struct super_block *sb) spin_lock_prefetch(&inode_sb_list_lock); - inode = alloc_inode(sb); - if (inode) { - spin_lock(&inode->i_lock); - inode->i_state = 0; - spin_unlock(&inode->i_lock); + inode = new_inode_pseudo(sb); + if (inode) inode_sb_list_add(inode); - } return inode; } EXPORT_SYMBOL(new_inode); diff --git a/fs/pipe.c b/fs/pipe.c index 1b7f9af67cc..0e0be1dc0f8 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -948,7 +948,7 @@ static const struct dentry_operations pipefs_dentry_operations = { static struct inode * get_pipe_inode(void) { - struct inode *inode = new_inode(pipe_mnt->mnt_sb); + struct inode *inode = new_inode_pseudo(pipe_mnt->mnt_sb); struct pipe_inode_info *pipe; if (!inode) diff --git a/include/linux/fs.h b/include/linux/fs.h index a6658043258..cc363fa7bb8 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2310,7 +2310,8 @@ extern void __iget(struct inode * inode); extern void iget_failed(struct inode *); extern void end_writeback(struct inode *); extern void __destroy_inode(struct inode *); -extern struct inode *new_inode(struct super_block *); +extern struct inode *new_inode_pseudo(struct super_block *sb); +extern struct inode *new_inode(struct super_block *sb); extern void free_inode_nonrcu(struct inode *inode); extern int should_remove_suid(struct dentry *); extern int file_remove_suid(struct file *); diff --git a/net/socket.c b/net/socket.c index 02dc82db3d2..26ed35c7751 100644 --- a/net/socket.c +++ b/net/socket.c @@ -467,7 +467,7 @@ static struct socket *sock_alloc(void) struct inode *inode; struct socket *sock; - inode = new_inode(sock_mnt->mnt_sb); + inode = new_inode_pseudo(sock_mnt->mnt_sb); if (!inode) return NULL; -- cgit v1.2.3-70-g09d2 From b12362bdb61a230a67daa77bcd2a11e59b2802e1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 28 Jul 2011 06:11:47 +0200 Subject: vfs: conditionally call inode_wb_list_del() Some inodes (pipes, sockets, ...) are not in bdi writeback list. evict() can avoid calling inode_wb_list_del() and its expensive spinlock by checking inode i_wb_list being empty or not. At this point, no other cpu/user can concurrently manipulate this inode i_wb_list Signed-off-by: Eric Dumazet Signed-off-by: Al Viro --- fs/inode.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs/inode.c') diff --git a/fs/inode.c b/fs/inode.c index d0c72ff6b30..9dab13ae6ef 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -454,7 +454,9 @@ static void evict(struct inode *inode) BUG_ON(!(inode->i_state & I_FREEING)); BUG_ON(!list_empty(&inode->i_lru)); - inode_wb_list_del(inode); + if (!list_empty(&inode->i_wb_list)) + inode_wb_list_del(inode); + inode_sb_list_del(inode); if (op->evict_inode) { -- cgit v1.2.3-70-g09d2 From f2ee7abf4c40c8e6bffced923a7c01ea2d1f6c97 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 28 Jul 2011 06:41:09 +0200 Subject: vfs: avoid taking inode_hash_lock on pipes and sockets Some inodes (pipes, sockets, ...) are not hashed, no need to take contended inode_hash_lock at dismantle time. nice speedup on SMP machines on socket intensive workloads. Signed-off-by: Eric Dumazet Signed-off-by: Al Viro --- fs/inode.c | 6 +++--- include/linux/fs.h | 9 ++++++++- 2 files changed, 11 insertions(+), 4 deletions(-) (limited to 'fs/inode.c') diff --git a/fs/inode.c b/fs/inode.c index 9dab13ae6ef..e445be2a18f 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -399,12 +399,12 @@ void __insert_inode_hash(struct inode *inode, unsigned long hashval) EXPORT_SYMBOL(__insert_inode_hash); /** - * remove_inode_hash - remove an inode from the hash + * __remove_inode_hash - remove an inode from the hash * @inode: inode to unhash * * Remove an inode from the superblock. */ -void remove_inode_hash(struct inode *inode) +void __remove_inode_hash(struct inode *inode) { spin_lock(&inode_hash_lock); spin_lock(&inode->i_lock); @@ -412,7 +412,7 @@ void remove_inode_hash(struct inode *inode) spin_unlock(&inode->i_lock); spin_unlock(&inode_hash_lock); } -EXPORT_SYMBOL(remove_inode_hash); +EXPORT_SYMBOL(__remove_inode_hash); void end_writeback(struct inode *inode) { diff --git a/include/linux/fs.h b/include/linux/fs.h index f23bcb77260..786b3b1113c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2317,11 +2317,18 @@ extern int should_remove_suid(struct dentry *); extern int file_remove_suid(struct file *); extern void __insert_inode_hash(struct inode *, unsigned long hashval); -extern void remove_inode_hash(struct inode *); static inline void insert_inode_hash(struct inode *inode) { __insert_inode_hash(inode, inode->i_ino); } + +extern void __remove_inode_hash(struct inode *); +static inline void remove_inode_hash(struct inode *inode) +{ + if (!inode_unhashed(inode)) + __remove_inode_hash(inode); +} + extern void inode_sb_list_add(struct inode *inode); #ifdef CONFIG_BLOCK -- cgit v1.2.3-70-g09d2 From c4ae0c65455c1bb30d1b71c6dd9a1a62aadde8ef Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 28 Jul 2011 06:55:13 +0200 Subject: vfs: avoid call to inode_lru_list_del() if possible inode_lru_list_del() is expensive because of per superblock lru locking, while some inodes are not in lru list. Adding a check in iput_final() can speedup pipe/sockets workloads on SMP. Signed-off-by: Eric Dumazet Signed-off-by: Al Viro --- fs/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/inode.c') diff --git a/fs/inode.c b/fs/inode.c index e445be2a18f..5aab80dc008 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1330,7 +1330,8 @@ static void iput_final(struct inode *inode) } inode->i_state |= I_FREEING; - inode_lru_list_del(inode); + if (!list_empty(&inode->i_lru)) + inode_lru_list_del(inode); spin_unlock(&inode->i_lock); evict(inode); -- cgit v1.2.3-70-g09d2 From 3ddcd0569cd68f00f3beae9a7959b72918bb91f4 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 6 Aug 2011 22:45:50 -0700 Subject: vfs: optimize inode cache access patterns The inode structure layout is largely random, and some of the vfs paths really do care. The path lookup in particular is already quite D$ intensive, and profiles show that accessing the 'inode->i_op->xyz' fields is quite costly. We already optimized the dcache to not unnecessarily load the d_op structure for members that are often NULL using the DCACHE_OP_xyz bits in dentry->d_flags, and this does something very similar for the inode ops that are used during pathname lookup. It also re-orders the fields so that the fields accessed by 'stat' are together at the beginning of the inode structure, and roughly in the order accessed. The effect of this seems to be in the 1-2% range for an empty kernel "make -j" run (which is fairly kernel-intensive, mostly in filename lookup), so it's visible. The numbers are fairly noisy, though, and likely depend a lot on exact microarchitecture. So there's more tuning to be done. Signed-off-by: Linus Torvalds --- fs/inode.c | 1 + fs/namei.c | 76 +++++++++++++++++++++++++++++++++++++++++++++++------- fs/stat.c | 4 +-- include/linux/fs.h | 59 ++++++++++++++++++++++++++---------------- 4 files changed, 106 insertions(+), 34 deletions(-) (limited to 'fs/inode.c') diff --git a/fs/inode.c b/fs/inode.c index 5aab80dc008..73920d555c8 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -143,6 +143,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) inode->i_op = &empty_iops; inode->i_fop = &empty_fops; inode->i_nlink = 1; + inode->i_opflags = 0; inode->i_uid = 0; inode->i_gid = 0; atomic_set(&inode->i_writecount, 0); diff --git a/fs/namei.c b/fs/namei.c index 3d607bd80e0..4a98bf154d8 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -308,6 +308,26 @@ int generic_permission(struct inode *inode, int mask) return -EACCES; } +/* + * We _really_ want to just do "generic_permission()" without + * even looking at the inode->i_op values. So we keep a cache + * flag in inode->i_opflags, that says "this has not special + * permission function, use the fast case". + */ +static inline int do_inode_permission(struct inode *inode, int mask) +{ + if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) { + if (likely(inode->i_op->permission)) + return inode->i_op->permission(inode, mask); + + /* This gets set once for the inode lifetime */ + spin_lock(&inode->i_lock); + inode->i_opflags |= IOP_FASTPERM; + spin_unlock(&inode->i_lock); + } + return generic_permission(inode, mask); +} + /** * inode_permission - check for access rights to a given inode * @inode: inode to check permission on @@ -322,7 +342,7 @@ int inode_permission(struct inode *inode, int mask) { int retval; - if (mask & MAY_WRITE) { + if (unlikely(mask & MAY_WRITE)) { umode_t mode = inode->i_mode; /* @@ -339,11 +359,7 @@ int inode_permission(struct inode *inode, int mask) return -EACCES; } - if (inode->i_op->permission) - retval = inode->i_op->permission(inode, mask); - else - retval = generic_permission(inode, mask); - + retval = do_inode_permission(inode, mask); if (retval) return retval; @@ -1245,6 +1261,26 @@ static void terminate_walk(struct nameidata *nd) } } +/* + * Do we need to follow links? We _really_ want to be able + * to do this check without having to look at inode->i_op, + * so we keep a cache of "no, this doesn't need follow_link" + * for the common case. + */ +static inline int do_follow_link(struct inode *inode, int follow) +{ + if (unlikely(!(inode->i_opflags & IOP_NOFOLLOW))) { + if (likely(inode->i_op->follow_link)) + return follow; + + /* This gets set once for the inode lifetime */ + spin_lock(&inode->i_lock); + inode->i_opflags |= IOP_NOFOLLOW; + spin_unlock(&inode->i_lock); + } + return 0; +} + static inline int walk_component(struct nameidata *nd, struct path *path, struct qstr *name, int type, int follow) { @@ -1267,7 +1303,7 @@ static inline int walk_component(struct nameidata *nd, struct path *path, terminate_walk(nd); return -ENOENT; } - if (unlikely(inode->i_op->follow_link) && follow) { + if (do_follow_link(inode, follow)) { if (nd->flags & LOOKUP_RCU) { if (unlikely(unlazy_walk(nd, path->dentry))) { terminate_walk(nd); @@ -1319,6 +1355,26 @@ static inline int nested_symlink(struct path *path, struct nameidata *nd) return res; } +/* + * We really don't want to look at inode->i_op->lookup + * when we don't have to. So we keep a cache bit in + * the inode ->i_opflags field that says "yes, we can + * do lookup on this inode". + */ +static inline int can_lookup(struct inode *inode) +{ + if (likely(inode->i_opflags & IOP_LOOKUP)) + return 1; + if (likely(!inode->i_op->lookup)) + return 0; + + /* We do this once for the lifetime of the inode */ + spin_lock(&inode->i_lock); + inode->i_opflags |= IOP_LOOKUP; + spin_unlock(&inode->i_lock); + return 1; +} + /* * Name resolution. * This is the basic name resolution function, turning a pathname into @@ -1398,10 +1454,10 @@ static int link_path_walk(const char *name, struct nameidata *nd) if (err) return err; } + if (can_lookup(nd->inode)) + continue; err = -ENOTDIR; - if (!nd->inode->i_op->lookup) - break; - continue; + break; /* here ends the main loop */ last_component: diff --git a/fs/stat.c b/fs/stat.c index 961039121cb..ba5316ffac6 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -27,12 +27,12 @@ void generic_fillattr(struct inode *inode, struct kstat *stat) stat->uid = inode->i_uid; stat->gid = inode->i_gid; stat->rdev = inode->i_rdev; + stat->size = i_size_read(inode); stat->atime = inode->i_atime; stat->mtime = inode->i_mtime; stat->ctime = inode->i_ctime; - stat->size = i_size_read(inode); - stat->blocks = inode->i_blocks; stat->blksize = (1 << inode->i_blkbits); + stat->blocks = inode->i_blocks; } EXPORT_SYMBOL(generic_fillattr); diff --git a/include/linux/fs.h b/include/linux/fs.h index 786b3b1113c..178cdb4f1d4 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -738,22 +738,54 @@ static inline int mapping_writably_mapped(struct address_space *mapping) struct posix_acl; #define ACL_NOT_CACHED ((void *)(-1)) +#define IOP_FASTPERM 0x0001 +#define IOP_LOOKUP 0x0002 +#define IOP_NOFOLLOW 0x0004 + +/* + * Keep mostly read-only and often accessed (especially for + * the RCU path lookup and 'stat' data) fields at the beginning + * of the 'struct inode' + */ struct inode { - /* RCU path lookup touches following: */ umode_t i_mode; + unsigned short i_opflags; uid_t i_uid; gid_t i_gid; + unsigned int i_flags; + +#ifdef CONFIG_FS_POSIX_ACL + struct posix_acl *i_acl; + struct posix_acl *i_default_acl; +#endif + const struct inode_operations *i_op; struct super_block *i_sb; + struct address_space *i_mapping; - spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */ - unsigned int i_flags; - unsigned long i_state; #ifdef CONFIG_SECURITY void *i_security; #endif - struct mutex i_mutex; + /* Stat data, not accessed from path walking */ + unsigned long i_ino; + unsigned int i_nlink; + dev_t i_rdev; + loff_t i_size; + struct timespec i_atime; + struct timespec i_mtime; + struct timespec i_ctime; + unsigned int i_blkbits; + blkcnt_t i_blocks; + +#ifdef __NEED_I_SIZE_ORDERED + seqcount_t i_size_seqcount; +#endif + + /* Misc */ + unsigned long i_state; + spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */ + struct mutex i_mutex; unsigned long dirtied_when; /* jiffies of first dirtying */ @@ -765,25 +797,12 @@ struct inode { struct list_head i_dentry; struct rcu_head i_rcu; }; - unsigned long i_ino; atomic_t i_count; - unsigned int i_nlink; - dev_t i_rdev; - unsigned int i_blkbits; u64 i_version; - loff_t i_size; -#ifdef __NEED_I_SIZE_ORDERED - seqcount_t i_size_seqcount; -#endif - struct timespec i_atime; - struct timespec i_mtime; - struct timespec i_ctime; - blkcnt_t i_blocks; unsigned short i_bytes; atomic_t i_dio_count; const struct file_operations *i_fop; /* former ->i_op->default_file_ops */ struct file_lock *i_flock; - struct address_space *i_mapping; struct address_space i_data; #ifdef CONFIG_QUOTA struct dquot *i_dquot[MAXQUOTAS]; @@ -806,10 +825,6 @@ struct inode { atomic_t i_readcount; /* struct files open RO */ #endif atomic_t i_writecount; -#ifdef CONFIG_FS_POSIX_ACL - struct posix_acl *i_acl; - struct posix_acl *i_default_acl; -#endif void *i_private; /* fs or device private pointer */ }; -- cgit v1.2.3-70-g09d2 From e096d0c7e2e4e5893792db865dd065ac73cf1f00 Mon Sep 17 00:00:00 2001 From: Josh Boyer Date: Thu, 25 Aug 2011 07:48:12 -0400 Subject: lockdep: Add helper function for dir vs file i_mutex annotation Purely in-memory filesystems do not use the inode hash as the dcache tells us if an entry already exists. As a result, they do not call unlock_new_inode, and thus directory inodes do not get put into a different lockdep class for i_sem. We need the different lockdep classes, because the locking order for i_mutex is different for directory inodes and regular inodes. Directory inodes can do "readdir()", which takes i_mutex *before* possibly taking mm->mmap_sem (due to a page fault while copying the directory entry to user space). In contrast, regular inodes can be mmap'ed, which takes mm->mmap_sem before accessing i_mutex. The two cases can never happen for the same inode, so no real deadlock can occur, but without the different lockdep classes, lockdep cannot understand that. As a result, if CONFIG_DEBUG_LOCK_ALLOC is set, this can lead to false positives from lockdep like below: find/645 is trying to acquire lock: (&mm->mmap_sem){++++++}, at: [] might_fault+0x5c/0xac but task is already holding lock: (&sb->s_type->i_mutex_key#15){+.+.+.}, at: [] vfs_readdir+0x5b/0xb4 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (&sb->s_type->i_mutex_key#15){+.+.+.}: [] lock_acquire+0xbf/0x103 [] __mutex_lock_common+0x4c/0x361 [] mutex_lock_nested+0x40/0x45 [] hugetlbfs_file_mmap+0x82/0x110 [] mmap_region+0x258/0x432 [] do_mmap_pgoff+0x2ac/0x306 [] sys_mmap_pgoff+0x118/0x16a [] sys_mmap+0x22/0x24 [] system_call_fastpath+0x16/0x1b -> #0 (&mm->mmap_sem){++++++}: [] __lock_acquire+0xa1a/0xcf7 [] lock_acquire+0xbf/0x103 [] might_fault+0x89/0xac [] filldir+0x6f/0xc7 [] dcache_readdir+0x67/0x205 [] vfs_readdir+0x7b/0xb4 [] sys_getdents+0x7e/0xd1 [] system_call_fastpath+0x16/0x1b This patch moves the directory vs file lockdep annotation into a helper function that can be called by in-memory filesystems and has hugetlbfs call it. Signed-off-by: Josh Boyer Acked-by: Peter Zijlstra Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 1 + fs/inode.c | 24 +++++++++++++++--------- include/linux/fs.h | 5 +++++ 3 files changed, 21 insertions(+), 9 deletions(-) (limited to 'fs/inode.c') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 87b6e0421c1..ec889538e5a 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -491,6 +491,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid, inode->i_op = &page_symlink_inode_operations; break; } + lockdep_annotate_inode_mutex_key(inode); } return inode; } diff --git a/fs/inode.c b/fs/inode.c index 73920d555c8..ec7924696a1 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -848,16 +848,9 @@ struct inode *new_inode(struct super_block *sb) } EXPORT_SYMBOL(new_inode); -/** - * unlock_new_inode - clear the I_NEW state and wake up any waiters - * @inode: new inode to unlock - * - * Called when the inode is fully initialised to clear the new state of the - * inode and wake up anyone waiting for the inode to finish initialisation. - */ -void unlock_new_inode(struct inode *inode) -{ #ifdef CONFIG_DEBUG_LOCK_ALLOC +void lockdep_annotate_inode_mutex_key(struct inode *inode) +{ if (S_ISDIR(inode->i_mode)) { struct file_system_type *type = inode->i_sb->s_type; @@ -873,7 +866,20 @@ void unlock_new_inode(struct inode *inode) &type->i_mutex_dir_key); } } +} +EXPORT_SYMBOL(lockdep_annotate_inode_mutex_key); #endif + +/** + * unlock_new_inode - clear the I_NEW state and wake up any waiters + * @inode: new inode to unlock + * + * Called when the inode is fully initialised to clear the new state of the + * inode and wake up anyone waiting for the inode to finish initialisation. + */ +void unlock_new_inode(struct inode *inode) +{ + lockdep_annotate_inode_mutex_key(inode); spin_lock(&inode->i_lock); WARN_ON(!(inode->i_state & I_NEW)); inode->i_state &= ~I_NEW; diff --git a/include/linux/fs.h b/include/linux/fs.h index 178cdb4f1d4..c2bd68f2277 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2318,6 +2318,11 @@ extern struct inode * iget5_locked(struct super_block *, unsigned long, int (*te extern struct inode * iget_locked(struct super_block *, unsigned long); extern int insert_inode_locked4(struct inode *, unsigned long, int (*test)(struct inode *, void *), void *); extern int insert_inode_locked(struct inode *); +#ifdef CONFIG_DEBUG_LOCK_ALLOC +extern void lockdep_annotate_inode_mutex_key(struct inode *inode); +#else +static inline void lockdep_annotate_inode_mutex_key(struct inode *inode) { }; +#endif extern void unlock_new_inode(struct inode *); extern unsigned int get_next_ino(void); -- cgit v1.2.3-70-g09d2