From 65bc98b0059360e458aebd208587be44641227c1 Mon Sep 17 00:00:00 2001 From: Steve French Date: Fri, 10 Jul 2009 15:27:25 +0000 Subject: [CIFS] Distinguish posix opens and mkdirs from legacy mkdirs in stats Acked-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifs_debug.c | 8 +++++++- fs/cifs/cifsglob.h | 2 ++ fs/cifs/cifssmb.c | 5 ++++- 3 files changed, 13 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 7f19fefd3d4..42cec2a7c0c 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -261,6 +261,8 @@ static ssize_t cifs_stats_proc_write(struct file *file, atomic_set(&tcon->num_reads, 0); atomic_set(&tcon->num_oplock_brks, 0); atomic_set(&tcon->num_opens, 0); + atomic_set(&tcon->num_posixopens, 0); + atomic_set(&tcon->num_posixmkdirs, 0); atomic_set(&tcon->num_closes, 0); atomic_set(&tcon->num_deletes, 0); atomic_set(&tcon->num_mkdirs, 0); @@ -347,11 +349,15 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v) atomic_read(&tcon->num_locks), atomic_read(&tcon->num_hardlinks), atomic_read(&tcon->num_symlinks)); - seq_printf(m, "\nOpens: %d Closes: %d" + seq_printf(m, "\nOpens: %d Closes: %d " "Deletes: %d", atomic_read(&tcon->num_opens), atomic_read(&tcon->num_closes), atomic_read(&tcon->num_deletes)); + seq_printf(m, "\nPosix Opens: %d " + "Posix Mkdirs: %d", + atomic_read(&tcon->num_posixopens), + atomic_read(&tcon->num_posixmkdirs)); seq_printf(m, "\nMkdirs: %d Rmdirs: %d", atomic_read(&tcon->num_mkdirs), atomic_read(&tcon->num_rmdirs)); diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 63f6cdfa563..6084d6379c0 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -260,6 +260,8 @@ struct cifsTconInfo { atomic_t num_closes; atomic_t num_deletes; atomic_t num_mkdirs; + atomic_t num_posixopens; + atomic_t num_posixmkdirs; atomic_t num_rmdirs; atomic_t num_renames; atomic_t num_t2renames; diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 922f5fe2084..1866bc2927d 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -1113,7 +1113,10 @@ PsxCreat: psx_create_err: cifs_buf_release(pSMB); - cifs_stats_inc(&tcon->num_mkdirs); + if (posix_flags & SMB_O_DIRECTORY) + cifs_stats_inc(&tcon->num_posixmkdirs); + else + cifs_stats_inc(&tcon->num_posixopens); if (rc == -EAGAIN) goto PsxCreat; -- cgit v1.2.3-70-g09d2 From 4fed598a49c014cbc563179b25f2a4b8565e2a50 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Sun, 12 Jul 2009 11:13:55 +0900 Subject: fs/Kconfig: move nilfs2 out fs/Kconfig file was split into individual fs/*/Kconfig files before nilfs was merged. I've found the current config entry of nilfs is tainting the work. Sorry, I didn't notice. This fixes the violation. Signed-off-by: Ryusuke Konishi Cc: Alexey Dobriyan --- fs/Kconfig | 27 +-------------------------- fs/nilfs2/Kconfig | 25 +++++++++++++++++++++++++ 2 files changed, 26 insertions(+), 26 deletions(-) create mode 100644 fs/nilfs2/Kconfig (limited to 'fs') diff --git a/fs/Kconfig b/fs/Kconfig index a97263be6a9..0e7da7bb5d9 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -186,32 +186,7 @@ source "fs/romfs/Kconfig" source "fs/sysv/Kconfig" source "fs/ufs/Kconfig" source "fs/exofs/Kconfig" - -config NILFS2_FS - tristate "NILFS2 file system support (EXPERIMENTAL)" - depends on BLOCK && EXPERIMENTAL - select CRC32 - help - NILFS2 is a log-structured file system (LFS) supporting continuous - snapshotting. In addition to versioning capability of the entire - file system, users can even restore files mistakenly overwritten or - destroyed just a few seconds ago. Since this file system can keep - consistency like conventional LFS, it achieves quick recovery after - system crashes. - - NILFS2 creates a number of checkpoints every few seconds or per - synchronous write basis (unless there is no change). Users can - select significant versions among continuously created checkpoints, - and can change them into snapshots which will be preserved for long - periods until they are changed back to checkpoints. Each - snapshot is mountable as a read-only file system concurrently with - its writable mount, and this feature is convenient for online backup. - - Some features including atime, extended attributes, and POSIX ACLs, - are not supported yet. - - To compile this file system support as a module, choose M here: the - module will be called nilfs2. If unsure, say N. +source "fs/nilfs2/Kconfig" endif # MISC_FILESYSTEMS diff --git a/fs/nilfs2/Kconfig b/fs/nilfs2/Kconfig new file mode 100644 index 00000000000..72da095d400 --- /dev/null +++ b/fs/nilfs2/Kconfig @@ -0,0 +1,25 @@ +config NILFS2_FS + tristate "NILFS2 file system support (EXPERIMENTAL)" + depends on BLOCK && EXPERIMENTAL + select CRC32 + help + NILFS2 is a log-structured file system (LFS) supporting continuous + snapshotting. In addition to versioning capability of the entire + file system, users can even restore files mistakenly overwritten or + destroyed just a few seconds ago. Since this file system can keep + consistency like conventional LFS, it achieves quick recovery after + system crashes. + + NILFS2 creates a number of checkpoints every few seconds or per + synchronous write basis (unless there is no change). Users can + select significant versions among continuously created checkpoints, + and can change them into snapshots which will be preserved for long + periods until they are changed back to checkpoints. Each + snapshot is mountable as a read-only file system concurrently with + its writable mount, and this feature is convenient for online backup. + + Some features including atime, extended attributes, and POSIX ACLs, + are not supported yet. + + To compile this file system support as a module, choose M here: the + module will be called nilfs2. If unsure, say N. -- cgit v1.2.3-70-g09d2 From 9c9ad6162e2aa1e528ed687ccab87fe681ebbef1 Mon Sep 17 00:00:00 2001 From: Abhishek Kulkarni Date: Tue, 14 Jul 2009 13:26:52 -0500 Subject: 9p: Fix incorrect parameters to v9fs_file_readn. Fix v9fs_vfs_readpage. The offset and size parameters to v9fs_file_readn were interchanged and hence passed incorrectly. Signed-off-by: Abhishek Kulkarni Signed-off-by: Eric Van Hensbergen --- fs/9p/vfs_addr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index 6fcb1e7095c..92828281a30 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -57,7 +57,7 @@ static int v9fs_vfs_readpage(struct file *filp, struct page *page) buffer = kmap(page); offset = page_offset(page); - retval = v9fs_file_readn(filp, buffer, NULL, offset, PAGE_CACHE_SIZE); + retval = v9fs_file_readn(filp, buffer, NULL, PAGE_CACHE_SIZE, offset); if (retval < 0) goto done; -- cgit v1.2.3-70-g09d2 From 7447a668a3860b66b3c9db86fdea91e355ba59ac Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 15 Jul 2009 20:36:08 +0200 Subject: jbd: Fail to load a journal if it is too short Due to on disk corruption, it can happen that journal is too short. Fail to load it in such case so that we don't oops somewhere later. Reported-by: Nageswara R Sastry Signed-off-by: Jan Kara --- fs/jbd/journal.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index 737f7246a4b..94a64a199a6 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -848,6 +848,12 @@ static int journal_reset(journal_t *journal) first = be32_to_cpu(sb->s_first); last = be32_to_cpu(sb->s_maxlen); + if (first + JFS_MIN_JOURNAL_BLOCKS > last + 1) { + printk(KERN_ERR "JBD: Journal too short (blocks %lu-%lu).\n", + first, last); + journal_fail_superblock(journal); + return -EINVAL; + } journal->j_first = first; journal->j_last = last; -- cgit v1.2.3-70-g09d2 From 9eaaa2d5759837402ec5eee13b2a97921808c3eb Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 13 Jul 2009 20:26:52 +0200 Subject: ext3: Fix truncation of symlinks after failed write Contents of long symlinks is written via standard write methods. So when the write fails, we add inode to orphan list. But symlinks don't have .truncate method defined so nobody properly removes them from the orphan list (both on disk and in memory). Fix this by calling ext3_truncate() directly instead of calling vmtruncate() (which is saner anyway since we don't need anything vmtruncate() does except from calling .truncate in these paths). We also add inode to orphan list only if ext3_can_truncate() is true (currently, it can be false for symlinks when there are no blocks allocated) - otherwise orphan list processing will complain and ext3_truncate() will not remove inode from on-disk orphan list. Signed-off-by: Jan Kara --- fs/ext3/inode.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 5f51fed5c75..4d7da6f6184 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -1193,15 +1193,16 @@ write_begin_failed: * i_size_read because we hold i_mutex. * * Add inode to orphan list in case we crash before truncate - * finishes. + * finishes. Do this only if ext3_can_truncate() agrees so + * that orphan processing code is happy. */ - if (pos + len > inode->i_size) + if (pos + len > inode->i_size && ext3_can_truncate(inode)) ext3_orphan_add(handle, inode); ext3_journal_stop(handle); unlock_page(page); page_cache_release(page); if (pos + len > inode->i_size) - vmtruncate(inode, inode->i_size); + ext3_truncate(inode); } if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) goto retry; @@ -1287,7 +1288,7 @@ static int ext3_ordered_write_end(struct file *file, * There may be allocated blocks outside of i_size because * we failed to copy some data. Prepare for truncate. */ - if (pos + len > inode->i_size) + if (pos + len > inode->i_size && ext3_can_truncate(inode)) ext3_orphan_add(handle, inode); ret2 = ext3_journal_stop(handle); if (!ret) @@ -1296,7 +1297,7 @@ static int ext3_ordered_write_end(struct file *file, page_cache_release(page); if (pos + len > inode->i_size) - vmtruncate(inode, inode->i_size); + ext3_truncate(inode); return ret ? ret : copied; } @@ -1315,14 +1316,14 @@ static int ext3_writeback_write_end(struct file *file, * There may be allocated blocks outside of i_size because * we failed to copy some data. Prepare for truncate. */ - if (pos + len > inode->i_size) + if (pos + len > inode->i_size && ext3_can_truncate(inode)) ext3_orphan_add(handle, inode); ret = ext3_journal_stop(handle); unlock_page(page); page_cache_release(page); if (pos + len > inode->i_size) - vmtruncate(inode, inode->i_size); + ext3_truncate(inode); return ret ? ret : copied; } @@ -1358,7 +1359,7 @@ static int ext3_journalled_write_end(struct file *file, * There may be allocated blocks outside of i_size because * we failed to copy some data. Prepare for truncate. */ - if (pos + len > inode->i_size) + if (pos + len > inode->i_size && ext3_can_truncate(inode)) ext3_orphan_add(handle, inode); EXT3_I(inode)->i_state |= EXT3_STATE_JDATA; if (inode->i_size > EXT3_I(inode)->i_disksize) { @@ -1375,7 +1376,7 @@ static int ext3_journalled_write_end(struct file *file, page_cache_release(page); if (pos + len > inode->i_size) - vmtruncate(inode, inode->i_size); + ext3_truncate(inode); return ret ? ret : copied; } -- cgit v1.2.3-70-g09d2 From 1e9fd53b783ea646de3ee09a4574afeb6778d504 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 24 Jun 2009 17:31:40 +0200 Subject: jbd: Fix a race between checkpointing code and journal_get_write_access() The following race can happen: CPU1 CPU2 checkpointing code checks the buffer, adds it to an array for writeback do_get_write_access() ... lock_buffer() unlock_buffer() flush_batch() submits the buffer for IO __jbd_journal_file_buffer() So a buffer under writeout is returned from do_get_write_access(). Since the filesystem code relies on the fact that journaled buffers cannot be written out, it does not take the buffer lock and so it can modify buffer while it is under writeout. That can lead to a filesystem corruption if we crash at the right moment. The similar problem can happen with the journal_get_create_access() path. We fix the problem by clearing the buffer dirty bit under buffer_lock even if the buffer is on BJ_None list. Actually, we clear the dirty bit regardless the list the buffer is in and warn about the fact if the buffer is already journalled. Thanks for spotting the problem goes to dingdinghua . Reported-by: dingdinghua Signed-off-by: Jan Kara --- fs/jbd/transaction.c | 68 +++++++++++++++++++++++++++------------------------- 1 file changed, 35 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index 73242ba7c7b..c03ac11f74b 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -489,34 +489,15 @@ void journal_unlock_updates (journal_t *journal) wake_up(&journal->j_wait_transaction_locked); } -/* - * Report any unexpected dirty buffers which turn up. Normally those - * indicate an error, but they can occur if the user is running (say) - * tune2fs to modify the live filesystem, so we need the option of - * continuing as gracefully as possible. # - * - * The caller should already hold the journal lock and - * j_list_lock spinlock: most callers will need those anyway - * in order to probe the buffer's journaling state safely. - */ -static void jbd_unexpected_dirty_buffer(struct journal_head *jh) +static void warn_dirty_buffer(struct buffer_head *bh) { - int jlist; - - /* If this buffer is one which might reasonably be dirty - * --- ie. data, or not part of this journal --- then - * we're OK to leave it alone, but otherwise we need to - * move the dirty bit to the journal's own internal - * JBDDirty bit. */ - jlist = jh->b_jlist; + char b[BDEVNAME_SIZE]; - if (jlist == BJ_Metadata || jlist == BJ_Reserved || - jlist == BJ_Shadow || jlist == BJ_Forget) { - struct buffer_head *bh = jh2bh(jh); - - if (test_clear_buffer_dirty(bh)) - set_buffer_jbddirty(bh); - } + printk(KERN_WARNING + "JBD: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). " + "There's a risk of filesystem corruption in case of system " + "crash.\n", + bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr); } /* @@ -583,14 +564,16 @@ repeat: if (jh->b_next_transaction) J_ASSERT_JH(jh, jh->b_next_transaction == transaction); + warn_dirty_buffer(bh); } /* * In any case we need to clean the dirty flag and we must * do it under the buffer lock to be sure we don't race * with running write-out. */ - JBUFFER_TRACE(jh, "Unexpected dirty buffer"); - jbd_unexpected_dirty_buffer(jh); + JBUFFER_TRACE(jh, "Journalling dirty buffer"); + clear_buffer_dirty(bh); + set_buffer_jbddirty(bh); } unlock_buffer(bh); @@ -826,6 +809,15 @@ int journal_get_create_access(handle_t *handle, struct buffer_head *bh) J_ASSERT_JH(jh, buffer_locked(jh2bh(jh))); if (jh->b_transaction == NULL) { + /* + * Previous journal_forget() could have left the buffer + * with jbddirty bit set because it was being committed. When + * the commit finished, we've filed the buffer for + * checkpointing and marked it dirty. Now we are reallocating + * the buffer so the transaction freeing it must have + * committed and so it's safe to clear the dirty bit. + */ + clear_buffer_dirty(jh2bh(jh)); jh->b_transaction = transaction; /* first access by this transaction */ @@ -1782,8 +1774,13 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction) if (jh->b_cp_transaction) { JBUFFER_TRACE(jh, "on running+cp transaction"); + /* + * We don't want to write the buffer anymore, clear the + * bit so that we don't confuse checks in + * __journal_file_buffer + */ + clear_buffer_dirty(bh); __journal_file_buffer(jh, transaction, BJ_Forget); - clear_buffer_jbddirty(bh); may_free = 0; } else { JBUFFER_TRACE(jh, "on running transaction"); @@ -2041,12 +2038,17 @@ void __journal_file_buffer(struct journal_head *jh, if (jh->b_transaction && jh->b_jlist == jlist) return; - /* The following list of buffer states needs to be consistent - * with __jbd_unexpected_dirty_buffer()'s handling of dirty - * state. */ - if (jlist == BJ_Metadata || jlist == BJ_Reserved || jlist == BJ_Shadow || jlist == BJ_Forget) { + /* + * For metadata buffers, we track dirty bit in buffer_jbddirty + * instead of buffer_dirty. We should not see a dirty bit set + * here because we clear it in do_get_write_access but e.g. + * tune2fs can modify the sb and set the dirty bit at any time + * so we try to gracefully handle that. + */ + if (buffer_dirty(bh)) + warn_dirty_buffer(bh); if (test_clear_buffer_dirty(bh) || test_clear_buffer_jbddirty(bh)) was_dirty = 1; -- cgit v1.2.3-70-g09d2 From 43237b5490e8f2f4679decd660064ff35ce490cc Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 20 May 2009 18:41:58 +0200 Subject: ext3: Get rid of extenddisksize parameter of ext3_get_blocks_handle() Get rid of extenddisksize parameter of ext3_get_blocks_handle(). This seems to be a relict from some old days and setting disksize in this function does not make much sence. Currently it was set only by ext3_getblk(). Since the parameter has some effect only if create == 1, it is easy to check that the three callers which end up calling ext3_getblk() with create == 1 (ext3_append, ext3_quota_write, ext3_mkdir) do the right thing and set disksize themselves. Signed-off-by: Jan Kara --- fs/ext3/dir.c | 3 +-- fs/ext3/inode.c | 13 +++---------- include/linux/ext3_fs.h | 2 +- 3 files changed, 5 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c index 3d724a95882..373fa90c796 100644 --- a/fs/ext3/dir.c +++ b/fs/ext3/dir.c @@ -130,8 +130,7 @@ static int ext3_readdir(struct file * filp, struct buffer_head *bh = NULL; map_bh.b_state = 0; - err = ext3_get_blocks_handle(NULL, inode, blk, 1, - &map_bh, 0, 0); + err = ext3_get_blocks_handle(NULL, inode, blk, 1, &map_bh, 0); if (err > 0) { pgoff_t index = map_bh.b_blocknr >> (PAGE_CACHE_SHIFT - inode->i_blkbits); diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 4d7da6f6184..b49908a167a 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -788,7 +788,7 @@ err_out: int ext3_get_blocks_handle(handle_t *handle, struct inode *inode, sector_t iblock, unsigned long maxblocks, struct buffer_head *bh_result, - int create, int extend_disksize) + int create) { int err = -EIO; int offsets[4]; @@ -911,13 +911,6 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode, if (!err) err = ext3_splice_branch(handle, inode, iblock, partial, indirect_blks, count); - /* - * i_disksize growing is protected by truncate_mutex. Don't forget to - * protect it if you're about to implement concurrent - * ext3_get_block() -bzzz - */ - if (!err && extend_disksize && inode->i_size > ei->i_disksize) - ei->i_disksize = inode->i_size; mutex_unlock(&ei->truncate_mutex); if (err) goto cleanup; @@ -972,7 +965,7 @@ static int ext3_get_block(struct inode *inode, sector_t iblock, } ret = ext3_get_blocks_handle(handle, inode, iblock, - max_blocks, bh_result, create, 0); + max_blocks, bh_result, create); if (ret > 0) { bh_result->b_size = (ret << inode->i_blkbits); ret = 0; @@ -1005,7 +998,7 @@ struct buffer_head *ext3_getblk(handle_t *handle, struct inode *inode, dummy.b_blocknr = -1000; buffer_trace_init(&dummy.b_history); err = ext3_get_blocks_handle(handle, inode, block, 1, - &dummy, create, 1); + &dummy, create); /* * ext3_get_blocks_handle() returns number of blocks * mapped. 0 in case of a HOLE. diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h index 634a5e5aba3..7499b366779 100644 --- a/include/linux/ext3_fs.h +++ b/include/linux/ext3_fs.h @@ -874,7 +874,7 @@ struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *); struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *); int ext3_get_blocks_handle(handle_t *handle, struct inode *inode, sector_t iblock, unsigned long maxblocks, struct buffer_head *bh_result, - int create, int extend_disksize); + int create); extern struct inode *ext3_iget(struct super_block *, unsigned long); extern int ext3_write_inode (struct inode *, int); -- cgit v1.2.3-70-g09d2 From 90a98b2f3f3647fb17667768a348b2b219f2a9f7 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 20 Jul 2009 13:40:52 -0400 Subject: cifs: free nativeFileSystem field before allocating a new one ...otherwise, we'll leak this memory if we have to reconnect (e.g. after network failure). Signed-off-by: Jeff Layton CC: Stable Signed-off-by: Steve French --- fs/cifs/connect.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index e16d7592116..9bb5c875073 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -2726,6 +2726,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses, strncpy(tcon->treeName, tree, MAX_TREE_SIZE); /* mostly informational -- no need to fail on error here */ + kfree(tcon->nativeFileSystem); tcon->nativeFileSystem = cifs_strndup_from_ucs(bcc_ptr, bytes_left, is_unicode, nls_codepage); -- cgit v1.2.3-70-g09d2 From f1015c447781729060c415f5133164c638561f25 Mon Sep 17 00:00:00 2001 From: dingdinghua Date: Wed, 15 Jul 2009 21:42:05 +0200 Subject: jbd: fix race between write_metadata_buffer and get_write_access The function journal_write_metadata_buffer() calls jbd_unlock_bh_state(bh_in) too early; this could potentially allow another thread to call get_write_access on the buffer head, modify the data, and dirty it, and allowing the wrong data to be written into the journal. Fortunately, if we lose this race, the only time this will actually cause filesystem corruption is if there is a system crash or other unclean shutdown of the system before the next commit can take place. Signed-off-by: dingdinghua Acked-by: "Theodore Ts'o" Signed-off-by: Jan Kara --- fs/jbd/journal.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index 94a64a199a6..f96f85092d1 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -287,6 +287,7 @@ int journal_write_metadata_buffer(transaction_t *transaction, struct page *new_page; unsigned int new_offset; struct buffer_head *bh_in = jh2bh(jh_in); + journal_t *journal = transaction->t_journal; /* * The buffer really shouldn't be locked: only the current committing @@ -300,6 +301,11 @@ int journal_write_metadata_buffer(transaction_t *transaction, J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in)); new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL); + /* keep subsequent assertions sane */ + new_bh->b_state = 0; + init_buffer(new_bh, NULL, NULL); + atomic_set(&new_bh->b_count, 1); + new_jh = journal_add_journal_head(new_bh); /* This sleeps */ /* * If a new transaction has already done a buffer copy-out, then @@ -361,14 +367,6 @@ repeat: kunmap_atomic(mapped_data, KM_USER0); } - /* keep subsequent assertions sane */ - new_bh->b_state = 0; - init_buffer(new_bh, NULL, NULL); - atomic_set(&new_bh->b_count, 1); - jbd_unlock_bh_state(bh_in); - - new_jh = journal_add_journal_head(new_bh); /* This sleeps */ - set_bh_page(new_bh, new_page, new_offset); new_jh->b_transaction = NULL; new_bh->b_size = jh2bh(jh_in)->b_size; @@ -385,7 +383,11 @@ repeat: * copying is moved to the transaction's shadow queue. */ JBUFFER_TRACE(jh_in, "file as BJ_Shadow"); - journal_file_buffer(jh_in, transaction, BJ_Shadow); + spin_lock(&journal->j_list_lock); + __journal_file_buffer(jh_in, transaction, BJ_Shadow); + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh_in); + JBUFFER_TRACE(new_jh, "file as BJ_IO"); journal_file_buffer(new_jh, transaction, BJ_IO); -- cgit v1.2.3-70-g09d2 From 5549f7cdf84c02939fd368d0842aa2f472bb6e98 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 7 Jul 2009 10:28:23 -0400 Subject: inotify: drop user watch count when a watch is removed The inotify rewrite forgot to drop the inotify watch use cound when a watch was removed. This means that a single inotify fd can only ever register a maximum of /proc/sys/fs/max_user_watches even if some of those had been freed. Signed-off-by: Eric Paris --- fs/notify/inotify/inotify_user.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index ff27a296584..1a870f9157b 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -404,6 +404,8 @@ skip_send_ignore: /* removed from idr, drop that reference */ fsnotify_put_mark(entry); + + atomic_dec(&group->inotify_data.user->inotify_watches); } /* ding dong the mark is dead */ -- cgit v1.2.3-70-g09d2 From 75fe2b26394c59c8e16bd7b76f4be5d048103ad1 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 7 Jul 2009 10:28:23 -0400 Subject: inotify: do not leak inode marks in inotify_add_watch inotify_add_watch had a couple of problems. The biggest being that if inotify_add_watch was called on the same inode twice (to update or change the event mask) a refence was taken on the original inode mark by fsnotify_find_mark_entry but was not being dropped at the end of the inotify_add_watch call. Thus if inotify_rm_watch was called although the mark was removed from the inode, the refcnt wouldn't hit zero and we would leak memory. Reported-by: Catalin Marinas Signed-off-by: Eric Paris --- fs/notify/inotify/inotify_user.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 1a870f9157b..aff4214f16c 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -463,9 +463,6 @@ retry: goto out_err; spin_lock(&group->inotify_data.idr_lock); - /* if entry is added to the idr we keep the reference obtained - * through fsnotify_mark_add. remember to drop this reference - * when entry is removed from idr */ ret = idr_get_new_above(&group->inotify_data.idr, entry, ++group->inotify_data.last_wd, &ientry->wd); @@ -476,8 +473,13 @@ retry: goto out_err; } atomic_inc(&group->inotify_data.user->inotify_watches); + + /* we put the mark on the idr, take a reference */ + fsnotify_get_mark(entry); } + ret = ientry->wd; + spin_lock(&entry->lock); old_mask = entry->mask; @@ -508,7 +510,11 @@ retry: fsnotify_recalc_group_mask(group); } - return ientry->wd; + /* this either matches fsnotify_find_mark_entry, or init_mark_entry + * depending on which path we took... */ + fsnotify_put_mark(entry); + + return ret; out_err: /* see this isn't supposed to happen, just kill the watch */ -- cgit v1.2.3-70-g09d2 From 7e790dd5fc937bc8d2400c30a05e32a9e9eef276 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 7 Jul 2009 10:28:24 -0400 Subject: inotify: fix error paths in inotify_update_watch inotify_update_watch could leave things in a horrid state on a number of error paths. We could try to remove idr entries that didn't exist, we could send an IN_IGNORED to userspace for watches that don't exist, and a bit of other stupidity. Clean these up by doing the idr addition before we put the mark on the inode since we can clean that up on error and getting off the inode's mark list is hard. Signed-off-by: Eric Paris --- fs/notify/inotify/inotify_user.c | 79 +++++++++++++++++++++++++--------------- 1 file changed, 49 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index aff4214f16c..726118a5845 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -365,6 +365,17 @@ static int inotify_find_inode(const char __user *dirname, struct path *path, uns return error; } +static void inotify_remove_from_idr(struct fsnotify_group *group, + struct inotify_inode_mark_entry *ientry) +{ + struct idr *idr; + + spin_lock(&group->inotify_data.idr_lock); + idr = &group->inotify_data.idr; + idr_remove(idr, ientry->wd); + spin_unlock(&group->inotify_data.idr_lock); + ientry->wd = -1; +} /* * Send IN_IGNORED for this wd, remove this wd from the idr, and drop the * internal reference help on the mark because it is in the idr. @@ -375,7 +386,6 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry, struct inotify_inode_mark_entry *ientry; struct inotify_event_private_data *event_priv; struct fsnotify_event_private_data *fsn_event_priv; - struct idr *idr; ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); @@ -397,10 +407,7 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry, skip_send_ignore: /* remove this entry from the idr */ - spin_lock(&group->inotify_data.idr_lock); - idr = &group->inotify_data.idr; - idr_remove(idr, ientry->wd); - spin_unlock(&group->inotify_data.idr_lock); + inotify_remove_from_idr(group, ientry); /* removed from idr, drop that reference */ fsnotify_put_mark(entry); @@ -420,6 +427,7 @@ static int inotify_update_watch(struct fsnotify_group *group, struct inode *inod { struct fsnotify_mark_entry *entry = NULL; struct inotify_inode_mark_entry *ientry; + struct inotify_inode_mark_entry *tmp_ientry; int ret = 0; int add = (arg & IN_MASK_ADD); __u32 mask; @@ -430,50 +438,60 @@ static int inotify_update_watch(struct fsnotify_group *group, struct inode *inod if (unlikely(!mask)) return -EINVAL; - ientry = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL); - if (unlikely(!ientry)) + tmp_ientry = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL); + if (unlikely(!tmp_ientry)) return -ENOMEM; /* we set the mask at the end after attaching it */ - fsnotify_init_mark(&ientry->fsn_entry, inotify_free_mark); - ientry->wd = 0; + fsnotify_init_mark(&tmp_ientry->fsn_entry, inotify_free_mark); + tmp_ientry->wd = -1; find_entry: spin_lock(&inode->i_lock); entry = fsnotify_find_mark_entry(group, inode); spin_unlock(&inode->i_lock); if (entry) { - kmem_cache_free(inotify_inode_mark_cachep, ientry); ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); } else { - if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches) { - ret = -ENOSPC; - goto out_err; - } - - ret = fsnotify_add_mark(&ientry->fsn_entry, group, inode); - if (ret == -EEXIST) - goto find_entry; - else if (ret) + ret = -ENOSPC; + if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches) goto out_err; - - entry = &ientry->fsn_entry; retry: ret = -ENOMEM; if (unlikely(!idr_pre_get(&group->inotify_data.idr, GFP_KERNEL))) goto out_err; spin_lock(&group->inotify_data.idr_lock); - ret = idr_get_new_above(&group->inotify_data.idr, entry, - ++group->inotify_data.last_wd, - &ientry->wd); + ret = idr_get_new_above(&group->inotify_data.idr, &tmp_ientry->fsn_entry, + group->inotify_data.last_wd, + &tmp_ientry->wd); spin_unlock(&group->inotify_data.idr_lock); if (ret) { if (ret == -EAGAIN) goto retry; goto out_err; } + + ret = fsnotify_add_mark(&tmp_ientry->fsn_entry, group, inode); + if (ret) { + inotify_remove_from_idr(group, tmp_ientry); + if (ret == -EEXIST) + goto find_entry; + goto out_err; + } + + /* tmp_ientry has been added to the inode, so we are all set up. + * now we just need to make sure tmp_ientry doesn't get freed and + * we need to set up entry and ientry so the generic code can + * do its thing. */ + ientry = tmp_ientry; + entry = &ientry->fsn_entry; + tmp_ientry = NULL; + atomic_inc(&group->inotify_data.user->inotify_watches); + /* update the idr hint */ + group->inotify_data.last_wd = ientry->wd; + /* we put the mark on the idr, take a reference */ fsnotify_get_mark(entry); } @@ -514,14 +532,15 @@ retry: * depending on which path we took... */ fsnotify_put_mark(entry); - return ret; - out_err: - /* see this isn't supposed to happen, just kill the watch */ - if (entry) { - fsnotify_destroy_mark_by_entry(entry); - fsnotify_put_mark(entry); + /* could be an error, could be that we found an existing mark */ + if (tmp_ientry) { + /* on the idr but didn't make it on the inode */ + if (tmp_ientry->wd != -1) + inotify_remove_from_idr(group, tmp_ientry); + kmem_cache_free(inotify_inode_mark_cachep, tmp_ientry); } + return ret; } -- cgit v1.2.3-70-g09d2 From 520dc2a526fd681337883b6ff1ddcf7c23b1b063 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Mon, 13 Jul 2009 15:56:54 -0400 Subject: fsnotify: use def_bool in kconfig instead of letting the user choose fsnotify doens't give the user anything. If someone chooses inotify or dnotify it should build fsnotify, if they don't select one it shouldn't be built. This patch changes fsnotify to be a def_bool=n and makes everything else select it. Also fixes the issue people complained about on lwn where gdm hung because they didn't have inotify and they didn't get the inotify build option..... Signed-off-by: Eric Paris --- fs/notify/Kconfig | 12 +----------- fs/notify/dnotify/Kconfig | 2 +- fs/notify/inotify/Kconfig | 2 +- 3 files changed, 3 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/notify/Kconfig b/fs/notify/Kconfig index 31dac7e3b0f..dffbb0911d0 100644 --- a/fs/notify/Kconfig +++ b/fs/notify/Kconfig @@ -1,15 +1,5 @@ config FSNOTIFY - bool "Filesystem notification backend" - default y - ---help--- - fsnotify is a backend for filesystem notification. fsnotify does - not provide any userspace interface but does provide the basis - needed for other notification schemes such as dnotify, inotify, - and fanotify. - - Say Y here to enable fsnotify suport. - - If unsure, say Y. + def_bool n source "fs/notify/dnotify/Kconfig" source "fs/notify/inotify/Kconfig" diff --git a/fs/notify/dnotify/Kconfig b/fs/notify/dnotify/Kconfig index 904ff8d5405..f9c1ca139d8 100644 --- a/fs/notify/dnotify/Kconfig +++ b/fs/notify/dnotify/Kconfig @@ -1,6 +1,6 @@ config DNOTIFY bool "Dnotify support" - depends on FSNOTIFY + select FSNOTIFY default y help Dnotify is a directory-based per-fd file change notification system diff --git a/fs/notify/inotify/Kconfig b/fs/notify/inotify/Kconfig index 5356884289a..3e56dbffe72 100644 --- a/fs/notify/inotify/Kconfig +++ b/fs/notify/inotify/Kconfig @@ -15,7 +15,7 @@ config INOTIFY config INOTIFY_USER bool "Inotify support for userspace" - depends on FSNOTIFY + select FSNOTIFY default y ---help--- Say Y here to enable inotify support for userspace, including the -- cgit v1.2.3-70-g09d2 From 4a148ba988988b9c400ad0f2cbccc155289b954b Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Mon, 13 Jul 2009 15:56:55 -0400 Subject: inotify: check filename before dropping repeat events inotify drops events if the last event on the queue is the same as the current event. But it does 2 things wrong. First it is comparing old->inode with new->inode. But after an event if put on the queue the ->inode is no longer allowed to be used. It's possible between the last event and this new event the inode could be reused and we would falsely match the inode's memory address between two differing events. The second problem is that when a file is removed fsnotify is passed the negative dentry for the removed object rather than the postive dentry from immediately before the removal. This mean the (broken) inotify tail drop code was matching the NULL ->inode of differing events. The fix is to check the file name which is stored with events when doing the tail drop instead of wrongly checking the address of the stored ->inode. Reported-by: Scott James Remnant Signed-off-by: Eric Paris --- fs/notify/notification.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/notify/notification.c b/fs/notify/notification.c index 959b73e756f..69391fe8efb 100644 --- a/fs/notify/notification.c +++ b/fs/notify/notification.c @@ -136,10 +136,15 @@ static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new { if ((old->mask == new->mask) && (old->to_tell == new->to_tell) && - (old->data_type == new->data_type)) { + (old->data_type == new->data_type) && + (old->name_len == new->name_len)) { switch (old->data_type) { case (FSNOTIFY_EVENT_INODE): - if (old->inode == new->inode) + /* remember, after old was put on the wait_q we aren't + * allowed to look at the inode any more, only thing + * left to check was if the file_name is the same */ + if (old->name_len && + !strcmp(old->file_name, new->file_name)) return true; break; case (FSNOTIFY_EVENT_PATH): -- cgit v1.2.3-70-g09d2 From c05594b62125c528d93af3a78229793aae36df7f Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Mon, 13 Jul 2009 15:56:55 -0400 Subject: fsnotify: fix inotify tail drop check with path entries fsnotify drops new events when they are the same as the tail event on the queue to be sent to userspace. The problem is that if the event comes with a path we forget to break out of the switch statement and fall into the code path which matches on events that do not have any type of file backed information (things like IN_UNMOUNT and IN_Q_OVERFLOW). The problem is that this code thinks all such events should be dropped. Fix is to add a break. Signed-off-by: Eric Paris --- fs/notify/notification.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/notify/notification.c b/fs/notify/notification.c index 69391fe8efb..2b20feaf263 100644 --- a/fs/notify/notification.c +++ b/fs/notify/notification.c @@ -151,6 +151,7 @@ static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new if ((old->path.mnt == new->path.mnt) && (old->path.dentry == new->path.dentry)) return true; + break; case (FSNOTIFY_EVENT_NONE): return true; }; -- cgit v1.2.3-70-g09d2 From f44aebcc566d1d6275f7191867b9633dc11de2ee Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Wed, 15 Jul 2009 15:49:52 -0400 Subject: inotify: use GFP_NOFS under potential memory pressure inotify can have a watchs removed under filesystem reclaim. ================================= [ INFO: inconsistent lock state ] 2.6.31-rc2 #16 --------------------------------- inconsistent {IN-RECLAIM_FS-W} -> {RECLAIM_FS-ON-W} usage. khubd/217 [HC0[0]:SC0[0]:HE1:SE1] takes: (iprune_mutex){+.+.?.}, at: [] invalidate_inodes+0x20/0xe3 {IN-RECLAIM_FS-W} state was registered at: [] __lock_acquire+0x2c9/0xac4 [] lock_acquire+0x9f/0xc2 [] __mutex_lock_common+0x2d/0x323 [] mutex_lock_nested+0x2e/0x36 [] shrink_icache_memory+0x38/0x1b2 [] shrink_slab+0xe2/0x13c [] kswapd+0x3d1/0x55d [] kthread+0x66/0x6b [] kernel_thread_helper+0x7/0x10 [] 0xffffffff Two things are needed to fix this. First we need a method to tell fsnotify_create_event() to use GFP_NOFS and second we need to stop using one global IN_IGNORED event and allocate them one at a time. This solves current issues with multiple IN_IGNORED on a queue having tail drop problems and simplifies the allocations since we don't have to worry about two tasks opperating on the IGNORED event concurrently. Signed-off-by: Eric Paris --- fs/notify/fsnotify.c | 4 +++- fs/notify/inotify/inotify_user.c | 18 ++++++++++++------ fs/notify/notification.c | 9 +++++---- include/linux/fsnotify_backend.h | 2 +- 4 files changed, 21 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index ec2f7bd7681..037e878e03f 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -159,7 +159,9 @@ void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, const if (!group->ops->should_send_event(group, to_tell, mask)) continue; if (!event) { - event = fsnotify_create_event(to_tell, mask, data, data_is, file_name, cookie); + event = fsnotify_create_event(to_tell, mask, data, + data_is, file_name, cookie, + GFP_KERNEL); /* shit, we OOM'd and now we can't tell, maybe * someday someone else will want to do something * here */ diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 726118a5845..f30d9bbc2e1 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -57,7 +57,6 @@ int inotify_max_user_watches __read_mostly; static struct kmem_cache *inotify_inode_mark_cachep __read_mostly; struct kmem_cache *event_priv_cachep __read_mostly; -static struct fsnotify_event *inotify_ignored_event; /* * When inotify registers a new group it increments this and uses that @@ -384,12 +383,19 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry, struct fsnotify_group *group) { struct inotify_inode_mark_entry *ientry; + struct fsnotify_event *ignored_event; struct inotify_event_private_data *event_priv; struct fsnotify_event_private_data *fsn_event_priv; + ignored_event = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL, + FSNOTIFY_EVENT_NONE, NULL, 0, + GFP_NOFS); + if (!ignored_event) + return; + ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); - event_priv = kmem_cache_alloc(event_priv_cachep, GFP_KERNEL); + event_priv = kmem_cache_alloc(event_priv_cachep, GFP_NOFS); if (unlikely(!event_priv)) goto skip_send_ignore; @@ -398,7 +404,7 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry, fsn_event_priv->group = group; event_priv->wd = ientry->wd; - fsnotify_add_notify_event(group, inotify_ignored_event, fsn_event_priv); + fsnotify_add_notify_event(group, ignored_event, fsn_event_priv); /* did the private data get added? */ if (list_empty(&fsn_event_priv->event_list)) @@ -406,6 +412,9 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry, skip_send_ignore: + /* matches the reference taken when the event was created */ + fsnotify_put_event(ignored_event); + /* remove this entry from the idr */ inotify_remove_from_idr(group, ientry); @@ -748,9 +757,6 @@ static int __init inotify_user_setup(void) inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark_entry, SLAB_PANIC); event_priv_cachep = KMEM_CACHE(inotify_event_private_data, SLAB_PANIC); - inotify_ignored_event = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL, FSNOTIFY_EVENT_NONE, NULL, 0); - if (!inotify_ignored_event) - panic("unable to allocate the inotify ignored event\n"); inotify_max_queued_events = 16384; inotify_max_user_instances = 128; diff --git a/fs/notify/notification.c b/fs/notify/notification.c index 2b20feaf263..521368574e9 100644 --- a/fs/notify/notification.c +++ b/fs/notify/notification.c @@ -153,7 +153,7 @@ static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new return true; break; case (FSNOTIFY_EVENT_NONE): - return true; + return false; }; } return false; @@ -345,18 +345,19 @@ static void initialize_event(struct fsnotify_event *event) * @name the filename, if available */ struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, void *data, - int data_type, const char *name, u32 cookie) + int data_type, const char *name, u32 cookie, + gfp_t gfp) { struct fsnotify_event *event; - event = kmem_cache_alloc(fsnotify_event_cachep, GFP_KERNEL); + event = kmem_cache_alloc(fsnotify_event_cachep, gfp); if (!event) return NULL; initialize_event(event); if (name) { - event->file_name = kstrdup(name, GFP_KERNEL); + event->file_name = kstrdup(name, gfp); if (!event->file_name) { kmem_cache_free(fsnotify_event_cachep, event); return NULL; diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 6c3de999fb3..4d6f47b5118 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -352,7 +352,7 @@ extern void fsnotify_unmount_inodes(struct list_head *list); /* put here because inotify does some weird stuff when destroying watches */ extern struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, void *data, int data_is, const char *name, - u32 cookie); + u32 cookie, gfp_t gfp); #else -- cgit v1.2.3-70-g09d2 From b64aec8d1e1d8482a7b6cca60c8105c756bf1fe4 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 21 Jul 2009 16:47:46 -0400 Subject: NFSv4: Fix an Oops in nfs4_free_lock_state The oops http://www.kerneloops.org/raw.php?rawid=537858&msgid= appears to be due to the nfs4_lock_state->ls_state field being uninitialised. This happens if the call to nfs4_free_lock_state() is triggered at the end of nfs4_get_lock_state(). The fix is to move the initialisation of ls_state into the allocator. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4state.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index b73c5a72865..65ca8c18476 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -553,6 +553,7 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f INIT_LIST_HEAD(&lsp->ls_sequence.list); lsp->ls_seqid.sequence = &lsp->ls_sequence; atomic_set(&lsp->ls_count, 1); + lsp->ls_state = state; lsp->ls_owner = fl_owner; spin_lock(&clp->cl_lock); nfs_alloc_unique_id(&clp->cl_lockowner_id, &lsp->ls_id, 1, 64); @@ -587,7 +588,6 @@ static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_ if (lsp != NULL) break; if (new != NULL) { - new->ls_state = state; list_add(&new->ls_locks, &state->lock_states); set_bit(LK_STATE_IN_USE, &state->flags); lsp = new; -- cgit v1.2.3-70-g09d2 From fccba8045537f7e840d0e7565e1989d465e488a3 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 21 Jul 2009 16:48:07 -0400 Subject: NFSv4: Fix an NFSv4 mount regression Commit 008f55d0e019943323c20a03493a2ba5672a4cc8 (nfs41: recover lease in _nfs4_lookup_root) forces the state manager to always run on mount. This is a bug in the case of NFSv4.0, which doesn't require us to send a setclientid until we want to grab file state. In any case, this is completely the wrong place to be doing state management. Moving that code into nfs4_init_session... Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 18 +++--------------- fs/nfs/nfs4_fs.h | 6 ++++++ fs/nfs/nfs4proc.c | 24 +++++++++++++++++------- 3 files changed, 26 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/nfs/client.c b/fs/nfs/client.c index c2d061675d8..8d25ccb2d51 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -1242,20 +1242,6 @@ error: return error; } -/* - * Initialize a session. - * Note: save the mount rsize and wsize for create_server negotiation. - */ -static void nfs4_init_session(struct nfs_client *clp, - unsigned int wsize, unsigned int rsize) -{ -#if defined(CONFIG_NFS_V4_1) - if (nfs4_has_session(clp)) { - clp->cl_session->fc_attrs.max_rqst_sz = wsize; - clp->cl_session->fc_attrs.max_resp_sz = rsize; - } -#endif /* CONFIG_NFS_V4_1 */ -} /* * Session has been established, and the client marked ready. @@ -1350,7 +1336,9 @@ struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data, BUG_ON(!server->nfs_client->rpc_ops); BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops); - nfs4_init_session(server->nfs_client, server->wsize, server->rsize); + error = nfs4_init_session(server); + if (error < 0) + goto error; /* Probe the root fh to retrieve its FSID */ error = nfs4_path_walk(server, mntfh, data->nfs_server.export_path); diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 61bc3a32e1e..6ea07a3c75d 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -220,6 +220,7 @@ extern void nfs4_destroy_session(struct nfs4_session *session); extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp); extern int nfs4_proc_create_session(struct nfs_client *, int reset); extern int nfs4_proc_destroy_session(struct nfs4_session *); +extern int nfs4_init_session(struct nfs_server *server); #else /* CONFIG_NFS_v4_1 */ static inline int nfs4_setup_sequence(struct nfs_client *clp, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, @@ -227,6 +228,11 @@ static inline int nfs4_setup_sequence(struct nfs_client *clp, { return 0; } + +static inline int nfs4_init_session(struct nfs_server *server) +{ + return 0; +} #endif /* CONFIG_NFS_V4_1 */ extern struct nfs4_state_maintenance_ops *nfs4_state_renewal_ops[]; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index ff0c080db59..df24f67bca6 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2040,15 +2040,9 @@ static int _nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle, .rpc_argp = &args, .rpc_resp = &res, }; - int status; nfs_fattr_init(info->fattr); - status = nfs4_recover_expired_lease(server); - if (!status) - status = nfs4_check_client_ready(server->nfs_client); - if (!status) - status = nfs4_call_sync(server, &msg, &args, &res, 0); - return status; + return nfs4_call_sync(server, &msg, &args, &res, 0); } static int nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle, @@ -4793,6 +4787,22 @@ int nfs4_proc_destroy_session(struct nfs4_session *session) return status; } +int nfs4_init_session(struct nfs_server *server) +{ + struct nfs_client *clp = server->nfs_client; + int ret; + + if (!nfs4_has_session(clp)) + return 0; + + clp->cl_session->fc_attrs.max_rqst_sz = server->wsize; + clp->cl_session->fc_attrs.max_resp_sz = server->rsize; + ret = nfs4_recover_expired_lease(server); + if (!ret) + ret = nfs4_check_client_ready(clp); + return ret; +} + /* * Renew the cl_session lease. */ -- cgit v1.2.3-70-g09d2 From d953126a28f97ec965d23c69fd5795854c048f30 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 21 Jul 2009 19:22:38 -0400 Subject: NFSv4: Fix a problem whereby a buggy server can oops the kernel We just had a case in which a buggy server occasionally returns the wrong attributes during an OPEN call. While the client does catch this sort of condition in nfs4_open_done(), and causes the nfs4_atomic_open() to return -EISDIR, the logic in nfs_atomic_lookup() is broken, since it causes a fallback to an ordinary lookup instead of just returning the error. When the buggy server then returns a regular file for the fallback lookup, the VFS allows the open, and bad things start to happen, since the open file doesn't have any associated NFSv4 state. The fix is firstly to return the EISDIR/ENOTDIR errors immediately, and secondly to ensure that we are always careful when dereferencing the nfs_open_context state pointer. Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 2 +- fs/nfs/nfs4proc.c | 16 ++++++++++++---- 2 files changed, 13 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 38d42c29fb9..32062c33c85 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1025,12 +1025,12 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry res = NULL; goto out; /* This turned out not to be a regular file */ - case -EISDIR: case -ENOTDIR: goto no_open; case -ELOOP: if (!(nd->intent.open.flags & O_NOFOLLOW)) goto no_open; + /* case -EISDIR: */ /* case -EINVAL: */ default: goto out; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index df24f67bca6..6917311f201 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4093,15 +4093,23 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request) if (request->fl_start < 0 || request->fl_end < 0) return -EINVAL; - if (IS_GETLK(cmd)) - return nfs4_proc_getlk(state, F_GETLK, request); + if (IS_GETLK(cmd)) { + if (state != NULL) + return nfs4_proc_getlk(state, F_GETLK, request); + return 0; + } if (!(IS_SETLK(cmd) || IS_SETLKW(cmd))) return -EINVAL; - if (request->fl_type == F_UNLCK) - return nfs4_proc_unlck(state, cmd, request); + if (request->fl_type == F_UNLCK) { + if (state != NULL) + return nfs4_proc_unlck(state, cmd, request); + return 0; + } + if (state == NULL) + return -ENOLCK; do { status = nfs4_proc_setlk(state, cmd, request); if ((status != -EAGAIN) || IS_SETLK(cmd)) -- cgit v1.2.3-70-g09d2 From 023d43c7b5a23a81fe8afa9f37296f8ed4be11fb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 21 Jul 2009 10:09:23 +0200 Subject: lockdep: Fix lockdep annotation for pipe_double_lock() The presumed use of the pipe_double_lock() routine is to lock 2 locks in a deadlock free way by ordering the locks by their address. However it fails to keep the specified lock classes in order and explicitly annotates a deadlock. Rectify this. Signed-off-by: Peter Zijlstra Acked-by: Miklos Szeredi LKML-Reference: <1248163763.15751.11098.camel@twins> --- fs/pipe.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/pipe.c b/fs/pipe.c index f7dd21ad85a..52c41511483 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -68,8 +68,8 @@ void pipe_double_lock(struct pipe_inode_info *pipe1, pipe_lock_nested(pipe1, I_MUTEX_PARENT); pipe_lock_nested(pipe2, I_MUTEX_CHILD); } else { - pipe_lock_nested(pipe2, I_MUTEX_CHILD); - pipe_lock_nested(pipe1, I_MUTEX_PARENT); + pipe_lock_nested(pipe2, I_MUTEX_PARENT); + pipe_lock_nested(pipe1, I_MUTEX_CHILD); } } -- cgit v1.2.3-70-g09d2 From ce6e7fcd43aab1f77e56aa36936dd7d2d05a1ffa Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 22 Jul 2009 15:08:58 -0400 Subject: cifs: disable serverino if server doesn't support it A recent regression when dealing with older servers. This bug was introduced when we made serverino the default... When the server can't provide inode numbers, disable it for the mount. Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/inode.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 18afe57b246..b6a47b32f21 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -513,9 +513,12 @@ int cifs_get_inode_info(struct inode **pinode, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); if (rc1) { - /* BB EOPNOSUPP disable SERVER_INUM? */ cFYI(1, ("GetSrvInodeNum rc %d", rc1)); fattr.cf_uniqueid = iunique(sb, ROOT_I); + /* disable serverino if call not supported */ + if (rc1 == -EINVAL) + cifs_sb->mnt_cifs_flags &= + ~CIFS_MOUNT_SERVER_INUM; } } else { fattr.cf_uniqueid = iunique(sb, ROOT_I); -- cgit v1.2.3-70-g09d2 From 03aa3a49ad3592a9e4e1ab19c6da3e852288caf1 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 21 Jul 2009 19:42:03 -0400 Subject: cifs: fix sb->s_maxbytes so that it casts properly to a signed value This off-by-one bug causes sendfile() to not work properly. When a task calls sendfile() on a file on a CIFS filesystem, the syscall returns -1 and sets errno to EOVERFLOW. do_sendfile uses s_maxbytes to verify the returned offset of the file. The problem there is that this value is cast to a signed value (loff_t). When this is done on the s_maxbytes value that cifs uses, it becomes negative and the comparisons against it fail. Even though s_maxbytes is an unsigned value, it seems that it's not OK to set it in such a way that it'll end up negative when it's cast to a signed value. These casts happen in other codepaths besides sendfile too, but the VFS is a little hard to follow in this area and I can't be sure if there are other bugs that this will fix. It's not clear to me why s_maxbytes isn't just declared as loff_t in the first place, but either way we still need to fix these values to make sendfile work properly. This is also an opportunity to replace the magic bit-shift values here with the standard #defines for this. This fixes the reproducer program I have that does a sendfile and will probably also fix the situation where apache is serving from a CIFS share. Acked-by: Johannes Weiner Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 9bb5c875073..fc44d316d0b 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -2452,10 +2452,10 @@ try_mount_again: tcon->local_lease = volume_info->local_lease; } if (pSesInfo) { - if (pSesInfo->capabilities & CAP_LARGE_FILES) { - sb->s_maxbytes = (u64) 1 << 63; - } else - sb->s_maxbytes = (u64) 1 << 31; /* 2 GB */ + if (pSesInfo->capabilities & CAP_LARGE_FILES) + sb->s_maxbytes = MAX_LFS_FILESIZE; + else + sb->s_maxbytes = MAX_NON_LFS; } /* BB FIXME fix time_gran to be larger for LANMAN sessions */ -- cgit v1.2.3-70-g09d2 From f1230c97978f52268d8c66e6f88e54c3d2092a75 Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 22 Jul 2009 23:13:01 +0000 Subject: [CIFS] fix sparse warning Signed-off-by: Steve French --- fs/cifs/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index b6a47b32f21..82d83839655 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -212,7 +212,7 @@ cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, FILE_UNIX_BASIC_INFO *info, * junction to the new submount (ie to setup the fake directory * which represents a DFS referral). */ -void +static void cifs_create_dfs_fattr(struct cifs_fattr *fattr, struct super_block *sb) { struct cifs_sb_info *cifs_sb = CIFS_SB(sb); @@ -388,7 +388,7 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path, } /* Fill a cifs_fattr struct with info from FILE_ALL_INFO */ -void +static void cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info, struct cifs_sb_info *cifs_sb, bool adjust_tz) { -- cgit v1.2.3-70-g09d2 From 4a19fb11a90fdbbcb3bc02effa036230d035ca28 Mon Sep 17 00:00:00 2001 From: Stefan Bader Date: Thu, 23 Jul 2009 11:26:05 +0200 Subject: jfs: Fix early release of acl in jfs_get_acl BugLink: http://bugs.launchpad.net/ubuntu/+bug/396780 Commit 073aaa1b142461d91f83da66db1184d7c1b1edea "helpers for acl caching + switch to those" introduced new helper functions for acl handling but seems to have introduced a regression for jfs as the acl is released before returning it to the caller, instead of leaving this for the caller to do. This causes the acl object to be used after freeing it, leading to kernel panics in completely different places. Thanks to Christophe Dumez for reporting and bisecting into this. Reported-by: Christophe Dumez Tested-by: Christophe Dumez Signed-off-by: Stefan Bader Acked-by: Andy Whitcroft Signed-off-by: Dave Kleikamp --- fs/jfs/acl.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index 91fa3ad6e8c..a29c7c3e3fb 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -67,10 +67,8 @@ static struct posix_acl *jfs_get_acl(struct inode *inode, int type) acl = posix_acl_from_xattr(value, size); } kfree(value); - if (!IS_ERR(acl)) { + if (!IS_ERR(acl)) set_cached_acl(inode, type, acl); - posix_acl_release(acl); - } return acl; } -- cgit v1.2.3-70-g09d2