From 05496769e5da83ce22ed97345afd9c7b71d6bd24 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 16 Sep 2008 14:36:17 -0400 Subject: jbd2: clean up how the journal device name is printed Calculate the journal device name once and stash it away in the journal_s structure. This avoids needing to call bdevname() everywhere and reduces stack usage by not needing to allocate an on-stack buffer. In addition, we eliminate the '/' that can appear in device names (e.g. "cciss/c0d0p9" --- see kernel bugzilla #11321) that can cause problems when creating proc directory names, and include the inode number to support ocfs2 which creates multiple journals with different inode numbers. Signed-off-by: "Theodore Ts'o" --- fs/jbd2/commit.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'fs/jbd2/commit.c') diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index f2ad061e95e..b091e5378fe 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -147,12 +147,9 @@ static int journal_submit_commit_record(journal_t *journal, * to remember if we sent a barrier request */ if (ret == -EOPNOTSUPP && barrier_done) { - char b[BDEVNAME_SIZE]; - printk(KERN_WARNING - "JBD: barrier-based sync failed on %s - " - "disabling barriers\n", - bdevname(journal->j_dev, b)); + "JBD: barrier-based sync failed on %s - " + "disabling barriers\n", journal->j_devname); spin_lock(&journal->j_state_lock); journal->j_flags &= ~JBD2_BARRIER; spin_unlock(&journal->j_state_lock); @@ -681,11 +678,9 @@ start_journal_io: */ err = journal_finish_inode_data_buffers(journal, commit_transaction); if (err) { - char b[BDEVNAME_SIZE]; - printk(KERN_WARNING "JBD2: Detected IO errors while flushing file data " - "on %s\n", bdevname(journal->j_fs_dev, b)); + "on %s\n", journal->j_devname); err = 0; } -- cgit v1.2.3-70-g09d2 From ede86cc473defab74d778aeac14b19f43129d4d1 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 5 Oct 2008 20:50:06 -0400 Subject: ext4: Add debugging markers that can be used by systemtap This debugging markers are designed to debug problems such as the random filesystem latency problems reported by Arjan. Signed-off-by: "Theodore Ts'o" --- fs/ext4/fsync.c | 5 +++++ fs/ext4/super.c | 2 ++ fs/jbd2/checkpoint.c | 3 +++ fs/jbd2/commit.c | 6 ++++++ 4 files changed, 16 insertions(+) (limited to 'fs/jbd2/commit.c') diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index c37d1e86f51..5afe4370840 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -28,6 +28,7 @@ #include #include #include +#include #include "ext4.h" #include "ext4_jbd2.h" @@ -51,6 +52,10 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) J_ASSERT(ext4_journal_current_handle() == NULL); + trace_mark(ext4_sync_file, "dev %s datasync %d ino %ld parent %ld", + inode->i_sb->s_id, datasync, inode->i_ino, + dentry->d_parent->d_inode->i_ino); + /* * data=writeback: * The caller's filemap_fdatawrite()/wait will sync the data. diff --git a/fs/ext4/super.c b/fs/ext4/super.c index dfcd41fafb9..9c0214689de 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -2951,6 +2952,7 @@ static int ext4_sync_fs(struct super_block *sb, int wait) { tid_t target; + trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait); sb->s_dirt = 0; if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) { if (wait) diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index af4651bf357..42895d36945 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -328,6 +329,8 @@ int jbd2_log_do_checkpoint(journal_t *journal) * journal straight away. */ result = jbd2_cleanup_journal_tail(journal); + trace_mark(jbd2_checkpoint, "dev %s need_checkpoint %d", + journal->j_devname, result); jbd_debug(1, "cleanup_journal_tail returned %d\n", result); if (result <= 0) return result; diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index b091e5378fe..e91f051a985 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -368,6 +369,8 @@ void jbd2_journal_commit_transaction(journal_t *journal) commit_transaction = journal->j_running_transaction; J_ASSERT(commit_transaction->t_state == T_RUNNING); + trace_mark(jbd2_start_commit, "dev %s transaction %d", + journal->j_devname, commit_transaction->t_tid); jbd_debug(1, "JBD: starting commit of transaction %d\n", commit_transaction->t_tid); @@ -985,6 +988,9 @@ restart_loop: } spin_unlock(&journal->j_list_lock); + trace_mark(jbd2_end_commit, "dev %s transaction %d head %d", + journal->j_devname, commit_transaction->t_tid, + journal->j_tail_sequence); jbd_debug(1, "JBD: commit %d complete, head %d\n", journal->j_commit_sequence, journal->j_tail_sequence); -- cgit v1.2.3-70-g09d2 From 45a90bfd90c1215bf824c0f705b409723f52361b Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 6 Oct 2008 12:04:02 -0400 Subject: jbd2: Fix buffer head leak when writing the commit block Also make sure the buffer heads are marked clean before submitting bh for writing. The previous code was marking the buffer head dirty, which would have forced an unneeded write (and seek) to the journal for no good reason. Signed-off-by: "Theodore Ts'o" --- fs/jbd2/commit.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs/jbd2/commit.c') diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index e91f051a985..0d3814a35ed 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -127,8 +127,7 @@ static int journal_submit_commit_record(journal_t *journal, JBUFFER_TRACE(descriptor, "submit commit block"); lock_buffer(bh); - get_bh(bh); - set_buffer_dirty(bh); + clear_buffer_dirty(bh); set_buffer_uptodate(bh); bh->b_end_io = journal_end_buffer_io_sync; @@ -158,7 +157,7 @@ static int journal_submit_commit_record(journal_t *journal, /* And try again, without the barrier */ lock_buffer(bh); set_buffer_uptodate(bh); - set_buffer_dirty(bh); + clear_buffer_dirty(bh); ret = submit_bh(WRITE, bh); } *cbh = bh; -- cgit v1.2.3-70-g09d2 From 77e841de8abac4755cc83ca224fdf71418d65380 Mon Sep 17 00:00:00 2001 From: Hidehiro Kawai Date: Sun, 12 Oct 2008 16:39:16 -0400 Subject: jbd2: abort when failed to log metadata buffers If we failed to write metadata buffers to the journal space and succeeded to write the commit record, stale data can be written back to the filesystem as metadata in the recovery phase. To avoid this, when we failed to write out metadata buffers, abort the journal before writing the commit record. We can also avoid this kind of corruption by using the journal checksum feature because it can detect invalid metadata blocks in the journal and avoid them from being replayed. So we don't need to care about asynchronous commit record writeout with a checksum. Signed-off-by: Hidehiro Kawai Signed-off-by: Theodore Ts'o --- fs/jbd2/commit.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs/jbd2/commit.c') diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 0d3814a35ed..78e4da93412 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -783,6 +783,9 @@ wait_for_iobuf: /* AKPM: bforget here */ } + if (err) + jbd2_journal_abort(journal, err); + jbd_debug(3, "JBD: commit phase 5\n"); if (!JBD2_HAS_INCOMPAT_FEATURE(journal, -- cgit v1.2.3-70-g09d2 From 7ad7445f60fe4d46c4c9d2a9463db180d2a3b270 Mon Sep 17 00:00:00 2001 From: Hidehiro Kawai Date: Fri, 10 Oct 2008 20:29:31 -0400 Subject: jbd2: don't dirty original metadata buffer on abort Currently, original metadata buffers are dirtied when they are unfiled whether the journal has aborted or not. Eventually these buffers will be written-back to the filesystem by pdflush. This means some metadata buffers are written to the filesystem without journaling if the journal aborts. So if both journal abort and system crash happen at the same time, the filesystem would become inconsistent state. Additionally, replaying journaled metadata can overwrite the latest metadata on the filesystem partly. Because, if the journal gets aborted, journaled metadata are preserved and replayed during the next mount not to lose uncheckpointed metadata. This would also break the consistency of the filesystem. This patch prevents original metadata buffers from being dirtied on abort by clearing BH_JBDDirty flag from those buffers. Thus, no metadata buffers are written to the filesystem without journaling. Signed-off-by: Hidehiro Kawai Signed-off-by: Theodore Ts'o --- fs/jbd2/commit.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs/jbd2/commit.c') diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 78e4da93412..849f10496ce 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -504,9 +504,10 @@ void jbd2_journal_commit_transaction(journal_t *journal) jh = commit_transaction->t_buffers; /* If we're in abort mode, we just un-journal the buffer and - release it for background writing. */ + release it. */ if (is_journal_aborted(journal)) { + clear_buffer_jbddirty(jh2bh(jh)); JBUFFER_TRACE(jh, "journal is aborting: refile"); jbd2_journal_refile_buffer(journal, jh); /* If that was the last one, we need to clean up @@ -884,6 +885,8 @@ restart_loop: if (buffer_jbddirty(bh)) { JBUFFER_TRACE(jh, "add to new checkpointing trans"); __jbd2_journal_insert_checkpoint(jh, commit_transaction); + if (is_journal_aborted(journal)) + clear_buffer_jbddirty(bh); JBUFFER_TRACE(jh, "refile for checkpoint writeback"); __jbd2_journal_refile_buffer(jh); jbd_unlock_bh_state(bh); -- cgit v1.2.3-70-g09d2 From 5bf5683a33f3584da6eced480967c4f7e11515a8 Mon Sep 17 00:00:00 2001 From: Hidehiro Kawai Date: Fri, 10 Oct 2008 22:12:43 -0400 Subject: ext4: add an option to control error handling on file data If the journal doesn't abort when it gets an IO error in file data blocks, the file data corruption will spread silently. Because most of applications and commands do buffered writes without fsync(), they don't notice the IO error. It's scary for mission critical systems. On the other hand, if the journal aborts whenever it gets an IO error in file data blocks, the system will easily become inoperable. So this patch introduces a filesystem option to determine whether it aborts the journal or just call printk() when it gets an IO error in file data. If you mount an ext4 fs with data_err=abort option, it aborts on file data write error. If you mount it with data_err=ignore, it doesn't abort, just call printk(). data_err=ignore is the default. Here is the corresponding patch of the ext3 version: http://kerneltrap.org/mailarchive/linux-kernel/2008/9/9/3239374 Signed-off-by: Hidehiro Kawai Signed-off-by: Theodore Ts'o --- Documentation/filesystems/ext4.txt | 5 +++++ fs/ext4/ext4.h | 2 ++ fs/ext4/super.c | 16 ++++++++++++++++ fs/jbd2/commit.c | 2 ++ include/linux/jbd2.h | 3 +++ 5 files changed, 28 insertions(+) (limited to 'fs/jbd2/commit.c') diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt index 74484e69640..eb154ef36c2 100644 --- a/Documentation/filesystems/ext4.txt +++ b/Documentation/filesystems/ext4.txt @@ -223,6 +223,11 @@ errors=remount-ro(*) Remount the filesystem read-only on an error. errors=continue Keep going on a filesystem error. errors=panic Panic and halt the machine if an error occurs. +data_err=ignore(*) Just print an error message if an error occurs + in a file data buffer in ordered mode. +data_err=abort Abort the journal if an error occurs in a file + data buffer in ordered mode. + grpid Give objects the same group ID as their creator. bsdgroups diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index f46a513a515..6690a41cdd9 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -540,6 +540,8 @@ do { \ #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ #define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ #define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ +#define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ + /* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */ #ifndef _LINUX_EXT2_FS_H #define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 79bd3989e84..014677b8e22 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -778,6 +778,9 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_printf(seq, ",inode_readahead_blks=%u", sbi->s_inode_readahead_blks); + if (test_opt(sb, DATA_ERR_ABORT)) + seq_puts(seq, ",data_err=abort"); + ext4_show_quota_options(seq, sb); return 0; } @@ -907,6 +910,7 @@ enum { Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev, Opt_journal_checksum, Opt_journal_async_commit, Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, + Opt_data_err_abort, Opt_data_err_ignore, Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, @@ -953,6 +957,8 @@ static match_table_t tokens = { {Opt_data_journal, "data=journal"}, {Opt_data_ordered, "data=ordered"}, {Opt_data_writeback, "data=writeback"}, + {Opt_data_err_abort, "data_err=abort"}, + {Opt_data_err_ignore, "data_err=ignore"}, {Opt_offusrjquota, "usrjquota="}, {Opt_usrjquota, "usrjquota=%s"}, {Opt_offgrpjquota, "grpjquota="}, @@ -1187,6 +1193,12 @@ static int parse_options(char *options, struct super_block *sb, sbi->s_mount_opt |= data_opt; } break; + case Opt_data_err_abort: + set_opt(sbi->s_mount_opt, DATA_ERR_ABORT); + break; + case Opt_data_err_ignore: + clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT); + break; #ifdef CONFIG_QUOTA case Opt_usrjquota: qtype = USRQUOTA; @@ -2535,6 +2547,10 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal) journal->j_flags |= JBD2_BARRIER; else journal->j_flags &= ~JBD2_BARRIER; + if (test_opt(sb, DATA_ERR_ABORT)) + journal->j_flags |= JBD2_ABORT_ON_SYNCDATA_ERR; + else + journal->j_flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR; spin_unlock(&journal->j_state_lock); } diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 849f10496ce..0abe02c4242 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -684,6 +684,8 @@ start_journal_io: printk(KERN_WARNING "JBD2: Detected IO errors while flushing file data " "on %s\n", journal->j_devname); + if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR) + jbd2_journal_abort(journal, err); err = 0; } diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index c9e7d781db3..d2e91ea998f 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -967,6 +967,9 @@ struct journal_s #define JBD2_FLUSHED 0x008 /* The journal superblock has been flushed */ #define JBD2_LOADED 0x010 /* The journal superblock has been loaded */ #define JBD2_BARRIER 0x020 /* Use IDE barriers */ +#define JBD2_ABORT_ON_SYNCDATA_ERR 0x040 /* Abort the journal on file + * data write error in ordered + * mode */ /* * Function declarations for the journaling transaction and buffer -- cgit v1.2.3-70-g09d2 From 3e624fc72fba09b6f999a9fbb87b64efccd38036 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 16 Oct 2008 20:00:24 -0400 Subject: ext4: Replace hackish ext4_mb_poll_new_transaction with commit callback The multiblock allocator needs to be able to release blocks (and issue a blkdev discard request) when the transaction which freed those blocks is committed. Previously this was done via a polling mechanism when blocks are allocated or freed. A much better way of doing things is to create a jbd2 callback function and attaching the list of blocks to be freed directly to the transaction structure. Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4_sb.h | 3 -- fs/ext4/mballoc.c | 85 +++++++++------------------------------------------ fs/ext4/mballoc.h | 3 +- fs/jbd2/commit.c | 3 ++ fs/jbd2/transaction.c | 1 + include/linux/jbd2.h | 9 ++++++ 6 files changed, 29 insertions(+), 75 deletions(-) (limited to 'fs/jbd2/commit.c') diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h index 6a0b40d4326..445fde603df 100644 --- a/fs/ext4/ext4_sb.h +++ b/fs/ext4/ext4_sb.h @@ -99,9 +99,6 @@ struct ext4_sb_info { struct inode *s_buddy_cache; long s_blocks_reserved; spinlock_t s_reserve_lock; - struct list_head s_active_transaction; - struct list_head s_closed_transaction; - struct list_head s_committed_transaction; spinlock_t s_md_lock; tid_t s_last_transaction; unsigned short *s_mb_offsets, *s_mb_maxs; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index da1da1fe2b1..dfe17a13405 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2523,9 +2523,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) } spin_lock_init(&sbi->s_md_lock); - INIT_LIST_HEAD(&sbi->s_active_transaction); - INIT_LIST_HEAD(&sbi->s_closed_transaction); - INIT_LIST_HEAD(&sbi->s_committed_transaction); spin_lock_init(&sbi->s_bal_lock); sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN; @@ -2554,6 +2551,8 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) ext4_mb_init_per_dev_proc(sb); ext4_mb_history_init(sb); + sbi->s_journal->j_commit_callback = release_blocks_on_commit; + printk(KERN_INFO "EXT4-fs: mballoc enabled\n"); return 0; } @@ -2583,15 +2582,6 @@ int ext4_mb_release(struct super_block *sb) struct ext4_group_info *grinfo; struct ext4_sb_info *sbi = EXT4_SB(sb); - /* release freed, non-committed blocks */ - spin_lock(&sbi->s_md_lock); - list_splice_init(&sbi->s_closed_transaction, - &sbi->s_committed_transaction); - list_splice_init(&sbi->s_active_transaction, - &sbi->s_committed_transaction); - spin_unlock(&sbi->s_md_lock); - ext4_mb_free_committed_blocks(sb); - if (sbi->s_group_info) { for (i = 0; i < sbi->s_groups_count; i++) { grinfo = ext4_get_group_info(sb, i); @@ -2645,36 +2635,25 @@ int ext4_mb_release(struct super_block *sb) return 0; } -static noinline_for_stack void -ext4_mb_free_committed_blocks(struct super_block *sb) +/* + * This function is called by the jbd2 layer once the commit has finished, + * so we know we can free the blocks that were released with that commit. + */ +static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) { + struct super_block *sb = journal->j_private; struct ext4_buddy e4b; struct ext4_group_info *db; - struct ext4_sb_info *sbi = EXT4_SB(sb); int err, count = 0, count2 = 0; struct ext4_free_data *entry; ext4_fsblk_t discard_block; + struct list_head *l, *ltmp; - if (list_empty(&sbi->s_committed_transaction)) - return; - - /* there is committed blocks to be freed yet */ - do { - /* get next array of blocks */ - entry = NULL; - spin_lock(&sbi->s_md_lock); - if (!list_empty(&sbi->s_committed_transaction)) { - entry = list_entry(sbi->s_committed_transaction.next, - struct ext4_free_data, list); - list_del(&entry->list); - } - spin_unlock(&sbi->s_md_lock); - - if (entry == NULL) - break; + list_for_each_safe(l, ltmp, &txn->t_private_list) { + entry = list_entry(l, struct ext4_free_data, list); mb_debug("gonna free %u blocks in group %lu (0x%p):", - entry->count, entry->group, entry); + entry->count, entry->group, entry); err = ext4_mb_load_buddy(sb, entry->group, &e4b); /* we expect to find existing buddy because it's pinned */ @@ -2706,7 +2685,7 @@ ext4_mb_free_committed_blocks(struct super_block *sb) kmem_cache_free(ext4_free_ext_cachep, entry); ext4_mb_release_desc(&e4b); - } while (1); + } mb_debug("freed %u blocks in %u structures\n", count, count2); } @@ -4348,8 +4327,6 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, goto out1; } - ext4_mb_poll_new_transaction(sb, handle); - *errp = ext4_mb_initialize_context(ac, ar); if (*errp) { ar->len = 0; @@ -4408,36 +4385,6 @@ out1: return block; } -static void ext4_mb_poll_new_transaction(struct super_block *sb, - handle_t *handle) -{ - struct ext4_sb_info *sbi = EXT4_SB(sb); - - if (sbi->s_last_transaction == handle->h_transaction->t_tid) - return; - - /* new transaction! time to close last one and free blocks for - * committed transaction. we know that only transaction can be - * active, so previos transaction can be being logged and we - * know that transaction before previous is known to be already - * logged. this means that now we may free blocks freed in all - * transactions before previous one. hope I'm clear enough ... */ - - spin_lock(&sbi->s_md_lock); - if (sbi->s_last_transaction != handle->h_transaction->t_tid) { - mb_debug("new transaction %lu, old %lu\n", - (unsigned long) handle->h_transaction->t_tid, - (unsigned long) sbi->s_last_transaction); - list_splice_init(&sbi->s_closed_transaction, - &sbi->s_committed_transaction); - list_splice_init(&sbi->s_active_transaction, - &sbi->s_closed_transaction); - sbi->s_last_transaction = handle->h_transaction->t_tid; - } - spin_unlock(&sbi->s_md_lock); - - ext4_mb_free_committed_blocks(sb); -} /* * We can merge two free data extents only if the physical blocks @@ -4531,9 +4478,9 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, kmem_cache_free(ext4_free_ext_cachep, entry); } } - /* Add the extent to active_transaction list */ + /* Add the extent to transaction's private list */ spin_lock(&sbi->s_md_lock); - list_add(&new_entry->list, &sbi->s_active_transaction); + list_add(&new_entry->list, &handle->h_transaction->t_private_list); spin_unlock(&sbi->s_md_lock); ext4_unlock_group(sb, group); return 0; @@ -4562,8 +4509,6 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode, *freed = 0; - ext4_mb_poll_new_transaction(sb, handle); - sbi = EXT4_SB(sb); es = EXT4_SB(sb)->s_es; if (block < le32_to_cpu(es->s_first_data_block) || diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 94cb7b9fe3e..b5dff1fff1e 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -269,8 +269,6 @@ struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t); static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_group_t group); -static void ext4_mb_poll_new_transaction(struct super_block *, handle_t *); -static void ext4_mb_free_committed_blocks(struct super_block *); static void ext4_mb_return_to_preallocation(struct inode *inode, struct ext4_buddy *e4b, sector_t block, int count); @@ -278,6 +276,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *, struct super_block *, struct ext4_prealloc_space *pa); static int ext4_mb_init_per_dev_proc(struct super_block *sb); static int ext4_mb_destroy_per_dev_proc(struct super_block *sb); +static void release_blocks_on_commit(journal_t *journal, transaction_t *txn); static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 0abe02c4242..8b119e16aa3 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -995,6 +995,9 @@ restart_loop: } spin_unlock(&journal->j_list_lock); + if (journal->j_commit_callback) + journal->j_commit_callback(journal, commit_transaction); + trace_mark(jbd2_end_commit, "dev %s transaction %d head %d", journal->j_devname, commit_transaction->t_tid, journal->j_tail_sequence); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index e5d540588fa..39b7805a599 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -52,6 +52,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction) transaction->t_expires = jiffies + journal->j_commit_interval; spin_lock_init(&transaction->t_handle_lock); INIT_LIST_HEAD(&transaction->t_inode_list); + INIT_LIST_HEAD(&transaction->t_private_list); /* Set up the commit timer for the new transaction. */ journal->j_commit_timer.expires = round_jiffies(transaction->t_expires); diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 463d6f10b64..c7d106ef22e 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -641,6 +641,11 @@ struct transaction_s */ int t_handle_count; + /* + * For use by the filesystem to store fs-specific data + * structures associated with the transaction + */ + struct list_head t_private_list; }; struct transaction_run_stats_s { @@ -935,6 +940,10 @@ struct journal_s pid_t j_last_sync_writer; + /* This function is called when a transaction is closed */ + void (*j_commit_callback)(journal_t *, + transaction_t *); + /* * Journal statistics */ -- cgit v1.2.3-70-g09d2 From 6c20ec850360bc6e5c66a787f0523a80450d65ab Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 28 Oct 2008 21:08:20 -0400 Subject: jbd2: Call the commit callback before the transaction could get dropped The transaction can potentially get dropped if there are no buffers that need to be written. Make sure we call the commit callback before potentially deciding to drop the transaction. Also avoid dereferencing the commit_transaction pointer in the marker for the same reason. This patch fixes the bug reported by Eric Paris at: http://bugzilla.kernel.org/show_bug.cgi?id=11838 Signed-off-by: "Theodore Ts'o" Acked-by: Eric Sandeen Tested-by: Eric Paris --- fs/jbd2/commit.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs/jbd2/commit.c') diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 8b119e16aa3..ebc667bc54a 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -974,6 +974,9 @@ restart_loop: journal->j_committing_transaction = NULL; spin_unlock(&journal->j_state_lock); + if (journal->j_commit_callback) + journal->j_commit_callback(journal, commit_transaction); + if (commit_transaction->t_checkpoint_list == NULL && commit_transaction->t_checkpoint_io_list == NULL) { __jbd2_journal_drop_transaction(journal, commit_transaction); @@ -995,11 +998,8 @@ restart_loop: } spin_unlock(&journal->j_list_lock); - if (journal->j_commit_callback) - journal->j_commit_callback(journal, commit_transaction); - trace_mark(jbd2_end_commit, "dev %s transaction %d head %d", - journal->j_devname, commit_transaction->t_tid, + journal->j_devname, journal->j_commit_sequence, journal->j_tail_sequence); jbd_debug(1, "JBD: commit %d complete, head %d\n", journal->j_commit_sequence, journal->j_tail_sequence); -- cgit v1.2.3-70-g09d2