From 736603ab297506f4396cb5af592004499950fcfd Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Fri, 11 Jul 2008 19:27:31 -0400
Subject: jbd2: Add commit time into the commit block

Carlo Wood has demonstrated that it's possible to recover deleted
files from the journal.  Something that will make this easier is if we
can put the time of the commit into commit block.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/jbd2/commit.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs/jbd2/commit.c')

diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index a2ed72f7cee..92b6ac3df8a 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -112,6 +112,7 @@ static int journal_submit_commit_record(journal_t *journal,
 	struct buffer_head *bh;
 	int ret;
 	int barrier_done = 0;
+	struct timespec now = current_kernel_time();
 
 	if (is_journal_aborted(journal))
 		return 0;
@@ -126,6 +127,8 @@ static int journal_submit_commit_record(journal_t *journal,
 	tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
 	tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
 	tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
+	tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
+	tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
 
 	if (JBD2_HAS_COMPAT_FEATURE(journal,
 				    JBD2_FEATURE_COMPAT_CHECKSUM)) {
-- 
cgit v1.2.3-70-g09d2


From c851ed540173736e60d48b53b91a16ea5c903896 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Fri, 11 Jul 2008 19:27:31 -0400
Subject: jbd2: Implement data=ordered mode handling via inodes

This patch adds necessary framework into JBD2 to be able to track inodes
with each transaction and write-out their dirty data during transaction
commit time.

This new ordered mode brings all sorts of advantages such as possibility
to get rid of journal heads and buffer heads for data buffers in ordered
mode, better ordering of writes on transaction commit, simplification of
 some JBD code, no more anonymous pages when truncate of data being
committed happens.  Also with this new ordered mode, delayed allocation
on ordered mode is much simpler.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/jbd2/commit.c      | 90 +++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/jbd2/journal.c     | 52 +++++++++++++++++++++++++++++
 fs/jbd2/transaction.c | 86 ++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/jbd2.h  | 42 ++++++++++++++++++++++++
 4 files changed, 270 insertions(+)

(limited to 'fs/jbd2/commit.c')

diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 92b6ac3df8a..3ca107b5c86 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -355,6 +355,81 @@ write_out_data:
 	journal_do_submit_data(wbuf, bufs);
 }
 
+/*
+ * Submit all the data buffers of inode associated with the transaction to
+ * disk.
+ *
+ * We are in a committing transaction. Therefore no new inode can be added to
+ * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently
+ * operate on from being released while we write out pages.
+ */
+static int journal_submit_inode_data_buffers(journal_t *journal,
+		transaction_t *commit_transaction)
+{
+	struct jbd2_inode *jinode;
+	int err, ret = 0;
+	struct address_space *mapping;
+
+	spin_lock(&journal->j_list_lock);
+	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
+		mapping = jinode->i_vfs_inode->i_mapping;
+		jinode->i_flags |= JI_COMMIT_RUNNING;
+		spin_unlock(&journal->j_list_lock);
+		err = filemap_fdatawrite_range(mapping, 0,
+					i_size_read(jinode->i_vfs_inode));
+		if (!ret)
+			ret = err;
+		spin_lock(&journal->j_list_lock);
+		J_ASSERT(jinode->i_transaction == commit_transaction);
+		jinode->i_flags &= ~JI_COMMIT_RUNNING;
+		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
+	}
+	spin_unlock(&journal->j_list_lock);
+	return ret;
+}
+
+/*
+ * Wait for data submitted for writeout, refile inodes to proper
+ * transaction if needed.
+ *
+ */
+static int journal_finish_inode_data_buffers(journal_t *journal,
+		transaction_t *commit_transaction)
+{
+	struct jbd2_inode *jinode, *next_i;
+	int err, ret = 0;
+
+	/* For locking, see the comment in journal_submit_inode_data_buffers() */
+	spin_lock(&journal->j_list_lock);
+	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
+		jinode->i_flags |= JI_COMMIT_RUNNING;
+		spin_unlock(&journal->j_list_lock);
+		err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
+		if (!ret)
+			ret = err;
+		spin_lock(&journal->j_list_lock);
+		jinode->i_flags &= ~JI_COMMIT_RUNNING;
+		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
+	}
+
+	/* Now refile inode to proper lists */
+	list_for_each_entry_safe(jinode, next_i,
+				 &commit_transaction->t_inode_list, i_list) {
+		list_del(&jinode->i_list);
+		if (jinode->i_next_transaction) {
+			jinode->i_transaction = jinode->i_next_transaction;
+			jinode->i_next_transaction = NULL;
+			list_add(&jinode->i_list,
+				&jinode->i_transaction->t_inode_list);
+		} else {
+			jinode->i_transaction = NULL;
+		}
+	}
+	spin_unlock(&journal->j_list_lock);
+
+	return ret;
+}
+
 static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
 {
 	struct page *page = bh->b_page;
@@ -529,6 +604,9 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	 */
 	err = 0;
 	journal_submit_data_buffers(journal, commit_transaction);
+	err = journal_submit_inode_data_buffers(journal, commit_transaction);
+	if (err)
+		jbd2_journal_abort(journal, err);
 
 	/*
 	 * Wait for all previously submitted IO to complete if commit
@@ -760,6 +838,17 @@ start_journal_io:
 			__jbd2_journal_abort_hard(journal);
 	}
 
+	/*
+	 * This is the right place to wait for data buffers both for ASYNC
+	 * and !ASYNC commit. If commit is ASYNC, we need to wait only after
+	 * the commit block went to disk (which happens above). If commit is
+	 * SYNC, we need to wait for data buffers before we start writing
+	 * commit block, which happens below in such setting.
+	 */
+	err = journal_finish_inode_data_buffers(journal, commit_transaction);
+	if (err)
+		jbd2_journal_abort(journal, err);
+
 	/* Lo and behold: we have just managed to send a transaction to
            the log.  Before we can commit it, wait for the IO so far to
            complete.  Control buffers being written are on the
@@ -880,6 +969,7 @@ wait_for_iobuf:
 	jbd_debug(3, "JBD: commit phase 7\n");
 
 	J_ASSERT(commit_transaction->t_sync_datalist == NULL);
+	J_ASSERT(list_empty(&commit_transaction->t_inode_list));
 	J_ASSERT(commit_transaction->t_buffers == NULL);
 	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
 	J_ASSERT(commit_transaction->t_iobuf_list == NULL);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 2e24567c4a7..78cf7bd7f60 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -82,6 +82,10 @@ EXPORT_SYMBOL(jbd2_journal_blocks_per_page);
 EXPORT_SYMBOL(jbd2_journal_invalidatepage);
 EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers);
 EXPORT_SYMBOL(jbd2_journal_force_commit);
+EXPORT_SYMBOL(jbd2_journal_file_inode);
+EXPORT_SYMBOL(jbd2_journal_init_jbd_inode);
+EXPORT_SYMBOL(jbd2_journal_release_jbd_inode);
+EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
 
 static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
 static void __journal_abort_soft (journal_t *journal, int errno);
@@ -2194,6 +2198,54 @@ void jbd2_journal_put_journal_head(struct journal_head *jh)
 	jbd_unlock_bh_journal_head(bh);
 }
 
+/*
+ * Initialize jbd inode head
+ */
+void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode)
+{
+	jinode->i_transaction = NULL;
+	jinode->i_next_transaction = NULL;
+	jinode->i_vfs_inode = inode;
+	jinode->i_flags = 0;
+	INIT_LIST_HEAD(&jinode->i_list);
+}
+
+/*
+ * Function to be called before we start removing inode from memory (i.e.,
+ * clear_inode() is a fine place to be called from). It removes inode from
+ * transaction's lists.
+ */
+void jbd2_journal_release_jbd_inode(journal_t *journal,
+				    struct jbd2_inode *jinode)
+{
+	int writeout = 0;
+
+	if (!journal)
+		return;
+restart:
+	spin_lock(&journal->j_list_lock);
+	/* Is commit writing out inode - we have to wait */
+	if (jinode->i_flags & JI_COMMIT_RUNNING) {
+		wait_queue_head_t *wq;
+		DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING);
+		wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING);
+		prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+		spin_unlock(&journal->j_list_lock);
+		schedule();
+		finish_wait(wq, &wait.wait);
+		goto restart;
+	}
+
+	/* Do we need to wait for data writeback? */
+	if (journal->j_committing_transaction == jinode->i_transaction)
+		writeout = 1;
+	if (jinode->i_transaction) {
+		list_del(&jinode->i_list);
+		jinode->i_transaction = NULL;
+	}
+	spin_unlock(&journal->j_list_lock);
+}
+
 /*
  * debugfs tunables
  */
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index ba620c4493d..98b596d2370 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -51,6 +51,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
 	transaction->t_tid = journal->j_transaction_sequence++;
 	transaction->t_expires = jiffies + journal->j_commit_interval;
 	spin_lock_init(&transaction->t_handle_lock);
+	INIT_LIST_HEAD(&transaction->t_inode_list);
 
 	/* Set up the commit timer for the new transaction. */
 	journal->j_commit_timer.expires = round_jiffies(transaction->t_expires);
@@ -2195,3 +2196,88 @@ void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh)
 	spin_unlock(&journal->j_list_lock);
 	__brelse(bh);
 }
+
+/*
+ * File inode in the inode list of the handle's transaction
+ */
+int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode)
+{
+	transaction_t *transaction = handle->h_transaction;
+	journal_t *journal = transaction->t_journal;
+
+	if (is_handle_aborted(handle))
+		return -EIO;
+
+	jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino,
+			transaction->t_tid);
+
+	/*
+	 * First check whether inode isn't already on the transaction's
+	 * lists without taking the lock. Note that this check is safe
+	 * without the lock as we cannot race with somebody removing inode
+	 * from the transaction. The reason is that we remove inode from the
+	 * transaction only in journal_release_jbd_inode() and when we commit
+	 * the transaction. We are guarded from the first case by holding
+	 * a reference to the inode. We are safe against the second case
+	 * because if jinode->i_transaction == transaction, commit code
+	 * cannot touch the transaction because we hold reference to it,
+	 * and if jinode->i_next_transaction == transaction, commit code
+	 * will only file the inode where we want it.
+	 */
+	if (jinode->i_transaction == transaction ||
+	    jinode->i_next_transaction == transaction)
+		return 0;
+
+	spin_lock(&journal->j_list_lock);
+
+	if (jinode->i_transaction == transaction ||
+	    jinode->i_next_transaction == transaction)
+		goto done;
+
+	/* On some different transaction's list - should be
+	 * the committing one */
+	if (jinode->i_transaction) {
+		J_ASSERT(jinode->i_next_transaction == NULL);
+		J_ASSERT(jinode->i_transaction ==
+					journal->j_committing_transaction);
+		jinode->i_next_transaction = transaction;
+		goto done;
+	}
+	/* Not on any transaction list... */
+	J_ASSERT(!jinode->i_next_transaction);
+	jinode->i_transaction = transaction;
+	list_add(&jinode->i_list, &transaction->t_inode_list);
+done:
+	spin_unlock(&journal->j_list_lock);
+
+	return 0;
+}
+
+/*
+ * This function must be called when inode is journaled in ordered mode
+ * before truncation happens. It starts writeout of truncated part in
+ * case it is in the committing transaction so that we stand to ordered
+ * mode consistency guarantees.
+ */
+int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode,
+					loff_t new_size)
+{
+	journal_t *journal;
+	transaction_t *commit_trans;
+	int ret = 0;
+
+	if (!inode->i_transaction && !inode->i_next_transaction)
+		goto out;
+	journal = inode->i_transaction->t_journal;
+	spin_lock(&journal->j_state_lock);
+	commit_trans = journal->j_committing_transaction;
+	spin_unlock(&journal->j_state_lock);
+	if (inode->i_transaction == commit_trans) {
+		ret = filemap_fdatawrite_range(inode->i_vfs_inode->i_mapping,
+			new_size, LLONG_MAX);
+		if (ret)
+			jbd2_journal_abort(journal, ret);
+	}
+out:
+	return ret;
+}
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index ec9cadf5822..622c3d8ca4e 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -381,6 +381,38 @@ static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh)
 	bit_spin_unlock(BH_JournalHead, &bh->b_state);
 }
 
+/* Flags in jbd_inode->i_flags */
+#define __JI_COMMIT_RUNNING 0
+/* Commit of the inode data in progress. We use this flag to protect us from
+ * concurrent deletion of inode. We cannot use reference to inode for this
+ * since we cannot afford doing last iput() on behalf of kjournald
+ */
+#define JI_COMMIT_RUNNING (1 << __JI_COMMIT_RUNNING)
+
+/**
+ * struct jbd_inode is the structure linking inodes in ordered mode
+ *   present in a transaction so that we can sync them during commit.
+ */
+struct jbd2_inode {
+	/* Which transaction does this inode belong to? Either the running
+	 * transaction or the committing one. [j_list_lock] */
+	transaction_t *i_transaction;
+
+	/* Pointer to the running transaction modifying inode's data in case
+	 * there is already a committing transaction touching it. [j_list_lock] */
+	transaction_t *i_next_transaction;
+
+	/* List of inodes in the i_transaction [j_list_lock] */
+	struct list_head i_list;
+
+	/* VFS inode this inode belongs to [constant during the lifetime
+	 * of the structure] */
+	struct inode *i_vfs_inode;
+
+	/* Flags of inode [j_list_lock] */
+	unsigned int i_flags;
+};
+
 struct jbd2_revoke_table_s;
 
 /**
@@ -566,6 +598,12 @@ struct transaction_s
 	 */
 	struct journal_head	*t_log_list;
 
+	/*
+	 * List of inodes whose data we've modified in data=ordered mode.
+	 * [j_list_lock]
+	 */
+	struct list_head	t_inode_list;
+
 	/*
 	 * Protects info related to handles
 	 */
@@ -1046,6 +1084,10 @@ extern void	   jbd2_journal_ack_err    (journal_t *);
 extern int	   jbd2_journal_clear_err  (journal_t *);
 extern int	   jbd2_journal_bmap(journal_t *, unsigned long, unsigned long long *);
 extern int	   jbd2_journal_force_commit(journal_t *);
+extern int	   jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *inode);
+extern int	   jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode, loff_t new_size);
+extern void	   jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode);
+extern void	   jbd2_journal_release_jbd_inode(journal_t *journal, struct jbd2_inode *jinode);
 
 /*
  * journal_head management
-- 
cgit v1.2.3-70-g09d2


From 87c89c232c8f7b3820c33c3b9bc803e9358027da Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Fri, 11 Jul 2008 19:27:31 -0400
Subject: jbd2: Remove data=ordered mode support using jbd buffer heads

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/jbd2/checkpoint.c  |   1 -
 fs/jbd2/commit.c      | 221 ++------------------------------------------------
 fs/jbd2/journal.c     |   1 -
 fs/jbd2/transaction.c | 217 ++-----------------------------------------------
 include/linux/jbd2.h  |  29 ++-----
 5 files changed, 21 insertions(+), 448 deletions(-)

(limited to 'fs/jbd2/commit.c')

diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 6914598022c..91389c8aee8 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -688,7 +688,6 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact
 
 	J_ASSERT(transaction->t_state == T_FINISHED);
 	J_ASSERT(transaction->t_buffers == NULL);
-	J_ASSERT(transaction->t_sync_datalist == NULL);
 	J_ASSERT(transaction->t_forget == NULL);
 	J_ASSERT(transaction->t_iobuf_list == NULL);
 	J_ASSERT(transaction->t_shadow_list == NULL);
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 3ca107b5c86..483183d15ed 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -37,8 +37,8 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
 }
 
 /*
- * When an ext3-ordered file is truncated, it is possible that many pages are
- * not sucessfully freed, because they are attached to a committing transaction.
+ * When an ext4 file is truncated, it is possible that some pages are not
+ * successfully freed, because they are attached to a committing transaction.
  * After the transaction commits, these pages are left on the LRU, with no
  * ->mapping, and with attached buffers.  These pages are trivially reclaimable
  * by the VM, but their apparent absence upsets the VM accounting, and it makes
@@ -79,21 +79,6 @@ nope:
 	__brelse(bh);
 }
 
-/*
- * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
- * held.  For ranking reasons we must trylock.  If we lose, schedule away and
- * return 0.  j_list_lock is dropped in this case.
- */
-static int inverted_lock(journal_t *journal, struct buffer_head *bh)
-{
-	if (!jbd_trylock_bh_state(bh)) {
-		spin_unlock(&journal->j_list_lock);
-		schedule();
-		return 0;
-	}
-	return 1;
-}
-
 /*
  * Done it all: now submit the commit record.  We should have
  * cleaned up our previous buffers by now, so if we are in abort
@@ -199,162 +184,6 @@ static int journal_wait_on_commit_record(struct buffer_head *bh)
 	return ret;
 }
 
-/*
- * Wait for all submitted IO to complete.
- */
-static int journal_wait_on_locked_list(journal_t *journal,
-				       transaction_t *commit_transaction)
-{
-	int ret = 0;
-	struct journal_head *jh;
-
-	while (commit_transaction->t_locked_list) {
-		struct buffer_head *bh;
-
-		jh = commit_transaction->t_locked_list->b_tprev;
-		bh = jh2bh(jh);
-		get_bh(bh);
-		if (buffer_locked(bh)) {
-			spin_unlock(&journal->j_list_lock);
-			wait_on_buffer(bh);
-			if (unlikely(!buffer_uptodate(bh)))
-				ret = -EIO;
-			spin_lock(&journal->j_list_lock);
-		}
-		if (!inverted_lock(journal, bh)) {
-			put_bh(bh);
-			spin_lock(&journal->j_list_lock);
-			continue;
-		}
-		if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
-			__jbd2_journal_unfile_buffer(jh);
-			jbd_unlock_bh_state(bh);
-			jbd2_journal_remove_journal_head(bh);
-			put_bh(bh);
-		} else {
-			jbd_unlock_bh_state(bh);
-		}
-		put_bh(bh);
-		cond_resched_lock(&journal->j_list_lock);
-	}
-	return ret;
-  }
-
-static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
-{
-	int i;
-
-	for (i = 0; i < bufs; i++) {
-		wbuf[i]->b_end_io = end_buffer_write_sync;
-		/* We use-up our safety reference in submit_bh() */
-		submit_bh(WRITE, wbuf[i]);
-	}
-}
-
-/*
- *  Submit all the data buffers to disk
- */
-static void journal_submit_data_buffers(journal_t *journal,
-				transaction_t *commit_transaction)
-{
-	struct journal_head *jh;
-	struct buffer_head *bh;
-	int locked;
-	int bufs = 0;
-	struct buffer_head **wbuf = journal->j_wbuf;
-
-	/*
-	 * Whenever we unlock the journal and sleep, things can get added
-	 * onto ->t_sync_datalist, so we have to keep looping back to
-	 * write_out_data until we *know* that the list is empty.
-	 *
-	 * Cleanup any flushed data buffers from the data list.  Even in
-	 * abort mode, we want to flush this out as soon as possible.
-	 */
-write_out_data:
-	cond_resched();
-	spin_lock(&journal->j_list_lock);
-
-	while (commit_transaction->t_sync_datalist) {
-		jh = commit_transaction->t_sync_datalist;
-		bh = jh2bh(jh);
-		locked = 0;
-
-		/* Get reference just to make sure buffer does not disappear
-		 * when we are forced to drop various locks */
-		get_bh(bh);
-		/* If the buffer is dirty, we need to submit IO and hence
-		 * we need the buffer lock. We try to lock the buffer without
-		 * blocking. If we fail, we need to drop j_list_lock and do
-		 * blocking lock_buffer().
-		 */
-		if (buffer_dirty(bh)) {
-			if (test_set_buffer_locked(bh)) {
-				BUFFER_TRACE(bh, "needs blocking lock");
-				spin_unlock(&journal->j_list_lock);
-				/* Write out all data to prevent deadlocks */
-				journal_do_submit_data(wbuf, bufs);
-				bufs = 0;
-				lock_buffer(bh);
-				spin_lock(&journal->j_list_lock);
-			}
-			locked = 1;
-		}
-		/* We have to get bh_state lock. Again out of order, sigh. */
-		if (!inverted_lock(journal, bh)) {
-			jbd_lock_bh_state(bh);
-			spin_lock(&journal->j_list_lock);
-		}
-		/* Someone already cleaned up the buffer? */
-		if (!buffer_jbd(bh)
-			|| jh->b_transaction != commit_transaction
-			|| jh->b_jlist != BJ_SyncData) {
-			jbd_unlock_bh_state(bh);
-			if (locked)
-				unlock_buffer(bh);
-			BUFFER_TRACE(bh, "already cleaned up");
-			put_bh(bh);
-			continue;
-		}
-		if (locked && test_clear_buffer_dirty(bh)) {
-			BUFFER_TRACE(bh, "needs writeout, adding to array");
-			wbuf[bufs++] = bh;
-			__jbd2_journal_file_buffer(jh, commit_transaction,
-						BJ_Locked);
-			jbd_unlock_bh_state(bh);
-			if (bufs == journal->j_wbufsize) {
-				spin_unlock(&journal->j_list_lock);
-				journal_do_submit_data(wbuf, bufs);
-				bufs = 0;
-				goto write_out_data;
-			}
-		} else if (!locked && buffer_locked(bh)) {
-			__jbd2_journal_file_buffer(jh, commit_transaction,
-						BJ_Locked);
-			jbd_unlock_bh_state(bh);
-			put_bh(bh);
-		} else {
-			BUFFER_TRACE(bh, "writeout complete: unfile");
-			__jbd2_journal_unfile_buffer(jh);
-			jbd_unlock_bh_state(bh);
-			if (locked)
-				unlock_buffer(bh);
-			jbd2_journal_remove_journal_head(bh);
-			/* Once for our safety reference, once for
-			 * jbd2_journal_remove_journal_head() */
-			put_bh(bh);
-			put_bh(bh);
-		}
-
-		if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
-			spin_unlock(&journal->j_list_lock);
-			goto write_out_data;
-		}
-	}
-	spin_unlock(&journal->j_list_lock);
-	journal_do_submit_data(wbuf, bufs);
-}
-
 /*
  * Submit all the data buffers of inode associated with the transaction to
  * disk.
@@ -602,24 +431,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	 * Now start flushing things to disk, in the order they appear
 	 * on the transaction lists.  Data blocks go first.
 	 */
-	err = 0;
-	journal_submit_data_buffers(journal, commit_transaction);
 	err = journal_submit_inode_data_buffers(journal, commit_transaction);
-	if (err)
-		jbd2_journal_abort(journal, err);
-
-	/*
-	 * Wait for all previously submitted IO to complete if commit
-	 * record is to be written synchronously.
-	 */
-	spin_lock(&journal->j_list_lock);
-	if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
-		JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
-		err = journal_wait_on_locked_list(journal,
-						commit_transaction);
-
-	spin_unlock(&journal->j_list_lock);
-
 	if (err)
 		jbd2_journal_abort(journal, err);
 
@@ -627,16 +439,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 
 	jbd_debug(3, "JBD: commit phase 2\n");
 
-	/*
-	 * If we found any dirty or locked buffers, then we should have
-	 * looped back up to the write_out_data label.  If there weren't
-	 * any then journal_clean_data_list should have wiped the list
-	 * clean by now, so check that it is in fact empty.
-	 */
-	J_ASSERT (commit_transaction->t_sync_datalist == NULL);
-
-	jbd_debug (3, "JBD: commit phase 3\n");
-
 	/*
 	 * Way to go: we have now written out all of the data for a
 	 * transaction!  Now comes the tricky part: we need to write out
@@ -655,6 +457,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	J_ASSERT(commit_transaction->t_nr_buffers <=
 		 commit_transaction->t_outstanding_credits);
 
+	err = 0;
 	descriptor = NULL;
 	bufs = 0;
 	while (commit_transaction->t_buffers) {
@@ -829,13 +632,6 @@ start_journal_io:
 						 &cbh, crc32_sum);
 		if (err)
 			__jbd2_journal_abort_hard(journal);
-
-		spin_lock(&journal->j_list_lock);
-		err = journal_wait_on_locked_list(journal,
-						commit_transaction);
-		spin_unlock(&journal->j_list_lock);
-		if (err)
-			__jbd2_journal_abort_hard(journal);
 	}
 
 	/*
@@ -860,7 +656,7 @@ start_journal_io:
 	   so we incur less scheduling load.
 	*/
 
-	jbd_debug(3, "JBD: commit phase 4\n");
+	jbd_debug(3, "JBD: commit phase 3\n");
 
 	/*
 	 * akpm: these are BJ_IO, and j_list_lock is not needed.
@@ -919,7 +715,7 @@ wait_for_iobuf:
 
 	J_ASSERT (commit_transaction->t_shadow_list == NULL);
 
-	jbd_debug(3, "JBD: commit phase 5\n");
+	jbd_debug(3, "JBD: commit phase 4\n");
 
 	/* Here we wait for the revoke record and descriptor record buffers */
  wait_for_ctlbuf:
@@ -946,7 +742,7 @@ wait_for_iobuf:
 		/* AKPM: bforget here */
 	}
 
-	jbd_debug(3, "JBD: commit phase 6\n");
+	jbd_debug(3, "JBD: commit phase 5\n");
 
 	if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
 		JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
@@ -966,9 +762,8 @@ wait_for_iobuf:
            transaction can be removed from any checkpoint list it was on
            before. */
 
-	jbd_debug(3, "JBD: commit phase 7\n");
+	jbd_debug(3, "JBD: commit phase 6\n");
 
-	J_ASSERT(commit_transaction->t_sync_datalist == NULL);
 	J_ASSERT(list_empty(&commit_transaction->t_inode_list));
 	J_ASSERT(commit_transaction->t_buffers == NULL);
 	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
@@ -1090,7 +885,7 @@ restart_loop:
 
 	/* Done with this transaction! */
 
-	jbd_debug(3, "JBD: commit phase 8\n");
+	jbd_debug(3, "JBD: commit phase 7\n");
 
 	J_ASSERT(commit_transaction->t_state == T_COMMIT);
 
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 78cf7bd7f60..b26c6d9fe6a 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -50,7 +50,6 @@ EXPORT_SYMBOL(jbd2_journal_unlock_updates);
 EXPORT_SYMBOL(jbd2_journal_get_write_access);
 EXPORT_SYMBOL(jbd2_journal_get_create_access);
 EXPORT_SYMBOL(jbd2_journal_get_undo_access);
-EXPORT_SYMBOL(jbd2_journal_dirty_data);
 EXPORT_SYMBOL(jbd2_journal_dirty_metadata);
 EXPORT_SYMBOL(jbd2_journal_release_buffer);
 EXPORT_SYMBOL(jbd2_journal_forget);
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 98b596d2370..4f7cadbb19f 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -942,183 +942,6 @@ out:
 	return err;
 }
 
-/**
- * int jbd2_journal_dirty_data() -  mark a buffer as containing dirty data which
- *                             needs to be flushed before we can commit the
- *                             current transaction.
- * @handle: transaction
- * @bh: bufferhead to mark
- *
- * The buffer is placed on the transaction's data list and is marked as
- * belonging to the transaction.
- *
- * Returns error number or 0 on success.
- *
- * jbd2_journal_dirty_data() can be called via page_launder->ext3_writepage
- * by kswapd.
- */
-int jbd2_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
-{
-	journal_t *journal = handle->h_transaction->t_journal;
-	int need_brelse = 0;
-	struct journal_head *jh;
-
-	if (is_handle_aborted(handle))
-		return 0;
-
-	jh = jbd2_journal_add_journal_head(bh);
-	JBUFFER_TRACE(jh, "entry");
-
-	/*
-	 * The buffer could *already* be dirty.  Writeout can start
-	 * at any time.
-	 */
-	jbd_debug(4, "jh: %p, tid:%d\n", jh, handle->h_transaction->t_tid);
-
-	/*
-	 * What if the buffer is already part of a running transaction?
-	 *
-	 * There are two cases:
-	 * 1) It is part of the current running transaction.  Refile it,
-	 *    just in case we have allocated it as metadata, deallocated
-	 *    it, then reallocated it as data.
-	 * 2) It is part of the previous, still-committing transaction.
-	 *    If all we want to do is to guarantee that the buffer will be
-	 *    written to disk before this new transaction commits, then
-	 *    being sure that the *previous* transaction has this same
-	 *    property is sufficient for us!  Just leave it on its old
-	 *    transaction.
-	 *
-	 * In case (2), the buffer must not already exist as metadata
-	 * --- that would violate write ordering (a transaction is free
-	 * to write its data at any point, even before the previous
-	 * committing transaction has committed).  The caller must
-	 * never, ever allow this to happen: there's nothing we can do
-	 * about it in this layer.
-	 */
-	jbd_lock_bh_state(bh);
-	spin_lock(&journal->j_list_lock);
-
-	/* Now that we have bh_state locked, are we really still mapped? */
-	if (!buffer_mapped(bh)) {
-		JBUFFER_TRACE(jh, "unmapped buffer, bailing out");
-		goto no_journal;
-	}
-
-	if (jh->b_transaction) {
-		JBUFFER_TRACE(jh, "has transaction");
-		if (jh->b_transaction != handle->h_transaction) {
-			JBUFFER_TRACE(jh, "belongs to older transaction");
-			J_ASSERT_JH(jh, jh->b_transaction ==
-					journal->j_committing_transaction);
-
-			/* @@@ IS THIS TRUE  ? */
-			/*
-			 * Not any more.  Scenario: someone does a write()
-			 * in data=journal mode.  The buffer's transaction has
-			 * moved into commit.  Then someone does another
-			 * write() to the file.  We do the frozen data copyout
-			 * and set b_next_transaction to point to j_running_t.
-			 * And while we're in that state, someone does a
-			 * writepage() in an attempt to pageout the same area
-			 * of the file via a shared mapping.  At present that
-			 * calls jbd2_journal_dirty_data(), and we get right here.
-			 * It may be too late to journal the data.  Simply
-			 * falling through to the next test will suffice: the
-			 * data will be dirty and wil be checkpointed.  The
-			 * ordering comments in the next comment block still
-			 * apply.
-			 */
-			//J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
-
-			/*
-			 * If we're journalling data, and this buffer was
-			 * subject to a write(), it could be metadata, forget
-			 * or shadow against the committing transaction.  Now,
-			 * someone has dirtied the same darn page via a mapping
-			 * and it is being writepage()'d.
-			 * We *could* just steal the page from commit, with some
-			 * fancy locking there.  Instead, we just skip it -
-			 * don't tie the page's buffers to the new transaction
-			 * at all.
-			 * Implication: if we crash before the writepage() data
-			 * is written into the filesystem, recovery will replay
-			 * the write() data.
-			 */
-			if (jh->b_jlist != BJ_None &&
-					jh->b_jlist != BJ_SyncData &&
-					jh->b_jlist != BJ_Locked) {
-				JBUFFER_TRACE(jh, "Not stealing");
-				goto no_journal;
-			}
-
-			/*
-			 * This buffer may be undergoing writeout in commit.  We
-			 * can't return from here and let the caller dirty it
-			 * again because that can cause the write-out loop in
-			 * commit to never terminate.
-			 */
-			if (buffer_dirty(bh)) {
-				get_bh(bh);
-				spin_unlock(&journal->j_list_lock);
-				jbd_unlock_bh_state(bh);
-				need_brelse = 1;
-				sync_dirty_buffer(bh);
-				jbd_lock_bh_state(bh);
-				spin_lock(&journal->j_list_lock);
-				/* Since we dropped the lock... */
-				if (!buffer_mapped(bh)) {
-					JBUFFER_TRACE(jh, "buffer got unmapped");
-					goto no_journal;
-				}
-				/* The buffer may become locked again at any
-				   time if it is redirtied */
-			}
-
-			/* journal_clean_data_list() may have got there first */
-			if (jh->b_transaction != NULL) {
-				JBUFFER_TRACE(jh, "unfile from commit");
-				__jbd2_journal_temp_unlink_buffer(jh);
-				/* It still points to the committing
-				 * transaction; move it to this one so
-				 * that the refile assert checks are
-				 * happy. */
-				jh->b_transaction = handle->h_transaction;
-			}
-			/* The buffer will be refiled below */
-
-		}
-		/*
-		 * Special case --- the buffer might actually have been
-		 * allocated and then immediately deallocated in the previous,
-		 * committing transaction, so might still be left on that
-		 * transaction's metadata lists.
-		 */
-		if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) {
-			JBUFFER_TRACE(jh, "not on correct data list: unfile");
-			J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow);
-			__jbd2_journal_temp_unlink_buffer(jh);
-			jh->b_transaction = handle->h_transaction;
-			JBUFFER_TRACE(jh, "file as data");
-			__jbd2_journal_file_buffer(jh, handle->h_transaction,
-						BJ_SyncData);
-		}
-	} else {
-		JBUFFER_TRACE(jh, "not on a transaction");
-		__jbd2_journal_file_buffer(jh, handle->h_transaction, BJ_SyncData);
-	}
-no_journal:
-	spin_unlock(&journal->j_list_lock);
-	jbd_unlock_bh_state(bh);
-	if (need_brelse) {
-		BUFFER_TRACE(bh, "brelse");
-		__brelse(bh);
-	}
-	JBUFFER_TRACE(jh, "exit");
-	jbd2_journal_put_journal_head(jh);
-	return 0;
-}
-
 /**
  * int jbd2_journal_dirty_metadata() -  mark a buffer as containing dirty metadata
  * @handle: transaction to add buffer to.
@@ -1541,10 +1364,10 @@ __blist_del_buffer(struct journal_head **list, struct journal_head *jh)
  * Remove a buffer from the appropriate transaction list.
  *
  * Note that this function can *change* the value of
- * bh->b_transaction->t_sync_datalist, t_buffers, t_forget,
- * t_iobuf_list, t_shadow_list, t_log_list or t_reserved_list.  If the caller
- * is holding onto a copy of one of thee pointers, it could go bad.
- * Generally the caller needs to re-read the pointer from the transaction_t.
+ * bh->b_transaction->t_buffers, t_forget, t_iobuf_list, t_shadow_list,
+ * t_log_list or t_reserved_list.  If the caller is holding onto a copy of one
+ * of these pointers, it could go bad.  Generally the caller needs to re-read
+ * the pointer from the transaction_t.
  *
  * Called under j_list_lock.  The journal may not be locked.
  */
@@ -1566,9 +1389,6 @@ void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
 	switch (jh->b_jlist) {
 	case BJ_None:
 		return;
-	case BJ_SyncData:
-		list = &transaction->t_sync_datalist;
-		break;
 	case BJ_Metadata:
 		transaction->t_nr_buffers--;
 		J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0);
@@ -1589,9 +1409,6 @@ void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
 	case BJ_Reserved:
 		list = &transaction->t_reserved_list;
 		break;
-	case BJ_Locked:
-		list = &transaction->t_locked_list;
-		break;
 	}
 
 	__blist_del_buffer(list, jh);
@@ -1634,15 +1451,7 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
 		goto out;
 
 	spin_lock(&journal->j_list_lock);
-	if (jh->b_transaction != NULL && jh->b_cp_transaction == NULL) {
-		if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) {
-			/* A written-back ordered data buffer */
-			JBUFFER_TRACE(jh, "release data");
-			__jbd2_journal_unfile_buffer(jh);
-			jbd2_journal_remove_journal_head(bh);
-			__brelse(bh);
-		}
-	} else if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {
+	if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {
 		/* written-back checkpointed metadata buffer */
 		if (jh->b_jlist == BJ_None) {
 			JBUFFER_TRACE(jh, "remove from checkpoint list");
@@ -1878,6 +1687,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
 	if (!buffer_jbd(bh))
 		goto zap_buffer_unlocked;
 
+	/* OK, we have data buffer in journaled mode */
 	spin_lock(&journal->j_state_lock);
 	jbd_lock_bh_state(bh);
 	spin_lock(&journal->j_list_lock);
@@ -1941,15 +1751,6 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
 		}
 	} else if (transaction == journal->j_committing_transaction) {
 		JBUFFER_TRACE(jh, "on committing transaction");
-		if (jh->b_jlist == BJ_Locked) {
-			/*
-			 * The buffer is on the committing transaction's locked
-			 * list.  We have the buffer locked, so I/O has
-			 * completed.  So we can nail the buffer now.
-			 */
-			may_free = __dispose_buffer(jh, transaction);
-			goto zap_buffer;
-		}
 		/*
 		 * If it is committing, we simply cannot touch it.  We
 		 * can remove it's next_transaction pointer from the
@@ -2082,9 +1883,6 @@ void __jbd2_journal_file_buffer(struct journal_head *jh,
 		J_ASSERT_JH(jh, !jh->b_committed_data);
 		J_ASSERT_JH(jh, !jh->b_frozen_data);
 		return;
-	case BJ_SyncData:
-		list = &transaction->t_sync_datalist;
-		break;
 	case BJ_Metadata:
 		transaction->t_nr_buffers++;
 		list = &transaction->t_buffers;
@@ -2104,9 +1902,6 @@ void __jbd2_journal_file_buffer(struct journal_head *jh,
 	case BJ_Reserved:
 		list = &transaction->t_reserved_list;
 		break;
-	case BJ_Locked:
-		list =  &transaction->t_locked_list;
-		break;
 	}
 
 	__blist_add_buffer(list, jh);
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 622c3d8ca4e..3dd20900709 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -542,24 +542,12 @@ struct transaction_s
 	 */
 	struct journal_head	*t_reserved_list;
 
-	/*
-	 * Doubly-linked circular list of all buffers under writeout during
-	 * commit [j_list_lock]
-	 */
-	struct journal_head	*t_locked_list;
-
 	/*
 	 * Doubly-linked circular list of all metadata buffers owned by this
 	 * transaction [j_list_lock]
 	 */
 	struct journal_head	*t_buffers;
 
-	/*
-	 * Doubly-linked circular list of all data buffers still to be
-	 * flushed before this transaction can be committed [j_list_lock]
-	 */
-	struct journal_head	*t_sync_datalist;
-
 	/*
 	 * Doubly-linked circular list of all forget buffers (superseded
 	 * buffers which we can un-checkpoint once this transaction commits)
@@ -1044,7 +1032,6 @@ extern int	 jbd2_journal_extend (handle_t *, int nblocks);
 extern int	 jbd2_journal_get_write_access(handle_t *, struct buffer_head *);
 extern int	 jbd2_journal_get_create_access (handle_t *, struct buffer_head *);
 extern int	 jbd2_journal_get_undo_access(handle_t *, struct buffer_head *);
-extern int	 jbd2_journal_dirty_data (handle_t *, struct buffer_head *);
 extern int	 jbd2_journal_dirty_metadata (handle_t *, struct buffer_head *);
 extern void	 jbd2_journal_release_buffer (handle_t *, struct buffer_head *);
 extern int	 jbd2_journal_forget (handle_t *, struct buffer_head *);
@@ -1223,15 +1210,13 @@ static inline int jbd_space_needed(journal_t *journal)
 
 /* journaling buffer types */
 #define BJ_None		0	/* Not journaled */
-#define BJ_SyncData	1	/* Normal data: flush before commit */
-#define BJ_Metadata	2	/* Normal journaled metadata */
-#define BJ_Forget	3	/* Buffer superseded by this transaction */
-#define BJ_IO		4	/* Buffer is for temporary IO use */
-#define BJ_Shadow	5	/* Buffer contents being shadowed to the log */
-#define BJ_LogCtl	6	/* Buffer contains log descriptors */
-#define BJ_Reserved	7	/* Buffer is reserved for access by journal */
-#define BJ_Locked	8	/* Locked for I/O during commit */
-#define BJ_Types	9
+#define BJ_Metadata	1	/* Normal journaled metadata */
+#define BJ_Forget	2	/* Buffer superseded by this transaction */
+#define BJ_IO		3	/* Buffer is for temporary IO use */
+#define BJ_Shadow	4	/* Buffer contents being shadowed to the log */
+#define BJ_LogCtl	5	/* Buffer contains log descriptors */
+#define BJ_Reserved	6	/* Buffer is reserved for access by journal */
+#define BJ_Types	7
 
 extern int jbd_blocks_per_page(struct inode *inode);
 
-- 
cgit v1.2.3-70-g09d2


From cd1aac32923a9c8adcc0ae85e33c1ca0c5855838 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Fri, 11 Jul 2008 19:27:31 -0400
Subject: ext4: Add ordered mode support for delalloc

This provides a new ordered mode implementation which gets rid of using
buffer heads to enforce the ordering between metadata change with the
related data chage.  Instead, in the new ordering mode, it keeps track
of all of the inodes touched by each transaction on a list, and when
that transaction is committed, it flushes all of the dirty pages for
those inodes.  In addition, the new ordered mode reverses the lock
ordering of the page lock and transaction lock, which provides easier
support for delayed allocation.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c  | 30 +++++++++++++++++++++++++-----
 fs/jbd2/commit.c | 38 +++++++++++++++++++++++++++++++++-----
 2 files changed, 58 insertions(+), 10 deletions(-)

(limited to 'fs/jbd2/commit.c')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 7923336ecf9..24518b57733 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2043,11 +2043,12 @@ static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
 	return !buffer_mapped(bh) || buffer_delay(bh);
 }
 
-/* FIXME!! only support data=writeback mode */
 /*
  * get called vi ext4_da_writepages after taking page lock
  * We may end up doing block allocation here in case
  * mpage_da_map_blocks failed to allocate blocks.
+ *
+ * We also get called via journal_submit_inode_data_buffers
  */
 static int ext4_da_writepage(struct page *page,
 				struct writeback_control *wbc)
@@ -2066,6 +2067,7 @@ static int ext4_da_writepage(struct page *page,
 		 * ext4_da_writepages() but directly (shrink_page_list).
 		 * We cannot easily start a transaction here so we just skip
 		 * writing the page in case we would have to do so.
+		 * We reach here also via journal_submit_inode_data_buffers
 		 */
 		size = i_size_read(inode);
 
@@ -2081,8 +2083,11 @@ static int ext4_da_writepage(struct page *page,
 			 * We can't do block allocation under
 			 * page lock without a handle . So redirty
 			 * the page and return
+			 * We may reach here when we do a journal commit
+			 * via journal_submit_inode_data_buffers.
+			 * If we don't have mapping block we just ignore
+			 * them
 			 */
-			BUG_ON(wbc->sync_mode != WB_SYNC_NONE);
 			redirty_page_for_writepage(wbc, page);
 			unlock_page(page);
 			return 0;
@@ -2097,7 +2102,6 @@ static int ext4_da_writepage(struct page *page,
 	return ret;
 }
 
-
 /*
  * For now just follow the DIO way to estimate the max credits
  * needed to write out EXT4_MAX_WRITEBACK_PAGES.
@@ -2130,7 +2134,7 @@ static int ext4_da_writepages(struct address_space *mapping,
 		return 0;
 
 	/*
-	 *  Estimate the worse case needed credits to write out
+	 * Estimate the worse case needed credits to write out
 	 * EXT4_MAX_BUF_BLOCKS pages
 	 */
 	needed_blocks = EXT4_MAX_WRITEBACK_CREDITS;
@@ -2152,6 +2156,19 @@ static int ext4_da_writepages(struct address_space *mapping,
 			ret = PTR_ERR(handle);
 			goto out_writepages;
 		}
+		if (ext4_should_order_data(inode)) {
+			/*
+			 * With ordered mode we need to add
+			 * the inode to the journal handle
+			 * when we do block allocation.
+			 */
+			ret = ext4_jbd2_file_inode(handle, inode);
+			if (ret) {
+				ext4_journal_stop(handle);
+				goto out_writepages;
+			}
+
+		}
 		/*
 		 * set the max dirty pages could be write at a time
 		 * to fit into the reserved transaction credits
@@ -2735,7 +2752,10 @@ static const struct address_space_operations ext4_da_aops = {
 
 void ext4_set_aops(struct inode *inode)
 {
-	if (ext4_should_order_data(inode))
+	if (ext4_should_order_data(inode) &&
+		test_opt(inode->i_sb, DELALLOC))
+		inode->i_mapping->a_ops = &ext4_da_aops;
+	else if (ext4_should_order_data(inode))
 		inode->i_mapping->a_ops = &ext4_ordered_aops;
 	else if (ext4_should_writeback_data(inode) &&
 		 test_opt(inode->i_sb, DELALLOC))
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 483183d15ed..f8b3be87322 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -22,6 +22,8 @@
 #include <linux/pagemap.h>
 #include <linux/jiffies.h>
 #include <linux/crc32.h>
+#include <linux/writeback.h>
+#include <linux/backing-dev.h>
 
 /*
  * Default IO end handler for temporary BJ_IO buffer_heads.
@@ -184,6 +186,27 @@ static int journal_wait_on_commit_record(struct buffer_head *bh)
 	return ret;
 }
 
+/*
+ * write the filemap data using writepage() address_space_operations.
+ * We don't do block allocation here even for delalloc. We don't
+ * use writepages() because with dealyed allocation we may be doing
+ * block allocation in writepages().
+ */
+static int journal_submit_inode_data_buffers(struct address_space *mapping)
+{
+	int ret;
+	struct writeback_control wbc = {
+		.sync_mode =  WB_SYNC_ALL,
+		.nr_to_write = mapping->nrpages * 2,
+		.range_start = 0,
+		.range_end = i_size_read(mapping->host),
+		.for_writepages = 1,
+	};
+
+	ret = generic_writepages(mapping, &wbc);
+	return ret;
+}
+
 /*
  * Submit all the data buffers of inode associated with the transaction to
  * disk.
@@ -192,7 +215,7 @@ static int journal_wait_on_commit_record(struct buffer_head *bh)
  * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently
  * operate on from being released while we write out pages.
  */
-static int journal_submit_inode_data_buffers(journal_t *journal,
+static int journal_submit_data_buffers(journal_t *journal,
 		transaction_t *commit_transaction)
 {
 	struct jbd2_inode *jinode;
@@ -204,8 +227,13 @@ static int journal_submit_inode_data_buffers(journal_t *journal,
 		mapping = jinode->i_vfs_inode->i_mapping;
 		jinode->i_flags |= JI_COMMIT_RUNNING;
 		spin_unlock(&journal->j_list_lock);
-		err = filemap_fdatawrite_range(mapping, 0,
-					i_size_read(jinode->i_vfs_inode));
+		/*
+		 * submit the inode data buffers. We use writepage
+		 * instead of writepages. Because writepages can do
+		 * block allocation  with delalloc. We need to write
+		 * only allocated blocks here.
+		 */
+		err = journal_submit_inode_data_buffers(mapping);
 		if (!ret)
 			ret = err;
 		spin_lock(&journal->j_list_lock);
@@ -228,7 +256,7 @@ static int journal_finish_inode_data_buffers(journal_t *journal,
 	struct jbd2_inode *jinode, *next_i;
 	int err, ret = 0;
 
-	/* For locking, see the comment in journal_submit_inode_data_buffers() */
+	/* For locking, see the comment in journal_submit_data_buffers() */
 	spin_lock(&journal->j_list_lock);
 	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
 		jinode->i_flags |= JI_COMMIT_RUNNING;
@@ -431,7 +459,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	 * Now start flushing things to disk, in the order they appear
 	 * on the transaction lists.  Data blocks go first.
 	 */
-	err = journal_submit_inode_data_buffers(journal, commit_transaction);
+	err = journal_submit_data_buffers(journal, commit_transaction);
 	if (err)
 		jbd2_journal_abort(journal, err);
 
-- 
cgit v1.2.3-70-g09d2


From e9e34f4e8f42177c66754fec1edfd35e70c18f99 Mon Sep 17 00:00:00 2001
From: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Date: Thu, 31 Jul 2008 22:26:04 -0400
Subject: jbd2: don't abort if flushing file data failed

In ordered mode, the current jbd2 aborts the journal if a file data buffer
has an error.  But this behavior is unintended, and we found that it has
been adopted accidentally.

This patch undoes it and just calls printk() instead of aborting the
journal.  Unlike a similar patch for ext3/jbd, file data buffers are
written via generic_writepages().  But we also need to set AS_EIO
into their mappings because wait_on_page_writeback_range() clears
AS_EIO before a user process sees it.

Signed-off-by: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/jbd2/commit.c | 24 ++++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

(limited to 'fs/jbd2/commit.c')

diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index f8b3be87322..adf0395f318 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -262,8 +262,18 @@ static int journal_finish_inode_data_buffers(journal_t *journal,
 		jinode->i_flags |= JI_COMMIT_RUNNING;
 		spin_unlock(&journal->j_list_lock);
 		err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
-		if (!ret)
-			ret = err;
+		if (err) {
+			/*
+			 * Because AS_EIO is cleared by
+			 * wait_on_page_writeback_range(), set it again so
+			 * that user process can get -EIO from fsync().
+			 */
+			set_bit(AS_EIO,
+				&jinode->i_vfs_inode->i_mapping->flags);
+
+			if (!ret)
+				ret = err;
+		}
 		spin_lock(&journal->j_list_lock);
 		jinode->i_flags &= ~JI_COMMIT_RUNNING;
 		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
@@ -670,8 +680,14 @@ start_journal_io:
 	 * commit block, which happens below in such setting.
 	 */
 	err = journal_finish_inode_data_buffers(journal, commit_transaction);
-	if (err)
-		jbd2_journal_abort(journal, err);
+	if (err) {
+		char b[BDEVNAME_SIZE];
+
+		printk(KERN_WARNING
+			"JBD2: Detected IO errors while flushing file data "
+			"on %s\n", bdevname(journal->j_fs_dev, b));
+		err = 0;
+	}
 
 	/* Lo and behold: we have just managed to send a transaction to
            the log.  Before we can commit it, wait for the IO so far to
-- 
cgit v1.2.3-70-g09d2


From 529ae9aaa08378cfe2a4350bded76f32cc8ff0ce Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Sat, 2 Aug 2008 12:01:03 +0200
Subject: mm: rename page trylock

Converting page lock to new locking bitops requires a change of page flag
operation naming, so we might as well convert it to something nicer
(!TestSetPageLocked_Lock => trylock_page, SetPageLocked => set_page_locked).

This also facilitates lockdeping of page lock.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Acked-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Acked-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/scsi/sg.c           |  2 +-
 fs/afs/write.c              |  2 +-
 fs/cifs/file.c              |  2 +-
 fs/jbd/commit.c             |  4 +--
 fs/jbd2/commit.c            |  2 +-
 fs/reiserfs/journal.c       |  2 +-
 fs/splice.c                 |  2 +-
 fs/xfs/linux-2.6/xfs_aops.c |  4 +--
 include/linux/page-flags.h  |  2 +-
 include/linux/pagemap.h     | 67 +++++++++++++++++++++++++++------------------
 mm/filemap.c                | 12 ++++----
 mm/memory.c                 |  2 +-
 mm/migrate.c                |  4 +--
 mm/rmap.c                   |  2 +-
 mm/shmem.c                  |  4 +--
 mm/swap.c                   |  2 +-
 mm/swap_state.c             |  8 +++---
 mm/swapfile.c               |  2 +-
 mm/truncate.c               |  4 +--
 mm/vmscan.c                 |  4 +--
 20 files changed, 74 insertions(+), 59 deletions(-)

(limited to 'fs/jbd2/commit.c')

diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index d3b8ebb8377..3d36270a8b4 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -1747,7 +1747,7 @@ st_map_user_pages(struct scatterlist *sgl, const unsigned int max_pages,
                  */
 		flush_dcache_page(pages[i]);
 		/* ?? Is locking needed? I don't think so */
-		/* if (TestSetPageLocked(pages[i]))
+		/* if (!trylock_page(pages[i]))
 		   goto out_unlock; */
         }
 
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 9a849ad3c48..065b4e10681 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -404,7 +404,7 @@ static int afs_write_back_from_locked_page(struct afs_writeback *wb,
 			page = pages[loop];
 			if (page->index > wb->last)
 				break;
-			if (TestSetPageLocked(page))
+			if (!trylock_page(page))
 				break;
 			if (!PageDirty(page) ||
 			    page_private(page) != (unsigned long) wb) {
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 0aac824371a..e692c42f24b 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1280,7 +1280,7 @@ retry:
 
 			if (first < 0)
 				lock_page(page);
-			else if (TestSetPageLocked(page))
+			else if (!trylock_page(page))
 				break;
 
 			if (unlikely(page->mapping != mapping)) {
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 2eccbfaa1d4..81a9ad7177c 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -63,7 +63,7 @@ static void release_buffer_page(struct buffer_head *bh)
 		goto nope;
 
 	/* OK, it's a truncated page */
-	if (TestSetPageLocked(page))
+	if (!trylock_page(page))
 		goto nope;
 
 	page_cache_get(page);
@@ -446,7 +446,7 @@ void journal_commit_transaction(journal_t *journal)
 			spin_lock(&journal->j_list_lock);
 		}
 		if (unlikely(!buffer_uptodate(bh))) {
-			if (TestSetPageLocked(bh->b_page)) {
+			if (!trylock_page(bh->b_page)) {
 				spin_unlock(&journal->j_list_lock);
 				lock_page(bh->b_page);
 				spin_lock(&journal->j_list_lock);
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index adf0395f318..f2ad061e95e 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -67,7 +67,7 @@ static void release_buffer_page(struct buffer_head *bh)
 		goto nope;
 
 	/* OK, it's a truncated page */
-	if (TestSetPageLocked(page))
+	if (!trylock_page(page))
 		goto nope;
 
 	page_cache_get(page);
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index c8f60ee183b..ce2208b2711 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -627,7 +627,7 @@ static int journal_list_still_alive(struct super_block *s,
 static void release_buffer_page(struct buffer_head *bh)
 {
 	struct page *page = bh->b_page;
-	if (!page->mapping && !TestSetPageLocked(page)) {
+	if (!page->mapping && trylock_page(page)) {
 		page_cache_get(page);
 		put_bh(bh);
 		if (!page->mapping)
diff --git a/fs/splice.c b/fs/splice.c
index b30311ba8af..1bbc6f4bb09 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -371,7 +371,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 			 * for an in-flight io page
 			 */
 			if (flags & SPLICE_F_NONBLOCK) {
-				if (TestSetPageLocked(page)) {
+				if (!trylock_page(page)) {
 					error = -EAGAIN;
 					break;
 				}
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 0b211cba190..fa73179233a 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -675,7 +675,7 @@ xfs_probe_cluster(
 			} else
 				pg_offset = PAGE_CACHE_SIZE;
 
-			if (page->index == tindex && !TestSetPageLocked(page)) {
+			if (page->index == tindex && trylock_page(page)) {
 				pg_len = xfs_probe_page(page, pg_offset, mapped);
 				unlock_page(page);
 			}
@@ -759,7 +759,7 @@ xfs_convert_page(
 
 	if (page->index != tindex)
 		goto fail;
-	if (TestSetPageLocked(page))
+	if (!trylock_page(page))
 		goto fail;
 	if (PageWriteback(page))
 		goto fail_unlock_page;
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 25aaccdb2f2..c74d3e87531 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -163,7 +163,7 @@ static inline int Page##uname(struct page *page) 			\
 
 struct page;	/* forward declaration */
 
-PAGEFLAG(Locked, locked) TESTSCFLAG(Locked, locked)
+TESTPAGEFLAG(Locked, locked)
 PAGEFLAG(Error, error)
 PAGEFLAG(Referenced, referenced) TESTCLEARFLAG(Referenced, referenced)
 PAGEFLAG(Dirty, dirty) TESTSCFLAG(Dirty, dirty) __CLEARPAGEFLAG(Dirty, dirty)
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 69ed3cb1197..5da31c12101 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -250,29 +250,6 @@ static inline struct page *read_mapping_page(struct address_space *mapping,
 	return read_cache_page(mapping, index, filler, data);
 }
 
-int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
-				pgoff_t index, gfp_t gfp_mask);
-int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
-				pgoff_t index, gfp_t gfp_mask);
-extern void remove_from_page_cache(struct page *page);
-extern void __remove_from_page_cache(struct page *page);
-
-/*
- * Like add_to_page_cache_locked, but used to add newly allocated pages:
- * the page is new, so we can just run SetPageLocked() against it.
- */
-static inline int add_to_page_cache(struct page *page,
-		struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask)
-{
-	int error;
-
-	SetPageLocked(page);
-	error = add_to_page_cache_locked(page, mapping, offset, gfp_mask);
-	if (unlikely(error))
-		ClearPageLocked(page);
-	return error;
-}
-
 /*
  * Return byte-offset into filesystem object for page.
  */
@@ -294,13 +271,28 @@ extern int __lock_page_killable(struct page *page);
 extern void __lock_page_nosync(struct page *page);
 extern void unlock_page(struct page *page);
 
+static inline void set_page_locked(struct page *page)
+{
+	set_bit(PG_locked, &page->flags);
+}
+
+static inline void clear_page_locked(struct page *page)
+{
+	clear_bit(PG_locked, &page->flags);
+}
+
+static inline int trylock_page(struct page *page)
+{
+	return !test_and_set_bit(PG_locked, &page->flags);
+}
+
 /*
  * lock_page may only be called if we have the page's inode pinned.
  */
 static inline void lock_page(struct page *page)
 {
 	might_sleep();
-	if (TestSetPageLocked(page))
+	if (!trylock_page(page))
 		__lock_page(page);
 }
 
@@ -312,7 +304,7 @@ static inline void lock_page(struct page *page)
 static inline int lock_page_killable(struct page *page)
 {
 	might_sleep();
-	if (TestSetPageLocked(page))
+	if (!trylock_page(page))
 		return __lock_page_killable(page);
 	return 0;
 }
@@ -324,7 +316,7 @@ static inline int lock_page_killable(struct page *page)
 static inline void lock_page_nosync(struct page *page)
 {
 	might_sleep();
-	if (TestSetPageLocked(page))
+	if (!trylock_page(page))
 		__lock_page_nosync(page);
 }
 	
@@ -409,4 +401,27 @@ static inline int fault_in_pages_readable(const char __user *uaddr, int size)
 	return ret;
 }
 
+int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
+				pgoff_t index, gfp_t gfp_mask);
+int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
+				pgoff_t index, gfp_t gfp_mask);
+extern void remove_from_page_cache(struct page *page);
+extern void __remove_from_page_cache(struct page *page);
+
+/*
+ * Like add_to_page_cache_locked, but used to add newly allocated pages:
+ * the page is new, so we can just run set_page_locked() against it.
+ */
+static inline int add_to_page_cache(struct page *page,
+		struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask)
+{
+	int error;
+
+	set_page_locked(page);
+	error = add_to_page_cache_locked(page, mapping, offset, gfp_mask);
+	if (unlikely(error))
+		clear_page_locked(page);
+	return error;
+}
+
 #endif /* _LINUX_PAGEMAP_H */
diff --git a/mm/filemap.c b/mm/filemap.c
index d97d1ad5547..54e96865085 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -558,14 +558,14 @@ EXPORT_SYMBOL(wait_on_page_bit);
  * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
  *
  * The first mb is necessary to safely close the critical section opened by the
- * TestSetPageLocked(), the second mb is necessary to enforce ordering between
- * the clear_bit and the read of the waitqueue (to avoid SMP races with a
- * parallel wait_on_page_locked()).
+ * test_and_set_bit() to lock the page; the second mb is necessary to enforce
+ * ordering between the clear_bit and the read of the waitqueue (to avoid SMP
+ * races with a parallel wait_on_page_locked()).
  */
 void unlock_page(struct page *page)
 {
 	smp_mb__before_clear_bit();
-	if (!TestClearPageLocked(page))
+	if (!test_and_clear_bit(PG_locked, &page->flags))
 		BUG();
 	smp_mb__after_clear_bit(); 
 	wake_up_page(page, PG_locked);
@@ -931,7 +931,7 @@ grab_cache_page_nowait(struct address_space *mapping, pgoff_t index)
 	struct page *page = find_get_page(mapping, index);
 
 	if (page) {
-		if (!TestSetPageLocked(page))
+		if (trylock_page(page))
 			return page;
 		page_cache_release(page);
 		return NULL;
@@ -1027,7 +1027,7 @@ find_page:
 			if (inode->i_blkbits == PAGE_CACHE_SHIFT ||
 					!mapping->a_ops->is_partially_uptodate)
 				goto page_not_up_to_date;
-			if (TestSetPageLocked(page))
+			if (!trylock_page(page))
 				goto page_not_up_to_date;
 			if (!mapping->a_ops->is_partially_uptodate(page,
 								desc, offset))
diff --git a/mm/memory.c b/mm/memory.c
index a472bcd4b06..1002f473f49 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1789,7 +1789,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * not dirty accountable.
 	 */
 	if (PageAnon(old_page)) {
-		if (!TestSetPageLocked(old_page)) {
+		if (trylock_page(old_page)) {
 			reuse = can_share_swap_page(old_page);
 			unlock_page(old_page);
 		}
diff --git a/mm/migrate.c b/mm/migrate.c
index 153572fb60b..2a80136b23b 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -605,7 +605,7 @@ static int move_to_new_page(struct page *newpage, struct page *page)
 	 * establishing additional references. We are the only one
 	 * holding a reference to the new page at this point.
 	 */
-	if (TestSetPageLocked(newpage))
+	if (!trylock_page(newpage))
 		BUG();
 
 	/* Prepare mapping for the new page.*/
@@ -667,7 +667,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
 	BUG_ON(charge);
 
 	rc = -EAGAIN;
-	if (TestSetPageLocked(page)) {
+	if (!trylock_page(page)) {
 		if (!force)
 			goto move_newpage;
 		lock_page(page);
diff --git a/mm/rmap.c b/mm/rmap.c
index 94a5246a3f9..1ea4e6fcee7 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -422,7 +422,7 @@ int page_referenced(struct page *page, int is_locked,
 			referenced += page_referenced_anon(page, mem_cont);
 		else if (is_locked)
 			referenced += page_referenced_file(page, mem_cont);
-		else if (TestSetPageLocked(page))
+		else if (!trylock_page(page))
 			referenced++;
 		else {
 			if (page->mapping)
diff --git a/mm/shmem.c b/mm/shmem.c
index c1e5a3b4f75..04fb4f1ab88 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1265,7 +1265,7 @@ repeat:
 		}
 
 		/* We have to do this with page locked to prevent races */
-		if (TestSetPageLocked(swappage)) {
+		if (!trylock_page(swappage)) {
 			shmem_swp_unmap(entry);
 			spin_unlock(&info->lock);
 			wait_on_page_locked(swappage);
@@ -1329,7 +1329,7 @@ repeat:
 		shmem_swp_unmap(entry);
 		filepage = find_get_page(mapping, idx);
 		if (filepage &&
-		    (!PageUptodate(filepage) || TestSetPageLocked(filepage))) {
+		    (!PageUptodate(filepage) || !trylock_page(filepage))) {
 			spin_unlock(&info->lock);
 			wait_on_page_locked(filepage);
 			page_cache_release(filepage);
diff --git a/mm/swap.c b/mm/swap.c
index 7417a2adbe5..9e0cb311807 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -444,7 +444,7 @@ void pagevec_strip(struct pagevec *pvec)
 	for (i = 0; i < pagevec_count(pvec); i++) {
 		struct page *page = pvec->pages[i];
 
-		if (PagePrivate(page) && !TestSetPageLocked(page)) {
+		if (PagePrivate(page) && trylock_page(page)) {
 			if (PagePrivate(page))
 				try_to_release_page(page, 0);
 			unlock_page(page);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index b8035b05512..167cf2dc8a0 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -201,7 +201,7 @@ void delete_from_swap_cache(struct page *page)
  */
 static inline void free_swap_cache(struct page *page)
 {
-	if (PageSwapCache(page) && !TestSetPageLocked(page)) {
+	if (PageSwapCache(page) && trylock_page(page)) {
 		remove_exclusive_swap_page(page);
 		unlock_page(page);
 	}
@@ -302,9 +302,9 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 		 * re-using the just freed swap entry for an existing page.
 		 * May fail (-ENOMEM) if radix-tree node allocation failed.
 		 */
-		SetPageLocked(new_page);
+		set_page_locked(new_page);
 		err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL);
-		if (!err) {
+		if (likely(!err)) {
 			/*
 			 * Initiate read into locked page and return.
 			 */
@@ -312,7 +312,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 			swap_readpage(NULL, new_page);
 			return new_page;
 		}
-		ClearPageLocked(new_page);
+		clear_page_locked(new_page);
 		swap_free(entry);
 	} while (err != -ENOMEM);
 
diff --git a/mm/swapfile.c b/mm/swapfile.c
index bb7f79641f9..1e330f2998f 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -403,7 +403,7 @@ void free_swap_and_cache(swp_entry_t entry)
 	if (p) {
 		if (swap_entry_free(p, swp_offset(entry)) == 1) {
 			page = find_get_page(&swapper_space, entry.val);
-			if (page && unlikely(TestSetPageLocked(page))) {
+			if (page && unlikely(!trylock_page(page))) {
 				page_cache_release(page);
 				page = NULL;
 			}
diff --git a/mm/truncate.c b/mm/truncate.c
index 894e9a70699..250505091d3 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -187,7 +187,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
 			if (page_index > next)
 				next = page_index;
 			next++;
-			if (TestSetPageLocked(page))
+			if (!trylock_page(page))
 				continue;
 			if (PageWriteback(page)) {
 				unlock_page(page);
@@ -280,7 +280,7 @@ unsigned long __invalidate_mapping_pages(struct address_space *mapping,
 			pgoff_t index;
 			int lock_failed;
 
-			lock_failed = TestSetPageLocked(page);
+			lock_failed = !trylock_page(page);
 
 			/*
 			 * We really shouldn't be looking at the ->index of an
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 75be453628b..1ff1a58e7c1 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -496,7 +496,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		page = lru_to_page(page_list);
 		list_del(&page->lru);
 
-		if (TestSetPageLocked(page))
+		if (!trylock_page(page))
 			goto keep;
 
 		VM_BUG_ON(PageActive(page));
@@ -582,7 +582,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 				 * A synchronous write - probably a ramdisk.  Go
 				 * ahead and try to reclaim the page.
 				 */
-				if (TestSetPageLocked(page))
+				if (!trylock_page(page))
 					goto keep;
 				if (PageDirty(page) || PageWriteback(page))
 					goto keep_locked;
-- 
cgit v1.2.3-70-g09d2


From 05496769e5da83ce22ed97345afd9c7b71d6bd24 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Tue, 16 Sep 2008 14:36:17 -0400
Subject: jbd2: clean up how the journal device name is printed

Calculate the journal device name once and stash it away in the
journal_s structure.  This avoids needing to call bdevname()
everywhere and reduces stack usage by not needing to allocate an
on-stack buffer.  In addition, we eliminate the '/' that can appear in
device names (e.g. "cciss/c0d0p9" --- see kernel bugzilla #11321) that
can cause problems when creating proc directory names, and include the
inode number to support ocfs2 which creates multiple journals with
different inode numbers.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/super.c      | 12 +++---------
 fs/jbd2/commit.c     | 11 +++--------
 fs/jbd2/journal.c    | 48 ++++++++++++++++--------------------------------
 include/linux/jbd2.h |  3 ++-
 4 files changed, 24 insertions(+), 50 deletions(-)

(limited to 'fs/jbd2/commit.c')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index f58cc0309dc..64e1c21eb5d 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1476,15 +1476,9 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
 			EXT4_INODES_PER_GROUP(sb),
 			sbi->s_mount_opt);
 
-	if (EXT4_SB(sb)->s_journal->j_inode == NULL) {
-		char b[BDEVNAME_SIZE];
-
-		printk(KERN_INFO "EXT4 FS on %s, external journal on %s\n",
-		       sb->s_id, bdevname(EXT4_SB(sb)->s_journal->j_dev, b));
-	} else {
-		printk(KERN_INFO "EXT4 FS on %s, internal journal\n",
-		       sb->s_id);
-	}
+	printk(KERN_INFO "EXT4 FS on %s, %s journal on %s\n",
+	       sb->s_id, EXT4_SB(sb)->s_journal->j_inode ? "internal" :
+	       "external", EXT4_SB(sb)->s_journal->j_devname);
 	return res;
 }
 
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index f2ad061e95e..b091e5378fe 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -147,12 +147,9 @@ static int journal_submit_commit_record(journal_t *journal,
 	 * to remember if we sent a barrier request
 	 */
 	if (ret == -EOPNOTSUPP && barrier_done) {
-		char b[BDEVNAME_SIZE];
-
 		printk(KERN_WARNING
-			"JBD: barrier-based sync failed on %s - "
-			"disabling barriers\n",
-			bdevname(journal->j_dev, b));
+		       "JBD: barrier-based sync failed on %s - "
+		       "disabling barriers\n", journal->j_devname);
 		spin_lock(&journal->j_state_lock);
 		journal->j_flags &= ~JBD2_BARRIER;
 		spin_unlock(&journal->j_state_lock);
@@ -681,11 +678,9 @@ start_journal_io:
 	 */
 	err = journal_finish_inode_data_buffers(journal, commit_transaction);
 	if (err) {
-		char b[BDEVNAME_SIZE];
-
 		printk(KERN_WARNING
 			"JBD2: Detected IO errors while flushing file data "
-			"on %s\n", bdevname(journal->j_fs_dev, b));
+		       "on %s\n", journal->j_devname);
 		err = 0;
 	}
 
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 8207a01c4ed..81186a29742 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -597,13 +597,9 @@ int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr,
 		if (ret)
 			*retp = ret;
 		else {
-			char b[BDEVNAME_SIZE];
-
 			printk(KERN_ALERT "%s: journal block not found "
 					"at offset %lu on %s\n",
-				__func__,
-				blocknr,
-				bdevname(journal->j_dev, b));
+			       __func__, blocknr, journal->j_devname);
 			err = -EIO;
 			__journal_abort_soft(journal, err);
 		}
@@ -901,10 +897,7 @@ static struct proc_dir_entry *proc_jbd2_stats;
 
 static void jbd2_stats_proc_init(journal_t *journal)
 {
-	char name[BDEVNAME_SIZE];
-
-	bdevname(journal->j_dev, name);
-	journal->j_proc_entry = proc_mkdir(name, proc_jbd2_stats);
+	journal->j_proc_entry = proc_mkdir(journal->j_devname, proc_jbd2_stats);
 	if (journal->j_proc_entry) {
 		proc_create_data("history", S_IRUGO, journal->j_proc_entry,
 				 &jbd2_seq_history_fops, journal);
@@ -915,12 +908,9 @@ static void jbd2_stats_proc_init(journal_t *journal)
 
 static void jbd2_stats_proc_exit(journal_t *journal)
 {
-	char name[BDEVNAME_SIZE];
-
-	bdevname(journal->j_dev, name);
 	remove_proc_entry("info", journal->j_proc_entry);
 	remove_proc_entry("history", journal->j_proc_entry);
-	remove_proc_entry(name, proc_jbd2_stats);
+	remove_proc_entry(journal->j_devname, proc_jbd2_stats);
 }
 
 static void journal_init_stats(journal_t *journal)
@@ -1018,6 +1008,7 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
 {
 	journal_t *journal = journal_init_common();
 	struct buffer_head *bh;
+	char *p;
 	int n;
 
 	if (!journal)
@@ -1039,6 +1030,10 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
 	journal->j_fs_dev = fs_dev;
 	journal->j_blk_offset = start;
 	journal->j_maxlen = len;
+	bdevname(journal->j_dev, journal->j_devname);
+	p = journal->j_devname;
+	while ((p = strchr(p, '/')))
+		*p = '!';
 	jbd2_stats_proc_init(journal);
 
 	bh = __getblk(journal->j_dev, start, journal->j_blocksize);
@@ -1061,6 +1056,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
 {
 	struct buffer_head *bh;
 	journal_t *journal = journal_init_common();
+	char *p;
 	int err;
 	int n;
 	unsigned long long blocknr;
@@ -1070,6 +1066,12 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
 
 	journal->j_dev = journal->j_fs_dev = inode->i_sb->s_bdev;
 	journal->j_inode = inode;
+	bdevname(journal->j_dev, journal->j_devname);
+	p = journal->j_devname;
+	while ((p = strchr(p, '/')))
+		*p = '!';
+	p = journal->j_devname + strlen(journal->j_devname);
+	sprintf(p, ":%lu", journal->j_inode->i_ino);
 	jbd_debug(1,
 		  "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n",
 		  journal, inode->i_sb->s_id, inode->i_ino,
@@ -1760,23 +1762,6 @@ int jbd2_journal_wipe(journal_t *journal, int write)
 	return err;
 }
 
-/*
- * journal_dev_name: format a character string to describe on what
- * device this journal is present.
- */
-
-static const char *journal_dev_name(journal_t *journal, char *buffer)
-{
-	struct block_device *bdev;
-
-	if (journal->j_inode)
-		bdev = journal->j_inode->i_sb->s_bdev;
-	else
-		bdev = journal->j_dev;
-
-	return bdevname(bdev, buffer);
-}
-
 /*
  * Journal abort has very specific semantics, which we describe
  * for journal abort.
@@ -1793,13 +1778,12 @@ static const char *journal_dev_name(journal_t *journal, char *buffer)
 void __jbd2_journal_abort_hard(journal_t *journal)
 {
 	transaction_t *transaction;
-	char b[BDEVNAME_SIZE];
 
 	if (journal->j_flags & JBD2_ABORT)
 		return;
 
 	printk(KERN_ERR "Aborting journal on device %s.\n",
-		journal_dev_name(journal, b));
+	       journal->j_devname);
 
 	spin_lock(&journal->j_state_lock);
 	journal->j_flags |= JBD2_ABORT;
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 3dd20900709..66c3499478b 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -850,7 +850,8 @@ struct journal_s
 	 */
 	struct block_device	*j_dev;
 	int			j_blocksize;
-	unsigned long long		j_blk_offset;
+	unsigned long long	j_blk_offset;
+	char			j_devname[BDEVNAME_SIZE+24];
 
 	/*
 	 * Device which holds the client fs.  For internal journal this will be
-- 
cgit v1.2.3-70-g09d2


From ede86cc473defab74d778aeac14b19f43129d4d1 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sun, 5 Oct 2008 20:50:06 -0400
Subject: ext4: Add debugging markers that can be used by systemtap

This debugging markers are designed to debug problems such as the
random filesystem latency problems reported by Arjan.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/fsync.c      | 5 +++++
 fs/ext4/super.c      | 2 ++
 fs/jbd2/checkpoint.c | 3 +++
 fs/jbd2/commit.c     | 6 ++++++
 4 files changed, 16 insertions(+)

(limited to 'fs/jbd2/commit.c')

diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index c37d1e86f51..5afe4370840 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -28,6 +28,7 @@
 #include <linux/writeback.h>
 #include <linux/jbd2.h>
 #include <linux/blkdev.h>
+#include <linux/marker.h>
 #include "ext4.h"
 #include "ext4_jbd2.h"
 
@@ -51,6 +52,10 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
 
 	J_ASSERT(ext4_journal_current_handle() == NULL);
 
+	trace_mark(ext4_sync_file, "dev %s datasync %d ino %ld parent %ld",
+		   inode->i_sb->s_id, datasync, inode->i_ino,
+		   dentry->d_parent->d_inode->i_ino);
+
 	/*
 	 * data=writeback:
 	 *  The caller's filemap_fdatawrite()/wait will sync the data.
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index dfcd41fafb9..9c0214689de 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -35,6 +35,7 @@
 #include <linux/quotaops.h>
 #include <linux/seq_file.h>
 #include <linux/proc_fs.h>
+#include <linux/marker.h>
 #include <linux/log2.h>
 #include <linux/crc16.h>
 #include <asm/uaccess.h>
@@ -2951,6 +2952,7 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
 {
 	tid_t target;
 
+	trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait);
 	sb->s_dirt = 0;
 	if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) {
 		if (wait)
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index af4651bf357..42895d36945 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -20,6 +20,7 @@
 #include <linux/time.h>
 #include <linux/fs.h>
 #include <linux/jbd2.h>
+#include <linux/marker.h>
 #include <linux/errno.h>
 #include <linux/slab.h>
 
@@ -328,6 +329,8 @@ int jbd2_log_do_checkpoint(journal_t *journal)
 	 * journal straight away.
 	 */
 	result = jbd2_cleanup_journal_tail(journal);
+	trace_mark(jbd2_checkpoint, "dev %s need_checkpoint %d",
+		   journal->j_devname, result);
 	jbd_debug(1, "cleanup_journal_tail returned %d\n", result);
 	if (result <= 0)
 		return result;
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index b091e5378fe..e91f051a985 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -16,6 +16,7 @@
 #include <linux/time.h>
 #include <linux/fs.h>
 #include <linux/jbd2.h>
+#include <linux/marker.h>
 #include <linux/errno.h>
 #include <linux/slab.h>
 #include <linux/mm.h>
@@ -368,6 +369,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	commit_transaction = journal->j_running_transaction;
 	J_ASSERT(commit_transaction->t_state == T_RUNNING);
 
+	trace_mark(jbd2_start_commit, "dev %s transaction %d",
+		   journal->j_devname, commit_transaction->t_tid);
 	jbd_debug(1, "JBD: starting commit of transaction %d\n",
 			commit_transaction->t_tid);
 
@@ -985,6 +988,9 @@ restart_loop:
 	}
 	spin_unlock(&journal->j_list_lock);
 
+	trace_mark(jbd2_end_commit, "dev %s transaction %d head %d",
+		   journal->j_devname, commit_transaction->t_tid,
+		   journal->j_tail_sequence);
 	jbd_debug(1, "JBD: commit %d complete, head %d\n",
 		  journal->j_commit_sequence, journal->j_tail_sequence);
 
-- 
cgit v1.2.3-70-g09d2


From 45a90bfd90c1215bf824c0f705b409723f52361b Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 6 Oct 2008 12:04:02 -0400
Subject: jbd2: Fix buffer head leak when writing the commit block

Also make sure the buffer heads are marked clean before submitting bh
for writing.  The previous code was marking the buffer head dirty,
which would have forced an unneeded write (and seek) to the journal
for no good reason.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/jbd2/commit.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'fs/jbd2/commit.c')

diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index e91f051a985..0d3814a35ed 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -127,8 +127,7 @@ static int journal_submit_commit_record(journal_t *journal,
 
 	JBUFFER_TRACE(descriptor, "submit commit block");
 	lock_buffer(bh);
-	get_bh(bh);
-	set_buffer_dirty(bh);
+	clear_buffer_dirty(bh);
 	set_buffer_uptodate(bh);
 	bh->b_end_io = journal_end_buffer_io_sync;
 
@@ -158,7 +157,7 @@ static int journal_submit_commit_record(journal_t *journal,
 		/* And try again, without the barrier */
 		lock_buffer(bh);
 		set_buffer_uptodate(bh);
-		set_buffer_dirty(bh);
+		clear_buffer_dirty(bh);
 		ret = submit_bh(WRITE, bh);
 	}
 	*cbh = bh;
-- 
cgit v1.2.3-70-g09d2


From 77e841de8abac4755cc83ca224fdf71418d65380 Mon Sep 17 00:00:00 2001
From: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Date: Sun, 12 Oct 2008 16:39:16 -0400
Subject: jbd2: abort when failed to log metadata buffers

If we failed to write metadata buffers to the journal space and
succeeded to write the commit record, stale data can be written
back to the filesystem as metadata in the recovery phase.

To avoid this, when we failed to write out metadata buffers,
abort the journal before writing the commit record.

We can also avoid this kind of corruption by using the journal
checksum feature because it can detect invalid metadata blocks in the
journal and avoid them from being replayed.  So we don't need to care
about asynchronous commit record writeout with a checksum.

Signed-off-by: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/jbd2/commit.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs/jbd2/commit.c')

diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 0d3814a35ed..78e4da93412 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -783,6 +783,9 @@ wait_for_iobuf:
 		/* AKPM: bforget here */
 	}
 
+	if (err)
+		jbd2_journal_abort(journal, err);
+
 	jbd_debug(3, "JBD: commit phase 5\n");
 
 	if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
-- 
cgit v1.2.3-70-g09d2


From 7ad7445f60fe4d46c4c9d2a9463db180d2a3b270 Mon Sep 17 00:00:00 2001
From: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Date: Fri, 10 Oct 2008 20:29:31 -0400
Subject: jbd2: don't dirty original metadata buffer on abort

Currently, original metadata buffers are dirtied when they are
unfiled whether the journal has aborted or not.  Eventually these
buffers will be written-back to the filesystem by pdflush.  This
means some metadata buffers are written to the filesystem without
journaling if the journal aborts.  So if both journal abort and
system crash happen at the same time, the filesystem would become
inconsistent state.  Additionally, replaying journaled metadata
can overwrite the latest metadata on the filesystem partly.
Because, if the journal gets aborted, journaled metadata are
preserved and replayed during the next mount not to lose
uncheckpointed metadata.  This would also break the consistency
of the filesystem.

This patch prevents original metadata buffers from being dirtied
on abort by clearing BH_JBDDirty flag from those buffers.  Thus,
no metadata buffers are written to the filesystem without journaling.

Signed-off-by: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/jbd2/commit.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'fs/jbd2/commit.c')

diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 78e4da93412..849f10496ce 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -504,9 +504,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 		jh = commit_transaction->t_buffers;
 
 		/* If we're in abort mode, we just un-journal the buffer and
-		   release it for background writing. */
+		   release it. */
 
 		if (is_journal_aborted(journal)) {
+			clear_buffer_jbddirty(jh2bh(jh));
 			JBUFFER_TRACE(jh, "journal is aborting: refile");
 			jbd2_journal_refile_buffer(journal, jh);
 			/* If that was the last one, we need to clean up
@@ -884,6 +885,8 @@ restart_loop:
 		if (buffer_jbddirty(bh)) {
 			JBUFFER_TRACE(jh, "add to new checkpointing trans");
 			__jbd2_journal_insert_checkpoint(jh, commit_transaction);
+			if (is_journal_aborted(journal))
+				clear_buffer_jbddirty(bh);
 			JBUFFER_TRACE(jh, "refile for checkpoint writeback");
 			__jbd2_journal_refile_buffer(jh);
 			jbd_unlock_bh_state(bh);
-- 
cgit v1.2.3-70-g09d2


From 5bf5683a33f3584da6eced480967c4f7e11515a8 Mon Sep 17 00:00:00 2001
From: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Date: Fri, 10 Oct 2008 22:12:43 -0400
Subject: ext4: add an option to control error handling on file data

If the journal doesn't abort when it gets an IO error in file data
blocks, the file data corruption will spread silently.  Because
most of applications and commands do buffered writes without fsync(),
they don't notice the IO error.  It's scary for mission critical
systems.  On the other hand, if the journal aborts whenever it gets
an IO error in file data blocks, the system will easily become
inoperable.  So this patch introduces a filesystem option to
determine whether it aborts the journal or just call printk() when
it gets an IO error in file data.

If you mount an ext4 fs with data_err=abort option, it aborts on file
data write error.  If you mount it with data_err=ignore, it doesn't
abort, just call printk().  data_err=ignore is the default.

Here is the corresponding patch of the ext3 version:
http://kerneltrap.org/mailarchive/linux-kernel/2008/9/9/3239374

Signed-off-by: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 Documentation/filesystems/ext4.txt |  5 +++++
 fs/ext4/ext4.h                     |  2 ++
 fs/ext4/super.c                    | 16 ++++++++++++++++
 fs/jbd2/commit.c                   |  2 ++
 include/linux/jbd2.h               |  3 +++
 5 files changed, 28 insertions(+)

(limited to 'fs/jbd2/commit.c')

diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
index 74484e69640..eb154ef36c2 100644
--- a/Documentation/filesystems/ext4.txt
+++ b/Documentation/filesystems/ext4.txt
@@ -223,6 +223,11 @@ errors=remount-ro(*)	Remount the filesystem read-only on an error.
 errors=continue		Keep going on a filesystem error.
 errors=panic		Panic and halt the machine if an error occurs.
 
+data_err=ignore(*)	Just print an error message if an error occurs
+			in a file data buffer in ordered mode.
+data_err=abort		Abort the journal if an error occurs in a file
+			data buffer in ordered mode.
+
 grpid			Give objects the same group ID as their creator.
 bsdgroups
 
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index f46a513a515..6690a41cdd9 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -540,6 +540,8 @@ do {									       \
 #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT	0x1000000 /* Journal Async Commit */
 #define EXT4_MOUNT_I_VERSION            0x2000000 /* i_version support */
 #define EXT4_MOUNT_DELALLOC		0x8000000 /* Delalloc support */
+#define EXT4_MOUNT_DATA_ERR_ABORT	0x10000000 /* Abort on file data write */
+
 /* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */
 #ifndef _LINUX_EXT2_FS_H
 #define clear_opt(o, opt)		o &= ~EXT4_MOUNT_##opt
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 79bd3989e84..014677b8e22 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -778,6 +778,9 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
 		seq_printf(seq, ",inode_readahead_blks=%u",
 			   sbi->s_inode_readahead_blks);
 
+	if (test_opt(sb, DATA_ERR_ABORT))
+		seq_puts(seq, ",data_err=abort");
+
 	ext4_show_quota_options(seq, sb);
 	return 0;
 }
@@ -907,6 +910,7 @@ enum {
 	Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
 	Opt_journal_checksum, Opt_journal_async_commit,
 	Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
+	Opt_data_err_abort, Opt_data_err_ignore,
 	Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
 	Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
 	Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
@@ -953,6 +957,8 @@ static match_table_t tokens = {
 	{Opt_data_journal, "data=journal"},
 	{Opt_data_ordered, "data=ordered"},
 	{Opt_data_writeback, "data=writeback"},
+	{Opt_data_err_abort, "data_err=abort"},
+	{Opt_data_err_ignore, "data_err=ignore"},
 	{Opt_offusrjquota, "usrjquota="},
 	{Opt_usrjquota, "usrjquota=%s"},
 	{Opt_offgrpjquota, "grpjquota="},
@@ -1187,6 +1193,12 @@ static int parse_options(char *options, struct super_block *sb,
 				sbi->s_mount_opt |= data_opt;
 			}
 			break;
+		case Opt_data_err_abort:
+			set_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
+			break;
+		case Opt_data_err_ignore:
+			clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
+			break;
 #ifdef CONFIG_QUOTA
 		case Opt_usrjquota:
 			qtype = USRQUOTA;
@@ -2535,6 +2547,10 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
 		journal->j_flags |= JBD2_BARRIER;
 	else
 		journal->j_flags &= ~JBD2_BARRIER;
+	if (test_opt(sb, DATA_ERR_ABORT))
+		journal->j_flags |= JBD2_ABORT_ON_SYNCDATA_ERR;
+	else
+		journal->j_flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR;
 	spin_unlock(&journal->j_state_lock);
 }
 
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 849f10496ce..0abe02c4242 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -684,6 +684,8 @@ start_journal_io:
 		printk(KERN_WARNING
 			"JBD2: Detected IO errors while flushing file data "
 		       "on %s\n", journal->j_devname);
+		if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
+			jbd2_journal_abort(journal, err);
 		err = 0;
 	}
 
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index c9e7d781db3..d2e91ea998f 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -967,6 +967,9 @@ struct journal_s
 #define JBD2_FLUSHED	0x008	/* The journal superblock has been flushed */
 #define JBD2_LOADED	0x010	/* The journal superblock has been loaded */
 #define JBD2_BARRIER	0x020	/* Use IDE barriers */
+#define JBD2_ABORT_ON_SYNCDATA_ERR	0x040	/* Abort the journal on file
+						 * data write error in ordered
+						 * mode */
 
 /*
  * Function declarations for the journaling transaction and buffer
-- 
cgit v1.2.3-70-g09d2