From 9dba8cf128ef98257ca719722280c9634e7e9dc7 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fb.com>
Date: Thu, 6 Nov 2014 10:19:54 -0500
Subject: Btrfs: make sure we wait on logged extents when fsycning two subvols

If we have two fsync()'s race on different subvols one will do all of its work
to get into the log_tree, wait on it's outstanding IO, and then allow the
log_tree to finish it's commit.  The problem is we were just free'ing that
subvols logged extents instead of waiting on them, so whoever lost the race
wouldn't really have their data on disk.  Fix this by waiting properly instead
of freeing the logged extents.  Thanks,

cc: stable@vger.kernel.org
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/tree-log.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/btrfs/tree-log.c')

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 286213cec86..fc715ff31d2 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2600,9 +2600,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	if (atomic_read(&log_root_tree->log_commit[index2])) {
 		blk_finish_plug(&plug);
 		btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
+		btrfs_wait_logged_extents(log, log_transid);
 		wait_log_commit(trans, log_root_tree,
 				root_log_ctx.log_transid);
-		btrfs_free_logged_extents(log, log_transid);
 		mutex_unlock(&log_root_tree->log_mutex);
 		ret = root_log_ctx.log_ret;
 		goto out;
-- 
cgit v1.2.3-70-g09d2


From 50d9aa99bd35c77200e0e3dd7a72274f8304701f Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fb.com>
Date: Fri, 21 Nov 2014 14:52:38 -0500
Subject: Btrfs: make sure logged extents complete in the current transaction
 V3

Liu Bo pointed out that my previous fix would lose the generation update in the
scenario I described.  It is actually much worse than that, we could lose the
entire extent if we lose power right after the transaction commits.  Consider
the following

write extent 0-4k
log extent in log tree
commit transaction
	< power fail happens here
ordered extent completes

We would lose the 0-4k extent because it hasn't updated the actual fs tree, and
the transaction commit will reset the log so it isn't replayed.  If we lose
power before the transaction commit we are save, otherwise we are not.

Fix this by keeping track of all extents we logged in this transaction.  Then
when we go to commit the transaction make sure we wait for all of those ordered
extents to complete before proceeding.  This will make sure that if we lose
power after the transaction commit we still have our data.  This also fixes the
problem of the improperly updated extent generation.  Thanks,

cc: stable@vger.kernel.org
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/disk-io.c      | 20 ++++++++++++++++++++
 fs/btrfs/ordered-data.c |  9 +++++++--
 fs/btrfs/ordered-data.h |  8 +++++++-
 fs/btrfs/transaction.c  | 33 +++++++++++++++++++++++++++++++++
 fs/btrfs/transaction.h  |  2 ++
 fs/btrfs/tree-log.c     |  6 +++---
 6 files changed, 72 insertions(+), 6 deletions(-)

(limited to 'fs/btrfs/tree-log.c')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 7af9a1978a2..6efaee8d773 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -4129,6 +4129,25 @@ again:
 	return 0;
 }
 
+static void btrfs_free_pending_ordered(struct btrfs_transaction *cur_trans,
+				       struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_ordered_extent *ordered;
+
+	spin_lock(&fs_info->trans_lock);
+	while (!list_empty(&cur_trans->pending_ordered)) {
+		ordered = list_first_entry(&cur_trans->pending_ordered,
+					   struct btrfs_ordered_extent,
+					   trans_list);
+		list_del_init(&ordered->trans_list);
+		spin_unlock(&fs_info->trans_lock);
+
+		btrfs_put_ordered_extent(ordered);
+		spin_lock(&fs_info->trans_lock);
+	}
+	spin_unlock(&fs_info->trans_lock);
+}
+
 void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
 				   struct btrfs_root *root)
 {
@@ -4140,6 +4159,7 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
 	cur_trans->state = TRANS_STATE_UNBLOCKED;
 	wake_up(&root->fs_info->transaction_wait);
 
+	btrfs_free_pending_ordered(cur_trans, root->fs_info);
 	btrfs_destroy_delayed_inodes(root);
 	btrfs_assert_delayed_root_empty(root);
 
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 1401b1af4f0..9c28eb4da4d 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -220,6 +220,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
 	INIT_LIST_HEAD(&entry->work_list);
 	init_completion(&entry->completion);
 	INIT_LIST_HEAD(&entry->log_list);
+	INIT_LIST_HEAD(&entry->trans_list);
 
 	trace_btrfs_ordered_extent_add(inode, entry);
 
@@ -443,6 +444,8 @@ void btrfs_get_logged_extents(struct inode *inode,
 		ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node);
 		if (!list_empty(&ordered->log_list))
 			continue;
+		if (test_bit(BTRFS_ORDERED_LOGGED, &ordered->flags))
+			continue;
 		list_add_tail(&ordered->log_list, logged_list);
 		atomic_inc(&ordered->refs);
 	}
@@ -472,7 +475,8 @@ void btrfs_submit_logged_extents(struct list_head *logged_list,
 	spin_unlock_irq(&log->log_extents_lock[index]);
 }
 
-void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid)
+void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *log, u64 transid)
 {
 	struct btrfs_ordered_extent *ordered;
 	int index = transid % 2;
@@ -497,7 +501,8 @@ void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid)
 		wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE,
 						   &ordered->flags));
 
-		btrfs_put_ordered_extent(ordered);
+		if (!test_and_set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags))
+			list_add_tail(&ordered->trans_list, &trans->ordered);
 		spin_lock_irq(&log->log_extents_lock[index]);
 	}
 	spin_unlock_irq(&log->log_extents_lock[index]);
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index d81a274d621..0124bffc775 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -71,6 +71,8 @@ struct btrfs_ordered_sum {
 				       ordered extent */
 #define BTRFS_ORDERED_TRUNCATED 9 /* Set when we have to truncate an extent */
 
+#define BTRFS_ORDERED_LOGGED 10 /* Set when we've waited on this ordered extent
+				 * in the logging code. */
 struct btrfs_ordered_extent {
 	/* logical offset in the file */
 	u64 file_offset;
@@ -121,6 +123,9 @@ struct btrfs_ordered_extent {
 	/* If we need to wait on this to be done */
 	struct list_head log_list;
 
+	/* If the transaction needs to wait on this ordered extent */
+	struct list_head trans_list;
+
 	/* used to wait for the BTRFS_ORDERED_COMPLETE bit */
 	wait_queue_head_t wait;
 
@@ -197,7 +202,8 @@ void btrfs_get_logged_extents(struct inode *inode,
 void btrfs_put_logged_extents(struct list_head *logged_list);
 void btrfs_submit_logged_extents(struct list_head *logged_list,
 				 struct btrfs_root *log);
-void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid);
+void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *log, u64 transid);
 void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid);
 int __init ordered_data_init(void);
 void ordered_data_exit(void);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 16c704b6870..295a135c9c2 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -247,6 +247,7 @@ loop:
 	INIT_LIST_HEAD(&cur_trans->pending_snapshots);
 	INIT_LIST_HEAD(&cur_trans->pending_chunks);
 	INIT_LIST_HEAD(&cur_trans->switch_commits);
+	INIT_LIST_HEAD(&cur_trans->pending_ordered);
 	list_add_tail(&cur_trans->list, &fs_info->trans_list);
 	extent_io_tree_init(&cur_trans->dirty_pages,
 			     fs_info->btree_inode->i_mapping);
@@ -515,6 +516,7 @@ again:
 	h->sync = false;
 	INIT_LIST_HEAD(&h->qgroup_ref_list);
 	INIT_LIST_HEAD(&h->new_bgs);
+	INIT_LIST_HEAD(&h->ordered);
 
 	smp_mb();
 	if (cur_trans->state >= TRANS_STATE_BLOCKED &&
@@ -746,6 +748,12 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 	if (!list_empty(&trans->new_bgs))
 		btrfs_create_pending_block_groups(trans, root);
 
+	if (!list_empty(&trans->ordered)) {
+		spin_lock(&info->trans_lock);
+		list_splice(&trans->ordered, &cur_trans->pending_ordered);
+		spin_unlock(&info->trans_lock);
+	}
+
 	trans->delayed_ref_updates = 0;
 	if (!trans->sync) {
 		must_run_delayed_refs =
@@ -1715,6 +1723,28 @@ static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
 		btrfs_wait_ordered_roots(fs_info, -1);
 }
 
+static inline void
+btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans,
+			   struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_ordered_extent *ordered;
+
+	spin_lock(&fs_info->trans_lock);
+	while (!list_empty(&cur_trans->pending_ordered)) {
+		ordered = list_first_entry(&cur_trans->pending_ordered,
+					   struct btrfs_ordered_extent,
+					   trans_list);
+		list_del_init(&ordered->trans_list);
+		spin_unlock(&fs_info->trans_lock);
+
+		wait_event(ordered->wait, test_bit(BTRFS_ORDERED_COMPLETE,
+						   &ordered->flags));
+		btrfs_put_ordered_extent(ordered);
+		spin_lock(&fs_info->trans_lock);
+	}
+	spin_unlock(&fs_info->trans_lock);
+}
+
 int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root)
 {
@@ -1765,6 +1795,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	}
 
 	spin_lock(&root->fs_info->trans_lock);
+	list_splice(&trans->ordered, &cur_trans->pending_ordered);
 	if (cur_trans->state >= TRANS_STATE_COMMIT_START) {
 		spin_unlock(&root->fs_info->trans_lock);
 		atomic_inc(&cur_trans->use_count);
@@ -1817,6 +1848,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
 	btrfs_wait_delalloc_flush(root->fs_info);
 
+	btrfs_wait_pending_ordered(cur_trans, root->fs_info);
+
 	btrfs_scrub_pause(root);
 	/*
 	 * Ok now we need to make sure to block out any other joins while we
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index b3f5b40aab2..fd400a3668a 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -56,6 +56,7 @@ struct btrfs_transaction {
 	wait_queue_head_t commit_wait;
 	struct list_head pending_snapshots;
 	struct list_head pending_chunks;
+	struct list_head pending_ordered;
 	struct list_head switch_commits;
 	struct btrfs_delayed_ref_root delayed_refs;
 	int aborted;
@@ -105,6 +106,7 @@ struct btrfs_trans_handle {
 	 */
 	struct btrfs_root *root;
 	struct seq_list delayed_ref_elem;
+	struct list_head ordered;
 	struct list_head qgroup_ref_list;
 	struct list_head new_bgs;
 };
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index fc715ff31d2..7d96cc96166 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2600,7 +2600,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	if (atomic_read(&log_root_tree->log_commit[index2])) {
 		blk_finish_plug(&plug);
 		btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
-		btrfs_wait_logged_extents(log, log_transid);
+		btrfs_wait_logged_extents(trans, log, log_transid);
 		wait_log_commit(trans, log_root_tree,
 				root_log_ctx.log_transid);
 		mutex_unlock(&log_root_tree->log_mutex);
@@ -2645,7 +2645,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	btrfs_wait_marked_extents(log_root_tree,
 				  &log_root_tree->dirty_log_pages,
 				  EXTENT_NEW | EXTENT_DIRTY);
-	btrfs_wait_logged_extents(log, log_transid);
+	btrfs_wait_logged_extents(trans, log, log_transid);
 
 	btrfs_set_super_log_root(root->fs_info->super_for_commit,
 				log_root_tree->node->start);
@@ -3766,7 +3766,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
 	fi = btrfs_item_ptr(leaf, path->slots[0],
 			    struct btrfs_file_extent_item);
 
-	btrfs_set_token_file_extent_generation(leaf, fi, em->generation,
+	btrfs_set_token_file_extent_generation(leaf, fi, trans->transid,
 					       &token);
 	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
 		btrfs_set_token_file_extent_type(leaf, fi,
-- 
cgit v1.2.3-70-g09d2


From 5ab5e44a36164f0366a98b47289c868d8fbcb256 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Thu, 13 Nov 2014 16:59:53 +0000
Subject: Btrfs: don't ignore log btree writeback errors

If an error happens during writeback of log btree extents, make sure the
error is returned to the caller (fsync), so that it takes proper action
(commit current transaction) instead of writing a superblock that points
to log btrees with all or some nodes that weren't durably persisted.

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/tree-log.c | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

(limited to 'fs/btrfs/tree-log.c')

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 7d96cc96166..afe483cb27b 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2599,12 +2599,14 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	index2 = root_log_ctx.log_transid % 2;
 	if (atomic_read(&log_root_tree->log_commit[index2])) {
 		blk_finish_plug(&plug);
-		btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
+		ret = btrfs_wait_marked_extents(log, &log->dirty_log_pages,
+						mark);
 		btrfs_wait_logged_extents(trans, log, log_transid);
 		wait_log_commit(trans, log_root_tree,
 				root_log_ctx.log_transid);
 		mutex_unlock(&log_root_tree->log_mutex);
-		ret = root_log_ctx.log_ret;
+		if (!ret)
+			ret = root_log_ctx.log_ret;
 		goto out;
 	}
 	ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid);
@@ -2641,10 +2643,17 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 		mutex_unlock(&log_root_tree->log_mutex);
 		goto out_wake_log_root;
 	}
-	btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
-	btrfs_wait_marked_extents(log_root_tree,
-				  &log_root_tree->dirty_log_pages,
-				  EXTENT_NEW | EXTENT_DIRTY);
+	ret = btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
+	if (!ret)
+		ret = btrfs_wait_marked_extents(log_root_tree,
+						&log_root_tree->dirty_log_pages,
+						EXTENT_NEW | EXTENT_DIRTY);
+	if (ret) {
+		btrfs_set_log_full_commit(root->fs_info, trans);
+		btrfs_free_logged_extents(log, log_transid);
+		mutex_unlock(&log_root_tree->log_mutex);
+		goto out_wake_log_root;
+	}
 	btrfs_wait_logged_extents(trans, log, log_transid);
 
 	btrfs_set_super_log_root(root->fs_info->super_for_commit,
-- 
cgit v1.2.3-70-g09d2


From 0870295b2371673b3563735825ad559409d8cedc Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Thu, 13 Nov 2014 17:00:35 +0000
Subject: Btrfs: collect only the necessary ordered extents on ranged fsync

Instead of collecting all ordered extents from the inode's ordered tree
and then wait for all of them to complete, just collect the ones that
overlap the fsync range.

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/ordered-data.c | 16 +++++++++++++---
 fs/btrfs/ordered-data.h |  4 +++-
 fs/btrfs/tree-log.c     |  2 +-
 3 files changed, 17 insertions(+), 5 deletions(-)

(limited to 'fs/btrfs/tree-log.c')

diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 9c28eb4da4d..534544e08f7 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -432,21 +432,31 @@ out:
 
 /* Needs to either be called under a log transaction or the log_mutex */
 void btrfs_get_logged_extents(struct inode *inode,
-			      struct list_head *logged_list)
+			      struct list_head *logged_list,
+			      const loff_t start,
+			      const loff_t end)
 {
 	struct btrfs_ordered_inode_tree *tree;
 	struct btrfs_ordered_extent *ordered;
 	struct rb_node *n;
+	struct rb_node *prev;
 
 	tree = &BTRFS_I(inode)->ordered_tree;
 	spin_lock_irq(&tree->lock);
-	for (n = rb_first(&tree->tree); n; n = rb_next(n)) {
+	n = __tree_search(&tree->tree, end, &prev);
+	if (!n)
+		n = prev;
+	for (; n; n = rb_prev(n)) {
 		ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node);
+		if (ordered->file_offset > end)
+			continue;
+		if (entry_end(ordered) <= start)
+			break;
 		if (!list_empty(&ordered->log_list))
 			continue;
 		if (test_bit(BTRFS_ORDERED_LOGGED, &ordered->flags))
 			continue;
-		list_add_tail(&ordered->log_list, logged_list);
+		list_add(&ordered->log_list, logged_list);
 		atomic_inc(&ordered->refs);
 	}
 	spin_unlock_irq(&tree->lock);
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 0124bffc775..e96cd4ccd80 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -198,7 +198,9 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
 int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr);
 void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr);
 void btrfs_get_logged_extents(struct inode *inode,
-			      struct list_head *logged_list);
+			      struct list_head *logged_list,
+			      const loff_t start,
+			      const loff_t end);
 void btrfs_put_logged_extents(struct list_head *logged_list);
 void btrfs_submit_logged_extents(struct list_head *logged_list,
 				 struct btrfs_root *log);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index afe483cb27b..3883d0febd8 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3972,7 +3972,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 
 	mutex_lock(&BTRFS_I(inode)->log_mutex);
 
-	btrfs_get_logged_extents(inode, &logged_list);
+	btrfs_get_logged_extents(inode, &logged_list, start, end);
 
 	/*
 	 * a brute force approach to making sure we get the most uptodate
-- 
cgit v1.2.3-70-g09d2


From b38ef71cb102208dffcf4e8524e9d5ec4ec0eaa9 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Thu, 13 Nov 2014 17:01:45 +0000
Subject: Btrfs: ensure ordered extent errors aren't missed on fsync

When doing a fsync with a fast path we have a time window where we can miss
the fact that writeback of some file data failed, and therefore we endup
returning success (0) from fsync when we should return an error.
The steps that lead to this are the following:

1) We start all ordered extents by calling filemap_fdatawrite_range();

2) We do some other work like locking the inode's i_mutex, start a transaction,
   start a log transaction, etc;

3) We enter btrfs_log_inode(), acquire the inode's log_mutex and collect all the
   ordered extents from inode's ordered tree into a list;

4) But by the time we do ordered extent collection, some ordered extents we started
   at step 1) might have already completed with an error, and therefore we didn't
   found them in the ordered tree and had no idea they finished with an error. This
   makes our fsync return success (0) to userspace, but has no bad effects on the log
   like for example insertion of file extent items into the log that point to unwritten
   extents, because the invalid extent maps were removed before the ordered extent
   completed (in inode.c:btrfs_finish_ordered_io).

So after collecting the ordered extents just check if the inode's i_mapping has any
error flags set (AS_EIO or AS_ENOSPC) and leave with an error if it does. Whenever
writeback fails for a page of an ordered extent, we call mapping_set_error (done in
extent_io.c:end_extent_writepage, called by extent_io.c:end_bio_extent_writepage)
that sets one of those error flags in the inode's i_mapping flags.

This change also has the side effect of fixing the issue where for fast fsyncs we
never checked/cleared the error flags from the inode's i_mapping flags, which means
that a full fsync performed after a fast fsync could get such errors that belonged
to the fast fsync - because the full fsync calls btrfs_wait_ordered_range() which
calls filemap_fdatawait_range(), and the later checks for and clears those flags,
while for fast fsyncs we never call filemap_fdatawait_range() or anything else
that checks for and clears the error flags from the inode's i_mapping.

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/ctree.h    |  1 +
 fs/btrfs/inode.c    | 15 +++++++++++++++
 fs/btrfs/tree-log.c | 21 +++++++++++++++++++++
 3 files changed, 37 insertions(+)

(limited to 'fs/btrfs/tree-log.c')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index a9466e34635..49d956b2cf3 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3866,6 +3866,7 @@ int btrfs_prealloc_file_range_trans(struct inode *inode,
 				    struct btrfs_trans_handle *trans, int mode,
 				    u64 start, u64 num_bytes, u64 min_size,
 				    loff_t actual_len, u64 *alloc_hint);
+int btrfs_inode_check_errors(struct inode *inode);
 extern const struct dentry_operations btrfs_dentry_operations;
 
 /* ioctl.c */
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index cb5978a4a27..a5374c2bb94 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -9464,6 +9464,21 @@ out_inode:
 
 }
 
+/* Inspired by filemap_check_errors() */
+int btrfs_inode_check_errors(struct inode *inode)
+{
+	int ret = 0;
+
+	if (test_bit(AS_ENOSPC, &inode->i_mapping->flags) &&
+	    test_and_clear_bit(AS_ENOSPC, &inode->i_mapping->flags))
+		ret = -ENOSPC;
+	if (test_bit(AS_EIO, &inode->i_mapping->flags) &&
+	    test_and_clear_bit(AS_EIO, &inode->i_mapping->flags))
+		ret = -EIO;
+
+	return ret;
+}
+
 static const struct inode_operations btrfs_dir_inode_operations = {
 	.getattr	= btrfs_getattr,
 	.lookup		= btrfs_lookup,
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 3883d0febd8..9a02da16f2b 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3635,6 +3635,12 @@ static int wait_ordered_extents(struct btrfs_trans_handle *trans,
 			    test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)));
 
 		if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) {
+			/*
+			 * Clear the AS_EIO/AS_ENOSPC flags from the inode's
+			 * i_mapping flags, so that the next fsync won't get
+			 * an outdated io error too.
+			 */
+			btrfs_inode_check_errors(inode);
 			*ordered_io_error = true;
 			break;
 		}
@@ -4098,6 +4104,21 @@ log_extents:
 	btrfs_release_path(path);
 	btrfs_release_path(dst_path);
 	if (fast_search) {
+		/*
+		 * Some ordered extents started by fsync might have completed
+		 * before we collected the ordered extents in logged_list, which
+		 * means they're gone, not in our logged_list nor in the inode's
+		 * ordered tree. We want the application/user space to know an
+		 * error happened while attempting to persist file data so that
+		 * it can take proper action. If such error happened, we leave
+		 * without writing to the log tree and the fsync must report the
+		 * file data write error and not commit the current transaction.
+		 */
+		err = btrfs_inode_check_errors(inode);
+		if (err) {
+			ctx->io_err = err;
+			goto out_unlock;
+		}
 		ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
 						&logged_list, ctx);
 		if (ret) {
-- 
cgit v1.2.3-70-g09d2