summaryrefslogtreecommitdiffstats
path: root/fs/btrfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs')
-rw-r--r--fs/btrfs/backref.c118
-rw-r--r--fs/btrfs/btrfs_inode.h1
-rw-r--r--fs/btrfs/check-integrity.c16
-rw-r--r--fs/btrfs/ctree.c146
-rw-r--r--fs/btrfs/ctree.h11
-rw-r--r--fs/btrfs/delayed-inode.c18
-rw-r--r--fs/btrfs/delayed-inode.h3
-rw-r--r--fs/btrfs/disk-io.c111
-rw-r--r--fs/btrfs/extent-tree.c11
-rw-r--r--fs/btrfs/extent_io.c21
-rw-r--r--fs/btrfs/file.c13
-rw-r--r--fs/btrfs/free-space-cache.c145
-rw-r--r--fs/btrfs/inode.c130
-rw-r--r--fs/btrfs/ioctl.c118
-rw-r--r--fs/btrfs/ioctl.h2
-rw-r--r--fs/btrfs/ordered-data.c22
-rw-r--r--fs/btrfs/rcu-string.h56
-rw-r--r--fs/btrfs/scrub.c30
-rw-r--r--fs/btrfs/super.c37
-rw-r--r--fs/btrfs/transaction.c14
-rw-r--r--fs/btrfs/tree-log.c6
-rw-r--r--fs/btrfs/volumes.c187
-rw-r--r--fs/btrfs/volumes.h5
23 files changed, 791 insertions, 430 deletions
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 3f75895c919..a383c18e74e 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -179,60 +179,74 @@ static int __add_prelim_ref(struct list_head *head, u64 root_id,
static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
struct ulist *parents, int level,
- struct btrfs_key *key, u64 wanted_disk_byte,
+ struct btrfs_key *key_for_search, u64 time_seq,
+ u64 wanted_disk_byte,
const u64 *extent_item_pos)
{
- int ret;
- int slot = path->slots[level];
- struct extent_buffer *eb = path->nodes[level];
+ int ret = 0;
+ int slot;
+ struct extent_buffer *eb;
+ struct btrfs_key key;
struct btrfs_file_extent_item *fi;
struct extent_inode_elem *eie = NULL;
u64 disk_byte;
- u64 wanted_objectid = key->objectid;
-add_parent:
- if (level == 0 && extent_item_pos) {
- fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
- ret = check_extent_in_eb(key, eb, fi, *extent_item_pos, &eie);
+ if (level != 0) {
+ eb = path->nodes[level];
+ ret = ulist_add(parents, eb->start, 0, GFP_NOFS);
if (ret < 0)
return ret;
- }
- ret = ulist_add(parents, eb->start, (unsigned long)eie, GFP_NOFS);
- if (ret < 0)
- return ret;
-
- if (level != 0)
return 0;
+ }
/*
- * if the current leaf is full with EXTENT_DATA items, we must
- * check the next one if that holds a reference as well.
- * ref->count cannot be used to skip this check.
- * repeat this until we don't find any additional EXTENT_DATA items.
+ * We normally enter this function with the path already pointing to
+ * the first item to check. But sometimes, we may enter it with
+ * slot==nritems. In that case, go to the next leaf before we continue.
*/
- while (1) {
- eie = NULL;
- ret = btrfs_next_leaf(root, path);
- if (ret < 0)
- return ret;
- if (ret)
- return 0;
+ if (path->slots[0] >= btrfs_header_nritems(path->nodes[0]))
+ ret = btrfs_next_old_leaf(root, path, time_seq);
+ while (!ret) {
eb = path->nodes[0];
- for (slot = 0; slot < btrfs_header_nritems(eb); ++slot) {
- btrfs_item_key_to_cpu(eb, key, slot);
- if (key->objectid != wanted_objectid ||
- key->type != BTRFS_EXTENT_DATA_KEY)
- return 0;
- fi = btrfs_item_ptr(eb, slot,
- struct btrfs_file_extent_item);
- disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
- if (disk_byte == wanted_disk_byte)
- goto add_parent;
+ slot = path->slots[0];
+
+ btrfs_item_key_to_cpu(eb, &key, slot);
+
+ if (key.objectid != key_for_search->objectid ||
+ key.type != BTRFS_EXTENT_DATA_KEY)
+ break;
+
+ fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
+ disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
+
+ if (disk_byte == wanted_disk_byte) {
+ eie = NULL;
+ if (extent_item_pos) {
+ ret = check_extent_in_eb(&key, eb, fi,
+ *extent_item_pos,
+ &eie);
+ if (ret < 0)
+ break;
+ }
+ if (!ret) {
+ ret = ulist_add(parents, eb->start,
+ (unsigned long)eie, GFP_NOFS);
+ if (ret < 0)
+ break;
+ if (!extent_item_pos) {
+ ret = btrfs_next_old_leaf(root, path,
+ time_seq);
+ continue;
+ }
+ }
}
+ ret = btrfs_next_old_item(root, path, time_seq);
}
- return 0;
+ if (ret > 0)
+ ret = 0;
+ return ret;
}
/*
@@ -249,7 +263,6 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
struct btrfs_path *path;
struct btrfs_root *root;
struct btrfs_key root_key;
- struct btrfs_key key = {0};
struct extent_buffer *eb;
int ret = 0;
int root_level;
@@ -288,25 +301,19 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
goto out;
eb = path->nodes[level];
- if (!eb) {
- WARN_ON(1);
- ret = 1;
- goto out;
- }
-
- if (level == 0) {
- if (ret == 1 && path->slots[0] >= btrfs_header_nritems(eb)) {
- ret = btrfs_next_leaf(root, path);
- if (ret)
- goto out;
- eb = path->nodes[0];
+ while (!eb) {
+ if (!level) {
+ WARN_ON(1);
+ ret = 1;
+ goto out;
}
-
- btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
+ level--;
+ eb = path->nodes[level];
}
- ret = add_all_parents(root, path, parents, level, &key,
- ref->wanted_disk_byte, extent_item_pos);
+ ret = add_all_parents(root, path, parents, level, &ref->key_for_search,
+ time_seq, ref->wanted_disk_byte,
+ extent_item_pos);
out:
btrfs_free_path(path);
return ret;
@@ -832,6 +839,7 @@ again:
}
ret = __add_delayed_refs(head, delayed_ref_seq,
&prefs_delayed);
+ mutex_unlock(&head->mutex);
if (ret) {
spin_unlock(&delayed_refs->lock);
goto out;
@@ -925,8 +933,6 @@ again:
}
out:
- if (head)
- mutex_unlock(&head->mutex);
btrfs_free_path(path);
while (!list_empty(&prefs)) {
ref = list_first_entry(&prefs, struct __prelim_ref, list);
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index e616f8872e6..12394a90d60 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -37,6 +37,7 @@
#define BTRFS_INODE_IN_DEFRAG 3
#define BTRFS_INODE_DELALLOC_META_RESERVED 4
#define BTRFS_INODE_HAS_ORPHAN_ITEM 5
+#define BTRFS_INODE_HAS_ASYNC_EXTENT 6
/* in memory btrfs inode */
struct btrfs_inode {
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 9cebb1fd6a3..da6e9364a5e 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -93,6 +93,7 @@
#include "print-tree.h"
#include "locking.h"
#include "check-integrity.h"
+#include "rcu-string.h"
#define BTRFSIC_BLOCK_HASHTABLE_SIZE 0x10000
#define BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE 0x10000
@@ -843,13 +844,14 @@ static int btrfsic_process_superblock_dev_mirror(
superblock_tmp->never_written = 0;
superblock_tmp->mirror_num = 1 + superblock_mirror_num;
if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
- printk(KERN_INFO "New initial S-block (bdev %p, %s)"
- " @%llu (%s/%llu/%d)\n",
- superblock_bdev, device->name,
- (unsigned long long)dev_bytenr,
- dev_state->name,
- (unsigned long long)dev_bytenr,
- superblock_mirror_num);
+ printk_in_rcu(KERN_INFO "New initial S-block (bdev %p, %s)"
+ " @%llu (%s/%llu/%d)\n",
+ superblock_bdev,
+ rcu_str_deref(device->name),
+ (unsigned long long)dev_bytenr,
+ dev_state->name,
+ (unsigned long long)dev_bytenr,
+ superblock_mirror_num);
list_add(&superblock_tmp->all_blocks_node,
&state->all_blocks_list);
btrfsic_block_hashtable_add(superblock_tmp,
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index d7a96cfdc50..8206b390058 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -467,6 +467,15 @@ static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
return 0;
}
+/*
+ * This allocates memory and gets a tree modification sequence number when
+ * needed.
+ *
+ * Returns 0 when no sequence number is needed, < 0 on error.
+ * Returns 1 when a sequence number was added. In this case,
+ * fs_info->tree_mod_seq_lock was acquired and must be released by the caller
+ * after inserting into the rb tree.
+ */
static inline int tree_mod_alloc(struct btrfs_fs_info *fs_info, gfp_t flags,
struct tree_mod_elem **tm_ret)
{
@@ -491,11 +500,11 @@ static inline int tree_mod_alloc(struct btrfs_fs_info *fs_info, gfp_t flags,
*/
kfree(tm);
seq = 0;
+ spin_unlock(&fs_info->tree_mod_seq_lock);
} else {
__get_tree_mod_seq(fs_info, &tm->elem);
seq = tm->elem.seq;
}
- spin_unlock(&fs_info->tree_mod_seq_lock);
return seq;
}
@@ -521,7 +530,9 @@ tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info,
tm->slot = slot;
tm->generation = btrfs_node_ptr_generation(eb, slot);
- return __tree_mod_log_insert(fs_info, tm);
+ ret = __tree_mod_log_insert(fs_info, tm);
+ spin_unlock(&fs_info->tree_mod_seq_lock);
+ return ret;
}
static noinline int
@@ -559,7 +570,9 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
tm->move.nr_items = nr_items;
tm->op = MOD_LOG_MOVE_KEYS;
- return __tree_mod_log_insert(fs_info, tm);
+ ret = __tree_mod_log_insert(fs_info, tm);
+ spin_unlock(&fs_info->tree_mod_seq_lock);
+ return ret;
}
static noinline int
@@ -580,7 +593,9 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
tm->generation = btrfs_header_generation(old_root);
tm->op = MOD_LOG_ROOT_REPLACE;
- return __tree_mod_log_insert(fs_info, tm);
+ ret = __tree_mod_log_insert(fs_info, tm);
+ spin_unlock(&fs_info->tree_mod_seq_lock);
+ return ret;
}
static struct tree_mod_elem *
@@ -1009,11 +1024,18 @@ __tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
if (!looped && !tm)
return 0;
/*
- * we must have key remove operations in the log before the
- * replace operation.
+ * if there are no tree operation for the oldest root, we simply
+ * return it. this should only happen if that (old) root is at
+ * level 0.
*/
- BUG_ON(!tm);
+ if (!tm)
+ break;
+ /*
+ * if there's an operation that's not a root replacement, we
+ * found the oldest version of our root. normally, we'll find a
+ * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
+ */
if (tm->op != MOD_LOG_ROOT_REPLACE)
break;
@@ -1023,6 +1045,10 @@ __tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
looped = 1;
}
+ /* if there's no old root to return, return what we found instead */
+ if (!found)
+ found = tm;
+
return found;
}
@@ -1068,11 +1094,7 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
tm->generation);
break;
case MOD_LOG_KEY_ADD:
- if (tm->slot != n - 1) {
- o_dst = btrfs_node_key_ptr_offset(tm->slot);
- o_src = btrfs_node_key_ptr_offset(tm->slot + 1);
- memmove_extent_buffer(eb, o_dst, o_src, p_size);
- }
+ /* if a move operation is needed it's in the log */
n--;
break;
case MOD_LOG_MOVE_KEYS:
@@ -1143,45 +1165,57 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
return eb_rewin;
}
+/*
+ * get_old_root() rewinds the state of @root's root node to the given @time_seq
+ * value. If there are no changes, the current root->root_node is returned. If
+ * anything changed in between, there's a fresh buffer allocated on which the
+ * rewind operations are done. In any case, the returned buffer is read locked.
+ * Returns NULL on error (with no locks held).
+ */
static inline struct extent_buffer *
get_old_root(struct btrfs_root *root, u64 time_seq)
{
struct tree_mod_elem *tm;
struct extent_buffer *eb;
- struct tree_mod_root *old_root;
- u64 old_generation;
+ struct tree_mod_root *old_root = NULL;
+ u64 old_generation = 0;
+ u64 logical;
+ eb = btrfs_read_lock_root_node(root);
tm = __tree_mod_log_oldest_root(root->fs_info, root, time_seq);
if (!tm)
return root->node;
- old_root = &tm->old_root;
- old_generation = tm->generation;
-
- tm = tree_mod_log_search(root->fs_info, old_root->logical, time_seq);
- /*
- * there was an item in the log when __tree_mod_log_oldest_root
- * returned. this one must not go away, because the time_seq passed to
- * us must be blocking its removal.
- */
- BUG_ON(!tm);
+ if (tm->op == MOD_LOG_ROOT_REPLACE) {
+ old_root = &tm->old_root;
+ old_generation = tm->generation;
+ logical = old_root->logical;
+ } else {
+ logical = root->node->start;
+ }
- if (old_root->logical == root->node->start) {
- /* there are logged operations for the current root */
+ tm = tree_mod_log_search(root->fs_info, logical, time_seq);
+ if (old_root)
+ eb = alloc_dummy_extent_buffer(logical, root->nodesize);
+ else
eb = btrfs_clone_extent_buffer(root->node);
- } else {
- /* there's a root replace operation for the current root */
- eb = alloc_dummy_extent_buffer(tm->index << PAGE_CACHE_SHIFT,
- root->nodesize);
+ btrfs_tree_read_unlock(root->node);
+ free_extent_buffer(root->node);
+ if (!eb)
+ return NULL;
+ btrfs_tree_read_lock(eb);
+ if (old_root) {
btrfs_set_header_bytenr(eb, eb->start);
btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
btrfs_set_header_owner(eb, root->root_key.objectid);
+ btrfs_set_header_level(eb, old_root->level);
+ btrfs_set_header_generation(eb, old_generation);
}
- if (!eb)
- return NULL;
- btrfs_set_header_level(eb, old_root->level);
- btrfs_set_header_generation(eb, old_generation);
- __tree_mod_log_rewind(eb, time_seq, tm);
+ if (tm)
+ __tree_mod_log_rewind(eb, time_seq, tm);
+ else
+ WARN_ON(btrfs_header_level(eb) != 0);
+ extent_buffer_get(eb);
return eb;
}
@@ -1650,8 +1684,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
return 0;
- btrfs_header_nritems(mid);
-
left = read_node_slot(root, parent, pslot - 1);
if (left) {
btrfs_tree_lock(left);
@@ -1681,7 +1713,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
wret = push_node_left(trans, root, left, mid, 1);
if (wret < 0)
ret = wret;
- btrfs_header_nritems(mid);
}
/*
@@ -2615,9 +2646,7 @@ int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,
again:
b = get_old_root(root, time_seq);
- extent_buffer_get(b);
level = btrfs_header_level(b);
- btrfs_tree_read_lock(b);
p->locks[level] = BTRFS_READ_LOCK;
while (b) {
@@ -2964,7 +2993,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
static void insert_ptr(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct btrfs_path *path,
struct btrfs_disk_key *key, u64 bytenr,
- int slot, int level, int tree_mod_log)
+ int slot, int level)
{
struct extent_buffer *lower;
int nritems;
@@ -2977,7 +3006,7 @@ static void insert_ptr(struct btrfs_trans_handle *trans,
BUG_ON(slot > nritems);
BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(root));
if (slot != nritems) {
- if (tree_mod_log && level)
+ if (level)
tree_mod_log_eb_move(root->fs_info, lower, slot + 1,
slot, nritems - slot);
memmove_extent_buffer(lower,
@@ -2985,7 +3014,7 @@ static void insert_ptr(struct btrfs_trans_handle *trans,
btrfs_node_key_ptr_offset(slot),
(nritems - slot) * sizeof(struct btrfs_key_ptr));
}
- if (tree_mod_log && level) {
+ if (level) {
ret = tree_mod_log_insert_key(root->fs_info, lower, slot,
MOD_LOG_KEY_ADD);
BUG_ON(ret < 0);
@@ -3073,7 +3102,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(split);
insert_ptr(trans, root, path, &disk_key, split->start,
- path->slots[level + 1] + 1, level + 1, 1);
+ path->slots[level + 1] + 1, level + 1);
if (path->slots[level] >= mid) {
path->slots[level] -= mid;
@@ -3610,7 +3639,7 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans,
btrfs_set_header_nritems(l, mid);
btrfs_item_key(right, &disk_key, 0);
insert_ptr(trans, root, path, &disk_key, right->start,
- path->slots[1] + 1, 1, 0);
+ path->slots[1] + 1, 1);
btrfs_mark_buffer_dirty(right);
btrfs_mark_buffer_dirty(l);
@@ -3817,7 +3846,7 @@ again:
if (mid <= slot) {
btrfs_set_header_nritems(right, 0);
insert_ptr(trans, root, path, &disk_key, right->start,
- path->slots[1] + 1, 1, 0);
+ path->slots[1] + 1, 1);
btrfs_tree_unlock(path->nodes[0]);
free_extent_buffer(path->nodes[0]);
path->nodes[0] = right;
@@ -3826,7 +3855,7 @@ again:
} else {
btrfs_set_header_nritems(right, 0);
insert_ptr(trans, root, path, &disk_key, right->start,
- path->slots[1], 1, 0);
+ path->slots[1], 1);
btrfs_tree_unlock(path->nodes[0]);
free_extent_buffer(path->nodes[0]);
path->nodes[0] = right;
@@ -5001,6 +5030,12 @@ next:
*/
int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
{
+ return btrfs_next_old_leaf(root, path, 0);
+}
+
+int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
+ u64 time_seq)
+{
int slot;
int level;
struct extent_buffer *c;
@@ -5025,7 +5060,10 @@ again:
path->keep_locks = 1;
path->leave_spinning = 1;
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (time_seq)
+ ret = btrfs_search_old_slot(root, &key, path, time_seq);
+ else
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
path->keep_locks = 0;
if (ret < 0)
@@ -5081,6 +5119,18 @@ again:
if (!path->skip_locking) {
ret = btrfs_try_tree_read_lock(next);
+ if (!ret && time_seq) {
+ /*
+ * If we don't get the lock, we may be racing
+ * with push_leaf_left, holding that lock while
+ * itself waiting for the leaf we've currently
+ * locked. To solve this situation, we give up
+ * on our lock and cycle.
+ */
+ btrfs_release_path(path);
+ cond_resched();
+ goto again;
+ }
if (!ret) {
btrfs_set_path_blocking(path);
btrfs_tree_read_lock(next);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 0236d03c673..fa5c45b3907 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2753,13 +2753,20 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
}
int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
-static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p)
+int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
+ u64 time_seq);
+static inline int btrfs_next_old_item(struct btrfs_root *root,
+ struct btrfs_path *p, u64 time_seq)
{
++p->slots[0];
if (p->slots[0] >= btrfs_header_nritems(p->nodes[0]))
- return btrfs_next_leaf(root, p);
+ return btrfs_next_old_leaf(root, p, time_seq);
return 0;
}
+static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p)
+{
+ return btrfs_next_old_item(root, p, 0);
+}
int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf);
int __must_check btrfs_drop_snapshot(struct btrfs_root *root,
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index c18d0442ae6..2399f408691 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1879,3 +1879,21 @@ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)
}
}
}
+
+void btrfs_destroy_delayed_inodes(struct btrfs_root *root)
+{
+ struct btrfs_delayed_root *delayed_root;
+ struct btrfs_delayed_node *curr_node, *prev_node;
+
+ delayed_root = btrfs_get_delayed_root(root);
+
+ curr_node = btrfs_first_delayed_node(delayed_root);
+ while (curr_node) {
+ __btrfs_kill_delayed_node(curr_node);
+
+ prev_node = curr_node;
+ curr_node = btrfs_next_delayed_node(curr_node);
+ btrfs_release_delayed_node(prev_node);
+ }
+}
+
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index 7083d08b2a2..f5aa4023d3e 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -124,6 +124,9 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev);
/* Used for drop dead root */
void btrfs_kill_all_delayed_nodes(struct btrfs_root *root);
+/* Used for clean the transaction */
+void btrfs_destroy_delayed_inodes(struct btrfs_root *root);
+
/* Used for readdir() */
void btrfs_get_delayed_items(struct inode *inode, struct list_head *ins_list,
struct list_head *del_list);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 7ae51decf6d..2936ca49b3b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -44,6 +44,7 @@
#include "free-space-cache.h"
#include "inode-map.h"
#include "check-integrity.h"
+#include "rcu-string.h"
static struct extent_io_ops btree_extent_io_ops;
static void end_workqueue_fn(struct btrfs_work *work);
@@ -2118,7 +2119,7 @@ int open_ctree(struct super_block *sb,
features = btrfs_super_incompat_flags(disk_super);
features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
- if (tree_root->fs_info->compress_type & BTRFS_COMPRESS_LZO)
+ if (tree_root->fs_info->compress_type == BTRFS_COMPRESS_LZO)
features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
/*
@@ -2353,12 +2354,17 @@ retry_root_backup:
BTRFS_CSUM_TREE_OBJECTID, csum_root);
if (ret)
goto recovery_tree_root;
-
csum_root->track_dirty = 1;
fs_info->generation = generation;
fs_info->last_trans_committed = generation;
+ ret = btrfs_recover_balance(fs_info);
+ if (ret) {
+ printk(KERN_WARNING "btrfs: failed to recover balance\n");
+ goto fail_block_groups;
+ }
+
ret = btrfs_init_dev_stats(fs_info);
if (ret) {
printk(KERN_ERR "btrfs: failed to init dev_stats: %d\n",
@@ -2484,20 +2490,23 @@ retry_root_backup:
goto fail_trans_kthread;
}
- if (!(sb->s_flags & MS_RDONLY)) {
- down_read(&fs_info->cleanup_work_sem);
- err = btrfs_orphan_cleanup(fs_info->fs_root);
- if (!err)
- err = btrfs_orphan_cleanup(fs_info->tree_root);
- up_read(&fs_info->cleanup_work_sem);
+ if (sb->s_flags & MS_RDONLY)
+ return 0;
- if (!err)
- err = btrfs_recover_balance(fs_info->tree_root);
+ down_read(&fs_info->cleanup_work_sem);
+ if ((ret = btrfs_orphan_cleanup(fs_info->fs_root)) ||
+ (ret = btrfs_orphan_cleanup(fs_info->tree_root))) {
+ up_read(&fs_info->cleanup_work_sem);
+ close_ctree(tree_root);
+ return ret;
+ }
+ up_read(&fs_info->cleanup_work_sem);
- if (err) {
- close_ctree(tree_root);
- return err;
- }
+ ret = btrfs_resume_balance_async(fs_info);
+ if (ret) {
+ printk(KERN_WARNING "btrfs: failed to resume balance\n");
+ close_ctree(tree_root);
+ return ret;
}
return 0;
@@ -2575,8 +2584,9 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
struct btrfs_device *device = (struct btrfs_device *)
bh->b_private;
- printk_ratelimited(KERN_WARNING "lost page write due to "
- "I/O error on %s\n", device->name);
+ printk_ratelimited_in_rcu(KERN_WARNING "lost page write due to "
+ "I/O error on %s\n",
+ rcu_str_deref(device->name));
/* note, we dont' set_buffer_write_io_error because we have
* our own ways of dealing with the IO errors
*/
@@ -2749,8 +2759,8 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
wait_for_completion(&device->flush_wait);
if (bio_flagged(bio, BIO_EOPNOTSUPP)) {
- printk("btrfs: disabling barriers on dev %s\n",
- device->name);
+ printk_in_rcu("btrfs: disabling barriers on dev %s\n",
+ rcu_str_deref(device->name));
device->nobarriers = 1;
}
if (!bio_flagged(bio, BIO_UPTODATE)) {
@@ -3400,7 +3410,6 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
delayed_refs = &trans->delayed_refs;
-again:
spin_lock(&delayed_refs->lock);
if (delayed_refs->num_entries == 0) {
spin_unlock(&delayed_refs->lock);
@@ -3408,31 +3417,37 @@ again:
return ret;
}
- node = rb_first(&delayed_refs->root);
- while (node) {
+ while ((node = rb_first(&delayed_refs->root)) != NULL) {
ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
- node = rb_next(node);
-
- ref->in_tree = 0;
- rb_erase(&ref->rb_node, &delayed_refs->root);
- delayed_refs->num_entries--;
atomic_set(&ref->refs, 1);
if (btrfs_delayed_ref_is_head(ref)) {
struct btrfs_delayed_ref_head *head;
head = btrfs_delayed_node_to_head(ref);
- spin_unlock(&delayed_refs->lock);
- mutex_lock(&head->mutex);
+ if (!mutex_trylock(&head->mutex)) {
+ atomic_inc(&ref->refs);
+ spin_unlock(&delayed_refs->lock);
+
+ /* Need to wait for the delayed ref to run */
+ mutex_lock(&head->mutex);
+ mutex_unlock(&head->mutex);
+ btrfs_put_delayed_ref(ref);
+
+ spin_lock(&delayed_refs->lock);
+ continue;
+ }
+
kfree(head->extent_op);
delayed_refs->num_heads--;
if (list_empty(&head->cluster))
delayed_refs->num_heads_ready--;
list_del_init(&head->cluster);
- mutex_unlock(&head->mutex);
- btrfs_put_delayed_ref(ref);
- goto again;
}
+ ref->in_tree = 0;
+ rb_erase(&ref->rb_node, &delayed_refs->root);
+ delayed_refs->num_entries--;
+
spin_unlock(&delayed_refs->lock);
btrfs_put_delayed_ref(ref);
@@ -3520,11 +3535,9 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root,
&(&BTRFS_I(page->mapping->host)->io_tree)->buffer,
offset >> PAGE_CACHE_SHIFT);
spin_unlock(&dirty_pages->buffer_lock);
- if (eb) {
+ if (eb)
ret = test_and_clear_bit(EXTENT_BUFFER_DIRTY,
&eb->bflags);
- atomic_set(&eb->refs, 1);
- }
if (PageWriteback(page))
end_page_writeback(page);
@@ -3538,8 +3551,8 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root,
spin_unlock_irq(&page->mapping->tree_lock);
}
- page->mapping->a_ops->invalidatepage(page, 0);
unlock_page(page);
+ page_cache_release(page);
}
}
@@ -3553,8 +3566,10 @@ static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
u64 start;
u64 end;
int ret;
+ bool loop = true;
unpin = pinned_extents;
+again:
while (1) {
ret = find_first_extent_bit(unpin, 0, &start, &end,
EXTENT_DIRTY);
@@ -3572,6 +3587,15 @@ static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
cond_resched();
}
+ if (loop) {
+ if (unpin == &root->fs_info->freed_extents[0])
+ unpin = &root->fs_info->freed_extents[1];
+ else
+ unpin = &root->fs_info->freed_extents[0];
+ loop = false;
+ goto again;
+ }
+
return 0;
}
@@ -3585,21 +3609,23 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
/* FIXME: cleanup wait for commit */
cur_trans->in_commit = 1;
cur_trans->blocked = 1;
- if (waitqueue_active(&root->fs_info->transaction_blocked_wait))
- wake_up(&root->fs_info->transaction_blocked_wait);
+ wake_up(&root->fs_info->transaction_blocked_wait);
cur_trans->blocked = 0;
- if (waitqueue_active(&root->fs_info->transaction_wait))
- wake_up(&root->fs_info->transaction_wait);
+ wake_up(&root->fs_info->transaction_wait);
cur_trans->commit_done = 1;
- if (waitqueue_active(&cur_trans->commit_wait))
- wake_up(&cur_trans->commit_wait);
+ wake_up(&cur_trans->commit_wait);
+
+ btrfs_destroy_delayed_inodes(root);
+ btrfs_assert_delayed_root_empty(root);
btrfs_destroy_pending_snapshots(cur_trans);
btrfs_destroy_marked_extents(root, &cur_trans->dirty_pages,
EXTENT_DIRTY);
+ btrfs_destroy_pinned_extent(root,
+ root->fs_info->pinned_extents);
/*
memset(cur_trans, 0, sizeof(*cur_trans));
@@ -3648,6 +3674,9 @@ int btrfs_cleanup_transaction(struct btrfs_root *root)
if (waitqueue_active(&t->commit_wait))
wake_up(&t->commit_wait);
+ btrfs_destroy_delayed_inodes(root);
+ btrfs_assert_delayed_root_empty(root);
+
btrfs_destroy_pending_snapshots(t);
btrfs_destroy_delalloc_inodes(root);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 4b5a1e1bdef..6e1d36702ff 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2347,12 +2347,10 @@ next:
return count;
}
-
static void wait_for_more_refs(struct btrfs_delayed_ref_root *delayed_refs,
- unsigned long num_refs)
+ unsigned long num_refs,
+ struct list_head *first_seq)
{
- struct list_head *first_seq = delayed_refs->seq_head.next;
-
spin_unlock(&delayed_refs->lock);
pr_debug("waiting for more refs (num %ld, first %p)\n",
num_refs, first_seq);
@@ -2381,6 +2379,7 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_delayed_ref_node *ref;
struct list_head cluster;
+ struct list_head *first_seq = NULL;
int ret;
u64 delayed_start;
int run_all = count == (unsigned long)-1;
@@ -2436,8 +2435,10 @@ again:
*/
consider_waiting = 1;
num_refs = delayed_refs->num_entries;
+ first_seq = root->fs_info->tree_mod_seq_list.next;
} else {
- wait_for_more_refs(delayed_refs, num_refs);
+ wait_for_more_refs(delayed_refs,
+ num_refs, first_seq);
/*
* after waiting, things have changed. we
* dropped the lock and someone else might have
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 2c8f7b20461..01c21b6c6d4 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -20,6 +20,7 @@
#include "volumes.h"
#include "check-integrity.h"
#include "locking.h"
+#include "rcu-string.h"
static struct kmem_cache *extent_state_cache;
static struct kmem_cache *extent_buffer_cache;
@@ -1917,9 +1918,9 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
return -EIO;
}
- printk(KERN_INFO "btrfs read error corrected: ino %lu off %llu (dev %s "
- "sector %llu)\n", page->mapping->host->i_ino, start,
- dev->name, sector);
+ printk_in_rcu(KERN_INFO "btrfs read error corrected: ino %lu off %llu "
+ "(dev %s sector %llu)\n", page->mapping->host->i_ino,
+ start, rcu_str_deref(dev->name), sector);
bio_put(bio);
return 0;
@@ -3323,6 +3324,7 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
writepage_t writepage, void *data,
void (*flush_fn)(void *))
{
+ struct inode *inode = mapping->host;
int ret = 0;
int done = 0;
int nr_to_write_done = 0;
@@ -3333,6 +3335,18 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
int scanned = 0;
int tag;
+ /*
+ * We have to hold onto the inode so that ordered extents can do their
+ * work when the IO finishes. The alternative to this is failing to add
+ * an ordered extent if the igrab() fails there and that is a huge pain
+ * to deal with, so instead just hold onto the inode throughout the
+ * writepages operation. If it fails here we are freeing up the inode
+ * anyway and we'd rather not waste our time writing out stuff that is
+ * going to be truncated anyway.
+ */
+ if (!igrab(inode))
+ return 0;
+
pagevec_init(&pvec, 0);
if (wbc->range_cyclic) {
index = mapping->writeback_index; /* Start from prev offset */
@@ -3427,6 +3441,7 @@ retry:
index = 0;
goto retry;
}
+ btrfs_add_delayed_iput(inode);
return ret;
}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 70dc8ca73e2..9aa01ec2138 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1334,7 +1334,6 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb,
loff_t *ppos, size_t count, size_t ocount)
{
struct file *file = iocb->ki_filp;
- struct inode *inode = fdentry(file)->d_inode;
struct iov_iter i;
ssize_t written;
ssize_t written_buffered;
@@ -1344,18 +1343,6 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb,
written = generic_file_direct_write(iocb, iov, &nr_segs, pos, ppos,
count, ocount);
- /*
- * the generic O_DIRECT will update in-memory i_size after the
- * DIOs are done. But our endio handlers that update the on
- * disk i_size never update past the in memory i_size. So we
- * need one more update here to catch any additions to the
- * file
- */
- if (inode->i_size != BTRFS_I(inode)->disk_i_size) {
- btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
- mark_inode_dirty(inode);
- }
-
if (written < 0 || written == count)
return written;
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 81296c57405..6c4e2baa929 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1543,29 +1543,26 @@ again:
end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit) - 1;
/*
- * XXX - this can go away after a few releases.
- *
- * since the only user of btrfs_remove_free_space is the tree logging
- * stuff, and the only way to test that is under crash conditions, we
- * want to have this debug stuff here just in case somethings not
- * working. Search the bitmap for the space we are trying to use to
- * make sure its actually there. If its not there then we need to stop
- * because something has gone wrong.
+ * We need to search for bits in this bitmap. We could only cover some
+ * of the extent in this bitmap thanks to how we add space, so we need
+ * to search for as much as it as we can and clear that amount, and then
+ * go searching for the next bit.
*/
search_start = *offset;
- search_bytes = *bytes;
+ search_bytes = ctl->unit;
search_bytes = min(search_bytes, end - search_start + 1);
ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes);
BUG_ON(ret < 0 || search_start != *offset);
- if (*offset > bitmap_info->offset && *offset + *bytes > end) {
- bitmap_clear_bits(ctl, bitmap_info, *offset, end - *offset + 1);
- *bytes -= end - *offset + 1;
- *offset = end + 1;
- } else if (*offset >= bitmap_info->offset && *offset + *bytes <= end) {
- bitmap_clear_bits(ctl, bitmap_info, *offset, *bytes);
- *bytes = 0;
- }
+ /* We may have found more bits than what we need */
+ search_bytes = min(search_bytes, *bytes);
+
+ /* Cannot clear past the end of the bitmap */
+ search_bytes = min(search_bytes, end - search_start + 1);
+
+ bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes);
+ *offset += search_bytes;
+ *bytes -= search_bytes;
if (*bytes) {
struct rb_node *next = rb_next(&bitmap_info->offset_index);
@@ -1596,7 +1593,7 @@ again:
* everything over again.
*/
search_start = *offset;
- search_bytes = *bytes;
+ search_bytes = ctl->unit;
ret = search_bitmap(ctl, bitmap_info, &search_start,
&search_bytes);
if (ret < 0 || search_start != *offset)
@@ -1879,12 +1876,14 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
{
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
struct btrfs_free_space *info;
- struct btrfs_free_space *next_info = NULL;
int ret = 0;
spin_lock(&ctl->tree_lock);
again:
+ if (!bytes)
+ goto out_lock;
+
info = tree_search_offset(ctl, offset, 0, 0);
if (!info) {
/*
@@ -1905,88 +1904,48 @@ again:
}
}
- if (info->bytes < bytes && rb_next(&info->offset_index)) {
- u64 end;
- next_info = rb_entry(rb_next(&info->offset_index),
- struct btrfs_free_space,
- offset_index);
-
- if (next_info->bitmap)
- end = next_info->offset +
- BITS_PER_BITMAP * ctl->unit - 1;
- else
- end = next_info->offset + next_info->bytes;
-
- if (next_info->bytes < bytes ||
- next_info->offset > offset || offset > end) {
- printk(KERN_CRIT "Found free space at %llu, size %llu,"
- " trying to use %llu\n",
- (unsigned long long)info->offset,
- (unsigned long long)info->bytes,
- (unsigned long long)bytes);
- WARN_ON(1);
- ret = -EINVAL;
- goto out_lock;
- }
-
- info = next_info;
- }
-
- if (info->bytes == bytes) {
+ if (!info->bitmap) {
unlink_free_space(ctl, info);
- if (info->bitmap) {
- kfree(info->bitmap);
- ctl->total_bitmaps--;
- }
- kmem_cache_free(btrfs_free_space_cachep, info);
- ret = 0;
- goto out_lock;
- }
-
- if (!info->bitmap && info->offset == offset) {
- unlink_free_space(ctl, info);
- info->offset += bytes;
- info->bytes -= bytes;
- ret = link_free_space(ctl, info);
- WARN_ON(ret);
- goto out_lock;
- }
+ if (offset == info->offset) {
+ u64 to_free = min(bytes, info->bytes);
+
+ info->bytes -= to_free;
+ info->offset += to_free;
+ if (info->bytes) {
+ ret = link_free_space(ctl, info);
+ WARN_ON(ret);
+ } else {
+ kmem_cache_free(btrfs_free_space_cachep, info);
+ }
- if (!info->bitmap && info->offset <= offset &&
- info->offset + info->bytes >= offset + bytes) {
- u64 old_start = info->offset;
- /*
- * we're freeing space in the middle of the info,
- * this can happen during tree log replay
- *
- * first unlink the old info and then
- * insert it again after the hole we're creating
- */
- unlink_free_space(ctl, info);
- if (offset + bytes < info->offset + info->bytes) {
- u64 old_end = info->offset + info->bytes;
+ offset += to_free;
+ bytes -= to_free;
+ goto again;
+ } else {
+ u64 old_end = info->bytes + info->offset;
- info->offset = offset + bytes;
- info->bytes = old_end - info->offset;
+ info->bytes = offset - info->offset;
ret = link_free_space(ctl, info);
WARN_ON(ret);
if (ret)
goto out_lock;
- } else {
- /* the hole we're creating ends at the end
- * of the info struct, just free the info
- */
- kmem_cache_free(btrfs_free_space_cachep, info);
- }
- spin_unlock(&ctl->tree_lock);
- /* step two, insert a new info struct to cover
- * anything before the hole
- */
- ret = btrfs_add_free_space(block_group, old_start,
- offset - old_start);
- WARN_ON(ret); /* -ENOMEM */
- goto out;
+ /* Not enough bytes in this entry to satisfy us */
+ if (old_end < offset + bytes) {
+ bytes -= old_end - offset;
+ offset = old_end;
+ goto again;
+ } else if (old_end == offset + bytes) {
+ /* all done */
+ goto out_lock;
+ }
+ spin_unlock(&ctl->tree_lock);
+
+ ret = btrfs_add_free_space(block_group, offset + bytes,
+ old_end - (offset + bytes));
+ WARN_ON(ret);
+ goto out;
+ }
}
ret = remove_from_bitmap(ctl, info, &offset, &bytes);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index f6ab6f5e635..a7d1921ac76 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -830,7 +830,7 @@ static noinline int cow_file_range(struct inode *inode,
if (IS_ERR(trans)) {
extent_clear_unlock_delalloc(inode,
&BTRFS_I(inode)->io_tree,
- start, end, NULL,
+ start, end, locked_page,
EXTENT_CLEAR_UNLOCK_PAGE |
EXTENT_CLEAR_UNLOCK |
EXTENT_CLEAR_DELALLOC |
@@ -963,7 +963,7 @@ out:
out_unlock:
extent_clear_unlock_delalloc(inode,
&BTRFS_I(inode)->io_tree,
- start, end, NULL,
+ start, end, locked_page,
EXTENT_CLEAR_UNLOCK_PAGE |
EXTENT_CLEAR_UNLOCK |
EXTENT_CLEAR_DELALLOC |
@@ -986,8 +986,10 @@ static noinline void async_cow_start(struct btrfs_work *work)
compress_file_range(async_cow->inode, async_cow->locked_page,
async_cow->start, async_cow->end, async_cow,
&num_added);
- if (num_added == 0)
+ if (num_added == 0) {
+ btrfs_add_delayed_iput(async_cow->inode);
async_cow->inode = NULL;
+ }
}
/*
@@ -1020,6 +1022,8 @@ static noinline void async_cow_free(struct btrfs_work *work)
{
struct async_cow *async_cow;
async_cow = container_of(work, struct async_cow, work);
+ if (async_cow->inode)
+ btrfs_add_delayed_iput(async_cow->inode);
kfree(async_cow);
}
@@ -1038,7 +1042,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
while (start < end) {
async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
BUG_ON(!async_cow); /* -ENOMEM */
- async_cow->inode = inode;
+ async_cow->inode = igrab(inode);
async_cow->root = root;
async_cow->locked_page = locked_page;
async_cow->start = start;
@@ -1136,8 +1140,18 @@ static noinline int run_delalloc_nocow(struct inode *inode,
u64 ino = btrfs_ino(inode);
path = btrfs_alloc_path();
- if (!path)
+ if (!path) {
+ extent_clear_unlock_delalloc(inode,
+ &BTRFS_I(inode)->io_tree,
+ start, end, locked_page,
+ EXTENT_CLEAR_UNLOCK_PAGE |
+ EXTENT_CLEAR_UNLOCK |
+ EXTENT_CLEAR_DELALLOC |
+ EXTENT_CLEAR_DIRTY |
+ EXTENT_SET_WRITEBACK |
+ EXTENT_END_WRITEBACK);
return -ENOMEM;
+ }
nolock = btrfs_is_free_space_inode(root, inode);
@@ -1147,6 +1161,15 @@ static noinline int run_delalloc_nocow(struct inode *inode,
trans = btrfs_join_transaction(root);
if (IS_ERR(trans)) {
+ extent_clear_unlock_delalloc(inode,
+ &BTRFS_I(inode)->io_tree,
+ start, end, locked_page,
+ EXTENT_CLEAR_UNLOCK_PAGE |
+ EXTENT_CLEAR_UNLOCK |
+ EXTENT_CLEAR_DELALLOC |
+ EXTENT_CLEAR_DIRTY |
+ EXTENT_SET_WRITEBACK |
+ EXTENT_END_WRITEBACK);
btrfs_free_path(path);
return PTR_ERR(trans);
}
@@ -1327,8 +1350,11 @@ out_check:
}
btrfs_release_path(path);
- if (cur_offset <= end && cow_start == (u64)-1)
+ if (cur_offset <= end && cow_start == (u64)-1) {
cow_start = cur_offset;
+ cur_offset = end;
+ }
+
if (cow_start != (u64)-1) {
ret = cow_file_range(inode, locked_page, cow_start, end,
page_started, nr_written, 1);
@@ -1347,6 +1373,17 @@ error:
if (!ret)
ret = err;
+ if (ret && cur_offset < end)
+ extent_clear_unlock_delalloc(inode,
+ &BTRFS_I(inode)->io_tree,
+ cur_offset, end, locked_page,
+ EXTENT_CLEAR_UNLOCK_PAGE |
+ EXTENT_CLEAR_UNLOCK |
+ EXTENT_CLEAR_DELALLOC |
+ EXTENT_CLEAR_DIRTY |
+ EXTENT_SET_WRITEBACK |
+ EXTENT_END_WRITEBACK);
+
btrfs_free_path(path);
return ret;
}
@@ -1361,20 +1398,23 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
int ret;
struct btrfs_root *root = BTRFS_I(inode)->root;
- if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)
+ if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) {
ret = run_delalloc_nocow(inode, locked_page, start, end,
page_started, 1, nr_written);
- else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC)
+ } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC) {
ret = run_delalloc_nocow(inode, locked_page, start, end,
page_started, 0, nr_written);
- else if (!btrfs_test_opt(root, COMPRESS) &&
- !(BTRFS_I(inode)->force_compress) &&
- !(BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))
+ } else if (!btrfs_test_opt(root, COMPRESS) &&
+ !(BTRFS_I(inode)->force_compress) &&
+ !(BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS)) {
ret = cow_file_range(inode, locked_page, start, end,
page_started, nr_written, 1);
- else
+ } else {
+ set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+ &BTRFS_I(inode)->runtime_flags);
ret = cow_file_range_async(inode, locked_page, start, end,
page_started, nr_written);
+ }
return ret;
}
@@ -3714,7 +3754,7 @@ void btrfs_evict_inode(struct inode *inode)
btrfs_wait_ordered_range(inode, 0, (u64)-1);
if (root->fs_info->log_root_recovering) {
- BUG_ON(!test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
+ BUG_ON(test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
&BTRFS_I(inode)->runtime_flags));
goto no_delete;
}
@@ -5836,8 +5876,17 @@ map:
bh_result->b_size = len;
bh_result->b_bdev = em->bdev;
set_buffer_mapped(bh_result);
- if (create && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
- set_buffer_new(bh_result);
+ if (create) {
+ if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ set_buffer_new(bh_result);
+
+ /*
+ * Need to update the i_size under the extent lock so buffered
+ * readers will get the updated i_size when we unlock.
+ */
+ if (start + len > i_size_read(inode))
+ i_size_write(inode, start + len);
+ }
free_extent_map(em);
@@ -6320,12 +6369,48 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
*/
ordered = btrfs_lookup_ordered_range(inode, lockstart,
lockend - lockstart + 1);
- if (!ordered)
+
+ /*
+ * We need to make sure there are no buffered pages in this
+ * range either, we could have raced between the invalidate in
+ * generic_file_direct_write and locking the extent. The
+ * invalidate needs to happen so that reads after a write do not
+ * get stale data.
+ */
+ if (!ordered && (!writing ||
+ !test_range_bit(&BTRFS_I(inode)->io_tree,
+ lockstart, lockend, EXTENT_UPTODATE, 0,
+ cached_state)))
break;
+
unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
&cached_state, GFP_NOFS);
- btrfs_start_ordered_extent(inode, ordered, 1);
- btrfs_put_ordered_extent(ordered);
+
+ if (ordered) {
+ btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_put_ordered_extent(ordered);
+ } else {
+ /* Screw you mmap */
+ ret = filemap_write_and_wait_range(file->f_mapping,
+ lockstart,
+ lockend);
+ if (ret)
+ goto out;
+
+ /*
+ * If we found a page that couldn't be invalidated just
+ * fall back to buffered.
+ */
+ ret = invalidate_inode_pages2_range(file->f_mapping,
+ lockstart >> PAGE_CACHE_SHIFT,
+ lockend >> PAGE_CACHE_SHIFT);
+ if (ret) {
+ if (ret == -EBUSY)
+ ret = 0;
+ goto out;
+ }
+ }
+
cond_resched();
}
@@ -7054,10 +7139,13 @@ static void fixup_inode_flags(struct inode *dir, struct inode *inode)
else
b_inode->flags &= ~BTRFS_INODE_NODATACOW;
- if (b_dir->flags & BTRFS_INODE_COMPRESS)
+ if (b_dir->flags & BTRFS_INODE_COMPRESS) {
b_inode->flags |= BTRFS_INODE_COMPRESS;
- else
- b_inode->flags &= ~BTRFS_INODE_COMPRESS;
+ b_inode->flags &= ~BTRFS_INODE_NOCOMPRESS;
+ } else {
+ b_inode->flags &= ~(BTRFS_INODE_COMPRESS |
+ BTRFS_INODE_NOCOMPRESS);
+ }
}
static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 24b776c08d9..0e92e576300 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -52,6 +52,7 @@
#include "locking.h"
#include "inode-map.h"
#include "backref.h"
+#include "rcu-string.h"
/* Mask out flags that are inappropriate for the given type of inode. */
static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
@@ -785,39 +786,57 @@ none:
return -ENOENT;
}
-/*
- * Validaty check of prev em and next em:
- * 1) no prev/next em
- * 2) prev/next em is an hole/inline extent
- */
-static int check_adjacent_extents(struct inode *inode, struct extent_map *em)
+static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start)
{
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
- struct extent_map *prev = NULL, *next = NULL;
- int ret = 0;
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct extent_map *em;
+ u64 len = PAGE_CACHE_SIZE;
+ /*
+ * hopefully we have this extent in the tree already, try without
+ * the full extent lock
+ */
read_lock(&em_tree->lock);
- prev = lookup_extent_mapping(em_tree, em->start - 1, (u64)-1);
- next = lookup_extent_mapping(em_tree, em->start + em->len, (u64)-1);
+ em = lookup_extent_mapping(em_tree, start, len);
read_unlock(&em_tree->lock);
- if ((!prev || prev->block_start >= EXTENT_MAP_LAST_BYTE) &&
- (!next || next->block_start >= EXTENT_MAP_LAST_BYTE))
- ret = 1;
- free_extent_map(prev);
- free_extent_map(next);
+ if (!em) {
+ /* get the big lock and read metadata off disk */
+ lock_extent(io_tree, start, start + len - 1);
+ em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
+ unlock_extent(io_tree, start, start + len - 1);
+
+ if (IS_ERR(em))
+ return NULL;
+ }
+
+ return em;
+}
+
+static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em)
+{
+ struct extent_map *next;
+ bool ret = true;
+ /* this is the last extent */
+ if (em->start + em->len >= i_size_read(inode))
+ return false;
+
+ next = defrag_lookup_extent(inode, em->start + em->len);
+ if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
+ ret = false;
+
+ free_extent_map(next);
return ret;
}
-static int should_defrag_range(struct inode *inode, u64 start, u64 len,
- int thresh, u64 *last_len, u64 *skip,
- u64 *defrag_end)
+static int should_defrag_range(struct inode *inode, u64 start, int thresh,
+ u64 *last_len, u64 *skip, u64 *defrag_end)
{
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
- struct extent_map *em = NULL;
- struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ struct extent_map *em;
int ret = 1;
+ bool next_mergeable = true;
/*
* make sure that once we start defragging an extent, we keep on
@@ -828,23 +847,9 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len,
*skip = 0;
- /*
- * hopefully we have this extent in the tree already, try without
- * the full extent lock
- */
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, start, len);
- read_unlock(&em_tree->lock);
-
- if (!em) {
- /* get the big lock and read metadata off disk */
- lock_extent(io_tree, start, start + len - 1);
- em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
- unlock_extent(io_tree, start, start + len - 1);
-
- if (IS_ERR(em))
- return 0;
- }
+ em = defrag_lookup_extent(inode, start);
+ if (!em)
+ return 0;
/* this will cover holes, and inline extents */
if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
@@ -852,18 +857,15 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len,
goto out;
}
- /* If we have nothing to merge with us, just skip. */
- if (check_adjacent_extents(inode, em)) {
- ret = 0;
- goto out;
- }
+ next_mergeable = defrag_check_next_extent(inode, em);
/*
- * we hit a real extent, if it is big don't bother defragging it again
+ * we hit a real extent, if it is big or the next extent is not a
+ * real extent, don't bother defragging it
*/
- if ((*last_len == 0 || *last_len >= thresh) && em->len >= thresh)
+ if ((*last_len == 0 || *last_len >= thresh) &&
+ (em->len >= thresh || !next_mergeable))
ret = 0;
-
out:
/*
* last_len ends up being a counter of how many bytes we've defragged.
@@ -1142,8 +1144,8 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
break;
if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT,
- PAGE_CACHE_SIZE, extent_thresh,
- &last_len, &skip, &defrag_end)) {
+ extent_thresh, &last_len, &skip,
+ &defrag_end)) {
unsigned long next;
/*
* the should_defrag function tells us how much to skip
@@ -1304,6 +1306,14 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
ret = -EINVAL;
goto out_free;
}
+ if (device->fs_devices && device->fs_devices->seeding) {
+ printk(KERN_INFO "btrfs: resizer unable to apply on "
+ "seeding device %llu\n",
+ (unsigned long long)devid);
+ ret = -EINVAL;
+ goto out_free;
+ }
+
if (!strcmp(sizestr, "max"))
new_size = device->bdev->bd_inode->i_size;
else {
@@ -1345,8 +1355,9 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
do_div(new_size, root->sectorsize);
new_size *= root->sectorsize;
- printk(KERN_INFO "btrfs: new size for %s is %llu\n",
- device->name, (unsigned long long)new_size);
+ printk_in_rcu(KERN_INFO "btrfs: new size for %s is %llu\n",
+ rcu_str_deref(device->name),
+ (unsigned long long)new_size);
if (new_size > old_size) {
trans = btrfs_start_transaction(root, 0);
@@ -2264,7 +2275,12 @@ static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg)
di_args->total_bytes = dev->total_bytes;
memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid));
if (dev->name) {
- strncpy(di_args->path, dev->name, sizeof(di_args->path));
+ struct rcu_string *name;
+
+ rcu_read_lock();
+ name = rcu_dereference(dev->name);
+ strncpy(di_args->path, name->str, sizeof(di_args->path));
+ rcu_read_unlock();
di_args->path[sizeof(di_args->path) - 1] = 0;
} else {
di_args->path[0] = '\0';
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 497c530724c..e440aa653c3 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -339,7 +339,7 @@ struct btrfs_ioctl_get_dev_stats {
#define BTRFS_IOC_WAIT_SYNC _IOW(BTRFS_IOCTL_MAGIC, 22, __u64)
#define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \
struct btrfs_ioctl_vol_args_v2)
-#define BTRFS_IOC_SUBVOL_GETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 25, __u64)
+#define BTRFS_IOC_SUBVOL_GETFLAGS _IOR(BTRFS_IOCTL_MAGIC, 25, __u64)
#define BTRFS_IOC_SUBVOL_SETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 26, __u64)
#define BTRFS_IOC_SCRUB _IOWR(BTRFS_IOCTL_MAGIC, 27, \
struct btrfs_ioctl_scrub_args)
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 9e138cdc36c..643335a4fe3 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -627,7 +627,27 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
/* start IO across the range first to instantiate any delalloc
* extents
*/
- filemap_write_and_wait_range(inode->i_mapping, start, orig_end);
+ filemap_fdatawrite_range(inode->i_mapping, start, orig_end);
+
+ /*
+ * So with compression we will find and lock a dirty page and clear the
+ * first one as dirty, setup an async extent, and immediately return
+ * with the entire range locked but with nobody actually marked with
+ * writeback. So we can't just filemap_write_and_wait_range() and
+ * expect it to work since it will just kick off a thread to do the
+ * actual work. So we need to call filemap_fdatawrite_range _again_
+ * since it will wait on the page lock, which won't be unlocked until
+ * after the pages have been marked as writeback and so we're good to go
+ * from there. We have to do this otherwise we'll miss the ordered
+ * extents and that results in badness. Please Josef, do not think you
+ * know better and pull this out at some point in the future, it is
+ * right and you are wrong.
+ */
+ if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+ &BTRFS_I(inode)->runtime_flags))
+ filemap_fdatawrite_range(inode->i_mapping, start, orig_end);
+
+ filemap_fdatawait_range(inode->i_mapping, start, orig_end);
end = orig_end;
found = 0;
diff --git a/fs/btrfs/rcu-string.h b/fs/btrfs/rcu-string.h
new file mode 100644
index 00000000000..9e111e4576d
--- /dev/null
+++ b/fs/btrfs/rcu-string.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (C) 2012 Red Hat. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+struct rcu_string {
+ struct rcu_head rcu;
+ char str[0];
+};
+
+static inline struct rcu_string *rcu_string_strdup(const char *src, gfp_t mask)
+{
+ size_t len = strlen(src) + 1;
+ struct rcu_string *ret = kzalloc(sizeof(struct rcu_string) +
+ (len * sizeof(char)), mask);
+ if (!ret)
+ return ret;
+ strncpy(ret->str, src, len);
+ return ret;
+}
+
+static inline void rcu_string_free(struct rcu_string *str)
+{
+ if (str)
+ kfree_rcu(str, rcu);
+}
+
+#define printk_in_rcu(fmt, ...) do { \
+ rcu_read_lock(); \
+ printk(fmt, __VA_ARGS__); \
+ rcu_read_unlock(); \
+} while (0)
+
+#define printk_ratelimited_in_rcu(fmt, ...) do { \
+ rcu_read_lock(); \
+ printk_ratelimited(fmt, __VA_ARGS__); \
+ rcu_read_unlock(); \
+} while (0)
+
+#define rcu_str_deref(rcu_str) ({ \
+ struct rcu_string *__str = rcu_dereference(rcu_str); \
+ __str->str; \
+})
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index a38cfa4f251..b223620cd5a 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -26,6 +26,7 @@
#include "backref.h"
#include "extent_io.h"
#include "check-integrity.h"
+#include "rcu-string.h"
/*
* This is only the first step towards a full-features scrub. It reads all
@@ -320,10 +321,10 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)
* hold all of the paths here
*/
for (i = 0; i < ipath->fspath->elem_cnt; ++i)
- printk(KERN_WARNING "btrfs: %s at logical %llu on dev "
+ printk_in_rcu(KERN_WARNING "btrfs: %s at logical %llu on dev "
"%s, sector %llu, root %llu, inode %llu, offset %llu, "
"length %llu, links %u (path: %s)\n", swarn->errstr,
- swarn->logical, swarn->dev->name,
+ swarn->logical, rcu_str_deref(swarn->dev->name),
(unsigned long long)swarn->sector, root, inum, offset,
min(isize - offset, (u64)PAGE_SIZE), nlink,
(char *)(unsigned long)ipath->fspath->val[i]);
@@ -332,10 +333,10 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)
return 0;
err:
- printk(KERN_WARNING "btrfs: %s at logical %llu on dev "
+ printk_in_rcu(KERN_WARNING "btrfs: %s at logical %llu on dev "
"%s, sector %llu, root %llu, inode %llu, offset %llu: path "
"resolving failed with ret=%d\n", swarn->errstr,
- swarn->logical, swarn->dev->name,
+ swarn->logical, rcu_str_deref(swarn->dev->name),
(unsigned long long)swarn->sector, root, inum, offset, ret);
free_ipath(ipath);
@@ -390,10 +391,11 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
do {
ret = tree_backref_for_extent(&ptr, eb, ei, item_size,
&ref_root, &ref_level);
- printk(KERN_WARNING
+ printk_in_rcu(KERN_WARNING
"btrfs: %s at logical %llu on dev %s, "
"sector %llu: metadata %s (level %d) in tree "
- "%llu\n", errstr, swarn.logical, dev->name,
+ "%llu\n", errstr, swarn.logical,
+ rcu_str_deref(dev->name),
(unsigned long long)swarn.sector,
ref_level ? "node" : "leaf",
ret < 0 ? -1 : ref_level,
@@ -580,9 +582,11 @@ out:
spin_lock(&sdev->stat_lock);
++sdev->stat.uncorrectable_errors;
spin_unlock(&sdev->stat_lock);
- printk_ratelimited(KERN_ERR
+
+ printk_ratelimited_in_rcu(KERN_ERR
"btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n",
- (unsigned long long)fixup->logical, sdev->dev->name);
+ (unsigned long long)fixup->logical,
+ rcu_str_deref(sdev->dev->name));
}
btrfs_free_path(path);
@@ -936,18 +940,20 @@ corrected_error:
spin_lock(&sdev->stat_lock);
sdev->stat.corrected_errors++;
spin_unlock(&sdev->stat_lock);
- printk_ratelimited(KERN_ERR
+ printk_ratelimited_in_rcu(KERN_ERR
"btrfs: fixed up error at logical %llu on dev %s\n",
- (unsigned long long)logical, sdev->dev->name);
+ (unsigned long long)logical,
+ rcu_str_deref(sdev->dev->name));
}
} else {
did_not_correct_error:
spin_lock(&sdev->stat_lock);
sdev->stat.uncorrectable_errors++;
spin_unlock(&sdev->stat_lock);
- printk_ratelimited(KERN_ERR
+ printk_ratelimited_in_rcu(KERN_ERR
"btrfs: unable to fixup (regular) error at logical %llu on dev %s\n",
- (unsigned long long)logical, sdev->dev->name);
+ (unsigned long long)logical,
+ rcu_str_deref(sdev->dev->name));
}
out:
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 96eb9fef7bd..e23991574fd 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -54,6 +54,7 @@
#include "version.h"
#include "export.h"
#include "compression.h"
+#include "rcu-string.h"
#define CREATE_TRACE_POINTS
#include <trace/events/btrfs.h>
@@ -1186,6 +1187,10 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
if (ret)
goto restore;
+ ret = btrfs_resume_balance_async(fs_info);
+ if (ret)
+ goto restore;
+
sb->s_flags &= ~MS_RDONLY;
}
@@ -1482,12 +1487,44 @@ static void btrfs_fs_dirty_inode(struct inode *inode, int flags)
"error %d\n", btrfs_ino(inode), ret);
}
+static int btrfs_show_devname(struct seq_file *m, struct dentry *root)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(root->d_sb);
+ struct btrfs_fs_devices *cur_devices;
+ struct btrfs_device *dev, *first_dev = NULL;
+ struct list_head *head;
+ struct rcu_string *name;
+
+ mutex_lock(&fs_info->fs_devices->device_list_mutex);
+ cur_devices = fs_info->fs_devices;
+ while (cur_devices) {
+ head = &cur_devices->devices;
+ list_for_each_entry(dev, head, dev_list) {
+ if (!first_dev || dev->devid < first_dev->devid)
+ first_dev = dev;
+ }
+ cur_devices = cur_devices->seed;
+ }
+
+ if (first_dev) {
+ rcu_read_lock();
+ name = rcu_dereference(first_dev->name);
+ seq_escape(m, name->str, " \t\n\\");
+ rcu_read_unlock();
+ } else {
+ WARN_ON(1);
+ }
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ return 0;
+}
+
static const struct super_operations btrfs_super_ops = {
.drop_inode = btrfs_drop_inode,
.evict_inode = btrfs_evict_inode,
.put_super = btrfs_put_super,
.sync_fs = btrfs_sync_fs,
.show_options = btrfs_show_options,
+ .show_devname = btrfs_show_devname,
.write_inode = btrfs_write_inode,
.dirty_inode = btrfs_fs_dirty_inode,
.alloc_inode = btrfs_alloc_inode,
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 1791c6e3d83..b72b068183e 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -100,6 +100,10 @@ loop:
kmem_cache_free(btrfs_transaction_cachep, cur_trans);
cur_trans = fs_info->running_transaction;
goto loop;
+ } else if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+ spin_unlock(&root->fs_info->trans_lock);
+ kmem_cache_free(btrfs_transaction_cachep, cur_trans);
+ return -EROFS;
}
atomic_set(&cur_trans->num_writers, 1);
@@ -1213,14 +1217,20 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
static void cleanup_transaction(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
+ struct btrfs_root *root, int err)
{
struct btrfs_transaction *cur_trans = trans->transaction;
WARN_ON(trans->use_count > 1);
+ btrfs_abort_transaction(trans, root, err);
+
spin_lock(&root->fs_info->trans_lock);
list_del_init(&cur_trans->list);
+ if (cur_trans == root->fs_info->running_transaction) {
+ root->fs_info->running_transaction = NULL;
+ root->fs_info->trans_no_join = 0;
+ }
spin_unlock(&root->fs_info->trans_lock);
btrfs_cleanup_one_transaction(trans->transaction, root);
@@ -1526,7 +1536,7 @@ cleanup_transaction:
// WARN_ON(1);
if (current->journal_info == trans)
current->journal_info = NULL;
- cleanup_transaction(trans, root);
+ cleanup_transaction(trans, root, ret);
return ret;
}
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 2017d0ff511..8abeae4224f 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -690,6 +690,8 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
kfree(name);
iput(inode);
+
+ btrfs_run_delayed_items(trans, root);
return ret;
}
@@ -895,6 +897,7 @@ again:
ret = btrfs_unlink_inode(trans, root, dir,
inode, victim_name,
victim_name_len);
+ btrfs_run_delayed_items(trans, root);
}
kfree(victim_name);
ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
@@ -1475,6 +1478,9 @@ again:
ret = btrfs_unlink_inode(trans, root, dir, inode,
name, name_len);
BUG_ON(ret);
+
+ btrfs_run_delayed_items(trans, root);
+
kfree(name);
iput(inode);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 7782020996f..ecaad40e7ef 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -35,6 +35,7 @@
#include "volumes.h"
#include "async-thread.h"
#include "check-integrity.h"
+#include "rcu-string.h"
static int init_first_rw_device(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -64,7 +65,7 @@ static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
device = list_entry(fs_devices->devices.next,
struct btrfs_device, dev_list);
list_del(&device->dev_list);
- kfree(device->name);
+ rcu_string_free(device->name);
kfree(device);
}
kfree(fs_devices);
@@ -334,8 +335,8 @@ static noinline int device_list_add(const char *path,
{
struct btrfs_device *device;
struct btrfs_fs_devices *fs_devices;
+ struct rcu_string *name;
u64 found_transid = btrfs_super_generation(disk_super);
- char *name;
fs_devices = find_fsid(disk_super->fsid);
if (!fs_devices) {
@@ -369,11 +370,13 @@ static noinline int device_list_add(const char *path,
memcpy(device->uuid, disk_super->dev_item.uuid,
BTRFS_UUID_SIZE);
spin_lock_init(&device->io_lock);
- device->name = kstrdup(path, GFP_NOFS);
- if (!device->name) {
+
+ name = rcu_string_strdup(path, GFP_NOFS);
+ if (!name) {
kfree(device);
return -ENOMEM;
}
+ rcu_assign_pointer(device->name, name);
INIT_LIST_HEAD(&device->dev_alloc_list);
/* init readahead state */
@@ -390,12 +393,12 @@ static noinline int device_list_add(const char *path,
device->fs_devices = fs_devices;
fs_devices->num_devices++;
- } else if (!device->name || strcmp(device->name, path)) {
- name = kstrdup(path, GFP_NOFS);
+ } else if (!device->name || strcmp(device->name->str, path)) {
+ name = rcu_string_strdup(path, GFP_NOFS);
if (!name)
return -ENOMEM;
- kfree(device->name);
- device->name = name;
+ rcu_string_free(device->name);
+ rcu_assign_pointer(device->name, name);
if (device->missing) {
fs_devices->missing_devices--;
device->missing = 0;
@@ -430,15 +433,22 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
/* We have held the volume lock, it is safe to get the devices. */
list_for_each_entry(orig_dev, &orig->devices, dev_list) {
+ struct rcu_string *name;
+
device = kzalloc(sizeof(*device), GFP_NOFS);
if (!device)
goto error;
- device->name = kstrdup(orig_dev->name, GFP_NOFS);
- if (!device->name) {
+ /*
+ * This is ok to do without rcu read locked because we hold the
+ * uuid mutex so nothing we touch in here is going to disappear.
+ */
+ name = rcu_string_strdup(orig_dev->name->str, GFP_NOFS);
+ if (!name) {
kfree(device);
goto error;
}
+ rcu_assign_pointer(device->name, name);
device->devid = orig_dev->devid;
device->work.func = pending_bios_fn;
@@ -491,7 +501,7 @@ again:
}
list_del_init(&device->dev_list);
fs_devices->num_devices--;
- kfree(device->name);
+ rcu_string_free(device->name);
kfree(device);
}
@@ -516,7 +526,7 @@ static void __free_device(struct work_struct *work)
if (device->bdev)
blkdev_put(device->bdev, device->mode);
- kfree(device->name);
+ rcu_string_free(device->name);
kfree(device);
}
@@ -540,6 +550,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
mutex_lock(&fs_devices->device_list_mutex);
list_for_each_entry(device, &fs_devices->devices, dev_list) {
struct btrfs_device *new_device;
+ struct rcu_string *name;
if (device->bdev)
fs_devices->open_devices--;
@@ -555,8 +566,11 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
new_device = kmalloc(sizeof(*new_device), GFP_NOFS);
BUG_ON(!new_device); /* -ENOMEM */
memcpy(new_device, device, sizeof(*new_device));
- new_device->name = kstrdup(device->name, GFP_NOFS);
- BUG_ON(device->name && !new_device->name); /* -ENOMEM */
+
+ /* Safe because we are under uuid_mutex */
+ name = rcu_string_strdup(device->name->str, GFP_NOFS);
+ BUG_ON(device->name && !name); /* -ENOMEM */
+ rcu_assign_pointer(new_device->name, name);
new_device->bdev = NULL;
new_device->writeable = 0;
new_device->in_fs_metadata = 0;
@@ -621,9 +635,9 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
if (!device->name)
continue;
- bdev = blkdev_get_by_path(device->name, flags, holder);
+ bdev = blkdev_get_by_path(device->name->str, flags, holder);
if (IS_ERR(bdev)) {
- printk(KERN_INFO "open %s failed\n", device->name);
+ printk(KERN_INFO "open %s failed\n", device->name->str);
goto error;
}
filemap_write_and_wait(bdev->bd_inode->i_mapping);
@@ -1632,6 +1646,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
struct block_device *bdev;
struct list_head *devices;
struct super_block *sb = root->fs_info->sb;
+ struct rcu_string *name;
u64 total_bytes;
int seeding_dev = 0;
int ret = 0;
@@ -1671,23 +1686,24 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
goto error;
}
- device->name = kstrdup(device_path, GFP_NOFS);
- if (!device->name) {
+ name = rcu_string_strdup(device_path, GFP_NOFS);
+ if (!name) {
kfree(device);
ret = -ENOMEM;
goto error;
}
+ rcu_assign_pointer(device->name, name);
ret = find_next_devid(root, &device->devid);
if (ret) {
- kfree(device->name);
+ rcu_string_free(device->name);
kfree(device);
goto error;
}
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
- kfree(device->name);
+ rcu_string_free(device->name);
kfree(device);
ret = PTR_ERR(trans);
goto error;
@@ -1796,7 +1812,7 @@ error_trans:
unlock_chunks(root);
btrfs_abort_transaction(trans, root, ret);
btrfs_end_transaction(trans, root);
- kfree(device->name);
+ rcu_string_free(device->name);
kfree(device);
error:
blkdev_put(bdev, FMODE_EXCL);
@@ -2829,31 +2845,48 @@ out:
static int balance_kthread(void *data)
{
- struct btrfs_balance_control *bctl =
- (struct btrfs_balance_control *)data;
- struct btrfs_fs_info *fs_info = bctl->fs_info;
+ struct btrfs_fs_info *fs_info = data;
int ret = 0;
mutex_lock(&fs_info->volume_mutex);
mutex_lock(&fs_info->balance_mutex);
- set_balance_control(bctl);
-
- if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) {
- printk(KERN_INFO "btrfs: force skipping balance\n");
- } else {
+ if (fs_info->balance_ctl) {
printk(KERN_INFO "btrfs: continuing balance\n");
- ret = btrfs_balance(bctl, NULL);
+ ret = btrfs_balance(fs_info->balance_ctl, NULL);
}
mutex_unlock(&fs_info->balance_mutex);
mutex_unlock(&fs_info->volume_mutex);
+
return ret;
}
-int btrfs_recover_balance(struct btrfs_root *tree_root)
+int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
{
struct task_struct *tsk;
+
+ spin_lock(&fs_info->balance_lock);
+ if (!fs_info->balance_ctl) {
+ spin_unlock(&fs_info->balance_lock);
+ return 0;
+ }
+ spin_unlock(&fs_info->balance_lock);
+
+ if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) {
+ printk(KERN_INFO "btrfs: force skipping balance\n");
+ return 0;
+ }
+
+ tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
+ if (IS_ERR(tsk))
+ return PTR_ERR(tsk);
+
+ return 0;
+}
+
+int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
+{
struct btrfs_balance_control *bctl;
struct btrfs_balance_item *item;
struct btrfs_disk_balance_args disk_bargs;
@@ -2866,29 +2899,30 @@ int btrfs_recover_balance(struct btrfs_root *tree_root)
if (!path)
return -ENOMEM;
- bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
- if (!bctl) {
- ret = -ENOMEM;
- goto out;
- }
-
key.objectid = BTRFS_BALANCE_OBJECTID;
key.type = BTRFS_BALANCE_ITEM_KEY;
key.offset = 0;
- ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
+ ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
if (ret < 0)
- goto out_bctl;
+ goto out;
if (ret > 0) { /* ret = -ENOENT; */
ret = 0;
- goto out_bctl;
+ goto out;
+ }
+
+ bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
+ if (!bctl) {
+ ret = -ENOMEM;
+ goto out;
}
leaf = path->nodes[0];
item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
- bctl->fs_info = tree_root->fs_info;
- bctl->flags = btrfs_balance_flags(leaf, item) | BTRFS_BALANCE_RESUME;
+ bctl->fs_info = fs_info;
+ bctl->flags = btrfs_balance_flags(leaf, item);
+ bctl->flags |= BTRFS_BALANCE_RESUME;
btrfs_balance_data(leaf, item, &disk_bargs);
btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
@@ -2897,14 +2931,13 @@ int btrfs_recover_balance(struct btrfs_root *tree_root)
btrfs_balance_sys(leaf, item, &disk_bargs);
btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
- tsk = kthread_run(balance_kthread, bctl, "btrfs-balance");
- if (IS_ERR(tsk))
- ret = PTR_ERR(tsk);
- else
- goto out;
+ mutex_lock(&fs_info->volume_mutex);
+ mutex_lock(&fs_info->balance_mutex);
-out_bctl:
- kfree(bctl);
+ set_balance_control(bctl);
+
+ mutex_unlock(&fs_info->balance_mutex);
+ mutex_unlock(&fs_info->volume_mutex);
out:
btrfs_free_path(path);
return ret;
@@ -4045,16 +4078,18 @@ static void btrfs_end_bio(struct bio *bio, int err)
BUG_ON(stripe_index >= bbio->num_stripes);
dev = bbio->stripes[stripe_index].dev;
- if (bio->bi_rw & WRITE)
- btrfs_dev_stat_inc(dev,
- BTRFS_DEV_STAT_WRITE_ERRS);
- else
- btrfs_dev_stat_inc(dev,
- BTRFS_DEV_STAT_READ_ERRS);
- if ((bio->bi_rw & WRITE_FLUSH) == WRITE_FLUSH)
- btrfs_dev_stat_inc(dev,
- BTRFS_DEV_STAT_FLUSH_ERRS);
- btrfs_dev_stat_print_on_error(dev);
+ if (dev->bdev) {
+ if (bio->bi_rw & WRITE)
+ btrfs_dev_stat_inc(dev,
+ BTRFS_DEV_STAT_WRITE_ERRS);
+ else
+ btrfs_dev_stat_inc(dev,
+ BTRFS_DEV_STAT_READ_ERRS);
+ if ((bio->bi_rw & WRITE_FLUSH) == WRITE_FLUSH)
+ btrfs_dev_stat_inc(dev,
+ BTRFS_DEV_STAT_FLUSH_ERRS);
+ btrfs_dev_stat_print_on_error(dev);
+ }
}
}
@@ -4204,10 +4239,17 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
bio->bi_sector = bbio->stripes[dev_nr].physical >> 9;
dev = bbio->stripes[dev_nr].dev;
if (dev && dev->bdev && (rw != WRITE || dev->writeable)) {
+#ifdef DEBUG
+ struct rcu_string *name;
+
+ rcu_read_lock();
+ name = rcu_dereference(dev->name);
pr_debug("btrfs_map_bio: rw %d, secor=%llu, dev=%lu "
"(%s id %llu), size=%u\n", rw,
(u64)bio->bi_sector, (u_long)dev->bdev->bd_dev,
- dev->name, dev->devid, bio->bi_size);
+ name->str, dev->devid, bio->bi_size);
+ rcu_read_unlock();
+#endif
bio->bi_bdev = dev->bdev;
if (async_submit)
schedule_bio(root, dev, rw, bio);
@@ -4694,8 +4736,9 @@ int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
key.offset = device->devid;
ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
if (ret) {
- printk(KERN_WARNING "btrfs: no dev_stats entry found for device %s (devid %llu) (OK on first mount after mkfs)\n",
- device->name, (unsigned long long)device->devid);
+ printk_in_rcu(KERN_WARNING "btrfs: no dev_stats entry found for device %s (devid %llu) (OK on first mount after mkfs)\n",
+ rcu_str_deref(device->name),
+ (unsigned long long)device->devid);
__btrfs_reset_dev_stats(device);
device->dev_stats_valid = 1;
btrfs_release_path(path);
@@ -4747,8 +4790,8 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
BUG_ON(!path);
ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
if (ret < 0) {
- printk(KERN_WARNING "btrfs: error %d while searching for dev_stats item for device %s!\n",
- ret, device->name);
+ printk_in_rcu(KERN_WARNING "btrfs: error %d while searching for dev_stats item for device %s!\n",
+ ret, rcu_str_deref(device->name));
goto out;
}
@@ -4757,8 +4800,8 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
/* need to delete old one and insert a new one */
ret = btrfs_del_item(trans, dev_root, path);
if (ret != 0) {
- printk(KERN_WARNING "btrfs: delete too small dev_stats item for device %s failed %d!\n",
- device->name, ret);
+ printk_in_rcu(KERN_WARNING "btrfs: delete too small dev_stats item for device %s failed %d!\n",
+ rcu_str_deref(device->name), ret);
goto out;
}
ret = 1;
@@ -4770,8 +4813,8 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
ret = btrfs_insert_empty_item(trans, dev_root, path,
&key, sizeof(*ptr));
if (ret < 0) {
- printk(KERN_WARNING "btrfs: insert dev_stats item for device %s failed %d!\n",
- device->name, ret);
+ printk_in_rcu(KERN_WARNING "btrfs: insert dev_stats item for device %s failed %d!\n",
+ rcu_str_deref(device->name), ret);
goto out;
}
}
@@ -4823,9 +4866,9 @@ void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
{
if (!dev->dev_stats_valid)
return;
- printk_ratelimited(KERN_ERR
+ printk_ratelimited_in_rcu(KERN_ERR
"btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
- dev->name,
+ rcu_str_deref(dev->name),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
@@ -4837,8 +4880,8 @@ void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
{
- printk(KERN_INFO "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
- dev->name,
+ printk_in_rcu(KERN_INFO "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
+ rcu_str_deref(dev->name),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 3406a88ca83..95f6637614d 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -58,7 +58,7 @@ struct btrfs_device {
/* the mode sent to blkdev_get */
fmode_t mode;
- char *name;
+ struct rcu_string *name;
/* the internal btrfs device id */
u64 devid;
@@ -281,7 +281,8 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
int btrfs_init_new_device(struct btrfs_root *root, char *path);
int btrfs_balance(struct btrfs_balance_control *bctl,
struct btrfs_ioctl_balance_args *bargs);
-int btrfs_recover_balance(struct btrfs_root *tree_root);
+int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info);
+int btrfs_recover_balance(struct btrfs_fs_info *fs_info);
int btrfs_pause_balance(struct btrfs_fs_info *fs_info);
int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);