summary | refs | log | tree | commit | diff | stats
path: root/fs
diff options
context:
space:
mode:
author: Linus Torvalds <torvalds@linux-foundation.org>, 2012-10-10 10:49:20 +0900
committer: Linus Torvalds <torvalds@linux-foundation.org>, 2012-10-10 10:49:20 +0900
commit: 72055425e53540d9d0e59a57ac8c9b8ce77b62d5 (patch)
tree: 8033d7d7bfdf8725eed785d02f7121d201052d2e /fs
parent: fc81c038c2d61d4fcd8150f383fec1ce23087597 (diff)
parent: f46dbe3dee853f8a860f889cb2b7ff4c624f2a7a (diff)
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs
Pull btrfs update from Chris Mason:
 "This is a large pull, with the bulk of the updates coming from:

   - Hole punching

   - send/receive fixes

   - fsync performance

   - Disk format extension allowing more hardlinks inside a single
     directory (btrfs-progs patch required to enable the compat bit for
     this one)

  I'm cooking more unrelated RAID code, but I wanted to make sure this
  original batch makes it in. The largest updates here are relatively
  old and have been in testing for some time."

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs: (121 commits)
  btrfs: init ref_index to zero in add_inode_ref
  Btrfs: remove repeated eb->pages check in, disk-io.c/csum_dirty_buffer
  Btrfs: fix page leakage
  Btrfs: do not warn_on when we cannot alloc a page for an extent buffer
  Btrfs: don't bug on enomem in readpage
  Btrfs: cleanup pages properly when ENOMEM in compression
  Btrfs: make filesystem read-only when submitting barrier fails
  Btrfs: detect corrupted filesystem after write I/O errors
  Btrfs: make compress and nodatacow mount options mutually exclusive
  btrfs: fix message printing
  Btrfs: don't bother committing delayed inode updates when fsyncing
  btrfs: move inline function code to header file
  Btrfs: remove unnecessary IS_ERR in bio_readpage_error()
  btrfs: remove unused function btrfs_insert_some_items()
  Btrfs: don't commit instead of overcommitting
  Btrfs: confirmation of value is added before trace_btrfs_get_extent() is called
  Btrfs: be smarter about dropping things from the tree log
  Btrfs: don't lookup csums for prealloc extents
  Btrfs: cache extent state when writing out dirty metadata pages
  Btrfs: do not hold the file extent leaf locked when adding extent item
  ...
Diffstat (limited to 'fs')
-rw-r--r-- | fs/btrfs/backref.c | 299
-rw-r--r-- | fs/btrfs/backref.h | 10
-rw-r--r-- | fs/btrfs/btrfs_inode.h | 15
-rw-r--r-- | fs/btrfs/check-integrity.c | 16
-rw-r--r-- | fs/btrfs/compression.c | 13
-rw-r--r-- | fs/btrfs/ctree.c | 148
-rw-r--r-- | fs/btrfs/ctree.h | 109
-rw-r--r-- | fs/btrfs/delayed-inode.c | 6
-rw-r--r-- | fs/btrfs/disk-io.c | 230
-rw-r--r-- | fs/btrfs/disk-io.h | 2
-rw-r--r-- | fs/btrfs/extent-tree.c | 376
-rw-r--r-- | fs/btrfs/extent_io.c | 128
-rw-r--r-- | fs/btrfs/extent_io.h | 23
-rw-r--r-- | fs/btrfs/extent_map.c | 55
-rw-r--r-- | fs/btrfs/extent_map.h | 8
-rw-r--r-- | fs/btrfs/file-item.c | 5
-rw-r--r-- | fs/btrfs/file.c | 447
-rw-r--r-- | fs/btrfs/free-space-cache.c | 10
-rw-r--r-- | fs/btrfs/hash.h | 10
-rw-r--r-- | fs/btrfs/inode-item.c | 285
-rw-r--r-- | fs/btrfs/inode.c | 386
-rw-r--r-- | fs/btrfs/ioctl.c | 100
-rw-r--r-- | fs/btrfs/ordered-data.c | 97
-rw-r--r-- | fs/btrfs/ordered-data.h | 12
-rw-r--r-- | fs/btrfs/qgroup.c | 40
-rw-r--r-- | fs/btrfs/relocation.c | 11
-rw-r--r-- | fs/btrfs/root-tree.c | 29
-rw-r--r-- | fs/btrfs/scrub.c | 30
-rw-r--r-- | fs/btrfs/send.c | 915
-rw-r--r-- | fs/btrfs/send.h | 1
-rw-r--r-- | fs/btrfs/super.c | 74
-rw-r--r-- | fs/btrfs/transaction.c | 283
-rw-r--r-- | fs/btrfs/transaction.h | 20
-rw-r--r-- | fs/btrfs/tree-log.c | 889
-rw-r--r-- | fs/btrfs/ulist.c | 7
-rw-r--r-- | fs/btrfs/ulist.h | 9
-rw-r--r-- | fs/btrfs/volumes.c | 73
-rw-r--r-- | fs/btrfs/zlib.c | 8
38 files changed, 3564 insertions, 1615 deletions
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index ff6475f409d..f3187938e08 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -16,6 +16,7 @@
* Boston, MA 021110-1307, USA.
*/
+#include <linux/vmalloc.h>
#include "ctree.h"
#include "disk-io.h"
#include "backref.h"
@@ -231,7 +232,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
}
if (!ret) {
ret = ulist_add(parents, eb->start,
- (unsigned long)eie, GFP_NOFS);
+ (uintptr_t)eie, GFP_NOFS);
if (ret < 0)
break;
if (!extent_item_pos) {
@@ -363,8 +364,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
ULIST_ITER_INIT(&uiter);
node = ulist_next(parents, &uiter);
ref->parent = node ? node->val : 0;
- ref->inode_list =
- node ? (struct extent_inode_elem *)node->aux : 0;
+ ref->inode_list = node ?
+ (struct extent_inode_elem *)(uintptr_t)node->aux : 0;
/* additional parents require new refs being added here */
while ((node = ulist_next(parents, &uiter))) {
@@ -375,8 +376,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
}
memcpy(new_ref, ref, sizeof(*ref));
new_ref->parent = node->val;
- new_ref->inode_list =
- (struct extent_inode_elem *)node->aux;
+ new_ref->inode_list = (struct extent_inode_elem *)
+ (uintptr_t)node->aux;
list_add(&new_ref->list, &ref->list);
}
ulist_reinit(parents);
@@ -914,8 +915,8 @@ again:
free_extent_buffer(eb);
}
ret = ulist_add_merge(refs, ref->parent,
- (unsigned long)ref->inode_list,
- (unsigned long *)&eie, GFP_NOFS);
+ (uintptr_t)ref->inode_list,
+ (u64 *)&eie, GFP_NOFS);
if (!ret && extent_item_pos) {
/*
* we've recorded that parent, so we must extend
@@ -959,7 +960,7 @@ static void free_leaf_list(struct ulist *blocks)
while ((node = ulist_next(blocks, &uiter))) {
if (!node->aux)
continue;
- eie = (struct extent_inode_elem *)node->aux;
+ eie = (struct extent_inode_elem *)(uintptr_t)node->aux;
for (; eie; eie = eie_next) {
eie_next = eie->next;
kfree(eie);
@@ -1108,26 +1109,80 @@ static int inode_ref_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
found_key);
}
-/*
- * this iterates to turn a btrfs_inode_ref into a full filesystem path. elements
- * of the path are separated by '/' and the path is guaranteed to be
- * 0-terminated. the path is only given within the current file system.
- * Therefore, it never starts with a '/'. the caller is responsible to provide
- * "size" bytes in "dest". the dest buffer will be filled backwards. finally,
- * the start point of the resulting string is returned. this pointer is within
- * dest, normally.
- * in case the path buffer would overflow, the pointer is decremented further
- * as if output was written to the buffer, though no more output is actually
- * generated. that way, the caller can determine how much space would be
- * required for the path to fit into the buffer. in that case, the returned
- * value will be smaller than dest. callers must check this!
- */
-char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
- struct btrfs_inode_ref *iref,
+int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
+ u64 start_off, struct btrfs_path *path,
+ struct btrfs_inode_extref **ret_extref,
+ u64 *found_off)
+{
+ int ret, slot;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ struct btrfs_inode_extref *extref;
+ struct extent_buffer *leaf;
+ unsigned long ptr;
+
+ key.objectid = inode_objectid;
+ btrfs_set_key_type(&key, BTRFS_INODE_EXTREF_KEY);
+ key.offset = start_off;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ return ret;
+
+ while (1) {
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ if (slot >= btrfs_header_nritems(leaf)) {
+ /*
+ * If the item at offset is not found,
+ * btrfs_search_slot will point us to the slot
+ * where it should be inserted. In our case
+ * that will be the slot directly before the
+ * next INODE_REF_KEY_V2 item. In the case
+ * that we're pointing to the last slot in a
+ * leaf, we must move one leaf over.
+ */
+ ret = btrfs_next_leaf(root, path);
+ if (ret) {
+ if (ret >= 1)
+ ret = -ENOENT;
+ break;
+ }
+ continue;
+ }
+
+ btrfs_item_key_to_cpu(leaf, &found_key, slot);
+
+ /*
+ * Check that we're still looking at an extended ref key for
+ * this particular objectid. If we have different
+ * objectid or type then there are no more to be found
+ * in the tree and we can exit.
+ */
+ ret = -ENOENT;
+ if (found_key.objectid != inode_objectid)
+ break;
+ if (btrfs_key_type(&found_key) != BTRFS_INODE_EXTREF_KEY)
+ break;
+
+ ret = 0;
+ ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+ extref = (struct btrfs_inode_extref *)ptr;
+ *ret_extref = extref;
+ if (found_off)
+ *found_off = found_key.offset;
+ break;
+ }
+
+ return ret;
+}
+
+static char *ref_to_path(struct btrfs_root *fs_root,
+ struct btrfs_path *path,
+ u32 name_len, unsigned long name_off,
struct extent_buffer *eb_in, u64 parent,
char *dest, u32 size)
{
- u32 len;
int slot;
u64 next_inum;
int ret;
@@ -1135,17 +1190,17 @@ char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
struct extent_buffer *eb = eb_in;
struct btrfs_key found_key;
int leave_spinning = path->leave_spinning;
+ struct btrfs_inode_ref *iref;
if (bytes_left >= 0)
dest[bytes_left] = '\0';
path->leave_spinning = 1;
while (1) {
- len = btrfs_inode_ref_name_len(eb, iref);
- bytes_left -= len;
+ bytes_left -= name_len;
if (bytes_left >= 0)
read_extent_buffer(eb, dest + bytes_left,
- (unsigned long)(iref + 1), len);
+ name_off, name_len);
if (eb != eb_in) {
btrfs_tree_read_unlock_blocking(eb);
free_extent_buffer(eb);
@@ -1155,6 +1210,7 @@ char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
ret = -ENOENT;
if (ret)
break;
+
next_inum = found_key.offset;
/* regular exit ahead */
@@ -1170,8 +1226,11 @@ char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
}
btrfs_release_path(path);
-
iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
+
+ name_len = btrfs_inode_ref_name_len(eb, iref);
+ name_off = (unsigned long)(iref + 1);
+
parent = next_inum;
--bytes_left;
if (bytes_left >= 0)
@@ -1188,12 +1247,39 @@ char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
}
/*
+ * this iterates to turn a btrfs_inode_ref into a full filesystem path. elements
+ * of the path are separated by '/' and the path is guaranteed to be
+ * 0-terminated. the path is only given within the current file system.
+ * Therefore, it never starts with a '/'. the caller is responsible to provide
+ * "size" bytes in "dest". the dest buffer will be filled backwards. finally,
+ * the start point of the resulting string is returned. this pointer is within
+ * dest, normally.
+ * in case the path buffer would overflow, the pointer is decremented further
+ * as if output was written to the buffer, though no more output is actually
+ * generated. that way, the caller can determine how much space would be
+ * required for the path to fit into the buffer. in that case, the returned
+ * value will be smaller than dest. callers must check this!
+ */
+char *btrfs_iref_to_path(struct btrfs_root *fs_root,
+ struct btrfs_path *path,
+ struct btrfs_inode_ref *iref,
+ struct extent_buffer *eb_in, u64 parent,
+ char *dest, u32 size)
+{
+ return ref_to_path(fs_root, path,
+ btrfs_inode_ref_name_len(eb_in, iref),
+ (unsigned long)(iref + 1),
+ eb_in, parent, dest, size);
+}
+
+/*
* this makes the path point to (logical EXTENT_ITEM *)
* returns BTRFS_EXTENT_FLAG_DATA for data, BTRFS_EXTENT_FLAG_TREE_BLOCK for
* tree blocks and <0 on error.
*/
int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
- struct btrfs_path *path, struct btrfs_key *found_key)
+ struct btrfs_path *path, struct btrfs_key *found_key,
+ u64 *flags_ret)
{
int ret;
u64 flags;
@@ -1237,10 +1323,17 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
(unsigned long long)found_key->objectid,
(unsigned long long)found_key->offset,
(unsigned long long)flags, item_size);
- if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
- return BTRFS_EXTENT_FLAG_TREE_BLOCK;
- if (flags & BTRFS_EXTENT_FLAG_DATA)
- return BTRFS_EXTENT_FLAG_DATA;
+
+ WARN_ON(!flags_ret);
+ if (flags_ret) {
+ if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
+ *flags_ret = BTRFS_EXTENT_FLAG_TREE_BLOCK;
+ else if (flags & BTRFS_EXTENT_FLAG_DATA)
+ *flags_ret = BTRFS_EXTENT_FLAG_DATA;
+ else
+ BUG_ON(1);
+ return 0;
+ }
return -EIO;
}
@@ -1404,12 +1497,13 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
ULIST_ITER_INIT(&root_uiter);
while (!ret && (root_node = ulist_next(roots, &root_uiter))) {
pr_debug("root %llu references leaf %llu, data list "
- "%#lx\n", root_node->val, ref_node->val,
- ref_node->aux);
- ret = iterate_leaf_refs(
- (struct extent_inode_elem *)ref_node->aux,
- root_node->val, extent_item_objectid,
- iterate, ctx);
+ "%#llx\n", root_node->val, ref_node->val,
+ (long long)ref_node->aux);
+ ret = iterate_leaf_refs((struct extent_inode_elem *)
+ (uintptr_t)ref_node->aux,
+ root_node->val,
+ extent_item_objectid,
+ iterate, ctx);
}
ulist_free(roots);
roots = NULL;
@@ -1432,15 +1526,15 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
{
int ret;
u64 extent_item_pos;
+ u64 flags = 0;
struct btrfs_key found_key;
int search_commit_root = path->search_commit_root;
- ret = extent_from_logical(fs_info, logical, path,
- &found_key);
+ ret = extent_from_logical(fs_info, logical, path, &found_key, &flags);
btrfs_release_path(path);
if (ret < 0)
return ret;
- if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
+ if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
return -EINVAL;
extent_item_pos = logical - found_key.objectid;
@@ -1451,9 +1545,12 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
return ret;
}
-static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
- struct btrfs_path *path,
- iterate_irefs_t *iterate, void *ctx)
+typedef int (iterate_irefs_t)(u64 parent, u32 name_len, unsigned long name_off,
+ struct extent_buffer *eb, void *ctx);
+
+static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
+ struct btrfs_path *path,
+ iterate_irefs_t *iterate, void *ctx)
{
int ret = 0;
int slot;
@@ -1470,7 +1567,7 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
while (!ret) {
path->leave_spinning = 1;
ret = inode_ref_info(inum, parent ? parent+1 : 0, fs_root, path,
- &found_key);
+ &found_key);
if (ret < 0)
break;
if (ret) {
@@ -1498,7 +1595,8 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
"tree %llu\n", cur,
(unsigned long long)found_key.objectid,
(unsigned long long)fs_root->objectid);
- ret = iterate(parent, iref, eb, ctx);
+ ret = iterate(parent, name_len,
+ (unsigned long)(iref + 1), eb, ctx);
if (ret)
break;
len = sizeof(*iref) + name_len;
@@ -1513,12 +1611,98 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
return ret;
}
+static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root,
+ struct btrfs_path *path,
+ iterate_irefs_t *iterate, void *ctx)
+{
+ int ret;
+ int slot;
+ u64 offset = 0;
+ u64 parent;
+ int found = 0;
+ struct extent_buffer *eb;
+ struct btrfs_inode_extref *extref;
+ struct extent_buffer *leaf;
+ u32 item_size;
+ u32 cur_offset;
+ unsigned long ptr;
+
+ while (1) {
+ ret = btrfs_find_one_extref(fs_root, inum, offset, path, &extref,
+ &offset);
+ if (ret < 0)
+ break;
+ if (ret) {
+ ret = found ? 0 : -ENOENT;
+ break;
+ }
+ ++found;
+
+ slot = path->slots[0];
+ eb = path->nodes[0];
+ /* make sure we can use eb after releasing the path */
+ atomic_inc(&eb->refs);
+
+ btrfs_tree_read_lock(eb);
+ btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+ btrfs_release_path(path);
+
+ leaf = path->nodes[0];
+ item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+ cur_offset = 0;
+
+ while (cur_offset < item_size) {
+ u32 name_len;
+
+ extref = (struct btrfs_inode_extref *)(ptr + cur_offset);
+ parent = btrfs_inode_extref_parent(eb, extref);
+ name_len = btrfs_inode_extref_name_len(eb, extref);
+ ret = iterate(parent, name_len,
+ (unsigned long)&extref->name, eb, ctx);
+ if (ret)
+ break;
+
+ cur_offset += btrfs_inode_extref_name_len(leaf, extref);
+ cur_offset += sizeof(*extref);
+ }
+ btrfs_tree_read_unlock_blocking(eb);
+ free_extent_buffer(eb);
+
+ offset++;
+ }
+
+ btrfs_release_path(path);
+
+ return ret;
+}
+
+static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
+ struct btrfs_path *path, iterate_irefs_t *iterate,
+ void *ctx)
+{
+ int ret;
+ int found_refs = 0;
+
+ ret = iterate_inode_refs(inum, fs_root, path, iterate, ctx);
+ if (!ret)
+ ++found_refs;
+ else if (ret != -ENOENT)
+ return ret;
+
+ ret = iterate_inode_extrefs(inum, fs_root, path, iterate, ctx);
+ if (ret == -ENOENT && found_refs)
+ return 0;
+
+ return ret;
+}
+
/*
* returns 0 if the path could be dumped (probably truncated)
* returns <0 in case of an error
*/
-static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref,
- struct extent_buffer *eb, void *ctx)
+static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off,
+ struct extent_buffer *eb, void *ctx)
{
struct inode_fs_paths *ipath = ctx;
char *fspath;
@@ -1531,20 +1715,17 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref,
ipath->fspath->bytes_left - s_ptr : 0;
fspath_min = (char *)ipath->fspath->val + (i + 1) * s_ptr;
- fspath = btrfs_iref_to_path(ipath->fs_root, ipath->btrfs_path, iref, eb,
- inum, fspath_min, bytes_left);
+ fspath = ref_to_path(ipath->fs_root, ipath->btrfs_path, name_len,
+ name_off, eb, inum, fspath_min,
+ bytes_left);
if (IS_ERR(fspath))
return PTR_ERR(fspath);
if (fspath > fspath_min) {
- pr_debug("path resolved: %s\n", fspath);
ipath->fspath->val[i] = (u64)(unsigned long)fspath;
++ipath->fspath->elem_cnt;
ipath->fspath->bytes_left = fspath - fspath_min;
} else {
- pr_debug("missed path, not enough space. missing bytes: %lu, "
- "constructed so far: %s\n",
- (unsigned long)(fspath_min - fspath), fspath_min);
++ipath->fspath->elem_missed;
ipath->fspath->bytes_missing += fspath_min - fspath;
ipath->fspath->bytes_left = 0;
@@ -1566,7 +1747,7 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref,
int paths_from_inode(u64 inum, struct inode_fs_paths *ipath)
{
return iterate_irefs(inum, ipath->fs_root, ipath->btrfs_path,
- inode_to_path, ipath);
+ inode_to_path, ipath);
}
struct btrfs_data_container *init_data_container(u32 total_bytes)
@@ -1575,7 +1756,7 @@ struct btrfs_data_container *init_data_container(u32 total_bytes)
size_t alloc_bytes;
alloc_bytes = max_t(size_t, total_bytes, sizeof(*data));
- data = kmalloc(alloc_bytes, GFP_NOFS);
+ data = vmalloc(alloc_bytes);
if (!data)
return ERR_PTR(-ENOMEM);
@@ -1626,6 +1807,6 @@ void free_ipath(struct inode_fs_paths *ipath)
{
if (!ipath)
return;
- kfree(ipath->fspath);
+ vfree(ipath->fspath);
kfree(ipath);
}
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 032f4dc7eab..e75533043a5 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -33,14 +33,13 @@ struct inode_fs_paths {
typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root,
void *ctx);
-typedef int (iterate_irefs_t)(u64 parent, struct btrfs_inode_ref *iref,
- struct extent_buffer *eb, void *ctx);
int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
struct btrfs_path *path);
int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
- struct btrfs_path *path, struct btrfs_key *found_key);
+ struct btrfs_path *path, struct btrfs_key *found_key,
+ u64 *flags);
int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
struct btrfs_extent_item *ei, u32 item_size,
@@ -69,4 +68,9 @@ struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
struct btrfs_path *path);
void free_ipath(struct inode_fs_paths *ipath);
+int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
+ u64 start_off, struct btrfs_path *path,
+ struct btrfs_inode_extref **ret_extref,
+ u64 *found_off);
+
#endif
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 5b2ad6bc4fe..ed8ca7ca5ef 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -38,6 +38,7 @@
#define BTRFS_INODE_DELALLOC_META_RESERVED 4
#define BTRFS_INODE_HAS_ORPHAN_ITEM 5
#define BTRFS_INODE_HAS_ASYNC_EXTENT 6
+#define BTRFS_INODE_NEEDS_FULL_SYNC 7
/* in memory btrfs inode */
struct btrfs_inode {
@@ -143,6 +144,9 @@ struct btrfs_inode {
/* flags field from the on disk inode */
u32 flags;
+ /* a local copy of root's last_log_commit */
+ unsigned long last_log_commit;
+
/*
* Counters to keep track of the number of extent item's we may use due
* to delalloc and such. outstanding_extents is the number of extent
@@ -202,15 +206,10 @@ static inline bool btrfs_is_free_space_inode(struct inode *inode)
static inline int btrfs_inode_in_log(struct inode *inode, u64 generation)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
- int ret = 0;
-
- mutex_lock(&root->log_mutex);
if (BTRFS_I(inode)->logged_trans == generation &&
- BTRFS_I(inode)->last_sub_trans <= root->last_log_commit)
- ret = 1;
- mutex_unlock(&root->log_mutex);
- return ret;
+ BTRFS_I(inode)->last_sub_trans <= BTRFS_I(inode)->last_log_commit)
+ return 1;
+ return 0;
}
#endif
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 9197e2e3340..5a3e45db642 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -37,8 +37,9 @@
* the file system was mounted, (i.e., they have been
* referenced by the super block) or they have been
* written since then and the write completion callback
- * was called and a FLUSH request to the device where
- * these blocks are located was received and completed.
+ * was called and no write error was indicated and a
+ * FLUSH request to the device where these blocks are
+ * located was received and completed.
* 2b. All referenced blocks need to have a generation
* number which is equal to the parent's number.
*
@@ -2601,6 +2602,17 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
(unsigned long long)l->block_ref_to->dev_bytenr,
l->block_ref_to->mirror_num);
ret = -1;
+ } else if (l->block_ref_to->iodone_w_error) {
+ printk(KERN_INFO "btrfs: attempt to write superblock"
+ " which references block %c @%llu (%s/%llu/%d)"
+ " which has write error!\n",
+ btrfsic_get_block_type(state, l->block_ref_to),
+ (unsigned long long)
+ l->block_ref_to->logical_bytenr,
+ l->block_ref_to->dev_state->name,
+ (unsigned long long)l->block_ref_to->dev_bytenr,
+ l->block_ref_to->mirror_num);
+ ret = -1;
} else if (l->parent_generation !=
l->block_ref_to->generation &&
BTRFSIC_GENERATION_UNKNOWN !=
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 43d1c5a3a03..c6467aa88be 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -577,6 +577,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
u64 em_start;
struct extent_map *em;
int ret = -ENOMEM;
+ int faili = 0;
u32 *sums;
tree = &BTRFS_I(inode)->io_tree;
@@ -626,9 +627,13 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
for (pg_index = 0; pg_index < nr_pages; pg_index++) {
cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS |
__GFP_HIGHMEM);
- if (!cb->compressed_pages[pg_index])
+ if (!cb->compressed_pages[pg_index]) {
+ faili = pg_index - 1;
+ ret = -ENOMEM;
goto fail2;
+ }
}
+ faili = nr_pages - 1;
cb->nr_pages = nr_pages;
add_ra_bio_pages(inode, em_start + em_len, cb);
@@ -713,8 +718,10 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
return 0;
fail2:
- for (pg_index = 0; pg_index < nr_pages; pg_index++)
- free_page((unsigned long)cb->compressed_pages[pg_index]);
+ while (faili >= 0) {
+ __free_page(cb->compressed_pages[faili]);
+ faili--;
+ }
kfree(cb->compressed_pages);
fail1:
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 6d183f60d63..b3343621100 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -4402,149 +4402,6 @@ void btrfs_extend_item(struct btrfs_trans_handle *trans,
}
/*
- * Given a key and some data, insert items into the tree.
- * This does all the path init required, making room in the tree if needed.
- * Returns the number of keys that were inserted.
- */
-int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct btrfs_key *cpu_key, u32 *data_size,
- int nr)
-{
- struct extent_buffer *leaf;
- struct btrfs_item *item;
- int ret = 0;
- int slot;
- int i;
- u32 nritems;
- u32 total_data = 0;
- u32 total_size = 0;
- unsigned int data_end;
- struct btrfs_disk_key disk_key;
- struct btrfs_key found_key;
- struct btrfs_map_token token;
-
- btrfs_init_map_token(&token);
-
- for (i = 0; i < nr; i++) {
- if (total_size + data_size[i] + sizeof(struct btrfs_item) >
- BTRFS_LEAF_DATA_SIZE(root)) {
- break;
- nr = i;
- }
- total_data += data_size[i];
- total_size += data_size[i] + sizeof(struct btrfs_item);
- }
- BUG_ON(nr == 0);
-
- ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
- if (ret == 0)
- return -EEXIST;
- if (ret < 0)
- goto out;
-
- leaf = path->nodes[0];
-
- nritems = btrfs_header_nritems(leaf);
- data_end = leaf_data_end(root, leaf);
-
- if (btrfs_leaf_free_space(root, leaf) < total_size) {
- for (i = nr; i >= 0; i--) {
- total_data -= data_size[i];
- total_size -= data_size[i] + sizeof(struct btrfs_item);
- if (total_size < btrfs_leaf_free_space(root, leaf))
- break;
- }
- nr = i;
- }
-
- slot = path->slots[0];
- BUG_ON(slot < 0);
-
- if (slot != nritems) {
- unsigned int old_data = btrfs_item_end_nr(leaf, slot);
-
- item = btrfs_item_nr(leaf, slot);
- btrfs_item_key_to_cpu(leaf, &found_key, slot);
-
- /* figure out how many keys we can insert in here */
- total_data = data_size[0];
- for (i = 1; i < nr; i++) {
- if (btrfs_comp_cpu_keys(&found_key, cpu_key + i) <= 0)
- break;
- total_data += data_size[i];
- }
- nr = i;
-
- if (old_data < data_end) {
- btrfs_print_leaf(root, leaf);
- printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
- slot, old_data, data_end);
- BUG_ON(1);
- }
- /*
- * item0..itemN ... dataN.offset..dataN.size .. data0.size
- */
- /* first correct the data pointers */
- for (i = slot; i < nritems; i++) {
- u32 ioff;
-
- item = btrfs_item_nr(leaf, i);
- ioff = btrfs_token_item_offset(leaf, item, &token);
- btrfs_set_token_item_offset(leaf, item,
- ioff - total_data, &token);
- }
- /* shift the items */
- memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
- btrfs_item_nr_offset(slot),
- (nritems - slot) * sizeof(struct btrfs_item));
-
- /* shift the data */
- memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
- data_end - total_data, btrfs_leaf_data(leaf) +
- data_end, old_data - data_end);
- data_end = old_data;
- } else {
- /*
- * this sucks but it has to be done, if we are inserting at
- * the end of the leaf only insert 1 of the items, since we
- * have no way of knowing whats on the next leaf and we'd have
- * to drop our current locks to figure it out
- */
- nr = 1;
- }
-
- /* setup the item for the new data */
- for (i = 0; i < nr; i++) {
- btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
- btrfs_set_item_key(leaf, &disk_key, slot + i);
- item = btrfs_item_nr(leaf, slot + i);
- btrfs_set_token_item_offset(leaf, item,
- data_end - data_size[i], &token);
- data_end -= data_size[i];
- btrfs_set_token_item_size(leaf, item, data_size[i], &token);
- }
- btrfs_set_header_nritems(leaf, nritems + nr);
- btrfs_mark_buffer_dirty(leaf);
-
- ret = 0;
- if (slot == 0) {
- btrfs_cpu_key_to_disk(&disk_key, cpu_key);
- fixup_low_keys(trans, root, path, &disk_key, 1);
- }
-
- if (btrfs_leaf_free_space(root, leaf) < 0) {
- btrfs_print_leaf(root, leaf);
- BUG();
- }
-out:
- if (!ret)
- ret = nr;
- return ret;
-}
-
-/*
* this is a helper for btrfs_insert_empty_items, the main goal here is
* to save stack depth by doing the bulk of the work in a function
* that doesn't call btrfs_search_slot
@@ -5073,6 +4930,7 @@ static void tree_move_down(struct btrfs_root *root,
struct btrfs_path *path,
int *level, int root_level)
{
+ BUG_ON(*level == 0);
path->nodes[*level - 1] = read_node_slot(root, path->nodes[*level],
path->slots[*level]);
path->slots[*level - 1] = 0;
@@ -5089,7 +4947,7 @@ static int tree_move_next_or_upnext(struct btrfs_root *root,
path->slots[*level]++;
- while (path->slots[*level] == nritems) {
+ while (path->slots[*level] >= nritems) {
if (*level == root_level)
return -1;
@@ -5433,9 +5291,11 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
goto out;
advance_right = ADVANCE;
} else {
+ WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
ret = tree_compare_item(left_root, left_path,
right_path, tmp_buf);
if (ret) {
+ WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
ret = changed_cb(left_root, right_root,
left_path, right_path,
&left_key,
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 9821b672f5a..926c9ffc66d 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -154,6 +154,13 @@ struct btrfs_ordered_sum;
*/
#define BTRFS_NAME_LEN 255
+/*
+ * Theoretical limit is larger, but we keep this down to a sane
+ * value. That should limit greatly the possibility of collisions on
+ * inode ref items.
+ */
+#define BTRFS_LINK_MAX 65535U
+
/* 32 bytes in various csum fields */
#define BTRFS_CSUM_SIZE 32
@@ -489,6 +496,8 @@ struct btrfs_super_block {
*/
#define BTRFS_FEATURE_INCOMPAT_BIG_METADATA (1ULL << 5)
+#define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF (1ULL << 6)
+
#define BTRFS_FEATURE_COMPAT_SUPP 0ULL
#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL
#define BTRFS_FEATURE_INCOMPAT_SUPP \
@@ -496,7 +505,8 @@ struct btrfs_super_block {
BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \
BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \
BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \
- BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO)
+ BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \
+ BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
/*
* A leaf is full of items. offset and size tell us where to find
@@ -643,6 +653,14 @@ struct btrfs_inode_ref {
/* name goes here */
} __attribute__ ((__packed__));
+struct btrfs_inode_extref {
+ __le64 parent_objectid;
+ __le64 index;
+ __le16 name_len;
+ __u8 name[0];
+ /* name goes here */
+} __attribute__ ((__packed__));
+
struct btrfs_timespec {
__le64 sec;
__le32 nsec;
@@ -1028,12 +1046,22 @@ struct btrfs_space_info {
wait_queue_head_t wait;
};
+#define BTRFS_BLOCK_RSV_GLOBAL 1
+#define BTRFS_BLOCK_RSV_DELALLOC 2
+#define BTRFS_BLOCK_RSV_TRANS 3
+#define BTRFS_BLOCK_RSV_CHUNK 4
+#define BTRFS_BLOCK_RSV_DELOPS 5
+#define BTRFS_BLOCK_RSV_EMPTY 6
+#define BTRFS_BLOCK_RSV_TEMP 7
+
struct btrfs_block_rsv {
u64 size;
u64 reserved;
struct btrfs_space_info *space_info;
spinlock_t lock;
- unsigned int full;
+ unsigned short full;
+ unsigned short type;
+ unsigned short failfast;
};
/*
@@ -1127,6 +1155,9 @@ struct btrfs_block_group_cache {
* Today it will only have one thing on it, but that may change
*/
struct list_head cluster_list;
+
+ /* For delayed block group creation */
+ struct list_head new_bg_list;
};
/* delayed seq elem */
@@ -1240,7 +1271,6 @@ struct btrfs_fs_info {
struct mutex reloc_mutex;
struct list_head trans_list;
- struct list_head hashers;
struct list_head dead_roots;
struct list_head caching_block_groups;
@@ -1366,9 +1396,6 @@ struct btrfs_fs_info {
struct rb_root defrag_inodes;
atomic_t defrag_running;
- spinlock_t ref_cache_lock;
- u64 total_ref_cache_size;
-
/*
* these three are in extended format (availability of single
* chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other
@@ -1441,6 +1468,8 @@ struct btrfs_fs_info {
/* next backup root to be overwritten */
int backup_root_index;
+
+ int num_tolerated_disk_barrier_failures;
};
/*
@@ -1481,9 +1510,9 @@ struct btrfs_root {
wait_queue_head_t log_commit_wait[2];
atomic_t log_writers;
atomic_t log_commit[2];
+ atomic_t log_batch;
unsigned long log_transid;
unsigned long last_log_commit;
- unsigned long log_batch;
pid_t log_start_pid;
bool log_multiple_pids;
@@ -1592,6 +1621,7 @@ struct btrfs_ioctl_defrag_range_args {
*/
#define BTRFS_INODE_ITEM_KEY 1
#define BTRFS_INODE_REF_KEY 12
+#define BTRFS_INODE_EXTREF_KEY 13
#define BTRFS_XATTR_ITEM_KEY 24
#define BTRFS_ORPHAN_ITEM_KEY 48
/* reserve 2-15 close to the inode for later flexibility */
@@ -1978,6 +2008,13 @@ BTRFS_SETGET_STACK_FUNCS(block_group_flags,
BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16);
BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64);
+/* struct btrfs_inode_extref */
+BTRFS_SETGET_FUNCS(inode_extref_parent, struct btrfs_inode_extref,
+ parent_objectid, 64);
+BTRFS_SETGET_FUNCS(inode_extref_name_len, struct btrfs_inode_extref,
+ name_len, 16);
+BTRFS_SETGET_FUNCS(inode_extref_index, struct btrfs_inode_extref, index, 64);
+
/* struct btrfs_inode_item */
BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64);
BTRFS_SETGET_FUNCS(inode_sequence, struct btrfs_inode_item, sequence, 64);
@@ -2858,6 +2895,8 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
u64 size);
int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 group_start);
+void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data);
void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
@@ -2874,8 +2913,9 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes);
void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes);
int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes);
void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes);
-void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv);
-struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root);
+void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
+struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
+ unsigned short type);
void btrfs_free_block_rsv(struct btrfs_root *root,
struct btrfs_block_rsv *rsv);
int btrfs_block_rsv_add(struct btrfs_root *root,
@@ -3172,12 +3212,12 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
const char *name, int name_len,
u64 inode_objectid, u64 ref_objectid, u64 *index);
-struct btrfs_inode_ref *
-btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- const char *name, int name_len,
- u64 inode_objectid, u64 ref_objectid, int mod);
+int btrfs_get_inode_ref_index(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ const char *name, int name_len,
+ u64 inode_objectid, u64 ref_objectid, int mod,
+ u64 *ret_index);
int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 objectid);
@@ -3185,6 +3225,19 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
*root, struct btrfs_path *path,
struct btrfs_key *location, int mod);
+struct btrfs_inode_extref *
+btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ const char *name, int name_len,
+ u64 inode_objectid, u64 ref_objectid, int ins_len,
+ int cow);
+
+int btrfs_find_name_in_ext_backref(struct btrfs_path *path,
+ u64 ref_objectid, const char *name,
+ int name_len,
+ struct btrfs_inode_extref **extref_ret);
+
/* file-item.c */
int btrfs_del_csums(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytenr, u64 len);
@@ -3249,6 +3302,8 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct inode *dir, u64 objectid,
const char *name, int name_len);
+int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
+ int front);
int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct inode *inode, u64 new_size,
@@ -3308,16 +3363,27 @@ void btrfs_inherit_iflags(struct inode *inode, struct inode *dir);
int btrfs_defrag_file(struct inode *inode, struct file *file,
struct btrfs_ioctl_defrag_range_args *range,
u64 newer_than, unsigned long max_pages);
+void btrfs_get_block_group_info(struct list_head *groups_list,
+ struct btrfs_ioctl_space_info *space);
+
/* file.c */
int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
struct inode *inode);
int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info);
int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync);
-int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
- int skip_pinned);
+void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
+ int skip_pinned);
+int btrfs_replace_extent_cache(struct inode *inode, struct extent_map *replace,
+ u64 start, u64 end, int skip_pinned,
+ int modified);
extern const struct file_operations btrfs_file_operations;
-int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
- u64 start, u64 end, u64 *hint_byte, int drop_cache);
+int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode,
+ struct btrfs_path *path, u64 start, u64 end,
+ u64 *drop_end, int drop_cache);
+int btrfs_drop_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode, u64 start,
+ u64 end, int drop_cache);
int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
struct inode *inode, u64 start, u64 end);
int btrfs_release_file(struct inode *inode, struct file *file);
@@ -3378,6 +3444,11 @@ static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info,
}
}
+/*
+ * Call btrfs_abort_transaction as early as possible when an error condition is
+ * detected, that way the exact line number is reported.
+ */
+
#define btrfs_abort_transaction(trans, root, errno) \
do { \
__btrfs_abort_transaction(trans, root, __func__, \
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 52c85e2b95d..478f66bdc57 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -29,7 +29,7 @@ static struct kmem_cache *delayed_node_cache;
int __init btrfs_delayed_inode_init(void)
{
- delayed_node_cache = kmem_cache_create("delayed_node",
+ delayed_node_cache = kmem_cache_create("btrfs_delayed_node",
sizeof(struct btrfs_delayed_node),
0,
SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
@@ -650,7 +650,7 @@ static int btrfs_delayed_inode_reserve_metadata(
* we're accounted for.
*/
if (!src_rsv || (!trans->bytes_reserved &&
- src_rsv != &root->fs_info->delalloc_block_rsv)) {
+ src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) {
ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes);
/*
* Since we're under a transaction reserve_metadata_bytes could
@@ -668,7 +668,7 @@ static int btrfs_delayed_inode_reserve_metadata(
num_bytes, 1);
}
return ret;
- } else if (src_rsv == &root->fs_info->delalloc_block_rsv) {
+ } else if (src_rsv->type == BTRFS_BLOCK_RSV_DELALLOC) {
spin_lock(&BTRFS_I(inode)->lock);
if (test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
&BTRFS_I(inode)->runtime_flags)) {
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 22e98e04c2e..7cda51995c1 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -46,6 +46,10 @@
#include "check-integrity.h"
#include "rcu-string.h"
+#ifdef CONFIG_X86
+#include <asm/cpufeature.h>
+#endif
+
static struct extent_io_ops btree_extent_io_ops;
static void end_workqueue_fn(struct btrfs_work *work);
static void free_fs_root(struct btrfs_root *root);
@@ -217,26 +221,16 @@ static struct extent_map *btree_get_extent(struct inode *inode,
write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em);
if (ret == -EEXIST) {
- u64 failed_start = em->start;
- u64 failed_len = em->len;
-
free_extent_map(em);
em = lookup_extent_mapping(em_tree, start, len);
- if (em) {
- ret = 0;
- } else {
- em = lookup_extent_mapping(em_tree, failed_start,
- failed_len);
- ret = -EIO;
- }
+ if (!em)
+ em = ERR_PTR(-EIO);
} else if (ret) {
free_extent_map(em);
- em = NULL;
+ em = ERR_PTR(ret);
}
write_unlock(&em_tree->lock);
- if (ret)
- em = ERR_PTR(ret);
out:
return em;
}
@@ -439,10 +433,6 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
WARN_ON(1);
return 0;
}
- if (eb->pages[0] != page) {
- WARN_ON(1);
- return 0;
- }
if (!PageUptodate(page)) {
WARN_ON(1);
return 0;
@@ -869,10 +859,22 @@ static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1);
}
+/*
+ * Decide how a btree write bio should be checksummed.
+ *
+ * Returns 0 when the bio carries EXTENT_BIO_TREE_LOG, or when built for x86
+ * and cpu_has_xmm4_2 is set; returns 1 otherwise.  btree_submit_bio_hook()
+ * treats 0 as "checksum the bio right here with btree_csum_one_bio() and map
+ * it directly" and 1 as "take the remaining submission path".
+ *
+ * NOTE(review): the SSE4.2 test presumably exists because such CPUs have a
+ * hardware crc32c instruction, which would make offloading the checksum
+ * pointless -- confirm.  @inode is not used by this function.
+ */
+static int check_async_write(struct inode *inode, unsigned long bio_flags)
+{
+ if (bio_flags & EXTENT_BIO_TREE_LOG)
+ return 0;
+#ifdef CONFIG_X86
+ if (cpu_has_xmm4_2)
+ return 0;
+#endif
+ return 1;
+}
+
static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
int mirror_num, unsigned long bio_flags,
u64 bio_offset)
{
+ int async = check_async_write(inode, bio_flags);
int ret;
if (!(rw & REQ_WRITE)) {
@@ -887,6 +889,12 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
return ret;
return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
mirror_num, 0);
+ } else if (!async) {
+ ret = btree_csum_one_bio(bio);
+ if (ret)
+ return ret;
+ return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
+ mirror_num, 0);
}
/*
@@ -1168,8 +1176,8 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
atomic_set(&root->log_commit[0], 0);
atomic_set(&root->log_commit[1], 0);
atomic_set(&root->log_writers, 0);
+ atomic_set(&root->log_batch, 0);
atomic_set(&root->orphan_inodes, 0);
- root->log_batch = 0;
root->log_transid = 0;
root->last_log_commit = 0;
extent_io_tree_init(&root->dirty_log_pages,
@@ -1667,9 +1675,10 @@ static int transaction_kthread(void *arg)
spin_unlock(&root->fs_info->trans_lock);
/* If the file system is aborted, this will always fail. */
- trans = btrfs_join_transaction(root);
+ trans = btrfs_attach_transaction(root);
if (IS_ERR(trans)) {
- cannot_commit = true;
+ if (PTR_ERR(trans) != -ENOENT)
+ cannot_commit = true;
goto sleep;
}
if (transid == trans->transid) {
@@ -1994,13 +2003,11 @@ int open_ctree(struct super_block *sb,
INIT_LIST_HEAD(&fs_info->trans_list);
INIT_LIST_HEAD(&fs_info->dead_roots);
INIT_LIST_HEAD(&fs_info->delayed_iputs);
- INIT_LIST_HEAD(&fs_info->hashers);
INIT_LIST_HEAD(&fs_info->delalloc_inodes);
INIT_LIST_HEAD(&fs_info->ordered_operations);
INIT_LIST_HEAD(&fs_info->caching_block_groups);
spin_lock_init(&fs_info->delalloc_lock);
spin_lock_init(&fs_info->trans_lock);
- spin_lock_init(&fs_info->ref_cache_lock);
spin_lock_init(&fs_info->fs_roots_radix_lock);
spin_lock_init(&fs_info->delayed_iput_lock);
spin_lock_init(&fs_info->defrag_inodes_lock);
@@ -2014,12 +2021,15 @@ int open_ctree(struct super_block *sb,
INIT_LIST_HEAD(&fs_info->space_info);
INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
btrfs_mapping_init(&fs_info->mapping_tree);
- btrfs_init_block_rsv(&fs_info->global_block_rsv);
- btrfs_init_block_rsv(&fs_info->delalloc_block_rsv);
- btrfs_init_block_rsv(&fs_info->trans_block_rsv);
- btrfs_init_block_rsv(&fs_info->chunk_block_rsv);
- btrfs_init_block_rsv(&fs_info->empty_block_rsv);
- btrfs_init_block_rsv(&fs_info->delayed_block_rsv);
+ btrfs_init_block_rsv(&fs_info->global_block_rsv,
+ BTRFS_BLOCK_RSV_GLOBAL);
+ btrfs_init_block_rsv(&fs_info->delalloc_block_rsv,
+ BTRFS_BLOCK_RSV_DELALLOC);
+ btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS);
+ btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK);
+ btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY);
+ btrfs_init_block_rsv(&fs_info->delayed_block_rsv,
+ BTRFS_BLOCK_RSV_DELOPS);
atomic_set(&fs_info->nr_async_submits, 0);
atomic_set(&fs_info->async_delalloc_pages, 0);
atomic_set(&fs_info->async_submit_draining, 0);
@@ -2491,6 +2501,8 @@ retry_root_backup:
printk(KERN_ERR "Failed to read block groups: %d\n", ret);
goto fail_block_groups;
}
+ fs_info->num_tolerated_disk_barrier_failures =
+ btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
"btrfs-cleaner");
@@ -2874,12 +2886,10 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
printk_in_rcu("btrfs: disabling barriers on dev %s\n",
rcu_str_deref(device->name));
device->nobarriers = 1;
- }
- if (!bio_flagged(bio, BIO_UPTODATE)) {
+ } else if (!bio_flagged(bio, BIO_UPTODATE)) {
ret = -EIO;
- if (!bio_flagged(bio, BIO_EOPNOTSUPP))
- btrfs_dev_stat_inc_and_print(device,
- BTRFS_DEV_STAT_FLUSH_ERRS);
+ btrfs_dev_stat_inc_and_print(device,
+ BTRFS_DEV_STAT_FLUSH_ERRS);
}
/* drop the reference from the wait == 0 run */
@@ -2918,14 +2928,15 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
{
struct list_head *head;
struct btrfs_device *dev;
- int errors = 0;
+ int errors_send = 0;
+ int errors_wait = 0;
int ret;
/* send down all the barriers */
head = &info->fs_devices->devices;
list_for_each_entry_rcu(dev, head, dev_list) {
if (!dev->bdev) {
- errors++;
+ errors_send++;
continue;
}
if (!dev->in_fs_metadata || !dev->writeable)
@@ -2933,13 +2944,13 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
ret = write_dev_flush(dev, 0);
if (ret)
- errors++;
+ errors_send++;
}
/* wait for all the barriers */
list_for_each_entry_rcu(dev, head, dev_list) {
if (!dev->bdev) {
- errors++;
+ errors_wait++;
continue;
}
if (!dev->in_fs_metadata || !dev->writeable)
@@ -2947,13 +2958,87 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
ret = write_dev_flush(dev, 1);
if (ret)
- errors++;
+ errors_wait++;
}
- if (errors)
+ if (errors_send > info->num_tolerated_disk_barrier_failures ||
+ errors_wait > info->num_tolerated_disk_barrier_failures)
return -EIO;
return 0;
}
+/*
+ * Compute how many devices may fail a barrier (flush) before
+ * barrier_all_devices() has to report an error.
+ *
+ * The result starts at fs_devices->num_devices and is only ever lowered.
+ * For each of the four space_info flag combinations in types[], the matching
+ * space_info is looked up and every non-empty per-RAID-type block group list
+ * is summarised with btrfs_get_block_group_info().  Lists whose summary
+ * reports zero total or zero used bytes are ignored.  DUP, RAID0 or no
+ * profile bit at all (single) lowers the result to 0; otherwise RAID1 or
+ * RAID10 lowers it to 1.
+ *
+ * Returns the resulting count.
+ */
+int btrfs_calc_num_tolerated_disk_barrier_failures(
+ struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_ioctl_space_info space;
+ struct btrfs_space_info *sinfo;
+ u64 types[] = {BTRFS_BLOCK_GROUP_DATA,
+ BTRFS_BLOCK_GROUP_SYSTEM,
+ BTRFS_BLOCK_GROUP_METADATA,
+ BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA};
+ /* must be kept equal to the number of entries in types[] */
+ int num_types = 4;
+ int i;
+ int c;
+ int num_tolerated_disk_barrier_failures =
+ (int)fs_info->fs_devices->num_devices;
+
+ for (i = 0; i < num_types; i++) {
+ struct btrfs_space_info *tmp;
+
+ /* find the space_info whose flags are exactly types[i] */
+ sinfo = NULL;
+ rcu_read_lock();
+ list_for_each_entry_rcu(tmp, &fs_info->space_info, list) {
+ if (tmp->flags == types[i]) {
+ sinfo = tmp;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ if (!sinfo)
+ continue;
+
+ down_read(&sinfo->groups_sem);
+ for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
+ if (!list_empty(&sinfo->block_groups[c])) {
+ u64 flags;
+
+ btrfs_get_block_group_info(
+ &sinfo->block_groups[c], &space);
+ if (space.total_bytes == 0 ||
+ space.used_bytes == 0)
+ continue;
+ flags = space.flags;
+ /*
+ * return
+ * 0: if dup, single or RAID0 is configured for
+ * any of metadata, system or data, else
+ * 1: if RAID5 is configured, or if RAID1 or
+ * RAID10 is configured and only two mirrors
+ * are used, else
+ * 2: if RAID6 is configured, else
+ * num_mirrors - 1: if RAID1 or RAID10 is
+ * configured and more than
+ * 2 mirrors are used.
+ *
+ * NOTE(review): only the "0" case and the
+ * RAID1/RAID10 "1" case are implemented below;
+ * there is no branch for RAID5, RAID6 or more
+ * than two mirrors.
+ */
+ if (num_tolerated_disk_barrier_failures > 0 &&
+ ((flags & (BTRFS_BLOCK_GROUP_DUP |
+ BTRFS_BLOCK_GROUP_RAID0)) ||
+ ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK)
+ == 0)))
+ num_tolerated_disk_barrier_failures = 0;
+ else if (num_tolerated_disk_barrier_failures > 1
+ &&
+ (flags & (BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_RAID10)))
+ num_tolerated_disk_barrier_failures = 1;
+ }
+ }
+ up_read(&sinfo->groups_sem);
+ }
+
+ return num_tolerated_disk_barrier_failures;
+}
+
int write_all_supers(struct btrfs_root *root, int max_mirrors)
{
struct list_head *head;
@@ -2976,8 +3061,16 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
head = &root->fs_info->fs_devices->devices;
- if (do_barriers)
- barrier_all_devices(root->fs_info);
+ if (do_barriers) {
+ ret = barrier_all_devices(root->fs_info);
+ if (ret) {
+ mutex_unlock(
+ &root->fs_info->fs_devices->device_list_mutex);
+ btrfs_error(root->fs_info, ret,
+ "errors while submitting device barriers.");
+ return ret;
+ }
+ }
list_for_each_entry_rcu(dev, head, dev_list) {
if (!dev->bdev) {
@@ -3211,10 +3304,6 @@ int close_ctree(struct btrfs_root *root)
printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n",
(unsigned long long)fs_info->delalloc_bytes);
}
- if (fs_info->total_ref_cache_size) {
- printk(KERN_INFO "btrfs: at umount reference cache size %llu\n",
- (unsigned long long)fs_info->total_ref_cache_size);
- }
free_extent_buffer(fs_info->extent_root->node);
free_extent_buffer(fs_info->extent_root->commit_root);
@@ -3360,52 +3449,6 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
return btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
}
-int btree_lock_page_hook(struct page *page, void *data,
- void (*flush_fn)(void *))
-{
- struct inode *inode = page->mapping->host;
- struct btrfs_root *root = BTRFS_I(inode)->root;
- struct extent_buffer *eb;
-
- /*
- * We culled this eb but the page is still hanging out on the mapping,
- * carry on.
- */
- if (!PagePrivate(page))
- goto out;
-
- eb = (struct extent_buffer *)page->private;
- if (!eb) {
- WARN_ON(1);
- goto out;
- }
- if (page != eb->pages[0])
- goto out;
-
- if (!btrfs_try_tree_write_lock(eb)) {
- flush_fn(data);
- btrfs_tree_lock(eb);
- }
- btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
-
- if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
- spin_lock(&root->fs_info->delalloc_lock);
- if (root->fs_info->dirty_metadata_bytes >= eb->len)
- root->fs_info->dirty_metadata_bytes -= eb->len;
- else
- WARN_ON(1);
- spin_unlock(&root->fs_info->delalloc_lock);
- }
-
- btrfs_tree_unlock(eb);
-out:
- if (!trylock_page(page)) {
- flush_fn(data);
- lock_page(page);
- }
- return 0;
-}
-
static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
int read_only)
{
@@ -3608,7 +3651,7 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root,
while (1) {
ret = find_first_extent_bit(dirty_pages, start, &start, &end,
- mark);
+ mark, NULL);
if (ret)
break;
@@ -3663,7 +3706,7 @@ static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
again:
while (1) {
ret = find_first_extent_bit(unpin, 0, &start, &end,
- EXTENT_DIRTY);
+ EXTENT_DIRTY, NULL);
if (ret)
break;
@@ -3800,7 +3843,6 @@ int btrfs_cleanup_transaction(struct btrfs_root *root)
}
static struct extent_io_ops btree_extent_io_ops = {
- .write_cache_pages_lock_hook = btree_lock_page_hook,
.readpage_end_io_hook = btree_readpage_end_io_hook,
.readpage_io_failed_hook = btree_io_failed_hook,
.submit_bio_hook = btree_submit_bio_hook,
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index c5b00a735fe..2025a9132c1 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -95,6 +95,8 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
u64 objectid);
int btree_lock_page_hook(struct page *page, void *data,
void (*flush_fn)(void *));
+int btrfs_calc_num_tolerated_disk_barrier_failures(
+ struct btrfs_fs_info *fs_info);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
void btrfs_init_lockdep(void);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index ba58024d40d..3d3e2c17d8d 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -94,8 +94,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
u64 flags, struct btrfs_disk_key *key,
int level, struct btrfs_key *ins);
static int do_chunk_alloc(struct btrfs_trans_handle *trans,
- struct btrfs_root *extent_root, u64 alloc_bytes,
- u64 flags, int force);
+ struct btrfs_root *extent_root, u64 flags,
+ int force);
static int find_next_key(struct btrfs_path *path, int level,
struct btrfs_key *key);
static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
@@ -312,7 +312,8 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
while (start < end) {
ret = find_first_extent_bit(info->pinned_extents, start,
&extent_start, &extent_end,
- EXTENT_DIRTY | EXTENT_UPTODATE);
+ EXTENT_DIRTY | EXTENT_UPTODATE,
+ NULL);
if (ret)
break;
@@ -2361,10 +2362,6 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
}
next:
- do_chunk_alloc(trans, fs_info->extent_root,
- 2 * 1024 * 1024,
- btrfs_get_alloc_profile(root, 0),
- CHUNK_ALLOC_NO_FORCE);
cond_resched();
spin_lock(&delayed_refs->lock);
}
@@ -2478,10 +2475,6 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
if (root == root->fs_info->extent_root)
root = root->fs_info->tree_root;
- do_chunk_alloc(trans, root->fs_info->extent_root,
- 2 * 1024 * 1024, btrfs_get_alloc_profile(root, 0),
- CHUNK_ALLOC_NO_FORCE);
-
btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
delayed_refs = &trans->transaction->delayed_refs;
@@ -2551,6 +2544,12 @@ again:
}
if (run_all) {
+ if (!list_empty(&trans->new_bgs)) {
+ spin_unlock(&delayed_refs->lock);
+ btrfs_create_pending_block_groups(trans, root);
+ spin_lock(&delayed_refs->lock);
+ }
+
node = rb_first(&delayed_refs->root);
if (!node)
goto out;
@@ -3406,7 +3405,6 @@ alloc:
return PTR_ERR(trans);
ret = do_chunk_alloc(trans, root->fs_info->extent_root,
- bytes + 2 * 1024 * 1024,
alloc_target,
CHUNK_ALLOC_NO_FORCE);
btrfs_end_transaction(trans, root);
@@ -3488,8 +3486,7 @@ static void force_metadata_allocation(struct btrfs_fs_info *info)
}
static int should_alloc_chunk(struct btrfs_root *root,
- struct btrfs_space_info *sinfo, u64 alloc_bytes,
- int force)
+ struct btrfs_space_info *sinfo, int force)
{
struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
@@ -3504,7 +3501,8 @@ static int should_alloc_chunk(struct btrfs_root *root,
* and purposes it's used space. Don't worry about locking the
* global_rsv, it doesn't change except when the transaction commits.
*/
- num_allocated += global_rsv->size;
+ if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA)
+ num_allocated += global_rsv->size;
/*
* in limited mode, we want to have some free space up to
@@ -3518,15 +3516,8 @@ static int should_alloc_chunk(struct btrfs_root *root,
if (num_bytes - num_allocated < thresh)
return 1;
}
- thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
- /* 256MB or 2% of the FS */
- thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 2));
- /* system chunks need a much small threshold */
- if (sinfo->flags & BTRFS_BLOCK_GROUP_SYSTEM)
- thresh = 32 * 1024 * 1024;
-
- if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 8))
+ if (num_allocated + 2 * 1024 * 1024 < div_factor(num_bytes, 8))
return 0;
return 1;
}
@@ -3576,8 +3567,7 @@ static void check_system_chunk(struct btrfs_trans_handle *trans,
}
static int do_chunk_alloc(struct btrfs_trans_handle *trans,
- struct btrfs_root *extent_root, u64 alloc_bytes,
- u64 flags, int force)
+ struct btrfs_root *extent_root, u64 flags, int force)
{
struct btrfs_space_info *space_info;
struct btrfs_fs_info *fs_info = extent_root->fs_info;
@@ -3601,7 +3591,7 @@ again:
return 0;
}
- if (!should_alloc_chunk(extent_root, space_info, alloc_bytes, force)) {
+ if (!should_alloc_chunk(extent_root, space_info, force)) {
spin_unlock(&space_info->lock);
return 0;
} else if (space_info->chunk_alloc) {
@@ -3669,6 +3659,46 @@ out:
return ret;
}
+/*
+ * Decide whether @bytes more may be accounted to @space_info even though
+ * that can exceed its total_bytes, on the strength of chunk space that has
+ * not been allocated yet.
+ *
+ * The allowance is derived from fs_info->free_chunk_space: halved for DUP,
+ * RAID1 and RAID10 profiles, then divided by 8 when @flush is set or by 2
+ * when it is not.
+ *
+ * Returns 1 if used + @bytes is below total_bytes plus that allowance,
+ * 0 otherwise.  shrink_delalloc() calls this with space_info->lock held.
+ */
+static int can_overcommit(struct btrfs_root *root,
+ struct btrfs_space_info *space_info, u64 bytes,
+ int flush)
+{
+ u64 profile = btrfs_get_alloc_profile(root, 0);
+ u64 avail;
+ u64 used;
+
+ used = space_info->bytes_used + space_info->bytes_reserved +
+ space_info->bytes_pinned + space_info->bytes_readonly +
+ space_info->bytes_may_use;
+
+ spin_lock(&root->fs_info->free_chunk_lock);
+ avail = root->fs_info->free_chunk_space;
+ spin_unlock(&root->fs_info->free_chunk_lock);
+
+ /*
+ * If we have dup, raid1 or raid10 then only half of the free
+ * space is actually useable.
+ */
+ if (profile & (BTRFS_BLOCK_GROUP_DUP |
+ BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_RAID10))
+ avail >>= 1;
+
+ /*
+ * If we can flush, don't let us overcommit too much: limit it to
+ * 1/8th of the space. If we aren't flushing, let it overcommit up
+ * to 1/2 of the space.
+ */
+ if (flush)
+ avail >>= 3;
+ else
+ avail >>= 1;
+
+ if (used + bytes < space_info->total_bytes + avail)
+ return 1;
+ return 0;
+}
+
/*
* shrink metadata reservation for delalloc
*/
@@ -3693,7 +3723,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
if (delalloc_bytes == 0) {
if (trans)
return;
- btrfs_wait_ordered_extents(root, 0, 0);
+ btrfs_wait_ordered_extents(root, 0);
return;
}
@@ -3703,11 +3733,15 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages,
WB_REASON_FS_FREE_SPACE);
+ /*
+ * We need to wait for the async pages to actually start before
+ * we do anything.
+ */
+ wait_event(root->fs_info->async_submit_wait,
+ !atomic_read(&root->fs_info->async_delalloc_pages));
+
spin_lock(&space_info->lock);
- if (space_info->bytes_used + space_info->bytes_reserved +
- space_info->bytes_pinned + space_info->bytes_readonly +
- space_info->bytes_may_use + orig <=
- space_info->total_bytes) {
+ if (can_overcommit(root, space_info, orig, !trans)) {
spin_unlock(&space_info->lock);
break;
}
@@ -3715,7 +3749,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
loops++;
if (wait_ordered && !trans) {
- btrfs_wait_ordered_extents(root, 0, 0);
+ btrfs_wait_ordered_extents(root, 0);
} else {
time_left = schedule_timeout_killable(1);
if (time_left)
@@ -3784,11 +3818,12 @@ commit:
}
+/*
+ * States handled by the switch in flush_space().  reserve_metadata_bytes()
+ * begins at FLUSH_DELAYED_ITEMS_NR.
+ * NOTE(review): the states are presumably tried in ascending numeric order,
+ * ending with COMMIT_TRANS -- confirm against the caller's loop.
+ */
enum flush_state {
- FLUSH_DELALLOC = 1,
- FLUSH_DELALLOC_WAIT = 2,
- FLUSH_DELAYED_ITEMS_NR = 3,
- FLUSH_DELAYED_ITEMS = 4,
- COMMIT_TRANS = 5,
+ FLUSH_DELAYED_ITEMS_NR = 1,
+ FLUSH_DELAYED_ITEMS = 2,
+ FLUSH_DELALLOC = 3,
+ FLUSH_DELALLOC_WAIT = 4,
+ ALLOC_CHUNK = 5,
+ COMMIT_TRANS = 6,
};
static int flush_space(struct btrfs_root *root,
@@ -3800,11 +3835,6 @@ static int flush_space(struct btrfs_root *root,
int ret = 0;
switch (state) {
- case FLUSH_DELALLOC:
- case FLUSH_DELALLOC_WAIT:
- shrink_delalloc(root, num_bytes, orig_bytes,
- state == FLUSH_DELALLOC_WAIT);
- break;
case FLUSH_DELAYED_ITEMS_NR:
case FLUSH_DELAYED_ITEMS:
if (state == FLUSH_DELAYED_ITEMS_NR) {
@@ -3825,6 +3855,24 @@ static int flush_space(struct btrfs_root *root,
ret = btrfs_run_delayed_items_nr(trans, root, nr);
btrfs_end_transaction(trans, root);
break;
+ case FLUSH_DELALLOC:
+ case FLUSH_DELALLOC_WAIT:
+ shrink_delalloc(root, num_bytes, orig_bytes,
+ state == FLUSH_DELALLOC_WAIT);
+ break;
+ case ALLOC_CHUNK:
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ break;
+ }
+ ret = do_chunk_alloc(trans, root->fs_info->extent_root,
+ btrfs_get_alloc_profile(root, 0),
+ CHUNK_ALLOC_NO_FORCE);
+ btrfs_end_transaction(trans, root);
+ if (ret == -ENOSPC)
+ ret = 0;
+ break;
case COMMIT_TRANS:
ret = may_commit_transaction(root, space_info, orig_bytes, 0);
break;
@@ -3856,10 +3904,9 @@ static int reserve_metadata_bytes(struct btrfs_root *root,
struct btrfs_space_info *space_info = block_rsv->space_info;
u64 used;
u64 num_bytes = orig_bytes;
- int flush_state = FLUSH_DELALLOC;
+ int flush_state = FLUSH_DELAYED_ITEMS_NR;
int ret = 0;
bool flushing = false;
- bool committed = false;
again:
ret = 0;
@@ -3922,57 +3969,12 @@ again:
(orig_bytes * 2);
}
- if (ret) {
- u64 profile = btrfs_get_alloc_profile(root, 0);
- u64 avail;
-
- /*
- * If we have a lot of space that's pinned, don't bother doing
- * the overcommit dance yet and just commit the transaction.
- */
- avail = (space_info->total_bytes - space_info->bytes_used) * 8;
- do_div(avail, 10);
- if (space_info->bytes_pinned >= avail && flush && !committed) {
- space_info->flush = 1;
- flushing = true;
- spin_unlock(&space_info->lock);
- ret = may_commit_transaction(root, space_info,
- orig_bytes, 1);
- if (ret)
- goto out;
- committed = true;
- goto again;
- }
-
- spin_lock(&root->fs_info->free_chunk_lock);
- avail = root->fs_info->free_chunk_space;
-
- /*
- * If we have dup, raid1 or raid10 then only half of the free
- * space is actually useable.
- */
- if (profile & (BTRFS_BLOCK_GROUP_DUP |
- BTRFS_BLOCK_GROUP_RAID1 |
- BTRFS_BLOCK_GROUP_RAID10))
- avail >>= 1;
-
- /*
- * If we aren't flushing don't let us overcommit too much, say
- * 1/8th of the space. If we can flush, let it overcommit up to
- * 1/2 of the space.
- */
- if (flush)
- avail >>= 3;
- else
- avail >>= 1;
- spin_unlock(&root->fs_info->free_chunk_lock);
-
- if (used + num_bytes < space_info->total_bytes + avail) {
- space_info->bytes_may_use += orig_bytes;
- trace_btrfs_space_reservation(root->fs_info,
- "space_info", space_info->flags, orig_bytes, 1);
- ret = 0;
- }
+ if (ret && can_overcommit(root, space_info, orig_bytes, flush)) {
+ space_info->bytes_may_use += orig_bytes;
+ trace_btrfs_space_reservation(root->fs_info, "space_info",
+ space_info->flags, orig_bytes,
+ 1);
+ ret = 0;
}
/*
@@ -4114,13 +4116,15 @@ static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src,
return 0;
}
-void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv)
+/*
+ * Reset @rsv to all zeroes, initialise its spinlock and record @type (one of
+ * the BTRFS_BLOCK_RSV_* values) in it.  The memset also clears
+ * rsv->space_info, so the caller has to assign that afterwards.
+ */
+void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
{
memset(rsv, 0, sizeof(*rsv));
spin_lock_init(&rsv->lock);
+ rsv->type = type;
}
-struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
+struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
+ unsigned short type)
{
struct btrfs_block_rsv *block_rsv;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -4129,7 +4133,7 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
if (!block_rsv)
return NULL;
- btrfs_init_block_rsv(block_rsv);
+ btrfs_init_block_rsv(block_rsv, type);
block_rsv->space_info = __find_space_info(fs_info,
BTRFS_BLOCK_GROUP_METADATA);
return block_rsv;
@@ -4138,6 +4142,8 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
+/*
+ * Release @rsv through btrfs_block_rsv_release() with a byte count of
+ * (u64)-1 and then kfree() it.  A NULL @rsv is accepted and is a no-op.
+ */
void btrfs_free_block_rsv(struct btrfs_root *root,
struct btrfs_block_rsv *rsv)
{
+ if (!rsv)
+ return;
btrfs_block_rsv_release(root, rsv, (u64)-1);
kfree(rsv);
}
@@ -4416,10 +4422,10 @@ int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
struct btrfs_block_rsv *dst_rsv = &pending->block_rsv;
/*
- * two for root back/forward refs, two for directory entries
- * and one for root of the snapshot.
+ * two for root back/forward refs, two for directory entries,
+ * one for root of the snapshot and one for parent inode.
*/
- u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5);
+ u64 num_bytes = btrfs_calc_trans_metadata_size(root, 6);
dst_rsv->space_info = src_rsv->space_info;
return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
}
@@ -5018,7 +5024,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
while (1) {
ret = find_first_extent_bit(unpin, 0, &start, &end,
- EXTENT_DIRTY);
+ EXTENT_DIRTY, NULL);
if (ret)
break;
@@ -5096,8 +5102,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
ret = remove_extent_backref(trans, extent_root, path,
NULL, refs_to_drop,
is_data);
- if (ret)
- goto abort;
+ if (ret) {
+ btrfs_abort_transaction(trans, extent_root, ret);
+ goto out;
+ }
btrfs_release_path(path);
path->leave_spinning = 1;
@@ -5115,8 +5123,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
btrfs_print_leaf(extent_root,
path->nodes[0]);
}
- if (ret < 0)
- goto abort;
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, extent_root, ret);
+ goto out;
+ }
extent_slot = path->slots[0];
}
} else if (ret == -ENOENT) {
@@ -5130,7 +5140,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
(unsigned long long)owner_objectid,
(unsigned long long)owner_offset);
} else {
- goto abort;
+ btrfs_abort_transaction(trans, extent_root, ret);
+ goto out;
}
leaf = path->nodes[0];
@@ -5140,8 +5151,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
BUG_ON(found_extent || extent_slot != path->slots[0]);
ret = convert_extent_item_v0(trans, extent_root, path,
owner_objectid, 0);
- if (ret < 0)
- goto abort;
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, extent_root, ret);
+ goto out;
+ }
btrfs_release_path(path);
path->leave_spinning = 1;
@@ -5158,8 +5171,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
(unsigned long long)bytenr);
btrfs_print_leaf(extent_root, path->nodes[0]);
}
- if (ret < 0)
- goto abort;
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, extent_root, ret);
+ goto out;
+ }
+
extent_slot = path->slots[0];
leaf = path->nodes[0];
item_size = btrfs_item_size_nr(leaf, extent_slot);
@@ -5196,8 +5212,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
ret = remove_extent_backref(trans, extent_root, path,
iref, refs_to_drop,
is_data);
- if (ret)
- goto abort;
+ if (ret) {
+ btrfs_abort_transaction(trans, extent_root, ret);
+ goto out;
+ }
}
} else {
if (found_extent) {
@@ -5214,27 +5232,29 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
num_to_del);
- if (ret)
- goto abort;
+ if (ret) {
+ btrfs_abort_transaction(trans, extent_root, ret);
+ goto out;
+ }
btrfs_release_path(path);
if (is_data) {
ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
- if (ret)
- goto abort;
+ if (ret) {
+ btrfs_abort_transaction(trans, extent_root, ret);
+ goto out;
+ }
}
ret = update_block_group(trans, root, bytenr, num_bytes, 0);
- if (ret)
- goto abort;
+ if (ret) {
+ btrfs_abort_transaction(trans, extent_root, ret);
+ goto out;
+ }
}
out:
btrfs_free_path(path);
return ret;
-
-abort:
- btrfs_abort_transaction(trans, extent_root, ret);
- goto out;
}
/*
@@ -5497,8 +5517,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_block_group_cache *used_block_group;
u64 search_start = 0;
int empty_cluster = 2 * 1024 * 1024;
- int allowed_chunk_alloc = 0;
- int done_chunk_alloc = 0;
struct btrfs_space_info *space_info;
int loop = 0;
int index = 0;
@@ -5530,9 +5548,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
if (btrfs_mixed_space_info(space_info))
use_cluster = false;
- if (orig_root->ref_cows || empty_size)
- allowed_chunk_alloc = 1;
-
if (data & BTRFS_BLOCK_GROUP_METADATA && use_cluster) {
last_ptr = &root->fs_info->meta_alloc_cluster;
if (!btrfs_test_opt(root, SSD))
@@ -5806,10 +5821,6 @@ checks:
trace_btrfs_reserve_extent(orig_root, block_group,
search_start, num_bytes);
- if (offset < search_start)
- btrfs_add_free_space(used_block_group, offset,
- search_start - offset);
- BUG_ON(offset > search_start);
if (used_block_group != block_group)
btrfs_put_block_group(used_block_group);
btrfs_put_block_group(block_group);
@@ -5842,34 +5853,17 @@ loop:
index = 0;
loop++;
if (loop == LOOP_ALLOC_CHUNK) {
- if (allowed_chunk_alloc) {
- ret = do_chunk_alloc(trans, root, num_bytes +
- 2 * 1024 * 1024, data,
- CHUNK_ALLOC_LIMITED);
- /*
- * Do not bail out on ENOSPC since we
- * can do more things.
- */
- if (ret < 0 && ret != -ENOSPC) {
- btrfs_abort_transaction(trans,
- root, ret);
- goto out;
- }
- allowed_chunk_alloc = 0;
- if (ret == 1)
- done_chunk_alloc = 1;
- } else if (!done_chunk_alloc &&
- space_info->force_alloc ==
- CHUNK_ALLOC_NO_FORCE) {
- space_info->force_alloc = CHUNK_ALLOC_LIMITED;
+ ret = do_chunk_alloc(trans, root, data,
+ CHUNK_ALLOC_FORCE);
+ /*
+ * Do not bail out on ENOSPC since we
+ * can do more things.
+ */
+ if (ret < 0 && ret != -ENOSPC) {
+ btrfs_abort_transaction(trans,
+ root, ret);
+ goto out;
}
-
- /*
- * We didn't allocate a chunk, go ahead and drop the
- * empty size and loop again.
- */
- if (!done_chunk_alloc)
- loop = LOOP_NO_EMPTY_SIZE;
}
if (loop == LOOP_NO_EMPTY_SIZE) {
@@ -5944,20 +5938,6 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
data = btrfs_get_alloc_profile(root, data);
again:
- /*
- * the only place that sets empty_size is btrfs_realloc_node, which
- * is not called recursively on allocations
- */
- if (empty_size || root->ref_cows) {
- ret = do_chunk_alloc(trans, root->fs_info->extent_root,
- num_bytes + 2 * 1024 * 1024, data,
- CHUNK_ALLOC_NO_FORCE);
- if (ret < 0 && ret != -ENOSPC) {
- btrfs_abort_transaction(trans, root, ret);
- return ret;
- }
- }
-
WARN_ON(num_bytes < root->sectorsize);
ret = find_free_extent(trans, root, num_bytes, empty_size,
hint_byte, ins, data);
@@ -5967,12 +5947,6 @@ again:
num_bytes = num_bytes >> 1;
num_bytes = num_bytes & ~(root->sectorsize - 1);
num_bytes = max(num_bytes, min_alloc_size);
- ret = do_chunk_alloc(trans, root->fs_info->extent_root,
- num_bytes, data, CHUNK_ALLOC_FORCE);
- if (ret < 0 && ret != -ENOSPC) {
- btrfs_abort_transaction(trans, root, ret);
- return ret;
- }
if (num_bytes == min_alloc_size)
final_tried = true;
goto again;
@@ -6314,7 +6288,7 @@ use_block_rsv(struct btrfs_trans_handle *trans,
ret = block_rsv_use_bytes(block_rsv, blocksize);
if (!ret)
return block_rsv;
- if (ret) {
+ if (ret && !block_rsv->failfast) {
static DEFINE_RATELIMIT_STATE(_rs,
DEFAULT_RATELIMIT_INTERVAL,
/*DEFAULT_RATELIMIT_BURST*/ 2);
@@ -7279,7 +7253,7 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
alloc_flags = update_block_group_flags(root, cache->flags);
if (alloc_flags != cache->flags) {
- ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
+ ret = do_chunk_alloc(trans, root, alloc_flags,
CHUNK_ALLOC_FORCE);
if (ret < 0)
goto out;
@@ -7289,7 +7263,7 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
if (!ret)
goto out;
alloc_flags = get_alloc_profile(root, cache->space_info->flags);
- ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
+ ret = do_chunk_alloc(trans, root, alloc_flags,
CHUNK_ALLOC_FORCE);
if (ret < 0)
goto out;
@@ -7303,7 +7277,7 @@ int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 type)
{
u64 alloc_flags = get_alloc_profile(root, type);
- return do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
+ return do_chunk_alloc(trans, root, alloc_flags,
CHUNK_ALLOC_FORCE);
}
@@ -7810,6 +7784,34 @@ error:
return ret;
}
+void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_block_group_cache *block_group, *tmp;
+ struct btrfs_root *extent_root = root->fs_info->extent_root;
+ struct btrfs_block_group_item item;
+ struct btrfs_key key;
+ int ret = 0;
+
+ list_for_each_entry_safe(block_group, tmp, &trans->new_bgs,
+ new_bg_list) {
+ list_del_init(&block_group->new_bg_list);
+
+ if (ret)
+ continue;
+
+ spin_lock(&block_group->lock);
+ memcpy(&item, &block_group->item, sizeof(item));
+ memcpy(&key, &block_group->key, sizeof(key));
+ spin_unlock(&block_group->lock);
+
+ ret = btrfs_insert_item(trans, extent_root, &key, &item,
+ sizeof(item));
+ if (ret)
+ btrfs_abort_transaction(trans, extent_root, ret);
+ }
+}
+
int btrfs_make_block_group(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytes_used,
u64 type, u64 chunk_objectid, u64 chunk_offset,
@@ -7843,6 +7845,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
spin_lock_init(&cache->lock);
INIT_LIST_HEAD(&cache->list);
INIT_LIST_HEAD(&cache->cluster_list);
+ INIT_LIST_HEAD(&cache->new_bg_list);
btrfs_init_free_space_ctl(cache);
@@ -7874,12 +7877,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
ret = btrfs_add_block_group_cache(root->fs_info, cache);
BUG_ON(ret); /* Logic error */
- ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item,
- sizeof(cache->item));
- if (ret) {
- btrfs_abort_transaction(trans, extent_root, ret);
- return ret;
- }
+ list_add_tail(&cache->new_bg_list, &trans->new_bgs);
set_avail_alloc_bits(extent_root->fs_info, type);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index b08ea4717e9..8036d3a8485 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -45,6 +45,7 @@ struct extent_page_data {
struct bio *bio;
struct extent_io_tree *tree;
get_extent_t *get_extent;
+ unsigned long bio_flags;
/* tells writepage not to lock the state bits for this range
* it still does the unlocking
@@ -64,13 +65,13 @@ tree_fs_info(struct extent_io_tree *tree)
int __init extent_io_init(void)
{
- extent_state_cache = kmem_cache_create("extent_state",
+ extent_state_cache = kmem_cache_create("btrfs_extent_state",
sizeof(struct extent_state), 0,
SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
if (!extent_state_cache)
return -ENOMEM;
- extent_buffer_cache = kmem_cache_create("extent_buffers",
+ extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
sizeof(struct extent_buffer), 0,
SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
if (!extent_buffer_cache)
@@ -942,6 +943,7 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits,
* @end: the end offset in bytes (inclusive)
* @bits: the bits to set in this range
* @clear_bits: the bits to clear in this range
+ * @cached_state: state that we're going to cache
* @mask: the allocation mask
*
* This will go through and set bits for the given range. If any states exist
@@ -951,7 +953,8 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits,
* boundary bits like LOCK.
*/
int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- int bits, int clear_bits, gfp_t mask)
+ int bits, int clear_bits,
+ struct extent_state **cached_state, gfp_t mask)
{
struct extent_state *state;
struct extent_state *prealloc = NULL;
@@ -968,6 +971,15 @@ again:
}
spin_lock(&tree->lock);
+ if (cached_state && *cached_state) {
+ state = *cached_state;
+ if (state->start <= start && state->end > start &&
+ state->tree) {
+ node = &state->rb_node;
+ goto hit_next;
+ }
+ }
+
/*
* this search will find all the extents that end after
* our range starts.
@@ -998,6 +1010,7 @@ hit_next:
*/
if (state->start == start && state->end <= end) {
set_state_bits(tree, state, &bits);
+ cache_state(state, cached_state);
state = clear_state_bit(tree, state, &clear_bits, 0);
if (last_end == (u64)-1)
goto out;
@@ -1038,6 +1051,7 @@ hit_next:
goto out;
if (state->end <= end) {
set_state_bits(tree, state, &bits);
+ cache_state(state, cached_state);
state = clear_state_bit(tree, state, &clear_bits, 0);
if (last_end == (u64)-1)
goto out;
@@ -1076,6 +1090,7 @@ hit_next:
&bits);
if (err)
extent_io_tree_panic(tree, err);
+ cache_state(prealloc, cached_state);
prealloc = NULL;
start = this_end + 1;
goto search_again;
@@ -1098,6 +1113,7 @@ hit_next:
extent_io_tree_panic(tree, err);
set_state_bits(tree, prealloc, &bits);
+ cache_state(prealloc, cached_state);
clear_state_bit(tree, prealloc, &clear_bits, 0);
prealloc = NULL;
goto out;
@@ -1150,6 +1166,14 @@ int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
NULL, cached_state, mask);
}
+int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end,
+ struct extent_state **cached_state, gfp_t mask)
+{
+ return set_extent_bit(tree, start, end,
+ EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG,
+ NULL, cached_state, mask);
+}
+
int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask)
{
@@ -1294,18 +1318,42 @@ out:
* If nothing was found, 1 is returned. If found something, return 0.
*/
int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 *start_ret, u64 *end_ret, int bits)
+ u64 *start_ret, u64 *end_ret, int bits,
+ struct extent_state **cached_state)
{
struct extent_state *state;
+ struct rb_node *n;
int ret = 1;
spin_lock(&tree->lock);
+ if (cached_state && *cached_state) {
+ state = *cached_state;
+ if (state->end == start - 1 && state->tree) {
+ n = rb_next(&state->rb_node);
+ while (n) {
+ state = rb_entry(n, struct extent_state,
+ rb_node);
+ if (state->state & bits)
+ goto got_it;
+ n = rb_next(n);
+ }
+ free_extent_state(*cached_state);
+ *cached_state = NULL;
+ goto out;
+ }
+ free_extent_state(*cached_state);
+ *cached_state = NULL;
+ }
+
state = find_first_extent_bit_state(tree, start, bits);
+got_it:
if (state) {
+ cache_state(state, cached_state);
*start_ret = state->start;
*end_ret = state->end;
ret = 0;
}
+out:
spin_unlock(&tree->lock);
return ret;
}
@@ -2068,7 +2116,7 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page,
}
read_unlock(&em_tree->lock);
- if (!em || IS_ERR(em)) {
+ if (!em) {
kfree(failrec);
return -EIO;
}
@@ -2304,8 +2352,8 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
struct extent_state *cached = NULL;
struct extent_state *state;
- pr_debug("end_bio_extent_readpage: bi_vcnt=%d, idx=%d, err=%d, "
- "mirror=%ld\n", bio->bi_vcnt, bio->bi_idx, err,
+ pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, "
+ "mirror=%ld\n", (u64)bio->bi_sector, err,
(long int)bio->bi_bdev);
tree = &BTRFS_I(page->mapping->host)->io_tree;
@@ -2709,12 +2757,15 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
end_bio_extent_readpage, mirror_num,
*bio_flags,
this_bio_flag);
- BUG_ON(ret == -ENOMEM);
- nr++;
- *bio_flags = this_bio_flag;
+ if (!ret) {
+ nr++;
+ *bio_flags = this_bio_flag;
+ }
}
- if (ret)
+ if (ret) {
SetPageError(page);
+ unlock_extent(tree, cur, cur + iosize - 1);
+ }
cur = cur + iosize;
pg_offset += iosize;
}
@@ -3161,12 +3212,16 @@ static int write_one_eb(struct extent_buffer *eb,
struct block_device *bdev = fs_info->fs_devices->latest_bdev;
u64 offset = eb->start;
unsigned long i, num_pages;
+ unsigned long bio_flags = 0;
int rw = (epd->sync_io ? WRITE_SYNC : WRITE);
int ret = 0;
clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
num_pages = num_extent_pages(eb->start, eb->len);
atomic_set(&eb->io_pages, num_pages);
+ if (btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID)
+ bio_flags = EXTENT_BIO_TREE_LOG;
+
for (i = 0; i < num_pages; i++) {
struct page *p = extent_buffer_page(eb, i);
@@ -3175,7 +3230,8 @@ static int write_one_eb(struct extent_buffer *eb,
ret = submit_extent_page(rw, eb->tree, p, offset >> 9,
PAGE_CACHE_SIZE, 0, bdev, &epd->bio,
-1, end_bio_extent_buffer_writepage,
- 0, 0, 0);
+ 0, epd->bio_flags, bio_flags);
+ epd->bio_flags = bio_flags;
if (ret) {
set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
SetPageError(p);
@@ -3210,6 +3266,7 @@ int btree_write_cache_pages(struct address_space *mapping,
.tree = tree,
.extent_locked = 0,
.sync_io = wbc->sync_mode == WB_SYNC_ALL,
+ .bio_flags = 0,
};
int ret = 0;
int done = 0;
@@ -3254,19 +3311,34 @@ retry:
break;
}
+ spin_lock(&mapping->private_lock);
+ if (!PagePrivate(page)) {
+ spin_unlock(&mapping->private_lock);
+ continue;
+ }
+
eb = (struct extent_buffer *)page->private;
+
+ /*
+ * Shouldn't happen and normally this would be a BUG_ON
+ * but no sense in crashing the user's box for something
+ * we can survive anyway.
+ */
if (!eb) {
+ spin_unlock(&mapping->private_lock);
WARN_ON(1);
continue;
}
- if (eb == prev_eb)
+ if (eb == prev_eb) {
+ spin_unlock(&mapping->private_lock);
continue;
+ }
- if (!atomic_inc_not_zero(&eb->refs)) {
- WARN_ON(1);
+ ret = atomic_inc_not_zero(&eb->refs);
+ spin_unlock(&mapping->private_lock);
+ if (!ret)
continue;
- }
prev_eb = eb;
ret = lock_extent_buffer_for_io(eb, fs_info, &epd);
@@ -3457,7 +3529,7 @@ static void flush_epd_write_bio(struct extent_page_data *epd)
if (epd->sync_io)
rw = WRITE_SYNC;
- ret = submit_one_bio(rw, epd->bio, 0, 0);
+ ret = submit_one_bio(rw, epd->bio, 0, epd->bio_flags);
BUG_ON(ret < 0); /* -ENOMEM */
epd->bio = NULL;
}
@@ -3480,6 +3552,7 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
.get_extent = get_extent,
.extent_locked = 0,
.sync_io = wbc->sync_mode == WB_SYNC_ALL,
+ .bio_flags = 0,
};
ret = __extent_writepage(page, wbc, &epd);
@@ -3504,6 +3577,7 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
.get_extent = get_extent,
.extent_locked = 1,
.sync_io = mode == WB_SYNC_ALL,
+ .bio_flags = 0,
};
struct writeback_control wbc_writepages = {
.sync_mode = mode,
@@ -3543,6 +3617,7 @@ int extent_writepages(struct extent_io_tree *tree,
.get_extent = get_extent,
.extent_locked = 0,
.sync_io = wbc->sync_mode == WB_SYNC_ALL,
+ .bio_flags = 0,
};
ret = extent_write_cache_pages(tree, mapping, wbc,
@@ -3920,18 +3995,6 @@ out:
return ret;
}
-inline struct page *extent_buffer_page(struct extent_buffer *eb,
- unsigned long i)
-{
- return eb->pages[i];
-}
-
-inline unsigned long num_extent_pages(u64 start, u64 len)
-{
- return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
- (start >> PAGE_CACHE_SHIFT);
-}
-
static void __free_extent_buffer(struct extent_buffer *eb)
{
#if LEAK_DEBUG
@@ -4047,7 +4110,7 @@ struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len)
return eb;
err:
- for (i--; i > 0; i--)
+ for (i--; i >= 0; i--)
__free_page(eb->pages[i]);
__free_extent_buffer(eb);
return NULL;
@@ -4192,10 +4255,8 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
for (i = 0; i < num_pages; i++, index++) {
p = find_or_create_page(mapping, index, GFP_NOFS);
- if (!p) {
- WARN_ON(1);
+ if (!p)
goto free_eb;
- }
spin_lock(&mapping->private_lock);
if (PagePrivate(p)) {
@@ -4338,7 +4399,6 @@ static int release_extent_buffer(struct extent_buffer *eb, gfp_t mask)
/* Should be safe to release our pages at this point */
btrfs_release_extent_buffer_page(eb, 0);
-
call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
return 1;
}
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 25900af5b15..711d12b8002 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -27,6 +27,7 @@
* type for this bio
*/
#define EXTENT_BIO_COMPRESSED 1
+#define EXTENT_BIO_TREE_LOG 2
#define EXTENT_BIO_FLAG_SHIFT 16
/* these are bit numbers for test/set bit */
@@ -232,11 +233,15 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask);
int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- int bits, int clear_bits, gfp_t mask);
+ int bits, int clear_bits,
+ struct extent_state **cached_state, gfp_t mask);
int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state **cached_state, gfp_t mask);
+int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end,
+ struct extent_state **cached_state, gfp_t mask);
int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 *start_ret, u64 *end_ret, int bits);
+ u64 *start_ret, u64 *end_ret, int bits,
+ struct extent_state **cached_state);
struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
u64 start, int bits);
int extent_invalidatepage(struct extent_io_tree *tree,
@@ -277,8 +282,18 @@ void free_extent_buffer_stale(struct extent_buffer *eb);
int read_extent_buffer_pages(struct extent_io_tree *tree,
struct extent_buffer *eb, u64 start, int wait,
get_extent_t *get_extent, int mirror_num);
-unsigned long num_extent_pages(u64 start, u64 len);
-struct page *extent_buffer_page(struct extent_buffer *eb, unsigned long i);
+
+static inline unsigned long num_extent_pages(u64 start, u64 len)
+{
+ return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
+ (start >> PAGE_CACHE_SHIFT);
+}
+
+static inline struct page *extent_buffer_page(struct extent_buffer *eb,
+ unsigned long i)
+{
+ return eb->pages[i];
+}
static inline void extent_buffer_get(struct extent_buffer *eb)
{
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 7c97b330145..b8cbc8d5c7f 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -11,7 +11,7 @@ static struct kmem_cache *extent_map_cache;
int __init extent_map_init(void)
{
- extent_map_cache = kmem_cache_create("extent_map",
+ extent_map_cache = kmem_cache_create("btrfs_extent_map",
sizeof(struct extent_map), 0,
SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
if (!extent_map_cache)
@@ -35,6 +35,7 @@ void extent_map_exit(void)
void extent_map_tree_init(struct extent_map_tree *tree)
{
tree->map = RB_ROOT;
+ INIT_LIST_HEAD(&tree->modified_extents);
rwlock_init(&tree->lock);
}
@@ -54,7 +55,9 @@ struct extent_map *alloc_extent_map(void)
em->in_tree = 0;
em->flags = 0;
em->compress_type = BTRFS_COMPRESS_NONE;
+ em->generation = 0;
atomic_set(&em->refs, 1);
+ INIT_LIST_HEAD(&em->list);
return em;
}
@@ -72,6 +75,7 @@ void free_extent_map(struct extent_map *em)
WARN_ON(atomic_read(&em->refs) == 0);
if (atomic_dec_and_test(&em->refs)) {
WARN_ON(em->in_tree);
+ WARN_ON(!list_empty(&em->list));
kmem_cache_free(extent_map_cache, em);
}
}
@@ -198,6 +202,14 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
em->block_len += merge->block_len;
em->block_start = merge->block_start;
merge->in_tree = 0;
+ if (merge->generation > em->generation) {
+ em->mod_start = em->start;
+ em->mod_len = em->len;
+ em->generation = merge->generation;
+ list_move(&em->list, &tree->modified_extents);
+ }
+
+ list_del_init(&merge->list);
rb_erase(&merge->rb_node, &tree->map);
free_extent_map(merge);
}
@@ -211,14 +223,34 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
em->block_len += merge->len;
rb_erase(&merge->rb_node, &tree->map);
merge->in_tree = 0;
+ if (merge->generation > em->generation) {
+ em->mod_len = em->len;
+ em->generation = merge->generation;
+ list_move(&em->list, &tree->modified_extents);
+ }
+ list_del_init(&merge->list);
free_extent_map(merge);
}
}
-int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len)
+/**
+ * unpin_extent_cache - unpin an extent from the cache
+ * @tree: tree to unpin the extent in
+ * @start: logical offset in the file
+ * @len: length of the extent
+ * @gen: generation that this extent has been modified in
+ *
+ * Called after an extent has been written to disk properly. Set the generation
+ * to the generation that actually added the file item to the inode so we know
+ * we need to sync this extent when we call fsync(). If the extent was
+ * preallocated, the prealloc flag is cleared as well.
+ */
+int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len,
+ u64 gen)
{
int ret = 0;
struct extent_map *em;
+ bool prealloc = false;
write_lock(&tree->lock);
em = lookup_extent_mapping(tree, start, len);
@@ -228,10 +260,24 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len)
if (!em)
goto out;
+ list_move(&em->list, &tree->modified_extents);
+ em->generation = gen;
clear_bit(EXTENT_FLAG_PINNED, &em->flags);
+ em->mod_start = em->start;
+ em->mod_len = em->len;
+
+ if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+ prealloc = true;
+ clear_bit(EXTENT_FLAG_PREALLOC, &em->flags);
+ }
try_merge_map(tree, em);
+ if (prealloc) {
+ em->mod_start = em->start;
+ em->mod_len = em->len;
+ }
+
free_extent_map(em);
out:
write_unlock(&tree->lock);
@@ -269,6 +315,9 @@ int add_extent_mapping(struct extent_map_tree *tree,
}
atomic_inc(&em->refs);
+ em->mod_start = em->start;
+ em->mod_len = em->len;
+
try_merge_map(tree, em);
out:
return ret;
@@ -358,6 +407,8 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags));
rb_erase(&em->rb_node, &tree->map);
+ if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
+ list_del_init(&em->list);
em->in_tree = 0;
return ret;
}
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 1195f09761f..679225555f7 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -13,6 +13,7 @@
#define EXTENT_FLAG_COMPRESSED 1
#define EXTENT_FLAG_VACANCY 2 /* no file extent item found */
#define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */
+#define EXTENT_FLAG_LOGGING 4 /* Logging this extent */
struct extent_map {
struct rb_node rb_node;
@@ -20,18 +21,23 @@ struct extent_map {
/* all of these are in bytes */
u64 start;
u64 len;
+ u64 mod_start;
+ u64 mod_len;
u64 orig_start;
u64 block_start;
u64 block_len;
+ u64 generation;
unsigned long flags;
struct block_device *bdev;
atomic_t refs;
unsigned int in_tree;
unsigned int compress_type;
+ struct list_head list;
};
struct extent_map_tree {
struct rb_root map;
+ struct list_head modified_extents;
rwlock_t lock;
};
@@ -60,7 +66,7 @@ struct extent_map *alloc_extent_map(void);
void free_extent_map(struct extent_map *em);
int __init extent_map_init(void);
void extent_map_exit(void);
-int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len);
+int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, u64 gen);
struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
u64 start, u64 len);
#endif
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 857d93cd01d..1ad08e4e4a1 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -25,11 +25,12 @@
#include "transaction.h"
#include "print-tree.h"
-#define __MAX_CSUM_ITEMS(r, size) ((((BTRFS_LEAF_DATA_SIZE(r) - \
+#define __MAX_CSUM_ITEMS(r, size) ((unsigned long)(((BTRFS_LEAF_DATA_SIZE(r) - \
sizeof(struct btrfs_item) * 2) / \
size) - 1))
-#define MAX_CSUM_ITEMS(r, size) (min(__MAX_CSUM_ITEMS(r, size), PAGE_CACHE_SIZE))
+#define MAX_CSUM_ITEMS(r, size) (min_t(u32, __MAX_CSUM_ITEMS(r, size), \
+ PAGE_CACHE_SIZE))
#define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \
sizeof(struct btrfs_ordered_sum)) / \
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index f6b40e86121..9ab1bed8811 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -39,6 +39,7 @@
#include "tree-log.h"
#include "locking.h"
#include "compat.h"
+#include "volumes.h"
/*
* when auto defrag is enabled we
@@ -458,14 +459,15 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
* this drops all the extents in the cache that intersect the range
* [start, end]. Existing extents are split as required.
*/
-int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
- int skip_pinned)
+void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
+ int skip_pinned)
{
struct extent_map *em;
struct extent_map *split = NULL;
struct extent_map *split2 = NULL;
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
u64 len = end - start + 1;
+ u64 gen;
int ret;
int testend = 1;
unsigned long flags;
@@ -477,11 +479,14 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
testend = 0;
}
while (1) {
+ int no_splits = 0;
+
if (!split)
split = alloc_extent_map();
if (!split2)
split2 = alloc_extent_map();
- BUG_ON(!split || !split2); /* -ENOMEM */
+ if (!split || !split2)
+ no_splits = 1;
write_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, start, len);
@@ -490,6 +495,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
break;
}
flags = em->flags;
+ gen = em->generation;
if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
if (testend && em->start + em->len >= start + len) {
free_extent_map(em);
@@ -506,6 +512,8 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
clear_bit(EXTENT_FLAG_PINNED, &em->flags);
remove_extent_mapping(em_tree, em);
+ if (no_splits)
+ goto next;
if (em->block_start < EXTENT_MAP_LAST_BYTE &&
em->start < start) {
@@ -518,12 +526,13 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
split->block_len = em->block_len;
else
split->block_len = split->len;
-
+ split->generation = gen;
split->bdev = em->bdev;
split->flags = flags;
split->compress_type = em->compress_type;
ret = add_extent_mapping(em_tree, split);
BUG_ON(ret); /* Logic error */
+ list_move(&split->list, &em_tree->modified_extents);
free_extent_map(split);
split = split2;
split2 = NULL;
@@ -537,6 +546,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
split->bdev = em->bdev;
split->flags = flags;
split->compress_type = em->compress_type;
+ split->generation = gen;
if (compressed) {
split->block_len = em->block_len;
@@ -550,9 +560,11 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
ret = add_extent_mapping(em_tree, split);
BUG_ON(ret); /* Logic error */
+ list_move(&split->list, &em_tree->modified_extents);
free_extent_map(split);
split = NULL;
}
+next:
write_unlock(&em_tree->lock);
/* once for us */
@@ -564,7 +576,6 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
free_extent_map(split);
if (split2)
free_extent_map(split2);
- return 0;
}
/*
@@ -576,13 +587,13 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
* it is either truncated or split. Anything entirely inside the range
* is deleted from the tree.
*/
-int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
- u64 start, u64 end, u64 *hint_byte, int drop_cache)
+int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode,
+ struct btrfs_path *path, u64 start, u64 end,
+ u64 *drop_end, int drop_cache)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
struct extent_buffer *leaf;
struct btrfs_file_extent_item *fi;
- struct btrfs_path *path;
struct btrfs_key key;
struct btrfs_key new_key;
u64 ino = btrfs_ino(inode);
@@ -597,14 +608,12 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
int recow;
int ret;
int modify_tree = -1;
+ int update_refs = (root->ref_cows || root == root->fs_info->tree_root);
+ int found = 0;
if (drop_cache)
btrfs_drop_extent_cache(inode, start, end - 1, 0);
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
if (start >= BTRFS_I(inode)->disk_i_size)
modify_tree = 0;
@@ -666,6 +675,7 @@ next_slot:
goto next_slot;
}
+ found = 1;
search_start = max(key.offset, start);
if (recow || !modify_tree) {
modify_tree = -1;
@@ -707,14 +717,13 @@ next_slot:
extent_end - start);
btrfs_mark_buffer_dirty(leaf);
- if (disk_bytenr > 0) {
+ if (update_refs && disk_bytenr > 0) {
ret = btrfs_inc_extent_ref(trans, root,
disk_bytenr, num_bytes, 0,
root->root_key.objectid,
new_key.objectid,
start - extent_offset, 0);
BUG_ON(ret); /* -ENOMEM */
- *hint_byte = disk_bytenr;
}
key.offset = start;
}
@@ -734,10 +743,8 @@ next_slot:
btrfs_set_file_extent_num_bytes(leaf, fi,
extent_end - end);
btrfs_mark_buffer_dirty(leaf);
- if (disk_bytenr > 0) {
+ if (update_refs && disk_bytenr > 0)
inode_sub_bytes(inode, end - key.offset);
- *hint_byte = disk_bytenr;
- }
break;
}
@@ -753,10 +760,8 @@ next_slot:
btrfs_set_file_extent_num_bytes(leaf, fi,
start - key.offset);
btrfs_mark_buffer_dirty(leaf);
- if (disk_bytenr > 0) {
+ if (update_refs && disk_bytenr > 0)
inode_sub_bytes(inode, extent_end - start);
- *hint_byte = disk_bytenr;
- }
if (end == extent_end)
break;
@@ -777,12 +782,13 @@ next_slot:
del_nr++;
}
- if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+ if (update_refs &&
+ extent_type == BTRFS_FILE_EXTENT_INLINE) {
inode_sub_bytes(inode,
extent_end - key.offset);
extent_end = ALIGN(extent_end,
root->sectorsize);
- } else if (disk_bytenr > 0) {
+ } else if (update_refs && disk_bytenr > 0) {
ret = btrfs_free_extent(trans, root,
disk_bytenr, num_bytes, 0,
root->root_key.objectid,
@@ -791,7 +797,6 @@ next_slot:
BUG_ON(ret); /* -ENOMEM */
inode_sub_bytes(inode,
extent_end - key.offset);
- *hint_byte = disk_bytenr;
}
if (end == extent_end)
@@ -806,7 +811,7 @@ next_slot:
del_nr);
if (ret) {
btrfs_abort_transaction(trans, root, ret);
- goto out;
+ break;
}
del_nr = 0;
@@ -825,7 +830,24 @@ next_slot:
btrfs_abort_transaction(trans, root, ret);
}
-out:
+ if (drop_end)
+ *drop_end = found ? min(end, extent_end) : end;
+ btrfs_release_path(path);
+ return ret;
+}
+
+int btrfs_drop_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode, u64 start,
+ u64 end, int drop_cache)
+{
+ struct btrfs_path *path;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ ret = __btrfs_drop_extents(trans, root, inode, path, start, end, NULL,
+ drop_cache);
btrfs_free_path(path);
return ret;
}
@@ -892,8 +914,6 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
int ret;
u64 ino = btrfs_ino(inode);
- btrfs_drop_extent_cache(inode, start, end - 1, 0);
-
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -935,12 +955,16 @@ again:
btrfs_set_item_key_safe(trans, root, path, &new_key);
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
+ btrfs_set_file_extent_generation(leaf, fi,
+ trans->transid);
btrfs_set_file_extent_num_bytes(leaf, fi,
extent_end - end);
btrfs_set_file_extent_offset(leaf, fi,
end - orig_offset);
fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
struct btrfs_file_extent_item);
+ btrfs_set_file_extent_generation(leaf, fi,
+ trans->transid);
btrfs_set_file_extent_num_bytes(leaf, fi,
end - other_start);
btrfs_mark_buffer_dirty(leaf);
@@ -958,12 +982,16 @@ again:
struct btrfs_file_extent_item);
btrfs_set_file_extent_num_bytes(leaf, fi,
start - key.offset);
+ btrfs_set_file_extent_generation(leaf, fi,
+ trans->transid);
path->slots[0]++;
new_key.offset = start;
btrfs_set_item_key_safe(trans, root, path, &new_key);
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
+ btrfs_set_file_extent_generation(leaf, fi,
+ trans->transid);
btrfs_set_file_extent_num_bytes(leaf, fi,
other_end - start);
btrfs_set_file_extent_offset(leaf, fi,
@@ -991,12 +1019,14 @@ again:
leaf = path->nodes[0];
fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
struct btrfs_file_extent_item);
+ btrfs_set_file_extent_generation(leaf, fi, trans->transid);
btrfs_set_file_extent_num_bytes(leaf, fi,
split - key.offset);
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
+ btrfs_set_file_extent_generation(leaf, fi, trans->transid);
btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
btrfs_set_file_extent_num_bytes(leaf, fi,
extent_end - split);
@@ -1056,12 +1086,14 @@ again:
struct btrfs_file_extent_item);
btrfs_set_file_extent_type(leaf, fi,
BTRFS_FILE_EXTENT_REG);
+ btrfs_set_file_extent_generation(leaf, fi, trans->transid);
btrfs_mark_buffer_dirty(leaf);
} else {
fi = btrfs_item_ptr(leaf, del_slot - 1,
struct btrfs_file_extent_item);
btrfs_set_file_extent_type(leaf, fi,
BTRFS_FILE_EXTENT_REG);
+ btrfs_set_file_extent_generation(leaf, fi, trans->transid);
btrfs_set_file_extent_num_bytes(leaf, fi,
extent_end - key.offset);
btrfs_mark_buffer_dirty(leaf);
@@ -1173,8 +1205,8 @@ again:
clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos,
last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING, 0, 0, &cached_state,
- GFP_NOFS);
+ EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
+ 0, 0, &cached_state, GFP_NOFS);
unlock_extent_cached(&BTRFS_I(inode)->io_tree,
start_pos, last_pos - 1, &cached_state,
GFP_NOFS);
@@ -1514,16 +1546,24 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
trace_btrfs_sync_file(file, datasync);
+ /*
+ * Write out the dirty pages in the range and wait for them to complete
+ * before taking ->i_mutex. That way several tasks can flush dirty pages
+ * concurrently, which improves performance.
+ */
+ ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ if (ret)
+ return ret;
+
mutex_lock(&inode->i_mutex);
/*
- * we wait first, since the writeback may change the inode, also wait
- * ordered range does a filemape_write_and_wait_range which is why we
- * don't do it above like other file systems.
+ * We flush the dirty pages again to avoid some dirty pages in the
+ * range being left.
*/
- root->log_batch++;
+ atomic_inc(&root->log_batch);
btrfs_wait_ordered_range(inode, start, end);
- root->log_batch++;
+ atomic_inc(&root->log_batch);
/*
* check the transaction that last modified this inode
@@ -1544,6 +1584,14 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
BTRFS_I(inode)->last_trans <=
root->fs_info->last_trans_committed) {
BTRFS_I(inode)->last_trans = 0;
+
+ /*
+ * We've had everything committed since the last time we were
+ * modified so clear this flag in case it was set for whatever
+ * reason, it's no longer relevant.
+ */
+ clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags);
mutex_unlock(&inode->i_mutex);
goto out;
}
@@ -1615,6 +1663,324 @@ static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
return 0;
}
+static int hole_mergeable(struct inode *inode, struct extent_buffer *leaf,
+ int slot, u64 start, u64 end)
+{
+ struct btrfs_file_extent_item *fi;
+ struct btrfs_key key;
+
+ if (slot < 0 || slot >= btrfs_header_nritems(leaf))
+ return 0;
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (key.objectid != btrfs_ino(inode) ||
+ key.type != BTRFS_EXTENT_DATA_KEY)
+ return 0;
+
+ fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+
+ if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
+ return 0;
+
+ if (btrfs_file_extent_disk_bytenr(leaf, fi))
+ return 0;
+
+ if (key.offset == end)
+ return 1;
+ if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start)
+ return 1;
+ return 0;
+}
+
+static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode,
+ struct btrfs_path *path, u64 offset, u64 end)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct extent_buffer *leaf;
+ struct btrfs_file_extent_item *fi;
+ struct extent_map *hole_em;
+ struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ struct btrfs_key key;
+ int ret;
+
+ key.objectid = btrfs_ino(inode);
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = offset;
+
+
+ ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+ if (ret < 0)
+ return ret;
+ BUG_ON(!ret);
+
+ leaf = path->nodes[0];
+ if (hole_mergeable(inode, leaf, path->slots[0]-1, offset, end)) {
+ u64 num_bytes;
+
+ path->slots[0]--;
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ num_bytes = btrfs_file_extent_num_bytes(leaf, fi) +
+ end - offset;
+ btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
+ btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
+ btrfs_set_file_extent_offset(leaf, fi, 0);
+ btrfs_mark_buffer_dirty(leaf);
+ goto out;
+ }
+
+ if (hole_mergeable(inode, leaf, path->slots[0]+1, offset, end)) {
+ u64 num_bytes;
+
+ path->slots[0]++;
+ key.offset = offset;
+ btrfs_set_item_key_safe(trans, root, path, &key);
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end -
+ offset;
+ btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
+ btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
+ btrfs_set_file_extent_offset(leaf, fi, 0);
+ btrfs_mark_buffer_dirty(leaf);
+ goto out;
+ }
+ btrfs_release_path(path);
+
+ ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset,
+ 0, 0, end - offset, 0, end - offset,
+ 0, 0, 0);
+ if (ret)
+ return ret;
+
+out:
+ btrfs_release_path(path);
+
+ hole_em = alloc_extent_map();
+ if (!hole_em) {
+ btrfs_drop_extent_cache(inode, offset, end - 1, 0);
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags);
+ } else {
+ hole_em->start = offset;
+ hole_em->len = end - offset;
+ hole_em->orig_start = offset;
+
+ hole_em->block_start = EXTENT_MAP_HOLE;
+ hole_em->block_len = 0;
+ hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
+ hole_em->compress_type = BTRFS_COMPRESS_NONE;
+ hole_em->generation = trans->transid;
+
+ do {
+ btrfs_drop_extent_cache(inode, offset, end - 1, 0);
+ write_lock(&em_tree->lock);
+ ret = add_extent_mapping(em_tree, hole_em);
+ if (!ret)
+ list_move(&hole_em->list,
+ &em_tree->modified_extents);
+ write_unlock(&em_tree->lock);
+ } while (ret == -EEXIST);
+ free_extent_map(hole_em);
+ if (ret)
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags);
+ }
+
+ return 0;
+}
+
+static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct extent_state *cached_state = NULL;
+ struct btrfs_path *path;
+ struct btrfs_block_rsv *rsv;
+ struct btrfs_trans_handle *trans;
+ u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
+ u64 lockstart = (offset + mask) & ~mask;
+ u64 lockend = ((offset + len) & ~mask) - 1;
+ u64 cur_offset = lockstart;
+ u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
+ u64 drop_end;
+ unsigned long nr;
+ int ret = 0;
+ int err = 0;
+ bool same_page = (offset >> PAGE_CACHE_SHIFT) ==
+ ((offset + len) >> PAGE_CACHE_SHIFT);
+
+ btrfs_wait_ordered_range(inode, offset, len);
+
+ mutex_lock(&inode->i_mutex);
+ if (offset >= inode->i_size) {
+ mutex_unlock(&inode->i_mutex);
+ return 0;
+ }
+
+ /*
+ * Only do this if we are in the same page and we aren't doing the
+ * entire page.
+ */
+ if (same_page && len < PAGE_CACHE_SIZE) {
+ ret = btrfs_truncate_page(inode, offset, len, 0);
+ mutex_unlock(&inode->i_mutex);
+ return ret;
+ }
+
+ /* zero back part of the first page */
+ ret = btrfs_truncate_page(inode, offset, 0, 0);
+ if (ret) {
+ mutex_unlock(&inode->i_mutex);
+ return ret;
+ }
+
+ /* zero the front end of the last page */
+ ret = btrfs_truncate_page(inode, offset + len, 0, 1);
+ if (ret) {
+ mutex_unlock(&inode->i_mutex);
+ return ret;
+ }
+
+ if (lockend < lockstart) {
+ mutex_unlock(&inode->i_mutex);
+ return 0;
+ }
+
+ while (1) {
+ struct btrfs_ordered_extent *ordered;
+
+ truncate_pagecache_range(inode, lockstart, lockend);
+
+ lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ 0, &cached_state);
+ ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
+
+ /*
+ * We need to make sure we have no ordered extents in this range
+ * and nobody raced in and read a page in this range, if we did
+ * we need to try again.
+ */
+ if ((!ordered ||
+ (ordered->file_offset + ordered->len < lockstart ||
+ ordered->file_offset > lockend)) &&
+ !test_range_bit(&BTRFS_I(inode)->io_tree, lockstart,
+ lockend, EXTENT_UPTODATE, 0,
+ cached_state)) {
+ if (ordered)
+ btrfs_put_ordered_extent(ordered);
+ break;
+ }
+ if (ordered)
+ btrfs_put_ordered_extent(ordered);
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
+ lockend, &cached_state, GFP_NOFS);
+ btrfs_wait_ordered_range(inode, lockstart,
+ lockend - lockstart + 1);
+ }
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
+ if (!rsv) {
+ ret = -ENOMEM;
+ goto out_free;
+ }
+ rsv->size = btrfs_calc_trunc_metadata_size(root, 1);
+ rsv->failfast = 1;
+
+ /*
+ * 1 - update the inode
+ * 1 - removing the extents in the range
+ * 1 - adding the hole extent
+ */
+ trans = btrfs_start_transaction(root, 3);
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ goto out_free;
+ }
+
+ ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv,
+ min_size);
+ BUG_ON(ret);
+ trans->block_rsv = rsv;
+
+ while (cur_offset < lockend) {
+ ret = __btrfs_drop_extents(trans, root, inode, path,
+ cur_offset, lockend + 1,
+ &drop_end, 1);
+ if (ret != -ENOSPC)
+ break;
+
+ trans->block_rsv = &root->fs_info->trans_block_rsv;
+
+ ret = fill_holes(trans, inode, path, cur_offset, drop_end);
+ if (ret) {
+ err = ret;
+ break;
+ }
+
+ cur_offset = drop_end;
+
+ ret = btrfs_update_inode(trans, root, inode);
+ if (ret) {
+ err = ret;
+ break;
+ }
+
+ nr = trans->blocks_used;
+ btrfs_end_transaction(trans, root);
+ btrfs_btree_balance_dirty(root, nr);
+
+ trans = btrfs_start_transaction(root, 3);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ trans = NULL;
+ break;
+ }
+
+ ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv,
+ rsv, min_size);
+ BUG_ON(ret); /* shouldn't happen */
+ trans->block_rsv = rsv;
+ }
+
+ if (ret) {
+ err = ret;
+ goto out_trans;
+ }
+
+ trans->block_rsv = &root->fs_info->trans_block_rsv;
+ ret = fill_holes(trans, inode, path, cur_offset, drop_end);
+ if (ret) {
+ err = ret;
+ goto out_trans;
+ }
+
+out_trans:
+ if (!trans)
+ goto out_free;
+
+ trans->block_rsv = &root->fs_info->trans_block_rsv;
+ ret = btrfs_update_inode(trans, root, inode);
+ nr = trans->blocks_used;
+ btrfs_end_transaction(trans, root);
+ btrfs_btree_balance_dirty(root, nr);
+out_free:
+ btrfs_free_path(path);
+ btrfs_free_block_rsv(root, rsv);
+out:
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ &cached_state, GFP_NOFS);
+ mutex_unlock(&inode->i_mutex);
+ if (ret && !err)
+ err = ret;
+ return err;
+}
+
static long btrfs_fallocate(struct file *file, int mode,
loff_t offset, loff_t len)
{
@@ -1633,15 +1999,18 @@ static long btrfs_fallocate(struct file *file, int mode,
alloc_start = offset & ~mask;
alloc_end = (offset + len + mask) & ~mask;
- /* We only support the FALLOC_FL_KEEP_SIZE mode */
- if (mode & ~FALLOC_FL_KEEP_SIZE)
+ /* Make sure we aren't being given some crap mode */
+ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
return -EOPNOTSUPP;
+ if (mode & FALLOC_FL_PUNCH_HOLE)
+ return btrfs_punch_hole(inode, offset, len);
+
/*
* Make sure we have enough space before we do the
* allocation.
*/
- ret = btrfs_check_data_free_space(inode, len);
+ ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start + 1);
if (ret)
return ret;
@@ -1748,7 +2117,7 @@ static long btrfs_fallocate(struct file *file, int mode,
out:
mutex_unlock(&inode->i_mutex);
/* Let go of our reservation. */
- btrfs_free_reserved_data_space(inode, len);
+ btrfs_free_reserved_data_space(inode, alloc_end - alloc_start + 1);
return ret;
}
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 6b10acfc2f5..1027b854b90 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -966,7 +966,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
block_group->key.offset)) {
ret = find_first_extent_bit(unpin, start,
&extent_start, &extent_end,
- EXTENT_DIRTY);
+ EXTENT_DIRTY, NULL);
if (ret) {
ret = 0;
break;
@@ -1454,9 +1454,7 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl,
max_t(u64, *offset, bitmap_info->offset));
bits = bytes_to_bits(*bytes, ctl->unit);
- for (i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i);
- i < BITS_PER_BITMAP;
- i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i + 1)) {
+ for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) {
next_zero = find_next_zero_bit(bitmap_info->bitmap,
BITS_PER_BITMAP, i);
if ((next_zero - i) >= bits) {
@@ -2307,9 +2305,7 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
again:
found_bits = 0;
- for (i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i);
- i < BITS_PER_BITMAP;
- i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) {
+ for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) {
next_zero = find_next_zero_bit(entry->bitmap,
BITS_PER_BITMAP, i);
if (next_zero - i >= min_bits) {
diff --git a/fs/btrfs/hash.h b/fs/btrfs/hash.h
index db2ff9773b9..1d982812ab6 100644
--- a/fs/btrfs/hash.h
+++ b/fs/btrfs/hash.h
@@ -24,4 +24,14 @@ static inline u64 btrfs_name_hash(const char *name, int len)
{
return crc32c((u32)~1, name, len);
}
+
+/*
+ * Figure the key offset of an extended inode ref
+ */
+static inline u64 btrfs_extref_hash(u64 parent_objectid, const char *name,
+ int len)
+{
+ return (u64) crc32c(parent_objectid, name, len);
+}
+
#endif
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index a13cf1a96c7..48b8fda9313 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -18,6 +18,7 @@
#include "ctree.h"
#include "disk-io.h"
+#include "hash.h"
#include "transaction.h"
#include "print-tree.h"
@@ -50,18 +51,57 @@ static int find_name_in_backref(struct btrfs_path *path, const char *name,
return 0;
}
-struct btrfs_inode_ref *
+int btrfs_find_name_in_ext_backref(struct btrfs_path *path, u64 ref_objectid,
+ const char *name, int name_len,
+ struct btrfs_inode_extref **extref_ret)
+{
+ struct extent_buffer *leaf;
+ struct btrfs_inode_extref *extref;
+ unsigned long ptr;
+ unsigned long name_ptr;
+ u32 item_size;
+ u32 cur_offset = 0;
+ int ref_name_len;
+
+ leaf = path->nodes[0];
+ item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+
+ /*
+ * Search all extended backrefs in this item. We're only
+ * looking through any collisions so most of the time this is
+ * just going to compare against one buffer. If all is well,
+ * we'll return success and the inode ref object.
+ */
+ while (cur_offset < item_size) {
+ extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
+ name_ptr = (unsigned long)(&extref->name);
+ ref_name_len = btrfs_inode_extref_name_len(leaf, extref);
+
+ if (ref_name_len == name_len &&
+ btrfs_inode_extref_parent(leaf, extref) == ref_objectid &&
+ (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0)) {
+ if (extref_ret)
+ *extref_ret = extref;
+ return 1;
+ }
+
+ cur_offset += ref_name_len + sizeof(*extref);
+ }
+ return 0;
+}
+
+static struct btrfs_inode_ref *
btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- const char *name, int name_len,
- u64 inode_objectid, u64 ref_objectid, int mod)
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ const char *name, int name_len,
+ u64 inode_objectid, u64 ref_objectid, int ins_len,
+ int cow)
{
+ int ret;
struct btrfs_key key;
struct btrfs_inode_ref *ref;
- int ins_len = mod < 0 ? -1 : 0;
- int cow = mod != 0;
- int ret;
key.objectid = inode_objectid;
key.type = BTRFS_INODE_REF_KEY;
@@ -77,13 +117,150 @@ btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans,
return ref;
}
-int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
+/* Returns NULL if no extref found */
+struct btrfs_inode_extref *
+btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ const char *name, int name_len,
+ u64 inode_objectid, u64 ref_objectid, int ins_len,
+ int cow)
+{
+ int ret;
+ struct btrfs_key key;
+ struct btrfs_inode_extref *extref;
+
+ key.objectid = inode_objectid;
+ key.type = BTRFS_INODE_EXTREF_KEY;
+ key.offset = btrfs_extref_hash(ref_objectid, name, name_len);
+
+ ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
+ if (ret < 0)
+ return ERR_PTR(ret);
+ if (ret > 0)
+ return NULL;
+ if (!btrfs_find_name_in_ext_backref(path, ref_objectid, name, name_len, &extref))
+ return NULL;
+ return extref;
+}
+
+int btrfs_get_inode_ref_index(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ const char *name, int name_len,
+ u64 inode_objectid, u64 ref_objectid, int mod,
+ u64 *ret_index)
+{
+ struct btrfs_inode_ref *ref;
+ struct btrfs_inode_extref *extref;
+ int ins_len = mod < 0 ? -1 : 0;
+ int cow = mod != 0;
+
+ ref = btrfs_lookup_inode_ref(trans, root, path, name, name_len,
+ inode_objectid, ref_objectid, ins_len,
+ cow);
+ if (IS_ERR(ref))
+ return PTR_ERR(ref);
+
+ if (ref != NULL) {
+ *ret_index = btrfs_inode_ref_index(path->nodes[0], ref);
+ return 0;
+ }
+
+ btrfs_release_path(path);
+
+ extref = btrfs_lookup_inode_extref(trans, root, path, name,
+ name_len, inode_objectid,
+ ref_objectid, ins_len, cow);
+ if (IS_ERR(extref))
+ return PTR_ERR(extref);
+
+ if (extref) {
+ *ret_index = btrfs_inode_extref_index(path->nodes[0], extref);
+ return 0;
+ }
+
+ return -ENOENT;
+}
+
+int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
const char *name, int name_len,
u64 inode_objectid, u64 ref_objectid, u64 *index)
{
struct btrfs_path *path;
struct btrfs_key key;
+ struct btrfs_inode_extref *extref;
+ struct extent_buffer *leaf;
+ int ret;
+ int del_len = name_len + sizeof(*extref);
+ unsigned long ptr;
+ unsigned long item_start;
+ u32 item_size;
+
+ key.objectid = inode_objectid;
+ btrfs_set_key_type(&key, BTRFS_INODE_EXTREF_KEY);
+ key.offset = btrfs_extref_hash(ref_objectid, name, name_len);
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ path->leave_spinning = 1;
+
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret > 0)
+ ret = -ENOENT;
+ if (ret < 0)
+ goto out;
+
+ /*
+ * Sanity check - did we find the right item for this name?
+ * This should always succeed so error here will make the FS
+ * readonly.
+ */
+ if (!btrfs_find_name_in_ext_backref(path, ref_objectid,
+ name, name_len, &extref)) {
+ btrfs_std_error(root->fs_info, -ENOENT);
+ ret = -EROFS;
+ goto out;
+ }
+
+ leaf = path->nodes[0];
+ item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ if (index)
+ *index = btrfs_inode_extref_index(leaf, extref);
+
+ if (del_len == item_size) {
+ /*
+ * Common case only one ref in the item, remove the
+ * whole item.
+ */
+ ret = btrfs_del_item(trans, root, path);
+ goto out;
+ }
+
+ ptr = (unsigned long)extref;
+ item_start = btrfs_item_ptr_offset(leaf, path->slots[0]);
+
+ memmove_extent_buffer(leaf, ptr, ptr + del_len,
+ item_size - (ptr + del_len - item_start));
+
+ btrfs_truncate_item(trans, root, path, item_size - del_len, 1);
+
+out:
+ btrfs_free_path(path);
+
+ return ret;
+}
+
+int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const char *name, int name_len,
+ u64 inode_objectid, u64 ref_objectid, u64 *index)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
struct btrfs_inode_ref *ref;
struct extent_buffer *leaf;
unsigned long ptr;
@@ -91,6 +268,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
u32 item_size;
u32 sub_item_len;
int ret;
+ int search_ext_refs = 0;
int del_len = name_len + sizeof(*ref);
key.objectid = inode_objectid;
@@ -106,12 +284,14 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret > 0) {
ret = -ENOENT;
+ search_ext_refs = 1;
goto out;
} else if (ret < 0) {
goto out;
}
if (!find_name_in_backref(path, name, name_len, &ref)) {
ret = -ENOENT;
+ search_ext_refs = 1;
goto out;
}
leaf = path->nodes[0];
@@ -129,8 +309,78 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
item_start = btrfs_item_ptr_offset(leaf, path->slots[0]);
memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
item_size - (ptr + sub_item_len - item_start));
- btrfs_truncate_item(trans, root, path,
- item_size - sub_item_len, 1);
+ btrfs_truncate_item(trans, root, path, item_size - sub_item_len, 1);
+out:
+ btrfs_free_path(path);
+
+ if (search_ext_refs) {
+ /*
+ * No refs were found, or we could not find the
+ * name in our ref array. Find and remove the extended
+ * inode ref then.
+ */
+ return btrfs_del_inode_extref(trans, root, name, name_len,
+ inode_objectid, ref_objectid, index);
+ }
+
+ return ret;
+}
+
+/*
+ * btrfs_insert_inode_extref() - Inserts an extended inode ref into a tree.
+ *
+ * The caller must have checked against BTRFS_LINK_MAX already.
+ */
+static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const char *name, int name_len,
+ u64 inode_objectid, u64 ref_objectid, u64 index)
+{
+ struct btrfs_inode_extref *extref;
+ int ret;
+ int ins_len = name_len + sizeof(*extref);
+ unsigned long ptr;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct extent_buffer *leaf;
+ struct btrfs_item *item;
+
+ key.objectid = inode_objectid;
+ key.type = BTRFS_INODE_EXTREF_KEY;
+ key.offset = btrfs_extref_hash(ref_objectid, name, name_len);
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ path->leave_spinning = 1;
+ ret = btrfs_insert_empty_item(trans, root, path, &key,
+ ins_len);
+ if (ret == -EEXIST) {
+ if (btrfs_find_name_in_ext_backref(path, ref_objectid,
+ name, name_len, NULL))
+ goto out;
+
+ btrfs_extend_item(trans, root, path, ins_len);
+ ret = 0;
+ }
+ if (ret < 0)
+ goto out;
+
+ leaf = path->nodes[0];
+ item = btrfs_item_nr(leaf, path->slots[0]);
+ ptr = (unsigned long)btrfs_item_ptr(leaf, path->slots[0], char);
+ ptr += btrfs_item_size(leaf, item) - ins_len;
+ extref = (struct btrfs_inode_extref *)ptr;
+
+ btrfs_set_inode_extref_name_len(path->nodes[0], extref, name_len);
+ btrfs_set_inode_extref_index(path->nodes[0], extref, index);
+ btrfs_set_inode_extref_parent(path->nodes[0], extref, ref_objectid);
+
+ ptr = (unsigned long)&extref->name;
+ write_extent_buffer(path->nodes[0], name, ptr, name_len);
+ btrfs_mark_buffer_dirty(path->nodes[0]);
+
out:
btrfs_free_path(path);
return ret;
@@ -191,6 +441,19 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
out:
btrfs_free_path(path);
+
+ if (ret == -EMLINK) {
+ struct btrfs_super_block *disk_super = root->fs_info->super_copy;
+ /* We ran out of space in the ref array. Need to
+ * add an extended ref. */
+ if (btrfs_super_incompat_flags(disk_super)
+ & BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
+ ret = btrfs_insert_inode_extref(trans, root, name,
+ name_len,
+ inode_objectid,
+ ref_objectid, index);
+ }
+
return ret;
}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a6ed6944e50..85a1e5053fe 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -230,7 +230,6 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
u64 inline_len = actual_end - start;
u64 aligned_end = (end + root->sectorsize - 1) &
~((u64)root->sectorsize - 1);
- u64 hint_byte;
u64 data_len = inline_len;
int ret;
@@ -247,8 +246,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
return 1;
}
- ret = btrfs_drop_extents(trans, inode, start, aligned_end,
- &hint_byte, 1);
+ ret = btrfs_drop_extents(trans, root, inode, start, aligned_end, 1);
if (ret)
return ret;
@@ -664,7 +662,7 @@ retry:
async_extent->compressed_size,
async_extent->compressed_size,
0, alloc_hint, &ins, 1);
- if (ret)
+ if (ret && ret != -ENOSPC)
btrfs_abort_transaction(trans, root, ret);
btrfs_end_transaction(trans, root);
}
@@ -1308,6 +1306,7 @@ out_check:
em->block_start = disk_bytenr;
em->bdev = root->fs_info->fs_devices->latest_bdev;
set_bit(EXTENT_FLAG_PINNED, &em->flags);
+ set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
while (1) {
write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em);
@@ -1364,11 +1363,7 @@ out_check:
}
error:
- if (nolock) {
- err = btrfs_end_transaction_nolock(trans, root);
- } else {
- err = btrfs_end_transaction(trans, root);
- }
+ err = btrfs_end_transaction(trans, root);
if (!ret)
ret = err;
@@ -1785,7 +1780,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_key ins;
- u64 hint;
int ret;
path = btrfs_alloc_path();
@@ -1803,8 +1797,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
* the caller is expected to unpin it and allow it to be merged
* with the others.
*/
- ret = btrfs_drop_extents(trans, inode, file_pos, file_pos + num_bytes,
- &hint, 0);
+ ret = btrfs_drop_extents(trans, root, inode, file_pos,
+ file_pos + num_bytes, 0);
if (ret)
goto out;
@@ -1828,10 +1822,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
btrfs_set_file_extent_encryption(leaf, fi, encryption);
btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
- btrfs_unlock_up_safe(path, 1);
- btrfs_set_lock_blocking(leaf);
-
btrfs_mark_buffer_dirty(leaf);
+ btrfs_release_path(path);
inode_add_bytes(inode, num_bytes);
@@ -1929,11 +1921,10 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
ordered_extent->len,
compress_type, 0, 0,
BTRFS_FILE_EXTENT_REG);
- unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
- ordered_extent->file_offset,
- ordered_extent->len);
}
-
+ unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
+ ordered_extent->file_offset, ordered_extent->len,
+ trans->transid);
if (ret < 0) {
btrfs_abort_transaction(trans, root, ret);
goto out_unlock;
@@ -1949,6 +1940,8 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
btrfs_abort_transaction(trans, root, ret);
goto out_unlock;
}
+ } else {
+ btrfs_set_inode_last_trans(trans, inode);
}
ret = 0;
out_unlock:
@@ -1958,12 +1951,8 @@ out_unlock:
out:
if (root != root->fs_info->tree_root)
btrfs_delalloc_release_metadata(inode, ordered_extent->len);
- if (trans) {
- if (nolock)
- btrfs_end_transaction_nolock(trans, root);
- else
- btrfs_end_transaction(trans, root);
- }
+ if (trans)
+ btrfs_end_transaction(trans, root);
if (ret)
clear_extent_uptodate(io_tree, ordered_extent->file_offset,
@@ -2119,7 +2108,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
if (empty)
return;
- down_read(&root->fs_info->cleanup_work_sem);
spin_lock(&fs_info->delayed_iput_lock);
list_splice_init(&fs_info->delayed_iputs, &list);
spin_unlock(&fs_info->delayed_iput_lock);
@@ -2130,7 +2118,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
iput(delayed->inode);
kfree(delayed);
}
- up_read(&root->fs_info->cleanup_work_sem);
}
enum btrfs_orphan_cleanup_state {
@@ -2198,7 +2185,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
int ret;
if (!root->orphan_block_rsv) {
- block_rsv = btrfs_alloc_block_rsv(root);
+ block_rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
if (!block_rsv)
return -ENOMEM;
}
@@ -2225,7 +2212,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
insert = 1;
#endif
insert = 1;
- atomic_dec(&root->orphan_inodes);
+ atomic_inc(&root->orphan_inodes);
}
if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
@@ -2590,6 +2577,18 @@ static void btrfs_read_locked_inode(struct inode *inode)
inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
+ BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item);
+
+ /*
+ * If we were modified in the current generation and evicted from memory
+ * and then re-read we need to do a full sync since we don't have any
+ * idea about which extents were modified before we were evicted from
+ * cache.
+ */
+ if (BTRFS_I(inode)->last_trans == root->fs_info->generation)
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags);
+
inode->i_version = btrfs_inode_sequence(leaf, inode_item);
inode->i_generation = BTRFS_I(inode)->generation;
inode->i_rdev = 0;
@@ -2894,7 +2893,6 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_path *path;
- struct btrfs_inode_ref *ref;
struct btrfs_dir_item *di;
struct inode *inode = dentry->d_inode;
u64 index;
@@ -3008,17 +3006,17 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
}
btrfs_release_path(path);
- ref = btrfs_lookup_inode_ref(trans, root, path,
- dentry->d_name.name, dentry->d_name.len,
- ino, dir_ino, 0);
- if (IS_ERR(ref)) {
- err = PTR_ERR(ref);
+ ret = btrfs_get_inode_ref_index(trans, root, path, dentry->d_name.name,
+ dentry->d_name.len, ino, dir_ino, 0,
+ &index);
+ if (ret) {
+ err = ret;
goto out;
}
- BUG_ON(!ref); /* Logic error */
+
if (check_path_shared(root, path))
goto out;
- index = btrfs_inode_ref_index(path->nodes[0], ref);
+
btrfs_release_path(path);
/*
@@ -3061,7 +3059,7 @@ out:
static void __unlink_end_trans(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
- if (trans->block_rsv == &root->fs_info->global_block_rsv) {
+ if (trans->block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL) {
btrfs_block_rsv_release(root, trans->block_rsv,
trans->bytes_reserved);
trans->block_rsv = &root->fs_info->trans_block_rsv;
@@ -3191,9 +3189,10 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
struct btrfs_trans_handle *trans;
unsigned long nr = 0;
- if (inode->i_size > BTRFS_EMPTY_DIR_SIZE ||
- btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID)
+ if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
return -ENOTEMPTY;
+ if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID)
+ return -EPERM;
trans = __unlink_start_trans(dir, dentry);
if (IS_ERR(trans))
@@ -3267,8 +3266,13 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
return -ENOMEM;
path->reada = -1;
+ /*
+ * We want to drop from the next block forward in case this new size is
+ * not block aligned since we will be keeping the last block of the
+ * extent just the way it is.
+ */
if (root->ref_cows || root == root->fs_info->tree_root)
- btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0);
+ btrfs_drop_extent_cache(inode, (new_size + mask) & (~mask), (u64)-1, 0);
/*
* This function is also used to drop the items in the log tree before
@@ -3429,12 +3433,6 @@ delete:
if (path->slots[0] == 0 ||
path->slots[0] != pending_del_slot) {
- if (root->ref_cows &&
- BTRFS_I(inode)->location.objectid !=
- BTRFS_FREE_INO_OBJECTID) {
- err = -EAGAIN;
- goto out;
- }
if (pending_del_nr) {
ret = btrfs_del_items(trans, root, path,
pending_del_slot,
@@ -3465,12 +3463,20 @@ error:
}
/*
- * taken from block_truncate_page, but does cow as it zeros out
- * any bytes left in the last page in the file.
+ * btrfs_truncate_page - read, zero a chunk and write a page
+ * @inode - inode that we're zeroing
+ * @from - the offset to start zeroing
+ * @len - the length to zero, 0 to zero the entire range respective to the
+ * offset
+ * @front - zero up to the offset instead of from the offset on
+ *
+ * This will find the page for the "from" offset and cow the page and zero the
+ * part we want to zero. This is used with truncate and hole punching.
*/
-static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
+int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
+ int front)
{
- struct inode *inode = mapping->host;
+ struct address_space *mapping = inode->i_mapping;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct btrfs_ordered_extent *ordered;
@@ -3485,7 +3491,8 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
u64 page_start;
u64 page_end;
- if ((offset & (blocksize - 1)) == 0)
+ if ((offset & (blocksize - 1)) == 0 &&
+ (!len || ((len & (blocksize - 1)) == 0)))
goto out;
ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
if (ret)
@@ -3532,7 +3539,8 @@ again:
}
clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
- EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
+ EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
0, 0, &cached_state, GFP_NOFS);
ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
@@ -3545,8 +3553,13 @@ again:
ret = 0;
if (offset != PAGE_CACHE_SIZE) {
+ if (!len)
+ len = PAGE_CACHE_SIZE - offset;
kaddr = kmap(page);
- memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
+ if (front)
+ memset(kaddr, 0, offset);
+ else
+ memset(kaddr + offset, 0, len);
flush_dcache_page(page);
kunmap(page);
}
@@ -3577,6 +3590,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct extent_map *em = NULL;
struct extent_state *cached_state = NULL;
+ struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
u64 mask = root->sectorsize - 1;
u64 hole_start = (oldsize + mask) & ~mask;
u64 block_end = (size + mask) & ~mask;
@@ -3613,7 +3627,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
last_byte = min(extent_map_end(em), block_end);
last_byte = (last_byte + mask) & ~mask;
if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
- u64 hint_byte = 0;
+ struct extent_map *hole_em;
hole_size = last_byte - cur_offset;
trans = btrfs_start_transaction(root, 3);
@@ -3622,9 +3636,9 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
break;
}
- err = btrfs_drop_extents(trans, inode, cur_offset,
- cur_offset + hole_size,
- &hint_byte, 1);
+ err = btrfs_drop_extents(trans, root, inode,
+ cur_offset,
+ cur_offset + hole_size, 1);
if (err) {
btrfs_abort_transaction(trans, root, err);
btrfs_end_transaction(trans, root);
@@ -3641,9 +3655,39 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
break;
}
- btrfs_drop_extent_cache(inode, hole_start,
- last_byte - 1, 0);
+ btrfs_drop_extent_cache(inode, cur_offset,
+ cur_offset + hole_size - 1, 0);
+ hole_em = alloc_extent_map();
+ if (!hole_em) {
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags);
+ goto next;
+ }
+ hole_em->start = cur_offset;
+ hole_em->len = hole_size;
+ hole_em->orig_start = cur_offset;
+ hole_em->block_start = EXTENT_MAP_HOLE;
+ hole_em->block_len = 0;
+ hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
+ hole_em->compress_type = BTRFS_COMPRESS_NONE;
+ hole_em->generation = trans->transid;
+
+ while (1) {
+ write_lock(&em_tree->lock);
+ err = add_extent_mapping(em_tree, hole_em);
+ if (!err)
+ list_move(&hole_em->list,
+ &em_tree->modified_extents);
+ write_unlock(&em_tree->lock);
+ if (err != -EEXIST)
+ break;
+ btrfs_drop_extent_cache(inode, cur_offset,
+ cur_offset +
+ hole_size - 1, 0);
+ }
+ free_extent_map(hole_em);
+next:
btrfs_update_inode(trans, root, inode);
btrfs_end_transaction(trans, root);
}
@@ -3768,26 +3812,22 @@ void btrfs_evict_inode(struct inode *inode)
goto no_delete;
}
- rsv = btrfs_alloc_block_rsv(root);
+ rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
if (!rsv) {
btrfs_orphan_del(NULL, inode);
goto no_delete;
}
rsv->size = min_size;
+ rsv->failfast = 1;
global_rsv = &root->fs_info->global_block_rsv;
btrfs_i_size_write(inode, 0);
/*
- * This is a bit simpler than btrfs_truncate since
- *
- * 1) We've already reserved our space for our orphan item in the
- * unlink.
- * 2) We're going to delete the inode item, so we don't need to update
- * it at all.
- *
- * So we just need to reserve some slack space in case we add bytes when
- * doing the truncate.
+ * This is a bit simpler than btrfs_truncate since we've already
+ * reserved our space for our orphan item in the unlink, so we just
+ * need to reserve some slack space in case we add bytes and update
+ * inode item when doing the truncate.
*/
while (1) {
ret = btrfs_block_rsv_refill_noflush(root, rsv, min_size);
@@ -3808,7 +3848,7 @@ void btrfs_evict_inode(struct inode *inode)
goto no_delete;
}
- trans = btrfs_start_transaction(root, 0);
+ trans = btrfs_start_transaction_noflush(root, 1);
if (IS_ERR(trans)) {
btrfs_orphan_del(NULL, inode);
btrfs_free_block_rsv(root, rsv);
@@ -3818,9 +3858,13 @@ void btrfs_evict_inode(struct inode *inode)
trans->block_rsv = rsv;
ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
- if (ret != -EAGAIN)
+ if (ret != -ENOSPC)
break;
+ trans->block_rsv = &root->fs_info->trans_block_rsv;
+ ret = btrfs_update_inode(trans, root, inode);
+ BUG_ON(ret);
+
nr = trans->blocks_used;
btrfs_end_transaction(trans, root);
trans = NULL;
@@ -4470,10 +4514,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
trans = btrfs_join_transaction(root);
if (IS_ERR(trans))
return PTR_ERR(trans);
- if (nolock)
- ret = btrfs_end_transaction_nolock(trans, root);
- else
- ret = btrfs_commit_transaction(trans, root);
+ ret = btrfs_commit_transaction(trans, root);
}
return ret;
}
@@ -4671,6 +4712,14 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
BTRFS_I(inode)->generation = trans->transid;
inode->i_generation = BTRFS_I(inode)->generation;
+ /*
+ * We could have gotten an inode number from somebody who was fsynced
+ * and then removed in this same transaction, so let's just set full
+ * sync since it will be a full sync anyway and this will blow away the
+ * old info in the log.
+ */
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
+
if (S_ISDIR(mode))
owner = 0;
else
@@ -4680,6 +4729,12 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
key[0].offset = 0;
+ /*
+ * Start new inodes with an inode_ref. This is slightly more
+ * efficient for small numbers of hard links since they will
+ * be packed into one item. Extended refs will kick in if we
+ * add more hard links than can fit in the ref item.
+ */
key[1].objectid = objectid;
btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY);
key[1].offset = ref_objectid;
@@ -4986,7 +5041,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
if (root->objectid != BTRFS_I(inode)->root->objectid)
return -EXDEV;
- if (inode->i_nlink == ~0U)
+ if (inode->i_nlink >= BTRFS_LINK_MAX)
return -EMLINK;
err = btrfs_set_inode_index(dir, &index);
@@ -5450,7 +5505,8 @@ insert:
write_unlock(&em_tree->lock);
out:
- trace_btrfs_get_extent(root, em);
+ if (em)
+ trace_btrfs_get_extent(root, em);
if (path)
btrfs_free_path(path);
@@ -5836,6 +5892,48 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
return ret;
}
+static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
+ u64 len, u64 orig_start,
+ u64 block_start, u64 block_len,
+ int type)
+{
+ struct extent_map_tree *em_tree;
+ struct extent_map *em;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int ret;
+
+ em_tree = &BTRFS_I(inode)->extent_tree;
+ em = alloc_extent_map();
+ if (!em)
+ return ERR_PTR(-ENOMEM);
+
+ em->start = start;
+ em->orig_start = orig_start;
+ em->len = len;
+ em->block_len = block_len;
+ em->block_start = block_start;
+ em->bdev = root->fs_info->fs_devices->latest_bdev;
+ set_bit(EXTENT_FLAG_PINNED, &em->flags);
+ if (type == BTRFS_ORDERED_PREALLOC)
+ set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
+
+ do {
+ btrfs_drop_extent_cache(inode, em->start,
+ em->start + em->len - 1, 0);
+ write_lock(&em_tree->lock);
+ ret = add_extent_mapping(em_tree, em);
+ write_unlock(&em_tree->lock);
+ } while (ret == -EEXIST);
+
+ if (ret) {
+ free_extent_map(em);
+ return ERR_PTR(ret);
+ }
+
+ return em;
+}
+
+
static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
{
@@ -5950,6 +6048,19 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
goto must_cow;
if (can_nocow_odirect(trans, inode, start, len) == 1) {
+ u64 orig_start = em->start;
+
+ if (type == BTRFS_ORDERED_PREALLOC) {
+ free_extent_map(em);
+ em = create_pinned_em(inode, start, len,
+ orig_start,
+ block_start, len, type);
+ if (IS_ERR(em)) {
+ btrfs_end_transaction(trans, root);
+ goto unlock_err;
+ }
+ }
+
ret = btrfs_add_ordered_extent_dio(inode, start,
block_start, len, len, type);
btrfs_end_transaction(trans, root);
@@ -5999,7 +6110,8 @@ unlock:
if (lockstart < lockend) {
if (create && len < lockend - lockstart) {
clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
- lockstart + len - 1, unlock_bits, 1, 0,
+ lockstart + len - 1,
+ unlock_bits | EXTENT_DEFRAG, 1, 0,
&cached_state, GFP_NOFS);
/*
* Beside unlock, we also need to cleanup reserved space
@@ -6007,8 +6119,8 @@ unlock:
*/
clear_extent_bit(&BTRFS_I(inode)->io_tree,
lockstart + len, lockend,
- unlock_bits | EXTENT_DO_ACCOUNTING,
- 1, 0, NULL, GFP_NOFS);
+ unlock_bits | EXTENT_DO_ACCOUNTING |
+ EXTENT_DEFRAG, 1, 0, NULL, GFP_NOFS);
} else {
clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
lockend, unlock_bits, 1, 0,
@@ -6573,8 +6685,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
*/
clear_extent_bit(tree, page_start, page_end,
EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0,
- &cached_state, GFP_NOFS);
+ EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
+ EXTENT_DEFRAG, 1, 0, &cached_state, GFP_NOFS);
/*
* whoever cleared the private bit is responsible
* for the finish_ordered_io
@@ -6590,7 +6702,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
}
clear_extent_bit(tree, page_start, page_end,
EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING, 1, 1, &cached_state, GFP_NOFS);
+ EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1,
+ &cached_state, GFP_NOFS);
__btrfs_releasepage(page, GFP_NOFS);
ClearPageChecked(page);
@@ -6687,7 +6800,8 @@ again:
* prepare_pages in the normal write path.
*/
clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
- EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
+ EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
0, 0, &cached_state, GFP_NOFS);
ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
@@ -6718,6 +6832,7 @@ again:
BTRFS_I(inode)->last_trans = root->fs_info->generation;
BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
+ BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);
@@ -6745,7 +6860,7 @@ static int btrfs_truncate(struct inode *inode)
u64 mask = root->sectorsize - 1;
u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
- ret = btrfs_truncate_page(inode->i_mapping, inode->i_size);
+ ret = btrfs_truncate_page(inode, inode->i_size, 0, 0);
if (ret)
return ret;
@@ -6788,10 +6903,11 @@ static int btrfs_truncate(struct inode *inode)
* 3) fs_info->trans_block_rsv - this will have 1 items worth left for
* updating the inode.
*/
- rsv = btrfs_alloc_block_rsv(root);
+ rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
if (!rsv)
return -ENOMEM;
rsv->size = min_size;
+ rsv->failfast = 1;
/*
* 1 for the truncate slack space
@@ -6837,36 +6953,21 @@ static int btrfs_truncate(struct inode *inode)
&BTRFS_I(inode)->runtime_flags))
btrfs_add_ordered_operation(trans, root, inode);
- while (1) {
- ret = btrfs_block_rsv_refill(root, rsv, min_size);
- if (ret) {
- /*
- * This can only happen with the original transaction we
- * started above, every other time we shouldn't have a
- * transaction started yet.
- */
- if (ret == -EAGAIN)
- goto end_trans;
- err = ret;
- break;
- }
-
- if (!trans) {
- /* Just need the 1 for updating the inode */
- trans = btrfs_start_transaction(root, 1);
- if (IS_ERR(trans)) {
- ret = err = PTR_ERR(trans);
- trans = NULL;
- break;
- }
- }
-
- trans->block_rsv = rsv;
+ /*
+ * So if we truncate and then write and fsync we normally would just
+ * write the extents that changed, which is a problem if we need to
+ * first truncate that entire inode. So set this flag so we write out
+ * all of the extents in the inode to the sync log so we're completely
+ * safe.
+ */
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
+ trans->block_rsv = rsv;
+ while (1) {
ret = btrfs_truncate_inode_items(trans, root, inode,
inode->i_size,
BTRFS_EXTENT_DATA_KEY);
- if (ret != -EAGAIN) {
+ if (ret != -ENOSPC) {
err = ret;
break;
}
@@ -6877,11 +6978,22 @@ static int btrfs_truncate(struct inode *inode)
err = ret;
break;
}
-end_trans:
+
nr = trans->blocks_used;
btrfs_end_transaction(trans, root);
- trans = NULL;
btrfs_btree_balance_dirty(root, nr);
+
+ trans = btrfs_start_transaction(root, 2);
+ if (IS_ERR(trans)) {
+ ret = err = PTR_ERR(trans);
+ trans = NULL;
+ break;
+ }
+
+ ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv,
+ rsv, min_size);
+ BUG_ON(ret); /* shouldn't happen */
+ trans->block_rsv = rsv;
}
if (ret == 0 && inode->i_nlink > 0) {
@@ -6965,6 +7077,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
ei->csum_bytes = 0;
ei->index_cnt = (u64)-1;
ei->last_unlink_trans = 0;
+ ei->last_log_commit = 0;
spin_lock_init(&ei->lock);
ei->outstanding_extents = 0;
@@ -7095,31 +7208,31 @@ void btrfs_destroy_cachep(void)
int btrfs_init_cachep(void)
{
- btrfs_inode_cachep = kmem_cache_create("btrfs_inode_cache",
+ btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
sizeof(struct btrfs_inode), 0,
SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once);
if (!btrfs_inode_cachep)
goto fail;
- btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle_cache",
+ btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle",
sizeof(struct btrfs_trans_handle), 0,
SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
if (!btrfs_trans_handle_cachep)
goto fail;
- btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction_cache",
+ btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction",
sizeof(struct btrfs_transaction), 0,
SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
if (!btrfs_transaction_cachep)
goto fail;
- btrfs_path_cachep = kmem_cache_create("btrfs_path_cache",
+ btrfs_path_cachep = kmem_cache_create("btrfs_path",
sizeof(struct btrfs_path), 0,
SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
if (!btrfs_path_cachep)
goto fail;
- btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space_cache",
+ btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space",
sizeof(struct btrfs_free_space), 0,
SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
if (!btrfs_free_space_cachep)
@@ -7513,6 +7626,8 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
loff_t actual_len, u64 *alloc_hint,
struct btrfs_trans_handle *trans)
{
+ struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ struct extent_map *em;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_key ins;
u64 cur_offset = start;
@@ -7553,6 +7668,37 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
btrfs_drop_extent_cache(inode, cur_offset,
cur_offset + ins.offset -1, 0);
+ em = alloc_extent_map();
+ if (!em) {
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags);
+ goto next;
+ }
+
+ em->start = cur_offset;
+ em->orig_start = cur_offset;
+ em->len = ins.offset;
+ em->block_start = ins.objectid;
+ em->block_len = ins.offset;
+ em->bdev = root->fs_info->fs_devices->latest_bdev;
+ set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
+ em->generation = trans->transid;
+
+ while (1) {
+ write_lock(&em_tree->lock);
+ ret = add_extent_mapping(em_tree, em);
+ if (!ret)
+ list_move(&em->list,
+ &em_tree->modified_extents);
+ write_unlock(&em_tree->lock);
+ if (ret != -EEXIST)
+ break;
+ btrfs_drop_extent_cache(inode, cur_offset,
+ cur_offset + ins.offset - 1,
+ 0);
+ }
+ free_extent_map(em);
+next:
num_bytes -= ins.offset;
cur_offset += ins.offset;
*alloc_hint = ins.objectid + ins.offset;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 47127c1bd29..e568c472f80 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -181,6 +181,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
int ret;
u64 ip_oldflags;
unsigned int i_oldflags;
+ umode_t mode;
if (btrfs_root_readonly(root))
return -EROFS;
@@ -203,6 +204,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
ip_oldflags = ip->flags;
i_oldflags = inode->i_flags;
+ mode = inode->i_mode;
flags = btrfs_mask_flags(inode->i_mode, flags);
oldflags = btrfs_flags_to_ioctl(ip->flags);
@@ -237,10 +239,31 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
ip->flags |= BTRFS_INODE_DIRSYNC;
else
ip->flags &= ~BTRFS_INODE_DIRSYNC;
- if (flags & FS_NOCOW_FL)
- ip->flags |= BTRFS_INODE_NODATACOW;
- else
- ip->flags &= ~BTRFS_INODE_NODATACOW;
+ if (flags & FS_NOCOW_FL) {
+ if (S_ISREG(mode)) {
+ /*
+ * It's safe to turn csums off here, no extents exist.
+ * Otherwise we want the flag to reflect the real COW
+ * status of the file and will not set it.
+ */
+ if (inode->i_size == 0)
+ ip->flags |= BTRFS_INODE_NODATACOW
+ | BTRFS_INODE_NODATASUM;
+ } else {
+ ip->flags |= BTRFS_INODE_NODATACOW;
+ }
+ } else {
+ /*
+ * Revert back under the same assumptions as above
+ */
+ if (S_ISREG(mode)) {
+ if (inode->i_size == 0)
+ ip->flags &= ~(BTRFS_INODE_NODATACOW
+ | BTRFS_INODE_NODATASUM);
+ } else {
+ ip->flags &= ~BTRFS_INODE_NODATACOW;
+ }
+ }
/*
* The COMPRESS flag can only be changed by users, while the NOCOMPRESS
@@ -516,7 +539,8 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
if (!pending_snapshot)
return -ENOMEM;
- btrfs_init_block_rsv(&pending_snapshot->block_rsv);
+ btrfs_init_block_rsv(&pending_snapshot->block_rsv,
+ BTRFS_BLOCK_RSV_TEMP);
pending_snapshot->dentry = dentry;
pending_snapshot->root = root;
pending_snapshot->readonly = readonly;
@@ -525,7 +549,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
*inherit = NULL; /* take responsibility to free it */
}
- trans = btrfs_start_transaction(root->fs_info->extent_root, 5);
+ trans = btrfs_start_transaction(root->fs_info->extent_root, 6);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
goto fail;
@@ -1022,8 +1046,8 @@ again:
page_start, page_end - 1, 0, &cached_state);
clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start,
page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING, 0, 0, &cached_state,
- GFP_NOFS);
+ EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0,
+ &cached_state, GFP_NOFS);
if (i_done != page_cnt) {
spin_lock(&BTRFS_I(inode)->lock);
@@ -1034,8 +1058,8 @@ again:
}
- btrfs_set_extent_delalloc(inode, page_start, page_end - 1,
- &cached_state);
+ set_extent_defrag(&BTRFS_I(inode)->io_tree, page_start, page_end - 1,
+ &cached_state, GFP_NOFS);
unlock_extent_cached(&BTRFS_I(inode)->io_tree,
page_start, page_end - 1, &cached_state,
@@ -2351,7 +2375,6 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
int ret;
u64 len = olen;
u64 bs = root->fs_info->sb->s_blocksize;
- u64 hint_byte;
/*
* TODO:
@@ -2456,13 +2479,13 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
another, and lock file content */
while (1) {
struct btrfs_ordered_extent *ordered;
- lock_extent(&BTRFS_I(src)->io_tree, off, off+len);
- ordered = btrfs_lookup_first_ordered_extent(src, off+len);
+ lock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1);
+ ordered = btrfs_lookup_first_ordered_extent(src, off + len - 1);
if (!ordered &&
- !test_range_bit(&BTRFS_I(src)->io_tree, off, off+len,
- EXTENT_DELALLOC, 0, NULL))
+ !test_range_bit(&BTRFS_I(src)->io_tree, off, off + len - 1,
+ EXTENT_DELALLOC, 0, NULL))
break;
- unlock_extent(&BTRFS_I(src)->io_tree, off, off+len);
+ unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1);
if (ordered)
btrfs_put_ordered_extent(ordered);
btrfs_wait_ordered_range(src, off, len);
@@ -2536,7 +2559,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
btrfs_release_path(path);
if (key.offset + datal <= off ||
- key.offset >= off+len)
+ key.offset >= off + len - 1)
goto next;
memcpy(&new_key, &key, sizeof(new_key));
@@ -2574,10 +2597,10 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
datal -= off - key.offset;
}
- ret = btrfs_drop_extents(trans, inode,
+ ret = btrfs_drop_extents(trans, root, inode,
new_key.offset,
new_key.offset + datal,
- &hint_byte, 1);
+ 1);
if (ret) {
btrfs_abort_transaction(trans, root,
ret);
@@ -2637,8 +2660,8 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
new_key.offset += skip;
}
- if (key.offset + datal > off+len)
- trim = key.offset + datal - (off+len);
+ if (key.offset + datal > off + len)
+ trim = key.offset + datal - (off + len);
if (comp && (skip || trim)) {
ret = -EINVAL;
@@ -2648,10 +2671,10 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
size -= skip + trim;
datal -= skip + trim;
- ret = btrfs_drop_extents(trans, inode,
+ ret = btrfs_drop_extents(trans, root, inode,
new_key.offset,
new_key.offset + datal,
- &hint_byte, 1);
+ 1);
if (ret) {
btrfs_abort_transaction(trans, root,
ret);
@@ -2715,7 +2738,7 @@ next:
ret = 0;
out:
btrfs_release_path(path);
- unlock_extent(&BTRFS_I(src)->io_tree, off, off+len);
+ unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1);
out_unlock:
mutex_unlock(&src->i_mutex);
mutex_unlock(&inode->i_mutex);
@@ -2850,8 +2873,8 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
return 0;
}
-static void get_block_group_info(struct list_head *groups_list,
- struct btrfs_ioctl_space_info *space)
+void btrfs_get_block_group_info(struct list_head *groups_list,
+ struct btrfs_ioctl_space_info *space)
{
struct btrfs_block_group_cache *block_group;
@@ -2959,8 +2982,8 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
down_read(&info->groups_sem);
for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
if (!list_empty(&info->block_groups[c])) {
- get_block_group_info(&info->block_groups[c],
- &space);
+ btrfs_get_block_group_info(
+ &info->block_groups[c], &space);
memcpy(dest, &space, sizeof(space));
dest++;
space_args.total_spaces++;
@@ -3208,11 +3231,9 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
{
int ret = 0;
int size;
- u64 extent_item_pos;
struct btrfs_ioctl_logical_ino_args *loi;
struct btrfs_data_container *inodes = NULL;
struct btrfs_path *path = NULL;
- struct btrfs_key key;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -3230,7 +3251,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
goto out;
}
- size = min_t(u32, loi->size, 4096);
+ size = min_t(u32, loi->size, 64 * 1024);
inodes = init_data_container(size);
if (IS_ERR(inodes)) {
ret = PTR_ERR(inodes);
@@ -3238,22 +3259,13 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
goto out;
}
- ret = extent_from_logical(root->fs_info, loi->logical, path, &key);
- btrfs_release_path(path);
-
- if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
+ ret = iterate_inodes_from_logical(loi->logical, root->fs_info, path,
+ build_ino_list, inodes);
+ if (ret == -EINVAL)
ret = -ENOENT;
if (ret < 0)
goto out;
- extent_item_pos = loi->logical - key.objectid;
- ret = iterate_extent_inodes(root->fs_info, key.objectid,
- extent_item_pos, 0, build_ino_list,
- inodes);
-
- if (ret < 0)
- goto out;
-
ret = copy_to_user((void *)(unsigned long)loi->inodes,
(void *)(unsigned long)inodes, size);
if (ret)
@@ -3261,7 +3273,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
out:
btrfs_free_path(path);
- kfree(inodes);
+ vfree(inodes);
kfree(loi);
return ret;
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 051c7fe551d..7772f02ba28 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -25,6 +25,8 @@
#include "btrfs_inode.h"
#include "extent_io.h"
+static struct kmem_cache *btrfs_ordered_extent_cache;
+
static u64 entry_end(struct btrfs_ordered_extent *entry)
{
if (entry->file_offset + entry->len < entry->file_offset)
@@ -187,7 +189,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
struct btrfs_ordered_extent *entry;
tree = &BTRFS_I(inode)->ordered_tree;
- entry = kzalloc(sizeof(*entry), GFP_NOFS);
+ entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS);
if (!entry)
return -ENOMEM;
@@ -421,7 +423,7 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
list_del(&sum->list);
kfree(sum);
}
- kfree(entry);
+ kmem_cache_free(btrfs_ordered_extent_cache, entry);
}
}
@@ -466,8 +468,7 @@ void btrfs_remove_ordered_extent(struct inode *inode,
* wait for all the ordered extents in a root. This is done when balancing
* space between drives.
*/
-void btrfs_wait_ordered_extents(struct btrfs_root *root,
- int nocow_only, int delay_iput)
+void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
{
struct list_head splice;
struct list_head *cur;
@@ -482,15 +483,6 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root,
cur = splice.next;
ordered = list_entry(cur, struct btrfs_ordered_extent,
root_extent_list);
- if (nocow_only &&
- !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags) &&
- !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) {
- list_move(&ordered->root_extent_list,
- &root->fs_info->ordered_extents);
- cond_resched_lock(&root->fs_info->ordered_extent_lock);
- continue;
- }
-
list_del_init(&ordered->root_extent_list);
atomic_inc(&ordered->refs);
@@ -775,7 +767,6 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
u64 disk_i_size;
u64 new_i_size;
- u64 i_size_test;
u64 i_size = i_size_read(inode);
struct rb_node *node;
struct rb_node *prev = NULL;
@@ -835,55 +826,30 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
break;
if (test->file_offset >= i_size)
break;
- if (test->file_offset >= disk_i_size)
+ if (test->file_offset >= disk_i_size) {
+ /*
+ * we don't update disk_i_size now, so record this
+ * not-yet-applied i_size. Otherwise we will not know the
+ * real i_size.
+ */
+ if (test->outstanding_isize < offset)
+ test->outstanding_isize = offset;
+ if (ordered &&
+ ordered->outstanding_isize >
+ test->outstanding_isize)
+ test->outstanding_isize =
+ ordered->outstanding_isize;
goto out;
- }
- new_i_size = min_t(u64, offset, i_size);
-
- /*
- * at this point, we know we can safely update i_size to at least
- * the offset from this ordered extent. But, we need to
- * walk forward and see if ios from higher up in the file have
- * finished.
- */
- if (ordered) {
- node = rb_next(&ordered->rb_node);
- } else {
- if (prev)
- node = rb_next(prev);
- else
- node = rb_first(&tree->tree);
- }
-
- /*
- * We are looking for an area between our current extent and the next
- * ordered extent to update the i_size to. There are 3 cases here
- *
- * 1) We don't actually have anything and we can update to i_size.
- * 2) We have stuff but they already did their i_size update so again we
- * can just update to i_size.
- * 3) We have an outstanding ordered extent so the most we can update
- * our disk_i_size to is the start of the next offset.
- */
- i_size_test = i_size;
- for (; node; node = rb_next(node)) {
- test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
-
- if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags))
- continue;
- if (test->file_offset > offset) {
- i_size_test = test->file_offset;
- break;
}
}
+ new_i_size = min_t(u64, offset, i_size);
/*
- * i_size_test is the end of a region after this ordered
- * extent where there are no ordered extents, we can safely set
- * disk_i_size to this.
+ * Some ordered extents may have completed before the current one,
+ * and we hold the real i_size in ->outstanding_isize.
*/
- if (i_size_test > offset)
- new_i_size = min_t(u64, i_size_test, i_size);
+ if (ordered && ordered->outstanding_isize > new_i_size)
+ new_i_size = min_t(u64, ordered->outstanding_isize, i_size);
BTRFS_I(inode)->disk_i_size = new_i_size;
ret = 0;
out:
@@ -984,3 +950,20 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
}
spin_unlock(&root->fs_info->ordered_extent_lock);
}
+
+int __init ordered_data_init(void)
+{
+ btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent",
+ sizeof(struct btrfs_ordered_extent), 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+ NULL);
+ if (!btrfs_ordered_extent_cache)
+ return -ENOMEM;
+ return 0;
+}
+
+void ordered_data_exit(void)
+{
+ if (btrfs_ordered_extent_cache)
+ kmem_cache_destroy(btrfs_ordered_extent_cache);
+}
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index e03c560d299..dd27a0b46a3 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -96,6 +96,13 @@ struct btrfs_ordered_extent {
/* number of bytes that still need writing */
u64 bytes_left;
+ /*
+ * the end of the ordered extent which is behind it but
+ * didn't update disk_i_size. Please see the comment of
+ * btrfs_ordered_update_i_size();
+ */
+ u64 outstanding_isize;
+
/* flags (described above) */
unsigned long flags;
@@ -183,6 +190,7 @@ void btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct inode *inode);
-void btrfs_wait_ordered_extents(struct btrfs_root *root,
- int nocow_only, int delay_iput);
+void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput);
+int __init ordered_data_init(void);
+void ordered_data_exit(void);
#endif
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index b6501558174..5039686df6a 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1145,12 +1145,12 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
ulist_reinit(tmp);
/* XXX id not needed */
- ulist_add(tmp, qg->qgroupid, (unsigned long)qg, GFP_ATOMIC);
+ ulist_add(tmp, qg->qgroupid, (u64)(uintptr_t)qg, GFP_ATOMIC);
ULIST_ITER_INIT(&tmp_uiter);
while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
struct btrfs_qgroup_list *glist;
- qg = (struct btrfs_qgroup *)tmp_unode->aux;
+ qg = (struct btrfs_qgroup *)(uintptr_t)tmp_unode->aux;
if (qg->refcnt < seq)
qg->refcnt = seq + 1;
else
@@ -1158,7 +1158,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
list_for_each_entry(glist, &qg->groups, next_group) {
ulist_add(tmp, glist->group->qgroupid,
- (unsigned long)glist->group,
+ (u64)(uintptr_t)glist->group,
GFP_ATOMIC);
}
}
@@ -1168,13 +1168,13 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
* step 2: walk from the new root
*/
ulist_reinit(tmp);
- ulist_add(tmp, qgroup->qgroupid, (unsigned long)qgroup, GFP_ATOMIC);
+ ulist_add(tmp, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC);
ULIST_ITER_INIT(&uiter);
while ((unode = ulist_next(tmp, &uiter))) {
struct btrfs_qgroup *qg;
struct btrfs_qgroup_list *glist;
- qg = (struct btrfs_qgroup *)unode->aux;
+ qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
if (qg->refcnt < seq) {
/* not visited by step 1 */
qg->rfer += sgn * node->num_bytes;
@@ -1190,7 +1190,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
list_for_each_entry(glist, &qg->groups, next_group) {
ulist_add(tmp, glist->group->qgroupid,
- (unsigned long)glist->group, GFP_ATOMIC);
+ (uintptr_t)glist->group, GFP_ATOMIC);
}
}
@@ -1208,12 +1208,12 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
continue;
ulist_reinit(tmp);
- ulist_add(tmp, qg->qgroupid, (unsigned long)qg, GFP_ATOMIC);
+ ulist_add(tmp, qg->qgroupid, (uintptr_t)qg, GFP_ATOMIC);
ULIST_ITER_INIT(&tmp_uiter);
while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
struct btrfs_qgroup_list *glist;
- qg = (struct btrfs_qgroup *)tmp_unode->aux;
+ qg = (struct btrfs_qgroup *)(uintptr_t)tmp_unode->aux;
if (qg->tag == seq)
continue;
@@ -1225,7 +1225,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
list_for_each_entry(glist, &qg->groups, next_group) {
ulist_add(tmp, glist->group->qgroupid,
- (unsigned long)glist->group,
+ (uintptr_t)glist->group,
GFP_ATOMIC);
}
}
@@ -1469,13 +1469,17 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
* be exceeded
*/
ulist = ulist_alloc(GFP_ATOMIC);
- ulist_add(ulist, qgroup->qgroupid, (unsigned long)qgroup, GFP_ATOMIC);
+ if (!ulist) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ ulist_add(ulist, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC);
ULIST_ITER_INIT(&uiter);
while ((unode = ulist_next(ulist, &uiter))) {
struct btrfs_qgroup *qg;
struct btrfs_qgroup_list *glist;
- qg = (struct btrfs_qgroup *)unode->aux;
+ qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
qg->reserved + qg->rfer + num_bytes >
@@ -1489,7 +1493,7 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
list_for_each_entry(glist, &qg->groups, next_group) {
ulist_add(ulist, glist->group->qgroupid,
- (unsigned long)glist->group, GFP_ATOMIC);
+ (uintptr_t)glist->group, GFP_ATOMIC);
}
}
if (ret)
@@ -1502,7 +1506,7 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
while ((unode = ulist_next(ulist, &uiter))) {
struct btrfs_qgroup *qg;
- qg = (struct btrfs_qgroup *)unode->aux;
+ qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
qg->reserved += num_bytes;
}
@@ -1541,19 +1545,23 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
goto out;
ulist = ulist_alloc(GFP_ATOMIC);
- ulist_add(ulist, qgroup->qgroupid, (unsigned long)qgroup, GFP_ATOMIC);
+ if (!ulist) {
+ btrfs_std_error(fs_info, -ENOMEM);
+ goto out;
+ }
+ ulist_add(ulist, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC);
ULIST_ITER_INIT(&uiter);
while ((unode = ulist_next(ulist, &uiter))) {
struct btrfs_qgroup *qg;
struct btrfs_qgroup_list *glist;
- qg = (struct btrfs_qgroup *)unode->aux;
+ qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
qg->reserved -= num_bytes;
list_for_each_entry(glist, &qg->groups, next_group) {
ulist_add(ulist, glist->group->qgroupid,
- (unsigned long)glist->group, GFP_ATOMIC);
+ (uintptr_t)glist->group, GFP_ATOMIC);
}
}
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 4da08652004..776f0aa128f 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3270,8 +3270,8 @@ static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
key.offset = 0;
inode = btrfs_iget(fs_info->sb, &key, root, NULL);
- if (IS_ERR_OR_NULL(inode) || is_bad_inode(inode)) {
- if (inode && !IS_ERR(inode))
+ if (IS_ERR(inode) || is_bad_inode(inode)) {
+ if (!IS_ERR(inode))
iput(inode);
return -ENOENT;
}
@@ -3621,7 +3621,7 @@ next:
ret = find_first_extent_bit(&rc->processed_blocks,
key.objectid, &start, &end,
- EXTENT_DIRTY);
+ EXTENT_DIRTY, NULL);
if (ret == 0 && start <= key.objectid) {
btrfs_release_path(path);
@@ -3674,7 +3674,8 @@ int prepare_to_relocate(struct reloc_control *rc)
struct btrfs_trans_handle *trans;
int ret;
- rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root);
+ rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root,
+ BTRFS_BLOCK_RSV_TEMP);
if (!rc->block_rsv)
return -ENOMEM;
@@ -4057,7 +4058,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
(unsigned long long)rc->block_group->flags);
btrfs_start_delalloc_inodes(fs_info->tree_root, 0);
- btrfs_wait_ordered_extents(fs_info->tree_root, 0, 0);
+ btrfs_wait_ordered_extents(fs_info->tree_root, 0);
while (1) {
mutex_lock(&fs_info->cleaner_mutex);
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 10d8e4d8807..eb923d087da 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -141,8 +141,10 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
return -ENOMEM;
ret = btrfs_search_slot(trans, root, key, path, 0, 1);
- if (ret < 0)
- goto out_abort;
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
if (ret != 0) {
btrfs_print_leaf(root, path->nodes[0]);
@@ -166,16 +168,23 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
btrfs_release_path(path);
ret = btrfs_search_slot(trans, root, key, path,
-1, 1);
- if (ret < 0)
- goto out_abort;
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
+
ret = btrfs_del_item(trans, root, path);
- if (ret < 0)
- goto out_abort;
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
btrfs_release_path(path);
ret = btrfs_insert_empty_item(trans, root, path,
key, sizeof(*item));
- if (ret < 0)
- goto out_abort;
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
l = path->nodes[0];
slot = path->slots[0];
ptr = btrfs_item_ptr_offset(l, slot);
@@ -192,10 +201,6 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
out:
btrfs_free_path(path);
return ret;
-
-out_abort:
- btrfs_abort_transaction(trans, root, ret);
- goto out;
}
int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index b223620cd5a..27892f67e69 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -352,13 +352,14 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
struct extent_buffer *eb;
struct btrfs_extent_item *ei;
struct scrub_warning swarn;
- u32 item_size;
- int ret;
+ unsigned long ptr = 0;
+ u64 extent_item_pos;
+ u64 flags = 0;
u64 ref_root;
+ u32 item_size;
u8 ref_level;
- unsigned long ptr = 0;
const int bufsize = 4096;
- u64 extent_item_pos;
+ int ret;
path = btrfs_alloc_path();
@@ -375,7 +376,8 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
if (!path || !swarn.scratch_buf || !swarn.msg_buf)
goto out;
- ret = extent_from_logical(fs_info, swarn.logical, path, &found_key);
+ ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
+ &flags);
if (ret < 0)
goto out;
@@ -387,7 +389,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
item_size = btrfs_item_size_nr(eb, path->slots[0]);
btrfs_release_path(path);
- if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+ if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
do {
ret = tree_backref_for_extent(&ptr, eb, ei, item_size,
&ref_root, &ref_level);
@@ -1029,6 +1031,7 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev,
spin_lock(&sdev->stat_lock);
sdev->stat.malloc_errors++;
spin_unlock(&sdev->stat_lock);
+ kfree(bbio);
return -ENOMEM;
}
sblock->page_count++;
@@ -1666,21 +1669,6 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work)
scrub_block_put(sblock);
}
- if (sbio->err) {
- /* what is this good for??? */
- sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1);
- sbio->bio->bi_flags |= 1 << BIO_UPTODATE;
- sbio->bio->bi_phys_segments = 0;
- sbio->bio->bi_idx = 0;
-
- for (i = 0; i < sbio->page_count; i++) {
- struct bio_vec *bi;
- bi = &sbio->bio->bi_io_vec[i];
- bi->bv_offset = 0;
- bi->bv_len = PAGE_SIZE;
- }
- }
-
bio_put(sbio->bio);
sbio->bio = NULL;
spin_lock(&sdev->list_lock);
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index fb5ffe95f86..c7beb543a4a 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -107,7 +107,6 @@ struct send_ctx {
int cur_inode_new;
int cur_inode_new_gen;
int cur_inode_deleted;
- int cur_inode_first_ref_orphan;
u64 cur_inode_size;
u64 cur_inode_mode;
@@ -126,7 +125,15 @@ struct send_ctx {
struct name_cache_entry {
struct list_head list;
- struct list_head use_list;
+ /*
+ * radix_tree has only 32bit entries but we need to handle 64bit inums.
+ * We use the lower 32bit of the 64bit inum to store it in the tree. If
+ * more than one inum would fall into the same entry, we use radix_list
+ * to store the additional entries. radix_list is also used to store
+ * entries where two entries have the same inum but different
+ * generations.
+ */
+ struct list_head radix_list;
u64 ino;
u64 gen;
u64 parent_ino;
@@ -328,6 +335,7 @@ out:
return ret;
}
+#if 0
static void fs_path_remove(struct fs_path *p)
{
BUG_ON(p->reversed);
@@ -335,6 +343,7 @@ static void fs_path_remove(struct fs_path *p)
p->end--;
*p->end = 0;
}
+#endif
static int fs_path_copy(struct fs_path *p, struct fs_path *from)
{
@@ -377,7 +386,7 @@ static struct btrfs_path *alloc_path_for_send(void)
return path;
}
-static int write_buf(struct send_ctx *sctx, const void *buf, u32 len)
+int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off)
{
int ret;
mm_segment_t old_fs;
@@ -387,8 +396,7 @@ static int write_buf(struct send_ctx *sctx, const void *buf, u32 len)
set_fs(KERNEL_DS);
while (pos < len) {
- ret = vfs_write(sctx->send_filp, (char *)buf + pos, len - pos,
- &sctx->send_off);
+ ret = vfs_write(filp, (char *)buf + pos, len - pos, off);
/* TODO handle that correctly */
/*if (ret == -ERESTARTSYS) {
continue;
@@ -544,7 +552,8 @@ static int send_header(struct send_ctx *sctx)
strcpy(hdr.magic, BTRFS_SEND_STREAM_MAGIC);
hdr.version = cpu_to_le32(BTRFS_SEND_STREAM_VERSION);
- return write_buf(sctx, &hdr, sizeof(hdr));
+ return write_buf(sctx->send_filp, &hdr, sizeof(hdr),
+ &sctx->send_off);
}
/*
@@ -581,7 +590,8 @@ static int send_cmd(struct send_ctx *sctx)
crc = crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size);
hdr->crc = cpu_to_le32(crc);
- ret = write_buf(sctx, sctx->send_buf, sctx->send_size);
+ ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size,
+ &sctx->send_off);
sctx->total_send_size += sctx->send_size;
sctx->cmd_send_size[le16_to_cpu(hdr->cmd)] += sctx->send_size;
@@ -687,7 +697,8 @@ out:
*/
static int get_inode_info(struct btrfs_root *root,
u64 ino, u64 *size, u64 *gen,
- u64 *mode, u64 *uid, u64 *gid)
+ u64 *mode, u64 *uid, u64 *gid,
+ u64 *rdev)
{
int ret;
struct btrfs_inode_item *ii;
@@ -721,6 +732,8 @@ static int get_inode_info(struct btrfs_root *root,
*uid = btrfs_inode_uid(path->nodes[0], ii);
if (gid)
*gid = btrfs_inode_gid(path->nodes[0], ii);
+ if (rdev)
+ *rdev = btrfs_inode_rdev(path->nodes[0], ii);
out:
btrfs_free_path(path);
@@ -852,7 +865,6 @@ static int iterate_dir_item(struct send_ctx *sctx,
struct extent_buffer *eb;
struct btrfs_item *item;
struct btrfs_dir_item *di;
- struct btrfs_path *tmp_path = NULL;
struct btrfs_key di_key;
char *buf = NULL;
char *buf2 = NULL;
@@ -874,12 +886,6 @@ static int iterate_dir_item(struct send_ctx *sctx,
goto out;
}
- tmp_path = alloc_path_for_send();
- if (!tmp_path) {
- ret = -ENOMEM;
- goto out;
- }
-
eb = path->nodes[0];
slot = path->slots[0];
item = btrfs_item_nr(eb, slot);
@@ -941,7 +947,6 @@ static int iterate_dir_item(struct send_ctx *sctx,
}
out:
- btrfs_free_path(tmp_path);
if (buf_virtual)
vfree(buf);
else
@@ -1026,12 +1031,12 @@ struct backref_ctx {
u64 extent_len;
/* Just to check for bugs in backref resolving */
- int found_in_send_root;
+ int found_itself;
};
static int __clone_root_cmp_bsearch(const void *key, const void *elt)
{
- u64 root = (u64)key;
+ u64 root = (u64)(uintptr_t)key;
struct clone_root *cr = (struct clone_root *)elt;
if (root < cr->root->objectid)
@@ -1055,6 +1060,7 @@ static int __clone_root_cmp_sort(const void *e1, const void *e2)
/*
* Called for every backref that is found for the current extent.
+ * Results are collected in sctx->clone_roots->ino/offset/found_refs
*/
static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
{
@@ -1064,7 +1070,7 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
u64 i_size;
/* First check if the root is in the list of accepted clone sources */
- found = bsearch((void *)root, bctx->sctx->clone_roots,
+ found = bsearch((void *)(uintptr_t)root, bctx->sctx->clone_roots,
bctx->sctx->clone_roots_cnt,
sizeof(struct clone_root),
__clone_root_cmp_bsearch);
@@ -1074,14 +1080,15 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
if (found->root == bctx->sctx->send_root &&
ino == bctx->cur_objectid &&
offset == bctx->cur_offset) {
- bctx->found_in_send_root = 1;
+ bctx->found_itself = 1;
}
/*
- * There are inodes that have extents that lie behind it's i_size. Don't
+ * There are inodes that have extents that lie behind its i_size. Don't
* accept clones from these extents.
*/
- ret = get_inode_info(found->root, ino, &i_size, NULL, NULL, NULL, NULL);
+ ret = get_inode_info(found->root, ino, &i_size, NULL, NULL, NULL, NULL,
+ NULL);
if (ret < 0)
return ret;
@@ -1101,16 +1108,12 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
*/
if (ino >= bctx->cur_objectid)
return 0;
- /*if (ino > ctx->cur_objectid)
+#if 0
+ if (ino > bctx->cur_objectid)
return 0;
- if (offset + ctx->extent_len > ctx->cur_offset)
- return 0;*/
-
- bctx->found++;
- found->found_refs++;
- found->ino = ino;
- found->offset = offset;
- return 0;
+ if (offset + bctx->extent_len > bctx->cur_offset)
+ return 0;
+#endif
}
bctx->found++;
@@ -1130,6 +1133,12 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
}
/*
+ * Given an inode, offset and extent item, it finds a good clone for a clone
+ * instruction. Returns -ENOENT when none could be found. The function makes
+ * sure that the returned clone is usable at the point where sending is at the
+ * moment. This means that no clones are accepted which lie behind the current
+ * inode+offset.
+ *
* path must point to the extent item when called.
*/
static int find_extent_clone(struct send_ctx *sctx,
@@ -1141,20 +1150,29 @@ static int find_extent_clone(struct send_ctx *sctx,
int ret;
int extent_type;
u64 logical;
+ u64 disk_byte;
u64 num_bytes;
u64 extent_item_pos;
+ u64 flags = 0;
struct btrfs_file_extent_item *fi;
struct extent_buffer *eb = path->nodes[0];
- struct backref_ctx backref_ctx;
+ struct backref_ctx *backref_ctx = NULL;
struct clone_root *cur_clone_root;
struct btrfs_key found_key;
struct btrfs_path *tmp_path;
+ int compressed;
u32 i;
tmp_path = alloc_path_for_send();
if (!tmp_path)
return -ENOMEM;
+ backref_ctx = kmalloc(sizeof(*backref_ctx), GFP_NOFS);
+ if (!backref_ctx) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
if (data_offset >= ino_size) {
/*
* There may be extents that lie behind the file's size.
@@ -1172,22 +1190,23 @@ static int find_extent_clone(struct send_ctx *sctx,
ret = -ENOENT;
goto out;
}
+ compressed = btrfs_file_extent_compression(eb, fi);
num_bytes = btrfs_file_extent_num_bytes(eb, fi);
- logical = btrfs_file_extent_disk_bytenr(eb, fi);
- if (logical == 0) {
+ disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
+ if (disk_byte == 0) {
ret = -ENOENT;
goto out;
}
- logical += btrfs_file_extent_offset(eb, fi);
+ logical = disk_byte + btrfs_file_extent_offset(eb, fi);
- ret = extent_from_logical(sctx->send_root->fs_info,
- logical, tmp_path, &found_key);
+ ret = extent_from_logical(sctx->send_root->fs_info, disk_byte, tmp_path,
+ &found_key, &flags);
btrfs_release_path(tmp_path);
if (ret < 0)
goto out;
- if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+ if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
ret = -EIO;
goto out;
}
@@ -1202,12 +1221,12 @@ static int find_extent_clone(struct send_ctx *sctx,
cur_clone_root->found_refs = 0;
}
- backref_ctx.sctx = sctx;
- backref_ctx.found = 0;
- backref_ctx.cur_objectid = ino;
- backref_ctx.cur_offset = data_offset;
- backref_ctx.found_in_send_root = 0;
- backref_ctx.extent_len = num_bytes;
+ backref_ctx->sctx = sctx;
+ backref_ctx->found = 0;
+ backref_ctx->cur_objectid = ino;
+ backref_ctx->cur_offset = data_offset;
+ backref_ctx->found_itself = 0;
+ backref_ctx->extent_len = num_bytes;
/*
* The last extent of a file may be too large due to page alignment.
@@ -1215,25 +1234,31 @@ static int find_extent_clone(struct send_ctx *sctx,
* __iterate_backrefs work.
*/
if (data_offset + num_bytes >= ino_size)
- backref_ctx.extent_len = ino_size - data_offset;
+ backref_ctx->extent_len = ino_size - data_offset;
/*
* Now collect all backrefs.
*/
+ if (compressed == BTRFS_COMPRESS_NONE)
+ extent_item_pos = logical - found_key.objectid;
+ else
+ extent_item_pos = 0;
+
extent_item_pos = logical - found_key.objectid;
ret = iterate_extent_inodes(sctx->send_root->fs_info,
found_key.objectid, extent_item_pos, 1,
- __iterate_backrefs, &backref_ctx);
+ __iterate_backrefs, backref_ctx);
+
if (ret < 0)
goto out;
- if (!backref_ctx.found_in_send_root) {
+ if (!backref_ctx->found_itself) {
/* found a bug in backref code? */
ret = -EIO;
printk(KERN_ERR "btrfs: ERROR did not find backref in "
"send_root. inode=%llu, offset=%llu, "
- "logical=%llu\n",
- ino, data_offset, logical);
+ "disk_byte=%llu found extent=%llu\n",
+ ino, data_offset, disk_byte, found_key.objectid);
goto out;
}
@@ -1242,7 +1267,7 @@ verbose_printk(KERN_DEBUG "btrfs: find_extent_clone: data_offset=%llu, "
"num_bytes=%llu, logical=%llu\n",
data_offset, ino, num_bytes, logical);
- if (!backref_ctx.found)
+ if (!backref_ctx->found)
verbose_printk("btrfs: no clones found\n");
cur_clone_root = NULL;
@@ -1253,7 +1278,6 @@ verbose_printk(KERN_DEBUG "btrfs: find_extent_clone: data_offset=%llu, "
else if (sctx->clone_roots[i].root == sctx->send_root)
/* prefer clones from send_root over others */
cur_clone_root = sctx->clone_roots + i;
- break;
}
}
@@ -1267,6 +1291,7 @@ verbose_printk(KERN_DEBUG "btrfs: find_extent_clone: data_offset=%llu, "
out:
btrfs_free_path(tmp_path);
+ kfree(backref_ctx);
return ret;
}
@@ -1307,8 +1332,6 @@ static int read_symlink(struct send_ctx *sctx,
len = btrfs_file_extent_inline_len(path->nodes[0], ei);
ret = fs_path_add_from_extent_buffer(dest, path->nodes[0], off, len);
- if (ret < 0)
- goto out;
out:
btrfs_free_path(path);
@@ -1404,7 +1427,7 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen)
u64 right_gen;
ret = get_inode_info(sctx->send_root, ino, NULL, &left_gen, NULL, NULL,
- NULL);
+ NULL, NULL);
if (ret < 0 && ret != -ENOENT)
goto out;
left_ret = ret;
@@ -1413,16 +1436,16 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen)
right_ret = -ENOENT;
} else {
ret = get_inode_info(sctx->parent_root, ino, NULL, &right_gen,
- NULL, NULL, NULL);
+ NULL, NULL, NULL, NULL);
if (ret < 0 && ret != -ENOENT)
goto out;
right_ret = ret;
}
if (!left_ret && !right_ret) {
- if (left_gen == gen && right_gen == gen)
+ if (left_gen == gen && right_gen == gen) {
ret = inode_state_no_change;
- else if (left_gen == gen) {
+ } else if (left_gen == gen) {
if (ino < sctx->send_progress)
ret = inode_state_did_create;
else
@@ -1516,6 +1539,10 @@ out:
return ret;
}
+/*
+ * Looks up the first btrfs_inode_ref of a given ino. It returns the parent dir,
+ * generation of the parent dir and the name of the dir entry.
+ */
static int get_first_ref(struct send_ctx *sctx,
struct btrfs_root *root, u64 ino,
u64 *dir, u64 *dir_gen, struct fs_path *name)
@@ -1557,7 +1584,7 @@ static int get_first_ref(struct send_ctx *sctx,
btrfs_release_path(path);
ret = get_inode_info(root, found_key.offset, NULL, dir_gen, NULL, NULL,
- NULL);
+ NULL, NULL);
if (ret < 0)
goto out;
@@ -1586,22 +1613,28 @@ static int is_first_ref(struct send_ctx *sctx,
if (ret < 0)
goto out;
- if (name_len != fs_path_len(tmp_name)) {
+ if (dir != tmp_dir || name_len != fs_path_len(tmp_name)) {
ret = 0;
goto out;
}
- ret = memcmp(tmp_name->start, name, name_len);
- if (ret)
- ret = 0;
- else
- ret = 1;
+ ret = !memcmp(tmp_name->start, name, name_len);
out:
fs_path_free(sctx, tmp_name);
return ret;
}
+/*
+ * Used by process_recorded_refs to determine if a new ref would overwrite an
+ * already existing ref. In case it detects an overwrite, it returns the
+ * inode/gen in who_ino/who_gen.
+ * When an overwrite is detected, process_recorded_refs does proper orphanizing
+ * to make sure later references to the overwritten inode are possible.
+ * Orphanizing is however only required for the first ref of an inode.
+ * process_recorded_refs does an additional is_first_ref check to see if
+ * orphanizing is really required.
+ */
static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
const char *name, int name_len,
u64 *who_ino, u64 *who_gen)
@@ -1626,9 +1659,14 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
goto out;
}
+ /*
+ * Check if the overwritten ref was already processed. If yes, the ref
+ * was already unlinked/moved, so we can safely assume that we will not
+ * overwrite anything at this point in time.
+ */
if (other_inode > sctx->send_progress) {
ret = get_inode_info(sctx->parent_root, other_inode, NULL,
- who_gen, NULL, NULL, NULL);
+ who_gen, NULL, NULL, NULL, NULL);
if (ret < 0)
goto out;
@@ -1642,6 +1680,13 @@ out:
return ret;
}
+/*
+ * Checks if the ref was overwritten by an already processed inode. This is
+ * used by __get_cur_name_and_parent to find out if the ref was orphanized and
+ * thus the orphan name needs to be used.
+ * process_recorded_refs also uses it to avoid unlinking of refs that were
+ * overwritten.
+ */
static int did_overwrite_ref(struct send_ctx *sctx,
u64 dir, u64 dir_gen,
u64 ino, u64 ino_gen,
@@ -1671,7 +1716,7 @@ static int did_overwrite_ref(struct send_ctx *sctx,
}
ret = get_inode_info(sctx->send_root, ow_inode, NULL, &gen, NULL, NULL,
- NULL);
+ NULL, NULL);
if (ret < 0)
goto out;
@@ -1690,6 +1735,11 @@ out:
return ret;
}
+/*
+ * Same as did_overwrite_ref, but also checks if it is the first ref of an inode
+ * that got overwritten. This is used by process_recorded_refs to determine
+ * if it has to use the path as returned by get_cur_path or the orphan name.
+ */
static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen)
{
int ret = 0;
@@ -1710,39 +1760,40 @@ static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen)
ret = did_overwrite_ref(sctx, dir, dir_gen, ino, gen,
name->start, fs_path_len(name));
- if (ret < 0)
- goto out;
out:
fs_path_free(sctx, name);
return ret;
}
+/*
+ * Insert a name cache entry. On 32bit kernels the radix tree index is 32bit,
+ * so inums can clash; clashing entries are chained on one list head through
+ * name_cache_entry::radix_list. nce is kfreed if radix_tree_insert() fails.
+ * NOTE(review): the kmalloc failure path returns -ENOMEM without freeing nce.
+ */
static int name_cache_insert(struct send_ctx *sctx,
struct name_cache_entry *nce)
{
int ret = 0;
- struct name_cache_entry **ncea;
-
- ncea = radix_tree_lookup(&sctx->name_cache, nce->ino);
- if (ncea) {
- if (!ncea[0])
- ncea[0] = nce;
- else if (!ncea[1])
- ncea[1] = nce;
- else
- BUG();
- } else {
- ncea = kmalloc(sizeof(void *) * 2, GFP_NOFS);
- if (!ncea)
+ struct list_head *nce_head;
+
+ nce_head = radix_tree_lookup(&sctx->name_cache,
+ (unsigned long)nce->ino);
+ if (!nce_head) {
+ nce_head = kmalloc(sizeof(*nce_head), GFP_NOFS);
+ if (!nce_head)
return -ENOMEM;
+ INIT_LIST_HEAD(nce_head);
- ncea[0] = nce;
- ncea[1] = NULL;
- ret = radix_tree_insert(&sctx->name_cache, nce->ino, ncea);
- if (ret < 0)
+ ret = radix_tree_insert(&sctx->name_cache, nce->ino, nce_head);
+ if (ret < 0) {
+ kfree(nce_head);
+ kfree(nce);
return ret;
+ }
}
+ list_add_tail(&nce->radix_list, nce_head);
list_add_tail(&nce->list, &sctx->name_cache_list);
sctx->name_cache_size++;
@@ -1752,50 +1803,52 @@ static int name_cache_insert(struct send_ctx *sctx,
static void name_cache_delete(struct send_ctx *sctx,
struct name_cache_entry *nce)
{
- struct name_cache_entry **ncea;
-
- ncea = radix_tree_lookup(&sctx->name_cache, nce->ino);
- BUG_ON(!ncea);
-
- if (ncea[0] == nce)
- ncea[0] = NULL;
- else if (ncea[1] == nce)
- ncea[1] = NULL;
- else
- BUG();
+ struct list_head *nce_head;
- if (!ncea[0] && !ncea[1]) {
- radix_tree_delete(&sctx->name_cache, nce->ino);
- kfree(ncea);
- }
+ nce_head = radix_tree_lookup(&sctx->name_cache,
+ (unsigned long)nce->ino);
+ BUG_ON(!nce_head);
+ list_del(&nce->radix_list);
list_del(&nce->list);
-
sctx->name_cache_size--;
+
+ if (list_empty(nce_head)) {
+ radix_tree_delete(&sctx->name_cache, (unsigned long)nce->ino);
+ kfree(nce_head);
+ }
}
static struct name_cache_entry *name_cache_search(struct send_ctx *sctx,
u64 ino, u64 gen)
{
- struct name_cache_entry **ncea;
+ struct list_head *nce_head;
+ struct name_cache_entry *cur;
- ncea = radix_tree_lookup(&sctx->name_cache, ino);
- if (!ncea)
+ nce_head = radix_tree_lookup(&sctx->name_cache, (unsigned long)ino);
+ if (!nce_head)
return NULL;
- if (ncea[0] && ncea[0]->gen == gen)
- return ncea[0];
- else if (ncea[1] && ncea[1]->gen == gen)
- return ncea[1];
+ list_for_each_entry(cur, nce_head, radix_list) {
+ if (cur->ino == ino && cur->gen == gen)
+ return cur;
+ }
return NULL;
}
+/*
+ * Removes the entry from the list and adds it back to the end. This marks the
+ * entry as recently used so that name_cache_clean_unused does not remove it.
+ */
static void name_cache_used(struct send_ctx *sctx, struct name_cache_entry *nce)
{
list_del(&nce->list);
list_add_tail(&nce->list, &sctx->name_cache_list);
}
+/*
+ * Remove some entries from the beginning of name_cache_list.
+ */
static void name_cache_clean_unused(struct send_ctx *sctx)
{
struct name_cache_entry *nce;
@@ -1814,13 +1867,23 @@ static void name_cache_clean_unused(struct send_ctx *sctx)
static void name_cache_free(struct send_ctx *sctx)
{
struct name_cache_entry *nce;
- struct name_cache_entry *tmp;
- list_for_each_entry_safe(nce, tmp, &sctx->name_cache_list, list) {
+ while (!list_empty(&sctx->name_cache_list)) {
+ nce = list_entry(sctx->name_cache_list.next,
+ struct name_cache_entry, list);
name_cache_delete(sctx, nce);
+ kfree(nce);
}
}
+/*
+ * Used by get_cur_path for each ref up to the root.
+ * Returns 0 if it succeeded.
+ * Returns 1 if the inode is not existent or got overwritten. In that case, the
+ * name is an orphan name. This instructs get_cur_path to stop iterating. If 1
+ * is returned, parent_ino/parent_gen are not guaranteed to be valid.
+ * Returns <0 in case of error.
+ */
static int __get_cur_name_and_parent(struct send_ctx *sctx,
u64 ino, u64 gen,
u64 *parent_ino,
@@ -1832,6 +1895,11 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
struct btrfs_path *path = NULL;
struct name_cache_entry *nce = NULL;
+ /*
+ * First check if we already did a call to this function with the same
+ * ino/gen. If yes, check if the cache entry is still up-to-date. If yes
+ * return the cached result.
+ */
nce = name_cache_search(sctx, ino, gen);
if (nce) {
if (ino < sctx->send_progress && nce->need_later_update) {
@@ -1854,6 +1922,11 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
if (!path)
return -ENOMEM;
+ /*
+ * If the inode is not existent yet, add the orphan name and return 1.
+ * This should only happen for the parent dir that we determine in
+ * __record_new_ref
+ */
ret = is_inode_existent(sctx, ino, gen);
if (ret < 0)
goto out;
@@ -1866,6 +1939,10 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
goto out_cache;
}
+ /*
+ * Depending on whether the inode was already processed or not, use
+ * send_root or parent_root for ref lookup.
+ */
if (ino < sctx->send_progress)
ret = get_first_ref(sctx, sctx->send_root, ino,
parent_ino, parent_gen, dest);
@@ -1875,6 +1952,10 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
if (ret < 0)
goto out;
+ /*
+ * Check if the ref was overwritten by an inode's ref that was processed
+ * earlier. If yes, treat as orphan and return 1.
+ */
ret = did_overwrite_ref(sctx, *parent_ino, *parent_gen, ino, gen,
dest->start, dest->end - dest->start);
if (ret < 0)
@@ -1888,6 +1969,9 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
}
out_cache:
+ /*
+ * Store the result of the lookup in the name cache.
+ */
nce = kmalloc(sizeof(*nce) + fs_path_len(dest) + 1, GFP_NOFS);
if (!nce) {
ret = -ENOMEM;
@@ -1901,7 +1985,6 @@ out_cache:
nce->name_len = fs_path_len(dest);
nce->ret = ret;
strcpy(nce->name, dest->start);
- memset(&nce->use_list, 0, sizeof(nce->use_list));
if (ino < sctx->send_progress)
nce->need_later_update = 0;
@@ -2107,9 +2190,6 @@ static int send_subvol_begin(struct send_ctx *sctx)
read_extent_buffer(leaf, name, (unsigned long)(ref + 1), namelen);
btrfs_release_path(path);
- if (ret < 0)
- goto out;
-
if (parent_root) {
ret = begin_cmd(sctx, BTRFS_SEND_C_SNAPSHOT);
if (ret < 0)
@@ -2276,7 +2356,7 @@ verbose_printk("btrfs: send_utimes %llu\n", ino);
btrfs_inode_mtime(ii));
TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb,
btrfs_inode_ctime(ii));
- /* TODO otime? */
+ /* TODO Add otime support when the otime patches get into upstream */
ret = send_cmd(sctx);
@@ -2292,39 +2372,39 @@ out:
* a valid path yet because we did not process the refs yet. So, the inode
* is created as orphan.
*/
-static int send_create_inode(struct send_ctx *sctx, struct btrfs_path *path,
- struct btrfs_key *key)
+static int send_create_inode(struct send_ctx *sctx, u64 ino)
{
int ret = 0;
- struct extent_buffer *eb = path->nodes[0];
- struct btrfs_inode_item *ii;
struct fs_path *p;
- int slot = path->slots[0];
int cmd;
+ u64 gen;
u64 mode;
+ u64 rdev;
-verbose_printk("btrfs: send_create_inode %llu\n", sctx->cur_ino);
+verbose_printk("btrfs: send_create_inode %llu\n", ino);
p = fs_path_alloc(sctx);
if (!p)
return -ENOMEM;
- ii = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
- mode = btrfs_inode_mode(eb, ii);
+ ret = get_inode_info(sctx->send_root, ino, NULL, &gen, &mode, NULL,
+ NULL, &rdev);
+ if (ret < 0)
+ goto out;
- if (S_ISREG(mode))
+ if (S_ISREG(mode)) {
cmd = BTRFS_SEND_C_MKFILE;
- else if (S_ISDIR(mode))
+ } else if (S_ISDIR(mode)) {
cmd = BTRFS_SEND_C_MKDIR;
- else if (S_ISLNK(mode))
+ } else if (S_ISLNK(mode)) {
cmd = BTRFS_SEND_C_SYMLINK;
- else if (S_ISCHR(mode) || S_ISBLK(mode))
+ } else if (S_ISCHR(mode) || S_ISBLK(mode)) {
cmd = BTRFS_SEND_C_MKNOD;
- else if (S_ISFIFO(mode))
+ } else if (S_ISFIFO(mode)) {
cmd = BTRFS_SEND_C_MKFIFO;
- else if (S_ISSOCK(mode))
+ } else if (S_ISSOCK(mode)) {
cmd = BTRFS_SEND_C_MKSOCK;
- else {
+ } else {
printk(KERN_WARNING "btrfs: unexpected inode type %o",
(int)(mode & S_IFMT));
ret = -ENOTSUPP;
@@ -2335,22 +2415,22 @@ verbose_printk("btrfs: send_create_inode %llu\n", sctx->cur_ino);
if (ret < 0)
goto out;
- ret = gen_unique_name(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
+ ret = gen_unique_name(sctx, ino, gen, p);
if (ret < 0)
goto out;
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
- TLV_PUT_U64(sctx, BTRFS_SEND_A_INO, sctx->cur_ino);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_INO, ino);
if (S_ISLNK(mode)) {
fs_path_reset(p);
- ret = read_symlink(sctx, sctx->send_root, sctx->cur_ino, p);
+ ret = read_symlink(sctx, sctx->send_root, ino, p);
if (ret < 0)
goto out;
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, p);
} else if (S_ISCHR(mode) || S_ISBLK(mode) ||
S_ISFIFO(mode) || S_ISSOCK(mode)) {
- TLV_PUT_U64(sctx, BTRFS_SEND_A_RDEV, btrfs_inode_rdev(eb, ii));
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_RDEV, rdev);
}
ret = send_cmd(sctx);
@@ -2364,6 +2444,92 @@ out:
return ret;
}
+/*
+ * We need some special handling for inodes that get processed before the parent
+ * directory got created. See process_recorded_refs for details.
+ * This function checks whether @dir was already created out of order.
+ *
+ * It walks the BTRFS_DIR_INDEX_KEY items of @dir in the send root. As soon as
+ * an entry refers to an inode number below sctx->send_progress, the dir is
+ * reported as already created, because processing that inode's refs had to
+ * create the parent dir first.
+ *
+ * Returns 1 if the dir was already created, 0 if it was not, <0 on error.
+ */
+static int did_create_dir(struct send_ctx *sctx, u64 dir)
+{
+	int ret = 0;
+	struct btrfs_path *path = NULL;
+	struct btrfs_key key;
+	struct btrfs_key found_key;
+	struct btrfs_key di_key;
+	struct extent_buffer *eb;
+	struct btrfs_dir_item *di;
+	int slot;
+
+	path = alloc_path_for_send();
+	if (!path) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	key.objectid = dir;
+	key.type = BTRFS_DIR_INDEX_KEY;
+	key.offset = 0;
+	while (1) {
+		ret = btrfs_search_slot_for_read(sctx->send_root, &key, path,
+				1, 0);
+		if (ret < 0)
+			goto out;
+		if (!ret) {
+			eb = path->nodes[0];
+			slot = path->slots[0];
+			btrfs_item_key_to_cpu(eb, &found_key, slot);
+		}
+		/*
+		 * Stop when the search found nothing (ret > 0) or the item
+		 * found belongs to another object or another key type.
+		 * found_key is only read when ret == 0.
+		 */
+		if (ret || found_key.objectid != key.objectid ||
+		    found_key.type != key.type) {
+			ret = 0;
+			goto out;
+		}
+
+		di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
+		btrfs_dir_item_key_to_cpu(eb, di, &di_key);
+
+		/*
+		 * Entries that point to a subvolume carry a root item key. Its
+		 * objectid is a root id and not an inode number of this tree,
+		 * so comparing it with send_progress would be meaningless and
+		 * could wrongly report the dir as created. Skip such entries.
+		 */
+		if (di_key.type != BTRFS_ROOT_ITEM_KEY &&
+		    di_key.objectid < sctx->send_progress) {
+			ret = 1;
+			goto out;
+		}
+
+		/* Continue with the next dir index item of this dir. */
+		key.offset = found_key.offset + 1;
+		btrfs_release_path(path);
+	}
+
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * Send the create command for the current inode unless it is a directory
+ * that was already created out of order while processing an earlier inode.
+ * See did_create_dir and process_recorded_refs for details.
+ *
+ * Returns 0 if nothing had to be sent, otherwise the result of
+ * send_create_inode. Errors from did_create_dir are passed through.
+ */
+static int send_create_inode_if_needed(struct send_ctx *sctx)
+{
+	int created;
+
+	if (S_ISDIR(sctx->cur_inode_mode)) {
+		created = did_create_dir(sctx, sctx->cur_ino);
+		if (created < 0)
+			return created;
+		/* The dir already exists on the receiving side. */
+		if (created)
+			return 0;
+	}
+
+	return send_create_inode(sctx, sctx->cur_ino);
+}
+
struct recorded_ref {
struct list_head list;
char *dir_path;
@@ -2416,13 +2582,13 @@ static int record_ref(struct list_head *head, u64 dir,
static void __free_recorded_refs(struct send_ctx *sctx, struct list_head *head)
{
struct recorded_ref *cur;
- struct recorded_ref *tmp;
- list_for_each_entry_safe(cur, tmp, head, list) {
+ while (!list_empty(head)) { /* drain the list from the front until it is empty */
+ cur = list_entry(head->next, struct recorded_ref, list); /* cur is always the first remaining entry */
fs_path_free(sctx, cur->full_path);
+ list_del(&cur->list); /* unlink cur so head->next moves on to the next entry */
kfree(cur);
}
- INIT_LIST_HEAD(head);
}
static void free_recorded_refs(struct send_ctx *sctx)
@@ -2432,7 +2598,7 @@ static void free_recorded_refs(struct send_ctx *sctx)
}
/*
- * Renames/moves a file/dir to it's orphan name. Used when the first
+ * Renames/moves a file/dir to its orphan name. Used when the first
* ref of an unprocessed inode gets overwritten and for all non empty
* directories.
*/
@@ -2472,6 +2638,12 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 send_progress)
struct btrfs_key loc;
struct btrfs_dir_item *di;
+ /*
+ * Don't try to rmdir the top/root subvolume dir.
+ */
+ if (dir == BTRFS_FIRST_FREE_OBJECTID)
+ return 0;
+
path = alloc_path_for_send();
if (!path)
return -ENOMEM;
@@ -2513,160 +2685,6 @@ out:
return ret;
}
-struct finish_unordered_dir_ctx {
- struct send_ctx *sctx;
- struct fs_path *cur_path;
- struct fs_path *dir_path;
- u64 dir_ino;
- int need_delete;
- int delete_pass;
-};
-
-int __finish_unordered_dir(int num, struct btrfs_key *di_key,
- const char *name, int name_len,
- const char *data, int data_len,
- u8 type, void *ctx)
-{
- int ret = 0;
- struct finish_unordered_dir_ctx *fctx = ctx;
- struct send_ctx *sctx = fctx->sctx;
- u64 di_gen;
- u64 di_mode;
- int is_orphan = 0;
-
- if (di_key->objectid >= fctx->dir_ino)
- goto out;
-
- fs_path_reset(fctx->cur_path);
-
- ret = get_inode_info(sctx->send_root, di_key->objectid,
- NULL, &di_gen, &di_mode, NULL, NULL);
- if (ret < 0)
- goto out;
-
- ret = is_first_ref(sctx, sctx->send_root, di_key->objectid,
- fctx->dir_ino, name, name_len);
- if (ret < 0)
- goto out;
- if (ret) {
- is_orphan = 1;
- ret = gen_unique_name(sctx, di_key->objectid, di_gen,
- fctx->cur_path);
- } else {
- ret = get_cur_path(sctx, di_key->objectid, di_gen,
- fctx->cur_path);
- }
- if (ret < 0)
- goto out;
-
- ret = fs_path_add(fctx->dir_path, name, name_len);
- if (ret < 0)
- goto out;
-
- if (!fctx->delete_pass) {
- if (S_ISDIR(di_mode)) {
- ret = send_rename(sctx, fctx->cur_path,
- fctx->dir_path);
- } else {
- ret = send_link(sctx, fctx->dir_path,
- fctx->cur_path);
- if (is_orphan)
- fctx->need_delete = 1;
- }
- } else if (!S_ISDIR(di_mode)) {
- ret = send_unlink(sctx, fctx->cur_path);
- } else {
- ret = 0;
- }
-
- fs_path_remove(fctx->dir_path);
-
-out:
- return ret;
-}
-
-/*
- * Go through all dir items and see if we find refs which could not be created
- * in the past because the dir did not exist at that time.
- */
-static int finish_outoforder_dir(struct send_ctx *sctx, u64 dir, u64 dir_gen)
-{
- int ret = 0;
- struct btrfs_path *path = NULL;
- struct btrfs_key key;
- struct btrfs_key found_key;
- struct extent_buffer *eb;
- struct finish_unordered_dir_ctx fctx;
- int slot;
-
- path = alloc_path_for_send();
- if (!path) {
- ret = -ENOMEM;
- goto out;
- }
-
- memset(&fctx, 0, sizeof(fctx));
- fctx.sctx = sctx;
- fctx.cur_path = fs_path_alloc(sctx);
- fctx.dir_path = fs_path_alloc(sctx);
- if (!fctx.cur_path || !fctx.dir_path) {
- ret = -ENOMEM;
- goto out;
- }
- fctx.dir_ino = dir;
-
- ret = get_cur_path(sctx, dir, dir_gen, fctx.dir_path);
- if (ret < 0)
- goto out;
-
- /*
- * We do two passes. The first links in the new refs and the second
- * deletes orphans if required. Deletion of orphans is not required for
- * directory inodes, as we always have only one ref and use rename
- * instead of link for those.
- */
-
-again:
- key.objectid = dir;
- key.type = BTRFS_DIR_ITEM_KEY;
- key.offset = 0;
- while (1) {
- ret = btrfs_search_slot_for_read(sctx->send_root, &key, path,
- 1, 0);
- if (ret < 0)
- goto out;
- eb = path->nodes[0];
- slot = path->slots[0];
- btrfs_item_key_to_cpu(eb, &found_key, slot);
-
- if (found_key.objectid != key.objectid ||
- found_key.type != key.type) {
- btrfs_release_path(path);
- break;
- }
-
- ret = iterate_dir_item(sctx, sctx->send_root, path,
- &found_key, __finish_unordered_dir,
- &fctx);
- if (ret < 0)
- goto out;
-
- key.offset = found_key.offset + 1;
- btrfs_release_path(path);
- }
-
- if (!fctx.delete_pass && fctx.need_delete) {
- fctx.delete_pass = 1;
- goto again;
- }
-
-out:
- btrfs_free_path(path);
- fs_path_free(sctx, fctx.cur_path);
- fs_path_free(sctx, fctx.dir_path);
- return ret;
-}
-
/*
* This does all the move/link/unlink/rmdir magic.
*/
@@ -2674,6 +2692,7 @@ static int process_recorded_refs(struct send_ctx *sctx)
{
int ret = 0;
struct recorded_ref *cur;
+ struct recorded_ref *cur2;
struct ulist *check_dirs = NULL;
struct ulist_iterator uit;
struct ulist_node *un;
@@ -2685,6 +2704,12 @@ static int process_recorded_refs(struct send_ctx *sctx)
verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
+ /*
+ * This should never happen as the root dir always has the same ref
+ * which is always '..'
+ */
+ BUG_ON(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID);
+
valid_path = fs_path_alloc(sctx);
if (!valid_path) {
ret = -ENOMEM;
@@ -2731,6 +2756,46 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
list_for_each_entry(cur, &sctx->new_refs, list) {
/*
+ * We may have refs where the parent directory does not exist
+ * yet. This happens if the parent directory's inum is higher
+ * than the current inum. To handle this case, we create the
+ * parent directory out of order. But we need to check if this
+ * did already happen before due to other refs in the same dir.
+ */
+ ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen);
+ if (ret < 0)
+ goto out;
+ if (ret == inode_state_will_create) {
+ ret = 0;
+ /*
+ * First check if any of the current inode's refs did
+ * already create the dir.
+ */
+ list_for_each_entry(cur2, &sctx->new_refs, list) {
+ if (cur == cur2)
+ break;
+ if (cur2->dir == cur->dir) {
+ ret = 1;
+ break;
+ }
+ }
+
+ /*
+ * If that did not happen, check if a previous inode
+ * did already create the dir.
+ */
+ if (!ret)
+ ret = did_create_dir(sctx, cur->dir);
+ if (ret < 0)
+ goto out;
+ if (!ret) {
+ ret = send_create_inode(sctx, cur->dir);
+ if (ret < 0)
+ goto out;
+ }
+ }
+
+ /*
* Check if this new ref would overwrite the first ref of
* another unprocessed inode. If yes, orphanize the
* overwritten inode. If we find an overwritten ref that is
@@ -2764,7 +2829,7 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
* inode, move it and update valid_path. If not, link or move
* it depending on the inode mode.
*/
- if (is_orphan && !sctx->cur_inode_first_ref_orphan) {
+ if (is_orphan) {
ret = send_rename(sctx, valid_path, cur->full_path);
if (ret < 0)
goto out;
@@ -2827,6 +2892,17 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
if (ret < 0)
goto out;
}
+ } else if (S_ISDIR(sctx->cur_inode_mode) &&
+ !list_empty(&sctx->deleted_refs)) {
+ /*
+ * We have a moved dir. Add the old parent to check_dirs
+ */
+ cur = list_entry(sctx->deleted_refs.next, struct recorded_ref,
+ list);
+ ret = ulist_add(check_dirs, cur->dir, cur->dir_gen,
+ GFP_NOFS);
+ if (ret < 0)
+ goto out;
} else if (!S_ISDIR(sctx->cur_inode_mode)) {
/*
* We have a non dir inode. Go through all deleted refs and
@@ -2840,35 +2916,9 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
if (ret < 0)
goto out;
if (!ret) {
- /*
- * In case the inode was moved to a directory
- * that was not created yet (see
- * __record_new_ref), we can not unlink the ref
- * as it will be needed later when the parent
- * directory is created, so that we can move in
- * the inode to the new dir.
- */
- if (!is_orphan &&
- sctx->cur_inode_first_ref_orphan) {
- ret = orphanize_inode(sctx,
- sctx->cur_ino,
- sctx->cur_inode_gen,
- cur->full_path);
- if (ret < 0)
- goto out;
- ret = gen_unique_name(sctx,
- sctx->cur_ino,
- sctx->cur_inode_gen,
- valid_path);
- if (ret < 0)
- goto out;
- is_orphan = 1;
-
- } else {
- ret = send_unlink(sctx, cur->full_path);
- if (ret < 0)
- goto out;
- }
+ ret = send_unlink(sctx, cur->full_path);
+ if (ret < 0)
+ goto out;
}
ret = ulist_add(check_dirs, cur->dir, cur->dir_gen,
GFP_NOFS);
@@ -2880,12 +2930,11 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
* If the inode is still orphan, unlink the orphan. This may
* happen when a previous inode did overwrite the first ref
* of this inode and no new refs were added for the current
- * inode.
- * We can however not delete the orphan in case the inode relies
- * in a directory that was not created yet (see
- * __record_new_ref)
+ * inode. Unlinking does not mean that the inode is deleted in
+ * all cases. There may still be links to this inode in other
+ * places.
*/
- if (is_orphan && !sctx->cur_inode_first_ref_orphan) {
+ if (is_orphan) {
ret = send_unlink(sctx, valid_path);
if (ret < 0)
goto out;
@@ -2900,6 +2949,11 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
*/
ULIST_ITER_INIT(&uit);
while ((un = ulist_next(check_dirs, &uit))) {
+ /*
+ * In case we had refs into dirs that were not processed yet,
+ * we don't need to do the utime and rmdir logic for these dirs.
+ * The dir will be processed later.
+ */
if (un->val > sctx->cur_ino)
continue;
@@ -2929,25 +2983,6 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
}
}
- /*
- * Current inode is now at it's new position, so we must increase
- * send_progress
- */
- sctx->send_progress = sctx->cur_ino + 1;
-
- /*
- * We may have a directory here that has pending refs which could not
- * be created before (because the dir did not exist before, see
- * __record_new_ref). finish_outoforder_dir will link/move the pending
- * refs.
- */
- if (S_ISDIR(sctx->cur_inode_mode) && sctx->cur_inode_new) {
- ret = finish_outoforder_dir(sctx, sctx->cur_ino,
- sctx->cur_inode_gen);
- if (ret < 0)
- goto out;
- }
-
ret = 0;
out:
@@ -2971,34 +3006,9 @@ static int __record_new_ref(int num, u64 dir, int index,
return -ENOMEM;
ret = get_inode_info(sctx->send_root, dir, NULL, &gen, NULL, NULL,
- NULL);
- if (ret < 0)
- goto out;
-
- /*
- * The parent may be non-existent at this point in time. This happens
- * if the ino of the parent dir is higher then the current ino. In this
- * case, we can not process this ref until the parent dir is finally
- * created. If we reach the parent dir later, process_recorded_refs
- * will go through all dir items and process the refs that could not be
- * processed before. In case this is the first ref, we set
- * cur_inode_first_ref_orphan to 1 to inform process_recorded_refs to
- * keep an orphan of the inode so that it later can be used for
- * link/move
- */
- ret = is_inode_existent(sctx, dir, gen);
+ NULL, NULL);
if (ret < 0)
goto out;
- if (!ret) {
- ret = is_first_ref(sctx, sctx->send_root, sctx->cur_ino, dir,
- name->start, fs_path_len(name));
- if (ret < 0)
- goto out;
- if (ret)
- sctx->cur_inode_first_ref_orphan = 1;
- ret = 0;
- goto out;
- }
ret = get_cur_path(sctx, dir, gen, p);
if (ret < 0)
@@ -3029,7 +3039,7 @@ static int __record_deleted_ref(int num, u64 dir, int index,
return -ENOMEM;
ret = get_inode_info(sctx->parent_root, dir, NULL, &gen, NULL, NULL,
- NULL);
+ NULL, NULL);
if (ret < 0)
goto out;
@@ -3206,33 +3216,28 @@ static int process_all_refs(struct send_ctx *sctx,
key.offset = 0;
while (1) {
ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
- if (ret < 0) {
- btrfs_release_path(path);
+ if (ret < 0)
goto out;
- }
- if (ret) {
- btrfs_release_path(path);
+ if (ret)
break;
- }
eb = path->nodes[0];
slot = path->slots[0];
btrfs_item_key_to_cpu(eb, &found_key, slot);
if (found_key.objectid != key.objectid ||
- found_key.type != key.type) {
- btrfs_release_path(path);
+ found_key.type != key.type)
break;
- }
- ret = iterate_inode_ref(sctx, sctx->parent_root, path,
- &found_key, 0, cb, sctx);
+ ret = iterate_inode_ref(sctx, root, path, &found_key, 0, cb,
+ sctx);
btrfs_release_path(path);
if (ret < 0)
goto out;
key.offset = found_key.offset + 1;
}
+ btrfs_release_path(path);
ret = process_recorded_refs(sctx);
@@ -3555,7 +3560,7 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len)
int ret = 0;
struct fs_path *p;
loff_t pos = offset;
- int readed = 0;
+ int num_read = 0;
mm_segment_t old_fs;
p = fs_path_alloc(sctx);
@@ -3580,8 +3585,8 @@ verbose_printk("btrfs: send_write offset=%llu, len=%d\n", offset, len);
ret = vfs_read(sctx->cur_inode_filp, sctx->read_buf, len, &pos);
if (ret < 0)
goto out;
- readed = ret;
- if (!readed)
+ num_read = ret;
+ if (!num_read)
goto out;
ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
@@ -3594,7 +3599,7 @@ verbose_printk("btrfs: send_write offset=%llu, len=%d\n", offset, len);
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
- TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, readed);
+ TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, num_read);
ret = send_cmd(sctx);
@@ -3604,7 +3609,7 @@ out:
set_fs(old_fs);
if (ret < 0)
return ret;
- return readed;
+ return num_read;
}
/*
@@ -3615,7 +3620,6 @@ static int send_clone(struct send_ctx *sctx,
struct clone_root *clone_root)
{
int ret = 0;
- struct btrfs_root *clone_root2 = clone_root->root;
struct fs_path *p;
u64 gen;
@@ -3640,22 +3644,23 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, "
TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_LEN, len);
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
- if (clone_root2 == sctx->send_root) {
+ if (clone_root->root == sctx->send_root) {
ret = get_inode_info(sctx->send_root, clone_root->ino, NULL,
- &gen, NULL, NULL, NULL);
+ &gen, NULL, NULL, NULL, NULL);
if (ret < 0)
goto out;
ret = get_cur_path(sctx, clone_root->ino, gen, p);
} else {
- ret = get_inode_path(sctx, clone_root2, clone_root->ino, p);
+ ret = get_inode_path(sctx, clone_root->root,
+ clone_root->ino, p);
}
if (ret < 0)
goto out;
TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
- clone_root2->root_item.uuid);
+ clone_root->root->root_item.uuid);
TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
- clone_root2->root_item.ctransid);
+ clone_root->root->root_item.ctransid);
TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p);
TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_OFFSET,
clone_root->offset);
@@ -3684,10 +3689,17 @@ static int send_write_or_clone(struct send_ctx *sctx,
ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_file_extent_item);
type = btrfs_file_extent_type(path->nodes[0], ei);
- if (type == BTRFS_FILE_EXTENT_INLINE)
+ if (type == BTRFS_FILE_EXTENT_INLINE) {
len = btrfs_file_extent_inline_len(path->nodes[0], ei);
- else
+ /*
+ * it is possible the inline item won't cover the whole page,
+ * but there may be items after this page. Make
+ * sure to send the whole thing
+ */
+ len = PAGE_CACHE_ALIGN(len);
+ } else {
len = btrfs_file_extent_num_bytes(path->nodes[0], ei);
+ }
if (offset + len > sctx->cur_inode_size)
len = sctx->cur_inode_size - offset;
@@ -3735,6 +3747,8 @@ static int is_extent_unchanged(struct send_ctx *sctx,
u64 left_offset_fixed;
u64 left_len;
u64 right_len;
+ u64 left_gen;
+ u64 right_gen;
u8 left_type;
u8 right_type;
@@ -3744,17 +3758,17 @@ static int is_extent_unchanged(struct send_ctx *sctx,
eb = left_path->nodes[0];
slot = left_path->slots[0];
-
ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
left_type = btrfs_file_extent_type(eb, ei);
- left_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
- left_len = btrfs_file_extent_num_bytes(eb, ei);
- left_offset = btrfs_file_extent_offset(eb, ei);
if (left_type != BTRFS_FILE_EXTENT_REG) {
ret = 0;
goto out;
}
+ left_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
+ left_len = btrfs_file_extent_num_bytes(eb, ei);
+ left_offset = btrfs_file_extent_offset(eb, ei);
+ left_gen = btrfs_file_extent_generation(eb, ei);
/*
* Following comments will refer to these graphics. L is the left
@@ -3810,6 +3824,7 @@ static int is_extent_unchanged(struct send_ctx *sctx,
right_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
right_len = btrfs_file_extent_num_bytes(eb, ei);
right_offset = btrfs_file_extent_offset(eb, ei);
+ right_gen = btrfs_file_extent_generation(eb, ei);
if (right_type != BTRFS_FILE_EXTENT_REG) {
ret = 0;
@@ -3820,7 +3835,7 @@ static int is_extent_unchanged(struct send_ctx *sctx,
* Are we at extent 8? If yes, we know the extent is changed.
* This may only happen on the first iteration.
*/
- if (found_key.offset + right_len < ekey->offset) {
+ if (found_key.offset + right_len <= ekey->offset) {
ret = 0;
goto out;
}
@@ -3837,8 +3852,9 @@ static int is_extent_unchanged(struct send_ctx *sctx,
/*
* Check if we have the same extent.
*/
- if (left_disknr + left_offset_fixed !=
- right_disknr + right_offset) {
+ if (left_disknr != right_disknr ||
+ left_offset_fixed != right_offset ||
+ left_gen != right_gen) {
ret = 0;
goto out;
}
@@ -3977,6 +3993,15 @@ static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end)
goto out;
ret = process_recorded_refs(sctx);
+ if (ret < 0)
+ goto out;
+
+ /*
+ * We have processed the refs and thus need to advance send_progress.
+ * Now, calls to get_cur_xxx will take the updated refs of the current
+ * inode into account.
+ */
+ sctx->send_progress = sctx->cur_ino + 1;
out:
return ret;
@@ -4004,7 +4029,7 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
goto out;
ret = get_inode_info(sctx->send_root, sctx->cur_ino, NULL, NULL,
- &left_mode, &left_uid, &left_gid);
+ &left_mode, &left_uid, &left_gid, NULL);
if (ret < 0)
goto out;
@@ -4015,7 +4040,7 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
} else {
ret = get_inode_info(sctx->parent_root, sctx->cur_ino,
NULL, NULL, &right_mode, &right_uid,
- &right_gid);
+ &right_gid, NULL);
if (ret < 0)
goto out;
@@ -4074,7 +4099,12 @@ static int changed_inode(struct send_ctx *sctx,
sctx->cur_ino = key->objectid;
sctx->cur_inode_new_gen = 0;
- sctx->cur_inode_first_ref_orphan = 0;
+
+ /*
+ * Set send_progress to current inode. This will tell all get_cur_xxx
+ * functions that the current inode's refs are not updated yet. Later,
+ * when process_recorded_refs is finished, it is set to cur_ino + 1.
+ */
sctx->send_progress = sctx->cur_ino;
if (result == BTRFS_COMPARE_TREE_NEW ||
@@ -4098,7 +4128,14 @@ static int changed_inode(struct send_ctx *sctx,
right_gen = btrfs_inode_generation(sctx->right_path->nodes[0],
right_ii);
- if (left_gen != right_gen)
+
+ /*
+ * The cur_ino = root dir case is special here. We can't treat
+ * the inode as deleted+reused because it would generate a
+ * stream that tries to delete/mkdir the root dir.
+ */
+ if (left_gen != right_gen &&
+ sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID)
sctx->cur_inode_new_gen = 1;
}
@@ -4111,8 +4148,7 @@ static int changed_inode(struct send_ctx *sctx,
sctx->cur_inode_mode = btrfs_inode_mode(
sctx->left_path->nodes[0], left_ii);
if (sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID)
- ret = send_create_inode(sctx, sctx->left_path,
- sctx->cmp_key);
+ ret = send_create_inode_if_needed(sctx);
} else if (result == BTRFS_COMPARE_TREE_DELETED) {
sctx->cur_inode_gen = right_gen;
sctx->cur_inode_new = 0;
@@ -4122,7 +4158,17 @@ static int changed_inode(struct send_ctx *sctx,
sctx->cur_inode_mode = btrfs_inode_mode(
sctx->right_path->nodes[0], right_ii);
} else if (result == BTRFS_COMPARE_TREE_CHANGED) {
+ /*
+ * We need to do some special handling in case the inode was
+ * reported as changed with a changed generation number. This
+ * means that the original inode was deleted and new inode
+ * reused the same inum. So we have to treat the old inode as
+ * deleted and the new one as new.
+ */
if (sctx->cur_inode_new_gen) {
+ /*
+ * First, process the inode as if it was deleted.
+ */
sctx->cur_inode_gen = right_gen;
sctx->cur_inode_new = 0;
sctx->cur_inode_deleted = 1;
@@ -4135,6 +4181,9 @@ static int changed_inode(struct send_ctx *sctx,
if (ret < 0)
goto out;
+ /*
+ * Now process the inode as if it was new.
+ */
sctx->cur_inode_gen = left_gen;
sctx->cur_inode_new = 1;
sctx->cur_inode_deleted = 0;
@@ -4142,14 +4191,23 @@ static int changed_inode(struct send_ctx *sctx,
sctx->left_path->nodes[0], left_ii);
sctx->cur_inode_mode = btrfs_inode_mode(
sctx->left_path->nodes[0], left_ii);
- ret = send_create_inode(sctx, sctx->left_path,
- sctx->cmp_key);
+ ret = send_create_inode_if_needed(sctx);
if (ret < 0)
goto out;
ret = process_all_refs(sctx, BTRFS_COMPARE_TREE_NEW);
if (ret < 0)
goto out;
+ /*
+ * Advance send_progress now as we did not get into
+ * process_recorded_refs_if_needed in the new_gen case.
+ */
+ sctx->send_progress = sctx->cur_ino + 1;
+
+ /*
+ * Now process all extents and xattrs of the inode as if
+ * they were all new.
+ */
ret = process_all_extents(sctx);
if (ret < 0)
goto out;
@@ -4172,6 +4230,16 @@ out:
return ret;
}
+/*
+ * We have to process new refs before deleted refs, but compare_trees gives us
+ * the new and deleted refs mixed. To fix this, we record the new/deleted refs
+ * first and later process them in process_recorded_refs.
+ * For the cur_inode_new_gen case, we skip recording completely because
+ * changed_inode did already initiate processing of refs. The reason for this is
+ * that in this case, compare_tree actually compares the refs of 2 different
+ * inodes. To fix this, process_all_refs is used in changed_inode to handle all
+ * refs of the right tree as deleted and all refs of the left tree as new.
+ */
static int changed_ref(struct send_ctx *sctx,
enum btrfs_compare_tree_result result)
{
@@ -4192,6 +4260,11 @@ static int changed_ref(struct send_ctx *sctx,
return ret;
}
+/*
+ * Process new/deleted/changed xattrs. We skip processing in the
+ * cur_inode_new_gen case because changed_inode did already initiate processing
+ * of xattrs. The reason is the same as in changed_ref
+ */
static int changed_xattr(struct send_ctx *sctx,
enum btrfs_compare_tree_result result)
{
@@ -4211,6 +4284,11 @@ static int changed_xattr(struct send_ctx *sctx,
return ret;
}
+/*
+ * Process new/deleted/changed extents. We skip processing in the
+ * cur_inode_new_gen case because changed_inode did already initiate processing
+ * of extents. The reason is the same as in changed_ref
+ */
static int changed_extent(struct send_ctx *sctx,
enum btrfs_compare_tree_result result)
{
@@ -4227,7 +4305,10 @@ static int changed_extent(struct send_ctx *sctx,
return ret;
}
-
+/*
+ * Updates compare related fields in sctx and simply forwards to the actual
+ * changed_xxx functions.
+ */
static int changed_cb(struct btrfs_root *left_root,
struct btrfs_root *right_root,
struct btrfs_path *left_path,
@@ -4247,6 +4328,11 @@ static int changed_cb(struct btrfs_root *left_root,
if (ret < 0)
goto out;
+ /* Ignore non-FS objects */
+ if (key->objectid == BTRFS_FREE_INO_OBJECTID ||
+ key->objectid == BTRFS_FREE_SPACE_OBJECTID)
+ goto out;
+
if (key->type == BTRFS_INODE_ITEM_KEY)
ret = changed_inode(sctx, result);
else if (key->type == BTRFS_INODE_REF_KEY)
@@ -4299,7 +4385,8 @@ join_trans:
}
/*
- * Make sure the tree has not changed
+ * Make sure the tree has not changed after re-joining. We detect this
+ * by comparing start_ctransid and ctransid. They should always match.
*/
spin_lock(&send_root->root_times_lock);
ctransid = btrfs_root_ctransid(&send_root->root_item);
diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h
index 9934e948e57..1bf4f32fd4e 100644
--- a/fs/btrfs/send.h
+++ b/fs/btrfs/send.h
@@ -130,4 +130,5 @@ enum {
#ifdef __KERNEL__
long btrfs_ioctl_send(struct file *mnt_file, void __user *arg);
+int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off);
#endif
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 83d6f9f9c22..915ac14c206 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -243,12 +243,18 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root, const char *function,
unsigned int line, int errno)
{
- WARN_ONCE(1, KERN_DEBUG "btrfs: Transaction aborted");
+ WARN_ONCE(1, KERN_DEBUG "btrfs: Transaction aborted\n");
trans->aborted = errno;
/* Nothing used. The other threads that have joined this
* transaction may be able to continue. */
if (!trans->blocks_used) {
- btrfs_printk(root->fs_info, "Aborting unused transaction.\n");
+ char nbuf[16];
+ const char *errstr;
+
+ errstr = btrfs_decode_error(root->fs_info, errno, nbuf);
+ btrfs_printk(root->fs_info,
+ "%s:%d: Aborting unused transaction(%s).\n",
+ function, line, errstr);
return;
}
trans->transaction->aborted = errno;
@@ -407,7 +413,15 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
btrfs_set_opt(info->mount_opt, NODATASUM);
break;
case Opt_nodatacow:
- printk(KERN_INFO "btrfs: setting nodatacow\n");
+ if (!btrfs_test_opt(root, COMPRESS) ||
+ !btrfs_test_opt(root, FORCE_COMPRESS)) {
+ printk(KERN_INFO "btrfs: setting nodatacow, compression disabled\n");
+ } else {
+ printk(KERN_INFO "btrfs: setting nodatacow\n");
+ }
+ info->compress_type = BTRFS_COMPRESS_NONE;
+ btrfs_clear_opt(info->mount_opt, COMPRESS);
+ btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
btrfs_set_opt(info->mount_opt, NODATACOW);
btrfs_set_opt(info->mount_opt, NODATASUM);
break;
@@ -422,10 +436,14 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
compress_type = "zlib";
info->compress_type = BTRFS_COMPRESS_ZLIB;
btrfs_set_opt(info->mount_opt, COMPRESS);
+ btrfs_clear_opt(info->mount_opt, NODATACOW);
+ btrfs_clear_opt(info->mount_opt, NODATASUM);
} else if (strcmp(args[0].from, "lzo") == 0) {
compress_type = "lzo";
info->compress_type = BTRFS_COMPRESS_LZO;
btrfs_set_opt(info->mount_opt, COMPRESS);
+ btrfs_clear_opt(info->mount_opt, NODATACOW);
+ btrfs_clear_opt(info->mount_opt, NODATASUM);
btrfs_set_fs_incompat(info, COMPRESS_LZO);
} else if (strncmp(args[0].from, "no", 2) == 0) {
compress_type = "no";
@@ -543,11 +561,11 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG);
break;
case Opt_defrag:
- printk(KERN_INFO "btrfs: enabling auto defrag");
+ printk(KERN_INFO "btrfs: enabling auto defrag\n");
btrfs_set_opt(info->mount_opt, AUTO_DEFRAG);
break;
case Opt_recovery:
- printk(KERN_INFO "btrfs: enabling auto recovery");
+ printk(KERN_INFO "btrfs: enabling auto recovery\n");
btrfs_set_opt(info->mount_opt, RECOVERY);
break;
case Opt_skip_balance:
@@ -846,18 +864,15 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
return 0;
}
- btrfs_wait_ordered_extents(root, 0, 0);
-
- spin_lock(&fs_info->trans_lock);
- if (!fs_info->running_transaction) {
- spin_unlock(&fs_info->trans_lock);
- return 0;
- }
- spin_unlock(&fs_info->trans_lock);
+ btrfs_wait_ordered_extents(root, 0);
- trans = btrfs_join_transaction(root);
- if (IS_ERR(trans))
+ trans = btrfs_attach_transaction(root);
+ if (IS_ERR(trans)) {
+ /* no transaction, don't bother */
+ if (PTR_ERR(trans) == -ENOENT)
+ return 0;
return PTR_ERR(trans);
+ }
return btrfs_commit_transaction(trans, root);
}
@@ -1508,17 +1523,21 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
static int btrfs_freeze(struct super_block *sb)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(sb);
- mutex_lock(&fs_info->transaction_kthread_mutex);
- mutex_lock(&fs_info->cleaner_mutex);
- return 0;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = btrfs_sb(sb)->tree_root;
+
+ trans = btrfs_attach_transaction(root); /* attach to the running transaction, if there is one */
+ if (IS_ERR(trans)) {
+ /* no transaction, don't bother */
+ if (PTR_ERR(trans) == -ENOENT)
+ return 0;
+ return PTR_ERR(trans); /* any other attach failure is reported to the caller */
+ }
+ return btrfs_commit_transaction(trans, root); /* commit the attached transaction and return its result */
}
static int btrfs_unfreeze(struct super_block *sb)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(sb);
- mutex_unlock(&fs_info->cleaner_mutex);
- mutex_unlock(&fs_info->transaction_kthread_mutex);
return 0;
}
@@ -1595,7 +1614,7 @@ static int btrfs_interface_init(void)
static void btrfs_interface_exit(void)
{
if (misc_deregister(&btrfs_misc) < 0)
- printk(KERN_INFO "misc_deregister failed for control device");
+ printk(KERN_INFO "btrfs: misc_deregister failed for control device\n");
}
static int __init init_btrfs_fs(void)
@@ -1620,10 +1639,14 @@ static int __init init_btrfs_fs(void)
if (err)
goto free_extent_io;
- err = btrfs_delayed_inode_init();
+ err = ordered_data_init();
if (err)
goto free_extent_map;
+ err = btrfs_delayed_inode_init();
+ if (err)
+ goto free_ordered_data;
+
err = btrfs_interface_init();
if (err)
goto free_delayed_inode;
@@ -1641,6 +1664,8 @@ unregister_ioctl:
btrfs_interface_exit();
free_delayed_inode:
btrfs_delayed_inode_exit();
+free_ordered_data:
+ ordered_data_exit();
free_extent_map:
extent_map_exit();
free_extent_io:
@@ -1657,6 +1682,7 @@ static void __exit exit_btrfs_fs(void)
{
btrfs_destroy_cachep();
btrfs_delayed_inode_exit();
+ ordered_data_exit();
extent_map_exit();
extent_io_exit();
btrfs_interface_exit();
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 27c26004e05..77db875b511 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -53,7 +53,7 @@ static noinline void switch_commit_root(struct btrfs_root *root)
/*
* either allocate a new transaction or hop into the existing one
*/
-static noinline int join_transaction(struct btrfs_root *root, int nofail)
+static noinline int join_transaction(struct btrfs_root *root, int type)
{
struct btrfs_transaction *cur_trans;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -67,7 +67,13 @@ loop:
}
if (fs_info->trans_no_join) {
- if (!nofail) {
+ /*
+ * If we are JOIN_NOLOCK we're already committing a current
+ * transaction, we just need a handle to deal with something
+ * when committing the transaction, such as inode cache and
+ * space cache. It is a special case.
+ */
+ if (type != TRANS_JOIN_NOLOCK) {
spin_unlock(&fs_info->trans_lock);
return -EBUSY;
}
@@ -87,6 +93,13 @@ loop:
}
spin_unlock(&fs_info->trans_lock);
+ /*
+ * If we are ATTACH, we just want to catch the current transaction,
+ * and commit it. If there is no transaction, just return ENOENT.
+ */
+ if (type == TRANS_ATTACH)
+ return -ENOENT;
+
cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS);
if (!cur_trans)
return -ENOMEM;
@@ -267,13 +280,6 @@ static void wait_current_trans(struct btrfs_root *root)
}
}
-enum btrfs_trans_type {
- TRANS_START,
- TRANS_JOIN,
- TRANS_USERSPACE,
- TRANS_JOIN_NOLOCK,
-};
-
static int may_wait_transaction(struct btrfs_root *root, int type)
{
if (root->fs_info->log_root_recovering)
@@ -290,7 +296,8 @@ static int may_wait_transaction(struct btrfs_root *root, int type)
}
static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
- u64 num_items, int type)
+ u64 num_items, int type,
+ int noflush)
{
struct btrfs_trans_handle *h;
struct btrfs_transaction *cur_trans;
@@ -324,9 +331,14 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
}
num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
- ret = btrfs_block_rsv_add(root,
- &root->fs_info->trans_block_rsv,
- num_bytes);
+ if (noflush)
+ ret = btrfs_block_rsv_add_noflush(root,
+ &root->fs_info->trans_block_rsv,
+ num_bytes);
+ else
+ ret = btrfs_block_rsv_add(root,
+ &root->fs_info->trans_block_rsv,
+ num_bytes);
if (ret)
return ERR_PTR(ret);
}
@@ -335,19 +347,34 @@ again:
if (!h)
return ERR_PTR(-ENOMEM);
- sb_start_intwrite(root->fs_info->sb);
+ /*
+ * If we are JOIN_NOLOCK we're already committing a transaction and
+ * waiting on this guy, so we don't need to do the sb_start_intwrite
+ * because we're already holding a ref. We need this because we could
+ * have raced in and done an fsync() on a file which can kick a commit
+ * and then we deadlock with somebody doing a freeze.
+ *
+ * If we are ATTACH, it means we just want to catch the current
+ * transaction and commit it, so we needn't do sb_start_intwrite().
+ */
+ if (type < TRANS_JOIN_NOLOCK)
+ sb_start_intwrite(root->fs_info->sb);
if (may_wait_transaction(root, type))
wait_current_trans(root);
do {
- ret = join_transaction(root, type == TRANS_JOIN_NOLOCK);
+ ret = join_transaction(root, type);
if (ret == -EBUSY)
wait_current_trans(root);
} while (ret == -EBUSY);
if (ret < 0) {
- sb_end_intwrite(root->fs_info->sb);
+ /* We must get the transaction if we are JOIN_NOLOCK. */
+ BUG_ON(type == TRANS_JOIN_NOLOCK);
+
+ if (type < TRANS_JOIN_NOLOCK)
+ sb_end_intwrite(root->fs_info->sb);
kmem_cache_free(btrfs_trans_handle_cachep, h);
return ERR_PTR(ret);
}
@@ -367,7 +394,9 @@ again:
h->aborted = 0;
h->qgroup_reserved = qgroup_reserved;
h->delayed_ref_elem.seq = 0;
+ h->type = type;
INIT_LIST_HEAD(&h->qgroup_ref_list);
+ INIT_LIST_HEAD(&h->new_bgs);
smp_mb();
if (cur_trans->blocked && may_wait_transaction(root, type)) {
@@ -393,21 +422,33 @@ got_it:
struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
int num_items)
{
- return start_transaction(root, num_items, TRANS_START);
+ return start_transaction(root, num_items, TRANS_START, 0);
+}
+
+struct btrfs_trans_handle *btrfs_start_transaction_noflush(
+ struct btrfs_root *root, int num_items)
+{
+ return start_transaction(root, num_items, TRANS_START, 1);
}
+
struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root)
{
- return start_transaction(root, 0, TRANS_JOIN);
+ return start_transaction(root, 0, TRANS_JOIN, 0);
}
struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root)
{
- return start_transaction(root, 0, TRANS_JOIN_NOLOCK);
+ return start_transaction(root, 0, TRANS_JOIN_NOLOCK, 0);
}
struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root)
{
- return start_transaction(root, 0, TRANS_USERSPACE);
+ return start_transaction(root, 0, TRANS_USERSPACE, 0);
+}
+
+struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root)
+{
+ return start_transaction(root, 0, TRANS_ATTACH, 0);
}
/* wait for a transaction commit to be fully complete */
@@ -506,11 +547,12 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
}
static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, int throttle, int lock)
+ struct btrfs_root *root, int throttle)
{
struct btrfs_transaction *cur_trans = trans->transaction;
struct btrfs_fs_info *info = root->fs_info;
int count = 0;
+ int lock = (trans->type != TRANS_JOIN_NOLOCK);
int err = 0;
if (--trans->use_count) {
@@ -536,6 +578,9 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
trans->qgroup_reserved = 0;
}
+ if (!list_empty(&trans->new_bgs))
+ btrfs_create_pending_block_groups(trans, root);
+
while (count < 2) {
unsigned long cur = trans->delayed_ref_updates;
trans->delayed_ref_updates = 0;
@@ -551,7 +596,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
btrfs_trans_release_metadata(trans, root);
trans->block_rsv = NULL;
- sb_end_intwrite(root->fs_info->sb);
+ if (!list_empty(&trans->new_bgs))
+ btrfs_create_pending_block_groups(trans, root);
if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
should_end_transaction(trans, root)) {
@@ -573,6 +619,9 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
}
}
+ if (trans->type < TRANS_JOIN_NOLOCK)
+ sb_end_intwrite(root->fs_info->sb);
+
WARN_ON(cur_trans != info->running_transaction);
WARN_ON(atomic_read(&cur_trans->num_writers) < 1);
atomic_dec(&cur_trans->num_writers);
@@ -604,7 +653,7 @@ int btrfs_end_transaction(struct btrfs_trans_handle *trans,
{
int ret;
- ret = __btrfs_end_transaction(trans, root, 0, 1);
+ ret = __btrfs_end_transaction(trans, root, 0);
if (ret)
return ret;
return 0;
@@ -615,18 +664,7 @@ int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
{
int ret;
- ret = __btrfs_end_transaction(trans, root, 1, 1);
- if (ret)
- return ret;
- return 0;
-}
-
-int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
-{
- int ret;
-
- ret = __btrfs_end_transaction(trans, root, 0, 0);
+ ret = __btrfs_end_transaction(trans, root, 1);
if (ret)
return ret;
return 0;
@@ -635,7 +673,7 @@ int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans,
int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
- return __btrfs_end_transaction(trans, root, 1, 1);
+ return __btrfs_end_transaction(trans, root, 1);
}
/*
@@ -649,13 +687,15 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
int err = 0;
int werr = 0;
struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
+ struct extent_state *cached_state = NULL;
u64 start = 0;
u64 end;
while (!find_first_extent_bit(dirty_pages, start, &start, &end,
- mark)) {
- convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, mark,
- GFP_NOFS);
+ mark, &cached_state)) {
+ convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT,
+ mark, &cached_state, GFP_NOFS);
+ cached_state = NULL;
err = filemap_fdatawrite_range(mapping, start, end);
if (err)
werr = err;
@@ -679,12 +719,14 @@ int btrfs_wait_marked_extents(struct btrfs_root *root,
int err = 0;
int werr = 0;
struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
+ struct extent_state *cached_state = NULL;
u64 start = 0;
u64 end;
while (!find_first_extent_bit(dirty_pages, start, &start, &end,
- EXTENT_NEED_WAIT)) {
- clear_extent_bits(dirty_pages, start, end, EXTENT_NEED_WAIT, GFP_NOFS);
+ EXTENT_NEED_WAIT, &cached_state)) {
+ clear_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT,
+ 0, 0, &cached_state, GFP_NOFS);
err = filemap_fdatawait_range(mapping, start, end);
if (err)
werr = err;
@@ -955,6 +997,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
struct btrfs_root *parent_root;
struct btrfs_block_rsv *rsv;
struct inode *parent_inode;
+ struct btrfs_path *path;
+ struct btrfs_dir_item *dir_item;
struct dentry *parent;
struct dentry *dentry;
struct extent_buffer *tmp;
@@ -967,18 +1011,22 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
u64 root_flags;
uuid_le new_uuid;
- rsv = trans->block_rsv;
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = pending->error = -ENOMEM;
+ goto path_alloc_fail;
+ }
new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
if (!new_root_item) {
ret = pending->error = -ENOMEM;
- goto fail;
+ goto root_item_alloc_fail;
}
ret = btrfs_find_free_objectid(tree_root, &objectid);
if (ret) {
pending->error = ret;
- goto fail;
+ goto no_free_objectid;
}
btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
@@ -988,22 +1036,22 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
to_reserve);
if (ret) {
pending->error = ret;
- goto fail;
+ goto no_free_objectid;
}
}
ret = btrfs_qgroup_inherit(trans, fs_info, root->root_key.objectid,
objectid, pending->inherit);
- kfree(pending->inherit);
if (ret) {
pending->error = ret;
- goto fail;
+ goto no_free_objectid;
}
key.objectid = objectid;
key.offset = (u64)-1;
key.type = BTRFS_ROOT_ITEM_KEY;
+ rsv = trans->block_rsv;
trans->block_rsv = &pending->block_rsv;
dentry = pending->dentry;
@@ -1017,24 +1065,21 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
*/
ret = btrfs_set_inode_index(parent_inode, &index);
BUG_ON(ret); /* -ENOMEM */
- ret = btrfs_insert_dir_item(trans, parent_root,
- dentry->d_name.name, dentry->d_name.len,
- parent_inode, &key,
- BTRFS_FT_DIR, index);
- if (ret == -EEXIST) {
+
+ /* check if there is a file/dir which has the same name. */
+ dir_item = btrfs_lookup_dir_item(NULL, parent_root, path,
+ btrfs_ino(parent_inode),
+ dentry->d_name.name,
+ dentry->d_name.len, 0);
+ if (dir_item != NULL && !IS_ERR(dir_item)) {
pending->error = -EEXIST;
- dput(parent);
goto fail;
- } else if (ret) {
- goto abort_trans_dput;
+ } else if (IS_ERR(dir_item)) {
+ ret = PTR_ERR(dir_item);
+ btrfs_abort_transaction(trans, root, ret);
+ goto fail;
}
-
- btrfs_i_size_write(parent_inode, parent_inode->i_size +
- dentry->d_name.len * 2);
- parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
- ret = btrfs_update_inode(trans, parent_root, parent_inode);
- if (ret)
- goto abort_trans_dput;
+ btrfs_release_path(path);
/*
* pull in the delayed directory update
@@ -1043,8 +1088,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
* snapshot
*/
ret = btrfs_run_delayed_items(trans, root);
- if (ret) { /* Transaction aborted */
- dput(parent);
+ if (ret) { /* Transaction aborted */
+ btrfs_abort_transaction(trans, root, ret);
goto fail;
}
@@ -1079,7 +1124,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
if (ret) {
btrfs_tree_unlock(old);
free_extent_buffer(old);
- goto abort_trans_dput;
+ btrfs_abort_transaction(trans, root, ret);
+ goto fail;
}
btrfs_set_lock_blocking(old);
@@ -1088,8 +1134,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
/* clean up in any case */
btrfs_tree_unlock(old);
free_extent_buffer(old);
- if (ret)
- goto abort_trans_dput;
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto fail;
+ }
/* see comments in should_cow_block() */
root->force_cow = 1;
@@ -1101,8 +1149,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
ret = btrfs_insert_root(trans, tree_root, &key, new_root_item);
btrfs_tree_unlock(tmp);
free_extent_buffer(tmp);
- if (ret)
- goto abort_trans_dput;
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto fail;
+ }
/*
* insert root back/forward references
@@ -1111,32 +1161,58 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
parent_root->root_key.objectid,
btrfs_ino(parent_inode), index,
dentry->d_name.name, dentry->d_name.len);
- dput(parent);
- if (ret)
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
goto fail;
+ }
key.offset = (u64)-1;
pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key);
if (IS_ERR(pending->snap)) {
ret = PTR_ERR(pending->snap);
- goto abort_trans;
+ btrfs_abort_transaction(trans, root, ret);
+ goto fail;
}
ret = btrfs_reloc_post_snapshot(trans, pending);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto fail;
+ }
+
+ ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto fail;
+ }
+
+ ret = btrfs_insert_dir_item(trans, parent_root,
+ dentry->d_name.name, dentry->d_name.len,
+ parent_inode, &key,
+ BTRFS_FT_DIR, index);
+ /* We have checked the name at the beginning, so it is impossible. */
+ BUG_ON(ret == -EEXIST);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto fail;
+ }
+
+ btrfs_i_size_write(parent_inode, parent_inode->i_size +
+ dentry->d_name.len * 2);
+ parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
+ ret = btrfs_update_inode(trans, parent_root, parent_inode);
if (ret)
- goto abort_trans;
- ret = 0;
+ btrfs_abort_transaction(trans, root, ret);
fail:
- kfree(new_root_item);
+ dput(parent);
trans->block_rsv = rsv;
+no_free_objectid:
+ kfree(new_root_item);
+root_item_alloc_fail:
+ btrfs_free_path(path);
+path_alloc_fail:
btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1);
return ret;
-
-abort_trans_dput:
- dput(parent);
-abort_trans:
- btrfs_abort_transaction(trans, root, ret);
- goto fail;
}
/*
@@ -1229,6 +1305,16 @@ static void do_async_commit(struct work_struct *work)
struct btrfs_async_commit *ac =
container_of(work, struct btrfs_async_commit, work.work);
+ /*
+ * We've got freeze protection passed with the transaction.
+ * Tell lockdep about it.
+ */
+ rwsem_acquire_read(
+ &ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
+ 0, 1, _THIS_IP_);
+
+ current->journal_info = ac->newtrans;
+
btrfs_commit_transaction(ac->newtrans, ac->root);
kfree(ac);
}
@@ -1258,6 +1344,14 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
atomic_inc(&cur_trans->use_count);
btrfs_end_transaction(trans, root);
+
+ /*
+ * Tell lockdep we've released the freeze rwsem, since the
+ * async commit thread will be the one to unlock it.
+ */
+ rwsem_release(&root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
+ 1, _THIS_IP_);
+
schedule_delayed_work(&ac->work, 0);
/* wait for transaction to start and unblock */
@@ -1348,6 +1442,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
*/
cur_trans->delayed_refs.flushing = 1;
+ if (!list_empty(&trans->new_bgs))
+ btrfs_create_pending_block_groups(trans, root);
+
ret = btrfs_run_delayed_refs(trans, root, 0);
if (ret)
goto cleanup_transaction;
@@ -1403,7 +1500,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
if (flush_on_commit || snap_pending) {
btrfs_start_delalloc_inodes(root, 1);
- btrfs_wait_ordered_extents(root, 0, 1);
+ btrfs_wait_ordered_extents(root, 1);
}
ret = btrfs_run_delayed_items(trans, root);
@@ -1456,13 +1553,28 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
*/
mutex_lock(&root->fs_info->reloc_mutex);
- ret = btrfs_run_delayed_items(trans, root);
+ /*
+ * We needn't worry about the delayed items because we will
+ * deal with them in create_pending_snapshot(), which is the
+ * core function of the snapshot creation.
+ */
+ ret = create_pending_snapshots(trans, root->fs_info);
if (ret) {
mutex_unlock(&root->fs_info->reloc_mutex);
goto cleanup_transaction;
}
- ret = create_pending_snapshots(trans, root->fs_info);
+ /*
+ * We insert the dir indexes of the snapshots and update the inode
+ * of the snapshots' parents after the snapshot creation, so there
+ * are some delayed items which are not dealt with. Now deal with
+ * them.
+ *
+ * We needn't worry that this operation will corrupt the snapshots,
+ * because all the trees which are snapshotted will be forced to COW
+ * the nodes and leaves.
+ */
+ ret = btrfs_run_delayed_items(trans, root);
if (ret) {
mutex_unlock(&root->fs_info->reloc_mutex);
goto cleanup_transaction;
@@ -1584,7 +1696,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
put_transaction(cur_trans);
put_transaction(cur_trans);
- sb_end_intwrite(root->fs_info->sb);
+ if (trans->type < TRANS_JOIN_NOLOCK)
+ sb_end_intwrite(root->fs_info->sb);
trace_btrfs_transaction_commit(root);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index e8b8416c688..80961947a6b 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -47,6 +47,14 @@ struct btrfs_transaction {
int aborted;
};
+enum btrfs_trans_type {
+ TRANS_START,
+ TRANS_JOIN,
+ TRANS_USERSPACE,
+ TRANS_JOIN_NOLOCK,
+ TRANS_ATTACH,
+};
+
struct btrfs_trans_handle {
u64 transid;
u64 bytes_reserved;
@@ -58,8 +66,9 @@ struct btrfs_trans_handle {
struct btrfs_transaction *transaction;
struct btrfs_block_rsv *block_rsv;
struct btrfs_block_rsv *orig_rsv;
- int aborted;
- int adding_csums;
+ short aborted;
+ short adding_csums;
+ enum btrfs_trans_type type;
/*
* this root is only needed to validate that the root passed to
* start_transaction is the same as the one passed to end_transaction.
@@ -68,6 +77,7 @@ struct btrfs_trans_handle {
struct btrfs_root *root;
struct seq_list delayed_ref_elem;
struct list_head qgroup_ref_list;
+ struct list_head new_bgs;
};
struct btrfs_pending_snapshot {
@@ -88,16 +98,18 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
{
BTRFS_I(inode)->last_trans = trans->transaction->transid;
BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
+ BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
}
int btrfs_end_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
-int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans,
- struct btrfs_root *root);
struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
int num_items);
+struct btrfs_trans_handle *btrfs_start_transaction_noflush(
+ struct btrfs_root *root, int num_items);
struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root);
struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root);
+struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root);
struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root);
int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid);
int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index c86670f4f28..e9ebb472b28 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -18,13 +18,16 @@
#include <linux/sched.h>
#include <linux/slab.h>
+#include <linux/list_sort.h>
#include "ctree.h"
#include "transaction.h"
#include "disk-io.h"
#include "locking.h"
#include "print-tree.h"
+#include "backref.h"
#include "compat.h"
#include "tree-log.h"
+#include "hash.h"
/* magic values for the inode_only field in btrfs_log_inode:
*
@@ -146,7 +149,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
root->log_multiple_pids = true;
}
- root->log_batch++;
+ atomic_inc(&root->log_batch);
atomic_inc(&root->log_writers);
mutex_unlock(&root->log_mutex);
return 0;
@@ -165,7 +168,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
err = ret;
}
mutex_unlock(&root->fs_info->tree_log_mutex);
- root->log_batch++;
+ atomic_inc(&root->log_batch);
atomic_inc(&root->log_writers);
mutex_unlock(&root->log_mutex);
return err;
@@ -484,7 +487,6 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
int found_type;
u64 mask = root->sectorsize - 1;
u64 extent_end;
- u64 alloc_hint;
u64 start = key->offset;
u64 saved_nbytes;
struct btrfs_file_extent_item *item;
@@ -550,8 +552,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
saved_nbytes = inode_get_bytes(inode);
/* drop any overlapping extents */
- ret = btrfs_drop_extents(trans, inode, start, extent_end,
- &alloc_hint, 1);
+ ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1);
BUG_ON(ret);
if (found_type == BTRFS_FILE_EXTENT_REG ||
@@ -744,6 +745,7 @@ out:
*/
static noinline int backref_in_log(struct btrfs_root *log,
struct btrfs_key *key,
+ u64 ref_objectid,
char *name, int namelen)
{
struct btrfs_path *path;
@@ -764,8 +766,17 @@ static noinline int backref_in_log(struct btrfs_root *log,
if (ret != 0)
goto out;
- item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
+
+ if (key->type == BTRFS_INODE_EXTREF_KEY) {
+ if (btrfs_find_name_in_ext_backref(path, ref_objectid,
+ name, namelen, NULL))
+ match = 1;
+
+ goto out;
+ }
+
+ item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
ptr_end = ptr + item_size;
while (ptr < ptr_end) {
ref = (struct btrfs_inode_ref *)ptr;
@@ -786,91 +797,42 @@ out:
return match;
}
-
-/*
- * replay one inode back reference item found in the log tree.
- * eb, slot and key refer to the buffer and key found in the log tree.
- * root is the destination we are replaying into, and path is for temp
- * use by this function. (it should be released on return).
- */
-static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
+static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
- struct btrfs_root *log,
struct btrfs_path *path,
- struct extent_buffer *eb, int slot,
- struct btrfs_key *key)
+ struct btrfs_root *log_root,
+ struct inode *dir, struct inode *inode,
+ struct extent_buffer *eb,
+ u64 inode_objectid, u64 parent_objectid,
+ u64 ref_index, char *name, int namelen,
+ int *search_done)
{
- struct btrfs_inode_ref *ref;
- struct btrfs_dir_item *di;
- struct inode *dir;
- struct inode *inode;
- unsigned long ref_ptr;
- unsigned long ref_end;
- char *name;
- int namelen;
int ret;
- int search_done = 0;
-
- /*
- * it is possible that we didn't log all the parent directories
- * for a given inode. If we don't find the dir, just don't
- * copy the back ref in. The link count fixup code will take
- * care of the rest
- */
- dir = read_one_inode(root, key->offset);
- if (!dir)
- return -ENOENT;
-
- inode = read_one_inode(root, key->objectid);
- if (!inode) {
- iput(dir);
- return -EIO;
- }
-
- ref_ptr = btrfs_item_ptr_offset(eb, slot);
- ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
+ char *victim_name;
+ int victim_name_len;
+ struct extent_buffer *leaf;
+ struct btrfs_dir_item *di;
+ struct btrfs_key search_key;
+ struct btrfs_inode_extref *extref;
again:
- ref = (struct btrfs_inode_ref *)ref_ptr;
-
- namelen = btrfs_inode_ref_name_len(eb, ref);
- name = kmalloc(namelen, GFP_NOFS);
- BUG_ON(!name);
-
- read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen);
-
- /* if we already have a perfect match, we're done */
- if (inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode),
- btrfs_inode_ref_index(eb, ref),
- name, namelen)) {
- goto out;
- }
-
- /*
- * look for a conflicting back reference in the metadata.
- * if we find one we have to unlink that name of the file
- * before we add our new link. Later on, we overwrite any
- * existing back reference, and we don't want to create
- * dangling pointers in the directory.
- */
-
- if (search_done)
- goto insert;
-
- ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
+ /* Search old style refs */
+ search_key.objectid = inode_objectid;
+ search_key.type = BTRFS_INODE_REF_KEY;
+ search_key.offset = parent_objectid;
+ ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
if (ret == 0) {
- char *victim_name;
- int victim_name_len;
struct btrfs_inode_ref *victim_ref;
unsigned long ptr;
unsigned long ptr_end;
- struct extent_buffer *leaf = path->nodes[0];
+
+ leaf = path->nodes[0];
/* are we trying to overwrite a back ref for the root directory
* if so, just jump out, we're done
*/
- if (key->objectid == key->offset)
- goto out_nowrite;
+ if (search_key.objectid == search_key.offset)
+ return 1;
/* check all the names in this back reference to see
* if they are in the log. if so, we allow them to stay
@@ -889,7 +851,9 @@ again:
(unsigned long)(victim_ref + 1),
victim_name_len);
- if (!backref_in_log(log, key, victim_name,
+ if (!backref_in_log(log_root, &search_key,
+ parent_objectid,
+ victim_name,
victim_name_len)) {
btrfs_inc_nlink(inode);
btrfs_release_path(path);
@@ -897,9 +861,14 @@ again:
ret = btrfs_unlink_inode(trans, root, dir,
inode, victim_name,
victim_name_len);
+ BUG_ON(ret);
btrfs_run_delayed_items(trans, root);
+ kfree(victim_name);
+ *search_done = 1;
+ goto again;
}
kfree(victim_name);
+
ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
}
BUG_ON(ret);
@@ -908,14 +877,78 @@ again:
* NOTE: we have searched root tree and checked the
* coresponding ref, it does not need to check again.
*/
- search_done = 1;
+ *search_done = 1;
+ }
+ btrfs_release_path(path);
+
+ /* Same search but for extended refs */
+ extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
+ inode_objectid, parent_objectid, 0,
+ 0);
+ if (!IS_ERR_OR_NULL(extref)) {
+ u32 item_size;
+ u32 cur_offset = 0;
+ unsigned long base;
+ struct inode *victim_parent;
+
+ leaf = path->nodes[0];
+
+ item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ base = btrfs_item_ptr_offset(leaf, path->slots[0]);
+
+ while (cur_offset < item_size) {
+ /* base and cur_offset are byte offsets: add first, then cast. */
+ extref = (struct btrfs_inode_extref *)(base + cur_offset);
+ victim_name_len = btrfs_inode_extref_name_len(leaf, extref);
+
+ if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
+ goto next;
+ victim_name = kmalloc(victim_name_len, GFP_NOFS);
+ if (!victim_name)
+ return -ENOMEM;
+ read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
+ victim_name_len);
+ search_key.objectid = inode_objectid;
+ search_key.type = BTRFS_INODE_EXTREF_KEY;
+ search_key.offset = btrfs_extref_hash(parent_objectid,
+ victim_name,
+ victim_name_len);
+ ret = 0;
+ if (!backref_in_log(log_root, &search_key,
+ parent_objectid, victim_name,
+ victim_name_len)) {
+ ret = -ENOENT;
+ victim_parent = read_one_inode(root,
+ parent_objectid);
+ if (victim_parent) {
+ btrfs_inc_nlink(inode);
+ btrfs_release_path(path);
+
+ ret = btrfs_unlink_inode(trans, root,
+ victim_parent,
+ inode,
+ victim_name,
+ victim_name_len);
+ btrfs_run_delayed_items(trans, root);
+ }
+ BUG_ON(ret);
+ iput(victim_parent);
+ kfree(victim_name);
+ *search_done = 1;
+ goto again;
+ }
+ kfree(victim_name);
+ BUG_ON(ret);
+next:
+ cur_offset += victim_name_len + sizeof(*extref);
+ }
+ *search_done = 1;
}
btrfs_release_path(path);
/* look for a conflicting sequence number */
di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
- btrfs_inode_ref_index(eb, ref),
- name, namelen, 0);
+ ref_index, name, namelen, 0);
if (di && !IS_ERR(di)) {
ret = drop_one_dir_item(trans, root, path, dir, di);
BUG_ON(ret);
@@ -931,25 +964,173 @@ again:
}
btrfs_release_path(path);
-insert:
- /* insert our name */
- ret = btrfs_add_link(trans, dir, inode, name, namelen, 0,
- btrfs_inode_ref_index(eb, ref));
- BUG_ON(ret);
+ return 0;
+}
- btrfs_update_inode(trans, root, inode);
+static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
+ u32 *namelen, char **name, u64 *index,
+ u64 *parent_objectid)
+{
+ struct btrfs_inode_extref *extref;
-out:
- ref_ptr = (unsigned long)(ref + 1) + namelen;
- kfree(name);
- if (ref_ptr < ref_end)
- goto again;
+ extref = (struct btrfs_inode_extref *)ref_ptr;
+
+ *namelen = btrfs_inode_extref_name_len(eb, extref);
+ *name = kmalloc(*namelen, GFP_NOFS);
+ if (*name == NULL)
+ return -ENOMEM;
+
+ read_extent_buffer(eb, *name, (unsigned long)&extref->name,
+ *namelen);
+
+ *index = btrfs_inode_extref_index(eb, extref);
+ if (parent_objectid)
+ *parent_objectid = btrfs_inode_extref_parent(eb, extref);
+
+ return 0;
+}
+
+static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
+ u32 *namelen, char **name, u64 *index)
+{
+ struct btrfs_inode_ref *ref;
+
+ ref = (struct btrfs_inode_ref *)ref_ptr;
+
+ *namelen = btrfs_inode_ref_name_len(eb, ref);
+ *name = kmalloc(*namelen, GFP_NOFS);
+ if (*name == NULL)
+ return -ENOMEM;
+
+ read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);
+
+ *index = btrfs_inode_ref_index(eb, ref);
+
+ return 0;
+}
+
+/*
+ * replay one inode back reference item found in the log tree.
+ * eb, slot and key refer to the buffer and key found in the log tree.
+ * root is the destination we are replaying into, and path is for temp
+ * use by this function. (it should be released on return).
+ */
+static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_root *log,
+ struct btrfs_path *path,
+ struct extent_buffer *eb, int slot,
+ struct btrfs_key *key)
+{
+ struct inode *dir;
+ struct inode *inode;
+ unsigned long ref_ptr;
+ unsigned long ref_end;
+ char *name;
+ int namelen;
+ int ret;
+ int search_done = 0;
+ int log_ref_ver = 0;
+ u64 parent_objectid;
+ u64 inode_objectid;
+ u64 ref_index = 0;
+ int ref_struct_size;
+
+ ref_ptr = btrfs_item_ptr_offset(eb, slot);
+ ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
+
+ if (key->type == BTRFS_INODE_EXTREF_KEY) {
+ struct btrfs_inode_extref *r;
+
+ ref_struct_size = sizeof(struct btrfs_inode_extref);
+ log_ref_ver = 1;
+ r = (struct btrfs_inode_extref *)ref_ptr;
+ parent_objectid = btrfs_inode_extref_parent(eb, r);
+ } else {
+ ref_struct_size = sizeof(struct btrfs_inode_ref);
+ parent_objectid = key->offset;
+ }
+ inode_objectid = key->objectid;
+
+ /*
+ * it is possible that we didn't log all the parent directories
+ * for a given inode. If we don't find the dir, just don't
+ * copy the back ref in. The link count fixup code will take
+ * care of the rest
+ */
+ dir = read_one_inode(root, parent_objectid);
+ if (!dir)
+ return -ENOENT;
+
+ inode = read_one_inode(root, inode_objectid);
+ if (!inode) {
+ iput(dir);
+ return -EIO;
+ }
+
+ while (ref_ptr < ref_end) {
+ if (log_ref_ver) {
+ ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
+ &ref_index, &parent_objectid);
+ /*
+ * parent object can change from one array
+ * item to another.
+ */
+ if (!dir)
+ dir = read_one_inode(root, parent_objectid);
+ if (!dir)
+ return -ENOENT;
+ } else {
+ ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
+ &ref_index);
+ }
+ if (ret)
+ return ret;
+
+ /* if we already have a perfect match, we're done */
+ if (!inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode),
+ ref_index, name, namelen)) {
+ /*
+ * look for a conflicting back reference in the
+ * metadata. if we find one we have to unlink that name
+ * of the file before we add our new link. Later on, we
+ * overwrite any existing back reference, and we don't
+ * want to create dangling pointers in the directory.
+ */
+
+ if (!search_done) {
+ ret = __add_inode_ref(trans, root, path, log,
+ dir, inode, eb,
+ inode_objectid,
+ parent_objectid,
+ ref_index, name, namelen,
+ &search_done);
+ if (ret == 1)
+ goto out;
+ BUG_ON(ret);
+ }
+
+ /* insert our name */
+ ret = btrfs_add_link(trans, dir, inode, name, namelen,
+ 0, ref_index);
+ BUG_ON(ret);
+
+ btrfs_update_inode(trans, root, inode);
+ }
+
+ ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
+ kfree(name);
+ if (log_ref_ver) {
+ iput(dir);
+ dir = NULL;
+ }
+ }
/* finally write the back reference in the inode */
ret = overwrite_item(trans, root, path, eb, slot, key);
BUG_ON(ret);
-out_nowrite:
+out:
btrfs_release_path(path);
iput(dir);
iput(inode);
@@ -966,25 +1147,55 @@ static int insert_orphan_item(struct btrfs_trans_handle *trans,
return ret;
}
+/*
+ * Count the names stored in the extended inode ref items of @inode.
+ *
+ * An extref item holds a sequence of btrfs_inode_extref records, each
+ * immediately followed by its name, so every item is walked record by
+ * record and each record counts as one link.
+ *
+ * Returns the number of names found (possibly 0) or a negative errno if
+ * the lookup failed.  @path is released before returning.
+ */
+static int count_inode_extrefs(struct btrfs_root *root,
+		struct inode *inode, struct btrfs_path *path)
+{
+	int ret = 0;
+	int name_len;
+	unsigned int nlink = 0;
+	u32 item_size;
+	u32 cur_offset = 0;
+	u64 inode_objectid = btrfs_ino(inode);
+	u64 offset = 0;
+	unsigned long ptr;
+	struct btrfs_inode_extref *extref;
+	struct extent_buffer *leaf;
-/*
- * There are a few corners where the link count of the file can't
- * be properly maintained during replay. So, instead of adding
- * lots of complexity to the log code, we just scan the backrefs
- * for any file that has been through replay.
- *
- * The scan will update the link count on the inode to reflect the
- * number of back refs found. If it goes down to zero, the iput
- * will free the inode.
- */
-static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct inode *inode)
+	while (1) {
+		ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
+					    &extref, &offset);
+		if (ret)
+			break;
+
+		leaf = path->nodes[0];
+		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+		/*
+		 * Every item is walked from its own beginning.  Without this
+		 * reset the second and later items would be scanned starting
+		 * at the offset where the previous item ended.
+		 */
+		cur_offset = 0;
+
+		while (cur_offset < item_size) {
+			extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
+			name_len = btrfs_inode_extref_name_len(leaf, extref);
+
+			nlink++;
+
+			cur_offset += name_len + sizeof(*extref);
+		}
+
+		/* continue the search just past the item that was counted */
+		offset++;
+		btrfs_release_path(path);
+	}
+	btrfs_release_path(path);
+
+	/*
+	 * The loop above only ends on a non-zero return.  -ENOENT is the
+	 * value the caller treats as "no extref items", so it is taken here
+	 * as the normal end of the scan and must not discard the names that
+	 * were already counted.
+	 * NOTE(review): confirm against btrfs_find_one_extref().
+	 */
+	if (ret < 0 && ret != -ENOENT)
+		return ret;
+	return nlink;
+}
+
+static int count_inode_refs(struct btrfs_root *root,
+ struct inode *inode, struct btrfs_path *path)
{
- struct btrfs_path *path;
int ret;
struct btrfs_key key;
- u64 nlink = 0;
+ unsigned int nlink = 0;
unsigned long ptr;
unsigned long ptr_end;
int name_len;
@@ -994,10 +1205,6 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
key.type = BTRFS_INODE_REF_KEY;
key.offset = (u64)-1;
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
while (1) {
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
@@ -1031,6 +1238,50 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
}
btrfs_release_path(path);
+
+ return nlink;
+}
+
+/*
+ * There are a few corners where the link count of the file can't
+ * be properly maintained during replay. So, instead of adding
+ * lots of complexity to the log code, we just scan the backrefs
+ * for any file that has been through replay.
+ *
+ * The scan will update the link count on the inode to reflect the
+ * number of back refs found. If it goes down to zero, the iput
+ * will free the inode.
+ */
+static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *inode)
+{
+ struct btrfs_path *path;
+ int ret;
+ u64 nlink = 0;
+ u64 ino = btrfs_ino(inode);
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ret = count_inode_refs(root, inode, path);
+ if (ret < 0)
+ goto out;
+
+ nlink = ret;
+
+ ret = count_inode_extrefs(root, inode, path);
+ if (ret == -ENOENT)
+ ret = 0;
+
+ if (ret < 0)
+ goto out;
+
+ nlink += ret;
+
+ ret = 0;
+
if (nlink != inode->i_nlink) {
set_nlink(inode, nlink);
btrfs_update_inode(trans, root, inode);
@@ -1046,9 +1297,10 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
ret = insert_orphan_item(trans, root, ino);
BUG_ON(ret);
}
- btrfs_free_path(path);
- return 0;
+out:
+ btrfs_free_path(path);
+ return ret;
}
static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
@@ -1695,6 +1947,10 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
ret = add_inode_ref(wc->trans, root, log, path,
eb, i, &key);
BUG_ON(ret && ret != -ENOENT);
+ } else if (key.type == BTRFS_INODE_EXTREF_KEY) {
+ ret = add_inode_ref(wc->trans, root, log, path,
+ eb, i, &key);
+ BUG_ON(ret && ret != -ENOENT);
} else if (key.type == BTRFS_EXTENT_DATA_KEY) {
ret = replay_one_extent(wc->trans, root, path,
eb, i, &key);
@@ -2037,7 +2293,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
wait_log_commit(trans, root, root->log_transid - 1);
while (1) {
- unsigned long batch = root->log_batch;
+ int batch = atomic_read(&root->log_batch);
/* when we're on an ssd, just kick the log commit out */
if (!btrfs_test_opt(root, SSD) && root->log_multiple_pids) {
mutex_unlock(&root->log_mutex);
@@ -2045,7 +2301,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
mutex_lock(&root->log_mutex);
}
wait_for_writer(trans, root);
- if (batch == root->log_batch)
+ if (batch == atomic_read(&root->log_batch))
break;
}
@@ -2074,7 +2330,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
btrfs_set_root_node(&log->root_item, log->node);
- root->log_batch = 0;
root->log_transid++;
log->log_transid = root->log_transid;
root->log_start_pid = 0;
@@ -2087,7 +2342,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
mutex_unlock(&root->log_mutex);
mutex_lock(&log_root_tree->log_mutex);
- log_root_tree->log_batch++;
+ atomic_inc(&log_root_tree->log_batch);
atomic_inc(&log_root_tree->log_writers);
mutex_unlock(&log_root_tree->log_mutex);
@@ -2157,7 +2412,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
btrfs_set_super_log_root_level(root->fs_info->super_for_commit,
btrfs_header_level(log_root_tree->node));
- log_root_tree->log_batch = 0;
log_root_tree->log_transid++;
smp_mb();
@@ -2171,9 +2425,12 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
* in and cause problems either.
*/
btrfs_scrub_pause_super(root);
- write_ctree_super(trans, root->fs_info->tree_root, 1);
+ ret = write_ctree_super(trans, root->fs_info->tree_root, 1);
btrfs_scrub_continue_super(root);
- ret = 0;
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out_wake_log_root;
+ }
mutex_lock(&root->log_mutex);
if (root->last_log_commit < log_transid)
@@ -2209,7 +2466,8 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
while (1) {
ret = find_first_extent_bit(&log->dirty_log_pages,
- 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW);
+ 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW,
+ NULL);
if (ret)
break;
@@ -2646,6 +2904,7 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
int ret;
struct btrfs_key key;
struct btrfs_key found_key;
+ int start_slot;
key.objectid = objectid;
key.type = max_key_type;
@@ -2667,8 +2926,18 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
if (found_key.objectid != objectid)
break;
- ret = btrfs_del_item(trans, log, path);
- if (ret)
+ found_key.offset = 0;
+ found_key.type = 0;
+ ret = btrfs_bin_search(path->nodes[0], &found_key, 0,
+ &start_slot);
+
+ ret = btrfs_del_items(trans, log, path, start_slot,
+ path->slots[0] - start_slot + 1);
+ /*
+ * If start slot isn't 0 then we don't need to re-search, we've
+ * found the last guy with the objectid in this tree.
+ */
+ if (ret || start_slot != 0)
break;
btrfs_release_path(path);
}
@@ -2678,14 +2947,64 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
return ret;
}
+/*
+ * Populate the inode item @item inside @leaf from the in-memory @inode.
+ *
+ * With @log_inode_only set, generation and size are written as zero so
+ * that the recovery code can tell a record that only says "this inode
+ * exists" from one that says "update this inode with these values".
+ * The transid field is always taken from @trans.
+ */
+static void fill_inode_item(struct btrfs_trans_handle *trans,
+			    struct extent_buffer *leaf,
+			    struct btrfs_inode_item *item,
+			    struct inode *inode, int log_inode_only)
+{
+	u64 logged_generation = 0;
+	u64 logged_size = 0;
+
+	/* a full log record carries the real generation and size */
+	if (!log_inode_only) {
+		logged_generation = BTRFS_I(inode)->generation;
+		logged_size = inode->i_size;
+	}
+
+	/* ownership, mode and link count */
+	btrfs_set_inode_uid(leaf, item, inode->i_uid);
+	btrfs_set_inode_gid(leaf, item, inode->i_gid);
+	btrfs_set_inode_mode(leaf, item, inode->i_mode);
+	btrfs_set_inode_nlink(leaf, item, inode->i_nlink);
+
+	/* access, modification and change times */
+	btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item),
+			       inode->i_atime.tv_sec);
+	btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item),
+				inode->i_atime.tv_nsec);
+	btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item),
+			       inode->i_mtime.tv_sec);
+	btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item),
+				inode->i_mtime.tv_nsec);
+	btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item),
+			       inode->i_ctime.tv_sec);
+	btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item),
+				inode->i_ctime.tv_nsec);
+
+	/* remaining fields that are copied regardless of the log mode */
+	btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode));
+	btrfs_set_inode_sequence(leaf, item, inode->i_version);
+	btrfs_set_inode_transid(leaf, item, trans->transid);
+	btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
+	btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
+	btrfs_set_inode_block_group(leaf, item, 0);
+
+	/* zero for an "inode exists" record, real values otherwise */
+	btrfs_set_inode_generation(leaf, item, logged_generation);
+	btrfs_set_inode_size(leaf, item, logged_size);
+}
+
+
static noinline int copy_items(struct btrfs_trans_handle *trans,
- struct btrfs_root *log,
+ struct inode *inode,
struct btrfs_path *dst_path,
struct extent_buffer *src,
int start_slot, int nr, int inode_only)
{
unsigned long src_offset;
unsigned long dst_offset;
+ struct btrfs_root *log = BTRFS_I(inode)->root->log_root;
struct btrfs_file_extent_item *extent;
struct btrfs_inode_item *inode_item;
int ret;
@@ -2694,6 +3013,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
char *ins_data;
int i;
struct list_head ordered_sums;
+ int skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
INIT_LIST_HEAD(&ordered_sums);
@@ -2722,29 +3042,23 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
src_offset = btrfs_item_ptr_offset(src, start_slot + i);
- copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
- src_offset, ins_sizes[i]);
-
- if (inode_only == LOG_INODE_EXISTS &&
- ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
+ if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
inode_item = btrfs_item_ptr(dst_path->nodes[0],
dst_path->slots[0],
struct btrfs_inode_item);
- btrfs_set_inode_size(dst_path->nodes[0], inode_item, 0);
-
- /* set the generation to zero so the recover code
- * can tell the difference between an logging
- * just to say 'this inode exists' and a logging
- * to say 'update this inode with these values'
- */
- btrfs_set_inode_generation(dst_path->nodes[0],
- inode_item, 0);
+ fill_inode_item(trans, dst_path->nodes[0], inode_item,
+ inode, inode_only == LOG_INODE_EXISTS);
+ } else {
+ copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
+ src_offset, ins_sizes[i]);
}
+
/* take a reference on file data extents so that truncates
* or deletes of this inode don't have to relog the inode
* again
*/
- if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY) {
+ if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY &&
+ !skip_csum) {
int found_type;
extent = btrfs_item_ptr(src, start_slot + i,
struct btrfs_file_extent_item);
@@ -2753,8 +3067,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
continue;
found_type = btrfs_file_extent_type(src, extent);
- if (found_type == BTRFS_FILE_EXTENT_REG ||
- found_type == BTRFS_FILE_EXTENT_PREALLOC) {
+ if (found_type == BTRFS_FILE_EXTENT_REG) {
u64 ds, dl, cs, cl;
ds = btrfs_file_extent_disk_bytenr(src,
extent);
@@ -2803,6 +3116,239 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
return ret;
}
+/*
+ * list_sort() comparison callback: orders extent maps by ascending
+ * start.  @priv is not used.
+ */
+static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+	const struct extent_map *lhs = list_entry(a, struct extent_map, list);
+	const struct extent_map *rhs = list_entry(b, struct extent_map, list);
+
+	if (lhs->start == rhs->start)
+		return 0;
+
+	return lhs->start < rhs->start ? -1 : 1;
+}
+/* A run of adjacent leaf slots queued for one copy_items() call. */
+struct log_args {
+ struct extent_buffer *src; /* leaf the queued slots were found in */
+ u64 next_offset; /* file offset just past the last extent item queued */
+ int start_slot; /* first slot of the run */
+ int nr; /* number of slots in the run, 0 when nothing is pending */
+};
+/*
+ * Queue, and partly copy, the file extent items of @root that back the
+ * modified range [em->mod_start, em->mod_start + em->mod_len) of @em.
+ *
+ * Items are not copied one by one: adjacent leaf slots are collected in
+ * @args and handed to copy_items() when the run is broken or the end of
+ * the leaf is reached.  A run may still be pending in @args when this
+ * returns 0; the caller is expected to flush it.
+ *
+ * Returns 0 on success, -ENOENT when no file extent item covering the
+ * range is found, or the error returned by __btrfs_drop_extents(),
+ * btrfs_search_slot() or copy_items().
+ */
+static int log_one_extent(struct btrfs_trans_handle *trans,
+ struct inode *inode, struct btrfs_root *root,
+ struct extent_map *em, struct btrfs_path *path,
+ struct btrfs_path *dst_path, struct log_args *args)
+{
+ struct btrfs_root *log = root->log_root;
+ struct btrfs_file_extent_item *fi;
+ struct btrfs_key key;
+ u64 start = em->mod_start;
+ u64 search_start = start;
+ u64 len = em->mod_len;
+ u64 num_bytes;
+ int nritems;
+ int ret;
+ /*
+ * logged_trans matches the running transaction: drop the range
+ * [start, start + len) from the log tree before logging it again.
+ */
+ if (BTRFS_I(inode)->logged_trans == trans->transid) {
+ ret = __btrfs_drop_extents(trans, log, inode, dst_path, start,
+ start + len, NULL, 0);
+ if (ret)
+ return ret;
+ }
+ /* walk file extent items until the whole range has been accounted for */
+ while (len) {
+ if (args->nr) /* a run is pending: skip the search, reuse path as is */
+ goto next_slot;
+again:
+ key.objectid = btrfs_ino(inode);
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = search_start;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ return ret;
+
+ if (ret) {
+ /*
+ * A rare case where we can have an em for a section of a
+ * larger extent so we need to make sure that this em
+ * falls within the extent we've found. If not we just
+ * bail and go back to ye-olde way of doing things but
+ * it happens often enough in testing that we need to do
+ * this dance to make sure.
+ */
+ do { /* step back until an item with key.offset <= start is found */
+ if (path->slots[0] == 0) { /* nothing before slot 0 here: search again one byte lower */
+ btrfs_release_path(path);
+ if (search_start == 0)
+ return -ENOENT;
+ search_start--;
+ goto again;
+ }
+
+ path->slots[0]--;
+ btrfs_item_key_to_cpu(path->nodes[0], &key,
+ path->slots[0]);
+ if (key.objectid != btrfs_ino(inode) ||
+ key.type != BTRFS_EXTENT_DATA_KEY) {
+ btrfs_release_path(path);
+ return -ENOENT;
+ }
+ } while (key.offset > start);
+ /* the item found must reach past start, otherwise nothing covers it */
+ fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_file_extent_item);
+ num_bytes = btrfs_file_extent_num_bytes(path->nodes[0],
+ fi);
+ if (key.offset + num_bytes <= start) {
+ btrfs_release_path(path);
+ return -ENOENT;
+ }
+ }
+ args->src = path->nodes[0];
+next_slot:
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ fi = btrfs_item_ptr(args->src, path->slots[0],
+ struct btrfs_file_extent_item);
+ if (args->nr &&
+ args->start_slot + args->nr == path->slots[0]) {
+ args->nr++; /* slot directly follows the pending run: extend it */
+ } else if (args->nr) { /* not adjacent: copy the pending run, start a new one here */
+ ret = copy_items(trans, inode, dst_path, args->src,
+ args->start_slot, args->nr,
+ LOG_INODE_ALL);
+ if (ret)
+ return ret;
+ args->nr = 1;
+ args->start_slot = path->slots[0];
+ } else if (!args->nr) { /* nothing pending: this slot starts the run */
+ args->nr = 1;
+ args->start_slot = path->slots[0];
+ }
+ nritems = btrfs_header_nritems(path->nodes[0]);
+ path->slots[0]++;
+ num_bytes = btrfs_file_extent_num_bytes(args->src, fi);
+ if (len < num_bytes) {
+ /* I _think_ this is ok, envision we write to a
+ * preallocated space that is adjacent to a previously
+ * written preallocated space that gets merged when we
+ * mark this preallocated space written. If we do not
+ * have the adjacent extent in cache then when we copy
+ * this extent it could end up being larger than our EM
+ * thinks it is, which is a-ok, so just set len to 0.
+ */
+ len = 0;
+ } else {
+ len -= num_bytes;
+ }
+ start = key.offset + num_bytes;
+ args->next_offset = start;
+ search_start = start;
+ /* still inside the leaf: go on if bytes remain, else stop with the run pending */
+ if (path->slots[0] < nritems) {
+ if (len)
+ goto next_slot;
+ break;
+ }
+ /* end of the leaf: copy the run and release path so the next pass searches again */
+ if (args->nr) {
+ ret = copy_items(trans, inode, dst_path, args->src,
+ args->start_slot, args->nr,
+ LOG_INODE_ALL);
+ if (ret)
+ return ret;
+ args->nr = 0;
+ btrfs_release_path(path);
+ }
+ }
+
+ return 0;
+}
+/*
+ * Log the file extent items behind every extent map on the inode's
+ * modified_extents list whose generation is newer than the last
+ * committed transaction.
+ *
+ * All entries are taken off modified_extents.  The qualifying ones are
+ * moved to a private list (with an extra reference and
+ * EXTENT_FLAG_LOGGING set), sorted with extent_cmp() and passed one at a
+ * time to log_one_extent().  tree->lock is dropped while an extent map is
+ * being logged and retaken before the private list is touched again.
+ *
+ * Returns 0 on success or the first error from copy_items() or
+ * log_one_extent(); after an error the remaining maps are only released.
+ */
+static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *inode,
+ struct btrfs_path *path,
+ struct btrfs_path *dst_path)
+{
+ struct log_args args;
+ struct extent_map *em, *n;
+ struct list_head extents;
+ struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
+ u64 test_gen;
+ int ret = 0;
+
+ INIT_LIST_HEAD(&extents);
+
+ memset(&args, 0, sizeof(args));
+
+ write_lock(&tree->lock);
+ test_gen = root->fs_info->last_trans_committed;
+ /* empty modified_extents, keeping only the maps newer than test_gen */
+ list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
+ list_del_init(&em->list);
+ if (em->generation <= test_gen)
+ continue;
+ /* Need a ref to keep it from getting evicted from cache */
+ atomic_inc(&em->refs);
+ set_bit(EXTENT_FLAG_LOGGING, &em->flags);
+ list_add_tail(&em->list, &extents);
+ }
+
+ list_sort(NULL, &extents, extent_cmp);
+ /* tree->lock is held at the top of every iteration */
+ while (!list_empty(&extents)) {
+ em = list_entry(extents.next, struct extent_map, list);
+
+ list_del_init(&em->list);
+ clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
+
+ /*
+ * If we had an error we just need to delete everybody from our
+ * private list.
+ */
+ if (ret) {
+ free_extent_map(em);
+ continue;
+ }
+
+ write_unlock(&tree->lock);
+
+ /*
+ * If the previous EM and the last extent we left off on aren't
+ * sequential then we need to copy the items we have and redo
+ * our search
+ */
+ if (args.nr && em->mod_start != args.next_offset) {
+ ret = copy_items(trans, inode, dst_path, args.src,
+ args.start_slot, args.nr,
+ LOG_INODE_ALL);
+ if (ret) { /* retake the lock before looping: the loop head expects it held */
+ free_extent_map(em);
+ write_lock(&tree->lock);
+ continue;
+ }
+ btrfs_release_path(path);
+ args.nr = 0;
+ }
+
+ ret = log_one_extent(trans, inode, root, em, path, dst_path, &args);
+ free_extent_map(em); /* drops the reference taken when the map was queued */
+ write_lock(&tree->lock);
+ }
+ WARN_ON(!list_empty(&extents));
+ write_unlock(&tree->lock);
+ /* copy the run of slots left pending by the last log_one_extent() call */
+ if (!ret && args.nr)
+ ret = copy_items(trans, inode, dst_path, args.src,
+ args.start_slot, args.nr, LOG_INODE_ALL);
+ btrfs_release_path(path);
+ return ret;
+}
+
/* log a single inode in the tree log.
* At least one parent directory for this inode must exist in the tree
* or be logged already.
@@ -2832,6 +3378,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
int nritems;
int ins_start_slot = 0;
int ins_nr;
+ bool fast_search = false;
u64 ino = btrfs_ino(inode);
log = root->log_root;
@@ -2851,21 +3398,23 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
max_key.objectid = ino;
- /* today the code can only do partial logging of directories */
- if (!S_ISDIR(inode->i_mode))
- inode_only = LOG_INODE_ALL;
+ /* today the code can only do partial logging of directories */
if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode))
max_key.type = BTRFS_XATTR_ITEM_KEY;
else
max_key.type = (u8)-1;
max_key.offset = (u64)-1;
- ret = btrfs_commit_inode_delayed_items(trans, inode);
- if (ret) {
- btrfs_free_path(path);
- btrfs_free_path(dst_path);
- return ret;
+ /* Only run delayed items if we are a dir or a new file */
+ if (S_ISDIR(inode->i_mode) ||
+ BTRFS_I(inode)->generation > root->fs_info->last_trans_committed) {
+ ret = btrfs_commit_inode_delayed_items(trans, inode);
+ if (ret) {
+ btrfs_free_path(path);
+ btrfs_free_path(dst_path);
+ return ret;
+ }
}
mutex_lock(&BTRFS_I(inode)->log_mutex);
@@ -2881,7 +3430,16 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
max_key_type = BTRFS_XATTR_ITEM_KEY;
ret = drop_objectid_items(trans, log, path, ino, max_key_type);
} else {
- ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0);
+ if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags)) {
+ ret = btrfs_truncate_inode_items(trans, log,
+ inode, 0, 0);
+ } else {
+ fast_search = true;
+ max_key.type = BTRFS_XATTR_ITEM_KEY;
+ ret = drop_objectid_items(trans, log, path, ino,
+ BTRFS_XATTR_ITEM_KEY);
+ }
}
if (ret) {
err = ret;
@@ -2912,7 +3470,7 @@ again:
goto next_slot;
}
- ret = copy_items(trans, log, dst_path, src, ins_start_slot,
+ ret = copy_items(trans, inode, dst_path, src, ins_start_slot,
ins_nr, inode_only);
if (ret) {
err = ret;
@@ -2930,7 +3488,7 @@ next_slot:
goto again;
}
if (ins_nr) {
- ret = copy_items(trans, log, dst_path, src,
+ ret = copy_items(trans, inode, dst_path, src,
ins_start_slot,
ins_nr, inode_only);
if (ret) {
@@ -2951,8 +3509,7 @@ next_slot:
break;
}
if (ins_nr) {
- ret = copy_items(trans, log, dst_path, src,
- ins_start_slot,
+ ret = copy_items(trans, inode, dst_path, src, ins_start_slot,
ins_nr, inode_only);
if (ret) {
err = ret;
@@ -2960,7 +3517,24 @@ next_slot:
}
ins_nr = 0;
}
- WARN_ON(ins_nr);
+
+ if (fast_search) {
+ btrfs_release_path(path);
+ btrfs_release_path(dst_path);
+ ret = btrfs_log_changed_extents(trans, root, inode, path,
+ dst_path);
+ if (ret) {
+ err = ret;
+ goto out_unlock;
+ }
+ } else {
+ struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
+ struct extent_map *em, *n;
+
+ list_for_each_entry_safe(em, n, &tree->modified_extents, list)
+ list_del_init(&em->list);
+ }
+
if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
btrfs_release_path(path);
btrfs_release_path(dst_path);
@@ -2971,6 +3545,7 @@ next_slot:
}
}
BTRFS_I(inode)->logged_trans = trans->transid;
+ BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
out_unlock:
mutex_unlock(&BTRFS_I(inode)->log_mutex);
@@ -3138,7 +3713,7 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
end_trans:
dput(old_parent);
if (ret < 0) {
- BUG_ON(ret != -ENOSPC);
+ WARN_ON(ret != -ENOSPC);
root->fs_info->last_trans_log_full_commit = trans->transid;
ret = 1;
}
diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c
index ab942f46b3d..99be4c138db 100644
--- a/fs/btrfs/ulist.c
+++ b/fs/btrfs/ulist.c
@@ -143,14 +143,13 @@ EXPORT_SYMBOL(ulist_free);
* In case of allocation failure -ENOMEM is returned and the ulist stays
* unaltered.
*/
-int ulist_add(struct ulist *ulist, u64 val, unsigned long aux,
- gfp_t gfp_mask)
+int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask)
{
return ulist_add_merge(ulist, val, aux, NULL, gfp_mask);
}
-int ulist_add_merge(struct ulist *ulist, u64 val, unsigned long aux,
- unsigned long *old_aux, gfp_t gfp_mask)
+int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
+ u64 *old_aux, gfp_t gfp_mask)
{
int i;
diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h
index 21bdc8ec813..21a1963439c 100644
--- a/fs/btrfs/ulist.h
+++ b/fs/btrfs/ulist.h
@@ -33,7 +33,7 @@ struct ulist_iterator {
*/
struct ulist_node {
u64 val; /* value to store */
- unsigned long aux; /* auxiliary value saved along with the val */
+ u64 aux; /* auxiliary value saved along with the val */
};
struct ulist {
@@ -65,10 +65,9 @@ void ulist_fini(struct ulist *ulist);
void ulist_reinit(struct ulist *ulist);
struct ulist *ulist_alloc(gfp_t gfp_mask);
void ulist_free(struct ulist *ulist);
-int ulist_add(struct ulist *ulist, u64 val, unsigned long aux,
- gfp_t gfp_mask);
-int ulist_add_merge(struct ulist *ulist, u64 val, unsigned long aux,
- unsigned long *old_aux, gfp_t gfp_mask);
+int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask);
+int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
+ u64 *old_aux, gfp_t gfp_mask);
struct ulist_node *ulist_next(struct ulist *ulist,
struct ulist_iterator *uiter);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 88b969aeeb7..029b903a4ae 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -639,7 +639,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
bdev = blkdev_get_by_path(device->name->str, flags, holder);
if (IS_ERR(bdev)) {
- printk(KERN_INFO "open %s failed\n", device->name->str);
+ printk(KERN_INFO "btrfs: open %s failed\n", device->name->str);
goto error;
}
filemap_write_and_wait(bdev->bd_inode->i_mapping);
@@ -1475,6 +1475,9 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
free_fs_devices(cur_devices);
}
+ root->fs_info->num_tolerated_disk_barrier_failures =
+ btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info);
+
/*
* at this point, the device is zero sized. We want to
* remove it from the devices list and zero out the old super
@@ -1775,15 +1778,21 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
if (seeding_dev) {
ret = init_first_rw_device(trans, root, device);
- if (ret)
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
goto error_trans;
+ }
ret = btrfs_finish_sprout(trans, root);
- if (ret)
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
goto error_trans;
+ }
} else {
ret = btrfs_add_device(trans, root, device);
- if (ret)
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
goto error_trans;
+ }
}
/*
@@ -1793,6 +1802,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
btrfs_clear_space_info_full(root->fs_info);
unlock_chunks(root);
+ root->fs_info->num_tolerated_disk_barrier_failures =
+ btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info);
ret = btrfs_commit_transaction(trans, root);
if (seeding_dev) {
@@ -1814,7 +1825,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
error_trans:
unlock_chunks(root);
- btrfs_abort_transaction(trans, root, ret);
btrfs_end_transaction(trans, root);
rcu_string_free(device->name);
kfree(device);
@@ -2804,6 +2814,26 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
}
}
+ if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+ int num_tolerated_disk_barrier_failures;
+ u64 target = bctl->sys.target;
+
+ num_tolerated_disk_barrier_failures =
+ btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
+ if (num_tolerated_disk_barrier_failures > 0 &&
+ (target &
+ (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
+ BTRFS_AVAIL_ALLOC_BIT_SINGLE)))
+ num_tolerated_disk_barrier_failures = 0;
+ else if (num_tolerated_disk_barrier_failures > 1 &&
+ (target &
+ (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)))
+ num_tolerated_disk_barrier_failures = 1;
+
+ fs_info->num_tolerated_disk_barrier_failures =
+ num_tolerated_disk_barrier_failures;
+ }
+
ret = insert_balance_item(fs_info->tree_root, bctl);
if (ret && ret != -EEXIST)
goto out;
@@ -2836,6 +2866,11 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
__cancel_balance(fs_info);
}
+ if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+ fs_info->num_tolerated_disk_barrier_failures =
+ btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
+ }
+
wake_up(&fs_info->balance_wait_q);
return ret;
@@ -3608,12 +3643,16 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
&sys_chunk_size, &sys_stripe_size,
sys_chunk_offset, alloc_profile);
- if (ret)
- goto abort;
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
ret = btrfs_add_device(trans, fs_info->chunk_root, device);
- if (ret)
- goto abort;
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
/*
* Modifying chunk tree needs allocating new blocks from both
@@ -3623,19 +3662,19 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
*/
ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
chunk_size, stripe_size);
- if (ret)
- goto abort;
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
ret = __finish_chunk_alloc(trans, extent_root, sys_map,
sys_chunk_offset, sys_chunk_size,
sys_stripe_size);
if (ret)
- goto abort;
+ btrfs_abort_transaction(trans, root, ret);
- return 0;
+out:
-abort:
- btrfs_abort_transaction(trans, root, ret);
return ret;
}
@@ -3760,7 +3799,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
read_unlock(&em_tree->lock);
if (!em) {
- printk(KERN_CRIT "unable to find logical %llu len %llu\n",
+ printk(KERN_CRIT "btrfs: unable to find logical %llu len %llu\n",
(unsigned long long)logical,
(unsigned long long)*length);
BUG();
@@ -4217,7 +4256,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
total_devs = bbio->num_stripes;
if (map_length < length) {
- printk(KERN_CRIT "mapping failed logical %llu bio len %llu "
+ printk(KERN_CRIT "btrfs: mapping failed logical %llu bio len %llu "
"len %llu\n", (unsigned long long)logical,
(unsigned long long)length,
(unsigned long long)map_length);
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 92c20654cc5..9acb846c3e7 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -97,7 +97,7 @@ static int zlib_compress_pages(struct list_head *ws,
*total_in = 0;
if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) {
- printk(KERN_WARNING "deflateInit failed\n");
+ printk(KERN_WARNING "btrfs: deflateInit failed\n");
ret = -1;
goto out;
}
@@ -125,7 +125,7 @@ static int zlib_compress_pages(struct list_head *ws,
while (workspace->def_strm.total_in < len) {
ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH);
if (ret != Z_OK) {
- printk(KERN_DEBUG "btrfs deflate in loop returned %d\n",
+ printk(KERN_DEBUG "btrfs: deflate in loop returned %d\n",
ret);
zlib_deflateEnd(&workspace->def_strm);
ret = -1;
@@ -252,7 +252,7 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
}
if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
- printk(KERN_WARNING "inflateInit failed\n");
+ printk(KERN_WARNING "btrfs: inflateInit failed\n");
return -1;
}
while (workspace->inf_strm.total_in < srclen) {
@@ -336,7 +336,7 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
}
if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
- printk(KERN_WARNING "inflateInit failed\n");
+ printk(KERN_WARNING "btrfs: inflateInit failed\n");
return -1;
}