summaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/vfs_dir.c2
-rw-r--r--fs/btrfs/ctree.c129
-rw-r--r--fs/btrfs/ioctl.c20
-rw-r--r--fs/ceph/Kconfig2
-rw-r--r--fs/ceph/auth_x.c3
-rw-r--r--fs/ceph/caps.c15
-rw-r--r--fs/ceph/dir.c13
-rw-r--r--fs/ceph/file.c2
-rw-r--r--fs/ceph/inode.c6
-rw-r--r--fs/ceph/mds_client.c45
-rw-r--r--fs/ceph/mds_client.h1
-rw-r--r--fs/ceph/messenger.c71
-rw-r--r--fs/ceph/mon_client.c6
-rw-r--r--fs/ceph/osd_client.c6
-rw-r--r--fs/ceph/osdmap.c27
-rw-r--r--fs/cifs/cifsfs.c6
-rw-r--r--fs/cifs/dns_resolve.c69
-rw-r--r--fs/cifs/dns_resolve.h4
-rw-r--r--fs/dcache.c2
-rw-r--r--fs/ecryptfs/messaging.c17
-rw-r--r--fs/gfs2/bmap.c1
-rw-r--r--fs/gfs2/dir.c33
-rw-r--r--fs/gfs2/glock.c12
-rw-r--r--fs/gfs2/inode.c12
-rw-r--r--fs/gfs2/quota.c10
-rw-r--r--fs/gfs2/quota.h2
-rw-r--r--fs/inode.c2
-rw-r--r--fs/jbd2/journal.c15
-rw-r--r--fs/jbd2/transaction.c9
-rw-r--r--fs/jffs2/xattr.c2
-rw-r--r--fs/mbcache.c5
-rw-r--r--fs/nfs/dir.c2
-rw-r--r--fs/nfs/internal.h3
-rw-r--r--fs/ocfs2/aops.c94
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c3
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c22
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c2
-rw-r--r--fs/ocfs2/file.c309
-rw-r--r--fs/ocfs2/file.h6
-rw-r--r--fs/ocfs2/journal.c30
-rw-r--r--fs/ocfs2/localalloc.c7
-rw-r--r--fs/ocfs2/quota_global.c2
-rw-r--r--fs/ocfs2/quota_local.c4
-rw-r--r--fs/ocfs2/refcounttree.c12
-rw-r--r--fs/ocfs2/suballoc.c2
-rw-r--r--fs/ocfs2/xattr.c200
-rw-r--r--fs/partitions/ibm.c14
-rw-r--r--fs/quota/dquot.c2
-rw-r--r--fs/sysfs/symlink.c26
-rw-r--r--fs/ubifs/shrinker.c2
-rw-r--r--fs/ubifs/ubifs.h2
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.c5
-rw-r--r--fs/xfs/linux-2.6/xfs_super.c2
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.c130
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.h2
-rw-r--r--fs/xfs/linux-2.6/xfs_trace.h3
-rw-r--r--fs/xfs/quota/xfs_qm.c7
-rw-r--r--fs/xfs/xfs_mount.h2
58 files changed, 1014 insertions, 430 deletions
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index d61e3b28ce3..36d961f342a 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -146,7 +146,7 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
while (rdir->head < rdir->tail) {
p9stat_init(&st);
err = p9stat_read(rdir->buf + rdir->head,
- buflen - rdir->head, &st,
+ rdir->tail - rdir->head, &st,
fid->clnt->proto_version);
if (err) {
P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err);
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 0d1d966b0fe..c3df14ce2cc 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2304,12 +2304,17 @@ noinline int btrfs_leaf_free_space(struct btrfs_root *root,
return ret;
}
+/*
+ * min_slot controls the lowest index we're willing to push to the
+ * right. We'll push up to and including min_slot, but no lower
+ */
static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
int data_size, int empty,
struct extent_buffer *right,
- int free_space, u32 left_nritems)
+ int free_space, u32 left_nritems,
+ u32 min_slot)
{
struct extent_buffer *left = path->nodes[0];
struct extent_buffer *upper = path->nodes[1];
@@ -2327,7 +2332,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
if (empty)
nr = 0;
else
- nr = 1;
+ nr = max_t(u32, 1, min_slot);
if (path->slots[0] >= left_nritems)
push_space += data_size;
@@ -2469,10 +2474,14 @@ out_unlock:
*
* returns 1 if the push failed because the other node didn't have enough
* room, 0 if everything worked out and < 0 if there were major errors.
+ *
+ * this will push starting from min_slot to the end of the leaf. It won't
+ * push any slot lower than min_slot
*/
static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
- *root, struct btrfs_path *path, int data_size,
- int empty)
+ *root, struct btrfs_path *path,
+ int min_data_size, int data_size,
+ int empty, u32 min_slot)
{
struct extent_buffer *left = path->nodes[0];
struct extent_buffer *right;
@@ -2514,8 +2523,8 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
if (left_nritems == 0)
goto out_unlock;
- return __push_leaf_right(trans, root, path, data_size, empty,
- right, free_space, left_nritems);
+ return __push_leaf_right(trans, root, path, min_data_size, empty,
+ right, free_space, left_nritems, min_slot);
out_unlock:
btrfs_tree_unlock(right);
free_extent_buffer(right);
@@ -2525,12 +2534,17 @@ out_unlock:
/*
* push some data in the path leaf to the left, trying to free up at
* least data_size bytes. returns zero if the push worked, nonzero otherwise
+ *
+ * max_slot can put a limit on how far into the leaf we'll push items. The
+ * item at 'max_slot' won't be touched. Use (u32)-1 to make us do all the
+ * items
*/
static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, int data_size,
int empty, struct extent_buffer *left,
- int free_space, int right_nritems)
+ int free_space, u32 right_nritems,
+ u32 max_slot)
{
struct btrfs_disk_key disk_key;
struct extent_buffer *right = path->nodes[0];
@@ -2549,9 +2563,9 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
slot = path->slots[1];
if (empty)
- nr = right_nritems;
+ nr = min(right_nritems, max_slot);
else
- nr = right_nritems - 1;
+ nr = min(right_nritems - 1, max_slot);
for (i = 0; i < nr; i++) {
item = btrfs_item_nr(right, i);
@@ -2712,10 +2726,14 @@ out:
/*
* push some data in the path leaf to the left, trying to free up at
* least data_size bytes. returns zero if the push worked, nonzero otherwise
+ *
+ * max_slot can put a limit on how far into the leaf we'll push items. The
+ * item at 'max_slot' won't be touched. Use (u32)-1 to make us push all the
+ * items
*/
static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
- *root, struct btrfs_path *path, int data_size,
- int empty)
+ *root, struct btrfs_path *path, int min_data_size,
+ int data_size, int empty, u32 max_slot)
{
struct extent_buffer *right = path->nodes[0];
struct extent_buffer *left;
@@ -2761,8 +2779,9 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
goto out;
}
- return __push_leaf_left(trans, root, path, data_size,
- empty, left, free_space, right_nritems);
+ return __push_leaf_left(trans, root, path, min_data_size,
+ empty, left, free_space, right_nritems,
+ max_slot);
out:
btrfs_tree_unlock(left);
free_extent_buffer(left);
@@ -2855,6 +2874,64 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans,
}
/*
+ * double splits happen when we need to insert a big item in the middle
+ * of a leaf. A double split can leave us with 3 mostly empty leaves:
+ * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
+ * A B C
+ *
+ * We avoid this by trying to push the items on either side of our target
+ * into the adjacent leaves. If all goes well we can avoid the double split
+ * completely.
+ */
+static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
+					  struct btrfs_root *root,
+					  struct btrfs_path *path,
+					  int data_size)
+{
+	int err;
+	int pushed = 0;
+	u32 nritems;
+
+	/*
+	 * first attempt: move the items from our slot onwards into the
+	 * right leaf (our slot is passed as min_slot, so nothing below
+	 * it is pushed)
+	 */
+	err = push_leaf_right(trans, root, path, 1, data_size, 0,
+			      path->slots[0]);
+	if (err < 0)
+		return err;
+	if (!err)
+		pushed = 1;
+
+	/*
+	 * if our slot now sits at the very start or the very end of the
+	 * leaf there is nothing left to do
+	 */
+	nritems = btrfs_header_nritems(path->nodes[0]);
+	if (path->slots[0] == 0 || path->slots[0] == nritems)
+		return 0;
+
+	/* likewise if the leaf already has room for the new item */
+	if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
+		return 0;
+
+	/*
+	 * second attempt: move the items before our slot into the left
+	 * leaf (our slot is passed as max_slot, so it is not touched)
+	 */
+	err = push_leaf_left(trans, root, path, 1, data_size, 0,
+			     path->slots[0]);
+	if (err < 0)
+		return err;
+	if (!err)
+		pushed = 1;
+
+	/* 0 when at least one of the two pushes moved something, else 1 */
+	return !pushed;
+}
+
+/*
* split the path's leaf in two, making sure there is at least data_size
* available for the resulting leaf level of the path.
*
@@ -2876,6 +2953,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
int wret;
int split;
int num_doubles = 0;
+ int tried_avoid_double = 0;
l = path->nodes[0];
slot = path->slots[0];
@@ -2884,12 +2962,14 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
return -EOVERFLOW;
/* first try to make some room by pushing left and right */
- if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) {
- wret = push_leaf_right(trans, root, path, data_size, 0);
+ if (data_size) {
+ wret = push_leaf_right(trans, root, path, data_size,
+ data_size, 0, 0);
if (wret < 0)
return wret;
if (wret) {
- wret = push_leaf_left(trans, root, path, data_size, 0);
+ wret = push_leaf_left(trans, root, path, data_size,
+ data_size, 0, (u32)-1);
if (wret < 0)
return wret;
}
@@ -2923,6 +3003,8 @@ again:
if (mid != nritems &&
leaf_space_used(l, mid, nritems - mid) +
data_size > BTRFS_LEAF_DATA_SIZE(root)) {
+ if (data_size && !tried_avoid_double)
+ goto push_for_double;
split = 2;
}
}
@@ -2939,6 +3021,8 @@ again:
if (mid != nritems &&
leaf_space_used(l, mid, nritems - mid) +
data_size > BTRFS_LEAF_DATA_SIZE(root)) {
+ if (data_size && !tried_avoid_double)
+ goto push_for_double;
split = 2 ;
}
}
@@ -3019,6 +3103,13 @@ again:
}
return ret;
+
+push_for_double:
+ push_for_double_split(trans, root, path, data_size);
+ tried_avoid_double = 1;
+ if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
+ return 0;
+ goto again;
}
static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
@@ -3915,13 +4006,15 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
extent_buffer_get(leaf);
btrfs_set_path_blocking(path);
- wret = push_leaf_left(trans, root, path, 1, 1);
+ wret = push_leaf_left(trans, root, path, 1, 1,
+ 1, (u32)-1);
if (wret < 0 && wret != -ENOSPC)
ret = wret;
if (path->nodes[0] == leaf &&
btrfs_header_nritems(leaf)) {
- wret = push_leaf_right(trans, root, path, 1, 1);
+ wret = push_leaf_right(trans, root, path, 1,
+ 1, 1, 0);
if (wret < 0 && wret != -ENOSPC)
ret = wret;
}
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 4dbaf89b133..9254b3d58db 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1458,7 +1458,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
*/
/* the destination must be opened for writing */
- if (!(file->f_mode & FMODE_WRITE))
+ if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND))
return -EINVAL;
ret = mnt_want_write(file->f_path.mnt);
@@ -1511,7 +1511,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
/* determine range to clone */
ret = -EINVAL;
- if (off >= src->i_size || off + len > src->i_size)
+ if (off + len > src->i_size || off + len < off)
goto out_unlock;
if (len == 0)
olen = len = src->i_size - off;
@@ -1578,6 +1578,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
u64 disko = 0, diskl = 0;
u64 datao = 0, datal = 0;
u8 comp;
+ u64 endoff;
size = btrfs_item_size_nr(leaf, slot);
read_extent_buffer(leaf, buf,
@@ -1712,9 +1713,18 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
btrfs_release_path(root, path);
inode->i_mtime = inode->i_ctime = CURRENT_TIME;
- if (new_key.offset + datal > inode->i_size)
- btrfs_i_size_write(inode,
- new_key.offset + datal);
+
+ /*
+ * we round up to the block size at eof when
+ * determining which extents to clone above,
+ * but shouldn't round up the file size
+ */
+ endoff = new_key.offset + datal;
+ if (endoff > off+olen)
+ endoff = off+olen;
+ if (endoff > inode->i_size)
+ btrfs_i_size_write(inode, endoff);
+
BTRFS_I(inode)->flags = BTRFS_I(src)->flags;
ret = btrfs_update_inode(trans, root, inode);
BUG_ON(ret);
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig
index 04b8280582a..bc87b9c1d27 100644
--- a/fs/ceph/Kconfig
+++ b/fs/ceph/Kconfig
@@ -2,7 +2,7 @@ config CEPH_FS
tristate "Ceph distributed file system (EXPERIMENTAL)"
depends on INET && EXPERIMENTAL
select LIBCRC32C
- select CONFIG_CRYPTO_AES
+ select CRYPTO_AES
help
Choose Y or M here to include support for mounting the
experimental Ceph distributed file system. Ceph is an extremely
diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c
index 3fe49042d8a..6d44053ecff 100644
--- a/fs/ceph/auth_x.c
+++ b/fs/ceph/auth_x.c
@@ -613,6 +613,9 @@ static void ceph_x_destroy(struct ceph_auth_client *ac)
remove_ticket_handler(ac, th);
}
+ if (xi->auth_authorizer.buf)
+ ceph_buffer_put(xi->auth_authorizer.buf);
+
kfree(ac->private);
ac->private = NULL;
}
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 74144d6389f..b81be9a5648 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -627,7 +627,7 @@ retry:
if (fmode >= 0)
__ceph_get_fmode(ci, fmode);
spin_unlock(&inode->i_lock);
- wake_up(&ci->i_cap_wq);
+ wake_up_all(&ci->i_cap_wq);
return 0;
}
@@ -1181,7 +1181,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
}
if (wake)
- wake_up(&ci->i_cap_wq);
+ wake_up_all(&ci->i_cap_wq);
return delayed;
}
@@ -2153,7 +2153,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
else if (flushsnaps)
ceph_flush_snaps(ci);
if (wake)
- wake_up(&ci->i_cap_wq);
+ wake_up_all(&ci->i_cap_wq);
if (put)
iput(inode);
}
@@ -2229,7 +2229,7 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
iput(inode);
} else if (complete_capsnap) {
ceph_flush_snaps(ci);
- wake_up(&ci->i_cap_wq);
+ wake_up_all(&ci->i_cap_wq);
}
if (drop_capsnap)
iput(inode);
@@ -2405,7 +2405,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
if (queue_invalidate)
ceph_queue_invalidate(inode);
if (wake)
- wake_up(&ci->i_cap_wq);
+ wake_up_all(&ci->i_cap_wq);
if (check_caps == 1)
ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY,
@@ -2460,7 +2460,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
struct ceph_inode_info,
i_flushing_item)->vfs_inode);
mdsc->num_cap_flushing--;
- wake_up(&mdsc->cap_flushing_wq);
+ wake_up_all(&mdsc->cap_flushing_wq);
dout(" inode %p now !flushing\n", inode);
if (ci->i_dirty_caps == 0) {
@@ -2472,7 +2472,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
}
}
spin_unlock(&mdsc->cap_dirty_lock);
- wake_up(&ci->i_cap_wq);
+ wake_up_all(&ci->i_cap_wq);
out:
spin_unlock(&inode->i_lock);
@@ -2984,6 +2984,7 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry,
memcpy(*p, dentry->d_name.name, dentry->d_name.len);
*p += dentry->d_name.len;
rel->dname_seq = cpu_to_le32(di->lease_seq);
+ __ceph_mdsc_drop_dentry_lease(dentry);
}
spin_unlock(&dentry->d_lock);
return ret;
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index f85719310db..f94ed3c7f6a 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -266,6 +266,7 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
spin_lock(&inode->i_lock);
if ((filp->f_pos == 2 || fi->dentry) &&
!ceph_test_opt(client, NOASYNCREADDIR) &&
+ ceph_snap(inode) != CEPH_SNAPDIR &&
(ci->i_ceph_flags & CEPH_I_COMPLETE) &&
__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
err = __dcache_readdir(filp, dirent, filldir);
@@ -1013,18 +1014,22 @@ out_touch:
/*
* When a dentry is released, clear the dir I_COMPLETE if it was part
- * of the current dir gen.
+ * of the current dir gen or if this is in the snapshot namespace.
*/
static void ceph_dentry_release(struct dentry *dentry)
{
struct ceph_dentry_info *di = ceph_dentry(dentry);
struct inode *parent_inode = dentry->d_parent->d_inode;
+ u64 snapid = ceph_snap(parent_inode);
- if (parent_inode) {
+ dout("dentry_release %p parent %p\n", dentry, parent_inode);
+
+ if (parent_inode && snapid != CEPH_SNAPDIR) {
struct ceph_inode_info *ci = ceph_inode(parent_inode);
spin_lock(&parent_inode->i_lock);
- if (ci->i_shared_gen == di->lease_shared_gen) {
+ if (ci->i_shared_gen == di->lease_shared_gen ||
+ snapid <= CEPH_MAXSNAP) {
dout(" clearing %p complete (d_release)\n",
parent_inode);
ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
@@ -1241,7 +1246,9 @@ struct dentry_operations ceph_dentry_ops = {
struct dentry_operations ceph_snapdir_dentry_ops = {
.d_revalidate = ceph_snapdir_d_revalidate,
+ .d_release = ceph_dentry_release,
};
struct dentry_operations ceph_snap_dentry_ops = {
+ .d_release = ceph_dentry_release,
};
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 6251a1574b9..7c08698fad3 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -265,7 +265,7 @@ int ceph_release(struct inode *inode, struct file *file)
kmem_cache_free(ceph_file_cachep, cf);
/* wake up anyone waiting for caps on this inode */
- wake_up(&ci->i_cap_wq);
+ wake_up_all(&ci->i_cap_wq);
return 0;
}
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 8f9b9fe8ef9..389f9dbd994 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1199,8 +1199,10 @@ retry_lookup:
goto out;
}
err = ceph_init_dentry(dn);
- if (err < 0)
+ if (err < 0) {
+ dput(dn);
goto out;
+ }
} else if (dn->d_inode &&
(ceph_ino(dn->d_inode) != vino.ino ||
ceph_snap(dn->d_inode) != vino.snap)) {
@@ -1499,7 +1501,7 @@ retry:
if (wrbuffer_refs == 0)
ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
if (wake)
- wake_up(&ci->i_cap_wq);
+ wake_up_all(&ci->i_cap_wq);
}
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 3ab79f6c4ce..dd440bd438a 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -868,7 +868,7 @@ static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,
{
struct ceph_inode_info *ci = ceph_inode(inode);
- wake_up(&ci->i_cap_wq);
+ wake_up_all(&ci->i_cap_wq);
if (arg) {
spin_lock(&inode->i_lock);
ci->i_wanted_max_size = 0;
@@ -1514,6 +1514,9 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
ceph_encode_filepath(&p, end, ino1, path1);
ceph_encode_filepath(&p, end, ino2, path2);
+ /* make note of release offset, in case we need to replay */
+ req->r_request_release_offset = p - msg->front.iov_base;
+
/* cap releases */
releases = 0;
if (req->r_inode_drop)
@@ -1561,7 +1564,7 @@ static void complete_request(struct ceph_mds_client *mdsc,
if (req->r_callback)
req->r_callback(mdsc, req);
else
- complete(&req->r_completion);
+ complete_all(&req->r_completion);
}
/*
@@ -1580,6 +1583,32 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req,
req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts);
+ if (req->r_got_unsafe) {
+ /*
+ * Replay. Do not regenerate message (and rebuild
+ * paths, etc.); just use the original message.
+ * Rebuilding paths will break for renames because
+ * d_move mangles the src name.
+ */
+ msg = req->r_request;
+ rhead = msg->front.iov_base;
+
+ flags = le32_to_cpu(rhead->flags);
+ flags |= CEPH_MDS_FLAG_REPLAY;
+ rhead->flags = cpu_to_le32(flags);
+
+ if (req->r_target_inode)
+ rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));
+
+ rhead->num_retry = req->r_attempts - 1;
+
+ /* remove cap/dentry releases from message */
+ rhead->num_releases = 0;
+ msg->hdr.front_len = cpu_to_le32(req->r_request_release_offset);
+ msg->front.iov_len = req->r_request_release_offset;
+ return 0;
+ }
+
if (req->r_request) {
ceph_msg_put(req->r_request);
req->r_request = NULL;
@@ -1601,13 +1630,9 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
rhead->flags = cpu_to_le32(flags);
rhead->num_fwd = req->r_num_fwd;
rhead->num_retry = req->r_attempts - 1;
+ rhead->ino = 0;
dout(" r_locked_dir = %p\n", req->r_locked_dir);
-
- if (req->r_target_inode && req->r_got_unsafe)
- rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));
- else
- rhead->ino = 0;
return 0;
}
@@ -1907,7 +1932,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
if (head->safe) {
req->r_got_safe = true;
__unregister_request(mdsc, req);
- complete(&req->r_safe_completion);
+ complete_all(&req->r_safe_completion);
if (req->r_got_unsafe) {
/*
@@ -1922,7 +1947,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
/* last unsafe request during umount? */
if (mdsc->stopping && !__get_oldest_req(mdsc))
- complete(&mdsc->safe_umount_waiters);
+ complete_all(&mdsc->safe_umount_waiters);
mutex_unlock(&mdsc->mutex);
goto out;
}
@@ -2101,7 +2126,7 @@ static void handle_session(struct ceph_mds_session *session,
pr_info("mds%d reconnect denied\n", session->s_mds);
remove_session_caps(session);
wake = 1; /* for good measure */
- complete(&mdsc->session_close_waiters);
+ complete_all(&mdsc->session_close_waiters);
kick_requests(mdsc, mds);
break;
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index b292fa42a66..952410c60d0 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -188,6 +188,7 @@ struct ceph_mds_request {
int r_old_inode_drop, r_old_inode_unless;
struct ceph_msg *r_request; /* original request */
+ int r_request_release_offset;
struct ceph_msg *r_reply;
struct ceph_mds_reply_info_parsed r_reply_info;
int r_err;
diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index 9ad43a310a4..15167b2daa5 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -43,7 +43,8 @@ static void ceph_fault(struct ceph_connection *con);
* nicely render a sockaddr as a string.
*/
#define MAX_ADDR_STR 20
-static char addr_str[MAX_ADDR_STR][40];
+#define MAX_ADDR_STR_LEN 60
+static char addr_str[MAX_ADDR_STR][MAX_ADDR_STR_LEN];
static DEFINE_SPINLOCK(addr_str_lock);
static int last_addr_str;
@@ -52,7 +53,6 @@ const char *pr_addr(const struct sockaddr_storage *ss)
int i;
char *s;
struct sockaddr_in *in4 = (void *)ss;
- unsigned char *quad = (void *)&in4->sin_addr.s_addr;
struct sockaddr_in6 *in6 = (void *)ss;
spin_lock(&addr_str_lock);
@@ -64,25 +64,13 @@ const char *pr_addr(const struct sockaddr_storage *ss)
switch (ss->ss_family) {
case AF_INET:
- sprintf(s, "%u.%u.%u.%u:%u",
- (unsigned int)quad[0],
- (unsigned int)quad[1],
- (unsigned int)quad[2],
- (unsigned int)quad[3],
- (unsigned int)ntohs(in4->sin_port));
+ snprintf(s, MAX_ADDR_STR_LEN, "%pI4:%u", &in4->sin_addr,
+ (unsigned int)ntohs(in4->sin_port));
break;
case AF_INET6:
- sprintf(s, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x:%u",
- in6->sin6_addr.s6_addr16[0],
- in6->sin6_addr.s6_addr16[1],
- in6->sin6_addr.s6_addr16[2],
- in6->sin6_addr.s6_addr16[3],
- in6->sin6_addr.s6_addr16[4],
- in6->sin6_addr.s6_addr16[5],
- in6->sin6_addr.s6_addr16[6],
- in6->sin6_addr.s6_addr16[7],
- (unsigned int)ntohs(in6->sin6_port));
+ snprintf(s, MAX_ADDR_STR_LEN, "[%pI6c]:%u", &in6->sin6_addr,
+ (unsigned int)ntohs(in6->sin6_port));
break;
default:
@@ -215,12 +203,13 @@ static void set_sock_callbacks(struct socket *sock,
*/
static struct socket *ceph_tcp_connect(struct ceph_connection *con)
{
- struct sockaddr *paddr = (struct sockaddr *)&con->peer_addr.in_addr;
+ struct sockaddr_storage *paddr = &con->peer_addr.in_addr;
struct socket *sock;
int ret;
BUG_ON(con->sock);
- ret = sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
+ ret = sock_create_kern(con->peer_addr.in_addr.ss_family, SOCK_STREAM,
+ IPPROTO_TCP, &sock);
if (ret)
return ERR_PTR(ret);
con->sock = sock;
@@ -234,7 +223,8 @@ static struct socket *ceph_tcp_connect(struct ceph_connection *con)
dout("connect %s\n", pr_addr(&con->peer_addr.in_addr));
- ret = sock->ops->connect(sock, paddr, sizeof(*paddr), O_NONBLOCK);
+ ret = sock->ops->connect(sock, (struct sockaddr *)paddr, sizeof(*paddr),
+ O_NONBLOCK);
if (ret == -EINPROGRESS) {
dout("connect %s EINPROGRESS sk_state = %u\n",
pr_addr(&con->peer_addr.in_addr),
@@ -1009,19 +999,32 @@ int ceph_parse_ips(const char *c, const char *end,
struct sockaddr_in *in4 = (void *)ss;
struct sockaddr_in6 *in6 = (void *)ss;
int port;
+ char delim = ',';
+
+ if (*p == '[') {
+ delim = ']';
+ p++;
+ }
memset(ss, 0, sizeof(*ss));
if (in4_pton(p, end - p, (u8 *)&in4->sin_addr.s_addr,
- ',', &ipend)) {
+ delim, &ipend))
ss->ss_family = AF_INET;
- } else if (in6_pton(p, end - p, (u8 *)&in6->sin6_addr.s6_addr,
- ',', &ipend)) {
+ else if (in6_pton(p, end - p, (u8 *)&in6->sin6_addr.s6_addr,
+ delim, &ipend))
ss->ss_family = AF_INET6;
- } else {
+ else
goto bad;
- }
p = ipend;
+ if (delim == ']') {
+ if (*p != ']') {
+ dout("missing matching ']'\n");
+ goto bad;
+ }
+ p++;
+ }
+
/* port? */
if (p < end && *p == ':') {
port = 0;
@@ -1055,7 +1058,7 @@ int ceph_parse_ips(const char *c, const char *end,
return 0;
bad:
- pr_err("parse_ips bad ip '%s'\n", c);
+ pr_err("parse_ips bad ip '%.*s'\n", (int)(end - c), c);
return -EINVAL;
}
@@ -2015,20 +2018,20 @@ void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg)
{
mutex_lock(&con->mutex);
if (!list_empty(&msg->list_head)) {
- dout("con_revoke %p msg %p\n", con, msg);
+ dout("con_revoke %p msg %p - was on queue\n", con, msg);
list_del_init(&msg->list_head);
ceph_msg_put(msg);
msg->hdr.seq = 0;
- if (con->out_msg == msg) {
- ceph_msg_put(con->out_msg);
- con->out_msg = NULL;
- }
+ }
+ if (con->out_msg == msg) {
+ dout("con_revoke %p msg %p - was sending\n", con, msg);
+ con->out_msg = NULL;
if (con->out_kvec_is_msg) {
con->out_skip = con->out_kvec_bytes;
con->out_kvec_is_msg = false;
}
- } else {
- dout("con_revoke %p msg %p - not queued (sent?)\n", con, msg);
+ ceph_msg_put(msg);
+ msg->hdr.seq = 0;
}
mutex_unlock(&con->mutex);
}
diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
index cc115eafae1..54fe01c5070 100644
--- a/fs/ceph/mon_client.c
+++ b/fs/ceph/mon_client.c
@@ -345,7 +345,7 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc,
out:
mutex_unlock(&monc->mutex);
- wake_up(&client->auth_wq);
+ wake_up_all(&client->auth_wq);
}
/*
@@ -462,7 +462,7 @@ static void handle_statfs_reply(struct ceph_mon_client *monc,
}
mutex_unlock(&monc->mutex);
if (req) {
- complete(&req->completion);
+ complete_all(&req->completion);
put_generic_request(req);
}
return;
@@ -718,7 +718,7 @@ static void handle_auth_reply(struct ceph_mon_client *monc,
monc->m_auth->front_max);
if (ret < 0) {
monc->client->auth_err = ret;
- wake_up(&monc->client->auth_wq);
+ wake_up_all(&monc->client->auth_wq);
} else if (ret > 0) {
__send_prepared_auth_request(monc, ret);
} else if (!was_auth && monc->auth->ops->is_authenticated(monc->auth)) {
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index 92b7251a53f..e3852234789 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -862,12 +862,12 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
if (req->r_callback)
req->r_callback(req, msg);
else
- complete(&req->r_completion);
+ complete_all(&req->r_completion);
if (flags & CEPH_OSD_FLAG_ONDISK) {
if (req->r_safe_callback)
req->r_safe_callback(req, msg);
- complete(&req->r_safe_completion); /* fsync waiter */
+ complete_all(&req->r_safe_completion); /* fsync waiter */
}
done:
@@ -1083,7 +1083,7 @@ done:
if (newmap)
kick_requests(osdc, NULL);
up_read(&osdc->map_sem);
- wake_up(&osdc->client->auth_wq);
+ wake_up_all(&osdc->client->auth_wq);
return;
bad:
diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
index 50ce64ebd33..416d46adbf8 100644
--- a/fs/ceph/osdmap.c
+++ b/fs/ceph/osdmap.c
@@ -568,6 +568,7 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
if (ev > CEPH_PG_POOL_VERSION) {
pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
ev, CEPH_PG_POOL_VERSION);
+ kfree(pi);
goto bad;
}
__decode_pool(p, pi);
@@ -830,12 +831,13 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
/* remove any? */
while (rbp && pgid_cmp(rb_entry(rbp, struct ceph_pg_mapping,
node)->pgid, pgid) <= 0) {
- struct rb_node *cur = rbp;
+ struct ceph_pg_mapping *cur =
+ rb_entry(rbp, struct ceph_pg_mapping, node);
+
rbp = rb_next(rbp);
- dout(" removed pg_temp %llx\n",
- *(u64 *)&rb_entry(cur, struct ceph_pg_mapping,
- node)->pgid);
- rb_erase(cur, &map->pg_temp);
+ dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid);
+ rb_erase(&cur->node, &map->pg_temp);
+ kfree(cur);
}
if (pglen) {
@@ -851,19 +853,22 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
for (j = 0; j < pglen; j++)
pg->osds[j] = ceph_decode_32(p);
err = __insert_pg_mapping(pg, &map->pg_temp);
- if (err)
+ if (err) {
+ kfree(pg);
goto bad;
+ }
dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid,
pglen);
}
}
while (rbp) {
- struct rb_node *cur = rbp;
+ struct ceph_pg_mapping *cur =
+ rb_entry(rbp, struct ceph_pg_mapping, node);
+
rbp = rb_next(rbp);
- dout(" removed pg_temp %llx\n",
- *(u64 *)&rb_entry(cur, struct ceph_pg_mapping,
- node)->pgid);
- rb_erase(cur, &map->pg_temp);
+ dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid);
+ rb_erase(&cur->node, &map->pg_temp);
+ kfree(cur);
}
/* ignore the rest */
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 484e52bb40b..2cb1a70214d 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -923,7 +923,7 @@ init_cifs(void)
goto out_unregister_filesystem;
#endif
#ifdef CONFIG_CIFS_DFS_UPCALL
- rc = register_key_type(&key_type_dns_resolver);
+ rc = cifs_init_dns_resolver();
if (rc)
goto out_unregister_key_type;
#endif
@@ -935,7 +935,7 @@ init_cifs(void)
out_unregister_resolver_key:
#ifdef CONFIG_CIFS_DFS_UPCALL
- unregister_key_type(&key_type_dns_resolver);
+ cifs_exit_dns_resolver();
out_unregister_key_type:
#endif
#ifdef CONFIG_CIFS_UPCALL
@@ -961,7 +961,7 @@ exit_cifs(void)
cifs_proc_clean();
#ifdef CONFIG_CIFS_DFS_UPCALL
cifs_dfs_release_automount_timer();
- unregister_key_type(&key_type_dns_resolver);
+ cifs_exit_dns_resolver();
#endif
#ifdef CONFIG_CIFS_UPCALL
unregister_key_type(&cifs_spnego_key_type);
diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c
index 4db2c5e7283..49315cbf742 100644
--- a/fs/cifs/dns_resolve.c
+++ b/fs/cifs/dns_resolve.c
@@ -24,12 +24,16 @@
*/
#include <linux/slab.h>
+#include <linux/keyctl.h>
+#include <linux/key-type.h>
#include <keys/user-type.h>
#include "dns_resolve.h"
#include "cifsglob.h"
#include "cifsproto.h"
#include "cifs_debug.h"
+static const struct cred *dns_resolver_cache;
+
/* Checks if supplied name is IP address
* returns:
* 1 - name is IP
@@ -94,6 +98,7 @@ struct key_type key_type_dns_resolver = {
int
dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
{
+ const struct cred *saved_cred;
int rc = -EAGAIN;
struct key *rkey = ERR_PTR(-EAGAIN);
char *name;
@@ -133,8 +138,15 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
goto skip_upcall;
}
+ saved_cred = override_creds(dns_resolver_cache);
rkey = request_key(&key_type_dns_resolver, name, "");
+ revert_creds(saved_cred);
if (!IS_ERR(rkey)) {
+ if (!(rkey->perm & KEY_USR_VIEW)) {
+ down_read(&rkey->sem);
+ rkey->perm |= KEY_USR_VIEW;
+ up_read(&rkey->sem);
+ }
len = rkey->type_data.x[0];
data = rkey->payload.data;
} else {
@@ -165,4 +177,61 @@ out:
return rc;
}
+int __init cifs_init_dns_resolver(void)
+{
+	struct key *cache_ring;
+	struct cred *kcred;
+	int rc;
+
+	printk(KERN_NOTICE "Registering the %s key type\n",
+	       key_type_dns_resolver.name);
+
+	/*
+	 * Build an override credential set carrying its own thread keyring,
+	 * which is where DNS lookups get cached.  Keeping the cache in this
+	 * special keyring prevents malicious redirections from being
+	 * installed with add_key().
+	 */
+	kcred = prepare_kernel_cred(NULL);
+	if (!kcred)
+		return -ENOMEM;
+
+	cache_ring = key_alloc(&key_type_keyring, ".dns_resolver", 0, 0, kcred,
+			       (KEY_POS_ALL & ~KEY_POS_SETATTR) |
+			       KEY_USR_VIEW | KEY_USR_READ,
+			       KEY_ALLOC_NOT_IN_QUOTA);
+	if (IS_ERR(cache_ring)) {
+		rc = PTR_ERR(cache_ring);
+		goto err_put_cred;
+	}
+
+	/* the key type is only registered once the keyring is instantiated */
+	rc = key_instantiate_and_link(cache_ring, NULL, 0, NULL, NULL);
+	if (rc >= 0)
+		rc = register_key_type(&key_type_dns_resolver);
+	if (rc < 0)
+		goto err_put_key;
+
+	/*
+	 * tell request_key() to use the special keyring as the cache for
+	 * whatever it looks up, then publish the credentials
+	 */
+	kcred->thread_keyring = cache_ring;
+	kcred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;
+	dns_resolver_cache = kcred;
+	return 0;
+
+err_put_key:
+	key_put(cache_ring);
+err_put_cred:
+	put_cred(kcred);
+	return rc;
+}
+/*
+ * Undo cifs_init_dns_resolver(): revoke the cache keyring, unregister the
+ * dns_resolver key type and drop the override credentials.
+ *
+ * Deliberately NOT marked __exit: init_cifs() calls this on its error
+ * unwind path, and __exit text may be discarded at link time when the
+ * code is built in, so the annotation would be a section mismatch.
+ */
+void cifs_exit_dns_resolver(void)
+{
+	key_revoke(dns_resolver_cache->thread_keyring);
+	unregister_key_type(&key_type_dns_resolver);
+	put_cred(dns_resolver_cache);
+	printk(KERN_NOTICE "Unregistered %s key type\n",
+	       key_type_dns_resolver.name);
+}
diff --git a/fs/cifs/dns_resolve.h b/fs/cifs/dns_resolve.h
index 966e9288930..26b9eaa9f5e 100644
--- a/fs/cifs/dns_resolve.h
+++ b/fs/cifs/dns_resolve.h
@@ -24,8 +24,8 @@
#define _DNS_RESOLVE_H
#ifdef __KERNEL__
-#include <linux/key-type.h>
-extern struct key_type key_type_dns_resolver;
+extern int __init cifs_init_dns_resolver(void);
+extern void cifs_exit_dns_resolver(void);
extern int dns_resolve_server_name_to_ip(const char *unc, char **ip_addr);
#endif /* KERNEL */
diff --git a/fs/dcache.c b/fs/dcache.c
index c8c78ba0782..86d4db15473 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -896,7 +896,7 @@ EXPORT_SYMBOL(shrink_dcache_parent);
*
* In this case we return -1 to tell the caller that we baled.
*/
-static int shrink_dcache_memory(int nr, gfp_t gfp_mask)
+static int shrink_dcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
{
if (nr) {
if (!(gfp_mask & __GFP_FS))
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index 2d8dbce9d48..46c4dd8dfcc 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -31,9 +31,9 @@ static struct mutex ecryptfs_msg_ctx_lists_mux;
static struct hlist_head *ecryptfs_daemon_hash;
struct mutex ecryptfs_daemon_hash_mux;
-static int ecryptfs_hash_buckets;
+static int ecryptfs_hash_bits;
#define ecryptfs_uid_hash(uid) \
- hash_long((unsigned long)uid, ecryptfs_hash_buckets)
+ hash_long((unsigned long)uid, ecryptfs_hash_bits)
static u32 ecryptfs_msg_counter;
static struct ecryptfs_msg_ctx *ecryptfs_msg_ctx_arr;
@@ -486,18 +486,19 @@ int ecryptfs_init_messaging(void)
}
mutex_init(&ecryptfs_daemon_hash_mux);
mutex_lock(&ecryptfs_daemon_hash_mux);
- ecryptfs_hash_buckets = 1;
- while (ecryptfs_number_of_users >> ecryptfs_hash_buckets)
- ecryptfs_hash_buckets++;
+ ecryptfs_hash_bits = 1;
+ while (ecryptfs_number_of_users >> ecryptfs_hash_bits)
+ ecryptfs_hash_bits++;
ecryptfs_daemon_hash = kmalloc((sizeof(struct hlist_head)
- * ecryptfs_hash_buckets), GFP_KERNEL);
+ * (1 << ecryptfs_hash_bits)),
+ GFP_KERNEL);
if (!ecryptfs_daemon_hash) {
rc = -ENOMEM;
printk(KERN_ERR "%s: Failed to allocate memory\n", __func__);
mutex_unlock(&ecryptfs_daemon_hash_mux);
goto out;
}
- for (i = 0; i < ecryptfs_hash_buckets; i++)
+ for (i = 0; i < (1 << ecryptfs_hash_bits); i++)
INIT_HLIST_HEAD(&ecryptfs_daemon_hash[i]);
mutex_unlock(&ecryptfs_daemon_hash_mux);
ecryptfs_msg_ctx_arr = kmalloc((sizeof(struct ecryptfs_msg_ctx)
@@ -554,7 +555,7 @@ void ecryptfs_release_messaging(void)
int i;
mutex_lock(&ecryptfs_daemon_hash_mux);
- for (i = 0; i < ecryptfs_hash_buckets; i++) {
+ for (i = 0; i < (1 << ecryptfs_hash_bits); i++) {
int rc;
hlist_for_each_entry(daemon, elem,
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 4a48c0f4b40..84da64b551b 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -1041,6 +1041,7 @@ static int trunc_start(struct gfs2_inode *ip, u64 size)
if (gfs2_is_stuffed(ip)) {
u64 dsize = size + sizeof(struct gfs2_inode);
+ ip->i_disksize = size;
ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
gfs2_trans_add_bh(ip->i_gl, dibh, 1);
gfs2_dinode_out(ip, dibh->b_data);
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 8295c5b5d4a..6b48d7c268b 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -392,7 +392,7 @@ static int gfs2_dirent_find_space(const struct gfs2_dirent *dent,
unsigned totlen = be16_to_cpu(dent->de_rec_len);
if (gfs2_dirent_sentinel(dent))
- actual = GFS2_DIRENT_SIZE(0);
+ actual = 0;
if (totlen - actual >= required)
return 1;
return 0;
@@ -1231,6 +1231,25 @@ static int do_filldir_main(struct gfs2_inode *dip, u64 *offset,
return 0;
}
+static void *gfs2_alloc_sort_buffer(unsigned size)
+{
+ void *ptr = NULL;
+
+ if (size < KMALLOC_MAX_SIZE)
+ ptr = kmalloc(size, GFP_NOFS | __GFP_NOWARN);
+ if (!ptr)
+ ptr = __vmalloc(size, GFP_NOFS, PAGE_KERNEL);
+ return ptr;
+}
+
+static void gfs2_free_sort_buffer(void *ptr)
+{
+ if (is_vmalloc_addr(ptr))
+ vfree(ptr);
+ else
+ kfree(ptr);
+}
+
static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
filldir_t filldir, int *copied, unsigned *depth,
u64 leaf_no)
@@ -1271,7 +1290,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
* 99 is the maximum number of entries that can fit in a single
* leaf block.
*/
- larr = vmalloc((leaves + entries + 99) * sizeof(void *));
+ larr = gfs2_alloc_sort_buffer((leaves + entries + 99) * sizeof(void *));
if (!larr)
goto out;
darr = (const struct gfs2_dirent **)(larr + leaves);
@@ -1282,7 +1301,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
do {
error = get_leaf(ip, lfn, &bh);
if (error)
- goto out_kfree;
+ goto out_free;
lf = (struct gfs2_leaf *)bh->b_data;
lfn = be64_to_cpu(lf->lf_next);
if (lf->lf_entries) {
@@ -1291,7 +1310,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
gfs2_dirent_gather, NULL, &g);
error = PTR_ERR(dent);
if (IS_ERR(dent))
- goto out_kfree;
+ goto out_free;
if (entries2 != g.offset) {
fs_warn(sdp, "Number of entries corrupt in dir "
"leaf %llu, entries2 (%u) != "
@@ -1300,7 +1319,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
entries2, g.offset);
error = -EIO;
- goto out_kfree;
+ goto out_free;
}
error = 0;
larr[leaf++] = bh;
@@ -1312,10 +1331,10 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
BUG_ON(entries2 != entries);
error = do_filldir_main(ip, offset, opaque, filldir, darr,
entries, copied);
-out_kfree:
+out_free:
for(i = 0; i < leaf; i++)
brelse(larr[i]);
- vfree(larr);
+ gfs2_free_sort_buffer(larr);
out:
return error;
}
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index ddcdbf49353..0898f3ec821 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -706,8 +706,18 @@ static void glock_work_func(struct work_struct *work)
{
unsigned long delay = 0;
struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_work.work);
+ struct gfs2_holder *gh;
int drop_ref = 0;
+ if (unlikely(test_bit(GLF_FROZEN, &gl->gl_flags))) {
+ spin_lock(&gl->gl_spin);
+ gh = find_first_waiter(gl);
+ if (gh && (gh->gh_flags & LM_FLAG_NOEXP) &&
+ test_and_clear_bit(GLF_FROZEN, &gl->gl_flags))
+ set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
+ spin_unlock(&gl->gl_spin);
+ }
+
if (test_and_clear_bit(GLF_REPLY_PENDING, &gl->gl_flags)) {
finish_xmote(gl, gl->gl_reply);
drop_ref = 1;
@@ -1348,7 +1358,7 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
}
-static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask)
+static int gfs2_shrink_glock_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
{
struct gfs2_glock *gl;
int may_demote;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index b5612cbb62a..f03afd9c44b 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -169,7 +169,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb,
{
struct inode *inode;
struct gfs2_inode *ip;
- struct gfs2_glock *io_gl;
+ struct gfs2_glock *io_gl = NULL;
int error;
inode = gfs2_iget(sb, no_addr);
@@ -198,6 +198,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb,
ip->i_iopen_gh.gh_gl->gl_object = ip;
gfs2_glock_put(io_gl);
+ io_gl = NULL;
if ((type == DT_UNKNOWN) && (no_formal_ino == 0))
goto gfs2_nfsbypass;
@@ -228,7 +229,8 @@ gfs2_nfsbypass:
fail_glock:
gfs2_glock_dq(&ip->i_iopen_gh);
fail_iopen:
- gfs2_glock_put(io_gl);
+ if (io_gl)
+ gfs2_glock_put(io_gl);
fail_put:
if (inode->i_state & I_NEW)
ip->i_gl->gl_object = NULL;
@@ -256,7 +258,7 @@ void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr)
{
struct gfs2_sbd *sdp;
struct gfs2_inode *ip;
- struct gfs2_glock *io_gl;
+ struct gfs2_glock *io_gl = NULL;
int error;
struct gfs2_holder gh;
struct inode *inode;
@@ -293,6 +295,7 @@ void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr)
ip->i_iopen_gh.gh_gl->gl_object = ip;
gfs2_glock_put(io_gl);
+ io_gl = NULL;
inode->i_mode = DT2IF(DT_UNKNOWN);
@@ -319,7 +322,8 @@ void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr)
fail_glock:
gfs2_glock_dq(&ip->i_iopen_gh);
fail_iopen:
- gfs2_glock_put(io_gl);
+ if (io_gl)
+ gfs2_glock_put(io_gl);
fail_put:
ip->i_gl->gl_object = NULL;
gfs2_glock_put(ip->i_gl);
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 49667d68769..8f02d3db8f4 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -77,7 +77,7 @@ static LIST_HEAD(qd_lru_list);
static atomic_t qd_lru_count = ATOMIC_INIT(0);
static DEFINE_SPINLOCK(qd_lru_lock);
-int gfs2_shrink_qd_memory(int nr, gfp_t gfp_mask)
+int gfs2_shrink_qd_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
{
struct gfs2_quota_data *qd;
struct gfs2_sbd *sdp;
@@ -694,10 +694,8 @@ get_a_page:
if (!buffer_mapped(bh))
goto unlock_out;
/* If it's a newly allocated disk block for quota, zero it */
- if (buffer_new(bh)) {
- memset(bh->b_data, 0, bh->b_size);
- set_buffer_uptodate(bh);
- }
+ if (buffer_new(bh))
+ zero_user(page, pos - blocksize, bh->b_size);
}
if (PageUptodate(page))
@@ -723,7 +721,7 @@ get_a_page:
/* If quota straddles page boundary, we need to update the rest of the
* quota at the beginning of the next page */
- if (offset != 0) { /* first page, offset is closer to PAGE_CACHE_SIZE */
+ if ((offset + sizeof(struct gfs2_quota)) > PAGE_CACHE_SIZE) {
ptr = ptr + nbytes;
nbytes = sizeof(struct gfs2_quota) - nbytes;
offset = 0;
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index 195f60c8bd1..e7d236ca48b 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -51,7 +51,7 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip)
return ret;
}
-extern int gfs2_shrink_qd_memory(int nr, gfp_t gfp_mask);
+extern int gfs2_shrink_qd_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask);
extern const struct quotactl_ops gfs2_quotactl_ops;
#endif /* __QUOTA_DOT_H__ */
diff --git a/fs/inode.c b/fs/inode.c
index 2bee20ae3d6..722860b323a 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -512,7 +512,7 @@ static void prune_icache(int nr_to_scan)
* This function is passed the number of inodes to scan, and it returns the
* total number of remaining possibly-reclaimable inodes.
*/
-static int shrink_icache_memory(int nr, gfp_t gfp_mask)
+static int shrink_icache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
{
if (nr) {
/*
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index bc2ff593276..036880895bf 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -297,7 +297,6 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
struct page *new_page;
unsigned int new_offset;
struct buffer_head *bh_in = jh2bh(jh_in);
- struct jbd2_buffer_trigger_type *triggers;
journal_t *journal = transaction->t_journal;
/*
@@ -328,21 +327,21 @@ repeat:
done_copy_out = 1;
new_page = virt_to_page(jh_in->b_frozen_data);
new_offset = offset_in_page(jh_in->b_frozen_data);
- triggers = jh_in->b_frozen_triggers;
} else {
new_page = jh2bh(jh_in)->b_page;
new_offset = offset_in_page(jh2bh(jh_in)->b_data);
- triggers = jh_in->b_triggers;
}
mapped_data = kmap_atomic(new_page, KM_USER0);
/*
- * Fire any commit trigger. Do this before checking for escaping,
- * as the trigger may modify the magic offset. If a copy-out
- * happens afterwards, it will have the correct data in the buffer.
+ * Fire the data frozen trigger if data wasn't already frozen. Do this
+ * before checking for escaping, as the trigger may modify the magic
+ * offset. If a copy-out happens afterwards, it will have the correct
+ * data in the buffer.
*/
- jbd2_buffer_commit_trigger(jh_in, mapped_data + new_offset,
- triggers);
+ if (!done_copy_out)
+ jbd2_buffer_frozen_trigger(jh_in, mapped_data + new_offset,
+ jh_in->b_triggers);
/*
* Check for escaping
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index e214d68620a..b8e0806681b 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -725,6 +725,9 @@ done:
page = jh2bh(jh)->b_page;
offset = ((unsigned long) jh2bh(jh)->b_data) & ~PAGE_MASK;
source = kmap_atomic(page, KM_USER0);
+ /* Fire data frozen trigger just before we copy the data */
+ jbd2_buffer_frozen_trigger(jh, source + offset,
+ jh->b_triggers);
memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size);
kunmap_atomic(source, KM_USER0);
@@ -963,15 +966,15 @@ void jbd2_journal_set_triggers(struct buffer_head *bh,
jh->b_triggers = type;
}
-void jbd2_buffer_commit_trigger(struct journal_head *jh, void *mapped_data,
+void jbd2_buffer_frozen_trigger(struct journal_head *jh, void *mapped_data,
struct jbd2_buffer_trigger_type *triggers)
{
struct buffer_head *bh = jh2bh(jh);
- if (!triggers || !triggers->t_commit)
+ if (!triggers || !triggers->t_frozen)
return;
- triggers->t_commit(triggers, bh, mapped_data, bh->b_size);
+ triggers->t_frozen(triggers, bh, mapped_data, bh->b_size);
}
void jbd2_buffer_abort_trigger(struct journal_head *jh,
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index a2d58c96f1b..d258e261bdc 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -626,7 +626,7 @@ void jffs2_xattr_free_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *i
static int check_xattr_ref_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic)
{
- /* success of check_xattr_ref_inode() means taht inode (ic) dose not have
+ /* success of check_xattr_ref_inode() means that inode (ic) does not have
* duplicate name/value pairs. If duplicate name/value pair would be found,
* one will be removed.
*/
diff --git a/fs/mbcache.c b/fs/mbcache.c
index ec88ff3d04a..e28f21b9534 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -115,7 +115,7 @@ mb_cache_indexes(struct mb_cache *cache)
* What the mbcache registers as to get shrunk dynamically.
*/
-static int mb_cache_shrink_fn(int nr_to_scan, gfp_t gfp_mask);
+static int mb_cache_shrink_fn(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask);
static struct shrinker mb_cache_shrinker = {
.shrink = mb_cache_shrink_fn,
@@ -191,13 +191,14 @@ forget:
* This function is called by the kernel memory management when memory
* gets low.
*
+ * @shrink: (ignored)
* @nr_to_scan: Number of objects to scan
* @gfp_mask: (ignored)
*
* Returns the number of objects which are present in the cache.
*/
static int
-mb_cache_shrink_fn(int nr_to_scan, gfp_t gfp_mask)
+mb_cache_shrink_fn(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
{
LIST_HEAD(free_list);
struct list_head *l, *ltmp;
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 782b431ef91..e60416d3f81 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1710,7 +1710,7 @@ static void nfs_access_free_list(struct list_head *head)
}
}
-int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask)
+int nfs_access_cache_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
{
LIST_HEAD(head);
struct nfs_inode *nfsi;
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index d8bd619e386..e70f44b9b3f 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -205,7 +205,8 @@ extern struct rpc_procinfo nfs4_procedures[];
void nfs_close_context(struct nfs_open_context *ctx, int is_sync);
/* dir.c */
-extern int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask);
+extern int nfs_access_cache_shrinker(struct shrinker *shrink,
+ int nr_to_scan, gfp_t gfp_mask);
/* inode.c */
extern struct workqueue_struct *nfsiod_workqueue;
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 3623ca20cc1..356e976772b 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -196,15 +196,14 @@ int ocfs2_get_block(struct inode *inode, sector_t iblock,
dump_stack();
goto bail;
}
-
- past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
- mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino,
- (unsigned long long)past_eof);
-
- if (create && (iblock >= past_eof))
- set_buffer_new(bh_result);
}
+ past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
+ mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino,
+ (unsigned long long)past_eof);
+ if (create && (iblock >= past_eof))
+ set_buffer_new(bh_result);
+
bail:
if (err < 0)
err = -EIO;
@@ -459,36 +458,6 @@ int walk_page_buffers( handle_t *handle,
return ret;
}
-handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
- struct page *page,
- unsigned from,
- unsigned to)
-{
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- handle_t *handle;
- int ret = 0;
-
- handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
- if (IS_ERR(handle)) {
- ret = -ENOMEM;
- mlog_errno(ret);
- goto out;
- }
-
- if (ocfs2_should_order_data(inode)) {
- ret = ocfs2_jbd2_file_inode(handle, inode);
- if (ret < 0)
- mlog_errno(ret);
- }
-out:
- if (ret) {
- if (!IS_ERR(handle))
- ocfs2_commit_trans(osb, handle);
- handle = ERR_PTR(ret);
- }
- return handle;
-}
-
static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
{
sector_t status;
@@ -1131,23 +1100,37 @@ out:
*/
static int ocfs2_grab_pages_for_write(struct address_space *mapping,
struct ocfs2_write_ctxt *wc,
- u32 cpos, loff_t user_pos, int new,
+ u32 cpos, loff_t user_pos,
+ unsigned user_len, int new,
struct page *mmap_page)
{
int ret = 0, i;
- unsigned long start, target_index, index;
+ unsigned long start, target_index, end_index, index;
struct inode *inode = mapping->host;
+ loff_t last_byte;
target_index = user_pos >> PAGE_CACHE_SHIFT;
/*
* Figure out how many pages we'll be manipulating here. For
* non allocating write, we just change the one
- * page. Otherwise, we'll need a whole clusters worth.
+ * page. Otherwise, we'll need a whole cluster's worth. If we're
+ * writing past i_size, we only need enough pages to cover the
+ * last page of the write.
*/
if (new) {
wc->w_num_pages = ocfs2_pages_per_cluster(inode->i_sb);
start = ocfs2_align_clusters_to_page_index(inode->i_sb, cpos);
+ /*
+ * We need the index *past* the last page we could possibly
+ * touch. This is the page past the end of the write or
+ * i_size, whichever is greater.
+ */
+ last_byte = max(user_pos + user_len, i_size_read(inode));
+ BUG_ON(last_byte < 1);
+ end_index = ((last_byte - 1) >> PAGE_CACHE_SHIFT) + 1;
+ if ((start + wc->w_num_pages) > end_index)
+ wc->w_num_pages = end_index - start;
} else {
wc->w_num_pages = 1;
start = target_index;
@@ -1620,21 +1603,20 @@ out:
* write path can treat it as an non-allocating write, which has no
* special case code for sparse/nonsparse files.
*/
-static int ocfs2_expand_nonsparse_inode(struct inode *inode, loff_t pos,
- unsigned len,
+static int ocfs2_expand_nonsparse_inode(struct inode *inode,
+ struct buffer_head *di_bh,
+ loff_t pos, unsigned len,
struct ocfs2_write_ctxt *wc)
{
int ret;
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
loff_t newsize = pos + len;
- if (ocfs2_sparse_alloc(osb))
- return 0;
+ BUG_ON(ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)));
if (newsize <= i_size_read(inode))
return 0;
- ret = ocfs2_extend_no_holes(inode, newsize, pos);
+ ret = ocfs2_extend_no_holes(inode, di_bh, newsize, pos);
if (ret)
mlog_errno(ret);
@@ -1644,6 +1626,18 @@ static int ocfs2_expand_nonsparse_inode(struct inode *inode, loff_t pos,
return ret;
}
+static int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh,
+ loff_t pos)
+{
+ int ret = 0;
+
+ BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)));
+ if (pos > i_size_read(inode))
+ ret = ocfs2_zero_extend(inode, di_bh, pos);
+
+ return ret;
+}
+
int ocfs2_write_begin_nolock(struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata,
@@ -1679,7 +1673,11 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
}
}
- ret = ocfs2_expand_nonsparse_inode(inode, pos, len, wc);
+ if (ocfs2_sparse_alloc(osb))
+ ret = ocfs2_zero_tail(inode, di_bh, pos);
+ else
+ ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos, len,
+ wc);
if (ret) {
mlog_errno(ret);
goto out;
@@ -1789,7 +1787,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
* that we can zero and flush if we error after adding the
* extent.
*/
- ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos,
+ ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, len,
cluster_of_pages, mmap_page);
if (ret) {
mlog_errno(ret);
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 6b5a492e174..153abb5abef 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -1671,7 +1671,7 @@ struct dlm_ctxt * dlm_register_domain(const char *domain,
struct dlm_ctxt *dlm = NULL;
struct dlm_ctxt *new_ctxt = NULL;
- if (strlen(domain) > O2NM_MAX_NAME_LEN) {
+ if (strlen(domain) >= O2NM_MAX_NAME_LEN) {
ret = -ENAMETOOLONG;
mlog(ML_ERROR, "domain name length too long\n");
goto leave;
@@ -1709,6 +1709,7 @@ retry:
}
if (dlm_protocol_compare(&dlm->fs_locking_proto, fs_proto)) {
+ spin_unlock(&dlm_domain_lock);
mlog(ML_ERROR,
"Requested locking protocol version is not "
"compatible with already registered domain "
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 4a7506a4e31..94b97fc6a88 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -2808,14 +2808,8 @@ again:
mlog(0, "trying again...\n");
goto again;
}
- /* now that we are sure the MIGRATING state is there, drop
- * the unneded state which blocked threads trying to DIRTY */
- spin_lock(&res->spinlock);
- BUG_ON(!(res->state & DLM_LOCK_RES_BLOCK_DIRTY));
- BUG_ON(!(res->state & DLM_LOCK_RES_MIGRATING));
- res->state &= ~DLM_LOCK_RES_BLOCK_DIRTY;
- spin_unlock(&res->spinlock);
+ ret = 0;
/* did the target go down or die? */
spin_lock(&dlm->spinlock);
if (!test_bit(target, dlm->domain_map)) {
@@ -2826,9 +2820,21 @@ again:
spin_unlock(&dlm->spinlock);
/*
+ * if target is down, we need to clear DLM_LOCK_RES_BLOCK_DIRTY for
+ * another try; otherwise, we are sure the MIGRATING state is there,
+ * drop the unneeded state which blocked threads trying to DIRTY
+ */
+ spin_lock(&res->spinlock);
+ BUG_ON(!(res->state & DLM_LOCK_RES_BLOCK_DIRTY));
+ res->state &= ~DLM_LOCK_RES_BLOCK_DIRTY;
+ if (!ret)
+ BUG_ON(!(res->state & DLM_LOCK_RES_MIGRATING));
+ spin_unlock(&res->spinlock);
+
+ /*
* at this point:
*
- * o the DLM_LOCK_RES_MIGRATING flag is set
+ * o the DLM_LOCK_RES_MIGRATING flag is set if target not down
* o there are no pending asts on this lockres
* o all processes trying to reserve an ast on this
* lockres must wait for the MIGRATING flag to clear
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index f8b75ce4be7..9dfaac73b36 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -463,7 +463,7 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) {
int bit;
- bit = find_next_bit (dlm->recovery_map, O2NM_MAX_NODES+1, 0);
+ bit = find_next_bit (dlm->recovery_map, O2NM_MAX_NODES, 0);
if (bit >= O2NM_MAX_NODES || bit < 0)
dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM);
else
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 6a13ea64c44..2b10b36d157 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -724,28 +724,55 @@ leave:
return status;
}
+/*
+ * While a write will already be ordering the data, a truncate will not.
+ * Thus, we need to explicitly order the zeroed pages.
+ */
+static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode)
+{
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ handle_t *handle = NULL;
+ int ret = 0;
+
+ if (!ocfs2_should_order_data(inode))
+ goto out;
+
+ handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+ if (IS_ERR(handle)) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_jbd2_file_inode(handle, inode);
+ if (ret < 0)
+ mlog_errno(ret);
+
+out:
+ if (ret) {
+ if (!IS_ERR(handle))
+ ocfs2_commit_trans(osb, handle);
+ handle = ERR_PTR(ret);
+ }
+ return handle;
+}
+
/* Some parts of this taken from generic_cont_expand, which turned out
* to be too fragile to do exactly what we need without us having to
* worry about recursive locking in ->write_begin() and ->write_end(). */
-static int ocfs2_write_zero_page(struct inode *inode,
- u64 size)
+static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
+ u64 abs_to)
{
struct address_space *mapping = inode->i_mapping;
struct page *page;
- unsigned long index;
- unsigned int offset;
+ unsigned long index = abs_from >> PAGE_CACHE_SHIFT;
handle_t *handle = NULL;
- int ret;
+ int ret = 0;
+ unsigned zero_from, zero_to, block_start, block_end;
- offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */
- /* ugh. in prepare/commit_write, if from==to==start of block, we
- ** skip the prepare. make sure we never send an offset for the start
- ** of a block
- */
- if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
- offset++;
- }
- index = size >> PAGE_CACHE_SHIFT;
+ BUG_ON(abs_from >= abs_to);
+ BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT));
+ BUG_ON(abs_from & (inode->i_blkbits - 1));
page = grab_cache_page(mapping, index);
if (!page) {
@@ -754,31 +781,56 @@ static int ocfs2_write_zero_page(struct inode *inode,
goto out;
}
- ret = ocfs2_prepare_write_nolock(inode, page, offset, offset);
- if (ret < 0) {
- mlog_errno(ret);
- goto out_unlock;
- }
+ /* Get the offsets within the page that we want to zero */
+ zero_from = abs_from & (PAGE_CACHE_SIZE - 1);
+ zero_to = abs_to & (PAGE_CACHE_SIZE - 1);
+ if (!zero_to)
+ zero_to = PAGE_CACHE_SIZE;
- if (ocfs2_should_order_data(inode)) {
- handle = ocfs2_start_walk_page_trans(inode, page, offset,
- offset);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- handle = NULL;
+ mlog(0,
+ "abs_from = %llu, abs_to = %llu, index = %lu, zero_from = %u, zero_to = %u\n",
+ (unsigned long long)abs_from, (unsigned long long)abs_to,
+ index, zero_from, zero_to);
+
+ /* We know that zero_from is block aligned */
+ for (block_start = zero_from; block_start < zero_to;
+ block_start = block_end) {
+ block_end = block_start + (1 << inode->i_blkbits);
+
+ /*
+ * block_start is block-aligned. Bump it by one to
+ * force ocfs2_{prepare,commit}_write() to zero the
+ * whole block.
+ */
+ ret = ocfs2_prepare_write_nolock(inode, page,
+ block_start + 1,
+ block_start + 1);
+ if (ret < 0) {
+ mlog_errno(ret);
goto out_unlock;
}
- }
- /* must not update i_size! */
- ret = block_commit_write(page, offset, offset);
- if (ret < 0)
- mlog_errno(ret);
- else
- ret = 0;
+ if (!handle) {
+ handle = ocfs2_zero_start_ordered_transaction(inode);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ handle = NULL;
+ break;
+ }
+ }
+
+ /* must not update i_size! */
+ ret = block_commit_write(page, block_start + 1,
+ block_start + 1);
+ if (ret < 0)
+ mlog_errno(ret);
+ else
+ ret = 0;
+ }
if (handle)
ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
+
out_unlock:
unlock_page(page);
page_cache_release(page);
@@ -786,22 +838,114 @@ out:
return ret;
}
-static int ocfs2_zero_extend(struct inode *inode,
- u64 zero_to_size)
+/*
+ * Find the next range to zero. We do this in terms of bytes because
+ * that's what ocfs2_zero_extend() wants, and it is dealing with the
+ * pagecache. We may return multiple extents.
+ *
+ * zero_start and zero_end are ocfs2_zero_extend()'s current idea of what
+ * needs to be zeroed. range_start and range_end return the next zeroing
+ * range. A subsequent call should pass the previous range_end as its
+ * zero_start. If range_end is 0, there's nothing to do.
+ *
+ * Unwritten extents are skipped over. Refcounted extents are CoWd.
+ */
+static int ocfs2_zero_extend_get_range(struct inode *inode,
+ struct buffer_head *di_bh,
+ u64 zero_start, u64 zero_end,
+ u64 *range_start, u64 *range_end)
{
- int ret = 0;
- u64 start_off;
- struct super_block *sb = inode->i_sb;
+ int rc = 0, needs_cow = 0;
+ u32 p_cpos, zero_clusters = 0;
+ u32 zero_cpos =
+ zero_start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
+ u32 last_cpos = ocfs2_clusters_for_bytes(inode->i_sb, zero_end);
+ unsigned int num_clusters = 0;
+ unsigned int ext_flags = 0;
- start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
- while (start_off < zero_to_size) {
- ret = ocfs2_write_zero_page(inode, start_off);
- if (ret < 0) {
- mlog_errno(ret);
+ while (zero_cpos < last_cpos) {
+ rc = ocfs2_get_clusters(inode, zero_cpos, &p_cpos,
+ &num_clusters, &ext_flags);
+ if (rc) {
+ mlog_errno(rc);
+ goto out;
+ }
+
+ if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) {
+ zero_clusters = num_clusters;
+ if (ext_flags & OCFS2_EXT_REFCOUNTED)
+ needs_cow = 1;
+ break;
+ }
+
+ zero_cpos += num_clusters;
+ }
+ if (!zero_clusters) {
+ *range_end = 0;
+ goto out;
+ }
+
+ while ((zero_cpos + zero_clusters) < last_cpos) {
+ rc = ocfs2_get_clusters(inode, zero_cpos + zero_clusters,
+ &p_cpos, &num_clusters,
+ &ext_flags);
+ if (rc) {
+ mlog_errno(rc);
goto out;
}
- start_off += sb->s_blocksize;
+ if (!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN))
+ break;
+ if (ext_flags & OCFS2_EXT_REFCOUNTED)
+ needs_cow = 1;
+ zero_clusters += num_clusters;
+ }
+ if ((zero_cpos + zero_clusters) > last_cpos)
+ zero_clusters = last_cpos - zero_cpos;
+
+ if (needs_cow) {
+ rc = ocfs2_refcount_cow(inode, di_bh, zero_cpos, zero_clusters,
+ UINT_MAX);
+ if (rc) {
+ mlog_errno(rc);
+ goto out;
+ }
+ }
+
+ *range_start = ocfs2_clusters_to_bytes(inode->i_sb, zero_cpos);
+ *range_end = ocfs2_clusters_to_bytes(inode->i_sb,
+ zero_cpos + zero_clusters);
+
+out:
+ return rc;
+}
+
+/*
+ * Zero one range returned from ocfs2_zero_extend_get_range(). The caller
+ * has made sure that the entire range needs zeroing.
+ */
+static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start,
+ u64 range_end)
+{
+ int rc = 0;
+ u64 next_pos;
+ u64 zero_pos = range_start;
+
+ mlog(0, "range_start = %llu, range_end = %llu\n",
+ (unsigned long long)range_start,
+ (unsigned long long)range_end);
+ BUG_ON(range_start >= range_end);
+
+ while (zero_pos < range_end) {
+ next_pos = (zero_pos & PAGE_CACHE_MASK) + PAGE_CACHE_SIZE;
+ if (next_pos > range_end)
+ next_pos = range_end;
+ rc = ocfs2_write_zero_page(inode, zero_pos, next_pos);
+ if (rc < 0) {
+ mlog_errno(rc);
+ break;
+ }
+ zero_pos = next_pos;
/*
* Very large extends have the potential to lock up
@@ -810,16 +954,63 @@ static int ocfs2_zero_extend(struct inode *inode,
cond_resched();
}
-out:
+ return rc;
+}
+
+int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
+ loff_t zero_to_size)
+{
+ int ret = 0;
+ u64 zero_start, range_start = 0, range_end = 0;
+ struct super_block *sb = inode->i_sb;
+
+ zero_start = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
+ mlog(0, "zero_start %llu for i_size %llu\n",
+ (unsigned long long)zero_start,
+ (unsigned long long)i_size_read(inode));
+ while (zero_start < zero_to_size) {
+ ret = ocfs2_zero_extend_get_range(inode, di_bh, zero_start,
+ zero_to_size,
+ &range_start,
+ &range_end);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+ if (!range_end)
+ break;
+ /* Trim the ends */
+ if (range_start < zero_start)
+ range_start = zero_start;
+ if (range_end > zero_to_size)
+ range_end = zero_to_size;
+
+ ret = ocfs2_zero_extend_range(inode, range_start,
+ range_end);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+ zero_start = range_end;
+ }
+
return ret;
}
-int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, u64 zero_to)
+int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh,
+ u64 new_i_size, u64 zero_to)
{
int ret;
u32 clusters_to_add;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ /*
+ * Only quota files call this without a bh, and they can't be
+ * refcounted.
+ */
+ BUG_ON(!di_bh && (oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
+ BUG_ON(!di_bh && !(oi->ip_flags & OCFS2_INODE_SYSTEM_FILE));
+
clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size);
if (clusters_to_add < oi->ip_clusters)
clusters_to_add = 0;
@@ -840,7 +1031,7 @@ int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, u64 zero_to)
* still need to zero the area between the old i_size and the
* new i_size.
*/
- ret = ocfs2_zero_extend(inode, zero_to);
+ ret = ocfs2_zero_extend(inode, di_bh, zero_to);
if (ret < 0)
mlog_errno(ret);
@@ -862,27 +1053,15 @@ static int ocfs2_extend_file(struct inode *inode,
goto out;
if (i_size_read(inode) == new_i_size)
- goto out;
+ goto out;
BUG_ON(new_i_size < i_size_read(inode));
/*
- * Fall through for converting inline data, even if the fs
- * supports sparse files.
- *
- * The check for inline data here is legal - nobody can add
- * the feature since we have i_mutex. We must check it again
- * after acquiring ip_alloc_sem though, as paths like mmap
- * might have raced us to converting the inode to extents.
- */
- if (!(oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
- && ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
- goto out_update_size;
-
- /*
* The alloc sem blocks people in read/write from reading our
* allocation until we're done changing it. We depend on
* i_mutex to block other extend/truncate calls while we're
- * here.
+ * here. We even have to hold it for sparse files because there
+ * might be some tail zeroing.
*/
down_write(&oi->ip_alloc_sem);
@@ -899,14 +1078,16 @@ static int ocfs2_extend_file(struct inode *inode,
ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
if (ret) {
up_write(&oi->ip_alloc_sem);
-
mlog_errno(ret);
goto out;
}
}
- if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
- ret = ocfs2_extend_no_holes(inode, new_i_size, new_i_size);
+ if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
+ ret = ocfs2_zero_extend(inode, di_bh, new_i_size);
+ else
+ ret = ocfs2_extend_no_holes(inode, di_bh, new_i_size,
+ new_i_size);
up_write(&oi->ip_alloc_sem);
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index d66cf4f7c70..97bf761c9e7 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -54,8 +54,10 @@ int ocfs2_add_inode_data(struct ocfs2_super *osb,
int ocfs2_simple_size_update(struct inode *inode,
struct buffer_head *di_bh,
u64 new_i_size);
-int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size,
- u64 zero_to);
+int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh,
+ u64 new_i_size, u64 zero_to);
+int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
+ loff_t zero_to);
int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat);
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 47878cf1641..625de9d7088 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -472,7 +472,7 @@ static inline struct ocfs2_triggers *to_ocfs2_trigger(struct jbd2_buffer_trigger
return container_of(triggers, struct ocfs2_triggers, ot_triggers);
}
-static void ocfs2_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
+static void ocfs2_frozen_trigger(struct jbd2_buffer_trigger_type *triggers,
struct buffer_head *bh,
void *data, size_t size)
{
@@ -491,7 +491,7 @@ static void ocfs2_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
* Quota blocks have their own trigger because the struct ocfs2_block_check
* offset depends on the blocksize.
*/
-static void ocfs2_dq_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
+static void ocfs2_dq_frozen_trigger(struct jbd2_buffer_trigger_type *triggers,
struct buffer_head *bh,
void *data, size_t size)
{
@@ -511,7 +511,7 @@ static void ocfs2_dq_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
* Directory blocks also have their own trigger because the
* struct ocfs2_block_check offset depends on the blocksize.
*/
-static void ocfs2_db_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
+static void ocfs2_db_frozen_trigger(struct jbd2_buffer_trigger_type *triggers,
struct buffer_head *bh,
void *data, size_t size)
{
@@ -544,7 +544,7 @@ static void ocfs2_abort_trigger(struct jbd2_buffer_trigger_type *triggers,
static struct ocfs2_triggers di_triggers = {
.ot_triggers = {
- .t_commit = ocfs2_commit_trigger,
+ .t_frozen = ocfs2_frozen_trigger,
.t_abort = ocfs2_abort_trigger,
},
.ot_offset = offsetof(struct ocfs2_dinode, i_check),
@@ -552,7 +552,7 @@ static struct ocfs2_triggers di_triggers = {
static struct ocfs2_triggers eb_triggers = {
.ot_triggers = {
- .t_commit = ocfs2_commit_trigger,
+ .t_frozen = ocfs2_frozen_trigger,
.t_abort = ocfs2_abort_trigger,
},
.ot_offset = offsetof(struct ocfs2_extent_block, h_check),
@@ -560,7 +560,7 @@ static struct ocfs2_triggers eb_triggers = {
static struct ocfs2_triggers rb_triggers = {
.ot_triggers = {
- .t_commit = ocfs2_commit_trigger,
+ .t_frozen = ocfs2_frozen_trigger,
.t_abort = ocfs2_abort_trigger,
},
.ot_offset = offsetof(struct ocfs2_refcount_block, rf_check),
@@ -568,7 +568,7 @@ static struct ocfs2_triggers rb_triggers = {
static struct ocfs2_triggers gd_triggers = {
.ot_triggers = {
- .t_commit = ocfs2_commit_trigger,
+ .t_frozen = ocfs2_frozen_trigger,
.t_abort = ocfs2_abort_trigger,
},
.ot_offset = offsetof(struct ocfs2_group_desc, bg_check),
@@ -576,14 +576,14 @@ static struct ocfs2_triggers gd_triggers = {
static struct ocfs2_triggers db_triggers = {
.ot_triggers = {
- .t_commit = ocfs2_db_commit_trigger,
+ .t_frozen = ocfs2_db_frozen_trigger,
.t_abort = ocfs2_abort_trigger,
},
};
static struct ocfs2_triggers xb_triggers = {
.ot_triggers = {
- .t_commit = ocfs2_commit_trigger,
+ .t_frozen = ocfs2_frozen_trigger,
.t_abort = ocfs2_abort_trigger,
},
.ot_offset = offsetof(struct ocfs2_xattr_block, xb_check),
@@ -591,14 +591,14 @@ static struct ocfs2_triggers xb_triggers = {
static struct ocfs2_triggers dq_triggers = {
.ot_triggers = {
- .t_commit = ocfs2_dq_commit_trigger,
+ .t_frozen = ocfs2_dq_frozen_trigger,
.t_abort = ocfs2_abort_trigger,
},
};
static struct ocfs2_triggers dr_triggers = {
.ot_triggers = {
- .t_commit = ocfs2_commit_trigger,
+ .t_frozen = ocfs2_frozen_trigger,
.t_abort = ocfs2_abort_trigger,
},
.ot_offset = offsetof(struct ocfs2_dx_root_block, dr_check),
@@ -606,7 +606,7 @@ static struct ocfs2_triggers dr_triggers = {
static struct ocfs2_triggers dl_triggers = {
.ot_triggers = {
- .t_commit = ocfs2_commit_trigger,
+ .t_frozen = ocfs2_frozen_trigger,
.t_abort = ocfs2_abort_trigger,
},
.ot_offset = offsetof(struct ocfs2_dx_leaf, dl_check),
@@ -1936,7 +1936,7 @@ void ocfs2_orphan_scan_work(struct work_struct *work)
mutex_lock(&os->os_lock);
ocfs2_queue_orphan_scan(osb);
if (atomic_read(&os->os_state) == ORPHAN_SCAN_ACTIVE)
- schedule_delayed_work(&os->os_orphan_scan_work,
+ queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work,
ocfs2_orphan_scan_timeout());
mutex_unlock(&os->os_lock);
}
@@ -1976,8 +1976,8 @@ void ocfs2_orphan_scan_start(struct ocfs2_super *osb)
atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE);
else {
atomic_set(&os->os_state, ORPHAN_SCAN_ACTIVE);
- schedule_delayed_work(&os->os_orphan_scan_work,
- ocfs2_orphan_scan_timeout());
+ queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work,
+ ocfs2_orphan_scan_timeout());
}
}
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 3d7419682dc..ec6adbf8f55 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -118,6 +118,7 @@ unsigned int ocfs2_la_default_mb(struct ocfs2_super *osb)
{
unsigned int la_mb;
unsigned int gd_mb;
+ unsigned int la_max_mb;
unsigned int megs_per_slot;
struct super_block *sb = osb->sb;
@@ -182,6 +183,12 @@ unsigned int ocfs2_la_default_mb(struct ocfs2_super *osb)
if (megs_per_slot < la_mb)
la_mb = megs_per_slot;
+ /* We can't store more bits than fit in a block. */
+ la_max_mb = ocfs2_clusters_to_megabytes(osb->sb,
+ ocfs2_local_alloc_size(sb) * 8);
+ if (la_mb > la_max_mb)
+ la_mb = la_max_mb;
+
return la_mb;
}
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index 2bb35fe0051..4607923eb24 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -775,7 +775,7 @@ static int ocfs2_acquire_dquot(struct dquot *dquot)
* locking allocators ranks above a transaction start
*/
WARN_ON(journal_current_handle());
- status = ocfs2_extend_no_holes(gqinode,
+ status = ocfs2_extend_no_holes(gqinode, NULL,
gqinode->i_size + (need_alloc << sb->s_blocksize_bits),
gqinode->i_size);
if (status < 0)
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index 8bd70d4d184..dc78764ccc4 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -971,7 +971,7 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
u64 p_blkno;
/* We are protected by dqio_sem so no locking needed */
- status = ocfs2_extend_no_holes(lqinode,
+ status = ocfs2_extend_no_holes(lqinode, NULL,
lqinode->i_size + 2 * sb->s_blocksize,
lqinode->i_size);
if (status < 0) {
@@ -1114,7 +1114,7 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
return ocfs2_local_quota_add_chunk(sb, type, offset);
/* We are protected by dqio_sem so no locking needed */
- status = ocfs2_extend_no_holes(lqinode,
+ status = ocfs2_extend_no_holes(lqinode, NULL,
lqinode->i_size + sb->s_blocksize,
lqinode->i_size);
if (status < 0) {
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 4793f36f651..3ac5aa733e9 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -2931,6 +2931,12 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits);
+ /*
+ * We only duplicate pages up to the page that contains i_size - 1,
+ * so trim 'end' to i_size.
+ */
+ if (end > i_size_read(context->inode))
+ end = i_size_read(context->inode);
while (offset < end) {
page_index = offset >> PAGE_CACHE_SHIFT;
@@ -4166,6 +4172,12 @@ static int __ocfs2_reflink(struct dentry *old_dentry,
struct inode *inode = old_dentry->d_inode;
struct buffer_head *new_bh = NULL;
+ if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE) {
+ ret = -EINVAL;
+ mlog_errno(ret);
+ goto out;
+ }
+
ret = filemap_fdatawrite(inode->i_mapping);
if (ret) {
mlog_errno(ret);
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index f4c2a9eb8c4..a8e6a95a353 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -741,7 +741,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
le16_to_cpu(bg->bg_free_bits_count));
le32_add_cpu(&cl->cl_recs[alloc_rec].c_total,
le16_to_cpu(bg->bg_bits));
- cl->cl_recs[alloc_rec].c_blkno = cpu_to_le64(bg->bg_blkno);
+ cl->cl_recs[alloc_rec].c_blkno = bg->bg_blkno;
if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count))
le16_add_cpu(&cl->cl_next_free_rec, 1);
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index e97b34842cf..d03469f6180 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -709,7 +709,7 @@ static int ocfs2_xattr_extend_allocation(struct inode *inode,
struct ocfs2_xattr_value_buf *vb,
struct ocfs2_xattr_set_ctxt *ctxt)
{
- int status = 0;
+ int status = 0, credits;
handle_t *handle = ctxt->handle;
enum ocfs2_alloc_restarted why;
u32 prev_clusters, logical_start = le32_to_cpu(vb->vb_xv->xr_clusters);
@@ -719,38 +719,54 @@ static int ocfs2_xattr_extend_allocation(struct inode *inode,
ocfs2_init_xattr_value_extent_tree(&et, INODE_CACHE(inode), vb);
- status = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
+ while (clusters_to_add) {
+ status = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ mlog_errno(status);
+ break;
+ }
- prev_clusters = le32_to_cpu(vb->vb_xv->xr_clusters);
- status = ocfs2_add_clusters_in_btree(handle,
- &et,
- &logical_start,
- clusters_to_add,
- 0,
- ctxt->data_ac,
- ctxt->meta_ac,
- &why);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
+ prev_clusters = le32_to_cpu(vb->vb_xv->xr_clusters);
+ status = ocfs2_add_clusters_in_btree(handle,
+ &et,
+ &logical_start,
+ clusters_to_add,
+ 0,
+ ctxt->data_ac,
+ ctxt->meta_ac,
+ &why);
+ if ((status < 0) && (status != -EAGAIN)) {
+ if (status != -ENOSPC)
+ mlog_errno(status);
+ break;
+ }
- ocfs2_journal_dirty(handle, vb->vb_bh);
+ ocfs2_journal_dirty(handle, vb->vb_bh);
- clusters_to_add -= le32_to_cpu(vb->vb_xv->xr_clusters) - prev_clusters;
+ clusters_to_add -= le32_to_cpu(vb->vb_xv->xr_clusters) -
+ prev_clusters;
- /*
- * We should have already allocated enough space before the transaction,
- * so no need to restart.
- */
- BUG_ON(why != RESTART_NONE || clusters_to_add);
-
-leave:
+ if (why != RESTART_NONE && clusters_to_add) {
+ /*
+ * We can only fail in case the alloc file doesn't give
+ * up enough clusters.
+ */
+ BUG_ON(why == RESTART_META);
+
+ mlog(0, "restarting xattr value extension for %u"
+ " clusters,.\n", clusters_to_add);
+ credits = ocfs2_calc_extend_credits(inode->i_sb,
+ &vb->vb_xv->xr_list,
+ clusters_to_add);
+ status = ocfs2_extend_trans(handle, credits);
+ if (status < 0) {
+ status = -ENOMEM;
+ mlog_errno(status);
+ break;
+ }
+ }
+ }
return status;
}
@@ -6788,16 +6804,15 @@ out:
return ret;
}
-static int ocfs2_reflink_xattr_buckets(handle_t *handle,
+static int ocfs2_reflink_xattr_bucket(handle_t *handle,
u64 blkno, u64 new_blkno, u32 clusters,
+ u32 *cpos, int num_buckets,
struct ocfs2_alloc_context *meta_ac,
struct ocfs2_alloc_context *data_ac,
struct ocfs2_reflink_xattr_tree_args *args)
{
int i, j, ret = 0;
struct super_block *sb = args->reflink->old_inode->i_sb;
- u32 bpc = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(sb));
- u32 num_buckets = clusters * bpc;
int bpb = args->old_bucket->bu_blocks;
struct ocfs2_xattr_value_buf vb = {
.vb_access = ocfs2_journal_access,
@@ -6816,14 +6831,6 @@ static int ocfs2_reflink_xattr_buckets(handle_t *handle,
break;
}
- /*
- * The real bucket num in this series of blocks is stored
- * in the 1st bucket.
- */
- if (i == 0)
- num_buckets = le16_to_cpu(
- bucket_xh(args->old_bucket)->xh_num_buckets);
-
ret = ocfs2_xattr_bucket_journal_access(handle,
args->new_bucket,
OCFS2_JOURNAL_ACCESS_CREATE);
@@ -6837,6 +6844,18 @@ static int ocfs2_reflink_xattr_buckets(handle_t *handle,
bucket_block(args->old_bucket, j),
sb->s_blocksize);
+ /*
+ * Record the start cpos so that we can use it to initialize
+ * our xattr tree. We also set xh_num_buckets for the new
+ * bucket.
+ */
+ if (i == 0) {
+ *cpos = le32_to_cpu(bucket_xh(args->new_bucket)->
+ xh_entries[0].xe_name_hash);
+ bucket_xh(args->new_bucket)->xh_num_buckets =
+ cpu_to_le16(num_buckets);
+ }
+
ocfs2_xattr_bucket_journal_dirty(handle, args->new_bucket);
ret = ocfs2_reflink_xattr_header(handle, args->reflink,
@@ -6866,6 +6885,7 @@ static int ocfs2_reflink_xattr_buckets(handle_t *handle,
}
ocfs2_xattr_bucket_journal_dirty(handle, args->new_bucket);
+
ocfs2_xattr_bucket_relse(args->old_bucket);
ocfs2_xattr_bucket_relse(args->new_bucket);
}
@@ -6874,6 +6894,75 @@ static int ocfs2_reflink_xattr_buckets(handle_t *handle,
ocfs2_xattr_bucket_relse(args->new_bucket);
return ret;
}
+
+static int ocfs2_reflink_xattr_buckets(handle_t *handle,
+ struct inode *inode,
+ struct ocfs2_reflink_xattr_tree_args *args,
+ struct ocfs2_extent_tree *et,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_alloc_context *data_ac,
+ u64 blkno, u32 cpos, u32 len)
+{
+ int ret, first_inserted = 0;
+ u32 p_cluster, num_clusters, reflink_cpos = 0;
+ u64 new_blkno;
+ unsigned int num_buckets, reflink_buckets;
+ unsigned int bpc =
+ ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb));
+
+ ret = ocfs2_read_xattr_bucket(args->old_bucket, blkno);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ num_buckets = le16_to_cpu(bucket_xh(args->old_bucket)->xh_num_buckets);
+ ocfs2_xattr_bucket_relse(args->old_bucket);
+
+ while (len && num_buckets) {
+ ret = ocfs2_claim_clusters(handle, data_ac,
+ 1, &p_cluster, &num_clusters);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ new_blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
+ reflink_buckets = min(num_buckets, bpc * num_clusters);
+
+ ret = ocfs2_reflink_xattr_bucket(handle, blkno,
+ new_blkno, num_clusters,
+ &reflink_cpos, reflink_buckets,
+ meta_ac, data_ac, args);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * For the 1st allocated cluster, we make it use the same cpos
+ * so that the xattr tree looks the same as the original one
+ * in most cases.
+ */
+ if (!first_inserted) {
+ reflink_cpos = cpos;
+ first_inserted = 1;
+ }
+ ret = ocfs2_insert_extent(handle, et, reflink_cpos, new_blkno,
+ num_clusters, 0, meta_ac);
+ if (ret)
+ mlog_errno(ret);
+
+ mlog(0, "insert new xattr extent rec start %llu len %u to %u\n",
+ (unsigned long long)new_blkno, num_clusters, reflink_cpos);
+
+ len -= num_clusters;
+ blkno += ocfs2_clusters_to_blocks(inode->i_sb, num_clusters);
+ num_buckets -= reflink_buckets;
+ }
+out:
+ return ret;
+}
+
/*
* Create the same xattr extent record in the new inode's xattr tree.
*/
@@ -6885,8 +6974,6 @@ static int ocfs2_reflink_xattr_rec(struct inode *inode,
void *para)
{
int ret, credits = 0;
- u32 p_cluster, num_clusters;
- u64 new_blkno;
handle_t *handle;
struct ocfs2_reflink_xattr_tree_args *args =
(struct ocfs2_reflink_xattr_tree_args *)para;
@@ -6895,6 +6982,9 @@ static int ocfs2_reflink_xattr_rec(struct inode *inode,
struct ocfs2_alloc_context *data_ac = NULL;
struct ocfs2_extent_tree et;
+ mlog(0, "reflink xattr buckets %llu len %u\n",
+ (unsigned long long)blkno, len);
+
ocfs2_init_xattr_tree_extent_tree(&et,
INODE_CACHE(args->reflink->new_inode),
args->new_blk_bh);
@@ -6914,32 +7004,12 @@ static int ocfs2_reflink_xattr_rec(struct inode *inode,
goto out;
}
- ret = ocfs2_claim_clusters(handle, data_ac,
- len, &p_cluster, &num_clusters);
- if (ret) {
- mlog_errno(ret);
- goto out_commit;
- }
-
- new_blkno = ocfs2_clusters_to_blocks(osb->sb, p_cluster);
-
- mlog(0, "reflink xattr buckets %llu to %llu, len %u\n",
- (unsigned long long)blkno, (unsigned long long)new_blkno, len);
- ret = ocfs2_reflink_xattr_buckets(handle, blkno, new_blkno, len,
- meta_ac, data_ac, args);
- if (ret) {
- mlog_errno(ret);
- goto out_commit;
- }
-
- mlog(0, "insert new xattr extent rec start %llu len %u to %u\n",
- (unsigned long long)new_blkno, len, cpos);
- ret = ocfs2_insert_extent(handle, &et, cpos, new_blkno,
- len, 0, meta_ac);
+ ret = ocfs2_reflink_xattr_buckets(handle, inode, args, &et,
+ meta_ac, data_ac,
+ blkno, cpos, len);
if (ret)
mlog_errno(ret);
-out_commit:
ocfs2_commit_trans(osb, handle);
out:
diff --git a/fs/partitions/ibm.c b/fs/partitions/ibm.c
index 3e73de5967f..fc8497643fd 100644
--- a/fs/partitions/ibm.c
+++ b/fs/partitions/ibm.c
@@ -74,6 +74,7 @@ int ibm_partition(struct parsed_partitions *state)
} *label;
unsigned char *data;
Sector sect;
+ sector_t labelsect;
res = 0;
blocksize = bdev_logical_block_size(bdev);
@@ -98,10 +99,19 @@ int ibm_partition(struct parsed_partitions *state)
goto out_freeall;
/*
+ * Special case for FBA disks: label sector does not depend on
+ * blocksize.
+ */
+ if ((info->cu_type == 0x6310 && info->dev_type == 0x9336) ||
+ (info->cu_type == 0x3880 && info->dev_type == 0x3370))
+ labelsect = info->label_block;
+ else
+ labelsect = info->label_block * (blocksize >> 9);
+
+ /*
* Get volume label, extract name and type.
*/
- data = read_part_sector(state, info->label_block*(blocksize/512),
- &sect);
+ data = read_part_sector(state, labelsect, &sect);
if (data == NULL)
goto out_readerr;
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 12c233da1b6..437d2ca2de9 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -676,7 +676,7 @@ static void prune_dqcache(int count)
* This is called from kswapd when we think we need some
* more memory
*/
-static int shrink_dqcache_memory(int nr, gfp_t gfp_mask)
+static int shrink_dqcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
{
if (nr) {
spin_lock(&dq_list_lock);
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index f71246bebfe..a7ac78f8e67 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -28,6 +28,7 @@ static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target,
struct sysfs_dirent *target_sd = NULL;
struct sysfs_dirent *sd = NULL;
struct sysfs_addrm_cxt acxt;
+ enum kobj_ns_type ns_type;
int error;
BUG_ON(!name);
@@ -58,16 +59,29 @@ static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target,
if (!sd)
goto out_put;
- if (sysfs_ns_type(parent_sd))
+ ns_type = sysfs_ns_type(parent_sd);
+ if (ns_type)
sd->s_ns = target->ktype->namespace(target);
sd->s_symlink.target_sd = target_sd;
target_sd = NULL; /* reference is now owned by the symlink */
sysfs_addrm_start(&acxt, parent_sd);
- if (warn)
- error = sysfs_add_one(&acxt, sd);
- else
- error = __sysfs_add_one(&acxt, sd);
+ /* Symlinks must be between directories with the same ns_type */
+ if (!ns_type ||
+ (ns_type == sysfs_ns_type(sd->s_symlink.target_sd->s_parent))) {
+ if (warn)
+ error = sysfs_add_one(&acxt, sd);
+ else
+ error = __sysfs_add_one(&acxt, sd);
+ } else {
+ error = -EINVAL;
+ WARN(1, KERN_WARNING
+ "sysfs: symlink across ns_types %s/%s -> %s/%s\n",
+ parent_sd->s_name,
+ sd->s_name,
+ sd->s_symlink.target_sd->s_parent->s_name,
+ sd->s_symlink.target_sd->s_name);
+ }
sysfs_addrm_finish(&acxt);
if (error)
@@ -122,7 +136,7 @@ void sysfs_delete_link(struct kobject *kobj, struct kobject *targ,
{
const void *ns = NULL;
spin_lock(&sysfs_assoc_lock);
- if (targ->sd)
+ if (targ->sd && sysfs_ns_type(kobj->sd))
ns = targ->sd->s_ns;
spin_unlock(&sysfs_assoc_lock);
sysfs_hash_and_remove(kobj->sd, ns, name);
diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c
index 02feb59cefc..0b201114a5a 100644
--- a/fs/ubifs/shrinker.c
+++ b/fs/ubifs/shrinker.c
@@ -277,7 +277,7 @@ static int kick_a_thread(void)
return 0;
}
-int ubifs_shrinker(int nr, gfp_t gfp_mask)
+int ubifs_shrinker(struct shrinker *shrink, int nr, gfp_t gfp_mask)
{
int freed, contention = 0;
long clean_zn_cnt = atomic_long_read(&ubifs_clean_zn_cnt);
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 2eef553d50c..04310878f44 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -1575,7 +1575,7 @@ int ubifs_tnc_start_commit(struct ubifs_info *c, struct ubifs_zbranch *zroot);
int ubifs_tnc_end_commit(struct ubifs_info *c);
/* shrinker.c */
-int ubifs_shrinker(int nr_to_scan, gfp_t gfp_mask);
+int ubifs_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask);
/* commit.c */
int ubifs_bg_thread(void *info);
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 649ade8ef59..2ee3f7a6016 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -45,7 +45,7 @@
static kmem_zone_t *xfs_buf_zone;
STATIC int xfsbufd(void *);
-STATIC int xfsbufd_wakeup(int, gfp_t);
+STATIC int xfsbufd_wakeup(struct shrinker *, int, gfp_t);
STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int);
static struct shrinker xfs_buf_shake = {
.shrink = xfsbufd_wakeup,
@@ -340,7 +340,7 @@ _xfs_buf_lookup_pages(
__func__, gfp_mask);
XFS_STATS_INC(xb_page_retries);
- xfsbufd_wakeup(0, gfp_mask);
+ xfsbufd_wakeup(NULL, 0, gfp_mask);
congestion_wait(BLK_RW_ASYNC, HZ/50);
goto retry;
}
@@ -1762,6 +1762,7 @@ xfs_buf_runall_queues(
STATIC int
xfsbufd_wakeup(
+ struct shrinker *shrink,
int priority,
gfp_t mask)
{
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index f2d1718c916..80938c736c2 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -1883,7 +1883,6 @@ init_xfs_fs(void)
goto out_cleanup_procfs;
vfs_initquota();
- xfs_inode_shrinker_init();
error = register_filesystem(&xfs_fs_type);
if (error)
@@ -1911,7 +1910,6 @@ exit_xfs_fs(void)
{
vfs_exitquota();
unregister_filesystem(&xfs_fs_type);
- xfs_inode_shrinker_destroy();
xfs_sysctl_unregister();
xfs_cleanup_procfs();
xfs_buf_terminate();
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index ef7f0218bcc..a51a07c3a70 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -144,6 +144,41 @@ restart:
return last_error;
}
+/*
+ * Select the next per-ag structure to iterate during the walk. The reclaim
+ * walk is optimised to walk only AGs with reclaimable inodes in them.
+ */
+static struct xfs_perag *
+xfs_inode_ag_iter_next_pag(
+ struct xfs_mount *mp,
+ xfs_agnumber_t *first,
+ int tag)
+{
+ struct xfs_perag *pag = NULL;
+
+ if (tag == XFS_ICI_RECLAIM_TAG) {
+ int found;
+ int ref;
+
+ spin_lock(&mp->m_perag_lock);
+ found = radix_tree_gang_lookup_tag(&mp->m_perag_tree,
+ (void **)&pag, *first, 1, tag);
+ if (found <= 0) {
+ spin_unlock(&mp->m_perag_lock);
+ return NULL;
+ }
+ *first = pag->pag_agno + 1;
+ /* open coded pag reference increment */
+ ref = atomic_inc_return(&pag->pag_ref);
+ spin_unlock(&mp->m_perag_lock);
+ trace_xfs_perag_get_reclaim(mp, pag->pag_agno, ref, _RET_IP_);
+ } else {
+ pag = xfs_perag_get(mp, *first);
+ (*first)++;
+ }
+ return pag;
+}
+
int
xfs_inode_ag_iterator(
struct xfs_mount *mp,
@@ -154,16 +189,15 @@ xfs_inode_ag_iterator(
int exclusive,
int *nr_to_scan)
{
+ struct xfs_perag *pag;
int error = 0;
int last_error = 0;
xfs_agnumber_t ag;
int nr;
nr = nr_to_scan ? *nr_to_scan : INT_MAX;
- for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
- struct xfs_perag *pag;
-
- pag = xfs_perag_get(mp, ag);
+ ag = 0;
+ while ((pag = xfs_inode_ag_iter_next_pag(mp, &ag, tag))) {
error = xfs_inode_ag_walk(mp, pag, execute, flags, tag,
exclusive, &nr);
xfs_perag_put(pag);
@@ -640,6 +674,17 @@ __xfs_inode_set_reclaim_tag(
radix_tree_tag_set(&pag->pag_ici_root,
XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
XFS_ICI_RECLAIM_TAG);
+
+ if (!pag->pag_ici_reclaimable) {
+ /* propagate the reclaim tag up into the perag radix tree */
+ spin_lock(&ip->i_mount->m_perag_lock);
+ radix_tree_tag_set(&ip->i_mount->m_perag_tree,
+ XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
+ XFS_ICI_RECLAIM_TAG);
+ spin_unlock(&ip->i_mount->m_perag_lock);
+ trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno,
+ -1, _RET_IP_);
+ }
pag->pag_ici_reclaimable++;
}
@@ -674,6 +719,16 @@ __xfs_inode_clear_reclaim_tag(
radix_tree_tag_clear(&pag->pag_ici_root,
XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
pag->pag_ici_reclaimable--;
+ if (!pag->pag_ici_reclaimable) {
+ /* clear the reclaim tag from the perag radix tree */
+ spin_lock(&ip->i_mount->m_perag_lock);
+ radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
+ XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
+ XFS_ICI_RECLAIM_TAG);
+ spin_unlock(&ip->i_mount->m_perag_lock);
+ trace_xfs_perag_clear_reclaim(ip->i_mount, pag->pag_agno,
+ -1, _RET_IP_);
+ }
}
/*
@@ -828,83 +883,52 @@ xfs_reclaim_inodes(
/*
* Shrinker infrastructure.
- *
- * This is all far more complex than it needs to be. It adds a global list of
- * mounts because the shrinkers can only call a global context. We need to make
- * the shrinkers pass a context to avoid the need for global state.
*/
-static LIST_HEAD(xfs_mount_list);
-static struct rw_semaphore xfs_mount_list_lock;
-
static int
xfs_reclaim_inode_shrink(
+ struct shrinker *shrink,
int nr_to_scan,
gfp_t gfp_mask)
{
struct xfs_mount *mp;
struct xfs_perag *pag;
xfs_agnumber_t ag;
- int reclaimable = 0;
+ int reclaimable;
+ mp = container_of(shrink, struct xfs_mount, m_inode_shrink);
if (nr_to_scan) {
if (!(gfp_mask & __GFP_FS))
return -1;
- down_read(&xfs_mount_list_lock);
- list_for_each_entry(mp, &xfs_mount_list, m_mplist) {
- xfs_inode_ag_iterator(mp, xfs_reclaim_inode, 0,
+ xfs_inode_ag_iterator(mp, xfs_reclaim_inode, 0,
XFS_ICI_RECLAIM_TAG, 1, &nr_to_scan);
- if (nr_to_scan <= 0)
- break;
- }
- up_read(&xfs_mount_list_lock);
- }
+ /* if we don't exhaust the scan, don't bother coming back */
+ if (nr_to_scan > 0)
+ return -1;
+ }
- down_read(&xfs_mount_list_lock);
- list_for_each_entry(mp, &xfs_mount_list, m_mplist) {
- for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
- pag = xfs_perag_get(mp, ag);
- reclaimable += pag->pag_ici_reclaimable;
- xfs_perag_put(pag);
- }
+ reclaimable = 0;
+ ag = 0;
+ while ((pag = xfs_inode_ag_iter_next_pag(mp, &ag,
+ XFS_ICI_RECLAIM_TAG))) {
+ reclaimable += pag->pag_ici_reclaimable;
+ xfs_perag_put(pag);
}
- up_read(&xfs_mount_list_lock);
return reclaimable;
}
-static struct shrinker xfs_inode_shrinker = {
- .shrink = xfs_reclaim_inode_shrink,
- .seeks = DEFAULT_SEEKS,
-};
-
-void __init
-xfs_inode_shrinker_init(void)
-{
- init_rwsem(&xfs_mount_list_lock);
- register_shrinker(&xfs_inode_shrinker);
-}
-
-void
-xfs_inode_shrinker_destroy(void)
-{
- ASSERT(list_empty(&xfs_mount_list));
- unregister_shrinker(&xfs_inode_shrinker);
-}
-
void
xfs_inode_shrinker_register(
struct xfs_mount *mp)
{
- down_write(&xfs_mount_list_lock);
- list_add_tail(&mp->m_mplist, &xfs_mount_list);
- up_write(&xfs_mount_list_lock);
+ mp->m_inode_shrink.shrink = xfs_reclaim_inode_shrink;
+ mp->m_inode_shrink.seeks = DEFAULT_SEEKS;
+ register_shrinker(&mp->m_inode_shrink);
}
void
xfs_inode_shrinker_unregister(
struct xfs_mount *mp)
{
- down_write(&xfs_mount_list_lock);
- list_del(&mp->m_mplist);
- up_write(&xfs_mount_list_lock);
+ unregister_shrinker(&mp->m_inode_shrink);
}
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index cdcbaaca988..e28139aaa4a 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -55,8 +55,6 @@ int xfs_inode_ag_iterator(struct xfs_mount *mp,
int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags),
int flags, int tag, int write_lock, int *nr_to_scan);
-void xfs_inode_shrinker_init(void);
-void xfs_inode_shrinker_destroy(void);
void xfs_inode_shrinker_register(struct xfs_mount *mp);
void xfs_inode_shrinker_unregister(struct xfs_mount *mp);
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index 73d5aa11738..30282069090 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -124,7 +124,10 @@ DEFINE_EVENT(xfs_perag_class, name, \
unsigned long caller_ip), \
TP_ARGS(mp, agno, refcount, caller_ip))
DEFINE_PERAG_REF_EVENT(xfs_perag_get);
+DEFINE_PERAG_REF_EVENT(xfs_perag_get_reclaim);
DEFINE_PERAG_REF_EVENT(xfs_perag_put);
+DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim);
+DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim);
TRACE_EVENT(xfs_attr_list_node_descend,
TP_PROTO(struct xfs_attr_list_context *ctx,
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 8c117ff2e3a..67c018392d6 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -69,7 +69,7 @@ STATIC void xfs_qm_list_destroy(xfs_dqlist_t *);
STATIC int xfs_qm_init_quotainos(xfs_mount_t *);
STATIC int xfs_qm_init_quotainfo(xfs_mount_t *);
-STATIC int xfs_qm_shake(int, gfp_t);
+STATIC int xfs_qm_shake(struct shrinker *, int, gfp_t);
static struct shrinker xfs_qm_shaker = {
.shrink = xfs_qm_shake,
@@ -2117,7 +2117,10 @@ xfs_qm_shake_freelist(
*/
/* ARGSUSED */
STATIC int
-xfs_qm_shake(int nr_to_scan, gfp_t gfp_mask)
+xfs_qm_shake(
+ struct shrinker *shrink,
+ int nr_to_scan,
+ gfp_t gfp_mask)
{
int ndqused, nfree, n;
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 1d2c7eed4ed..5761087ee8e 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -259,7 +259,7 @@ typedef struct xfs_mount {
wait_queue_head_t m_wait_single_sync_task;
__int64_t m_update_flags; /* sb flags we need to update
on the next remount,rw */
- struct list_head m_mplist; /* inode shrinker mount list */
+ struct shrinker m_inode_shrink; /* inode reclaim shrinker */
} xfs_mount_t;
/*