diff options
Diffstat (limited to 'fs')
58 files changed, 1014 insertions, 430 deletions
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index d61e3b28ce3..36d961f342a 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -146,7 +146,7 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) while (rdir->head < rdir->tail) { p9stat_init(&st); err = p9stat_read(rdir->buf + rdir->head, - buflen - rdir->head, &st, + rdir->tail - rdir->head, &st, fid->clnt->proto_version); if (err) { P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err); diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 0d1d966b0fe..c3df14ce2cc 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2304,12 +2304,17 @@ noinline int btrfs_leaf_free_space(struct btrfs_root *root, return ret; } +/* + * min slot controls the lowest index we're willing to push to the + * right. We'll push up to and including min_slot, but no lower + */ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int data_size, int empty, struct extent_buffer *right, - int free_space, u32 left_nritems) + int free_space, u32 left_nritems, + u32 min_slot) { struct extent_buffer *left = path->nodes[0]; struct extent_buffer *upper = path->nodes[1]; @@ -2327,7 +2332,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, if (empty) nr = 0; else - nr = 1; + nr = max_t(u32, 1, min_slot); if (path->slots[0] >= left_nritems) push_space += data_size; @@ -2469,10 +2474,14 @@ out_unlock: * * returns 1 if the push failed because the other node didn't have enough * room, 0 if everything worked out and < 0 if there were major errors. + * + * this will push starting from min_slot to the end of the leaf. It won't + * push any slot lower than min_slot */ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_path *path, int data_size, - int empty) + *root, struct btrfs_path *path, + int min_data_size, int data_size, + int empty, u32 min_slot) { struct extent_buffer *left = path->nodes[0]; struct extent_buffer *right; @@ -2514,8 +2523,8 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root if (left_nritems == 0) goto out_unlock; - return __push_leaf_right(trans, root, path, data_size, empty, - right, free_space, left_nritems); + return __push_leaf_right(trans, root, path, min_data_size, empty, + right, free_space, left_nritems, min_slot); out_unlock: btrfs_tree_unlock(right); free_extent_buffer(right); @@ -2525,12 +2534,17 @@ out_unlock: /* * push some data in the path leaf to the left, trying to free up at * least data_size bytes. returns zero if the push worked, nonzero otherwise + * + * max_slot can put a limit on how far into the leaf we'll push items. The + * item at 'max_slot' won't be touched. Use (u32)-1 to make us do all the + * items */ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int data_size, int empty, struct extent_buffer *left, - int free_space, int right_nritems) + int free_space, u32 right_nritems, + u32 max_slot) { struct btrfs_disk_key disk_key; struct extent_buffer *right = path->nodes[0]; @@ -2549,9 +2563,9 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, slot = path->slots[1]; if (empty) - nr = right_nritems; + nr = min(right_nritems, max_slot); else - nr = right_nritems - 1; + nr = min(right_nritems - 1, max_slot); for (i = 0; i < nr; i++) { item = btrfs_item_nr(right, i); @@ -2712,10 +2726,14 @@ out: /* * push some data in the path leaf to the left, trying to free up at * least data_size bytes. returns zero if the push worked, nonzero otherwise + * + * max_slot can put a limit on how far into the leaf we'll push items. The + * item at 'max_slot' won't be touched. Use (u32)-1 to make us push all the + * items */ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_path *path, int data_size, - int empty) + *root, struct btrfs_path *path, int min_data_size, + int data_size, int empty, u32 max_slot) { struct extent_buffer *right = path->nodes[0]; struct extent_buffer *left; @@ -2761,8 +2779,9 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root goto out; } - return __push_leaf_left(trans, root, path, data_size, - empty, left, free_space, right_nritems); + return __push_leaf_left(trans, root, path, min_data_size, + empty, left, free_space, right_nritems, + max_slot); out: btrfs_tree_unlock(left); free_extent_buffer(left); @@ -2855,6 +2874,64 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans, } /* + * double splits happen when we need to insert a big item in the middle + * of a leaf. A double split can leave us with 3 mostly empty leaves: + * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ] + * A B C + * + * We avoid this by trying to push the items on either side of our target + * into the adjacent leaves. If all goes well we can avoid the double split + * completely. + */ +static noinline int push_for_double_split(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + int data_size) +{ + int ret; + int progress = 0; + int slot; + u32 nritems; + + slot = path->slots[0]; + + /* + * try to push all the items after our slot into the + * right leaf + */ + ret = push_leaf_right(trans, root, path, 1, data_size, 0, slot); + if (ret < 0) + return ret; + + if (ret == 0) + progress++; + + nritems = btrfs_header_nritems(path->nodes[0]); + /* + * our goal is to get our slot at the start or end of a leaf. If + * we've done so we're done + */ + if (path->slots[0] == 0 || path->slots[0] == nritems) + return 0; + + if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size) + return 0; + + /* try to push all the items before our slot into the next leaf */ + slot = path->slots[0]; + ret = push_leaf_left(trans, root, path, 1, data_size, 0, slot); + if (ret < 0) + return ret; + + if (ret == 0) + progress++; + + if (progress) + return 0; + return 1; +} + +/* * split the path's leaf in two, making sure there is at least data_size * available for the resulting leaf level of the path. * @@ -2876,6 +2953,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, int wret; int split; int num_doubles = 0; + int tried_avoid_double = 0; l = path->nodes[0]; slot = path->slots[0]; @@ -2884,12 +2962,14 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, return -EOVERFLOW; /* first try to make some room by pushing left and right */ - if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) { - wret = push_leaf_right(trans, root, path, data_size, 0); + if (data_size) { + wret = push_leaf_right(trans, root, path, data_size, + data_size, 0, 0); if (wret < 0) return wret; if (wret) { - wret = push_leaf_left(trans, root, path, data_size, 0); + wret = push_leaf_left(trans, root, path, data_size, + data_size, 0, (u32)-1); if (wret < 0) return wret; } @@ -2923,6 +3003,8 @@ again: if (mid != nritems && leaf_space_used(l, mid, nritems - mid) + data_size > BTRFS_LEAF_DATA_SIZE(root)) { + if (data_size && !tried_avoid_double) + goto push_for_double; split = 2; } } @@ -2939,6 +3021,8 @@ again: if (mid != nritems && leaf_space_used(l, mid, nritems - mid) + data_size > BTRFS_LEAF_DATA_SIZE(root)) { + if (data_size && !tried_avoid_double) + goto push_for_double; split = 2 ; } } @@ -3019,6 +3103,13 @@ again: } return ret; + +push_for_double: + push_for_double_split(trans, root, path, data_size); + tried_avoid_double = 1; + if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size) + return 0; + goto again; } static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, @@ -3915,13 +4006,15 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, extent_buffer_get(leaf); btrfs_set_path_blocking(path); - wret = push_leaf_left(trans, root, path, 1, 1); + wret = push_leaf_left(trans, root, path, 1, 1, + 1, (u32)-1); if (wret < 0 && wret != -ENOSPC) ret = wret; if (path->nodes[0] == leaf && btrfs_header_nritems(leaf)) { - wret = push_leaf_right(trans, root, path, 1, 1); + wret = push_leaf_right(trans, root, path, 1, + 1, 1, 0); if (wret < 0 && wret != -ENOSPC) ret = wret; } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 4dbaf89b133..9254b3d58db 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1458,7 +1458,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, */ /* the destination must be opened for writing */ - if (!(file->f_mode & FMODE_WRITE)) + if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND)) return -EINVAL; ret = mnt_want_write(file->f_path.mnt); @@ -1511,7 +1511,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, /* determine range to clone */ ret = -EINVAL; - if (off >= src->i_size || off + len > src->i_size) + if (off + len > src->i_size || off + len < off) goto out_unlock; if (len == 0) olen = len = src->i_size - off; @@ -1578,6 +1578,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, u64 disko = 0, diskl = 0; u64 datao = 0, datal = 0; u8 comp; + u64 endoff; size = btrfs_item_size_nr(leaf, slot); read_extent_buffer(leaf, buf, @@ -1712,9 +1713,18 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, btrfs_release_path(root, path); inode->i_mtime = inode->i_ctime = CURRENT_TIME; - if (new_key.offset + datal > inode->i_size) - btrfs_i_size_write(inode, - new_key.offset + datal); + + /* + * we round up to the block size at eof when + * determining which extents to clone above, + * but shouldn't round up the file size + */ + endoff = new_key.offset + datal; + if (endoff > off+olen) + endoff = off+olen; + if (endoff > inode->i_size) + btrfs_i_size_write(inode, endoff); + BTRFS_I(inode)->flags = BTRFS_I(src)->flags; ret = btrfs_update_inode(trans, root, inode); BUG_ON(ret); diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig index 04b8280582a..bc87b9c1d27 100644 --- a/fs/ceph/Kconfig +++ b/fs/ceph/Kconfig @@ -2,7 +2,7 @@ config CEPH_FS tristate "Ceph distributed file system (EXPERIMENTAL)" depends on INET && EXPERIMENTAL select LIBCRC32C - select CONFIG_CRYPTO_AES + select CRYPTO_AES help Choose Y or M here to include support for mounting the experimental Ceph distributed file system. Ceph is an extremely diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c index 3fe49042d8a..6d44053ecff 100644 --- a/fs/ceph/auth_x.c +++ b/fs/ceph/auth_x.c @@ -613,6 +613,9 @@ static void ceph_x_destroy(struct ceph_auth_client *ac) remove_ticket_handler(ac, th); } + if (xi->auth_authorizer.buf) + ceph_buffer_put(xi->auth_authorizer.buf); + kfree(ac->private); ac->private = NULL; } diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 74144d6389f..b81be9a5648 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -627,7 +627,7 @@ retry: if (fmode >= 0) __ceph_get_fmode(ci, fmode); spin_unlock(&inode->i_lock); - wake_up(&ci->i_cap_wq); + wake_up_all(&ci->i_cap_wq); return 0; } @@ -1181,7 +1181,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, } if (wake) - wake_up(&ci->i_cap_wq); + wake_up_all(&ci->i_cap_wq); return delayed; } @@ -2153,7 +2153,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) else if (flushsnaps) ceph_flush_snaps(ci); if (wake) - wake_up(&ci->i_cap_wq); + wake_up_all(&ci->i_cap_wq); if (put) iput(inode); } @@ -2229,7 +2229,7 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, iput(inode); } else if (complete_capsnap) { ceph_flush_snaps(ci); - wake_up(&ci->i_cap_wq); + wake_up_all(&ci->i_cap_wq); } if (drop_capsnap) iput(inode); @@ -2405,7 +2405,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, if (queue_invalidate) ceph_queue_invalidate(inode); if (wake) - wake_up(&ci->i_cap_wq); + wake_up_all(&ci->i_cap_wq); if (check_caps == 1) ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY, @@ -2460,7 +2460,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, struct ceph_inode_info, i_flushing_item)->vfs_inode); mdsc->num_cap_flushing--; - wake_up(&mdsc->cap_flushing_wq); + wake_up_all(&mdsc->cap_flushing_wq); dout(" inode %p now !flushing\n", inode); if (ci->i_dirty_caps == 0) { @@ -2472,7 +2472,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, } } spin_unlock(&mdsc->cap_dirty_lock); - wake_up(&ci->i_cap_wq); + wake_up_all(&ci->i_cap_wq); out: spin_unlock(&inode->i_lock); @@ -2984,6 +2984,7 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry, memcpy(*p, dentry->d_name.name, dentry->d_name.len); *p += dentry->d_name.len; rel->dname_seq = cpu_to_le32(di->lease_seq); + __ceph_mdsc_drop_dentry_lease(dentry); } spin_unlock(&dentry->d_lock); return ret; diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index f85719310db..f94ed3c7f6a 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -266,6 +266,7 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir) spin_lock(&inode->i_lock); if ((filp->f_pos == 2 || fi->dentry) && !ceph_test_opt(client, NOASYNCREADDIR) && + ceph_snap(inode) != CEPH_SNAPDIR && (ci->i_ceph_flags & CEPH_I_COMPLETE) && __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) { err = __dcache_readdir(filp, dirent, filldir); @@ -1013,18 +1014,22 @@ out_touch: /* * When a dentry is released, clear the dir I_COMPLETE if it was part - * of the current dir gen. + * of the current dir gen or if this is in the snapshot namespace. */ static void ceph_dentry_release(struct dentry *dentry) { struct ceph_dentry_info *di = ceph_dentry(dentry); struct inode *parent_inode = dentry->d_parent->d_inode; + u64 snapid = ceph_snap(parent_inode); - if (parent_inode) { + dout("dentry_release %p parent %p\n", dentry, parent_inode); + + if (parent_inode && snapid != CEPH_SNAPDIR) { struct ceph_inode_info *ci = ceph_inode(parent_inode); spin_lock(&parent_inode->i_lock); - if (ci->i_shared_gen == di->lease_shared_gen) { + if (ci->i_shared_gen == di->lease_shared_gen || + snapid <= CEPH_MAXSNAP) { dout(" clearing %p complete (d_release)\n", parent_inode); ci->i_ceph_flags &= ~CEPH_I_COMPLETE; @@ -1241,7 +1246,9 @@ struct dentry_operations ceph_dentry_ops = { struct dentry_operations ceph_snapdir_dentry_ops = { .d_revalidate = ceph_snapdir_d_revalidate, + .d_release = ceph_dentry_release, }; struct dentry_operations ceph_snap_dentry_ops = { + .d_release = ceph_dentry_release, }; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 6251a1574b9..7c08698fad3 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -265,7 +265,7 @@ int ceph_release(struct inode *inode, struct file *file) kmem_cache_free(ceph_file_cachep, cf); /* wake up anyone waiting for caps on this inode */ - wake_up(&ci->i_cap_wq); + wake_up_all(&ci->i_cap_wq); return 0; } diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 8f9b9fe8ef9..389f9dbd994 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1199,8 +1199,10 @@ retry_lookup: goto out; } err = ceph_init_dentry(dn); - if (err < 0) + if (err < 0) { + dput(dn); goto out; + } } else if (dn->d_inode && (ceph_ino(dn->d_inode) != vino.ino || ceph_snap(dn->d_inode) != vino.snap)) { @@ -1499,7 +1501,7 @@ retry: if (wrbuffer_refs == 0) ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); if (wake) - wake_up(&ci->i_cap_wq); + wake_up_all(&ci->i_cap_wq); } diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 3ab79f6c4ce..dd440bd438a 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -868,7 +868,7 @@ static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap, { struct ceph_inode_info *ci = ceph_inode(inode); - wake_up(&ci->i_cap_wq); + wake_up_all(&ci->i_cap_wq); if (arg) { spin_lock(&inode->i_lock); ci->i_wanted_max_size = 0; @@ -1514,6 +1514,9 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, ceph_encode_filepath(&p, end, ino1, path1); ceph_encode_filepath(&p, end, ino2, path2); + /* make note of release offset, in case we need to replay */ + req->r_request_release_offset = p - msg->front.iov_base; + /* cap releases */ releases = 0; if (req->r_inode_drop) @@ -1561,7 +1564,7 @@ static void complete_request(struct ceph_mds_client *mdsc, if (req->r_callback) req->r_callback(mdsc, req); else - complete(&req->r_completion); + complete_all(&req->r_completion); } /* @@ -1580,6 +1583,32 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc, dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req, req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts); + if (req->r_got_unsafe) { + /* + * Replay. Do not regenerate message (and rebuild + * paths, etc.); just use the original message. + * Rebuilding paths will break for renames because + * d_move mangles the src name. + */ + msg = req->r_request; + rhead = msg->front.iov_base; + + flags = le32_to_cpu(rhead->flags); + flags |= CEPH_MDS_FLAG_REPLAY; + rhead->flags = cpu_to_le32(flags); + + if (req->r_target_inode) + rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode)); + + rhead->num_retry = req->r_attempts - 1; + + /* remove cap/dentry releases from message */ + rhead->num_releases = 0; + msg->hdr.front_len = cpu_to_le32(req->r_request_release_offset); + msg->front.iov_len = req->r_request_release_offset; + return 0; + } + if (req->r_request) { ceph_msg_put(req->r_request); req->r_request = NULL; @@ -1601,13 +1630,9 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc, rhead->flags = cpu_to_le32(flags); rhead->num_fwd = req->r_num_fwd; rhead->num_retry = req->r_attempts - 1; + rhead->ino = 0; dout(" r_locked_dir = %p\n", req->r_locked_dir); - - if (req->r_target_inode && req->r_got_unsafe) - rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode)); - else - rhead->ino = 0; return 0; } @@ -1907,7 +1932,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) if (head->safe) { req->r_got_safe = true; __unregister_request(mdsc, req); - complete(&req->r_safe_completion); + complete_all(&req->r_safe_completion); if (req->r_got_unsafe) { /* @@ -1922,7 +1947,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) /* last unsafe request during umount? */ if (mdsc->stopping && !__get_oldest_req(mdsc)) - complete(&mdsc->safe_umount_waiters); + complete_all(&mdsc->safe_umount_waiters); mutex_unlock(&mdsc->mutex); goto out; } @@ -2101,7 +2126,7 @@ static void handle_session(struct ceph_mds_session *session, pr_info("mds%d reconnect denied\n", session->s_mds); remove_session_caps(session); wake = 1; /* for good measure */ - complete(&mdsc->session_close_waiters); + complete_all(&mdsc->session_close_waiters); kick_requests(mdsc, mds); break; diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index b292fa42a66..952410c60d0 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -188,6 +188,7 @@ struct ceph_mds_request { int r_old_inode_drop, r_old_inode_unless; struct ceph_msg *r_request; /* original request */ + int r_request_release_offset; struct ceph_msg *r_reply; struct ceph_mds_reply_info_parsed r_reply_info; int r_err; diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c index 9ad43a310a4..15167b2daa5 100644 --- a/fs/ceph/messenger.c +++ b/fs/ceph/messenger.c @@ -43,7 +43,8 @@ static void ceph_fault(struct ceph_connection *con); * nicely render a sockaddr as a string. */ #define MAX_ADDR_STR 20 -static char addr_str[MAX_ADDR_STR][40]; +#define MAX_ADDR_STR_LEN 60 +static char addr_str[MAX_ADDR_STR][MAX_ADDR_STR_LEN]; static DEFINE_SPINLOCK(addr_str_lock); static int last_addr_str; @@ -52,7 +53,6 @@ const char *pr_addr(const struct sockaddr_storage *ss) int i; char *s; struct sockaddr_in *in4 = (void *)ss; - unsigned char *quad = (void *)&in4->sin_addr.s_addr; struct sockaddr_in6 *in6 = (void *)ss; spin_lock(&addr_str_lock); @@ -64,25 +64,13 @@ const char *pr_addr(const struct sockaddr_storage *ss) switch (ss->ss_family) { case AF_INET: - sprintf(s, "%u.%u.%u.%u:%u", - (unsigned int)quad[0], - (unsigned int)quad[1], - (unsigned int)quad[2], - (unsigned int)quad[3], - (unsigned int)ntohs(in4->sin_port)); + snprintf(s, MAX_ADDR_STR_LEN, "%pI4:%u", &in4->sin_addr, + (unsigned int)ntohs(in4->sin_port)); break; case AF_INET6: - sprintf(s, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x:%u", - in6->sin6_addr.s6_addr16[0], - in6->sin6_addr.s6_addr16[1], - in6->sin6_addr.s6_addr16[2], - in6->sin6_addr.s6_addr16[3], - in6->sin6_addr.s6_addr16[4], - in6->sin6_addr.s6_addr16[5], - in6->sin6_addr.s6_addr16[6], - in6->sin6_addr.s6_addr16[7], - (unsigned int)ntohs(in6->sin6_port)); + snprintf(s, MAX_ADDR_STR_LEN, "[%pI6c]:%u", &in6->sin6_addr, + (unsigned int)ntohs(in6->sin6_port)); break; default: @@ -215,12 +203,13 @@ static void set_sock_callbacks(struct socket *sock, */ static struct socket *ceph_tcp_connect(struct ceph_connection *con) { - struct sockaddr *paddr = (struct sockaddr *)&con->peer_addr.in_addr; + struct sockaddr_storage *paddr = &con->peer_addr.in_addr; struct socket *sock; int ret; BUG_ON(con->sock); - ret = sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); + ret = sock_create_kern(con->peer_addr.in_addr.ss_family, SOCK_STREAM, + IPPROTO_TCP, &sock); if (ret) return ERR_PTR(ret); con->sock = sock; @@ -234,7 +223,8 @@ static struct socket *ceph_tcp_connect(struct ceph_connection *con) dout("connect %s\n", pr_addr(&con->peer_addr.in_addr)); - ret = sock->ops->connect(sock, paddr, sizeof(*paddr), O_NONBLOCK); + ret = sock->ops->connect(sock, (struct sockaddr *)paddr, sizeof(*paddr), + O_NONBLOCK); if (ret == -EINPROGRESS) { dout("connect %s EINPROGRESS sk_state = %u\n", pr_addr(&con->peer_addr.in_addr), @@ -1009,19 +999,32 @@ int ceph_parse_ips(const char *c, const char *end, struct sockaddr_in *in4 = (void *)ss; struct sockaddr_in6 *in6 = (void *)ss; int port; + char delim = ','; + + if (*p == '[') { + delim = ']'; + p++; + } memset(ss, 0, sizeof(*ss)); if (in4_pton(p, end - p, (u8 *)&in4->sin_addr.s_addr, - ',', &ipend)) { + delim, &ipend)) ss->ss_family = AF_INET; - } else if (in6_pton(p, end - p, (u8 *)&in6->sin6_addr.s6_addr, - ',', &ipend)) { + else if (in6_pton(p, end - p, (u8 *)&in6->sin6_addr.s6_addr, + delim, &ipend)) ss->ss_family = AF_INET6; - } else { + else goto bad; - } p = ipend; + if (delim == ']') { + if (*p != ']') { + dout("missing matching ']'\n"); + goto bad; + } + p++; + } + /* port? */ if (p < end && *p == ':') { port = 0; @@ -1055,7 +1058,7 @@ int ceph_parse_ips(const char *c, const char *end, return 0; bad: - pr_err("parse_ips bad ip '%s'\n", c); + pr_err("parse_ips bad ip '%.*s'\n", (int)(end - c), c); return -EINVAL; } @@ -2015,20 +2018,20 @@ void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg) { mutex_lock(&con->mutex); if (!list_empty(&msg->list_head)) { - dout("con_revoke %p msg %p\n", con, msg); + dout("con_revoke %p msg %p - was on queue\n", con, msg); list_del_init(&msg->list_head); ceph_msg_put(msg); msg->hdr.seq = 0; - if (con->out_msg == msg) { - ceph_msg_put(con->out_msg); - con->out_msg = NULL; - } + } + if (con->out_msg == msg) { + dout("con_revoke %p msg %p - was sending\n", con, msg); + con->out_msg = NULL; if (con->out_kvec_is_msg) { con->out_skip = con->out_kvec_bytes; con->out_kvec_is_msg = false; } - } else { - dout("con_revoke %p msg %p - not queued (sent?)\n", con, msg); + ceph_msg_put(msg); + msg->hdr.seq = 0; } mutex_unlock(&con->mutex); } diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c index cc115eafae1..54fe01c5070 100644 --- a/fs/ceph/mon_client.c +++ b/fs/ceph/mon_client.c @@ -345,7 +345,7 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc, out: mutex_unlock(&monc->mutex); - wake_up(&client->auth_wq); + wake_up_all(&client->auth_wq); } /* @@ -462,7 +462,7 @@ static void handle_statfs_reply(struct ceph_mon_client *monc, } mutex_unlock(&monc->mutex); if (req) { - complete(&req->completion); + complete_all(&req->completion); put_generic_request(req); } return; @@ -718,7 +718,7 @@ static void handle_auth_reply(struct ceph_mon_client *monc, monc->m_auth->front_max); if (ret < 0) { monc->client->auth_err = ret; - wake_up(&monc->client->auth_wq); + wake_up_all(&monc->client->auth_wq); } else if (ret > 0) { __send_prepared_auth_request(monc, ret); } else if (!was_auth && monc->auth->ops->is_authenticated(monc->auth)) { diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c index 92b7251a53f..e3852234789 100644 --- a/fs/ceph/osd_client.c +++ b/fs/ceph/osd_client.c @@ -862,12 +862,12 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, if (req->r_callback) req->r_callback(req, msg); else - complete(&req->r_completion); + complete_all(&req->r_completion); if (flags & CEPH_OSD_FLAG_ONDISK) { if (req->r_safe_callback) req->r_safe_callback(req, msg); - complete(&req->r_safe_completion); /* fsync waiter */ + complete_all(&req->r_safe_completion); /* fsync waiter */ } done: @@ -1083,7 +1083,7 @@ done: if (newmap) kick_requests(osdc, NULL); up_read(&osdc->map_sem); - wake_up(&osdc->client->auth_wq); + wake_up_all(&osdc->client->auth_wq); return; bad: diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c index 50ce64ebd33..416d46adbf8 100644 --- a/fs/ceph/osdmap.c +++ b/fs/ceph/osdmap.c @@ -568,6 +568,7 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) if (ev > CEPH_PG_POOL_VERSION) { pr_warning("got unknown v %d > %d of ceph_pg_pool\n", ev, CEPH_PG_POOL_VERSION); + kfree(pi); goto bad; } __decode_pool(p, pi); @@ -830,12 +831,13 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, /* remove any? */ while (rbp && pgid_cmp(rb_entry(rbp, struct ceph_pg_mapping, node)->pgid, pgid) <= 0) { - struct rb_node *cur = rbp; + struct ceph_pg_mapping *cur = + rb_entry(rbp, struct ceph_pg_mapping, node); + rbp = rb_next(rbp); - dout(" removed pg_temp %llx\n", - *(u64 *)&rb_entry(cur, struct ceph_pg_mapping, - node)->pgid); - rb_erase(cur, &map->pg_temp); + dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid); + rb_erase(&cur->node, &map->pg_temp); + kfree(cur); } if (pglen) { @@ -851,19 +853,22 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, for (j = 0; j < pglen; j++) pg->osds[j] = ceph_decode_32(p); err = __insert_pg_mapping(pg, &map->pg_temp); - if (err) + if (err) { + kfree(pg); goto bad; + } dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid, pglen); } } while (rbp) { - struct rb_node *cur = rbp; + struct ceph_pg_mapping *cur = + rb_entry(rbp, struct ceph_pg_mapping, node); + rbp = rb_next(rbp); - dout(" removed pg_temp %llx\n", - *(u64 *)&rb_entry(cur, struct ceph_pg_mapping, - node)->pgid); - rb_erase(cur, &map->pg_temp); + dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid); + rb_erase(&cur->node, &map->pg_temp); + kfree(cur); } /* ignore the rest */ diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 484e52bb40b..2cb1a70214d 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -923,7 +923,7 @@ init_cifs(void) goto out_unregister_filesystem; #endif #ifdef CONFIG_CIFS_DFS_UPCALL - rc = register_key_type(&key_type_dns_resolver); + rc = cifs_init_dns_resolver(); if (rc) goto out_unregister_key_type; #endif @@ -935,7 +935,7 @@ init_cifs(void) out_unregister_resolver_key: #ifdef CONFIG_CIFS_DFS_UPCALL - unregister_key_type(&key_type_dns_resolver); + cifs_exit_dns_resolver(); out_unregister_key_type: #endif #ifdef CONFIG_CIFS_UPCALL @@ -961,7 +961,7 @@ exit_cifs(void) cifs_proc_clean(); #ifdef CONFIG_CIFS_DFS_UPCALL cifs_dfs_release_automount_timer(); - unregister_key_type(&key_type_dns_resolver); + cifs_exit_dns_resolver(); #endif #ifdef CONFIG_CIFS_UPCALL unregister_key_type(&cifs_spnego_key_type); diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c index 4db2c5e7283..49315cbf742 100644 --- a/fs/cifs/dns_resolve.c +++ b/fs/cifs/dns_resolve.c @@ -24,12 +24,16 @@ */ #include <linux/slab.h> +#include <linux/keyctl.h> +#include <linux/key-type.h> #include <keys/user-type.h> #include "dns_resolve.h" #include "cifsglob.h" #include "cifsproto.h" #include "cifs_debug.h" +static const struct cred *dns_resolver_cache; + /* Checks if supplied name is IP address * returns: * 1 - name is IP @@ -94,6 +98,7 @@ struct key_type key_type_dns_resolver = { int dns_resolve_server_name_to_ip(const char *unc, char **ip_addr) { + const struct cred *saved_cred; int rc = -EAGAIN; struct key *rkey = ERR_PTR(-EAGAIN); char *name; @@ -133,8 +138,15 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr) goto skip_upcall; } + saved_cred = override_creds(dns_resolver_cache); rkey = request_key(&key_type_dns_resolver, name, ""); + revert_creds(saved_cred); if (!IS_ERR(rkey)) { + if (!(rkey->perm & KEY_USR_VIEW)) { + down_read(&rkey->sem); + rkey->perm |= KEY_USR_VIEW; + up_read(&rkey->sem); + } len = rkey->type_data.x[0]; data = rkey->payload.data; } else { @@ -165,4 +177,61 @@ out: return rc; } +int __init cifs_init_dns_resolver(void) +{ + struct cred *cred; + struct key *keyring; + int ret; + + printk(KERN_NOTICE "Registering the %s key type\n", + key_type_dns_resolver.name); + + /* create an override credential set with a special thread keyring in + * which DNS requests are cached + * + * this is used to prevent malicious redirections from being installed + * with add_key(). + */ + cred = prepare_kernel_cred(NULL); + if (!cred) + return -ENOMEM; + + keyring = key_alloc(&key_type_keyring, ".dns_resolver", 0, 0, cred, + (KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW | KEY_USR_READ, + KEY_ALLOC_NOT_IN_QUOTA); + if (IS_ERR(keyring)) { + ret = PTR_ERR(keyring); + goto failed_put_cred; + } + + ret = key_instantiate_and_link(keyring, NULL, 0, NULL, NULL); + if (ret < 0) + goto failed_put_key; + + ret = register_key_type(&key_type_dns_resolver); + if (ret < 0) + goto failed_put_key; + + /* instruct request_key() to use this special keyring as a cache for + * the results it looks up */ + cred->thread_keyring = keyring; + cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; + dns_resolver_cache = cred; + return 0; + +failed_put_key: + key_put(keyring); +failed_put_cred: + put_cred(cred); + return ret; +} +void __exit cifs_exit_dns_resolver(void) +{ + key_revoke(dns_resolver_cache->thread_keyring); + unregister_key_type(&key_type_dns_resolver); + put_cred(dns_resolver_cache); + printk(KERN_NOTICE "Unregistered %s key type\n", + key_type_dns_resolver.name); +} diff --git a/fs/cifs/dns_resolve.h b/fs/cifs/dns_resolve.h index 966e9288930..26b9eaa9f5e 100644 --- a/fs/cifs/dns_resolve.h +++ b/fs/cifs/dns_resolve.h @@ -24,8 +24,8 @@ #define _DNS_RESOLVE_H #ifdef __KERNEL__ -#include <linux/key-type.h> -extern struct key_type key_type_dns_resolver; +extern int __init cifs_init_dns_resolver(void); +extern void __exit cifs_exit_dns_resolver(void); extern int dns_resolve_server_name_to_ip(const char *unc, char **ip_addr); #endif /* KERNEL */ diff --git a/fs/dcache.c b/fs/dcache.c index c8c78ba0782..86d4db15473 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -896,7 +896,7 @@ EXPORT_SYMBOL(shrink_dcache_parent); * * In this case we return -1 to tell the caller that we baled. */ -static int shrink_dcache_memory(int nr, gfp_t gfp_mask) +static int shrink_dcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask) { if (nr) { if (!(gfp_mask & __GFP_FS)) diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c index 2d8dbce9d48..46c4dd8dfcc 100644 --- a/fs/ecryptfs/messaging.c +++ b/fs/ecryptfs/messaging.c @@ -31,9 +31,9 @@ static struct mutex ecryptfs_msg_ctx_lists_mux; static struct hlist_head *ecryptfs_daemon_hash; struct mutex ecryptfs_daemon_hash_mux; -static int ecryptfs_hash_buckets; +static int ecryptfs_hash_bits; #define ecryptfs_uid_hash(uid) \ - hash_long((unsigned long)uid, ecryptfs_hash_buckets) + hash_long((unsigned long)uid, ecryptfs_hash_bits) static u32 ecryptfs_msg_counter; static struct ecryptfs_msg_ctx *ecryptfs_msg_ctx_arr; @@ -486,18 +486,19 @@ int ecryptfs_init_messaging(void) } mutex_init(&ecryptfs_daemon_hash_mux); mutex_lock(&ecryptfs_daemon_hash_mux); - ecryptfs_hash_buckets = 1; - while (ecryptfs_number_of_users >> ecryptfs_hash_buckets) - ecryptfs_hash_buckets++; + ecryptfs_hash_bits = 1; + while (ecryptfs_number_of_users >> ecryptfs_hash_bits) + ecryptfs_hash_bits++; ecryptfs_daemon_hash = kmalloc((sizeof(struct hlist_head) - * ecryptfs_hash_buckets), GFP_KERNEL); + * (1 << ecryptfs_hash_bits)), + GFP_KERNEL); if (!ecryptfs_daemon_hash) { rc = -ENOMEM; printk(KERN_ERR "%s: Failed to allocate memory\n", __func__); mutex_unlock(&ecryptfs_daemon_hash_mux); goto out; } - for (i = 0; i < ecryptfs_hash_buckets; i++) + for (i = 0; i < (1 << ecryptfs_hash_bits); i++) INIT_HLIST_HEAD(&ecryptfs_daemon_hash[i]); mutex_unlock(&ecryptfs_daemon_hash_mux); ecryptfs_msg_ctx_arr = kmalloc((sizeof(struct ecryptfs_msg_ctx) @@ -554,7 +555,7 @@ void ecryptfs_release_messaging(void) int i; mutex_lock(&ecryptfs_daemon_hash_mux); - for (i = 0; i < ecryptfs_hash_buckets; i++) { + for (i = 0; i < (1 << ecryptfs_hash_bits); i++) { int rc; hlist_for_each_entry(daemon, elem, diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 4a48c0f4b40..84da64b551b 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -1041,6 +1041,7 @@ static int trunc_start(struct gfs2_inode *ip, u64 size) if (gfs2_is_stuffed(ip)) { u64 dsize = size + sizeof(struct gfs2_inode); + ip->i_disksize = size; ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; gfs2_trans_add_bh(ip->i_gl, dibh, 1); gfs2_dinode_out(ip, dibh->b_data); diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 8295c5b5d4a..6b48d7c268b 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -392,7 +392,7 @@ static int gfs2_dirent_find_space(const struct gfs2_dirent *dent, unsigned totlen = be16_to_cpu(dent->de_rec_len); if (gfs2_dirent_sentinel(dent)) - actual = GFS2_DIRENT_SIZE(0); + actual = 0; if (totlen - actual >= required) return 1; return 0; @@ -1231,6 +1231,25 @@ static int do_filldir_main(struct gfs2_inode *dip, u64 *offset, return 0; } +static void *gfs2_alloc_sort_buffer(unsigned size) +{ + void *ptr = NULL; + + if (size < KMALLOC_MAX_SIZE) + ptr = kmalloc(size, GFP_NOFS | __GFP_NOWARN); + if (!ptr) + ptr = __vmalloc(size, GFP_NOFS, PAGE_KERNEL); + return ptr; +} + +static void gfs2_free_sort_buffer(void *ptr) +{ + if (is_vmalloc_addr(ptr)) + vfree(ptr); + else + kfree(ptr); +} + static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque, filldir_t filldir, int *copied, unsigned *depth, u64 leaf_no) @@ -1271,7 +1290,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque, * 99 is the maximum number of entries that can fit in a single * leaf block. */ - larr = vmalloc((leaves + entries + 99) * sizeof(void *)); + larr = gfs2_alloc_sort_buffer((leaves + entries + 99) * sizeof(void *)); if (!larr) goto out; darr = (const struct gfs2_dirent **)(larr + leaves); @@ -1282,7 +1301,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque, do { error = get_leaf(ip, lfn, &bh); if (error) - goto out_kfree; + goto out_free; lf = (struct gfs2_leaf *)bh->b_data; lfn = be64_to_cpu(lf->lf_next); if (lf->lf_entries) { @@ -1291,7 +1310,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque, gfs2_dirent_gather, NULL, &g); error = PTR_ERR(dent); if (IS_ERR(dent)) - goto out_kfree; + goto out_free; if (entries2 != g.offset) { fs_warn(sdp, "Number of entries corrupt in dir " "leaf %llu, entries2 (%u) != " @@ -1300,7 +1319,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque, entries2, g.offset); error = -EIO; - goto out_kfree; + goto out_free; } error = 0; larr[leaf++] = bh; @@ -1312,10 +1331,10 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque, BUG_ON(entries2 != entries); error = do_filldir_main(ip, offset, opaque, filldir, darr, entries, copied); -out_kfree: +out_free: for(i = 0; i < leaf; i++) brelse(larr[i]); - vfree(larr); + gfs2_free_sort_buffer(larr); out: return error; } diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index ddcdbf49353..0898f3ec821 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -706,8 +706,18 @@ static void glock_work_func(struct work_struct *work) { unsigned long delay = 0; struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_work.work); + struct gfs2_holder *gh; int drop_ref = 0; + if (unlikely(test_bit(GLF_FROZEN, &gl->gl_flags))) { + spin_lock(&gl->gl_spin); + gh = find_first_waiter(gl); + if (gh && (gh->gh_flags & LM_FLAG_NOEXP) && + test_and_clear_bit(GLF_FROZEN, &gl->gl_flags)) + set_bit(GLF_REPLY_PENDING, &gl->gl_flags); + spin_unlock(&gl->gl_spin); + } + if (test_and_clear_bit(GLF_REPLY_PENDING, &gl->gl_flags)) { finish_xmote(gl, gl->gl_reply); drop_ref = 1; @@ -1348,7 +1358,7 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret) } -static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask) +static int gfs2_shrink_glock_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask) { struct gfs2_glock *gl; int may_demote; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index b5612cbb62a..f03afd9c44b 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -169,7 +169,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, { struct inode *inode; struct gfs2_inode *ip; - struct gfs2_glock *io_gl; + struct gfs2_glock *io_gl = NULL; int error; inode = gfs2_iget(sb, no_addr); @@ -198,6 +198,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, ip->i_iopen_gh.gh_gl->gl_object = ip; gfs2_glock_put(io_gl); + io_gl = NULL; if ((type == DT_UNKNOWN) && (no_formal_ino == 0)) goto gfs2_nfsbypass; @@ -228,7 +229,8 @@ gfs2_nfsbypass: fail_glock: gfs2_glock_dq(&ip->i_iopen_gh); fail_iopen: - gfs2_glock_put(io_gl); + if (io_gl) + gfs2_glock_put(io_gl); fail_put: if (inode->i_state & I_NEW) ip->i_gl->gl_object = NULL; @@ -256,7 +258,7 @@ void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr) { struct gfs2_sbd *sdp; struct gfs2_inode *ip; - struct gfs2_glock *io_gl; + struct gfs2_glock *io_gl = NULL; int error; struct gfs2_holder gh; struct inode *inode; @@ -293,6 +295,7 @@ void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr) ip->i_iopen_gh.gh_gl->gl_object = ip; gfs2_glock_put(io_gl); + io_gl = NULL; inode->i_mode = DT2IF(DT_UNKNOWN); @@ -319,7 +322,8 @@ void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr) fail_glock: gfs2_glock_dq(&ip->i_iopen_gh); fail_iopen: - gfs2_glock_put(io_gl); + if (io_gl) + gfs2_glock_put(io_gl); fail_put: ip->i_gl->gl_object = NULL; gfs2_glock_put(ip->i_gl); diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 49667d68769..8f02d3db8f4 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -77,7 +77,7 @@ static LIST_HEAD(qd_lru_list); static atomic_t qd_lru_count = ATOMIC_INIT(0); static DEFINE_SPINLOCK(qd_lru_lock); -int gfs2_shrink_qd_memory(int nr, gfp_t gfp_mask) +int gfs2_shrink_qd_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask) { struct gfs2_quota_data *qd; struct gfs2_sbd *sdp; @@ -694,10 +694,8 @@ get_a_page: if (!buffer_mapped(bh)) goto unlock_out; /* If it's a newly allocated disk block for quota, zero it */ - if (buffer_new(bh)) { - memset(bh->b_data, 0, bh->b_size); - set_buffer_uptodate(bh); - } + if (buffer_new(bh)) + zero_user(page, pos - blocksize, bh->b_size); } if (PageUptodate(page)) @@ -723,7 +721,7 @@ get_a_page: /* If quota straddles page boundary, we need to update the rest of the * quota at the beginning of the next page */ - if (offset != 0) { /* first page, offset is closer to PAGE_CACHE_SIZE */ + if ((offset + sizeof(struct gfs2_quota)) > PAGE_CACHE_SIZE) { ptr = ptr + nbytes; nbytes = sizeof(struct gfs2_quota) - nbytes; offset = 0; diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h index 195f60c8bd1..e7d236ca48b 100644 --- a/fs/gfs2/quota.h +++ b/fs/gfs2/quota.h @@ -51,7 +51,7 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip) return ret; } -extern int gfs2_shrink_qd_memory(int nr, gfp_t gfp_mask); +extern int gfs2_shrink_qd_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask); extern const struct quotactl_ops gfs2_quotactl_ops; #endif /* __QUOTA_DOT_H__ */ diff --git a/fs/inode.c b/fs/inode.c index 2bee20ae3d6..722860b323a 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -512,7 +512,7 @@ static void prune_icache(int nr_to_scan) * This function is passed the number of inodes to scan, and it returns the * total number of remaining possibly-reclaimable inodes. */ -static int shrink_icache_memory(int nr, gfp_t gfp_mask) +static int shrink_icache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask) { if (nr) { /* diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index bc2ff593276..036880895bf 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -297,7 +297,6 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction, struct page *new_page; unsigned int new_offset; struct buffer_head *bh_in = jh2bh(jh_in); - struct jbd2_buffer_trigger_type *triggers; journal_t *journal = transaction->t_journal; /* @@ -328,21 +327,21 @@ repeat: done_copy_out = 1; new_page = virt_to_page(jh_in->b_frozen_data); new_offset = offset_in_page(jh_in->b_frozen_data); - triggers = jh_in->b_frozen_triggers; } else { new_page = jh2bh(jh_in)->b_page; new_offset = offset_in_page(jh2bh(jh_in)->b_data); - triggers = jh_in->b_triggers; } mapped_data = kmap_atomic(new_page, KM_USER0); /* - * Fire any commit trigger. Do this before checking for escaping, - * as the trigger may modify the magic offset. If a copy-out - * happens afterwards, it will have the correct data in the buffer. + * Fire data frozen trigger if data already wasn't frozen. Do this + * before checking for escaping, as the trigger may modify the magic + * offset. If a copy-out happens afterwards, it will have the correct + * data in the buffer. */ - jbd2_buffer_commit_trigger(jh_in, mapped_data + new_offset, - triggers); + if (!done_copy_out) + jbd2_buffer_frozen_trigger(jh_in, mapped_data + new_offset, + jh_in->b_triggers); /* * Check for escaping diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index e214d68620a..b8e0806681b 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -725,6 +725,9 @@ done: page = jh2bh(jh)->b_page; offset = ((unsigned long) jh2bh(jh)->b_data) & ~PAGE_MASK; source = kmap_atomic(page, KM_USER0); + /* Fire data frozen trigger just before we copy the data */ + jbd2_buffer_frozen_trigger(jh, source + offset, + jh->b_triggers); memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size); kunmap_atomic(source, KM_USER0); @@ -963,15 +966,15 @@ void jbd2_journal_set_triggers(struct buffer_head *bh, jh->b_triggers = type; } -void jbd2_buffer_commit_trigger(struct journal_head *jh, void *mapped_data, +void jbd2_buffer_frozen_trigger(struct journal_head *jh, void *mapped_data, struct jbd2_buffer_trigger_type *triggers) { struct buffer_head *bh = jh2bh(jh); - if (!triggers || !triggers->t_commit) + if (!triggers || !triggers->t_frozen) return; - triggers->t_commit(triggers, bh, mapped_data, bh->b_size); + triggers->t_frozen(triggers, bh, mapped_data, bh->b_size); } void jbd2_buffer_abort_trigger(struct journal_head *jh, diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c index a2d58c96f1b..d258e261bdc 100644 --- a/fs/jffs2/xattr.c +++ b/fs/jffs2/xattr.c @@ -626,7 +626,7 @@ void jffs2_xattr_free_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *i static int check_xattr_ref_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic) { - /* success of check_xattr_ref_inode() means taht inode (ic) dose not have + /* success of check_xattr_ref_inode() means that inode (ic) dose not have * duplicate name/value pairs. If duplicate name/value pair would be found, * one will be removed. */ diff --git a/fs/mbcache.c b/fs/mbcache.c index ec88ff3d04a..e28f21b9534 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -115,7 +115,7 @@ mb_cache_indexes(struct mb_cache *cache) * What the mbcache registers as to get shrunk dynamically. */ -static int mb_cache_shrink_fn(int nr_to_scan, gfp_t gfp_mask); +static int mb_cache_shrink_fn(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask); static struct shrinker mb_cache_shrinker = { .shrink = mb_cache_shrink_fn, @@ -191,13 +191,14 @@ forget: * This function is called by the kernel memory management when memory * gets low. * + * @shrink: (ignored) * @nr_to_scan: Number of objects to scan * @gfp_mask: (ignored) * * Returns the number of objects which are present in the cache. */ static int -mb_cache_shrink_fn(int nr_to_scan, gfp_t gfp_mask) +mb_cache_shrink_fn(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) { LIST_HEAD(free_list); struct list_head *l, *ltmp; diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 782b431ef91..e60416d3f81 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1710,7 +1710,7 @@ static void nfs_access_free_list(struct list_head *head) } } -int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask) +int nfs_access_cache_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) { LIST_HEAD(head); struct nfs_inode *nfsi; diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index d8bd619e386..e70f44b9b3f 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -205,7 +205,8 @@ extern struct rpc_procinfo nfs4_procedures[]; void nfs_close_context(struct nfs_open_context *ctx, int is_sync); /* dir.c */ -extern int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask); +extern int nfs_access_cache_shrinker(struct shrinker *shrink, + int nr_to_scan, gfp_t gfp_mask); /* inode.c */ extern struct workqueue_struct *nfsiod_workqueue; diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 3623ca20cc1..356e976772b 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -196,15 +196,14 @@ int ocfs2_get_block(struct inode *inode, sector_t iblock, dump_stack(); goto bail; } - - past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); - mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino, - (unsigned long long)past_eof); - - if (create && (iblock >= past_eof)) - set_buffer_new(bh_result); } + past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); + mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino, + (unsigned long long)past_eof); + if (create && (iblock >= past_eof)) + set_buffer_new(bh_result); + bail: if (err < 0) err = -EIO; @@ -459,36 +458,6 @@ int walk_page_buffers( handle_t *handle, return ret; } -handle_t *ocfs2_start_walk_page_trans(struct inode *inode, - struct page *page, - unsigned from, - unsigned to) -{ - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - handle_t *handle; - int ret = 0; - - handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); - if (IS_ERR(handle)) { - ret = -ENOMEM; - mlog_errno(ret); - goto out; - } - - if (ocfs2_should_order_data(inode)) { - ret = ocfs2_jbd2_file_inode(handle, inode); - if (ret < 0) - mlog_errno(ret); - } -out: - if (ret) { - if (!IS_ERR(handle)) - ocfs2_commit_trans(osb, handle); - handle = ERR_PTR(ret); - } - return handle; -} - static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block) { sector_t status; @@ -1131,23 +1100,37 @@ out: */ static int ocfs2_grab_pages_for_write(struct address_space *mapping, struct ocfs2_write_ctxt *wc, - u32 cpos, loff_t user_pos, int new, + u32 cpos, loff_t user_pos, + unsigned user_len, int new, struct page *mmap_page) { int ret = 0, i; - unsigned long start, target_index, index; + unsigned long start, target_index, end_index, index; struct inode *inode = mapping->host; + loff_t last_byte; target_index = user_pos >> PAGE_CACHE_SHIFT; /* * Figure out how many pages we'll be manipulating here. For * non allocating write, we just change the one - * page. Otherwise, we'll need a whole clusters worth. + * page. Otherwise, we'll need a whole clusters worth. If we're + * writing past i_size, we only need enough pages to cover the + * last page of the write. */ if (new) { wc->w_num_pages = ocfs2_pages_per_cluster(inode->i_sb); start = ocfs2_align_clusters_to_page_index(inode->i_sb, cpos); + /* + * We need the index *past* the last page we could possibly + * touch. This is the page past the end of the write or + * i_size, whichever is greater. + */ + last_byte = max(user_pos + user_len, i_size_read(inode)); + BUG_ON(last_byte < 1); + end_index = ((last_byte - 1) >> PAGE_CACHE_SHIFT) + 1; + if ((start + wc->w_num_pages) > end_index) + wc->w_num_pages = end_index - start; } else { wc->w_num_pages = 1; start = target_index; @@ -1620,21 +1603,20 @@ out: * write path can treat it as an non-allocating write, which has no * special case code for sparse/nonsparse files. */ -static int ocfs2_expand_nonsparse_inode(struct inode *inode, loff_t pos, - unsigned len, +static int ocfs2_expand_nonsparse_inode(struct inode *inode, + struct buffer_head *di_bh, + loff_t pos, unsigned len, struct ocfs2_write_ctxt *wc) { int ret; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); loff_t newsize = pos + len; - if (ocfs2_sparse_alloc(osb)) - return 0; + BUG_ON(ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))); if (newsize <= i_size_read(inode)) return 0; - ret = ocfs2_extend_no_holes(inode, newsize, pos); + ret = ocfs2_extend_no_holes(inode, di_bh, newsize, pos); if (ret) mlog_errno(ret); @@ -1644,6 +1626,18 @@ static int ocfs2_expand_nonsparse_inode(struct inode *inode, loff_t pos, return ret; } +static int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh, + loff_t pos) +{ + int ret = 0; + + BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))); + if (pos > i_size_read(inode)) + ret = ocfs2_zero_extend(inode, di_bh, pos); + + return ret; +} + int ocfs2_write_begin_nolock(struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata, @@ -1679,7 +1673,11 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, } } - ret = ocfs2_expand_nonsparse_inode(inode, pos, len, wc); + if (ocfs2_sparse_alloc(osb)) + ret = ocfs2_zero_tail(inode, di_bh, pos); + else + ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos, len, + wc); if (ret) { mlog_errno(ret); goto out; @@ -1789,7 +1787,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, * that we can zero and flush if we error after adding the * extent. */ - ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, + ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, len, cluster_of_pages, mmap_page); if (ret) { mlog_errno(ret); diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 6b5a492e174..153abb5abef 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -1671,7 +1671,7 @@ struct dlm_ctxt * dlm_register_domain(const char *domain, struct dlm_ctxt *dlm = NULL; struct dlm_ctxt *new_ctxt = NULL; - if (strlen(domain) > O2NM_MAX_NAME_LEN) { + if (strlen(domain) >= O2NM_MAX_NAME_LEN) { ret = -ENAMETOOLONG; mlog(ML_ERROR, "domain name length too long\n"); goto leave; @@ -1709,6 +1709,7 @@ retry: } if (dlm_protocol_compare(&dlm->fs_locking_proto, fs_proto)) { + spin_unlock(&dlm_domain_lock); mlog(ML_ERROR, "Requested locking protocol version is not " "compatible with already registered domain " diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 4a7506a4e31..94b97fc6a88 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -2808,14 +2808,8 @@ again: mlog(0, "trying again...\n"); goto again; } - /* now that we are sure the MIGRATING state is there, drop - * the unneded state which blocked threads trying to DIRTY */ - spin_lock(&res->spinlock); - BUG_ON(!(res->state & DLM_LOCK_RES_BLOCK_DIRTY)); - BUG_ON(!(res->state & DLM_LOCK_RES_MIGRATING)); - res->state &= ~DLM_LOCK_RES_BLOCK_DIRTY; - spin_unlock(&res->spinlock); + ret = 0; /* did the target go down or die? */ spin_lock(&dlm->spinlock); if (!test_bit(target, dlm->domain_map)) { @@ -2826,9 +2820,21 @@ again: spin_unlock(&dlm->spinlock); /* + * if target is down, we need to clear DLM_LOCK_RES_BLOCK_DIRTY for + * another try; otherwise, we are sure the MIGRATING state is there, + * drop the unneded state which blocked threads trying to DIRTY + */ + spin_lock(&res->spinlock); + BUG_ON(!(res->state & DLM_LOCK_RES_BLOCK_DIRTY)); + res->state &= ~DLM_LOCK_RES_BLOCK_DIRTY; + if (!ret) + BUG_ON(!(res->state & DLM_LOCK_RES_MIGRATING)); + spin_unlock(&res->spinlock); + + /* * at this point: * - * o the DLM_LOCK_RES_MIGRATING flag is set + * o the DLM_LOCK_RES_MIGRATING flag is set if target not down * o there are no pending asts on this lockres * o all processes trying to reserve an ast on this * lockres must wait for the MIGRATING flag to clear diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index f8b75ce4be7..9dfaac73b36 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -463,7 +463,7 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm) if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) { int bit; - bit = find_next_bit (dlm->recovery_map, O2NM_MAX_NODES+1, 0); + bit = find_next_bit (dlm->recovery_map, O2NM_MAX_NODES, 0); if (bit >= O2NM_MAX_NODES || bit < 0) dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM); else diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 6a13ea64c44..2b10b36d157 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -724,28 +724,55 @@ leave: return status; } +/* + * While a write will already be ordering the data, a truncate will not. + * Thus, we need to explicitly order the zeroed pages. + */ +static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + handle_t *handle = NULL; + int ret = 0; + + if (!ocfs2_should_order_data(inode)) + goto out; + + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_jbd2_file_inode(handle, inode); + if (ret < 0) + mlog_errno(ret); + +out: + if (ret) { + if (!IS_ERR(handle)) + ocfs2_commit_trans(osb, handle); + handle = ERR_PTR(ret); + } + return handle; +} + /* Some parts of this taken from generic_cont_expand, which turned out * to be too fragile to do exactly what we need without us having to * worry about recursive locking in ->write_begin() and ->write_end(). */ -static int ocfs2_write_zero_page(struct inode *inode, - u64 size) +static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, + u64 abs_to) { struct address_space *mapping = inode->i_mapping; struct page *page; - unsigned long index; - unsigned int offset; + unsigned long index = abs_from >> PAGE_CACHE_SHIFT; handle_t *handle = NULL; - int ret; + int ret = 0; + unsigned zero_from, zero_to, block_start, block_end; - offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */ - /* ugh. in prepare/commit_write, if from==to==start of block, we - ** skip the prepare. make sure we never send an offset for the start - ** of a block - */ - if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) { - offset++; - } - index = size >> PAGE_CACHE_SHIFT; + BUG_ON(abs_from >= abs_to); + BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT)); + BUG_ON(abs_from & (inode->i_blkbits - 1)); page = grab_cache_page(mapping, index); if (!page) { @@ -754,31 +781,56 @@ static int ocfs2_write_zero_page(struct inode *inode, goto out; } - ret = ocfs2_prepare_write_nolock(inode, page, offset, offset); - if (ret < 0) { - mlog_errno(ret); - goto out_unlock; - } + /* Get the offsets within the page that we want to zero */ + zero_from = abs_from & (PAGE_CACHE_SIZE - 1); + zero_to = abs_to & (PAGE_CACHE_SIZE - 1); + if (!zero_to) + zero_to = PAGE_CACHE_SIZE; - if (ocfs2_should_order_data(inode)) { - handle = ocfs2_start_walk_page_trans(inode, page, offset, - offset); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - handle = NULL; + mlog(0, + "abs_from = %llu, abs_to = %llu, index = %lu, zero_from = %u, zero_to = %u\n", + (unsigned long long)abs_from, (unsigned long long)abs_to, + index, zero_from, zero_to); + + /* We know that zero_from is block aligned */ + for (block_start = zero_from; block_start < zero_to; + block_start = block_end) { + block_end = block_start + (1 << inode->i_blkbits); + + /* + * block_start is block-aligned. Bump it by one to + * force ocfs2_{prepare,commit}_write() to zero the + * whole block. + */ + ret = ocfs2_prepare_write_nolock(inode, page, + block_start + 1, + block_start + 1); + if (ret < 0) { + mlog_errno(ret); goto out_unlock; } - } - /* must not update i_size! */ - ret = block_commit_write(page, offset, offset); - if (ret < 0) - mlog_errno(ret); - else - ret = 0; + if (!handle) { + handle = ocfs2_zero_start_ordered_transaction(inode); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + handle = NULL; + break; + } + } + + /* must not update i_size! */ + ret = block_commit_write(page, block_start + 1, + block_start + 1); + if (ret < 0) + mlog_errno(ret); + else + ret = 0; + } if (handle) ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); + out_unlock: unlock_page(page); page_cache_release(page); @@ -786,22 +838,114 @@ out: return ret; } -static int ocfs2_zero_extend(struct inode *inode, - u64 zero_to_size) +/* + * Find the next range to zero. We do this in terms of bytes because + * that's what ocfs2_zero_extend() wants, and it is dealing with the + * pagecache. We may return multiple extents. + * + * zero_start and zero_end are ocfs2_zero_extend()s current idea of what + * needs to be zeroed. range_start and range_end return the next zeroing + * range. A subsequent call should pass the previous range_end as its + * zero_start. If range_end is 0, there's nothing to do. + * + * Unwritten extents are skipped over. Refcounted extents are CoWd. + */ +static int ocfs2_zero_extend_get_range(struct inode *inode, + struct buffer_head *di_bh, + u64 zero_start, u64 zero_end, + u64 *range_start, u64 *range_end) { - int ret = 0; - u64 start_off; - struct super_block *sb = inode->i_sb; + int rc = 0, needs_cow = 0; + u32 p_cpos, zero_clusters = 0; + u32 zero_cpos = + zero_start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits; + u32 last_cpos = ocfs2_clusters_for_bytes(inode->i_sb, zero_end); + unsigned int num_clusters = 0; + unsigned int ext_flags = 0; - start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode)); - while (start_off < zero_to_size) { - ret = ocfs2_write_zero_page(inode, start_off); - if (ret < 0) { - mlog_errno(ret); + while (zero_cpos < last_cpos) { + rc = ocfs2_get_clusters(inode, zero_cpos, &p_cpos, + &num_clusters, &ext_flags); + if (rc) { + mlog_errno(rc); + goto out; + } + + if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) { + zero_clusters = num_clusters; + if (ext_flags & OCFS2_EXT_REFCOUNTED) + needs_cow = 1; + break; + } + + zero_cpos += num_clusters; + } + if (!zero_clusters) { + *range_end = 0; + goto out; + } + + while ((zero_cpos + zero_clusters) < last_cpos) { + rc = ocfs2_get_clusters(inode, zero_cpos + zero_clusters, + &p_cpos, &num_clusters, + &ext_flags); + if (rc) { + mlog_errno(rc); goto out; } - start_off += sb->s_blocksize; + if (!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN)) + break; + if (ext_flags & OCFS2_EXT_REFCOUNTED) + needs_cow = 1; + zero_clusters += num_clusters; + } + if ((zero_cpos + zero_clusters) > last_cpos) + zero_clusters = last_cpos - zero_cpos; + + if (needs_cow) { + rc = ocfs2_refcount_cow(inode, di_bh, zero_cpos, zero_clusters, + UINT_MAX); + if (rc) { + mlog_errno(rc); + goto out; + } + } + + *range_start = ocfs2_clusters_to_bytes(inode->i_sb, zero_cpos); + *range_end = ocfs2_clusters_to_bytes(inode->i_sb, + zero_cpos + zero_clusters); + +out: + return rc; +} + +/* + * Zero one range returned from ocfs2_zero_extend_get_range(). The caller + * has made sure that the entire range needs zeroing. + */ +static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start, + u64 range_end) +{ + int rc = 0; + u64 next_pos; + u64 zero_pos = range_start; + + mlog(0, "range_start = %llu, range_end = %llu\n", + (unsigned long long)range_start, + (unsigned long long)range_end); + BUG_ON(range_start >= range_end); + + while (zero_pos < range_end) { + next_pos = (zero_pos & PAGE_CACHE_MASK) + PAGE_CACHE_SIZE; + if (next_pos > range_end) + next_pos = range_end; + rc = ocfs2_write_zero_page(inode, zero_pos, next_pos); + if (rc < 0) { + mlog_errno(rc); + break; + } + zero_pos = next_pos; /* * Very large extends have the potential to lock up @@ -810,16 +954,63 @@ static int ocfs2_zero_extend(struct inode *inode, cond_resched(); } -out: + return rc; +} + +int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh, + loff_t zero_to_size) +{ + int ret = 0; + u64 zero_start, range_start = 0, range_end = 0; + struct super_block *sb = inode->i_sb; + + zero_start = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode)); + mlog(0, "zero_start %llu for i_size %llu\n", + (unsigned long long)zero_start, + (unsigned long long)i_size_read(inode)); + while (zero_start < zero_to_size) { + ret = ocfs2_zero_extend_get_range(inode, di_bh, zero_start, + zero_to_size, + &range_start, + &range_end); + if (ret) { + mlog_errno(ret); + break; + } + if (!range_end) + break; + /* Trim the ends */ + if (range_start < zero_start) + range_start = zero_start; + if (range_end > zero_to_size) + range_end = zero_to_size; + + ret = ocfs2_zero_extend_range(inode, range_start, + range_end); + if (ret) { + mlog_errno(ret); + break; + } + zero_start = range_end; + } + return ret; } -int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, u64 zero_to) +int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh, + u64 new_i_size, u64 zero_to) { int ret; u32 clusters_to_add; struct ocfs2_inode_info *oi = OCFS2_I(inode); + /* + * Only quota files call this without a bh, and they can't be + * refcounted. + */ + BUG_ON(!di_bh && (oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); + BUG_ON(!di_bh && !(oi->ip_flags & OCFS2_INODE_SYSTEM_FILE)); + clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size); if (clusters_to_add < oi->ip_clusters) clusters_to_add = 0; @@ -840,7 +1031,7 @@ int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, u64 zero_to) * still need to zero the area between the old i_size and the * new i_size. */ - ret = ocfs2_zero_extend(inode, zero_to); + ret = ocfs2_zero_extend(inode, di_bh, zero_to); if (ret < 0) mlog_errno(ret); @@ -862,27 +1053,15 @@ static int ocfs2_extend_file(struct inode *inode, goto out; if (i_size_read(inode) == new_i_size) - goto out; + goto out; BUG_ON(new_i_size < i_size_read(inode)); /* - * Fall through for converting inline data, even if the fs - * supports sparse files. - * - * The check for inline data here is legal - nobody can add - * the feature since we have i_mutex. We must check it again - * after acquiring ip_alloc_sem though, as paths like mmap - * might have raced us to converting the inode to extents. - */ - if (!(oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) - && ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) - goto out_update_size; - - /* * The alloc sem blocks people in read/write from reading our * allocation until we're done changing it. We depend on * i_mutex to block other extend/truncate calls while we're - * here. + * here. We even have to hold it for sparse files because there + * might be some tail zeroing. */ down_write(&oi->ip_alloc_sem); @@ -899,14 +1078,16 @@ static int ocfs2_extend_file(struct inode *inode, ret = ocfs2_convert_inline_data_to_extents(inode, di_bh); if (ret) { up_write(&oi->ip_alloc_sem); - mlog_errno(ret); goto out; } } - if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) - ret = ocfs2_extend_no_holes(inode, new_i_size, new_i_size); + if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) + ret = ocfs2_zero_extend(inode, di_bh, new_i_size); + else + ret = ocfs2_extend_no_holes(inode, di_bh, new_i_size, + new_i_size); up_write(&oi->ip_alloc_sem); diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h index d66cf4f7c70..97bf761c9e7 100644 --- a/fs/ocfs2/file.h +++ b/fs/ocfs2/file.h @@ -54,8 +54,10 @@ int ocfs2_add_inode_data(struct ocfs2_super *osb, int ocfs2_simple_size_update(struct inode *inode, struct buffer_head *di_bh, u64 new_i_size); -int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, - u64 zero_to); +int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh, + u64 new_i_size, u64 zero_to); +int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh, + loff_t zero_to); int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 47878cf1641..625de9d7088 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -472,7 +472,7 @@ static inline struct ocfs2_triggers *to_ocfs2_trigger(struct jbd2_buffer_trigger return container_of(triggers, struct ocfs2_triggers, ot_triggers); } -static void ocfs2_commit_trigger(struct jbd2_buffer_trigger_type *triggers, +static void ocfs2_frozen_trigger(struct jbd2_buffer_trigger_type *triggers, struct buffer_head *bh, void *data, size_t size) { @@ -491,7 +491,7 @@ static void ocfs2_commit_trigger(struct jbd2_buffer_trigger_type *triggers, * Quota blocks have their own trigger because the struct ocfs2_block_check * offset depends on the blocksize. */ -static void ocfs2_dq_commit_trigger(struct jbd2_buffer_trigger_type *triggers, +static void ocfs2_dq_frozen_trigger(struct jbd2_buffer_trigger_type *triggers, struct buffer_head *bh, void *data, size_t size) { @@ -511,7 +511,7 @@ static void ocfs2_dq_commit_trigger(struct jbd2_buffer_trigger_type *triggers, * Directory blocks also have their own trigger because the * struct ocfs2_block_check offset depends on the blocksize. */ -static void ocfs2_db_commit_trigger(struct jbd2_buffer_trigger_type *triggers, +static void ocfs2_db_frozen_trigger(struct jbd2_buffer_trigger_type *triggers, struct buffer_head *bh, void *data, size_t size) { @@ -544,7 +544,7 @@ static void ocfs2_abort_trigger(struct jbd2_buffer_trigger_type *triggers, static struct ocfs2_triggers di_triggers = { .ot_triggers = { - .t_commit = ocfs2_commit_trigger, + .t_frozen = ocfs2_frozen_trigger, .t_abort = ocfs2_abort_trigger, }, .ot_offset = offsetof(struct ocfs2_dinode, i_check), @@ -552,7 +552,7 @@ static struct ocfs2_triggers di_triggers = { static struct ocfs2_triggers eb_triggers = { .ot_triggers = { - .t_commit = ocfs2_commit_trigger, + .t_frozen = ocfs2_frozen_trigger, .t_abort = ocfs2_abort_trigger, }, .ot_offset = offsetof(struct ocfs2_extent_block, h_check), @@ -560,7 +560,7 @@ static struct ocfs2_triggers eb_triggers = { static struct ocfs2_triggers rb_triggers = { .ot_triggers = { - .t_commit = ocfs2_commit_trigger, + .t_frozen = ocfs2_frozen_trigger, .t_abort = ocfs2_abort_trigger, }, .ot_offset = offsetof(struct ocfs2_refcount_block, rf_check), @@ -568,7 +568,7 @@ static struct ocfs2_triggers rb_triggers = { static struct ocfs2_triggers gd_triggers = { .ot_triggers = { - .t_commit = ocfs2_commit_trigger, + .t_frozen = ocfs2_frozen_trigger, .t_abort = ocfs2_abort_trigger, }, .ot_offset = offsetof(struct ocfs2_group_desc, bg_check), @@ -576,14 +576,14 @@ static struct ocfs2_triggers gd_triggers = { static struct ocfs2_triggers db_triggers = { .ot_triggers = { - .t_commit = ocfs2_db_commit_trigger, + .t_frozen = ocfs2_db_frozen_trigger, .t_abort = ocfs2_abort_trigger, }, }; static struct ocfs2_triggers xb_triggers = { .ot_triggers = { - .t_commit = ocfs2_commit_trigger, + .t_frozen = ocfs2_frozen_trigger, .t_abort = ocfs2_abort_trigger, }, .ot_offset = offsetof(struct ocfs2_xattr_block, xb_check), @@ -591,14 +591,14 @@ static struct ocfs2_triggers xb_triggers = { static struct ocfs2_triggers dq_triggers = { .ot_triggers = { - .t_commit = ocfs2_dq_commit_trigger, + .t_frozen = ocfs2_dq_frozen_trigger, .t_abort = ocfs2_abort_trigger, }, }; static struct ocfs2_triggers dr_triggers = { .ot_triggers = { - .t_commit = ocfs2_commit_trigger, + .t_frozen = ocfs2_frozen_trigger, .t_abort = ocfs2_abort_trigger, }, .ot_offset = offsetof(struct ocfs2_dx_root_block, dr_check), @@ -606,7 +606,7 @@ static struct ocfs2_triggers dr_triggers = { static struct ocfs2_triggers dl_triggers = { .ot_triggers = { - .t_commit = ocfs2_commit_trigger, + .t_frozen = ocfs2_frozen_trigger, .t_abort = ocfs2_abort_trigger, }, .ot_offset = offsetof(struct ocfs2_dx_leaf, dl_check), @@ -1936,7 +1936,7 @@ void ocfs2_orphan_scan_work(struct work_struct *work) mutex_lock(&os->os_lock); ocfs2_queue_orphan_scan(osb); if (atomic_read(&os->os_state) == ORPHAN_SCAN_ACTIVE) - schedule_delayed_work(&os->os_orphan_scan_work, + queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work, ocfs2_orphan_scan_timeout()); mutex_unlock(&os->os_lock); } @@ -1976,8 +1976,8 @@ void ocfs2_orphan_scan_start(struct ocfs2_super *osb) atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE); else { atomic_set(&os->os_state, ORPHAN_SCAN_ACTIVE); - schedule_delayed_work(&os->os_orphan_scan_work, - ocfs2_orphan_scan_timeout()); + queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work, + ocfs2_orphan_scan_timeout()); } } diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index 3d7419682dc..ec6adbf8f55 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -118,6 +118,7 @@ unsigned int ocfs2_la_default_mb(struct ocfs2_super *osb) { unsigned int la_mb; unsigned int gd_mb; + unsigned int la_max_mb; unsigned int megs_per_slot; struct super_block *sb = osb->sb; @@ -182,6 +183,12 @@ unsigned int ocfs2_la_default_mb(struct ocfs2_super *osb) if (megs_per_slot < la_mb) la_mb = megs_per_slot; + /* We can't store more bits than we can in a block. */ + la_max_mb = ocfs2_clusters_to_megabytes(osb->sb, + ocfs2_local_alloc_size(sb) * 8); + if (la_mb > la_max_mb) + la_mb = la_max_mb; + return la_mb; } diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index 2bb35fe0051..4607923eb24 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -775,7 +775,7 @@ static int ocfs2_acquire_dquot(struct dquot *dquot) * locking allocators ranks above a transaction start */ WARN_ON(journal_current_handle()); - status = ocfs2_extend_no_holes(gqinode, + status = ocfs2_extend_no_holes(gqinode, NULL, gqinode->i_size + (need_alloc << sb->s_blocksize_bits), gqinode->i_size); if (status < 0) diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index 8bd70d4d184..dc78764ccc4 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -971,7 +971,7 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk( u64 p_blkno; /* We are protected by dqio_sem so no locking needed */ - status = ocfs2_extend_no_holes(lqinode, + status = ocfs2_extend_no_holes(lqinode, NULL, lqinode->i_size + 2 * sb->s_blocksize, lqinode->i_size); if (status < 0) { @@ -1114,7 +1114,7 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file( return ocfs2_local_quota_add_chunk(sb, type, offset); /* We are protected by dqio_sem so no locking needed */ - status = ocfs2_extend_no_holes(lqinode, + status = ocfs2_extend_no_holes(lqinode, NULL, lqinode->i_size + sb->s_blocksize, lqinode->i_size); if (status < 0) { diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 4793f36f651..3ac5aa733e9 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -2931,6 +2931,12 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle, offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits; end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits); + /* + * We only duplicate pages until we reach the page contains i_size - 1. + * So trim 'end' to i_size. + */ + if (end > i_size_read(context->inode)) + end = i_size_read(context->inode); while (offset < end) { page_index = offset >> PAGE_CACHE_SHIFT; @@ -4166,6 +4172,12 @@ static int __ocfs2_reflink(struct dentry *old_dentry, struct inode *inode = old_dentry->d_inode; struct buffer_head *new_bh = NULL; + if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE) { + ret = -EINVAL; + mlog_errno(ret); + goto out; + } + ret = filemap_fdatawrite(inode->i_mapping); if (ret) { mlog_errno(ret); diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index f4c2a9eb8c4..a8e6a95a353 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -741,7 +741,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb, le16_to_cpu(bg->bg_free_bits_count)); le32_add_cpu(&cl->cl_recs[alloc_rec].c_total, le16_to_cpu(bg->bg_bits)); - cl->cl_recs[alloc_rec].c_blkno = cpu_to_le64(bg->bg_blkno); + cl->cl_recs[alloc_rec].c_blkno = bg->bg_blkno; if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count)) le16_add_cpu(&cl->cl_next_free_rec, 1); diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index e97b34842cf..d03469f6180 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -709,7 +709,7 @@ static int ocfs2_xattr_extend_allocation(struct inode *inode, struct ocfs2_xattr_value_buf *vb, struct ocfs2_xattr_set_ctxt *ctxt) { - int status = 0; + int status = 0, credits; handle_t *handle = ctxt->handle; enum ocfs2_alloc_restarted why; u32 prev_clusters, logical_start = le32_to_cpu(vb->vb_xv->xr_clusters); @@ -719,38 +719,54 @@ static int ocfs2_xattr_extend_allocation(struct inode *inode, ocfs2_init_xattr_value_extent_tree(&et, INODE_CACHE(inode), vb); - status = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (status < 0) { - mlog_errno(status); - goto leave; - } + while (clusters_to_add) { + status = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + break; + } - prev_clusters = le32_to_cpu(vb->vb_xv->xr_clusters); - status = ocfs2_add_clusters_in_btree(handle, - &et, - &logical_start, - clusters_to_add, - 0, - ctxt->data_ac, - ctxt->meta_ac, - &why); - if (status < 0) { - mlog_errno(status); - goto leave; - } + prev_clusters = le32_to_cpu(vb->vb_xv->xr_clusters); + status = ocfs2_add_clusters_in_btree(handle, + &et, + &logical_start, + clusters_to_add, + 0, + ctxt->data_ac, + ctxt->meta_ac, + &why); + if ((status < 0) && (status != -EAGAIN)) { + if (status != -ENOSPC) + mlog_errno(status); + break; + } - ocfs2_journal_dirty(handle, vb->vb_bh); + ocfs2_journal_dirty(handle, vb->vb_bh); - clusters_to_add -= le32_to_cpu(vb->vb_xv->xr_clusters) - prev_clusters; + clusters_to_add -= le32_to_cpu(vb->vb_xv->xr_clusters) - + prev_clusters; - /* - * We should have already allocated enough space before the transaction, - * so no need to restart. - */ - BUG_ON(why != RESTART_NONE || clusters_to_add); - -leave: + if (why != RESTART_NONE && clusters_to_add) { + /* + * We can only fail in case the alloc file doesn't give + * up enough clusters. + */ + BUG_ON(why == RESTART_META); + + mlog(0, "restarting xattr value extension for %u" + " clusters,.\n", clusters_to_add); + credits = ocfs2_calc_extend_credits(inode->i_sb, + &vb->vb_xv->xr_list, + clusters_to_add); + status = ocfs2_extend_trans(handle, credits); + if (status < 0) { + status = -ENOMEM; + mlog_errno(status); + break; + } + } + } return status; } @@ -6788,16 +6804,15 @@ out: return ret; } -static int ocfs2_reflink_xattr_buckets(handle_t *handle, +static int ocfs2_reflink_xattr_bucket(handle_t *handle, u64 blkno, u64 new_blkno, u32 clusters, + u32 *cpos, int num_buckets, struct ocfs2_alloc_context *meta_ac, struct ocfs2_alloc_context *data_ac, struct ocfs2_reflink_xattr_tree_args *args) { int i, j, ret = 0; struct super_block *sb = args->reflink->old_inode->i_sb; - u32 bpc = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(sb)); - u32 num_buckets = clusters * bpc; int bpb = args->old_bucket->bu_blocks; struct ocfs2_xattr_value_buf vb = { .vb_access = ocfs2_journal_access, @@ -6816,14 +6831,6 @@ static int ocfs2_reflink_xattr_buckets(handle_t *handle, break; } - /* - * The real bucket num in this series of blocks is stored - * in the 1st bucket. - */ - if (i == 0) - num_buckets = le16_to_cpu( - bucket_xh(args->old_bucket)->xh_num_buckets); - ret = ocfs2_xattr_bucket_journal_access(handle, args->new_bucket, OCFS2_JOURNAL_ACCESS_CREATE); @@ -6837,6 +6844,18 @@ static int ocfs2_reflink_xattr_buckets(handle_t *handle, bucket_block(args->old_bucket, j), sb->s_blocksize); + /* + * Record the start cpos so that we can use it to initialize + * our xattr tree we also set the xh_num_bucket for the new + * bucket. + */ + if (i == 0) { + *cpos = le32_to_cpu(bucket_xh(args->new_bucket)-> + xh_entries[0].xe_name_hash); + bucket_xh(args->new_bucket)->xh_num_buckets = + cpu_to_le16(num_buckets); + } + ocfs2_xattr_bucket_journal_dirty(handle, args->new_bucket); ret = ocfs2_reflink_xattr_header(handle, args->reflink, @@ -6866,6 +6885,7 @@ static int ocfs2_reflink_xattr_buckets(handle_t *handle, } ocfs2_xattr_bucket_journal_dirty(handle, args->new_bucket); + ocfs2_xattr_bucket_relse(args->old_bucket); ocfs2_xattr_bucket_relse(args->new_bucket); } @@ -6874,6 +6894,75 @@ static int ocfs2_reflink_xattr_buckets(handle_t *handle, ocfs2_xattr_bucket_relse(args->new_bucket); return ret; } + +static int ocfs2_reflink_xattr_buckets(handle_t *handle, + struct inode *inode, + struct ocfs2_reflink_xattr_tree_args *args, + struct ocfs2_extent_tree *et, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_alloc_context *data_ac, + u64 blkno, u32 cpos, u32 len) +{ + int ret, first_inserted = 0; + u32 p_cluster, num_clusters, reflink_cpos = 0; + u64 new_blkno; + unsigned int num_buckets, reflink_buckets; + unsigned int bpc = + ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)); + + ret = ocfs2_read_xattr_bucket(args->old_bucket, blkno); + if (ret) { + mlog_errno(ret); + goto out; + } + num_buckets = le16_to_cpu(bucket_xh(args->old_bucket)->xh_num_buckets); + ocfs2_xattr_bucket_relse(args->old_bucket); + + while (len && num_buckets) { + ret = ocfs2_claim_clusters(handle, data_ac, + 1, &p_cluster, &num_clusters); + if (ret) { + mlog_errno(ret); + goto out; + } + + new_blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster); + reflink_buckets = min(num_buckets, bpc * num_clusters); + + ret = ocfs2_reflink_xattr_bucket(handle, blkno, + new_blkno, num_clusters, + &reflink_cpos, reflink_buckets, + meta_ac, data_ac, args); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * For the 1st allocated cluster, we make it use the same cpos + * so that the xattr tree looks the same as the original one + * in the most case. + */ + if (!first_inserted) { + reflink_cpos = cpos; + first_inserted = 1; + } + ret = ocfs2_insert_extent(handle, et, reflink_cpos, new_blkno, + num_clusters, 0, meta_ac); + if (ret) + mlog_errno(ret); + + mlog(0, "insert new xattr extent rec start %llu len %u to %u\n", + (unsigned long long)new_blkno, num_clusters, reflink_cpos); + + len -= num_clusters; + blkno += ocfs2_clusters_to_blocks(inode->i_sb, num_clusters); + num_buckets -= reflink_buckets; + } +out: + return ret; +} + /* * Create the same xattr extent record in the new inode's xattr tree. */ @@ -6885,8 +6974,6 @@ static int ocfs2_reflink_xattr_rec(struct inode *inode, void *para) { int ret, credits = 0; - u32 p_cluster, num_clusters; - u64 new_blkno; handle_t *handle; struct ocfs2_reflink_xattr_tree_args *args = (struct ocfs2_reflink_xattr_tree_args *)para; @@ -6895,6 +6982,9 @@ static int ocfs2_reflink_xattr_rec(struct inode *inode, struct ocfs2_alloc_context *data_ac = NULL; struct ocfs2_extent_tree et; + mlog(0, "reflink xattr buckets %llu len %u\n", + (unsigned long long)blkno, len); + ocfs2_init_xattr_tree_extent_tree(&et, INODE_CACHE(args->reflink->new_inode), args->new_blk_bh); @@ -6914,32 +7004,12 @@ static int ocfs2_reflink_xattr_rec(struct inode *inode, goto out; } - ret = ocfs2_claim_clusters(handle, data_ac, - len, &p_cluster, &num_clusters); - if (ret) { - mlog_errno(ret); - goto out_commit; - } - - new_blkno = ocfs2_clusters_to_blocks(osb->sb, p_cluster); - - mlog(0, "reflink xattr buckets %llu to %llu, len %u\n", - (unsigned long long)blkno, (unsigned long long)new_blkno, len); - ret = ocfs2_reflink_xattr_buckets(handle, blkno, new_blkno, len, - meta_ac, data_ac, args); - if (ret) { - mlog_errno(ret); - goto out_commit; - } - - mlog(0, "insert new xattr extent rec start %llu len %u to %u\n", - (unsigned long long)new_blkno, len, cpos); - ret = ocfs2_insert_extent(handle, &et, cpos, new_blkno, - len, 0, meta_ac); + ret = ocfs2_reflink_xattr_buckets(handle, inode, args, &et, + meta_ac, data_ac, + blkno, cpos, len); if (ret) mlog_errno(ret); -out_commit: ocfs2_commit_trans(osb, handle); out: diff --git a/fs/partitions/ibm.c b/fs/partitions/ibm.c index 3e73de5967f..fc8497643fd 100644 --- a/fs/partitions/ibm.c +++ b/fs/partitions/ibm.c @@ -74,6 +74,7 @@ int ibm_partition(struct parsed_partitions *state) } *label; unsigned char *data; Sector sect; + sector_t labelsect; res = 0; blocksize = bdev_logical_block_size(bdev); @@ -98,10 +99,19 @@ int ibm_partition(struct parsed_partitions *state) goto out_freeall; /* + * Special case for FBA disks: label sector does not depend on + * blocksize. + */ + if ((info->cu_type == 0x6310 && info->dev_type == 0x9336) || + (info->cu_type == 0x3880 && info->dev_type == 0x3370)) + labelsect = info->label_block; + else + labelsect = info->label_block * (blocksize >> 9); + + /* * Get volume label, extract name and type. */ - data = read_part_sector(state, info->label_block*(blocksize/512), - §); + data = read_part_sector(state, labelsect, §); if (data == NULL) goto out_readerr; diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 12c233da1b6..437d2ca2de9 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -676,7 +676,7 @@ static void prune_dqcache(int count) * This is called from kswapd when we think we need some * more memory */ -static int shrink_dqcache_memory(int nr, gfp_t gfp_mask) +static int shrink_dqcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask) { if (nr) { spin_lock(&dq_list_lock); diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index f71246bebfe..a7ac78f8e67 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -28,6 +28,7 @@ static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target, struct sysfs_dirent *target_sd = NULL; struct sysfs_dirent *sd = NULL; struct sysfs_addrm_cxt acxt; + enum kobj_ns_type ns_type; int error; BUG_ON(!name); @@ -58,16 +59,29 @@ static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target, if (!sd) goto out_put; - if (sysfs_ns_type(parent_sd)) + ns_type = sysfs_ns_type(parent_sd); + if (ns_type) sd->s_ns = target->ktype->namespace(target); sd->s_symlink.target_sd = target_sd; target_sd = NULL; /* reference is now owned by the symlink */ sysfs_addrm_start(&acxt, parent_sd); - if (warn) - error = sysfs_add_one(&acxt, sd); - else - error = __sysfs_add_one(&acxt, sd); + /* Symlinks must be between directories with the same ns_type */ + if (!ns_type || + (ns_type == sysfs_ns_type(sd->s_symlink.target_sd->s_parent))) { + if (warn) + error = sysfs_add_one(&acxt, sd); + else + error = __sysfs_add_one(&acxt, sd); + } else { + error = -EINVAL; + WARN(1, KERN_WARNING + "sysfs: symlink across ns_types %s/%s -> %s/%s\n", + parent_sd->s_name, + sd->s_name, + sd->s_symlink.target_sd->s_parent->s_name, + sd->s_symlink.target_sd->s_name); + } sysfs_addrm_finish(&acxt); if (error) @@ -122,7 +136,7 @@ void sysfs_delete_link(struct kobject *kobj, struct kobject *targ, { const void *ns = NULL; spin_lock(&sysfs_assoc_lock); - if (targ->sd) + if (targ->sd && sysfs_ns_type(kobj->sd)) ns = targ->sd->s_ns; spin_unlock(&sysfs_assoc_lock); sysfs_hash_and_remove(kobj->sd, ns, name); diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c index 02feb59cefc..0b201114a5a 100644 --- a/fs/ubifs/shrinker.c +++ b/fs/ubifs/shrinker.c @@ -277,7 +277,7 @@ static int kick_a_thread(void) return 0; } -int ubifs_shrinker(int nr, gfp_t gfp_mask) +int ubifs_shrinker(struct shrinker *shrink, int nr, gfp_t gfp_mask) { int freed, contention = 0; long clean_zn_cnt = atomic_long_read(&ubifs_clean_zn_cnt); diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 2eef553d50c..04310878f44 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -1575,7 +1575,7 @@ int ubifs_tnc_start_commit(struct ubifs_info *c, struct ubifs_zbranch *zroot); int ubifs_tnc_end_commit(struct ubifs_info *c); /* shrinker.c */ -int ubifs_shrinker(int nr_to_scan, gfp_t gfp_mask); +int ubifs_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask); /* commit.c */ int ubifs_bg_thread(void *info); diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 649ade8ef59..2ee3f7a6016 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -45,7 +45,7 @@ static kmem_zone_t *xfs_buf_zone; STATIC int xfsbufd(void *); -STATIC int xfsbufd_wakeup(int, gfp_t); +STATIC int xfsbufd_wakeup(struct shrinker *, int, gfp_t); STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int); static struct shrinker xfs_buf_shake = { .shrink = xfsbufd_wakeup, @@ -340,7 +340,7 @@ _xfs_buf_lookup_pages( __func__, gfp_mask); XFS_STATS_INC(xb_page_retries); - xfsbufd_wakeup(0, gfp_mask); + xfsbufd_wakeup(NULL, 0, gfp_mask); congestion_wait(BLK_RW_ASYNC, HZ/50); goto retry; } @@ -1762,6 +1762,7 @@ xfs_buf_runall_queues( STATIC int xfsbufd_wakeup( + struct shrinker *shrink, int priority, gfp_t mask) { diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index f2d1718c916..80938c736c2 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -1883,7 +1883,6 @@ init_xfs_fs(void) goto out_cleanup_procfs; vfs_initquota(); - xfs_inode_shrinker_init(); error = register_filesystem(&xfs_fs_type); if (error) @@ -1911,7 +1910,6 @@ exit_xfs_fs(void) { vfs_exitquota(); unregister_filesystem(&xfs_fs_type); - xfs_inode_shrinker_destroy(); xfs_sysctl_unregister(); xfs_cleanup_procfs(); xfs_buf_terminate(); diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index ef7f0218bcc..a51a07c3a70 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -144,6 +144,41 @@ restart: return last_error; } +/* + * Select the next per-ag structure to iterate during the walk. The reclaim + * walk is optimised only to walk AGs with reclaimable inodes in them. + */ +static struct xfs_perag * +xfs_inode_ag_iter_next_pag( + struct xfs_mount *mp, + xfs_agnumber_t *first, + int tag) +{ + struct xfs_perag *pag = NULL; + + if (tag == XFS_ICI_RECLAIM_TAG) { + int found; + int ref; + + spin_lock(&mp->m_perag_lock); + found = radix_tree_gang_lookup_tag(&mp->m_perag_tree, + (void **)&pag, *first, 1, tag); + if (found <= 0) { + spin_unlock(&mp->m_perag_lock); + return NULL; + } + *first = pag->pag_agno + 1; + /* open coded pag reference increment */ + ref = atomic_inc_return(&pag->pag_ref); + spin_unlock(&mp->m_perag_lock); + trace_xfs_perag_get_reclaim(mp, pag->pag_agno, ref, _RET_IP_); + } else { + pag = xfs_perag_get(mp, *first); + (*first)++; + } + return pag; +} + int xfs_inode_ag_iterator( struct xfs_mount *mp, @@ -154,16 +189,15 @@ xfs_inode_ag_iterator( int exclusive, int *nr_to_scan) { + struct xfs_perag *pag; int error = 0; int last_error = 0; xfs_agnumber_t ag; int nr; nr = nr_to_scan ? *nr_to_scan : INT_MAX; - for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) { - struct xfs_perag *pag; - - pag = xfs_perag_get(mp, ag); + ag = 0; + while ((pag = xfs_inode_ag_iter_next_pag(mp, &ag, tag))) { error = xfs_inode_ag_walk(mp, pag, execute, flags, tag, exclusive, &nr); xfs_perag_put(pag); @@ -640,6 +674,17 @@ __xfs_inode_set_reclaim_tag( radix_tree_tag_set(&pag->pag_ici_root, XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), XFS_ICI_RECLAIM_TAG); + + if (!pag->pag_ici_reclaimable) { + /* propagate the reclaim tag up into the perag radix tree */ + spin_lock(&ip->i_mount->m_perag_lock); + radix_tree_tag_set(&ip->i_mount->m_perag_tree, + XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), + XFS_ICI_RECLAIM_TAG); + spin_unlock(&ip->i_mount->m_perag_lock); + trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno, + -1, _RET_IP_); + } pag->pag_ici_reclaimable++; } @@ -674,6 +719,16 @@ __xfs_inode_clear_reclaim_tag( radix_tree_tag_clear(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); pag->pag_ici_reclaimable--; + if (!pag->pag_ici_reclaimable) { + /* clear the reclaim tag from the perag radix tree */ + spin_lock(&ip->i_mount->m_perag_lock); + radix_tree_tag_clear(&ip->i_mount->m_perag_tree, + XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), + XFS_ICI_RECLAIM_TAG); + spin_unlock(&ip->i_mount->m_perag_lock); + trace_xfs_perag_clear_reclaim(ip->i_mount, pag->pag_agno, + -1, _RET_IP_); + } } /* @@ -828,83 +883,52 @@ xfs_reclaim_inodes( /* * Shrinker infrastructure. - * - * This is all far more complex than it needs to be. It adds a global list of - * mounts because the shrinkers can only call a global context. We need to make - * the shrinkers pass a context to avoid the need for global state. */ -static LIST_HEAD(xfs_mount_list); -static struct rw_semaphore xfs_mount_list_lock; - static int xfs_reclaim_inode_shrink( + struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) { struct xfs_mount *mp; struct xfs_perag *pag; xfs_agnumber_t ag; - int reclaimable = 0; + int reclaimable; + mp = container_of(shrink, struct xfs_mount, m_inode_shrink); if (nr_to_scan) { if (!(gfp_mask & __GFP_FS)) return -1; - down_read(&xfs_mount_list_lock); - list_for_each_entry(mp, &xfs_mount_list, m_mplist) { - xfs_inode_ag_iterator(mp, xfs_reclaim_inode, 0, + xfs_inode_ag_iterator(mp, xfs_reclaim_inode, 0, XFS_ICI_RECLAIM_TAG, 1, &nr_to_scan); - if (nr_to_scan <= 0) - break; - } - up_read(&xfs_mount_list_lock); - } + /* if we don't exhaust the scan, don't bother coming back */ + if (nr_to_scan > 0) + return -1; + } - down_read(&xfs_mount_list_lock); - list_for_each_entry(mp, &xfs_mount_list, m_mplist) { - for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) { - pag = xfs_perag_get(mp, ag); - reclaimable += pag->pag_ici_reclaimable; - xfs_perag_put(pag); - } + reclaimable = 0; + ag = 0; + while ((pag = xfs_inode_ag_iter_next_pag(mp, &ag, + XFS_ICI_RECLAIM_TAG))) { + reclaimable += pag->pag_ici_reclaimable; + xfs_perag_put(pag); } - up_read(&xfs_mount_list_lock); return reclaimable; } -static struct shrinker xfs_inode_shrinker = { - .shrink = xfs_reclaim_inode_shrink, - .seeks = DEFAULT_SEEKS, -}; - -void __init -xfs_inode_shrinker_init(void) -{ - init_rwsem(&xfs_mount_list_lock); - register_shrinker(&xfs_inode_shrinker); -} - -void -xfs_inode_shrinker_destroy(void) -{ - ASSERT(list_empty(&xfs_mount_list)); - unregister_shrinker(&xfs_inode_shrinker); -} - void xfs_inode_shrinker_register( struct xfs_mount *mp) { - down_write(&xfs_mount_list_lock); - list_add_tail(&mp->m_mplist, &xfs_mount_list); - up_write(&xfs_mount_list_lock); + mp->m_inode_shrink.shrink = xfs_reclaim_inode_shrink; + mp->m_inode_shrink.seeks = DEFAULT_SEEKS; + register_shrinker(&mp->m_inode_shrink); } void xfs_inode_shrinker_unregister( struct xfs_mount *mp) { - down_write(&xfs_mount_list_lock); - list_del(&mp->m_mplist); - up_write(&xfs_mount_list_lock); + unregister_shrinker(&mp->m_inode_shrink); } diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h index cdcbaaca988..e28139aaa4a 100644 --- a/fs/xfs/linux-2.6/xfs_sync.h +++ b/fs/xfs/linux-2.6/xfs_sync.h @@ -55,8 +55,6 @@ int xfs_inode_ag_iterator(struct xfs_mount *mp, int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags), int flags, int tag, int write_lock, int *nr_to_scan); -void xfs_inode_shrinker_init(void); -void xfs_inode_shrinker_destroy(void); void xfs_inode_shrinker_register(struct xfs_mount *mp); void xfs_inode_shrinker_unregister(struct xfs_mount *mp); diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h index 73d5aa11738..30282069090 100644 --- a/fs/xfs/linux-2.6/xfs_trace.h +++ b/fs/xfs/linux-2.6/xfs_trace.h @@ -124,7 +124,10 @@ DEFINE_EVENT(xfs_perag_class, name, \ unsigned long caller_ip), \ TP_ARGS(mp, agno, refcount, caller_ip)) DEFINE_PERAG_REF_EVENT(xfs_perag_get); +DEFINE_PERAG_REF_EVENT(xfs_perag_get_reclaim); DEFINE_PERAG_REF_EVENT(xfs_perag_put); +DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim); +DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim); TRACE_EVENT(xfs_attr_list_node_descend, TP_PROTO(struct xfs_attr_list_context *ctx, diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c index 8c117ff2e3a..67c018392d6 100644 --- a/fs/xfs/quota/xfs_qm.c +++ b/fs/xfs/quota/xfs_qm.c @@ -69,7 +69,7 @@ STATIC void xfs_qm_list_destroy(xfs_dqlist_t *); STATIC int xfs_qm_init_quotainos(xfs_mount_t *); STATIC int xfs_qm_init_quotainfo(xfs_mount_t *); -STATIC int xfs_qm_shake(int, gfp_t); +STATIC int xfs_qm_shake(struct shrinker *, int, gfp_t); static struct shrinker xfs_qm_shaker = { .shrink = xfs_qm_shake, @@ -2117,7 +2117,10 @@ xfs_qm_shake_freelist( */ /* ARGSUSED */ STATIC int -xfs_qm_shake(int nr_to_scan, gfp_t gfp_mask) +xfs_qm_shake( + struct shrinker *shrink, + int nr_to_scan, + gfp_t gfp_mask) { int ndqused, nfree, n; diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 1d2c7eed4ed..5761087ee8e 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -259,7 +259,7 @@ typedef struct xfs_mount { wait_queue_head_t m_wait_single_sync_task; __int64_t m_update_flags; /* sb flags we need to update on the next remount,rw */ - struct list_head m_mplist; /* inode shrinker mount list */ + struct shrinker m_inode_shrink; /* inode reclaim shrinker */ } xfs_mount_t; /* |