diff options
Diffstat (limited to 'fs')
91 files changed, 1322 insertions, 941 deletions
diff --git a/fs/affs/super.c b/fs/affs/super.c index 6d589f28bf9..895ac7dc9db 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -340,8 +340,6 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent) &blocksize,&sbi->s_prefix, sbi->s_volume, &mount_flags)) { printk(KERN_ERR "AFFS: Error parsing options\n"); - kfree(sbi->s_prefix); - kfree(sbi); return -EINVAL; } /* N.B. after this point s_prefix must be released */ diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c index 1c8c6cc6de3..4b0eff6da67 100644 --- a/fs/afs/cmservice.c +++ b/fs/afs/cmservice.c @@ -130,6 +130,15 @@ static void afs_cm_destructor(struct afs_call *call) { _enter(""); + /* Break the callbacks here so that we do it after the final ACK is + * received. The step number here must match the final number in + * afs_deliver_cb_callback(). + */ + if (call->unmarshall == 6) { + ASSERT(call->server && call->count && call->request); + afs_break_callbacks(call->server, call->count, call->request); + } + afs_put_server(call->server); call->server = NULL; kfree(call->buffer); @@ -272,6 +281,16 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb, _debug("trailer"); if (skb->len != 0) return -EBADMSG; + + /* Record that the message was unmarshalled successfully so + * that the call destructor can know do the callback breaking + * work, even if the final ACK isn't received. + * + * If the step number changes, then afs_cm_destructor() must be + * updated also. + */ + call->unmarshall++; + case 6: break; } diff --git a/fs/afs/internal.h b/fs/afs/internal.h index be75b500005..590b55f46d6 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -75,7 +75,7 @@ struct afs_call { const struct afs_call_type *type; /* type of call */ const struct afs_wait_mode *wait_mode; /* completion wait mode */ wait_queue_head_t waitq; /* processes awaiting completion */ - work_func_t async_workfn; + void (*async_workfn)(struct afs_call *call); /* asynchronous work function */ struct work_struct async_work; /* asynchronous work processor */ struct work_struct work; /* actual work processor */ struct sk_buff_head rx_queue; /* received packets */ diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index ef943df73b8..03a3beb1700 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -25,7 +25,7 @@ static void afs_wake_up_call_waiter(struct afs_call *); static int afs_wait_for_call_to_complete(struct afs_call *); static void afs_wake_up_async_call(struct afs_call *); static int afs_dont_wait_for_call_to_complete(struct afs_call *); -static void afs_process_async_call(struct work_struct *); +static void afs_process_async_call(struct afs_call *); static void afs_rx_interceptor(struct sock *, unsigned long, struct sk_buff *); static int afs_deliver_cm_op_id(struct afs_call *, struct sk_buff *, bool); @@ -58,6 +58,13 @@ static void afs_collect_incoming_call(struct work_struct *); static struct sk_buff_head afs_incoming_calls; static DECLARE_WORK(afs_collect_incoming_call_work, afs_collect_incoming_call); +static void afs_async_workfn(struct work_struct *work) +{ + struct afs_call *call = container_of(work, struct afs_call, async_work); + + call->async_workfn(call); +} + /* * open an RxRPC socket and bind it to be a server for callback notifications * - the socket is left in blocking mode and non-blocking ops use MSG_DONTWAIT @@ -184,6 +191,28 @@ static void afs_free_call(struct afs_call *call) } /* + * End a call but do not free it + */ +static void afs_end_call_nofree(struct afs_call *call) +{ + if (call->rxcall) { + rxrpc_kernel_end_call(call->rxcall); + call->rxcall = NULL; + } + if (call->type->destructor) + call->type->destructor(call); +} + +/* + * End a call and free it + */ +static void afs_end_call(struct afs_call *call) +{ + afs_end_call_nofree(call); + afs_free_call(call); +} + +/* * allocate a call with flat request and reply buffers */ struct afs_call *afs_alloc_flat_call(const struct afs_call_type *type, @@ -326,7 +355,8 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, atomic_read(&afs_outstanding_calls)); call->wait_mode = wait_mode; - INIT_WORK(&call->async_work, afs_process_async_call); + call->async_workfn = afs_process_async_call; + INIT_WORK(&call->async_work, afs_async_workfn); memset(&srx, 0, sizeof(srx)); srx.srx_family = AF_RXRPC; @@ -383,11 +413,8 @@ error_do_abort: rxrpc_kernel_abort_call(rxcall, RX_USER_ABORT); while ((skb = skb_dequeue(&call->rx_queue))) afs_free_skb(skb); - rxrpc_kernel_end_call(rxcall); - call->rxcall = NULL; error_kill_call: - call->type->destructor(call); - afs_free_call(call); + afs_end_call(call); _leave(" = %d", ret); return ret; } @@ -509,12 +536,8 @@ static void afs_deliver_to_call(struct afs_call *call) if (call->state >= AFS_CALL_COMPLETE) { while ((skb = skb_dequeue(&call->rx_queue))) afs_free_skb(skb); - if (call->incoming) { - rxrpc_kernel_end_call(call->rxcall); - call->rxcall = NULL; - call->type->destructor(call); - afs_free_call(call); - } + if (call->incoming) + afs_end_call(call); } _leave(""); @@ -564,10 +587,7 @@ static int afs_wait_for_call_to_complete(struct afs_call *call) } _debug("call complete"); - rxrpc_kernel_end_call(call->rxcall); - call->rxcall = NULL; - call->type->destructor(call); - afs_free_call(call); + afs_end_call(call); _leave(" = %d", ret); return ret; } @@ -603,11 +623,8 @@ static int afs_dont_wait_for_call_to_complete(struct afs_call *call) /* * delete an asynchronous call */ -static void afs_delete_async_call(struct work_struct *work) +static void afs_delete_async_call(struct afs_call *call) { - struct afs_call *call = - container_of(work, struct afs_call, async_work); - _enter(""); afs_free_call(call); @@ -620,11 +637,8 @@ static void afs_delete_async_call(struct work_struct *work) * - on a multiple-thread workqueue this work item may try to run on several * CPUs at the same time */ -static void afs_process_async_call(struct work_struct *work) +static void afs_process_async_call(struct afs_call *call) { - struct afs_call *call = - container_of(work, struct afs_call, async_work); - _enter(""); if (!skb_queue_empty(&call->rx_queue)) @@ -637,10 +651,7 @@ static void afs_process_async_call(struct work_struct *work) call->reply = NULL; /* kill the call */ - rxrpc_kernel_end_call(call->rxcall); - call->rxcall = NULL; - if (call->type->destructor) - call->type->destructor(call); + afs_end_call_nofree(call); /* we can't just delete the call because the work item may be * queued */ @@ -663,13 +674,6 @@ void afs_transfer_reply(struct afs_call *call, struct sk_buff *skb) call->reply_size += len; } -static void afs_async_workfn(struct work_struct *work) -{ - struct afs_call *call = container_of(work, struct afs_call, async_work); - - call->async_workfn(work); -} - /* * accept the backlog of incoming calls */ @@ -790,10 +794,7 @@ void afs_send_empty_reply(struct afs_call *call) _debug("oom"); rxrpc_kernel_abort_call(call->rxcall, RX_USER_ABORT); default: - rxrpc_kernel_end_call(call->rxcall); - call->rxcall = NULL; - call->type->destructor(call); - afs_free_call(call); + afs_end_call(call); _leave(" [error]"); return; } @@ -823,17 +824,16 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len) call->state = AFS_CALL_AWAIT_ACK; n = rxrpc_kernel_send_data(call->rxcall, &msg, len); if (n >= 0) { + /* Success */ _leave(" [replied]"); return; } + if (n == -ENOMEM) { _debug("oom"); rxrpc_kernel_abort_call(call->rxcall, RX_USER_ABORT); } - rxrpc_kernel_end_call(call->rxcall); - call->rxcall = NULL; - call->type->destructor(call); - afs_free_call(call); + afs_end_call(call); _leave(" [error]"); } @@ -112,6 +112,11 @@ struct kioctx { struct work_struct free_work; + /* + * signals when all in-flight requests are done + */ + struct completion *requests_done; + struct { /* * This counts the number of available slots in the ringbuffer, @@ -508,6 +513,10 @@ static void free_ioctx_reqs(struct percpu_ref *ref) { struct kioctx *ctx = container_of(ref, struct kioctx, reqs); + /* At this point we know that there are no any in-flight requests */ + if (ctx->requests_done) + complete(ctx->requests_done); + INIT_WORK(&ctx->free_work, free_ioctx); schedule_work(&ctx->free_work); } @@ -718,7 +727,8 @@ err: * when the processes owning a context have all exited to encourage * the rapid destruction of the kioctx. */ -static void kill_ioctx(struct mm_struct *mm, struct kioctx *ctx) +static void kill_ioctx(struct mm_struct *mm, struct kioctx *ctx, + struct completion *requests_done) { if (!atomic_xchg(&ctx->dead, 1)) { struct kioctx_table *table; @@ -747,7 +757,11 @@ static void kill_ioctx(struct mm_struct *mm, struct kioctx *ctx) if (ctx->mmap_size) vm_munmap(ctx->mmap_base, ctx->mmap_size); + ctx->requests_done = requests_done; percpu_ref_kill(&ctx->users); + } else { + if (requests_done) + complete(requests_done); } } @@ -809,7 +823,7 @@ void exit_aio(struct mm_struct *mm) */ ctx->mmap_size = 0; - kill_ioctx(mm, ctx); + kill_ioctx(mm, ctx, NULL); } } @@ -1185,7 +1199,7 @@ SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp) if (!IS_ERR(ioctx)) { ret = put_user(ioctx->user_id, ctxp); if (ret) - kill_ioctx(current->mm, ioctx); + kill_ioctx(current->mm, ioctx, NULL); percpu_ref_put(&ioctx->users); } @@ -1203,8 +1217,22 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx) { struct kioctx *ioctx = lookup_ioctx(ctx); if (likely(NULL != ioctx)) { - kill_ioctx(current->mm, ioctx); + struct completion requests_done = + COMPLETION_INITIALIZER_ONSTACK(requests_done); + + /* Pass requests_done to kill_ioctx() where it can be set + * in a thread-safe way. If we try to set it here then we have + * a race condition if two io_destroy() called simultaneously. + */ + kill_ioctx(current->mm, ioctx, &requests_done); percpu_ref_put(&ioctx->users); + + /* Wait until all IO for the context are done. Otherwise kernel + * keep using user-space buffers even if user thinks the context + * is destroyed. + */ + wait_for_completion(&requests_done); + return 0; } pr_debug("EINVAL: io_destroy: invalid context id\n"); @@ -1299,10 +1327,8 @@ rw_common: &iovec, compat) : aio_setup_single_vector(req, rw, buf, &nr_segs, iovec); - if (ret) - return ret; - - ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes); + if (!ret) + ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes); if (ret < 0) { if (iovec != &inline_vec) kfree(iovec); diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 2caf36ac3e9..cc87c1abac9 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -179,7 +179,7 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry) spin_lock(&active->d_lock); /* Already gone? */ - if (!d_count(active)) + if ((int) d_count(active) <= 0) goto next; qstr = &active->d_name; @@ -230,7 +230,7 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry) spin_lock(&expiring->d_lock); - /* Bad luck, we've already been dentry_iput */ + /* We've already been dentry_iput or unlinked */ if (!expiring->d_inode) goto next; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 4c48df572bd..ba6b88528dc 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2058,6 +2058,20 @@ struct btrfs_ioctl_defrag_range_args { #define btrfs_raw_test_opt(o, opt) ((o) & BTRFS_MOUNT_##opt) #define btrfs_test_opt(root, opt) ((root)->fs_info->mount_opt & \ BTRFS_MOUNT_##opt) +#define btrfs_set_and_info(root, opt, fmt, args...) \ +{ \ + if (!btrfs_test_opt(root, opt)) \ + btrfs_info(root->fs_info, fmt, ##args); \ + btrfs_set_opt(root->fs_info->mount_opt, opt); \ +} + +#define btrfs_clear_and_info(root, opt, fmt, args...) \ +{ \ + if (btrfs_test_opt(root, opt)) \ + btrfs_info(root->fs_info, fmt, ##args); \ + btrfs_clear_opt(root->fs_info->mount_opt, opt); \ +} + /* * Inode flags */ diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 029d46c2e17..983314932af 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2861,7 +2861,7 @@ retry_root_backup: printk(KERN_ERR "BTRFS: failed to read log tree\n"); free_extent_buffer(log_tree_root->node); kfree(log_tree_root); - goto fail_trans_kthread; + goto fail_qgroup; } /* returns with log_tree_root freed on success */ ret = btrfs_recover_log_trees(log_tree_root); @@ -2870,24 +2870,24 @@ retry_root_backup: "Failed to recover log tree"); free_extent_buffer(log_tree_root->node); kfree(log_tree_root); - goto fail_trans_kthread; + goto fail_qgroup; } if (sb->s_flags & MS_RDONLY) { ret = btrfs_commit_super(tree_root); if (ret) - goto fail_trans_kthread; + goto fail_qgroup; } } ret = btrfs_find_orphan_roots(tree_root); if (ret) - goto fail_trans_kthread; + goto fail_qgroup; if (!(sb->s_flags & MS_RDONLY)) { ret = btrfs_cleanup_fs_roots(fs_info); if (ret) - goto fail_trans_kthread; + goto fail_qgroup; ret = btrfs_recover_relocation(tree_root); if (ret < 0) { diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 1306487c82c..5590af92094 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1542,6 +1542,7 @@ again: ret = 0; } if (ret) { + key.objectid = bytenr; key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = num_bytes; btrfs_release_path(path); @@ -3542,11 +3543,13 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) return extended_to_chunk(flags | tmp); } -static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) +static u64 get_alloc_profile(struct btrfs_root *root, u64 orig_flags) { unsigned seq; + u64 flags; do { + flags = orig_flags; seq = read_seqbegin(&root->fs_info->profiles_lock); if (flags & BTRFS_BLOCK_GROUP_DATA) @@ -5719,6 +5722,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, if (ret > 0 && skinny_metadata) { skinny_metadata = false; + key.objectid = bytenr; key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = num_bytes; btrfs_release_path(path); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index eb742c07e7a..ae6af072b63 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -800,7 +800,7 @@ next_slot: if (start > key.offset && end < extent_end) { BUG_ON(del_nr > 0); if (extent_type == BTRFS_FILE_EXTENT_INLINE) { - ret = -EINVAL; + ret = -EOPNOTSUPP; break; } @@ -846,7 +846,7 @@ next_slot: */ if (start <= key.offset && end < extent_end) { if (extent_type == BTRFS_FILE_EXTENT_INLINE) { - ret = -EINVAL; + ret = -EOPNOTSUPP; break; } @@ -872,7 +872,7 @@ next_slot: if (start > key.offset && end >= extent_end) { BUG_ON(del_nr > 0); if (extent_type == BTRFS_FILE_EXTENT_INLINE) { - ret = -EINVAL; + ret = -EOPNOTSUPP; break; } @@ -1777,7 +1777,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, start_pos = round_down(pos, root->sectorsize); if (start_pos > i_size_read(inode)) { /* Expand hole size to cover write data, preventing empty gap */ - end_pos = round_up(pos + iov->iov_len, root->sectorsize); + end_pos = round_up(pos + count, root->sectorsize); err = btrfs_cont_expand(inode, i_size_read(inode), end_pos); if (err) { mutex_unlock(&inode->i_mutex); diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index cc8ca193d83..86935f5ae29 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -176,7 +176,11 @@ static void start_caching(struct btrfs_root *root) tsk = kthread_run(caching_kthread, root, "btrfs-ino-cache-%llu\n", root->root_key.objectid); - BUG_ON(IS_ERR(tsk)); /* -ENOMEM */ + if (IS_ERR(tsk)) { + btrfs_warn(root->fs_info, "failed to start inode caching task"); + btrfs_clear_and_info(root, CHANGE_INODE_CACHE, + "disabling inode map caching"); + } } int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid) @@ -205,24 +209,14 @@ again: void btrfs_return_ino(struct btrfs_root *root, u64 objectid) { - struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; struct btrfs_free_space_ctl *pinned = root->free_ino_pinned; if (!btrfs_test_opt(root, INODE_MAP_CACHE)) return; - again: if (root->cached == BTRFS_CACHE_FINISHED) { - __btrfs_add_free_space(ctl, objectid, 1); + __btrfs_add_free_space(pinned, objectid, 1); } else { - /* - * If we are in the process of caching free ino chunks, - * to avoid adding the same inode number to the free_ino - * tree twice due to cross transaction, we'll leave it - * in the pinned tree until a transaction is committed - * or the caching work is done. - */ - down_write(&root->fs_info->commit_root_sem); spin_lock(&root->cache_lock); if (root->cached == BTRFS_CACHE_FINISHED) { @@ -234,11 +228,7 @@ again: start_caching(root); - if (objectid <= root->cache_progress || - objectid >= root->highest_objectid) - __btrfs_add_free_space(ctl, objectid, 1); - else - __btrfs_add_free_space(pinned, objectid, 1); + __btrfs_add_free_space(pinned, objectid, 1); up_write(&root->fs_info->commit_root_sem); } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index e79ff6b90cb..2f6d7b13b5b 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3066,7 +3066,7 @@ process_slot: new_key.offset + datal, 1); if (ret) { - if (ret != -EINVAL) + if (ret != -EOPNOTSUPP) btrfs_abort_transaction(trans, root, ret); btrfs_end_transaction(trans, root); @@ -3120,6 +3120,8 @@ process_slot: } else if (type == BTRFS_FILE_EXTENT_INLINE) { u64 skip = 0; u64 trim = 0; + u64 aligned_end = 0; + if (off > key.offset) { skip = off - key.offset; new_key.offset += skip; @@ -3136,12 +3138,14 @@ process_slot: size -= skip + trim; datal -= skip + trim; + aligned_end = ALIGN(new_key.offset + datal, + root->sectorsize); ret = btrfs_drop_extents(trans, root, inode, new_key.offset, - new_key.offset + datal, + aligned_end, 1); if (ret) { - if (ret != -EINVAL) + if (ret != -EOPNOTSUPP) btrfs_abort_transaction(trans, root, ret); btrfs_end_transaction(trans, root); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 1ac3ca98c42..fd38b505347 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -349,6 +349,11 @@ static int fs_path_ensure_buf(struct fs_path *p, int len) if (p->buf_len >= len) return 0; + if (len > PATH_MAX) { + WARN_ON(1); + return -ENOMEM; + } + path_len = p->end - p->start; old_buf_len = p->buf_len; @@ -1663,7 +1668,7 @@ static int get_first_ref(struct btrfs_root *root, u64 ino, goto out; } - if (key.type == BTRFS_INODE_REF_KEY) { + if (found_key.type == BTRFS_INODE_REF_KEY) { struct btrfs_inode_ref *iref; iref = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_ref); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 5011aadacab..9601d25a460 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -385,20 +385,6 @@ static match_table_t tokens = { {Opt_err, NULL}, }; -#define btrfs_set_and_info(root, opt, fmt, args...) \ -{ \ - if (!btrfs_test_opt(root, opt)) \ - btrfs_info(root->fs_info, fmt, ##args); \ - btrfs_set_opt(root->fs_info->mount_opt, opt); \ -} - -#define btrfs_clear_and_info(root, opt, fmt, args...) \ -{ \ - if (btrfs_test_opt(root, opt)) \ - btrfs_info(root->fs_info, fmt, ##args); \ - btrfs_clear_opt(root->fs_info->mount_opt, opt); \ -} - /* * Regular mount options parser. Everything that is needed only when * reading in a new superblock is parsed here. @@ -1186,7 +1172,6 @@ static struct dentry *mount_subvol(const char *subvol_name, int flags, return ERR_PTR(-ENOMEM); mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name, newargs); - kfree(newargs); if (PTR_RET(mnt) == -EBUSY) { if (flags & MS_RDONLY) { @@ -1196,17 +1181,22 @@ static struct dentry *mount_subvol(const char *subvol_name, int flags, int r; mnt = vfs_kern_mount(&btrfs_fs_type, flags | MS_RDONLY, device_name, newargs); - if (IS_ERR(mnt)) + if (IS_ERR(mnt)) { + kfree(newargs); return ERR_CAST(mnt); + } r = btrfs_remount(mnt->mnt_sb, &flags, NULL); if (r < 0) { /* FIXME: release vfsmount mnt ??*/ + kfree(newargs); return ERR_PTR(r); } } } + kfree(newargs); + if (IS_ERR(mnt)) return ERR_CAST(mnt); diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 2e5e648eb5c..c561b628ebc 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -3261,7 +3261,7 @@ int ceph_encode_inode_release(void **p, struct inode *inode, rel->seq = cpu_to_le32(cap->seq); rel->issue_seq = cpu_to_le32(cap->issue_seq), rel->mseq = cpu_to_le32(cap->mseq); - rel->caps = cpu_to_le32(cap->issued); + rel->caps = cpu_to_le32(cap->implemented); rel->wanted = cpu_to_le32(cap->mds_wanted); rel->dname_len = 0; rel->dname_seq = 0; diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 766410a12c2..c29d6ae6887 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -141,7 +141,7 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx, /* start at beginning? */ if (ctx->pos == 2 || last == NULL || - ctx->pos < ceph_dentry(last)->offset) { + fpos_cmp(ctx->pos, ceph_dentry(last)->offset) < 0) { if (list_empty(&parent->d_subdirs)) goto out_unlock; p = parent->d_subdirs.prev; @@ -182,9 +182,16 @@ more: spin_unlock(&dentry->d_lock); spin_unlock(&parent->d_lock); + /* make sure a dentry wasn't dropped while we didn't have parent lock */ + if (!ceph_dir_is_complete(dir)) { + dout(" lost dir complete on %p; falling back to mds\n", dir); + dput(dentry); + err = -EAGAIN; + goto out; + } + dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, ctx->pos, dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode); - ctx->pos = di->offset; if (!dir_emit(ctx, dentry->d_name.name, dentry->d_name.len, ceph_translate_ino(dentry->d_sb, dentry->d_inode->i_ino), @@ -198,19 +205,12 @@ more: return 0; } + ctx->pos = di->offset + 1; + if (last) dput(last); last = dentry; - ctx->pos++; - - /* make sure a dentry wasn't dropped while we didn't have parent lock */ - if (!ceph_dir_is_complete(dir)) { - dout(" lost dir complete on %p; falling back to mds\n", dir); - err = -EAGAIN; - goto out; - } - spin_lock(&parent->d_lock); p = p->prev; /* advance to next dentry */ goto more; @@ -296,6 +296,8 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) err = __dcache_readdir(file, ctx, shared_gen); if (err != -EAGAIN) return err; + frag = fpos_frag(ctx->pos); + off = fpos_off(ctx->pos); } else { spin_unlock(&ci->i_ceph_lock); } @@ -446,7 +448,6 @@ more: if (atomic_read(&ci->i_release_count) == fi->dir_release_count) { dout(" marking %p complete\n", inode); __ceph_dir_set_complete(ci, fi->dir_release_count); - ci->i_max_offset = ctx->pos; } spin_unlock(&ci->i_ceph_lock); @@ -935,14 +936,16 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, * to do it here. */ - /* d_move screws up d_subdirs order */ - ceph_dir_clear_complete(new_dir); - d_move(old_dentry, new_dentry); /* ensure target dentry is invalidated, despite rehashing bug in vfs_rename_dir */ ceph_invalidate_dentry_lease(new_dentry); + + /* d_move screws up sibling dentries' offsets */ + ceph_dir_clear_complete(old_dir); + ceph_dir_clear_complete(new_dir); + } ceph_mdsc_put_request(req); return err; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 39da1c2efa5..88a6df4cbe6 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1221,9 +1221,6 @@ static long ceph_fallocate(struct file *file, int mode, if (!S_ISREG(inode->i_mode)) return -EOPNOTSUPP; - if (IS_SWAPFILE(inode)) - return -ETXTBSY; - mutex_lock(&inode->i_mutex); if (ceph_snap(inode) != CEPH_NOSNAP) { diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 0b0728e5be2..233c6f96910 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -744,7 +744,6 @@ static int fill_inode(struct inode *inode, !__ceph_dir_is_complete(ci)) { dout(" marking %p complete (empty)\n", inode); __ceph_dir_set_complete(ci, atomic_read(&ci->i_release_count)); - ci->i_max_offset = 2; } no_change: /* only update max_size on auth cap */ @@ -890,41 +889,6 @@ out_unlock: } /* - * Set dentry's directory position based on the current dir's max, and - * order it in d_subdirs, so that dcache_readdir behaves. - * - * Always called under directory's i_mutex. - */ -static void ceph_set_dentry_offset(struct dentry *dn) -{ - struct dentry *dir = dn->d_parent; - struct inode *inode = dir->d_inode; - struct ceph_inode_info *ci; - struct ceph_dentry_info *di; - - BUG_ON(!inode); - - ci = ceph_inode(inode); - di = ceph_dentry(dn); - - spin_lock(&ci->i_ceph_lock); - if (!__ceph_dir_is_complete(ci)) { - spin_unlock(&ci->i_ceph_lock); - return; - } - di->offset = ceph_inode(inode)->i_max_offset++; - spin_unlock(&ci->i_ceph_lock); - - spin_lock(&dir->d_lock); - spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED); - list_move(&dn->d_u.d_child, &dir->d_subdirs); - dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset, - dn->d_u.d_child.prev, dn->d_u.d_child.next); - spin_unlock(&dn->d_lock); - spin_unlock(&dir->d_lock); -} - -/* * splice a dentry to an inode. * caller must hold directory i_mutex for this to be safe. * @@ -933,7 +897,7 @@ static void ceph_set_dentry_offset(struct dentry *dn) * the caller) if we fail. */ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in, - bool *prehash, bool set_offset) + bool *prehash) { struct dentry *realdn; @@ -965,8 +929,6 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in, } if ((!prehash || *prehash) && d_unhashed(dn)) d_rehash(dn); - if (set_offset) - ceph_set_dentry_offset(dn); out: return dn; } @@ -987,7 +949,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, { struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; struct inode *in = NULL; - struct ceph_mds_reply_inode *ininfo; struct ceph_vino vino; struct ceph_fs_client *fsc = ceph_sb_to_client(sb); int err = 0; @@ -1161,6 +1122,9 @@ retry_lookup: /* rename? */ if (req->r_old_dentry && req->r_op == CEPH_MDS_OP_RENAME) { + struct inode *olddir = req->r_old_dentry_dir; + BUG_ON(!olddir); + dout(" src %p '%.*s' dst %p '%.*s'\n", req->r_old_dentry, req->r_old_dentry->d_name.len, @@ -1180,13 +1144,10 @@ retry_lookup: rehashing bug in vfs_rename_dir */ ceph_invalidate_dentry_lease(dn); - /* - * d_move() puts the renamed dentry at the end of - * d_subdirs. We need to assign it an appropriate - * directory offset so we can behave when dir is - * complete. - */ - ceph_set_dentry_offset(req->r_old_dentry); + /* d_move screws up sibling dentries' offsets */ + ceph_dir_clear_complete(dir); + ceph_dir_clear_complete(olddir); + dout("dn %p gets new offset %lld\n", req->r_old_dentry, ceph_dentry(req->r_old_dentry)->offset); @@ -1213,8 +1174,9 @@ retry_lookup: /* attach proper inode */ if (!dn->d_inode) { + ceph_dir_clear_complete(dir); ihold(in); - dn = splice_dentry(dn, in, &have_lease, true); + dn = splice_dentry(dn, in, &have_lease); if (IS_ERR(dn)) { err = PTR_ERR(dn); goto done; @@ -1235,17 +1197,16 @@ retry_lookup: (req->r_op == CEPH_MDS_OP_LOOKUPSNAP || req->r_op == CEPH_MDS_OP_MKSNAP)) { struct dentry *dn = req->r_dentry; + struct inode *dir = req->r_locked_dir; /* fill out a snapdir LOOKUPSNAP dentry */ BUG_ON(!dn); - BUG_ON(!req->r_locked_dir); - BUG_ON(ceph_snap(req->r_locked_dir) != CEPH_SNAPDIR); - ininfo = rinfo->targeti.in; - vino.ino = le64_to_cpu(ininfo->ino); - vino.snap = le64_to_cpu(ininfo->snapid); + BUG_ON(!dir); + BUG_ON(ceph_snap(dir) != CEPH_SNAPDIR); dout(" linking snapped dir %p to dn %p\n", in, dn); + ceph_dir_clear_complete(dir); ihold(in); - dn = splice_dentry(dn, in, NULL, true); + dn = splice_dentry(dn, in, NULL); if (IS_ERR(dn)) { err = PTR_ERR(dn); goto done; @@ -1407,7 +1368,7 @@ retry_lookup: } if (!dn->d_inode) { - dn = splice_dentry(dn, in, NULL, false); + dn = splice_dentry(dn, in, NULL); if (IS_ERR(dn)) { err = PTR_ERR(dn); dn = NULL; diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index fdf941b44ff..a822a6e5829 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -109,6 +109,8 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg) return PTR_ERR(req); req->r_inode = inode; ihold(inode); + req->r_num_caps = 1; + req->r_inode_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL; req->r_args.setlayout.layout.fl_stripe_unit = @@ -153,6 +155,7 @@ static long ceph_ioctl_set_layout_policy (struct file *file, void __user *arg) return PTR_ERR(req); req->r_inode = inode; ihold(inode); + req->r_num_caps = 1; req->r_args.setlayout.layout.fl_stripe_unit = cpu_to_le32(l.stripe_unit); diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index d94ba0df9f4..191398852a2 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -45,6 +45,7 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file, return PTR_ERR(req); req->r_inode = inode; ihold(inode); + req->r_num_caps = 1; /* mds requires start and length rather than start and end */ if (LLONG_MAX == fl->fl_end) diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 7866cd05a6b..ead05cc1f44 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -266,7 +266,6 @@ struct ceph_inode_info { struct timespec i_rctime; u64 i_rbytes, i_rfiles, i_rsubdirs; u64 i_files, i_subdirs; - u64 i_max_offset; /* largest readdir offset, set with complete dir */ struct rb_root i_fragtree; struct mutex i_fragtree_mutex; diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index df9c9141c09..5be1f997ecd 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -253,6 +253,11 @@ cifs_alloc_inode(struct super_block *sb) cifs_set_oplock_level(cifs_inode, 0); cifs_inode->delete_pending = false; cifs_inode->invalid_mapping = false; + clear_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &cifs_inode->flags); + clear_bit(CIFS_INODE_PENDING_WRITERS, &cifs_inode->flags); + clear_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, &cifs_inode->flags); + spin_lock_init(&cifs_inode->writers_lock); + cifs_inode->writers = 0; cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */ cifs_inode->server_eof = 0; cifs_inode->uniqueid = 0; @@ -732,19 +737,26 @@ static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { struct inode *inode = file_inode(iocb->ki_filp); + struct cifsInodeInfo *cinode = CIFS_I(inode); ssize_t written; int rc; + written = cifs_get_writer(cinode); + if (written) + return written; + written = generic_file_aio_write(iocb, iov, nr_segs, pos); if (CIFS_CACHE_WRITE(CIFS_I(inode))) - return written; + goto out; rc = filemap_fdatawrite(inode->i_mapping); if (rc) cifs_dbg(FYI, "cifs_file_aio_write: %d rc on %p inode\n", rc, inode); +out: + cifs_put_writer(cinode); return written; } diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index c0f3718b77a..30f6e9251a4 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -228,6 +228,8 @@ struct smb_version_operations { /* verify the message */ int (*check_message)(char *, unsigned int); bool (*is_oplock_break)(char *, struct TCP_Server_Info *); + void (*downgrade_oplock)(struct TCP_Server_Info *, + struct cifsInodeInfo *, bool); /* process transaction2 response */ bool (*check_trans2)(struct mid_q_entry *, struct TCP_Server_Info *, char *, int); @@ -1113,6 +1115,12 @@ struct cifsInodeInfo { unsigned int epoch; /* used to track lease state changes */ bool delete_pending; /* DELETE_ON_CLOSE is set */ bool invalid_mapping; /* pagecache is invalid */ + unsigned long flags; +#define CIFS_INODE_PENDING_OPLOCK_BREAK (0) /* oplock break in progress */ +#define CIFS_INODE_PENDING_WRITERS (1) /* Writes in progress */ +#define CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2 (2) /* Downgrade oplock to L2 */ + spinlock_t writers_lock; + unsigned int writers; /* Number of writers on this inode */ unsigned long time; /* jiffies of last update of inode */ u64 server_eof; /* current file size on server -- protected by i_lock */ u64 uniqueid; /* server inode number */ diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index acc4ee8ed07..ca7980a1e30 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -127,6 +127,9 @@ extern u64 cifs_UnixTimeToNT(struct timespec); extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, int offset); extern void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock); +extern int cifs_get_writer(struct cifsInodeInfo *cinode); +extern void cifs_put_writer(struct cifsInodeInfo *cinode); +extern void cifs_done_oplock_break(struct cifsInodeInfo *cinode); extern int cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, const unsigned int xid); extern int cifs_push_mandatory_locks(struct cifsFileInfo *cfile); diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index f3264bd7a83..6ce4e0954b9 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -6197,6 +6197,9 @@ QAllEAsRetry: cifs_dbg(FYI, "ea length %d\n", list_len); if (list_len <= 8) { cifs_dbg(FYI, "empty EA list returned from server\n"); + /* didn't find the named attribute */ + if (ea_name) + rc = -ENODATA; goto QAllEAsOut; } diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 8add25538a3..5ed03e0b8b4 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2599,7 +2599,7 @@ cifs_writev(struct kiocb *iocb, const struct iovec *iov, ssize_t err; err = generic_write_sync(file, iocb->ki_pos - rc, rc); - if (rc < 0) + if (err < 0) rc = err; } } else { @@ -2621,12 +2621,20 @@ cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov, struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); ssize_t written; + written = cifs_get_writer(cinode); + if (written) + return written; + if (CIFS_CACHE_WRITE(cinode)) { if (cap_unix(tcon->ses) && (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) - && ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) - return generic_file_aio_write(iocb, iov, nr_segs, pos); - return cifs_writev(iocb, iov, nr_segs, pos); + && ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) { + written = generic_file_aio_write( + iocb, iov, nr_segs, pos); + goto out; + } + written = cifs_writev(iocb, iov, nr_segs, pos); + goto out; } /* * For non-oplocked files in strict cache mode we need to write the data @@ -2646,6 +2654,8 @@ cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov, inode); cinode->oplock = 0; } +out: + cifs_put_writer(cinode); return written; } @@ -2872,7 +2882,7 @@ ssize_t cifs_user_readv(struct kiocb *iocb, const struct iovec *iov, cifs_uncached_readv_complete); if (!rdata) { rc = -ENOMEM; - goto error; + break; } rc = cifs_read_allocate_pages(rdata, npages); @@ -3621,6 +3631,13 @@ static int cifs_launder_page(struct page *page) return rc; } +static int +cifs_pending_writers_wait(void *unused) +{ + schedule(); + return 0; +} + void cifs_oplock_break(struct work_struct *work) { struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo, @@ -3628,8 +3645,15 @@ void cifs_oplock_break(struct work_struct *work) struct inode *inode = cfile->dentry->d_inode; struct cifsInodeInfo *cinode = CIFS_I(inode); struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); + struct TCP_Server_Info *server = tcon->ses->server; int rc = 0; + wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS, + cifs_pending_writers_wait, TASK_UNINTERRUPTIBLE); + + server->ops->downgrade_oplock(server, cinode, + test_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, &cinode->flags)); + if (!CIFS_CACHE_WRITE(cinode) && CIFS_CACHE_READ(cinode) && cifs_has_mand_locks(cinode)) { cifs_dbg(FYI, "Reset oplock to None for inode=%p due to mand locks\n", @@ -3666,6 +3690,7 @@ void cifs_oplock_break(struct work_struct *work) cinode); cifs_dbg(FYI, "Oplock release rc = %d\n", rc); } + cifs_done_oplock_break(cinode); } /* diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index aadc2b68678..a22d667f106 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1737,6 +1737,9 @@ cifs_inode_needs_reval(struct inode *inode) if (cifs_i->time == 0) return true; + if (!cifs_sb->actimeo) + return true; + if (!time_in_range(jiffies, cifs_i->time, cifs_i->time + cifs_sb->actimeo)) return true; diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 2f9f3790679..3b0c62e622d 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -466,8 +466,22 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv) cifs_dbg(FYI, "file id match, oplock break\n"); pCifsInode = CIFS_I(netfile->dentry->d_inode); - cifs_set_oplock_level(pCifsInode, - pSMB->OplockLevel ? OPLOCK_READ : 0); + set_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, + &pCifsInode->flags); + + /* + * Set flag if the server downgrades the oplock + * to L2 else clear. + */ + if (pSMB->OplockLevel) + set_bit( + CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, + &pCifsInode->flags); + else + clear_bit( + CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, + &pCifsInode->flags); + queue_work(cifsiod_wq, &netfile->oplock_break); netfile->oplock_break_cancelled = false; @@ -551,6 +565,62 @@ void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock) cinode->oplock = 0; } +static int +cifs_oplock_break_wait(void *unused) +{ + schedule(); + return signal_pending(current) ? -ERESTARTSYS : 0; +} + +/* + * We wait for oplock breaks to be processed before we attempt to perform + * writes. + */ +int cifs_get_writer(struct cifsInodeInfo *cinode) +{ + int rc; + +start: + rc = wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_OPLOCK_BREAK, + cifs_oplock_break_wait, TASK_KILLABLE); + if (rc) + return rc; + + spin_lock(&cinode->writers_lock); + if (!cinode->writers) + set_bit(CIFS_INODE_PENDING_WRITERS, &cinode->flags); + cinode->writers++; + /* Check to see if we have started servicing an oplock break */ + if (test_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &cinode->flags)) { + cinode->writers--; + if (cinode->writers == 0) { + clear_bit(CIFS_INODE_PENDING_WRITERS, &cinode->flags); + wake_up_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS); + } + spin_unlock(&cinode->writers_lock); + goto start; + } + spin_unlock(&cinode->writers_lock); + return 0; +} + +void cifs_put_writer(struct cifsInodeInfo *cinode) +{ + spin_lock(&cinode->writers_lock); + cinode->writers--; + if (cinode->writers == 0) { + clear_bit(CIFS_INODE_PENDING_WRITERS, &cinode->flags); + wake_up_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS); + } + spin_unlock(&cinode->writers_lock); +} + +void cifs_done_oplock_break(struct cifsInodeInfo *cinode) +{ + clear_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &cinode->flags); + wake_up_bit(&cinode->flags, CIFS_INODE_PENDING_OPLOCK_BREAK); +} + bool backup_cred(struct cifs_sb_info *cifs_sb) { diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index 526fb89f923..d1fdfa84870 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -372,6 +372,16 @@ coalesce_t2(char *second_buf, struct smb_hdr *target_hdr) return 0; } +static void +cifs_downgrade_oplock(struct TCP_Server_Info *server, + struct cifsInodeInfo *cinode, bool set_level2) +{ + if (set_level2) + cifs_set_oplock_level(cinode, OPLOCK_READ); + else + cifs_set_oplock_level(cinode, 0); +} + static bool cifs_check_trans2(struct mid_q_entry *mid, struct TCP_Server_Info *server, char *buf, int malformed) @@ -1019,6 +1029,7 @@ struct smb_version_operations smb1_operations = { .clear_stats = cifs_clear_stats, .print_stats = cifs_print_stats, .is_oplock_break = is_valid_oplock_break, + .downgrade_oplock = cifs_downgrade_oplock, .check_trans2 = cifs_check_trans2, .need_neg = cifs_need_neg, .negotiate = cifs_negotiate, diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index fb3966265b6..b8021fde987 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -575,9 +575,21 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) else cfile->oplock_break_cancelled = false; - server->ops->set_oplock_level(cinode, - rsp->OplockLevel ? SMB2_OPLOCK_LEVEL_II : 0, - 0, NULL); + set_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, + &cinode->flags); + + /* + * Set flag if the server downgrades the oplock + * to L2 else clear. + */ + if (rsp->OplockLevel) + set_bit( + CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, + &cinode->flags); + else + clear_bit( + CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, + &cinode->flags); queue_work(cifsiod_wq, &cfile->oplock_break); diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 192f51a12cf..35ddc3ed119 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -905,6 +905,17 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, } static void +smb2_downgrade_oplock(struct TCP_Server_Info *server, + struct cifsInodeInfo *cinode, bool set_level2) +{ + if (set_level2) + server->ops->set_oplock_level(cinode, SMB2_OPLOCK_LEVEL_II, + 0, NULL); + else + server->ops->set_oplock_level(cinode, 0, 0, NULL); +} + +static void smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock, unsigned int epoch, bool *purge_cache) { @@ -1110,6 +1121,7 @@ struct smb_version_operations smb20_operations = { .clear_stats = smb2_clear_stats, .print_stats = smb2_print_stats, .is_oplock_break = smb2_is_valid_oplock_break, + .downgrade_oplock = smb2_downgrade_oplock, .need_neg = smb2_need_neg, .negotiate = smb2_negotiate, .negotiate_wsize = smb2_negotiate_wsize, @@ -1184,6 +1196,7 @@ struct smb_version_operations smb21_operations = { .clear_stats = smb2_clear_stats, .print_stats = smb2_print_stats, .is_oplock_break = smb2_is_valid_oplock_break, + .downgrade_oplock = smb2_downgrade_oplock, .need_neg = smb2_need_neg, .negotiate = smb2_negotiate, .negotiate_wsize = smb2_negotiate_wsize, @@ -1259,6 +1272,7 @@ struct smb_version_operations smb30_operations = { .print_stats = smb2_print_stats, .dump_share_caps = smb2_dump_share_caps, .is_oplock_break = smb2_is_valid_oplock_break, + .downgrade_oplock = smb2_downgrade_oplock, .need_neg = smb2_need_neg, .negotiate = smb2_negotiate, .negotiate_wsize = smb2_negotiate_wsize, diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 86034470106..3802f8c94ac 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -1352,7 +1352,6 @@ SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid) { int rc; - char *res_key = NULL; struct compress_ioctl fsctl_input; char *ret_data = NULL; @@ -1365,7 +1364,6 @@ SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon, 2 /* in data len */, &ret_data /* out data */, NULL); cifs_dbg(FYI, "set compression rc %d\n", rc); - kfree(res_key); return rc; } diff --git a/fs/compat.c b/fs/compat.c index ca926ad0430..66d3d3c6b4b 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -457,9 +457,9 @@ COMPAT_SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, case F_GETLK64: case F_SETLK64: case F_SETLKW64: - case F_GETLKP: - case F_SETLKP: - case F_SETLKPW: + case F_OFD_GETLK: + case F_OFD_SETLK: + case F_OFD_SETLKW: ret = get_compat_flock64(&f, compat_ptr(arg)); if (ret != 0) break; @@ -468,7 +468,7 @@ COMPAT_SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, conv_cmd = convert_fcntl_cmd(cmd); ret = sys_fcntl(fd, conv_cmd, (unsigned long)&f); set_fs(old_fs); - if ((conv_cmd == F_GETLK || conv_cmd == F_GETLKP) && ret == 0) { + if ((conv_cmd == F_GETLK || conv_cmd == F_OFD_GETLK) && ret == 0) { /* need to return lock information - see above for commentary */ if (f.l_start > COMPAT_LOFF_T_MAX) ret = -EOVERFLOW; @@ -493,9 +493,9 @@ COMPAT_SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, case F_GETLK64: case F_SETLK64: case F_SETLKW64: - case F_GETLKP: - case F_SETLKP: - case F_SETLKPW: + case F_OFD_GETLK: + case F_OFD_SETLK: + case F_OFD_SETLKW: return -EINVAL; } return compat_sys_fcntl64(fd, cmd, arg); diff --git a/fs/coredump.c b/fs/coredump.c index e3ad709a423..0b2528fb640 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -73,10 +73,15 @@ static int expand_corename(struct core_name *cn, int size) static int cn_vprintf(struct core_name *cn, const char *fmt, va_list arg) { int free, need; + va_list arg_copy; again: free = cn->size - cn->used; - need = vsnprintf(cn->corename + cn->used, free, fmt, arg); + + va_copy(arg_copy, arg); + need = vsnprintf(cn->corename + cn->used, free, fmt, arg_copy); + va_end(arg_copy); + if (need < free) { cn->used += need; return 0; diff --git a/fs/dcache.c b/fs/dcache.c index 40707d88a94..be2bea834bf 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -246,16 +246,8 @@ static void __d_free(struct rcu_head *head) kmem_cache_free(dentry_cache, dentry); } -/* - * no locks, please. - */ -static void d_free(struct dentry *dentry) +static void dentry_free(struct dentry *dentry) { - BUG_ON((int)dentry->d_lockref.count > 0); - this_cpu_dec(nr_dentry); - if (dentry->d_op && dentry->d_op->d_release) - dentry->d_op->d_release(dentry); - /* if dentry was never visible to RCU, immediate free is OK */ if (!(dentry->d_flags & DCACHE_RCUACCESS)) __d_free(&dentry->d_u.d_rcu); @@ -403,56 +395,6 @@ static void dentry_lru_add(struct dentry *dentry) d_lru_add(dentry); } -/* - * Remove a dentry with references from the LRU. - * - * If we are on the shrink list, then we can get to try_prune_one_dentry() and - * lose our last reference through the parent walk. In this case, we need to - * remove ourselves from the shrink list, not the LRU. - */ -static void dentry_lru_del(struct dentry *dentry) -{ - if (dentry->d_flags & DCACHE_LRU_LIST) { - if (dentry->d_flags & DCACHE_SHRINK_LIST) - return d_shrink_del(dentry); - d_lru_del(dentry); - } -} - -/** - * d_kill - kill dentry and return parent - * @dentry: dentry to kill - * @parent: parent dentry - * - * The dentry must already be unhashed and removed from the LRU. - * - * If this is the root of the dentry tree, return NULL. - * - * dentry->d_lock and parent->d_lock must be held by caller, and are dropped by - * d_kill. - */ -static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent) - __releases(dentry->d_lock) - __releases(parent->d_lock) - __releases(dentry->d_inode->i_lock) -{ - list_del(&dentry->d_u.d_child); - /* - * Inform d_walk() that we are no longer attached to the - * dentry tree - */ - dentry->d_flags |= DCACHE_DENTRY_KILLED; - if (parent) - spin_unlock(&parent->d_lock); - dentry_iput(dentry); - /* - * dentry_iput drops the locks, at which point nobody (except - * transient RCU lookups) can reach this dentry. - */ - d_free(dentry); - return parent; -} - /** * d_drop - drop a dentry * @dentry: dentry to drop @@ -499,37 +441,12 @@ void d_drop(struct dentry *dentry) } EXPORT_SYMBOL(d_drop); -/* - * Finish off a dentry we've decided to kill. - * dentry->d_lock must be held, returns with it unlocked. - * If ref is non-zero, then decrement the refcount too. - * Returns dentry requiring refcount drop, or NULL if we're done. - */ -static struct dentry * -dentry_kill(struct dentry *dentry, int unlock_on_failure) - __releases(dentry->d_lock) +static void __dentry_kill(struct dentry *dentry) { - struct inode *inode; - struct dentry *parent; - - inode = dentry->d_inode; - if (inode && !spin_trylock(&inode->i_lock)) { -relock: - if (unlock_on_failure) { - spin_unlock(&dentry->d_lock); - cpu_relax(); - } - return dentry; /* try again with same dentry */ - } - if (IS_ROOT(dentry)) - parent = NULL; - else + struct dentry *parent = NULL; + bool can_free = true; + if (!IS_ROOT(dentry)) parent = dentry->d_parent; - if (parent && !spin_trylock(&parent->d_lock)) { - if (inode) - spin_unlock(&inode->i_lock); - goto relock; - } /* * The dentry is now unrecoverably dead to the world. @@ -543,10 +460,103 @@ relock: if ((dentry->d_flags & DCACHE_OP_PRUNE) && !d_unhashed(dentry)) dentry->d_op->d_prune(dentry); - dentry_lru_del(dentry); + if (dentry->d_flags & DCACHE_LRU_LIST) { + if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) + d_lru_del(dentry); + } /* if it was on the hash then remove it */ __d_drop(dentry); - return d_kill(dentry, parent); + list_del(&dentry->d_u.d_child); + /* + * Inform d_walk() that we are no longer attached to the + * dentry tree + */ + dentry->d_flags |= DCACHE_DENTRY_KILLED; + if (parent) + spin_unlock(&parent->d_lock); + dentry_iput(dentry); + /* + * dentry_iput drops the locks, at which point nobody (except + * transient RCU lookups) can reach this dentry. + */ + BUG_ON((int)dentry->d_lockref.count > 0); + this_cpu_dec(nr_dentry); + if (dentry->d_op && dentry->d_op->d_release) + dentry->d_op->d_release(dentry); + + spin_lock(&dentry->d_lock); + if (dentry->d_flags & DCACHE_SHRINK_LIST) { + dentry->d_flags |= DCACHE_MAY_FREE; + can_free = false; + } + spin_unlock(&dentry->d_lock); + if (likely(can_free)) + dentry_free(dentry); +} + +/* + * Finish off a dentry we've decided to kill. + * dentry->d_lock must be held, returns with it unlocked. + * If ref is non-zero, then decrement the refcount too. + * Returns dentry requiring refcount drop, or NULL if we're done. + */ +static struct dentry *dentry_kill(struct dentry *dentry) + __releases(dentry->d_lock) +{ + struct inode *inode = dentry->d_inode; + struct dentry *parent = NULL; + + if (inode && unlikely(!spin_trylock(&inode->i_lock))) + goto failed; + + if (!IS_ROOT(dentry)) { + parent = dentry->d_parent; + if (unlikely(!spin_trylock(&parent->d_lock))) { + if (inode) + spin_unlock(&inode->i_lock); + goto failed; + } + } + + __dentry_kill(dentry); + return parent; + +failed: + spin_unlock(&dentry->d_lock); + cpu_relax(); + return dentry; /* try again with same dentry */ +} + +static inline struct dentry *lock_parent(struct dentry *dentry) +{ + struct dentry *parent = dentry->d_parent; + if (IS_ROOT(dentry)) + return NULL; + if (likely(spin_trylock(&parent->d_lock))) + return parent; + spin_unlock(&dentry->d_lock); + rcu_read_lock(); +again: + parent = ACCESS_ONCE(dentry->d_parent); + spin_lock(&parent->d_lock); + /* + * We can't blindly lock dentry until we are sure + * that we won't violate the locking order. + * Any changes of dentry->d_parent must have + * been done with parent->d_lock held, so + * spin_lock() above is enough of a barrier + * for checking if it's still our child. + */ + if (unlikely(parent != dentry->d_parent)) { + spin_unlock(&parent->d_lock); + goto again; + } + rcu_read_unlock(); + if (parent != dentry) + spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); + else + parent = NULL; + return parent; } /* @@ -602,7 +612,7 @@ repeat: return; kill_it: - dentry = dentry_kill(dentry, 1); + dentry = dentry_kill(dentry); if (dentry) goto repeat; } @@ -815,64 +825,15 @@ restart: } EXPORT_SYMBOL(d_prune_aliases); -/* - * Try to throw away a dentry - free the inode, dput the parent. - * Requires dentry->d_lock is held, and dentry->d_count == 0. - * Releases dentry->d_lock. - * - * This may fail if locks cannot be acquired no problem, just try again. - */ -static struct dentry * try_prune_one_dentry(struct dentry *dentry) - __releases(dentry->d_lock) -{ - struct dentry *parent; - - parent = dentry_kill(dentry, 0); - /* - * If dentry_kill returns NULL, we have nothing more to do. - * if it returns the same dentry, trylocks failed. In either - * case, just loop again. - * - * Otherwise, we need to prune ancestors too. This is necessary - * to prevent quadratic behavior of shrink_dcache_parent(), but - * is also expected to be beneficial in reducing dentry cache - * fragmentation. - */ - if (!parent) - return NULL; - if (parent == dentry) - return dentry; - - /* Prune ancestors. */ - dentry = parent; - while (dentry) { - if (lockref_put_or_lock(&dentry->d_lockref)) - return NULL; - dentry = dentry_kill(dentry, 1); - } - return NULL; -} - static void shrink_dentry_list(struct list_head *list) { - struct dentry *dentry; - - rcu_read_lock(); - for (;;) { - dentry = list_entry_rcu(list->prev, struct dentry, d_lru); - if (&dentry->d_lru == list) - break; /* empty */ + struct dentry *dentry, *parent; - /* - * Get the dentry lock, and re-verify that the dentry is - * this on the shrinking list. If it is, we know that - * DCACHE_SHRINK_LIST and DCACHE_LRU_LIST are set. - */ + while (!list_empty(list)) { + struct inode *inode; + dentry = list_entry(list->prev, struct dentry, d_lru); spin_lock(&dentry->d_lock); - if (dentry != list_entry(list->prev, struct dentry, d_lru)) { - spin_unlock(&dentry->d_lock); - continue; - } + parent = lock_parent(dentry); /* * The dispose list is isolated and dentries are not accounted @@ -885,30 +846,63 @@ static void shrink_dentry_list(struct list_head *list) * We found an inuse dentry which was not removed from * the LRU because of laziness during lookup. Do not free it. */ - if (dentry->d_lockref.count) { + if ((int)dentry->d_lockref.count > 0) { spin_unlock(&dentry->d_lock); + if (parent) + spin_unlock(&parent->d_lock); continue; } - rcu_read_unlock(); - /* - * If 'try_to_prune()' returns a dentry, it will - * be the same one we passed in, and d_lock will - * have been held the whole time, so it will not - * have been added to any other lists. We failed - * to get the inode lock. - * - * We just add it back to the shrink list. - */ - dentry = try_prune_one_dentry(dentry); - rcu_read_lock(); - if (dentry) { + if (unlikely(dentry->d_flags & DCACHE_DENTRY_KILLED)) { + bool can_free = dentry->d_flags & DCACHE_MAY_FREE; + spin_unlock(&dentry->d_lock); + if (parent) + spin_unlock(&parent->d_lock); + if (can_free) + dentry_free(dentry); + continue; + } + + inode = dentry->d_inode; + if (inode && unlikely(!spin_trylock(&inode->i_lock))) { d_shrink_add(dentry, list); spin_unlock(&dentry->d_lock); + if (parent) + spin_unlock(&parent->d_lock); + continue; + } + + __dentry_kill(dentry); + + /* + * We need to prune ancestors too. This is necessary to prevent + * quadratic behavior of shrink_dcache_parent(), but is also + * expected to be beneficial in reducing dentry cache + * fragmentation. + */ + dentry = parent; + while (dentry && !lockref_put_or_lock(&dentry->d_lockref)) { + parent = lock_parent(dentry); + if (dentry->d_lockref.count != 1) { + dentry->d_lockref.count--; + spin_unlock(&dentry->d_lock); + if (parent) + spin_unlock(&parent->d_lock); + break; + } + inode = dentry->d_inode; /* can't be NULL */ + if (unlikely(!spin_trylock(&inode->i_lock))) { + spin_unlock(&dentry->d_lock); + if (parent) + spin_unlock(&parent->d_lock); + cpu_relax(); + continue; + } + __dentry_kill(dentry); + dentry = parent; } } - rcu_read_unlock(); } static enum lru_status @@ -1261,34 +1255,23 @@ static enum d_walk_ret select_collect(void *_data, struct dentry *dentry) if (data->start == dentry) goto out; - /* - * move only zero ref count dentries to the dispose list. - * - * Those which are presently on the shrink list, being processed - * by shrink_dentry_list(), shouldn't be moved. Otherwise the - * loop in shrink_dcache_parent() might not make any progress - * and loop forever. - */ - if (dentry->d_lockref.count) { - dentry_lru_del(dentry); - } else if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) { - /* - * We can't use d_lru_shrink_move() because we - * need to get the global LRU lock and do the - * LRU accounting. - */ - d_lru_del(dentry); - d_shrink_add(dentry, &data->dispose); + if (dentry->d_flags & DCACHE_SHRINK_LIST) { data->found++; - ret = D_WALK_NORETRY; + } else { + if (dentry->d_flags & DCACHE_LRU_LIST) + d_lru_del(dentry); + if (!dentry->d_lockref.count) { + d_shrink_add(dentry, &data->dispose); + data->found++; + } } /* * We can return to the caller if we have found some (this * ensures forward progress). We'll be coming back to find * the rest. */ - if (data->found && need_resched()) - ret = D_WALK_QUIT; + if (!list_empty(&data->dispose)) + ret = need_resched() ? D_WALK_QUIT : D_WALK_NORETRY; out: return ret; } @@ -1318,45 +1301,35 @@ void shrink_dcache_parent(struct dentry *parent) } EXPORT_SYMBOL(shrink_dcache_parent); -static enum d_walk_ret umount_collect(void *_data, struct dentry *dentry) +static enum d_walk_ret umount_check(void *_data, struct dentry *dentry) { - struct select_data *data = _data; - enum d_walk_ret ret = D_WALK_CONTINUE; + /* it has busy descendents; complain about those instead */ + if (!list_empty(&dentry->d_subdirs)) + return D_WALK_CONTINUE; - if (dentry->d_lockref.count) { - dentry_lru_del(dentry); - if (likely(!list_empty(&dentry->d_subdirs))) - goto out; - if (dentry == data->start && dentry->d_lockref.count == 1) - goto out; - printk(KERN_ERR - "BUG: Dentry %p{i=%lx,n=%s}" - " still in use (%d)" - " [unmount of %s %s]\n", + /* root with refcount 1 is fine */ + if (dentry == _data && dentry->d_lockref.count == 1) + return D_WALK_CONTINUE; + + printk(KERN_ERR "BUG: Dentry %p{i=%lx,n=%pd} " + " still in use (%d) [unmount of %s %s]\n", dentry, dentry->d_inode ? dentry->d_inode->i_ino : 0UL, - dentry->d_name.name, + dentry, dentry->d_lockref.count, dentry->d_sb->s_type->name, dentry->d_sb->s_id); - BUG(); - } else if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) { - /* - * We can't use d_lru_shrink_move() because we - * need to get the global LRU lock and do the - * LRU accounting. - */ - if (dentry->d_flags & DCACHE_LRU_LIST) - d_lru_del(dentry); - d_shrink_add(dentry, &data->dispose); - data->found++; - ret = D_WALK_NORETRY; - } -out: - if (data->found && need_resched()) - ret = D_WALK_QUIT; - return ret; + WARN_ON(1); + return D_WALK_CONTINUE; +} + +static void do_one_tree(struct dentry *dentry) +{ + shrink_dcache_parent(dentry); + d_walk(dentry, dentry, umount_check, NULL); + d_drop(dentry); + dput(dentry); } /* @@ -1366,40 +1339,15 @@ void shrink_dcache_for_umount(struct super_block *sb) { struct dentry *dentry; - if (down_read_trylock(&sb->s_umount)) - BUG(); + WARN(down_read_trylock(&sb->s_umount), "s_umount should've been locked"); dentry = sb->s_root; sb->s_root = NULL; - for (;;) { - struct select_data data; - - INIT_LIST_HEAD(&data.dispose); - data.start = dentry; - data.found = 0; - - d_walk(dentry, &data, umount_collect, NULL); - if (!data.found) - break; - - shrink_dentry_list(&data.dispose); - cond_resched(); - } - d_drop(dentry); - dput(dentry); + do_one_tree(dentry); while (!hlist_bl_empty(&sb->s_anon)) { - struct select_data data; - dentry = hlist_bl_entry(hlist_bl_first(&sb->s_anon), struct dentry, d_hash); - - INIT_LIST_HEAD(&data.dispose); - data.start = NULL; - data.found = 0; - - d_walk(dentry, &data, umount_collect, NULL); - if (data.found) - shrink_dentry_list(&data.dispose); - cond_resched(); + dentry = dget(hlist_bl_entry(hlist_bl_first(&sb->s_anon), struct dentry, d_hash)); + do_one_tree(dentry); } } @@ -1647,8 +1595,7 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode) unsigned add_flags = d_flags_for_inode(inode); spin_lock(&dentry->d_lock); - dentry->d_flags &= ~DCACHE_ENTRY_TYPE; - dentry->d_flags |= add_flags; + __d_set_type(dentry, add_flags); if (inode) hlist_add_head(&dentry->d_alias, &inode->i_dentry); dentry->d_inode = inode; diff --git a/fs/exec.c b/fs/exec.c index 476f3ebf437..238b7aa26f6 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -657,10 +657,10 @@ int setup_arg_pages(struct linux_binprm *bprm, unsigned long rlim_stack; #ifdef CONFIG_STACK_GROWSUP - /* Limit stack size to 1GB */ + /* Limit stack size */ stack_base = rlimit_max(RLIMIT_STACK); - if (stack_base > (1 << 30)) - stack_base = 1 << 30; + if (stack_base > STACK_SIZE_MAX) + stack_base = STACK_SIZE_MAX; /* Make sure we didn't let the argument array grow too large. */ if (vma->vm_end - vma->vm_start > stack_base) diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 6ea7b1436bb..5c56785007e 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -667,7 +667,7 @@ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb) continue; x = ext4_count_free(bitmap_bh->b_data, - EXT4_BLOCKS_PER_GROUP(sb) / 8); + EXT4_CLUSTERS_PER_GROUP(sb) / 8); printk(KERN_DEBUG "group %u: stored = %d, counted = %u\n", i, ext4_free_group_clusters(sb, gdp), x); bitmap_count += x; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index f1c65dc7cc0..66946aa6212 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2466,23 +2466,6 @@ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize) up_write(&EXT4_I(inode)->i_data_sem); } -/* - * Update i_disksize after writeback has been started. Races with truncate - * are avoided by checking i_size under i_data_sem. - */ -static inline void ext4_wb_update_i_disksize(struct inode *inode, loff_t newsize) -{ - loff_t i_size; - - down_write(&EXT4_I(inode)->i_data_sem); - i_size = i_size_read(inode); - if (newsize > i_size) - newsize = i_size; - if (newsize > EXT4_I(inode)->i_disksize) - EXT4_I(inode)->i_disksize = newsize; - up_write(&EXT4_I(inode)->i_data_sem); -} - struct ext4_group_info { unsigned long bb_state; struct rb_root bb_free_root; diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 82df3ce9874..01b0c208f62 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3313,6 +3313,11 @@ static int ext4_split_extent(handle_t *handle, return PTR_ERR(path); depth = ext_depth(inode); ex = path[depth].p_ext; + if (!ex) { + EXT4_ERROR_INODE(inode, "unexpected hole at %lu", + (unsigned long) map->m_lblk); + return -EIO; + } uninitialized = ext4_ext_is_uninitialized(ex); split_flag1 = 0; @@ -3694,6 +3699,12 @@ static int ext4_convert_initialized_extents(handle_t *handle, } depth = ext_depth(inode); ex = path[depth].p_ext; + if (!ex) { + EXT4_ERROR_INODE(inode, "unexpected hole at %lu", + (unsigned long) map->m_lblk); + err = -EIO; + goto out; + } } err = ext4_ext_get_access(handle, inode, path + depth); @@ -4730,6 +4741,9 @@ static long ext4_zero_range(struct file *file, loff_t offset, trace_ext4_zero_range(inode, offset, len, mode); + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + /* * Write out all dirty pages to avoid race conditions * Then release them. @@ -4878,9 +4892,6 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (mode & FALLOC_FL_PUNCH_HOLE) return ext4_punch_hole(inode, offset, len); - if (mode & FALLOC_FL_COLLAPSE_RANGE) - return ext4_collapse_range(inode, offset, len); - ret = ext4_convert_inline_data(inode); if (ret) return ret; @@ -4892,6 +4903,9 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) return -EOPNOTSUPP; + if (mode & FALLOC_FL_COLLAPSE_RANGE) + return ext4_collapse_range(inode, offset, len); + if (mode & FALLOC_FL_ZERO_RANGE) return ext4_zero_range(file, offset, len, mode); @@ -5229,18 +5243,19 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift, if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr)) update = 1; - *start = ex_last->ee_block + + *start = le32_to_cpu(ex_last->ee_block) + ext4_ext_get_actual_len(ex_last); while (ex_start <= ex_last) { - ex_start->ee_block -= shift; - if (ex_start > - EXT_FIRST_EXTENT(path[depth].p_hdr)) { - if (ext4_ext_try_to_merge_right(inode, - path, ex_start - 1)) - ex_last--; - } - ex_start++; + le32_add_cpu(&ex_start->ee_block, -shift); + /* Try to merge to the left. */ + if ((ex_start > + EXT_FIRST_EXTENT(path[depth].p_hdr)) && + ext4_ext_try_to_merge_right(inode, + path, ex_start - 1)) + ex_last--; + else + ex_start++; } err = ext4_ext_dirty(handle, inode, path + depth); if (err) @@ -5255,7 +5270,7 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift, if (err) goto out; - path[depth].p_idx->ei_block -= shift; + le32_add_cpu(&path[depth].p_idx->ei_block, -shift); err = ext4_ext_dirty(handle, inode, path + depth); if (err) goto out; @@ -5300,7 +5315,8 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, return ret; } - stop_block = extent->ee_block + ext4_ext_get_actual_len(extent); + stop_block = le32_to_cpu(extent->ee_block) + + ext4_ext_get_actual_len(extent); ext4_ext_drop_refs(path); kfree(path); @@ -5313,10 +5329,18 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, * enough to accomodate the shift. */ path = ext4_ext_find_extent(inode, start - 1, NULL, 0); + if (IS_ERR(path)) + return PTR_ERR(path); depth = path->p_depth; extent = path[depth].p_ext; - ex_start = extent->ee_block; - ex_end = extent->ee_block + ext4_ext_get_actual_len(extent); + if (extent) { + ex_start = le32_to_cpu(extent->ee_block); + ex_end = le32_to_cpu(extent->ee_block) + + ext4_ext_get_actual_len(extent); + } else { + ex_start = 0; + ex_end = 0; + } ext4_ext_drop_refs(path); kfree(path); @@ -5331,7 +5355,13 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, return PTR_ERR(path); depth = path->p_depth; extent = path[depth].p_ext; - current_block = extent->ee_block; + if (!extent) { + EXT4_ERROR_INODE(inode, "unexpected hole at %lu", + (unsigned long) start); + return -EIO; + } + + current_block = le32_to_cpu(extent->ee_block); if (start > current_block) { /* Hole, move to the next extent */ ret = mext_next_extent(inode, path, &extent); @@ -5365,17 +5395,18 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) ext4_lblk_t punch_start, punch_stop; handle_t *handle; unsigned int credits; - loff_t new_size; + loff_t new_size, ioffset; int ret; - BUG_ON(offset + len > i_size_read(inode)); - /* Collapse range works only on fs block size aligned offsets. */ if (offset & (EXT4_BLOCK_SIZE(sb) - 1) || len & (EXT4_BLOCK_SIZE(sb) - 1)) return -EINVAL; if (!S_ISREG(inode->i_mode)) + return -EINVAL; + + if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) return -EOPNOTSUPP; trace_ext4_collapse_range(inode, offset, len); @@ -5383,22 +5414,34 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb); punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb); + /* Call ext4_force_commit to flush all data in case of data=journal. */ + if (ext4_should_journal_data(inode)) { + ret = ext4_force_commit(inode->i_sb); + if (ret) + return ret; + } + + /* + * Need to round down offset to be aligned with page size boundary + * for page size > block size. + */ + ioffset = round_down(offset, PAGE_SIZE); + /* Write out all dirty pages */ - ret = filemap_write_and_wait_range(inode->i_mapping, offset, -1); + ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, + LLONG_MAX); if (ret) return ret; /* Take mutex lock */ mutex_lock(&inode->i_mutex); - /* It's not possible punch hole on append only file */ - if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) { - ret = -EPERM; - goto out_mutex; - } - - if (IS_SWAPFILE(inode)) { - ret = -ETXTBSY; + /* + * There is no need to overlap collapse range with EOF, in which case + * it is effectively a truncate operation + */ + if (offset + len >= i_size_read(inode)) { + ret = -EINVAL; goto out_mutex; } @@ -5408,7 +5451,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) goto out_mutex; } - truncate_pagecache_range(inode, offset, -1); + truncate_pagecache(inode, ioffset); /* Wait for existing dio to complete */ ext4_inode_block_unlocked_dio(inode); @@ -5425,7 +5468,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) ext4_discard_preallocations(inode); ret = ext4_es_remove_extent(inode, punch_start, - EXT_MAX_BLOCKS - punch_start - 1); + EXT_MAX_BLOCKS - punch_start); if (ret) { up_write(&EXT4_I(inode)->i_data_sem); goto out_stop; @@ -5436,6 +5479,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) up_write(&EXT4_I(inode)->i_data_sem); goto out_stop; } + ext4_discard_preallocations(inode); ret = ext4_ext_shift_extents(inode, handle, punch_stop, punch_stop - punch_start); @@ -5445,10 +5489,9 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) } new_size = i_size_read(inode) - len; - truncate_setsize(inode, new_size); + i_size_write(inode, new_size); EXT4_I(inode)->i_disksize = new_size; - ext4_discard_preallocations(inode); up_write(&EXT4_I(inode)->i_data_sem); if (IS_SYNC(inode)) ext4_handle_sync(handle); diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 0a014a7194b..0ebc21204b5 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -810,7 +810,7 @@ retry: newes.es_lblk = end + 1; newes.es_len = len2; - block = 0x7FDEADBEEF; + block = 0x7FDEADBEEFULL; if (ext4_es_is_written(&orig_es) || ext4_es_is_unwritten(&orig_es)) block = ext4_es_pblock(&orig_es) + diff --git a/fs/ext4/file.c b/fs/ext4/file.c index ca7502d89fd..063fc153835 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -82,7 +82,7 @@ ext4_unaligned_aio(struct inode *inode, const struct iovec *iov, size_t count = iov_length(iov, nr_segs); loff_t final_size = pos + count; - if (pos >= inode->i_size) + if (pos >= i_size_read(inode)) return 0; if ((pos & blockmask) || (final_size & blockmask)) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5b0d2c7d540..d7b7462a0e1 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -522,6 +522,10 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, if (unlikely(map->m_len > INT_MAX)) map->m_len = INT_MAX; + /* We can handle the block number less than EXT_MAX_BLOCKS */ + if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS)) + return -EIO; + /* Lookup extent status tree firstly */ if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) { ext4_es_lru_add(inode); @@ -2243,13 +2247,23 @@ static int mpage_map_and_submit_extent(handle_t *handle, return err; } while (map->m_len); - /* Update on-disk size after IO is submitted */ + /* + * Update on-disk size after IO is submitted. Races with + * truncate are avoided by checking i_size under i_data_sem. + */ disksize = ((loff_t)mpd->first_page) << PAGE_CACHE_SHIFT; if (disksize > EXT4_I(inode)->i_disksize) { int err2; - - ext4_wb_update_i_disksize(inode, disksize); + loff_t i_size; + + down_write(&EXT4_I(inode)->i_data_sem); + i_size = i_size_read(inode); + if (disksize > i_size) + disksize = i_size; + if (disksize > EXT4_I(inode)->i_disksize) + EXT4_I(inode)->i_disksize = disksize; err2 = ext4_mark_inode_dirty(handle, inode); + up_write(&EXT4_I(inode)->i_data_sem); if (err2) ext4_error(inode->i_sb, "Failed to mark inode %lu dirty", @@ -3527,15 +3541,6 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) } mutex_lock(&inode->i_mutex); - /* It's not possible punch hole on append only file */ - if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) { - ret = -EPERM; - goto out_mutex; - } - if (IS_SWAPFILE(inode)) { - ret = -ETXTBSY; - goto out_mutex; - } /* No need to punch hole beyond i_size */ if (offset >= inode->i_size) @@ -3616,7 +3621,6 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) ret = ext4_free_hole_blocks(handle, inode, first_block, stop_block); - ext4_discard_preallocations(inode); up_write(&EXT4_I(inode)->i_data_sem); if (IS_SYNC(inode)) ext4_handle_sync(handle); @@ -4423,21 +4427,20 @@ out_brelse: * * We are called from a few places: * - * - Within generic_file_write() for O_SYNC files. + * - Within generic_file_aio_write() -> generic_write_sync() for O_SYNC files. * Here, there will be no transaction running. We wait for any running * transaction to commit. * - * - Within sys_sync(), kupdate and such. - * We wait on commit, if tol to. + * - Within flush work (sys_sync(), kupdate and such). + * We wait on commit, if told to. * - * - Within prune_icache() (PF_MEMALLOC == true) - * Here we simply return. We can't afford to block kswapd on the - * journal commit. + * - Within iput_final() -> write_inode_now() + * We wait on commit, if told to. * * In all cases it is actually safe for us to return without doing anything, * because the inode has been copied into a raw inode buffer in - * ext4_mark_inode_dirty(). This is a correctness thing for O_SYNC and for - * knfsd. + * ext4_mark_inode_dirty(). This is a correctness thing for WB_SYNC_ALL + * writeback. * * Note that we are absolutely dependent upon all inode dirtiers doing the * right thing: they *must* call mark_inode_dirty() after dirtying info in @@ -4449,15 +4452,15 @@ out_brelse: * stuff(); * inode->i_size = expr; * - * is in error because a kswapd-driven write_inode() could occur while - * `stuff()' is running, and the new i_size will be lost. Plus the inode - * will no longer be on the superblock's dirty inode list. + * is in error because write_inode() could occur while `stuff()' is running, + * and the new i_size will be lost. Plus the inode will no longer be on the + * superblock's dirty inode list. */ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) { int err; - if (current->flags & PF_MEMALLOC) + if (WARN_ON_ONCE(current->flags & PF_MEMALLOC)) return 0; if (EXT4_SB(inode->i_sb)->s_journal) { diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index a888cac76e9..c8238a26818 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -989,7 +989,7 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb, poff = block % blocks_per_page; page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); if (!page) - return -EIO; + return -ENOMEM; BUG_ON(page->mapping != inode->i_mapping); e4b->bd_bitmap_page = page; e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); @@ -1003,7 +1003,7 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb, pnum = block / blocks_per_page; page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); if (!page) - return -EIO; + return -ENOMEM; BUG_ON(page->mapping != inode->i_mapping); e4b->bd_buddy_page = page; return 0; @@ -1168,7 +1168,11 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, unlock_page(page); } } - if (page == NULL || !PageUptodate(page)) { + if (page == NULL) { + ret = -ENOMEM; + goto err; + } + if (!PageUptodate(page)) { ret = -EIO; goto err; } @@ -1197,7 +1201,11 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, unlock_page(page); } } - if (page == NULL || !PageUptodate(page)) { + if (page == NULL) { + ret = -ENOMEM; + goto err; + } + if (!PageUptodate(page)) { ret = -EIO; goto err; } @@ -5008,6 +5016,8 @@ error_return: */ static int ext4_trim_extent(struct super_block *sb, int start, int count, ext4_group_t group, struct ext4_buddy *e4b) +__releases(bitlock) +__acquires(bitlock) { struct ext4_free_extent ex; int ret = 0; diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index ab95508e3d4..c18d95b5054 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -308,13 +308,14 @@ static void ext4_end_bio(struct bio *bio, int error) if (error) { struct inode *inode = io_end->inode; - ext4_warning(inode->i_sb, "I/O error writing to inode %lu " + ext4_warning(inode->i_sb, "I/O error %d writing to inode %lu " "(offset %llu size %ld starting block %llu)", - inode->i_ino, + error, inode->i_ino, (unsigned long long) io_end->offset, (long) io_end->size, (unsigned long long) bi_sector >> (inode->i_blkbits - 9)); + mapping_set_error(inode->i_mapping, error); } if (io_end->flag & EXT4_IO_END_UNWRITTEN) { diff --git a/fs/ext4/super.c b/fs/ext4/super.c index f3c66709161..6f9e6fadac0 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3869,19 +3869,38 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount2; } } + + /* + * set up enough so that it can read an inode, + * and create new inode for buddy allocator + */ + sbi->s_gdb_count = db_count; + if (!test_opt(sb, NOLOAD) && + EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) + sb->s_op = &ext4_sops; + else + sb->s_op = &ext4_nojournal_sops; + + ext4_ext_init(sb); + err = ext4_mb_init(sb); + if (err) { + ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)", + err); + goto failed_mount2; + } + if (!ext4_check_descriptors(sb, &first_not_zeroed)) { ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); - goto failed_mount2; + goto failed_mount2a; } if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) if (!ext4_fill_flex_info(sb)) { ext4_msg(sb, KERN_ERR, "unable to initialize " "flex_bg meta info!"); - goto failed_mount2; + goto failed_mount2a; } - sbi->s_gdb_count = db_count; get_random_bytes(&sbi->s_next_generation, sizeof(u32)); spin_lock_init(&sbi->s_next_gen_lock); @@ -3916,14 +3935,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_stripe = ext4_get_stripe_size(sbi); sbi->s_extent_max_zeroout_kb = 32; - /* - * set up enough so that it can read an inode - */ - if (!test_opt(sb, NOLOAD) && - EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) - sb->s_op = &ext4_sops; - else - sb->s_op = &ext4_nojournal_sops; sb->s_export_op = &ext4_export_ops; sb->s_xattr = ext4_xattr_handlers; #ifdef CONFIG_QUOTA @@ -4113,21 +4124,13 @@ no_journal: if (err) { ext4_msg(sb, KERN_ERR, "failed to reserve %llu clusters for " "reserved pool", ext4_calculate_resv_clusters(sb)); - goto failed_mount4a; + goto failed_mount5; } err = ext4_setup_system_zone(sb); if (err) { ext4_msg(sb, KERN_ERR, "failed to initialize system " "zone (%d)", err); - goto failed_mount4a; - } - - ext4_ext_init(sb); - err = ext4_mb_init(sb); - if (err) { - ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)", - err); goto failed_mount5; } @@ -4204,11 +4207,8 @@ failed_mount8: failed_mount7: ext4_unregister_li_request(sb); failed_mount6: - ext4_mb_release(sb); -failed_mount5: - ext4_ext_release(sb); ext4_release_system_zone(sb); -failed_mount4a: +failed_mount5: dput(sb->s_root); sb->s_root = NULL; failed_mount4: @@ -4232,11 +4232,14 @@ failed_mount3: percpu_counter_destroy(&sbi->s_extent_cache_cnt); if (sbi->s_mmp_tsk) kthread_stop(sbi->s_mmp_tsk); +failed_mount2a: + ext4_mb_release(sb); failed_mount2: for (i = 0; i < db_count; i++) brelse(sbi->s_group_desc[i]); ext4_kvfree(sbi->s_group_desc); failed_mount: + ext4_ext_release(sb); if (sbi->s_chksum_driver) crypto_free_shash(sbi->s_chksum_driver); if (sbi->s_proc) { diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 1f5cf588071..4eec399ec80 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -520,8 +520,8 @@ static void ext4_xattr_update_super_block(handle_t *handle, } /* - * Release the xattr block BH: If the reference count is > 1, decrement - * it; otherwise free the block. + * Release the xattr block BH: If the reference count is > 1, decrement it; + * otherwise free the block. */ static void ext4_xattr_release_block(handle_t *handle, struct inode *inode, @@ -542,16 +542,31 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, if (ce) mb_cache_entry_free(ce); get_bh(bh); + unlock_buffer(bh); ext4_free_blocks(handle, inode, bh, 0, 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); - unlock_buffer(bh); } else { le32_add_cpu(&BHDR(bh)->h_refcount, -1); if (ce) mb_cache_entry_release(ce); + /* + * Beware of this ugliness: Releasing of xattr block references + * from different inodes can race and so we have to protect + * from a race where someone else frees the block (and releases + * its journal_head) before we are done dirtying the buffer. In + * nojournal mode this race is harmless and we actually cannot + * call ext4_handle_dirty_xattr_block() with locked buffer as + * that function can call sync_dirty_buffer() so for that case + * we handle the dirtying after unlocking the buffer. + */ + if (ext4_handle_valid(handle)) + error = ext4_handle_dirty_xattr_block(handle, inode, + bh); unlock_buffer(bh); - error = ext4_handle_dirty_xattr_block(handle, inode, bh); + if (!ext4_handle_valid(handle)) + error = ext4_handle_dirty_xattr_block(handle, inode, + bh); if (IS_SYNC(inode)) ext4_handle_sync(handle); dquot_free_block(inode, EXT4_C2B(EXT4_SB(inode->i_sb), 1)); diff --git a/fs/fcntl.c b/fs/fcntl.c index 9ead1596399..72c82f69b01 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -274,15 +274,15 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, break; #if BITS_PER_LONG != 32 /* 32-bit arches must use fcntl64() */ - case F_GETLKP: + case F_OFD_GETLK: #endif case F_GETLK: err = fcntl_getlk(filp, cmd, (struct flock __user *) arg); break; #if BITS_PER_LONG != 32 /* 32-bit arches must use fcntl64() */ - case F_SETLKP: - case F_SETLKPW: + case F_OFD_SETLK: + case F_OFD_SETLKW: #endif /* Fallthrough */ case F_SETLK: @@ -399,13 +399,13 @@ SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, switch (cmd) { case F_GETLK64: - case F_GETLKP: + case F_OFD_GETLK: err = fcntl_getlk64(f.file, cmd, (struct flock64 __user *) arg); break; case F_SETLK64: case F_SETLKW64: - case F_SETLKP: - case F_SETLKPW: + case F_OFD_SETLK: + case F_OFD_SETLKW: err = fcntl_setlk64(fd, f.file, cmd, (struct flock64 __user *) arg); break; diff --git a/fs/fuse/control.c b/fs/fuse/control.c index a0b0855d00a..205e0d5d530 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -348,7 +348,7 @@ int __init fuse_ctl_init(void) return register_filesystem(&fuse_ctl_fs_type); } -void fuse_ctl_cleanup(void) +void __exit fuse_ctl_cleanup(void) { unregister_filesystem(&fuse_ctl_fs_type); } diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 5b4e035b364..42198359fa1 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -679,6 +679,14 @@ static int fuse_symlink(struct inode *dir, struct dentry *entry, return create_new_entry(fc, req, dir, entry, S_IFLNK); } +static inline void fuse_update_ctime(struct inode *inode) +{ + if (!IS_NOCMTIME(inode)) { + inode->i_ctime = current_fs_time(inode->i_sb); + mark_inode_dirty_sync(inode); + } +} + static int fuse_unlink(struct inode *dir, struct dentry *entry) { int err; @@ -713,6 +721,7 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry) fuse_invalidate_attr(inode); fuse_invalidate_attr(dir); fuse_invalidate_entry_cache(entry); + fuse_update_ctime(inode); } else if (err == -EINTR) fuse_invalidate_entry(entry); return err; @@ -743,23 +752,26 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry) return err; } -static int fuse_rename(struct inode *olddir, struct dentry *oldent, - struct inode *newdir, struct dentry *newent) +static int fuse_rename_common(struct inode *olddir, struct dentry *oldent, + struct inode *newdir, struct dentry *newent, + unsigned int flags, int opcode, size_t argsize) { int err; - struct fuse_rename_in inarg; + struct fuse_rename2_in inarg; struct fuse_conn *fc = get_fuse_conn(olddir); - struct fuse_req *req = fuse_get_req_nopages(fc); + struct fuse_req *req; + req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); - memset(&inarg, 0, sizeof(inarg)); + memset(&inarg, 0, argsize); inarg.newdir = get_node_id(newdir); - req->in.h.opcode = FUSE_RENAME; + inarg.flags = flags; + req->in.h.opcode = opcode; req->in.h.nodeid = get_node_id(olddir); req->in.numargs = 3; - req->in.args[0].size = sizeof(inarg); + req->in.args[0].size = argsize; req->in.args[0].value = &inarg; req->in.args[1].size = oldent->d_name.len + 1; req->in.args[1].value = oldent->d_name.name; @@ -771,15 +783,22 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent, if (!err) { /* ctime changes */ fuse_invalidate_attr(oldent->d_inode); + fuse_update_ctime(oldent->d_inode); + + if (flags & RENAME_EXCHANGE) { + fuse_invalidate_attr(newent->d_inode); + fuse_update_ctime(newent->d_inode); + } fuse_invalidate_attr(olddir); if (olddir != newdir) fuse_invalidate_attr(newdir); /* newent will end up negative */ - if (newent->d_inode) { + if (!(flags & RENAME_EXCHANGE) && newent->d_inode) { fuse_invalidate_attr(newent->d_inode); fuse_invalidate_entry_cache(newent); + fuse_update_ctime(newent->d_inode); } } else if (err == -EINTR) { /* If request was interrupted, DEITY only knows if the @@ -795,6 +814,36 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent, return err; } +static int fuse_rename(struct inode *olddir, struct dentry *oldent, + struct inode *newdir, struct dentry *newent) +{ + return fuse_rename_common(olddir, oldent, newdir, newent, 0, + FUSE_RENAME, sizeof(struct fuse_rename_in)); +} + +static int fuse_rename2(struct inode *olddir, struct dentry *oldent, + struct inode *newdir, struct dentry *newent, + unsigned int flags) +{ + struct fuse_conn *fc = get_fuse_conn(olddir); + int err; + + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) + return -EINVAL; + + if (fc->no_rename2 || fc->minor < 23) + return -EINVAL; + + err = fuse_rename_common(olddir, oldent, newdir, newent, flags, + FUSE_RENAME2, sizeof(struct fuse_rename2_in)); + if (err == -ENOSYS) { + fc->no_rename2 = 1; + err = -EINVAL; + } + return err; + +} + static int fuse_link(struct dentry *entry, struct inode *newdir, struct dentry *newent) { @@ -829,6 +878,7 @@ static int fuse_link(struct dentry *entry, struct inode *newdir, inc_nlink(inode); spin_unlock(&fc->lock); fuse_invalidate_attr(inode); + fuse_update_ctime(inode); } else if (err == -EINTR) { fuse_invalidate_attr(inode); } @@ -846,6 +896,8 @@ static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr, attr->size = i_size_read(inode); attr->mtime = inode->i_mtime.tv_sec; attr->mtimensec = inode->i_mtime.tv_nsec; + attr->ctime = inode->i_ctime.tv_sec; + attr->ctimensec = inode->i_ctime.tv_nsec; } stat->dev = inode->i_sb->s_dev; @@ -1504,7 +1556,7 @@ static bool update_mtime(unsigned ivalid, bool trust_local_mtime) } static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg, - bool trust_local_mtime) + bool trust_local_cmtime) { unsigned ivalid = iattr->ia_valid; @@ -1523,13 +1575,18 @@ static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg, if (!(ivalid & ATTR_ATIME_SET)) arg->valid |= FATTR_ATIME_NOW; } - if ((ivalid & ATTR_MTIME) && update_mtime(ivalid, trust_local_mtime)) { + if ((ivalid & ATTR_MTIME) && update_mtime(ivalid, trust_local_cmtime)) { arg->valid |= FATTR_MTIME; arg->mtime = iattr->ia_mtime.tv_sec; arg->mtimensec = iattr->ia_mtime.tv_nsec; - if (!(ivalid & ATTR_MTIME_SET) && !trust_local_mtime) + if (!(ivalid & ATTR_MTIME_SET) && !trust_local_cmtime) arg->valid |= FATTR_MTIME_NOW; } + if ((ivalid & ATTR_CTIME) && trust_local_cmtime) { + arg->valid |= FATTR_CTIME; + arg->ctime = iattr->ia_ctime.tv_sec; + arg->ctimensec = iattr->ia_ctime.tv_nsec; + } } /* @@ -1597,39 +1654,38 @@ static void fuse_setattr_fill(struct fuse_conn *fc, struct fuse_req *req, /* * Flush inode->i_mtime to the server */ -int fuse_flush_mtime(struct file *file, bool nofail) +int fuse_flush_times(struct inode *inode, struct fuse_file *ff) { - struct inode *inode = file->f_mapping->host; - struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req = NULL; + struct fuse_req *req; struct fuse_setattr_in inarg; struct fuse_attr_out outarg; int err; - if (nofail) { - req = fuse_get_req_nofail_nopages(fc, file); - } else { - req = fuse_get_req_nopages(fc); - if (IS_ERR(req)) - return PTR_ERR(req); - } + req = fuse_get_req_nopages(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); memset(&outarg, 0, sizeof(outarg)); - inarg.valid |= FATTR_MTIME; + inarg.valid = FATTR_MTIME; inarg.mtime = inode->i_mtime.tv_sec; inarg.mtimensec = inode->i_mtime.tv_nsec; - + if (fc->minor >= 23) { + inarg.valid |= FATTR_CTIME; + inarg.ctime = inode->i_ctime.tv_sec; + inarg.ctimensec = inode->i_ctime.tv_nsec; + } + if (ff) { + inarg.valid |= FATTR_FH; + inarg.fh = ff->fh; + } fuse_setattr_fill(fc, req, inode, &inarg, &outarg); fuse_request_send(fc, req); err = req->out.h.error; fuse_put_request(fc, req); - if (!err) - clear_bit(FUSE_I_MTIME_DIRTY, &fi->state); - return err; } @@ -1653,7 +1709,7 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr, bool is_wb = fc->writeback_cache; loff_t oldsize; int err; - bool trust_local_mtime = is_wb && S_ISREG(inode->i_mode); + bool trust_local_cmtime = is_wb && S_ISREG(inode->i_mode); if (!(fc->flags & FUSE_DEFAULT_PERMISSIONS)) attr->ia_valid |= ATTR_FORCE; @@ -1678,11 +1734,13 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr, if (is_truncate) { fuse_set_nowrite(inode); set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); + if (trust_local_cmtime && attr->ia_size != inode->i_size) + attr->ia_valid |= ATTR_MTIME | ATTR_CTIME; } memset(&inarg, 0, sizeof(inarg)); memset(&outarg, 0, sizeof(outarg)); - iattr_to_fattr(attr, &inarg, trust_local_mtime); + iattr_to_fattr(attr, &inarg, trust_local_cmtime); if (file) { struct fuse_file *ff = file->private_data; inarg.valid |= FATTR_FH; @@ -1711,9 +1769,12 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr, spin_lock(&fc->lock); /* the kernel maintains i_mtime locally */ - if (trust_local_mtime && (attr->ia_valid & ATTR_MTIME)) { - inode->i_mtime = attr->ia_mtime; - clear_bit(FUSE_I_MTIME_DIRTY, &fi->state); + if (trust_local_cmtime) { + if (attr->ia_valid & ATTR_MTIME) + inode->i_mtime = attr->ia_mtime; + if (attr->ia_valid & ATTR_CTIME) + inode->i_ctime = attr->ia_ctime; + /* FIXME: clear I_DIRTY_SYNC? */ } fuse_change_attributes_common(inode, &outarg.attr, @@ -1810,8 +1871,10 @@ static int fuse_setxattr(struct dentry *entry, const char *name, fc->no_setxattr = 1; err = -EOPNOTSUPP; } - if (!err) + if (!err) { fuse_invalidate_attr(inode); + fuse_update_ctime(inode); + } return err; } @@ -1941,20 +2004,11 @@ static int fuse_removexattr(struct dentry *entry, const char *name) fc->no_removexattr = 1; err = -EOPNOTSUPP; } - if (!err) + if (!err) { fuse_invalidate_attr(inode); - return err; -} - -static int fuse_update_time(struct inode *inode, struct timespec *now, - int flags) -{ - if (flags & S_MTIME) { - inode->i_mtime = *now; - set_bit(FUSE_I_MTIME_DIRTY, &get_fuse_inode(inode)->state); - BUG_ON(!S_ISREG(inode->i_mode)); + fuse_update_ctime(inode); } - return 0; + return err; } static const struct inode_operations fuse_dir_inode_operations = { @@ -1964,6 +2018,7 @@ static const struct inode_operations fuse_dir_inode_operations = { .unlink = fuse_unlink, .rmdir = fuse_rmdir, .rename = fuse_rename, + .rename2 = fuse_rename2, .link = fuse_link, .setattr = fuse_setattr, .create = fuse_create, @@ -1996,7 +2051,6 @@ static const struct inode_operations fuse_common_inode_operations = { .getxattr = fuse_getxattr, .listxattr = fuse_listxattr, .removexattr = fuse_removexattr, - .update_time = fuse_update_time, }; static const struct inode_operations fuse_symlink_inode_operations = { diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 13f8bdec511..96d513e01a5 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -223,6 +223,8 @@ void fuse_finish_open(struct inode *inode, struct file *file) i_size_write(inode, 0); spin_unlock(&fc->lock); fuse_invalidate_attr(inode); + if (fc->writeback_cache) + file_update_time(file); } if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache) fuse_link_write_file(file); @@ -232,18 +234,26 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir) { struct fuse_conn *fc = get_fuse_conn(inode); int err; + bool lock_inode = (file->f_flags & O_TRUNC) && + fc->atomic_o_trunc && + fc->writeback_cache; err = generic_file_open(inode, file); if (err) return err; + if (lock_inode) + mutex_lock(&inode->i_mutex); + err = fuse_do_open(fc, get_node_id(inode), file, isdir); - if (err) - return err; - fuse_finish_open(inode, file); + if (!err) + fuse_finish_open(inode, file); - return 0; + if (lock_inode) + mutex_unlock(&inode->i_mutex); + + return err; } static void fuse_prepare_release(struct fuse_file *ff, int flags, int opcode) @@ -314,10 +324,7 @@ static int fuse_release(struct inode *inode, struct file *file) /* see fuse_vma_close() for !writeback_cache case */ if (fc->writeback_cache) - filemap_write_and_wait(file->f_mapping); - - if (test_bit(FUSE_I_MTIME_DIRTY, &get_fuse_inode(inode)->state)) - fuse_flush_mtime(file, true); + write_inode_now(inode, 1); fuse_release_common(file, FUSE_RELEASE); @@ -439,7 +446,7 @@ static int fuse_flush(struct file *file, fl_owner_t id) if (fc->no_flush) return 0; - err = filemap_write_and_wait(file->f_mapping); + err = write_inode_now(inode, 1); if (err) return err; @@ -480,13 +487,6 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end, if (is_bad_inode(inode)) return -EIO; - err = filemap_write_and_wait_range(inode->i_mapping, start, end); - if (err) - return err; - - if ((!isdir && fc->no_fsync) || (isdir && fc->no_fsyncdir)) - return 0; - mutex_lock(&inode->i_mutex); /* @@ -494,17 +494,17 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end, * wait for all outstanding writes, before sending the FSYNC * request. */ - err = write_inode_now(inode, 0); + err = filemap_write_and_wait_range(inode->i_mapping, start, end); if (err) goto out; fuse_sync_writes(inode); + err = sync_inode_metadata(inode, 1); + if (err) + goto out; - if (test_bit(FUSE_I_MTIME_DIRTY, &get_fuse_inode(inode)->state)) { - int err = fuse_flush_mtime(file, false); - if (err) - goto out; - } + if ((!isdir && fc->no_fsync) || (isdir && fc->no_fsyncdir)) + goto out; req = fuse_get_req_nopages(fc); if (IS_ERR(req)) { @@ -1659,13 +1659,13 @@ static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_req *req) fuse_writepage_free(fc, req); } -static struct fuse_file *fuse_write_file_get(struct fuse_conn *fc, - struct fuse_inode *fi) +static struct fuse_file *__fuse_write_file_get(struct fuse_conn *fc, + struct fuse_inode *fi) { struct fuse_file *ff = NULL; spin_lock(&fc->lock); - if (!WARN_ON(list_empty(&fi->write_files))) { + if (!list_empty(&fi->write_files)) { ff = list_entry(fi->write_files.next, struct fuse_file, write_entry); fuse_file_get(ff); @@ -1675,6 +1675,29 @@ static struct fuse_file *fuse_write_file_get(struct fuse_conn *fc, return ff; } +static struct fuse_file *fuse_write_file_get(struct fuse_conn *fc, + struct fuse_inode *fi) +{ + struct fuse_file *ff = __fuse_write_file_get(fc, fi); + WARN_ON(!ff); + return ff; +} + +int fuse_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_file *ff; + int err; + + ff = __fuse_write_file_get(fc, fi); + err = fuse_flush_times(inode, ff); + if (ff) + fuse_file_put(ff, 0); + + return err; +} + static int fuse_writepage_locked(struct page *page) { struct address_space *mapping = page->mapping; @@ -2972,6 +2995,9 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, bool lock_inode = !(mode & FALLOC_FL_KEEP_SIZE) || (mode & FALLOC_FL_PUNCH_HOLE); + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + return -EOPNOTSUPP; + if (fc->no_fallocate) return -EOPNOTSUPP; @@ -3017,12 +3043,8 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, if (!(mode & FALLOC_FL_KEEP_SIZE)) { bool changed = fuse_write_update_size(inode, offset + length); - if (changed && fc->writeback_cache) { - struct fuse_inode *fi = get_fuse_inode(inode); - - inode->i_mtime = current_fs_time(inode->i_sb); - set_bit(FUSE_I_MTIME_DIRTY, &fi->state); - } + if (changed && fc->writeback_cache) + file_update_time(file); } if (mode & FALLOC_FL_PUNCH_HOLE) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index a257ed8ebee..7aa5c75e0de 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -119,8 +119,6 @@ enum { FUSE_I_INIT_RDPLUS, /** An operation changing file size is in progress */ FUSE_I_SIZE_UNSTABLE, - /** i_mtime has been updated locally; a flush to userspace needed */ - FUSE_I_MTIME_DIRTY, }; struct fuse_conn; @@ -544,6 +542,9 @@ struct fuse_conn { /** Is fallocate not implemented by fs? */ unsigned no_fallocate:1; + /** Is rename with flags implemented by fs? */ + unsigned no_rename2:1; + /** Use enhanced/automatic page cache invalidation. */ unsigned auto_inval_data:1; @@ -725,7 +726,7 @@ int fuse_dev_init(void); void fuse_dev_cleanup(void); int fuse_ctl_init(void); -void fuse_ctl_cleanup(void); +void __exit fuse_ctl_cleanup(void); /** * Allocate a request @@ -891,7 +892,8 @@ int fuse_dev_release(struct inode *inode, struct file *file); bool fuse_write_update_size(struct inode *inode, loff_t pos); -int fuse_flush_mtime(struct file *file, bool nofail); +int fuse_flush_times(struct inode *inode, struct fuse_file *ff); +int fuse_write_inode(struct inode *inode, struct writeback_control *wbc); int fuse_do_setattr(struct inode *inode, struct iattr *attr, struct file *file); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 8d611696fca..754dcf23de8 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -175,9 +175,9 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, if (!fc->writeback_cache || !S_ISREG(inode->i_mode)) { inode->i_mtime.tv_sec = attr->mtime; inode->i_mtime.tv_nsec = attr->mtimensec; + inode->i_ctime.tv_sec = attr->ctime; + inode->i_ctime.tv_nsec = attr->ctimensec; } - inode->i_ctime.tv_sec = attr->ctime; - inode->i_ctime.tv_nsec = attr->ctimensec; if (attr->blksize != 0) inode->i_blkbits = ilog2(attr->blksize); @@ -256,6 +256,8 @@ static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr) inode->i_size = attr->size; inode->i_mtime.tv_sec = attr->mtime; inode->i_mtime.tv_nsec = attr->mtimensec; + inode->i_ctime.tv_sec = attr->ctime; + inode->i_ctime.tv_nsec = attr->ctimensec; if (S_ISREG(inode->i_mode)) { fuse_init_common(inode); fuse_init_file_inode(inode); @@ -303,7 +305,7 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid, if ((inode->i_state & I_NEW)) { inode->i_flags |= S_NOATIME; - if (!fc->writeback_cache || !S_ISREG(inode->i_mode)) + if (!fc->writeback_cache || !S_ISREG(attr->mode)) inode->i_flags |= S_NOCMTIME; inode->i_generation = generation; inode->i_data.backing_dev_info = &fc->bdi; @@ -788,6 +790,7 @@ static const struct super_operations fuse_super_operations = { .alloc_inode = fuse_alloc_inode, .destroy_inode = fuse_destroy_inode, .evict_inode = fuse_evict_inode, + .write_inode = fuse_write_inode, .drop_inode = generic_delete_inode, .remount_fs = fuse_remount_fs, .put_super = fuse_put_super, @@ -890,6 +893,11 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) fc->async_dio = 1; if (arg->flags & FUSE_WRITEBACK_CACHE) fc->writeback_cache = 1; + if (arg->time_gran && arg->time_gran <= 1000000000) + fc->sb->s_time_gran = arg->time_gran; + else + fc->sb->s_time_gran = 1000000000; + } else { ra_pages = fc->max_read / PAGE_CACHE_SIZE; fc->no_lock = 1; @@ -996,7 +1004,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) if (sb->s_flags & MS_MANDLOCK) goto err; - sb->s_flags &= ~MS_NOSEC; + sb->s_flags &= ~(MS_NOSEC | MS_I_VERSION); if (!parse_fuse_opt((char *) data, &d, is_bdev)) goto err; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 20402752093..e19d4c0caca 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -1030,6 +1030,11 @@ static int __init init_hugetlbfs_fs(void) int error; int i; + if (!hugepages_supported()) { + pr_info("hugetlbfs: disabling because there are no supported hugepage sizes\n"); + return -ENOTSUPP; + } + error = bdi_init(&hugetlbfs_backing_dev_info); if (error) return error; diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 78f3403300a..ac127cd008b 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -232,9 +232,6 @@ static int kernfs_link_sibling(struct kernfs_node *kn) struct rb_node **node = &kn->parent->dir.children.rb_node; struct rb_node *parent = NULL; - if (kernfs_type(kn) == KERNFS_DIR) - kn->parent->dir.subdirs++; - while (*node) { struct kernfs_node *pos; int result; @@ -249,9 +246,15 @@ static int kernfs_link_sibling(struct kernfs_node *kn) else return -EEXIST; } + /* add new node and rebalance the tree */ rb_link_node(&kn->rb, parent, node); rb_insert_color(&kn->rb, &kn->parent->dir.children); + + /* successfully added, account subdir number */ + if (kernfs_type(kn) == KERNFS_DIR) + kn->parent->dir.subdirs++; + return 0; } diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index 8034706a7af..5e9a80cfc3d 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -484,6 +484,8 @@ static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma) ops = kernfs_ops(of->kn); rc = ops->mmap(of, vma); + if (rc) + goto out_put; /* * PowerPC's pci_mmap of legacy_mem uses shmem_zero_setup() @@ -608,6 +610,7 @@ static void kernfs_put_open_node(struct kernfs_node *kn, static int kernfs_fop_open(struct inode *inode, struct file *file) { struct kernfs_node *kn = file->f_path.dentry->d_fsdata; + struct kernfs_root *root = kernfs_root(kn); const struct kernfs_ops *ops; struct kernfs_open_file *of; bool has_read, has_write, has_mmap; @@ -622,14 +625,16 @@ static int kernfs_fop_open(struct inode *inode, struct file *file) has_write = ops->write || ops->mmap; has_mmap = ops->mmap; - /* check perms and supported operations */ - if ((file->f_mode & FMODE_WRITE) && - (!(inode->i_mode & S_IWUGO) || !has_write)) - goto err_out; + /* see the flag definition for details */ + if (root->flags & KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK) { + if ((file->f_mode & FMODE_WRITE) && + (!(inode->i_mode & S_IWUGO) || !has_write)) + goto err_out; - if ((file->f_mode & FMODE_READ) && - (!(inode->i_mode & S_IRUGO) || !has_read)) - goto err_out; + if ((file->f_mode & FMODE_READ) && + (!(inode->i_mode & S_IRUGO) || !has_read)) + goto err_out; + } /* allocate a kernfs_open_file for the file */ error = -ENOMEM; diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index abb0f1f53d9..985217626e6 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -48,14 +48,18 @@ void __init kernfs_inode_init(void) static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn) { + static DEFINE_MUTEX(iattr_mutex); + struct kernfs_iattrs *ret; struct iattr *iattrs; + mutex_lock(&iattr_mutex); + if (kn->iattr) - return kn->iattr; + goto out_unlock; kn->iattr = kzalloc(sizeof(struct kernfs_iattrs), GFP_KERNEL); if (!kn->iattr) - return NULL; + goto out_unlock; iattrs = &kn->iattr->ia_iattr; /* assign default attributes */ @@ -65,8 +69,10 @@ static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn) iattrs->ia_atime = iattrs->ia_mtime = iattrs->ia_ctime = CURRENT_TIME; simple_xattrs_init(&kn->iattr->xattrs); - - return kn->iattr; +out_unlock: + ret = kn->iattr; + mutex_unlock(&iattr_mutex); + return ret; } static int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) diff --git a/fs/locks.c b/fs/locks.c index 13fc7a6d380..e390bd9ae06 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -135,7 +135,7 @@ #define IS_POSIX(fl) (fl->fl_flags & FL_POSIX) #define IS_FLOCK(fl) (fl->fl_flags & FL_FLOCK) #define IS_LEASE(fl) (fl->fl_flags & (FL_LEASE|FL_DELEG)) -#define IS_FILE_PVT(fl) (fl->fl_flags & FL_FILE_PVT) +#define IS_OFDLCK(fl) (fl->fl_flags & FL_OFDLCK) static bool lease_breaking(struct file_lock *fl) { @@ -389,18 +389,6 @@ static int flock64_to_posix_lock(struct file *filp, struct file_lock *fl, fl->fl_ops = NULL; fl->fl_lmops = NULL; - /* Ensure that fl->fl_filp has compatible f_mode */ - switch (l->l_type) { - case F_RDLCK: - if (!(filp->f_mode & FMODE_READ)) - return -EBADF; - break; - case F_WRLCK: - if (!(filp->f_mode & FMODE_WRITE)) - return -EBADF; - break; - } - return assign_type(fl, l->l_type); } @@ -564,7 +552,7 @@ static void __locks_insert_block(struct file_lock *blocker, BUG_ON(!list_empty(&waiter->fl_block)); waiter->fl_next = blocker; list_add_tail(&waiter->fl_block, &blocker->fl_block); - if (IS_POSIX(blocker) && !IS_FILE_PVT(blocker)) + if (IS_POSIX(blocker) && !IS_OFDLCK(blocker)) locks_insert_global_blocked(waiter); } @@ -759,12 +747,12 @@ EXPORT_SYMBOL(posix_test_lock); * of tasks (such as posix threads) sharing the same open file table. * To handle those cases, we just bail out after a few iterations. * - * For FL_FILE_PVT locks, the owner is the filp, not the files_struct. + * For FL_OFDLCK locks, the owner is the filp, not the files_struct. * Because the owner is not even nominally tied to a thread of * execution, the deadlock detection below can't reasonably work well. Just * skip it for those. * - * In principle, we could do a more limited deadlock detection on FL_FILE_PVT + * In principle, we could do a more limited deadlock detection on FL_OFDLCK * locks that just checks for the case where two tasks are attempting to * upgrade from read to write locks on the same inode. */ @@ -791,9 +779,9 @@ static int posix_locks_deadlock(struct file_lock *caller_fl, /* * This deadlock detector can't reasonably detect deadlocks with - * FL_FILE_PVT locks, since they aren't owned by a process, per-se. + * FL_OFDLCK locks, since they aren't owned by a process, per-se. */ - if (IS_FILE_PVT(caller_fl)) + if (IS_OFDLCK(caller_fl)) return 0; while ((block_fl = what_owner_is_waiting_for(block_fl))) { @@ -1391,11 +1379,10 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) restart: break_time = flock->fl_break_time; - if (break_time != 0) { + if (break_time != 0) break_time -= jiffies; - if (break_time == 0) - break_time++; - } + if (break_time == 0) + break_time++; locks_insert_block(flock, new_fl); spin_unlock(&inode->i_lock); error = wait_event_interruptible_timeout(new_fl->fl_wait, @@ -1891,7 +1878,7 @@ EXPORT_SYMBOL_GPL(vfs_test_lock); static int posix_lock_to_flock(struct flock *flock, struct file_lock *fl) { - flock->l_pid = IS_FILE_PVT(fl) ? -1 : fl->fl_pid; + flock->l_pid = IS_OFDLCK(fl) ? -1 : fl->fl_pid; #if BITS_PER_LONG == 32 /* * Make sure we can represent the posix lock via @@ -1913,7 +1900,7 @@ static int posix_lock_to_flock(struct flock *flock, struct file_lock *fl) #if BITS_PER_LONG == 32 static void posix_lock_to_flock64(struct flock64 *flock, struct file_lock *fl) { - flock->l_pid = IS_FILE_PVT(fl) ? -1 : fl->fl_pid; + flock->l_pid = IS_OFDLCK(fl) ? -1 : fl->fl_pid; flock->l_start = fl->fl_start; flock->l_len = fl->fl_end == OFFSET_MAX ? 0 : fl->fl_end - fl->fl_start + 1; @@ -1942,13 +1929,13 @@ int fcntl_getlk(struct file *filp, unsigned int cmd, struct flock __user *l) if (error) goto out; - if (cmd == F_GETLKP) { + if (cmd == F_OFD_GETLK) { error = -EINVAL; if (flock.l_pid != 0) goto out; cmd = F_GETLK; - file_lock.fl_flags |= FL_FILE_PVT; + file_lock.fl_flags |= FL_OFDLCK; file_lock.fl_owner = (fl_owner_t)filp; } @@ -2035,6 +2022,22 @@ static int do_lock_file_wait(struct file *filp, unsigned int cmd, return error; } +/* Ensure that fl->fl_filp has compatible f_mode for F_SETLK calls */ +static int +check_fmode_for_setlk(struct file_lock *fl) +{ + switch (fl->fl_type) { + case F_RDLCK: + if (!(fl->fl_file->f_mode & FMODE_READ)) + return -EBADF; + break; + case F_WRLCK: + if (!(fl->fl_file->f_mode & FMODE_WRITE)) + return -EBADF; + } + return 0; +} + /* Apply the lock described by l to an open file descriptor. * This implements both the F_SETLK and F_SETLKW commands of fcntl(). */ @@ -2072,27 +2075,31 @@ again: if (error) goto out; + error = check_fmode_for_setlk(file_lock); + if (error) + goto out; + /* * If the cmd is requesting file-private locks, then set the - * FL_FILE_PVT flag and override the owner. + * FL_OFDLCK flag and override the owner. */ switch (cmd) { - case F_SETLKP: + case F_OFD_SETLK: error = -EINVAL; if (flock.l_pid != 0) goto out; cmd = F_SETLK; - file_lock->fl_flags |= FL_FILE_PVT; + file_lock->fl_flags |= FL_OFDLCK; file_lock->fl_owner = (fl_owner_t)filp; break; - case F_SETLKPW: + case F_OFD_SETLKW: error = -EINVAL; if (flock.l_pid != 0) goto out; cmd = F_SETLKW; - file_lock->fl_flags |= FL_FILE_PVT; + file_lock->fl_flags |= FL_OFDLCK; file_lock->fl_owner = (fl_owner_t)filp; /* Fallthrough */ case F_SETLKW: @@ -2144,13 +2151,13 @@ int fcntl_getlk64(struct file *filp, unsigned int cmd, struct flock64 __user *l) if (error) goto out; - if (cmd == F_GETLKP) { + if (cmd == F_OFD_GETLK) { error = -EINVAL; if (flock.l_pid != 0) goto out; cmd = F_GETLK64; - file_lock.fl_flags |= FL_FILE_PVT; + file_lock.fl_flags |= FL_OFDLCK; file_lock.fl_owner = (fl_owner_t)filp; } @@ -2207,27 +2214,31 @@ again: if (error) goto out; + error = check_fmode_for_setlk(file_lock); + if (error) + goto out; + /* * If the cmd is requesting file-private locks, then set the - * FL_FILE_PVT flag and override the owner. + * FL_OFDLCK flag and override the owner. */ switch (cmd) { - case F_SETLKP: + case F_OFD_SETLK: error = -EINVAL; if (flock.l_pid != 0) goto out; cmd = F_SETLK64; - file_lock->fl_flags |= FL_FILE_PVT; + file_lock->fl_flags |= FL_OFDLCK; file_lock->fl_owner = (fl_owner_t)filp; break; - case F_SETLKPW: + case F_OFD_SETLKW: error = -EINVAL; if (flock.l_pid != 0) goto out; cmd = F_SETLKW64; - file_lock->fl_flags |= FL_FILE_PVT; + file_lock->fl_flags |= FL_OFDLCK; file_lock->fl_owner = (fl_owner_t)filp; /* Fallthrough */ case F_SETLKW64: @@ -2413,8 +2424,8 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl, if (IS_POSIX(fl)) { if (fl->fl_flags & FL_ACCESS) seq_printf(f, "ACCESS"); - else if (IS_FILE_PVT(fl)) - seq_printf(f, "FLPVT "); + else if (IS_OFDLCK(fl)) + seq_printf(f, "OFDLCK"); else seq_printf(f, "POSIX "); diff --git a/fs/namei.c b/fs/namei.c index c6157c894fc..80168273396 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1542,7 +1542,7 @@ static inline int walk_component(struct nameidata *nd, struct path *path, inode = path->dentry->d_inode; } err = -ENOENT; - if (!inode) + if (!inode || d_is_negative(path->dentry)) goto out_path_put; if (should_follow_link(path->dentry, follow)) { @@ -2249,7 +2249,7 @@ mountpoint_last(struct nameidata *nd, struct path *path) mutex_unlock(&dir->d_inode->i_mutex); done: - if (!dentry->d_inode) { + if (!dentry->d_inode || d_is_negative(dentry)) { error = -ENOENT; dput(dentry); goto out; @@ -2994,7 +2994,7 @@ retry_lookup: finish_lookup: /* we _can_ be in RCU mode here */ error = -ENOENT; - if (d_is_negative(path->dentry)) { + if (!inode || d_is_negative(path->dentry)) { path_to_nameidata(path, nd); goto out; } diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index 6f3f392d48a..f66c66b9f18 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c @@ -402,8 +402,10 @@ sort_pacl(struct posix_acl *pacl) * by uid/gid. */ int i, j; - if (pacl->a_count <= 4) - return; /* no users or groups */ + /* no users or groups */ + if (!pacl || pacl->a_count <= 4) + return; + i = 1; while (pacl->a_entries[i].e_tag == ACL_USER) i++; @@ -530,13 +532,12 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags) /* * ACLs with no ACEs are treated differently in the inheritable - * and effective cases: when there are no inheritable ACEs, we - * set a zero-length default posix acl: + * and effective cases: when there are no inheritable ACEs, + * calls ->set_acl with a NULL ACL structure. */ - if (state->empty && (flags & NFS4_ACL_TYPE_DEFAULT)) { - pacl = posix_acl_alloc(0, GFP_KERNEL); - return pacl ? pacl : ERR_PTR(-ENOMEM); - } + if (state->empty && (flags & NFS4_ACL_TYPE_DEFAULT)) + return NULL; + /* * When there are no effective ACEs, the following will end * up setting a 3-element effective posix ACL with all @@ -589,7 +590,7 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags) add_to_mask(state, &state->groups->aces[i].perms); } - if (!state->users->n && !state->groups->n) { + if (state->users->n || state->groups->n) { pace++; pace->e_tag = ACL_MASK; low_mode_from_nfs4(state->mask.allow, &pace->e_perm, flags); diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 39c8ef875f9..2c73cae9899 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -654,9 +654,11 @@ static struct rpc_clnt *create_backchannel_client(struct rpc_create_args *args) static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn, struct nfsd4_session *ses) { + int maxtime = max_cb_time(clp->net); struct rpc_timeout timeparms = { - .to_initval = max_cb_time(clp->net), + .to_initval = maxtime, .to_retries = 0, + .to_maxval = maxtime, }; struct rpc_create_args args = { .net = clp->net, diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 3ba65979a3c..9a77a5a2155 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1078,6 +1078,18 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name) return NULL; } clp->cl_name.len = name.len; + INIT_LIST_HEAD(&clp->cl_sessions); + idr_init(&clp->cl_stateids); + atomic_set(&clp->cl_refcount, 0); + clp->cl_cb_state = NFSD4_CB_UNKNOWN; + INIT_LIST_HEAD(&clp->cl_idhash); + INIT_LIST_HEAD(&clp->cl_openowners); + INIT_LIST_HEAD(&clp->cl_delegations); + INIT_LIST_HEAD(&clp->cl_lru); + INIT_LIST_HEAD(&clp->cl_callbacks); + INIT_LIST_HEAD(&clp->cl_revoked); + spin_lock_init(&clp->cl_lock); + rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table"); return clp; } @@ -1095,6 +1107,7 @@ free_client(struct nfs4_client *clp) WARN_ON_ONCE(atomic_read(&ses->se_ref)); free_session(ses); } + rpc_destroy_wait_queue(&clp->cl_cb_waitq); free_svc_cred(&clp->cl_cred); kfree(clp->cl_name.data); idr_destroy(&clp->cl_stateids); @@ -1347,7 +1360,6 @@ static struct nfs4_client *create_client(struct xdr_netobj name, if (clp == NULL) return NULL; - INIT_LIST_HEAD(&clp->cl_sessions); ret = copy_cred(&clp->cl_cred, &rqstp->rq_cred); if (ret) { spin_lock(&nn->client_lock); @@ -1355,20 +1367,9 @@ static struct nfs4_client *create_client(struct xdr_netobj name, spin_unlock(&nn->client_lock); return NULL; } - idr_init(&clp->cl_stateids); - atomic_set(&clp->cl_refcount, 0); - clp->cl_cb_state = NFSD4_CB_UNKNOWN; - INIT_LIST_HEAD(&clp->cl_idhash); - INIT_LIST_HEAD(&clp->cl_openowners); - INIT_LIST_HEAD(&clp->cl_delegations); - INIT_LIST_HEAD(&clp->cl_lru); - INIT_LIST_HEAD(&clp->cl_callbacks); - INIT_LIST_HEAD(&clp->cl_revoked); - spin_lock_init(&clp->cl_lock); nfsd4_init_callback(&clp->cl_cb_null); clp->cl_time = get_seconds(); clear_bit(0, &clp->cl_cb_slot_busy); - rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table"); copy_verf(clp, verf); rpc_copy_addr((struct sockaddr *) &clp->cl_addr, sa); gen_confirm(clp); @@ -3716,9 +3717,16 @@ out: static __be32 nfsd4_free_lock_stateid(struct nfs4_ol_stateid *stp) { - if (check_for_locks(stp->st_file, lockowner(stp->st_stateowner))) + struct nfs4_lockowner *lo = lockowner(stp->st_stateowner); + + if (check_for_locks(stp->st_file, lo)) return nfserr_locks_held; - release_lock_stateid(stp); + /* + * Currently there's a 1-1 lock stateid<->lockowner + * correspondance, and we have to delete the lockowner when we + * delete the lock stateid: + */ + unhash_lockowner(lo); return nfs_ok; } @@ -4158,6 +4166,10 @@ static bool same_lockowner_ino(struct nfs4_lockowner *lo, struct inode *inode, c if (!same_owner_str(&lo->lo_owner, owner, clid)) return false; + if (list_empty(&lo->lo_owner.so_stateids)) { + WARN_ON_ONCE(1); + return false; + } lst = list_first_entry(&lo->lo_owner.so_stateids, struct nfs4_ol_stateid, st_perstateowner); return lst->st_file->fi_inode == inode; diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 2723c1badd0..18881f34737 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -3627,14 +3627,6 @@ nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op) /* nfsd4_check_resp_size guarantees enough room for error status */ if (!op->status) op->status = nfsd4_check_resp_size(resp, 0); - if (op->status == nfserr_resource && nfsd4_has_session(&resp->cstate)) { - struct nfsd4_slot *slot = resp->cstate.slot; - - if (slot->sl_flags & NFSD4_SLOT_CACHETHIS) - op->status = nfserr_rep_too_big_to_cache; - else - op->status = nfserr_rep_too_big; - } if (so) { so->so_replay.rp_status = op->status; so->so_replay.rp_buflen = (char *)resp->p - (char *)(statp+1); diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 4e565c81430..732648b270d 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -698,6 +698,8 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) } group->overflow_event = &oevent->fse; + if (force_o_largefile()) + event_f_flags |= O_LARGEFILE; group->fanotify_data.f_flags = event_f_flags; #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS spin_lock_init(&group->fanotify_data.access_lock); diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index af3f7aa73e1..ee1f88419cb 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -472,11 +472,15 @@ bail: void dlm_destroy_master_caches(void) { - if (dlm_lockname_cache) + if (dlm_lockname_cache) { kmem_cache_destroy(dlm_lockname_cache); + dlm_lockname_cache = NULL; + } - if (dlm_lockres_cache) + if (dlm_lockres_cache) { kmem_cache_destroy(dlm_lockres_cache); + dlm_lockres_cache = NULL; + } } static void dlm_lockres_release(struct kref *kref) diff --git a/fs/open.c b/fs/open.c index 3d30eb1fc95..9d64679cec7 100644 --- a/fs/open.c +++ b/fs/open.c @@ -254,17 +254,22 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len) return -EBADF; /* - * It's not possible to punch hole or perform collapse range - * on append only file + * We can only allow pure fallocate on append only files */ - if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE) - && IS_APPEND(inode)) + if ((mode & ~FALLOC_FL_KEEP_SIZE) && IS_APPEND(inode)) return -EPERM; if (IS_IMMUTABLE(inode)) return -EPERM; /* + * We can not allow to do any fallocate operation on an active + * swapfile + */ + if (IS_SWAPFILE(inode)) + ret = -ETXTBSY; + + /* * Revalidate the write permissions, in case security policy has * changed since the files were opened. */ @@ -286,14 +291,6 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0)) return -EFBIG; - /* - * There is no need to overlap collapse range with EOF, in which case - * it is effectively a truncate operation - */ - if ((mode & FALLOC_FL_COLLAPSE_RANGE) && - (offset + len >= i_size_read(inode))) - return -EINVAL; - if (!file->f_op->fallocate) return -EOPNOTSUPP; diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 9e363e41dac..0855f772cd4 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -246,6 +246,12 @@ posix_acl_equiv_mode(const struct posix_acl *acl, umode_t *mode_p) umode_t mode = 0; int not_equiv = 0; + /* + * A null ACL can always be presented as mode bits. + */ + if (!acl) + return 0; + FOREACH_ACL_ENTRY(pa, acl, pe) { switch (pa->e_tag) { case ACL_USER_OBJ: diff --git a/fs/splice.c b/fs/splice.c index 9bc07d2b53c..e246954ea48 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1537,7 +1537,7 @@ static long vmsplice_to_user(struct file *file, const struct iovec __user *uiov, struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; struct iov_iter iter; - ssize_t count = 0; + ssize_t count; pipe = get_pipe_info(file); if (!pipe) @@ -1546,8 +1546,9 @@ static long vmsplice_to_user(struct file *file, const struct iovec __user *uiov, ret = rw_copy_check_uvector(READ, uiov, nr_segs, ARRAY_SIZE(iovstack), iovstack, &iov); if (ret <= 0) - return ret; + goto out; + count = ret; iov_iter_init(&iter, iov, nr_segs, count, 0); sd.len = 0; @@ -1560,6 +1561,7 @@ static long vmsplice_to_user(struct file *file, const struct iovec __user *uiov, ret = __splice_from_pipe(pipe, &sd, pipe_to_user); pipe_unlock(pipe); +out: if (iov != iovstack) kfree(iov); diff --git a/fs/super.c b/fs/super.c index e9dc3c3fe15..48377f7463c 100644 --- a/fs/super.c +++ b/fs/super.c @@ -800,7 +800,10 @@ void emergency_remount(void) static DEFINE_IDA(unnamed_dev_ida); static DEFINE_SPINLOCK(unnamed_dev_lock);/* protects the above */ -static int unnamed_dev_start = 0; /* don't bother trying below it */ +/* Many userspace utilities consider an FSID of 0 invalid. + * Always return at least 1 from get_anon_bdev. + */ +static int unnamed_dev_start = 1; int get_anon_bdev(dev_t *p) { diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 1b8b91b67fd..e9ef59b3abb 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -47,12 +47,13 @@ static int sysfs_kf_seq_show(struct seq_file *sf, void *v) ssize_t count; char *buf; - /* acquire buffer and ensure that it's >= PAGE_SIZE */ + /* acquire buffer and ensure that it's >= PAGE_SIZE and clear */ count = seq_get_buf(sf, &buf); if (count < PAGE_SIZE) { seq_commit(sf, -1); return 0; } + memset(buf, 0, PAGE_SIZE); /* * Invoke show(). Control may reach here via seq file lseek even @@ -453,95 +454,3 @@ void sysfs_remove_bin_file(struct kobject *kobj, kernfs_remove_by_name(kobj->sd, attr->attr.name); } EXPORT_SYMBOL_GPL(sysfs_remove_bin_file); - -struct sysfs_schedule_callback_struct { - struct list_head workq_list; - struct kobject *kobj; - void (*func)(void *); - void *data; - struct module *owner; - struct work_struct work; -}; - -static struct workqueue_struct *sysfs_workqueue; -static DEFINE_MUTEX(sysfs_workq_mutex); -static LIST_HEAD(sysfs_workq); -static void sysfs_schedule_callback_work(struct work_struct *work) -{ - struct sysfs_schedule_callback_struct *ss = container_of(work, - struct sysfs_schedule_callback_struct, work); - - (ss->func)(ss->data); - kobject_put(ss->kobj); - module_put(ss->owner); - mutex_lock(&sysfs_workq_mutex); - list_del(&ss->workq_list); - mutex_unlock(&sysfs_workq_mutex); - kfree(ss); -} - -/** - * sysfs_schedule_callback - helper to schedule a callback for a kobject - * @kobj: object we're acting for. - * @func: callback function to invoke later. - * @data: argument to pass to @func. - * @owner: module owning the callback code - * - * sysfs attribute methods must not unregister themselves or their parent - * kobject (which would amount to the same thing). Attempts to do so will - * deadlock, since unregistration is mutually exclusive with driver - * callbacks. - * - * Instead methods can call this routine, which will attempt to allocate - * and schedule a workqueue request to call back @func with @data as its - * argument in the workqueue's process context. @kobj will be pinned - * until @func returns. - * - * Returns 0 if the request was submitted, -ENOMEM if storage could not - * be allocated, -ENODEV if a reference to @owner isn't available, - * -EAGAIN if a callback has already been scheduled for @kobj. - */ -int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *), - void *data, struct module *owner) -{ - struct sysfs_schedule_callback_struct *ss, *tmp; - - if (!try_module_get(owner)) - return -ENODEV; - - mutex_lock(&sysfs_workq_mutex); - list_for_each_entry_safe(ss, tmp, &sysfs_workq, workq_list) - if (ss->kobj == kobj) { - module_put(owner); - mutex_unlock(&sysfs_workq_mutex); - return -EAGAIN; - } - mutex_unlock(&sysfs_workq_mutex); - - if (sysfs_workqueue == NULL) { - sysfs_workqueue = create_singlethread_workqueue("sysfsd"); - if (sysfs_workqueue == NULL) { - module_put(owner); - return -ENOMEM; - } - } - - ss = kmalloc(sizeof(*ss), GFP_KERNEL); - if (!ss) { - module_put(owner); - return -ENOMEM; - } - kobject_get(kobj); - ss->kobj = kobj; - ss->func = func; - ss->data = data; - ss->owner = owner; - INIT_WORK(&ss->work, sysfs_schedule_callback_work); - INIT_LIST_HEAD(&ss->workq_list); - mutex_lock(&sysfs_workq_mutex); - list_add_tail(&ss->workq_list, &sysfs_workq); - mutex_unlock(&sysfs_workq_mutex); - queue_work(sysfs_workqueue, &ss->work); - return 0; -} -EXPORT_SYMBOL_GPL(sysfs_schedule_callback); diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index a66ad6196f5..8794423f7ef 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -63,7 +63,8 @@ int __init sysfs_init(void) { int err; - sysfs_root = kernfs_create_root(NULL, 0, NULL); + sysfs_root = kernfs_create_root(NULL, KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK, + NULL); if (IS_ERR(sysfs_root)) return PTR_ERR(sysfs_root); diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index a1266089eca..a81c7b55689 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -1556,7 +1556,7 @@ static int ubifs_remount_rw(struct ubifs_info *c) if (c->space_fixup) { err = ubifs_fixup_free_space(c); if (err) - return err; + goto out; } err = check_free_space(c); diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 75df77d09f7..0479c32c5eb 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1344,6 +1344,14 @@ __xfs_get_blocks( /* * If this is O_DIRECT or the mpage code calling tell them how large * the mapping is, so that we can avoid repeated get_blocks calls. + * + * If the mapping spans EOF, then we have to break the mapping up as the + * mapping for blocks beyond EOF must be marked new so that sub block + * regions can be correctly zeroed. We can't do this for mappings within + * EOF unless the mapping was just allocated or is unwritten, otherwise + * the callers would overwrite existing data with zeros. Hence we have + * to split the mapping into a range up to and including EOF, and a + * second mapping for beyond EOF. */ if (direct || size > (1 << inode->i_blkbits)) { xfs_off_t mapping_size; @@ -1354,6 +1362,12 @@ __xfs_get_blocks( ASSERT(mapping_size > 0); if (mapping_size > size) mapping_size = size; + if (offset < i_size_read(inode) && + offset + mapping_size >= i_size_read(inode)) { + /* limit mapping to block that spans EOF */ + mapping_size = roundup_64(i_size_read(inode) - offset, + 1 << inode->i_blkbits); + } if (mapping_size > LONG_MAX) mapping_size = LONG_MAX; @@ -1566,6 +1580,16 @@ xfs_vm_write_failed( xfs_vm_kill_delalloc_range(inode, block_offset, block_offset + bh->b_size); + + /* + * This buffer does not contain data anymore. make sure anyone + * who finds it knows that for certain. + */ + clear_buffer_delay(bh); + clear_buffer_uptodate(bh); + clear_buffer_mapped(bh); + clear_buffer_new(bh); + clear_buffer_dirty(bh); } } @@ -1599,12 +1623,21 @@ xfs_vm_write_begin( status = __block_write_begin(page, pos, len, xfs_get_blocks); if (unlikely(status)) { struct inode *inode = mapping->host; + size_t isize = i_size_read(inode); xfs_vm_write_failed(inode, page, pos, len); unlock_page(page); - if (pos + len > i_size_read(inode)) - truncate_pagecache(inode, i_size_read(inode)); + /* + * If the write is beyond EOF, we only want to kill blocks + * allocated in this write, not blocks that were previously + * written successfully. + */ + if (pos + len > isize) { + ssize_t start = max_t(ssize_t, pos, isize); + + truncate_pagecache_range(inode, start, pos + len); + } page_cache_release(page); page = NULL; @@ -1615,9 +1648,12 @@ xfs_vm_write_begin( } /* - * On failure, we only need to kill delalloc blocks beyond EOF because they - * will never be written. For blocks within EOF, generic_write_end() zeros them - * so they are safe to leave alone and be written with all the other valid data. + * On failure, we only need to kill delalloc blocks beyond EOF in the range of + * this specific write because they will never be written. Previous writes + * beyond EOF where block allocation succeeded do not need to be trashed, so + * only new blocks from this write should be trashed. For blocks within + * EOF, generic_write_end() zeros them so they are safe to leave alone and be + * written with all the other valid data. */ STATIC int xfs_vm_write_end( @@ -1640,8 +1676,11 @@ xfs_vm_write_end( loff_t to = pos + len; if (to > isize) { - truncate_pagecache(inode, isize); + /* only kill blocks in this write beyond EOF */ + if (pos > isize) + isize = pos; xfs_vm_kill_delalloc_range(inode, isize, to); + truncate_pagecache_range(inode, isize, to); } } return ret; diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index 01b6a0102fb..abda1124a70 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -213,7 +213,7 @@ xfs_attr_calc_size( * Out of line attribute, cannot double split, but * make room for the attribute value itself. */ - uint dblocks = XFS_B_TO_FSB(mp, valuelen); + uint dblocks = xfs_attr3_rmt_blocks(mp, valuelen); nblks += dblocks; nblks += XFS_NEXTENTADD_SPACE_RES(mp, dblocks, XFS_ATTR_FORK); } @@ -698,11 +698,22 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) trace_xfs_attr_leaf_replace(args); + /* save the attribute state for later removal*/ args->op_flags |= XFS_DA_OP_RENAME; /* an atomic rename */ args->blkno2 = args->blkno; /* set 2nd entry info*/ args->index2 = args->index; args->rmtblkno2 = args->rmtblkno; args->rmtblkcnt2 = args->rmtblkcnt; + args->rmtvaluelen2 = args->rmtvaluelen; + + /* + * clear the remote attr state now that it is saved so that the + * values reflect the state of the attribute we are about to + * add, not the attribute we just found and will remove later. + */ + args->rmtblkno = 0; + args->rmtblkcnt = 0; + args->rmtvaluelen = 0; } /* @@ -794,6 +805,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) args->blkno = args->blkno2; args->rmtblkno = args->rmtblkno2; args->rmtblkcnt = args->rmtblkcnt2; + args->rmtvaluelen = args->rmtvaluelen2; if (args->rmtblkno) { error = xfs_attr_rmtval_remove(args); if (error) @@ -999,13 +1011,22 @@ restart: trace_xfs_attr_node_replace(args); + /* save the attribute state for later removal*/ args->op_flags |= XFS_DA_OP_RENAME; /* atomic rename op */ args->blkno2 = args->blkno; /* set 2nd entry info*/ args->index2 = args->index; args->rmtblkno2 = args->rmtblkno; args->rmtblkcnt2 = args->rmtblkcnt; + args->rmtvaluelen2 = args->rmtvaluelen; + + /* + * clear the remote attr state now that it is saved so that the + * values reflect the state of the attribute we are about to + * add, not the attribute we just found and will remove later. + */ args->rmtblkno = 0; args->rmtblkcnt = 0; + args->rmtvaluelen = 0; } retval = xfs_attr3_leaf_add(blk->bp, state->args); @@ -1133,6 +1154,7 @@ restart: args->blkno = args->blkno2; args->rmtblkno = args->rmtblkno2; args->rmtblkcnt = args->rmtblkcnt2; + args->rmtvaluelen = args->rmtvaluelen2; if (args->rmtblkno) { error = xfs_attr_rmtval_remove(args); if (error) diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index fe9587fab17..511c283459b 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -1229,6 +1229,7 @@ xfs_attr3_leaf_add_work( name_rmt->valueblk = 0; args->rmtblkno = 1; args->rmtblkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen); + args->rmtvaluelen = args->valuelen; } xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, xfs_attr3_leaf_name(leaf, args->index), @@ -2167,11 +2168,11 @@ xfs_attr3_leaf_lookup_int( if (!xfs_attr_namesp_match(args->flags, entry->flags)) continue; args->index = probe; - args->valuelen = be32_to_cpu(name_rmt->valuelen); + args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen); args->rmtblkno = be32_to_cpu(name_rmt->valueblk); args->rmtblkcnt = xfs_attr3_rmt_blocks( args->dp->i_mount, - args->valuelen); + args->rmtvaluelen); return XFS_ERROR(EEXIST); } } @@ -2220,19 +2221,19 @@ xfs_attr3_leaf_getvalue( name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index); ASSERT(name_rmt->namelen == args->namelen); ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0); - valuelen = be32_to_cpu(name_rmt->valuelen); + args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen); args->rmtblkno = be32_to_cpu(name_rmt->valueblk); args->rmtblkcnt = xfs_attr3_rmt_blocks(args->dp->i_mount, - valuelen); + args->rmtvaluelen); if (args->flags & ATTR_KERNOVAL) { - args->valuelen = valuelen; + args->valuelen = args->rmtvaluelen; return 0; } - if (args->valuelen < valuelen) { - args->valuelen = valuelen; + if (args->valuelen < args->rmtvaluelen) { + args->valuelen = args->rmtvaluelen; return XFS_ERROR(ERANGE); } - args->valuelen = valuelen; + args->valuelen = args->rmtvaluelen; } return 0; } @@ -2519,7 +2520,7 @@ xfs_attr3_leaf_clearflag( ASSERT((entry->flags & XFS_ATTR_LOCAL) == 0); name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index); name_rmt->valueblk = cpu_to_be32(args->rmtblkno); - name_rmt->valuelen = cpu_to_be32(args->valuelen); + name_rmt->valuelen = cpu_to_be32(args->rmtvaluelen); xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt))); } @@ -2677,7 +2678,7 @@ xfs_attr3_leaf_flipflags( ASSERT((entry1->flags & XFS_ATTR_LOCAL) == 0); name_rmt = xfs_attr3_leaf_name_remote(leaf1, args->index); name_rmt->valueblk = cpu_to_be32(args->rmtblkno); - name_rmt->valuelen = cpu_to_be32(args->valuelen); + name_rmt->valuelen = cpu_to_be32(args->rmtvaluelen); xfs_trans_log_buf(args->trans, bp1, XFS_DA_LOGRANGE(leaf1, name_rmt, sizeof(*name_rmt))); } diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index 01db96f60cf..833fe5d98d8 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -447,6 +447,7 @@ xfs_attr3_leaf_list_int( args.dp = context->dp; args.whichfork = XFS_ATTR_FORK; args.valuelen = valuelen; + args.rmtvaluelen = valuelen; args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS); args.rmtblkno = be32_to_cpu(name_rmt->valueblk); args.rmtblkcnt = xfs_attr3_rmt_blocks( diff --git a/fs/xfs/xfs_attr_remote.c b/fs/xfs/xfs_attr_remote.c index 6e37823e293..d2e6e948cec 100644 --- a/fs/xfs/xfs_attr_remote.c +++ b/fs/xfs/xfs_attr_remote.c @@ -337,7 +337,7 @@ xfs_attr_rmtval_get( struct xfs_buf *bp; xfs_dablk_t lblkno = args->rmtblkno; __uint8_t *dst = args->value; - int valuelen = args->valuelen; + int valuelen; int nmap; int error; int blkcnt = args->rmtblkcnt; @@ -347,7 +347,9 @@ xfs_attr_rmtval_get( trace_xfs_attr_rmtval_get(args); ASSERT(!(args->flags & ATTR_KERNOVAL)); + ASSERT(args->rmtvaluelen == args->valuelen); + valuelen = args->rmtvaluelen; while (valuelen > 0) { nmap = ATTR_RMTVALUE_MAPSIZE; error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno, @@ -415,7 +417,7 @@ xfs_attr_rmtval_set( * attributes have headers, we can't just do a straight byte to FSB * conversion and have to take the header space into account. */ - blkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen); + blkcnt = xfs_attr3_rmt_blocks(mp, args->rmtvaluelen); error = xfs_bmap_first_unused(args->trans, args->dp, blkcnt, &lfileoff, XFS_ATTR_FORK); if (error) @@ -480,7 +482,7 @@ xfs_attr_rmtval_set( */ lblkno = args->rmtblkno; blkcnt = args->rmtblkcnt; - valuelen = args->valuelen; + valuelen = args->rmtvaluelen; while (valuelen > 0) { struct xfs_buf *bp; xfs_daddr_t dblkno; diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 5b6092ef51e..f0efc7e970e 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -5413,6 +5413,7 @@ xfs_bmap_shift_extents( int whichfork = XFS_DATA_FORK; int logflags; xfs_filblks_t blockcount = 0; + int total_extents; if (unlikely(XFS_TEST_ERROR( (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && @@ -5429,7 +5430,6 @@ xfs_bmap_shift_extents( ASSERT(current_ext != NULL); ifp = XFS_IFORK_PTR(ip, whichfork); - if (!(ifp->if_flags & XFS_IFEXTENTS)) { /* Read in all the extents */ error = xfs_iread_extents(tp, ip, whichfork); @@ -5456,7 +5456,6 @@ xfs_bmap_shift_extents( /* We are going to change core inode */ logflags = XFS_ILOG_CORE; - if (ifp->if_flags & XFS_IFBROOT) { cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); cur->bc_private.b.firstblock = *firstblock; @@ -5467,8 +5466,14 @@ xfs_bmap_shift_extents( logflags |= XFS_ILOG_DEXT; } - while (nexts++ < num_exts && - *current_ext < XFS_IFORK_NEXTENTS(ip, whichfork)) { + /* + * There may be delalloc extents in the data fork before the range we + * are collapsing out, so we cannot + * use the count of real extents here. Instead we have to calculate it + * from the incore fork. + */ + total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); + while (nexts++ < num_exts && *current_ext < total_extents) { gotp = xfs_iext_get_ext(ifp, *current_ext); xfs_bmbt_get_all(gotp, &got); @@ -5556,10 +5561,11 @@ xfs_bmap_shift_extents( } (*current_ext)++; + total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); } /* Check if we are done */ - if (*current_ext == XFS_IFORK_NEXTENTS(ip, whichfork)) + if (*current_ext == total_extents) *done = 1; del_cursor: @@ -5568,6 +5574,5 @@ del_cursor: error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); xfs_trans_log_inode(tp, ip, logflags); - return error; } diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 01f6a646caa..296160b8e78 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1418,6 +1418,8 @@ xfs_zero_file_space( xfs_off_t end_boundary; int error; + trace_xfs_zero_file_space(ip); + granularity = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); /* @@ -1432,9 +1434,18 @@ xfs_zero_file_space( ASSERT(end_boundary <= offset + len); if (start_boundary < end_boundary - 1) { - /* punch out the page cache over the conversion range */ + /* + * punch out delayed allocation blocks and the page cache over + * the conversion range + */ + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_bmap_punch_delalloc_range(ip, + XFS_B_TO_FSBT(mp, start_boundary), + XFS_B_TO_FSB(mp, end_boundary - start_boundary)); + xfs_iunlock(ip, XFS_ILOCK_EXCL); truncate_pagecache_range(VFS_I(ip), start_boundary, end_boundary - 1); + /* convert the blocks */ error = xfs_alloc_file_space(ip, start_boundary, end_boundary - start_boundary - 1, diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 107f2fdfe41..cb10a0aaab3 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1372,21 +1372,29 @@ xfs_buf_iorequest( xfs_buf_wait_unpin(bp); xfs_buf_hold(bp); - /* Set the count to 1 initially, this will stop an I/O + /* + * Set the count to 1 initially, this will stop an I/O * completion callout which happens before we have started * all the I/O from calling xfs_buf_ioend too early. */ atomic_set(&bp->b_io_remaining, 1); _xfs_buf_ioapply(bp); - _xfs_buf_ioend(bp, 1); + /* + * If _xfs_buf_ioapply failed, we'll get back here with + * only the reference we took above. _xfs_buf_ioend will + * drop it to zero, so we'd better not queue it for later, + * or we'll free it before it's done. + */ + _xfs_buf_ioend(bp, bp->b_error ? 0 : 1); xfs_buf_rele(bp); } /* * Waits for I/O to complete on the buffer supplied. It returns immediately if - * no I/O is pending or there is already a pending error on the buffer. It - * returns the I/O error code, if any, or 0 if there was no error. + * no I/O is pending or there is already a pending error on the buffer, in which + * case nothing will ever complete. It returns the I/O error code, if any, or + * 0 if there was no error. */ int xfs_buf_iowait( diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h index 6e95ea79f5d..201c6091d26 100644 --- a/fs/xfs/xfs_da_btree.h +++ b/fs/xfs/xfs_da_btree.h @@ -60,10 +60,12 @@ typedef struct xfs_da_args { int index; /* index of attr of interest in blk */ xfs_dablk_t rmtblkno; /* remote attr value starting blkno */ int rmtblkcnt; /* remote attr value block count */ + int rmtvaluelen; /* remote attr value length in bytes */ xfs_dablk_t blkno2; /* blkno of 2nd attr leaf of interest */ int index2; /* index of 2nd attr in blk */ xfs_dablk_t rmtblkno2; /* remote attr value starting blkno */ int rmtblkcnt2; /* remote attr value block count */ + int rmtvaluelen2; /* remote attr value length in bytes */ int op_flags; /* operation flags */ enum xfs_dacmp cmpresult; /* name compare result for lookups */ } xfs_da_args_t; diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c index 1399e187d42..753e467aa1a 100644 --- a/fs/xfs/xfs_export.c +++ b/fs/xfs/xfs_export.c @@ -237,7 +237,7 @@ xfs_fs_nfs_commit_metadata( if (!lsn) return 0; - return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); + return -_xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); } const struct export_operations xfs_export_operations = { diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 79e96ce9873..830c1c937b8 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -155,7 +155,7 @@ xfs_dir_fsync( if (!lsn) return 0; - return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); + return -_xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); } STATIC int @@ -295,7 +295,7 @@ xfs_file_aio_read( xfs_rw_ilock(ip, XFS_IOLOCK_EXCL); if (inode->i_mapping->nrpages) { - ret = -filemap_write_and_wait_range( + ret = filemap_write_and_wait_range( VFS_I(ip)->i_mapping, pos, -1); if (ret) { @@ -679,7 +679,7 @@ xfs_file_dio_aio_write( goto out; if (mapping->nrpages) { - ret = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping, + ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, pos, -1); if (ret) goto out; @@ -837,11 +837,19 @@ xfs_file_fallocate( unsigned blksize_mask = (1 << inode->i_blkbits) - 1; if (offset & blksize_mask || len & blksize_mask) { - error = -EINVAL; + error = EINVAL; + goto out_unlock; + } + + /* + * There is no need to overlap collapse range with EOF, + * in which case it is effectively a truncate operation + */ + if (offset + len >= i_size_read(inode)) { + error = EINVAL; goto out_unlock; } - ASSERT(offset + len < i_size_read(inode)); new_size = i_size_read(inode) - len; error = xfs_collapse_file_space(ip, offset, len); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 5e7a38fa6ee..768087bedba 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1334,7 +1334,8 @@ int xfs_create_tmpfile( struct xfs_inode *dp, struct dentry *dentry, - umode_t mode) + umode_t mode, + struct xfs_inode **ipp) { struct xfs_mount *mp = dp->i_mount; struct xfs_inode *ip = NULL; @@ -1402,7 +1403,6 @@ xfs_create_tmpfile( xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp); ip->i_d.di_nlink--; - d_tmpfile(dentry, VFS_I(ip)); error = xfs_iunlink(tp, ip); if (error) goto out_trans_abort; @@ -1415,6 +1415,7 @@ xfs_create_tmpfile( xfs_qm_dqrele(gdqp); xfs_qm_dqrele(pdqp); + *ipp = ip; return 0; out_trans_abort: diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 396cc1fafd0..f2fcde52b66 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -334,7 +334,7 @@ int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name, int xfs_create(struct xfs_inode *dp, struct xfs_name *name, umode_t mode, xfs_dev_t rdev, struct xfs_inode **ipp); int xfs_create_tmpfile(struct xfs_inode *dp, struct dentry *dentry, - umode_t mode); + umode_t mode, struct xfs_inode **ipp); int xfs_remove(struct xfs_inode *dp, struct xfs_name *name, struct xfs_inode *ip); int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip, diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 89b07e43ca2..36d630319a2 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -72,8 +72,8 @@ xfs_initxattrs( int error = 0; for (xattr = xattr_array; xattr->name != NULL; xattr++) { - error = xfs_attr_set(ip, xattr->name, xattr->value, - xattr->value_len, ATTR_SECURE); + error = -xfs_attr_set(ip, xattr->name, xattr->value, + xattr->value_len, ATTR_SECURE); if (error < 0) break; } @@ -93,8 +93,8 @@ xfs_init_security( struct inode *dir, const struct qstr *qstr) { - return security_inode_init_security(inode, dir, qstr, - &xfs_initxattrs, NULL); + return -security_inode_init_security(inode, dir, qstr, + &xfs_initxattrs, NULL); } static void @@ -124,15 +124,15 @@ xfs_cleanup_inode( xfs_dentry_to_name(&teardown, dentry, 0); xfs_remove(XFS_I(dir), &teardown, XFS_I(inode)); - iput(inode); } STATIC int -xfs_vn_mknod( +xfs_generic_create( struct inode *dir, struct dentry *dentry, umode_t mode, - dev_t rdev) + dev_t rdev, + bool tmpfile) /* unnamed file */ { struct inode *inode; struct xfs_inode *ip = NULL; @@ -156,8 +156,12 @@ xfs_vn_mknod( if (error) return error; - xfs_dentry_to_name(&name, dentry, mode); - error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip); + if (!tmpfile) { + xfs_dentry_to_name(&name, dentry, mode); + error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip); + } else { + error = xfs_create_tmpfile(XFS_I(dir), dentry, mode, &ip); + } if (unlikely(error)) goto out_free_acl; @@ -169,18 +173,22 @@ xfs_vn_mknod( #ifdef CONFIG_XFS_POSIX_ACL if (default_acl) { - error = xfs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); + error = -xfs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); if (error) goto out_cleanup_inode; } if (acl) { - error = xfs_set_acl(inode, acl, ACL_TYPE_ACCESS); + error = -xfs_set_acl(inode, acl, ACL_TYPE_ACCESS); if (error) goto out_cleanup_inode; } #endif - d_instantiate(dentry, inode); + if (tmpfile) + d_tmpfile(dentry, inode); + else + d_instantiate(dentry, inode); + out_free_acl: if (default_acl) posix_acl_release(default_acl); @@ -189,11 +197,23 @@ xfs_vn_mknod( return -error; out_cleanup_inode: - xfs_cleanup_inode(dir, inode, dentry); + if (!tmpfile) + xfs_cleanup_inode(dir, inode, dentry); + iput(inode); goto out_free_acl; } STATIC int +xfs_vn_mknod( + struct inode *dir, + struct dentry *dentry, + umode_t mode, + dev_t rdev) +{ + return xfs_generic_create(dir, dentry, mode, rdev, false); +} + +STATIC int xfs_vn_create( struct inode *dir, struct dentry *dentry, @@ -353,6 +373,7 @@ xfs_vn_symlink( out_cleanup_inode: xfs_cleanup_inode(dir, inode, dentry); + iput(inode); out: return -error; } @@ -1053,11 +1074,7 @@ xfs_vn_tmpfile( struct dentry *dentry, umode_t mode) { - int error; - - error = xfs_create_tmpfile(XFS_I(dir), dentry, mode); - - return -error; + return xfs_generic_create(dir, dentry, mode, 0, true); } static const struct inode_operations xfs_inode_operations = { diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 8497a00e399..a5f8bd9899d 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -616,11 +616,13 @@ xfs_log_mount( int error = 0; int min_logfsbs; - if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) - xfs_notice(mp, "Mounting Filesystem"); - else { + if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) { + xfs_notice(mp, "Mounting V%d Filesystem", + XFS_SB_VERSION_NUM(&mp->m_sb)); + } else { xfs_notice(mp, -"Mounting filesystem in no-recovery mode. Filesystem will be inconsistent."); +"Mounting V%d filesystem in no-recovery mode. Filesystem will be inconsistent.", + XFS_SB_VERSION_NUM(&mp->m_sb)); ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); } @@ -1181,11 +1183,14 @@ xlog_iodone(xfs_buf_t *bp) /* log I/O is always issued ASYNC */ ASSERT(XFS_BUF_ISASYNC(bp)); xlog_state_done_syncing(iclog, aborted); + /* - * do not reference the buffer (bp) here as we could race - * with it being freed after writing the unmount record to the - * log. + * drop the buffer lock now that we are done. Nothing references + * the buffer after this, so an unmount waiting on this lock can now + * tear it down safely. As such, it is unsafe to reference the buffer + * (bp) after the unlock as we could race with it being freed. */ + xfs_buf_unlock(bp); } /* @@ -1368,8 +1373,16 @@ xlog_alloc_log( bp = xfs_buf_alloc(mp->m_logdev_targp, 0, BTOBB(log->l_iclog_size), 0); if (!bp) goto out_free_log; - bp->b_iodone = xlog_iodone; + + /* + * The iclogbuf buffer locks are held over IO but we are not going to do + * IO yet. Hence unlock the buffer so that the log IO path can grab it + * when appropriately. + */ ASSERT(xfs_buf_islocked(bp)); + xfs_buf_unlock(bp); + + bp->b_iodone = xlog_iodone; log->l_xbuf = bp; spin_lock_init(&log->l_icloglock); @@ -1398,6 +1411,9 @@ xlog_alloc_log( if (!bp) goto out_free_iclog; + ASSERT(xfs_buf_islocked(bp)); + xfs_buf_unlock(bp); + bp->b_iodone = xlog_iodone; iclog->ic_bp = bp; iclog->ic_data = bp->b_addr; @@ -1422,7 +1438,6 @@ xlog_alloc_log( iclog->ic_callback_tail = &(iclog->ic_callback); iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize; - ASSERT(xfs_buf_islocked(iclog->ic_bp)); init_waitqueue_head(&iclog->ic_force_wait); init_waitqueue_head(&iclog->ic_write_wait); @@ -1631,6 +1646,12 @@ xlog_cksum( * we transition the iclogs to IOERROR state *after* flushing all existing * iclogs to disk. This is because we don't want anymore new transactions to be * started or completed afterwards. + * + * We lock the iclogbufs here so that we can serialise against IO completion + * during unmount. We might be processing a shutdown triggered during unmount, + * and that can occur asynchronously to the unmount thread, and hence we need to + * ensure that completes before tearing down the iclogbufs. Hence we need to + * hold the buffer lock across the log IO to acheive that. */ STATIC int xlog_bdstrat( @@ -1638,6 +1659,7 @@ xlog_bdstrat( { struct xlog_in_core *iclog = bp->b_fspriv; + xfs_buf_lock(bp); if (iclog->ic_state & XLOG_STATE_IOERROR) { xfs_buf_ioerror(bp, EIO); xfs_buf_stale(bp); @@ -1645,7 +1667,8 @@ xlog_bdstrat( /* * It would seem logical to return EIO here, but we rely on * the log state machine to propagate I/O errors instead of - * doing it here. + * doing it here. Similarly, IO completion will unlock the + * buffer, so we don't do it here. */ return 0; } @@ -1847,14 +1870,28 @@ xlog_dealloc_log( xlog_cil_destroy(log); /* - * always need to ensure that the extra buffer does not point to memory - * owned by another log buffer before we free it. + * Cycle all the iclogbuf locks to make sure all log IO completion + * is done before we tear down these buffers. */ + iclog = log->l_iclog; + for (i = 0; i < log->l_iclog_bufs; i++) { + xfs_buf_lock(iclog->ic_bp); + xfs_buf_unlock(iclog->ic_bp); + iclog = iclog->ic_next; + } + + /* + * Always need to ensure that the extra buffer does not point to memory + * owned by another log buffer before we free it. Also, cycle the lock + * first to ensure we've completed IO on it. + */ + xfs_buf_lock(log->l_xbuf); + xfs_buf_unlock(log->l_xbuf); xfs_buf_set_empty(log->l_xbuf, BTOBB(log->l_iclog_size)); xfs_buf_free(log->l_xbuf); iclog = log->l_iclog; - for (i=0; i<log->l_iclog_bufs; i++) { + for (i = 0; i < log->l_iclog_bufs; i++) { xfs_buf_free(iclog->ic_bp); next_iclog = iclog->ic_next; kmem_free(iclog); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 993cb19e7d3..944f3d9456a 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -743,8 +743,6 @@ xfs_mountfs( new_size *= mp->m_sb.sb_inodesize / XFS_DINODE_MIN_SIZE; if (mp->m_sb.sb_inoalignmt >= XFS_B_TO_FSBT(mp, new_size)) mp->m_inode_cluster_size = new_size; - xfs_info(mp, "Using inode cluster size of %d bytes", - mp->m_inode_cluster_size); } /* diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 348e4d2ed6e..dc977b6e6a3 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -843,22 +843,17 @@ xfs_qm_init_quotainfo( qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP); - if ((error = list_lru_init(&qinf->qi_lru))) { - kmem_free(qinf); - mp->m_quotainfo = NULL; - return error; - } + error = -list_lru_init(&qinf->qi_lru); + if (error) + goto out_free_qinf; /* * See if quotainodes are setup, and if not, allocate them, * and change the superblock accordingly. */ - if ((error = xfs_qm_init_quotainos(mp))) { - list_lru_destroy(&qinf->qi_lru); - kmem_free(qinf); - mp->m_quotainfo = NULL; - return error; - } + error = xfs_qm_init_quotainos(mp); + if (error) + goto out_free_lru; INIT_RADIX_TREE(&qinf->qi_uquota_tree, GFP_NOFS); INIT_RADIX_TREE(&qinf->qi_gquota_tree, GFP_NOFS); @@ -918,7 +913,7 @@ xfs_qm_init_quotainfo( qinf->qi_isoftlimit = be64_to_cpu(ddqp->d_ino_softlimit); qinf->qi_rtbhardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit); qinf->qi_rtbsoftlimit = be64_to_cpu(ddqp->d_rtb_softlimit); - + xfs_qm_dqdestroy(dqp); } else { qinf->qi_btimelimit = XFS_QM_BTIMELIMIT; @@ -935,6 +930,13 @@ xfs_qm_init_quotainfo( qinf->qi_shrinker.flags = SHRINKER_NUMA_AWARE; register_shrinker(&qinf->qi_shrinker); return 0; + +out_free_lru: + list_lru_destroy(&qinf->qi_lru); +out_free_qinf: + kmem_free(qinf); + mp->m_quotainfo = NULL; + return error; } diff --git a/fs/xfs/xfs_sb.c b/fs/xfs/xfs_sb.c index 0c0e41bbe4e..8baf61afae1 100644 --- a/fs/xfs/xfs_sb.c +++ b/fs/xfs/xfs_sb.c @@ -201,10 +201,6 @@ xfs_mount_validate_sb( * write validation, we don't need to check feature masks. */ if (check_version && XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) { - xfs_alert(mp, -"Version 5 superblock detected. This kernel has EXPERIMENTAL support enabled!\n" -"Use of these features in this kernel is at your own risk!"); - if (xfs_sb_has_compat_feature(sbp, XFS_SB_FEAT_COMPAT_UNKNOWN)) { xfs_warn(mp, diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 20537677637..3494eff8e4e 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1433,11 +1433,11 @@ xfs_fs_fill_super( if (error) goto out_free_fsname; - error = xfs_init_mount_workqueues(mp); + error = -xfs_init_mount_workqueues(mp); if (error) goto out_close_devices; - error = xfs_icsb_init_counters(mp); + error = -xfs_icsb_init_counters(mp); if (error) goto out_destroy_workqueues; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index a4ae41c179a..65d8c793a25 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -603,6 +603,7 @@ DEFINE_INODE_EVENT(xfs_readlink); DEFINE_INODE_EVENT(xfs_inactive_symlink); DEFINE_INODE_EVENT(xfs_alloc_file_space); DEFINE_INODE_EVENT(xfs_free_file_space); +DEFINE_INODE_EVENT(xfs_zero_file_space); DEFINE_INODE_EVENT(xfs_collapse_file_space); DEFINE_INODE_EVENT(xfs_readdir); #ifdef CONFIG_XFS_POSIX_ACL |