diff options
Diffstat (limited to 'fs')
218 files changed, 7124 insertions, 4336 deletions
diff --git a/fs/9p/acl.c b/fs/9p/acl.c index 02a2cf61631..51545529637 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -21,8 +21,8 @@ #include <linux/posix_acl_xattr.h> #include "xattr.h" #include "acl.h" -#include "v9fs_vfs.h" #include "v9fs.h" +#include "v9fs_vfs.h" static struct posix_acl *__v9fs_get_acl(struct p9_fid *fid, char *name) { @@ -59,7 +59,8 @@ int v9fs_get_acl(struct inode *inode, struct p9_fid *fid) struct v9fs_session_info *v9ses; v9ses = v9fs_inode2v9ses(inode); - if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) { + if (((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) || + ((v9ses->flags & V9FS_ACL_MASK) != V9FS_POSIX_ACL)) { set_cached_acl(inode, ACL_TYPE_DEFAULT, NULL); set_cached_acl(inode, ACL_TYPE_ACCESS, NULL); return 0; @@ -71,11 +72,15 @@ int v9fs_get_acl(struct inode *inode, struct p9_fid *fid) if (!IS_ERR(dacl) && !IS_ERR(pacl)) { set_cached_acl(inode, ACL_TYPE_DEFAULT, dacl); set_cached_acl(inode, ACL_TYPE_ACCESS, pacl); - posix_acl_release(dacl); - posix_acl_release(pacl); } else retval = -EIO; + if (!IS_ERR(dacl)) + posix_acl_release(dacl); + + if (!IS_ERR(pacl)) + posix_acl_release(pacl); + return retval; } @@ -100,9 +105,10 @@ int v9fs_check_acl(struct inode *inode, int mask, unsigned int flags) return -ECHILD; v9ses = v9fs_inode2v9ses(inode); - if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) { + if (((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) || + ((v9ses->flags & V9FS_ACL_MASK) != V9FS_POSIX_ACL)) { /* - * On access = client mode get the acl + * On access = client and acl = on mode get the acl * values from the server */ return 0; @@ -128,6 +134,10 @@ static int v9fs_set_acl(struct dentry *dentry, int type, struct posix_acl *acl) struct inode *inode = dentry->d_inode; set_cached_acl(inode, type, acl); + + if (!acl) + return 0; + /* Set a setxattr request to server */ size = posix_acl_xattr_size(acl->a_count); buffer = kmalloc(size, GFP_KERNEL); @@ -177,10 +187,8 @@ int v9fs_acl_chmod(struct dentry *dentry) int v9fs_set_create_acl(struct dentry *dentry, struct posix_acl *dpacl, struct posix_acl *pacl) { - if (dpacl) - v9fs_set_acl(dentry, ACL_TYPE_DEFAULT, dpacl); - if (pacl) - v9fs_set_acl(dentry, ACL_TYPE_ACCESS, pacl); + v9fs_set_acl(dentry, ACL_TYPE_DEFAULT, dpacl); + v9fs_set_acl(dentry, ACL_TYPE_ACCESS, pacl); posix_acl_release(dpacl); posix_acl_release(pacl); return 0; diff --git a/fs/9p/cache.c b/fs/9p/cache.c index 0dbe0d139ac..5b335c5086a 100644 --- a/fs/9p/cache.c +++ b/fs/9p/cache.c @@ -33,67 +33,11 @@ #define CACHETAG_LEN 11 -struct kmem_cache *vcookie_cache; - struct fscache_netfs v9fs_cache_netfs = { .name = "9p", .version = 0, }; -static void init_once(void *foo) -{ - struct v9fs_cookie *vcookie = (struct v9fs_cookie *) foo; - vcookie->fscache = NULL; - vcookie->qid = NULL; - inode_init_once(&vcookie->inode); -} - -/** - * v9fs_init_vcookiecache - initialize a cache for vcookies to maintain - * vcookie to inode mapping - * - * Returns 0 on success. - */ - -static int v9fs_init_vcookiecache(void) -{ - vcookie_cache = kmem_cache_create("vcookie_cache", - sizeof(struct v9fs_cookie), - 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), - init_once); - if (!vcookie_cache) - return -ENOMEM; - - return 0; -} - -/** - * v9fs_destroy_vcookiecache - destroy the cache of vcookies - * - */ - -static void v9fs_destroy_vcookiecache(void) -{ - kmem_cache_destroy(vcookie_cache); -} - -int __v9fs_cache_register(void) -{ - int ret; - ret = v9fs_init_vcookiecache(); - if (ret < 0) - return ret; - - return fscache_register_netfs(&v9fs_cache_netfs); -} - -void __v9fs_cache_unregister(void) -{ - v9fs_destroy_vcookiecache(); - fscache_unregister_netfs(&v9fs_cache_netfs); -} - /** * v9fs_random_cachetag - Generate a random tag to be associated * with a new cache session. @@ -133,9 +77,9 @@ static uint16_t v9fs_cache_session_get_key(const void *cookie_netfs_data, } const struct fscache_cookie_def v9fs_cache_session_index_def = { - .name = "9P.session", - .type = FSCACHE_COOKIE_TYPE_INDEX, - .get_key = v9fs_cache_session_get_key, + .name = "9P.session", + .type = FSCACHE_COOKIE_TYPE_INDEX, + .get_key = v9fs_cache_session_get_key, }; void v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses) @@ -163,33 +107,33 @@ void v9fs_cache_session_put_cookie(struct v9fs_session_info *v9ses) static uint16_t v9fs_cache_inode_get_key(const void *cookie_netfs_data, void *buffer, uint16_t bufmax) { - const struct v9fs_cookie *vcookie = cookie_netfs_data; - memcpy(buffer, &vcookie->qid->path, sizeof(vcookie->qid->path)); - - P9_DPRINTK(P9_DEBUG_FSC, "inode %p get key %llu", &vcookie->inode, - vcookie->qid->path); - return sizeof(vcookie->qid->path); + const struct v9fs_inode *v9inode = cookie_netfs_data; + memcpy(buffer, &v9inode->fscache_key->path, + sizeof(v9inode->fscache_key->path)); + P9_DPRINTK(P9_DEBUG_FSC, "inode %p get key %llu", &v9inode->vfs_inode, + v9inode->fscache_key->path); + return sizeof(v9inode->fscache_key->path); } static void v9fs_cache_inode_get_attr(const void *cookie_netfs_data, uint64_t *size) { - const struct v9fs_cookie *vcookie = cookie_netfs_data; - *size = i_size_read(&vcookie->inode); + const struct v9fs_inode *v9inode = cookie_netfs_data; + *size = i_size_read(&v9inode->vfs_inode); - P9_DPRINTK(P9_DEBUG_FSC, "inode %p get attr %llu", &vcookie->inode, + P9_DPRINTK(P9_DEBUG_FSC, "inode %p get attr %llu", &v9inode->vfs_inode, *size); } static uint16_t v9fs_cache_inode_get_aux(const void *cookie_netfs_data, void *buffer, uint16_t buflen) { - const struct v9fs_cookie *vcookie = cookie_netfs_data; - memcpy(buffer, &vcookie->qid->version, sizeof(vcookie->qid->version)); - - P9_DPRINTK(P9_DEBUG_FSC, "inode %p get aux %u", &vcookie->inode, - vcookie->qid->version); - return sizeof(vcookie->qid->version); + const struct v9fs_inode *v9inode = cookie_netfs_data; + memcpy(buffer, &v9inode->fscache_key->version, + sizeof(v9inode->fscache_key->version)); + P9_DPRINTK(P9_DEBUG_FSC, "inode %p get aux %u", &v9inode->vfs_inode, + v9inode->fscache_key->version); + return sizeof(v9inode->fscache_key->version); } static enum @@ -197,13 +141,13 @@ fscache_checkaux v9fs_cache_inode_check_aux(void *cookie_netfs_data, const void *buffer, uint16_t buflen) { - const struct v9fs_cookie *vcookie = cookie_netfs_data; + const struct v9fs_inode *v9inode = cookie_netfs_data; - if (buflen != sizeof(vcookie->qid->version)) + if (buflen != sizeof(v9inode->fscache_key->version)) return FSCACHE_CHECKAUX_OBSOLETE; - if (memcmp(buffer, &vcookie->qid->version, - sizeof(vcookie->qid->version))) + if (memcmp(buffer, &v9inode->fscache_key->version, + sizeof(v9inode->fscache_key->version))) return FSCACHE_CHECKAUX_OBSOLETE; return FSCACHE_CHECKAUX_OKAY; @@ -211,7 +155,7 @@ fscache_checkaux v9fs_cache_inode_check_aux(void *cookie_netfs_data, static void v9fs_cache_inode_now_uncached(void *cookie_netfs_data) { - struct v9fs_cookie *vcookie = cookie_netfs_data; + struct v9fs_inode *v9inode = cookie_netfs_data; struct pagevec pvec; pgoff_t first; int loop, nr_pages; @@ -220,7 +164,7 @@ static void v9fs_cache_inode_now_uncached(void *cookie_netfs_data) first = 0; for (;;) { - nr_pages = pagevec_lookup(&pvec, vcookie->inode.i_mapping, + nr_pages = pagevec_lookup(&pvec, v9inode->vfs_inode.i_mapping, first, PAGEVEC_SIZE - pagevec_count(&pvec)); if (!nr_pages) @@ -249,115 +193,114 @@ const struct fscache_cookie_def v9fs_cache_inode_index_def = { void v9fs_cache_inode_get_cookie(struct inode *inode) { - struct v9fs_cookie *vcookie; + struct v9fs_inode *v9inode; struct v9fs_session_info *v9ses; if (!S_ISREG(inode->i_mode)) return; - vcookie = v9fs_inode2cookie(inode); - if (vcookie->fscache) + v9inode = V9FS_I(inode); + if (v9inode->fscache) return; v9ses = v9fs_inode2v9ses(inode); - vcookie->fscache = fscache_acquire_cookie(v9ses->fscache, + v9inode->fscache = fscache_acquire_cookie(v9ses->fscache, &v9fs_cache_inode_index_def, - vcookie); + v9inode); P9_DPRINTK(P9_DEBUG_FSC, "inode %p get cookie %p", inode, - vcookie->fscache); + v9inode->fscache); } void v9fs_cache_inode_put_cookie(struct inode *inode) { - struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); + struct v9fs_inode *v9inode = V9FS_I(inode); - if (!vcookie->fscache) + if (!v9inode->fscache) return; P9_DPRINTK(P9_DEBUG_FSC, "inode %p put cookie %p", inode, - vcookie->fscache); + v9inode->fscache); - fscache_relinquish_cookie(vcookie->fscache, 0); - vcookie->fscache = NULL; + fscache_relinquish_cookie(v9inode->fscache, 0); + v9inode->fscache = NULL; } void v9fs_cache_inode_flush_cookie(struct inode *inode) { - struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); + struct v9fs_inode *v9inode = V9FS_I(inode); - if (!vcookie->fscache) + if (!v9inode->fscache) return; P9_DPRINTK(P9_DEBUG_FSC, "inode %p flush cookie %p", inode, - vcookie->fscache); + v9inode->fscache); - fscache_relinquish_cookie(vcookie->fscache, 1); - vcookie->fscache = NULL; + fscache_relinquish_cookie(v9inode->fscache, 1); + v9inode->fscache = NULL; } void v9fs_cache_inode_set_cookie(struct inode *inode, struct file *filp) { - struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); + struct v9fs_inode *v9inode = V9FS_I(inode); struct p9_fid *fid; - if (!vcookie->fscache) + if (!v9inode->fscache) return; - spin_lock(&vcookie->lock); + spin_lock(&v9inode->fscache_lock); fid = filp->private_data; if ((filp->f_flags & O_ACCMODE) != O_RDONLY) v9fs_cache_inode_flush_cookie(inode); else v9fs_cache_inode_get_cookie(inode); - spin_unlock(&vcookie->lock); + spin_unlock(&v9inode->fscache_lock); } void v9fs_cache_inode_reset_cookie(struct inode *inode) { - struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); + struct v9fs_inode *v9inode = V9FS_I(inode); struct v9fs_session_info *v9ses; struct fscache_cookie *old; - if (!vcookie->fscache) + if (!v9inode->fscache) return; - old = vcookie->fscache; + old = v9inode->fscache; - spin_lock(&vcookie->lock); - fscache_relinquish_cookie(vcookie->fscache, 1); + spin_lock(&v9inode->fscache_lock); + fscache_relinquish_cookie(v9inode->fscache, 1); v9ses = v9fs_inode2v9ses(inode); - vcookie->fscache = fscache_acquire_cookie(v9ses->fscache, + v9inode->fscache = fscache_acquire_cookie(v9ses->fscache, &v9fs_cache_inode_index_def, - vcookie); - + v9inode); P9_DPRINTK(P9_DEBUG_FSC, "inode %p revalidating cookie old %p new %p", - inode, old, vcookie->fscache); + inode, old, v9inode->fscache); - spin_unlock(&vcookie->lock); + spin_unlock(&v9inode->fscache_lock); } int __v9fs_fscache_release_page(struct page *page, gfp_t gfp) { struct inode *inode = page->mapping->host; - struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); + struct v9fs_inode *v9inode = V9FS_I(inode); - BUG_ON(!vcookie->fscache); + BUG_ON(!v9inode->fscache); - return fscache_maybe_release_page(vcookie->fscache, page, gfp); + return fscache_maybe_release_page(v9inode->fscache, page, gfp); } void __v9fs_fscache_invalidate_page(struct page *page) { struct inode *inode = page->mapping->host; - struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); + struct v9fs_inode *v9inode = V9FS_I(inode); - BUG_ON(!vcookie->fscache); + BUG_ON(!v9inode->fscache); if (PageFsCache(page)) { - fscache_wait_on_page_write(vcookie->fscache, page); + fscache_wait_on_page_write(v9inode->fscache, page); BUG_ON(!PageLocked(page)); - fscache_uncache_page(vcookie->fscache, page); + fscache_uncache_page(v9inode->fscache, page); } } @@ -380,13 +323,13 @@ static void v9fs_vfs_readpage_complete(struct page *page, void *data, int __v9fs_readpage_from_fscache(struct inode *inode, struct page *page) { int ret; - const struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); + const struct v9fs_inode *v9inode = V9FS_I(inode); P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page); - if (!vcookie->fscache) + if (!v9inode->fscache) return -ENOBUFS; - ret = fscache_read_or_alloc_page(vcookie->fscache, + ret = fscache_read_or_alloc_page(v9inode->fscache, page, v9fs_vfs_readpage_complete, NULL, @@ -418,13 +361,13 @@ int __v9fs_readpages_from_fscache(struct inode *inode, unsigned *nr_pages) { int ret; - const struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); + const struct v9fs_inode *v9inode = V9FS_I(inode); P9_DPRINTK(P9_DEBUG_FSC, "inode %p pages %u", inode, *nr_pages); - if (!vcookie->fscache) + if (!v9inode->fscache) return -ENOBUFS; - ret = fscache_read_or_alloc_pages(vcookie->fscache, + ret = fscache_read_or_alloc_pages(v9inode->fscache, mapping, pages, nr_pages, v9fs_vfs_readpage_complete, NULL, @@ -453,11 +396,22 @@ int __v9fs_readpages_from_fscache(struct inode *inode, void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page) { int ret; - const struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); + const struct v9fs_inode *v9inode = V9FS_I(inode); P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page); - ret = fscache_write_page(vcookie->fscache, page, GFP_KERNEL); + ret = fscache_write_page(v9inode->fscache, page, GFP_KERNEL); P9_DPRINTK(P9_DEBUG_FSC, "ret = %d", ret); if (ret != 0) v9fs_uncache_page(inode, page); } + +/* + * wait for a page to complete writing to the cache + */ +void __v9fs_fscache_wait_on_page_write(struct inode *inode, struct page *page) +{ + const struct v9fs_inode *v9inode = V9FS_I(inode); + P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page); + if (PageFsCache(page)) + fscache_wait_on_page_write(v9inode->fscache, page); +} diff --git a/fs/9p/cache.h b/fs/9p/cache.h index a94192bfaee..049507a5b01 100644 --- a/fs/9p/cache.h +++ b/fs/9p/cache.h @@ -25,20 +25,6 @@ #include <linux/fscache.h> #include <linux/spinlock.h> -extern struct kmem_cache *vcookie_cache; - -struct v9fs_cookie { - spinlock_t lock; - struct inode inode; - struct fscache_cookie *fscache; - struct p9_qid *qid; -}; - -static inline struct v9fs_cookie *v9fs_inode2cookie(const struct inode *inode) -{ - return container_of(inode, struct v9fs_cookie, inode); -} - extern struct fscache_netfs v9fs_cache_netfs; extern const struct fscache_cookie_def v9fs_cache_session_index_def; extern const struct fscache_cookie_def v9fs_cache_inode_index_def; @@ -64,23 +50,8 @@ extern int __v9fs_readpages_from_fscache(struct inode *inode, struct list_head *pages, unsigned *nr_pages); extern void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page); - - -/** - * v9fs_cache_register - Register v9fs file system with the cache - */ -static inline int v9fs_cache_register(void) -{ - return __v9fs_cache_register(); -} - -/** - * v9fs_cache_unregister - Unregister v9fs from the cache - */ -static inline void v9fs_cache_unregister(void) -{ - __v9fs_cache_unregister(); -} +extern void __v9fs_fscache_wait_on_page_write(struct inode *inode, + struct page *page); static inline int v9fs_fscache_release_page(struct page *page, gfp_t gfp) @@ -117,28 +88,27 @@ static inline void v9fs_readpage_to_fscache(struct inode *inode, static inline void v9fs_uncache_page(struct inode *inode, struct page *page) { - struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); - fscache_uncache_page(vcookie->fscache, page); + struct v9fs_inode *v9inode = V9FS_I(inode); + fscache_uncache_page(v9inode->fscache, page); BUG_ON(PageFsCache(page)); } -static inline void v9fs_vcookie_set_qid(struct inode *inode, +static inline void v9fs_fscache_set_key(struct inode *inode, struct p9_qid *qid) { - struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); - spin_lock(&vcookie->lock); - vcookie->qid = qid; - spin_unlock(&vcookie->lock); + struct v9fs_inode *v9inode = V9FS_I(inode); + spin_lock(&v9inode->fscache_lock); + v9inode->fscache_key = qid; + spin_unlock(&v9inode->fscache_lock); } -#else /* CONFIG_9P_FSCACHE */ - -static inline int v9fs_cache_register(void) +static inline void v9fs_fscache_wait_on_page_write(struct inode *inode, + struct page *page) { - return 1; + return __v9fs_fscache_wait_on_page_write(inode, page); } -static inline void v9fs_cache_unregister(void) {} +#else /* CONFIG_9P_FSCACHE */ static inline int v9fs_fscache_release_page(struct page *page, gfp_t gfp) { @@ -168,9 +138,11 @@ static inline void v9fs_readpage_to_fscache(struct inode *inode, static inline void v9fs_uncache_page(struct inode *inode, struct page *page) {} -static inline void v9fs_vcookie_set_qid(struct inode *inode, - struct p9_qid *qid) -{} +static inline void v9fs_fscache_wait_on_page_write(struct inode *inode, + struct page *page) +{ + return; +} #endif /* CONFIG_9P_FSCACHE */ #endif /* _9P_CACHE_H */ diff --git a/fs/9p/fid.c b/fs/9p/fid.c index b00223c99d7..cd63e002d82 100644 --- a/fs/9p/fid.c +++ b/fs/9p/fid.c @@ -125,46 +125,17 @@ err_out: return -ENOMEM; } -/** - * v9fs_fid_lookup - lookup for a fid, try to walk if not found - * @dentry: dentry to look for fid in - * - * Look for a fid in the specified dentry for the current user. - * If no fid is found, try to create one walking from a fid from the parent - * dentry (if it has one), or the root dentry. If the user haven't accessed - * the fs yet, attach now and walk from the root. - */ - -struct p9_fid *v9fs_fid_lookup(struct dentry *dentry) +static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry, + uid_t uid, int any) { - int i, n, l, clone, any, access; - u32 uid; - struct p9_fid *fid, *old_fid = NULL; struct dentry *ds; - struct v9fs_session_info *v9ses; char **wnames, *uname; + int i, n, l, clone, access; + struct v9fs_session_info *v9ses; + struct p9_fid *fid, *old_fid = NULL; v9ses = v9fs_inode2v9ses(dentry->d_inode); access = v9ses->flags & V9FS_ACCESS_MASK; - switch (access) { - case V9FS_ACCESS_SINGLE: - case V9FS_ACCESS_USER: - case V9FS_ACCESS_CLIENT: - uid = current_fsuid(); - any = 0; - break; - - case V9FS_ACCESS_ANY: - uid = v9ses->uid; - any = 1; - break; - - default: - uid = ~0; - any = 0; - break; - } - fid = v9fs_fid_find(dentry, uid, any); if (fid) return fid; @@ -250,6 +221,45 @@ err_out: return fid; } +/** + * v9fs_fid_lookup - lookup for a fid, try to walk if not found + * @dentry: dentry to look for fid in + * + * Look for a fid in the specified dentry for the current user. + * If no fid is found, try to create one walking from a fid from the parent + * dentry (if it has one), or the root dentry. If the user haven't accessed + * the fs yet, attach now and walk from the root. + */ + +struct p9_fid *v9fs_fid_lookup(struct dentry *dentry) +{ + uid_t uid; + int any, access; + struct v9fs_session_info *v9ses; + + v9ses = v9fs_inode2v9ses(dentry->d_inode); + access = v9ses->flags & V9FS_ACCESS_MASK; + switch (access) { + case V9FS_ACCESS_SINGLE: + case V9FS_ACCESS_USER: + case V9FS_ACCESS_CLIENT: + uid = current_fsuid(); + any = 0; + break; + + case V9FS_ACCESS_ANY: + uid = v9ses->uid; + any = 1; + break; + + default: + uid = ~0; + any = 0; + break; + } + return v9fs_fid_lookup_with_uid(dentry, uid, any); +} + struct p9_fid *v9fs_fid_clone(struct dentry *dentry) { struct p9_fid *fid, *ret; @@ -261,3 +271,39 @@ struct p9_fid *v9fs_fid_clone(struct dentry *dentry) ret = p9_client_walk(fid, 0, NULL, 1); return ret; } + +static struct p9_fid *v9fs_fid_clone_with_uid(struct dentry *dentry, uid_t uid) +{ + struct p9_fid *fid, *ret; + + fid = v9fs_fid_lookup_with_uid(dentry, uid, 0); + if (IS_ERR(fid)) + return fid; + + ret = p9_client_walk(fid, 0, NULL, 1); + return ret; +} + +struct p9_fid *v9fs_writeback_fid(struct dentry *dentry) +{ + int err; + struct p9_fid *fid; + + fid = v9fs_fid_clone_with_uid(dentry, 0); + if (IS_ERR(fid)) + goto error_out; + /* + * writeback fid will only be used to write back the + * dirty pages. We always request for the open fid in read-write + * mode so that a partial page write which result in page + * read can work. + */ + err = p9_client_open(fid, O_RDWR); + if (err < 0) { + p9_client_clunk(fid); + fid = ERR_PTR(err); + goto error_out; + } +error_out: + return fid; +} diff --git a/fs/9p/fid.h b/fs/9p/fid.h index c3bbd6af996..bb0b6e7f58f 100644 --- a/fs/9p/fid.h +++ b/fs/9p/fid.h @@ -19,7 +19,8 @@ * Boston, MA 02111-1301 USA * */ - +#ifndef FS_9P_FID_H +#define FS_9P_FID_H #include <linux/list.h> /** @@ -45,3 +46,5 @@ struct v9fs_dentry { struct p9_fid *v9fs_fid_lookup(struct dentry *dentry); struct p9_fid *v9fs_fid_clone(struct dentry *dentry); int v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid); +struct p9_fid *v9fs_writeback_fid(struct dentry *dentry); +#endif diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 2f77cd33ba8..c82b017f51f 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -39,6 +39,7 @@ static DEFINE_SPINLOCK(v9fs_sessionlist_lock); static LIST_HEAD(v9fs_sessionlist); +struct kmem_cache *v9fs_inode_cache; /* * Option Parsing (code inspired by NFS code) @@ -55,7 +56,7 @@ enum { /* Cache options */ Opt_cache_loose, Opt_fscache, /* Access options */ - Opt_access, + Opt_access, Opt_posixacl, /* Error token */ Opt_err }; @@ -73,6 +74,7 @@ static const match_table_t tokens = { {Opt_fscache, "fscache"}, {Opt_cachetag, "cachetag=%s"}, {Opt_access, "access=%s"}, + {Opt_posixacl, "posixacl"}, {Opt_err, NULL} }; @@ -194,15 +196,7 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) else if (strcmp(s, "any") == 0) v9ses->flags |= V9FS_ACCESS_ANY; else if (strcmp(s, "client") == 0) { -#ifdef CONFIG_9P_FS_POSIX_ACL v9ses->flags |= V9FS_ACCESS_CLIENT; -#else - P9_DPRINTK(P9_DEBUG_ERROR, - "access=client option not supported\n"); - kfree(s); - ret = -EINVAL; - goto free_and_return; -#endif } else { v9ses->flags |= V9FS_ACCESS_SINGLE; v9ses->uid = simple_strtoul(s, &e, 10); @@ -212,6 +206,16 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) kfree(s); break; + case Opt_posixacl: +#ifdef CONFIG_9P_FS_POSIX_ACL + v9ses->flags |= V9FS_POSIX_ACL; +#else + P9_DPRINTK(P9_DEBUG_ERROR, + "Not defined CONFIG_9P_FS_POSIX_ACL. " + "Ignoring posixacl option\n"); +#endif + break; + default: continue; } @@ -260,19 +264,12 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, list_add(&v9ses->slist, &v9fs_sessionlist); spin_unlock(&v9fs_sessionlist_lock); - v9ses->flags = V9FS_ACCESS_USER; strcpy(v9ses->uname, V9FS_DEFUSER); strcpy(v9ses->aname, V9FS_DEFANAME); v9ses->uid = ~0; v9ses->dfltuid = V9FS_DEFUID; v9ses->dfltgid = V9FS_DEFGID; - rc = v9fs_parse_options(v9ses, data); - if (rc < 0) { - retval = rc; - goto error; - } - v9ses->clnt = p9_client_create(dev_name, data); if (IS_ERR(v9ses->clnt)) { retval = PTR_ERR(v9ses->clnt); @@ -281,10 +278,20 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, goto error; } - if (p9_is_proto_dotl(v9ses->clnt)) + v9ses->flags = V9FS_ACCESS_USER; + + if (p9_is_proto_dotl(v9ses->clnt)) { + v9ses->flags = V9FS_ACCESS_CLIENT; v9ses->flags |= V9FS_PROTO_2000L; - else if (p9_is_proto_dotu(v9ses->clnt)) + } else if (p9_is_proto_dotu(v9ses->clnt)) { v9ses->flags |= V9FS_PROTO_2000U; + } + + rc = v9fs_parse_options(v9ses, data); + if (rc < 0) { + retval = rc; + goto error; + } v9ses->maxdata = v9ses->clnt->msize - P9_IOHDRSZ; @@ -306,6 +313,14 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, v9ses->flags |= V9FS_ACCESS_ANY; v9ses->uid = ~0; } + if (!v9fs_proto_dotl(v9ses) || + !((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_CLIENT)) { + /* + * We support ACL checks on clinet only if the protocol is + * 9P2000.L and access is V9FS_ACCESS_CLIENT. + */ + v9ses->flags &= ~V9FS_ACL_MASK; + } fid = p9_client_attach(v9ses->clnt, NULL, v9ses->uname, ~0, v9ses->aname); @@ -467,6 +482,63 @@ static void v9fs_sysfs_cleanup(void) kobject_put(v9fs_kobj); } +static void v9fs_inode_init_once(void *foo) +{ + struct v9fs_inode *v9inode = (struct v9fs_inode *)foo; +#ifdef CONFIG_9P_FSCACHE + v9inode->fscache = NULL; + v9inode->fscache_key = NULL; +#endif + inode_init_once(&v9inode->vfs_inode); +} + +/** + * v9fs_init_inode_cache - initialize a cache for 9P + * Returns 0 on success. + */ +static int v9fs_init_inode_cache(void) +{ + v9fs_inode_cache = kmem_cache_create("v9fs_inode_cache", + sizeof(struct v9fs_inode), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + v9fs_inode_init_once); + if (!v9fs_inode_cache) + return -ENOMEM; + + return 0; +} + +/** + * v9fs_destroy_inode_cache - destroy the cache of 9P inode + * + */ +static void v9fs_destroy_inode_cache(void) +{ + kmem_cache_destroy(v9fs_inode_cache); +} + +static int v9fs_cache_register(void) +{ + int ret; + ret = v9fs_init_inode_cache(); + if (ret < 0) + return ret; +#ifdef CONFIG_9P_FSCACHE + return fscache_register_netfs(&v9fs_cache_netfs); +#else + return ret; +#endif +} + +static void v9fs_cache_unregister(void) +{ + v9fs_destroy_inode_cache(); +#ifdef CONFIG_9P_FSCACHE + fscache_unregister_netfs(&v9fs_cache_netfs); +#endif +} + /** * init_v9fs - Initialize module * diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index c4b5d8864f0..bd8496db135 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -20,6 +20,9 @@ * Boston, MA 02111-1301 USA * */ +#ifndef FS_9P_V9FS_H +#define FS_9P_V9FS_H + #include <linux/backing-dev.h> /** @@ -28,8 +31,10 @@ * @V9FS_PROTO_2000L: whether or not to use 9P2000.l extensions * @V9FS_ACCESS_SINGLE: only the mounting user can access the hierarchy * @V9FS_ACCESS_USER: a new attach will be issued for every user (default) + * @V9FS_ACCESS_CLIENT: Just like user, but access check is performed on client. * @V9FS_ACCESS_ANY: use a single attach for all users * @V9FS_ACCESS_MASK: bit mask of different ACCESS options + * @V9FS_POSIX_ACL: POSIX ACLs are enforced * * Session flags reflect options selected by users at mount time */ @@ -37,13 +42,15 @@ V9FS_ACCESS_USER | \ V9FS_ACCESS_CLIENT) #define V9FS_ACCESS_MASK V9FS_ACCESS_ANY +#define V9FS_ACL_MASK V9FS_POSIX_ACL enum p9_session_flags { V9FS_PROTO_2000U = 0x01, V9FS_PROTO_2000L = 0x02, V9FS_ACCESS_SINGLE = 0x04, V9FS_ACCESS_USER = 0x08, - V9FS_ACCESS_CLIENT = 0x10 + V9FS_ACCESS_CLIENT = 0x10, + V9FS_POSIX_ACL = 0x20 }; /* possible values of ->cache */ @@ -109,8 +116,28 @@ struct v9fs_session_info { struct list_head slist; /* list of sessions registered with v9fs */ struct backing_dev_info bdi; struct rw_semaphore rename_sem; + struct p9_fid *root_fid; /* Used for file system sync */ +}; + +/* cache_validity flags */ +#define V9FS_INO_INVALID_ATTR 0x01 + +struct v9fs_inode { +#ifdef CONFIG_9P_FSCACHE + spinlock_t fscache_lock; + struct fscache_cookie *fscache; + struct p9_qid *fscache_key; +#endif + unsigned int cache_validity; + struct p9_fid *writeback_fid; + struct inode vfs_inode; }; +static inline struct v9fs_inode *V9FS_I(const struct inode *inode) +{ + return container_of(inode, struct v9fs_inode, vfs_inode); +} + struct p9_fid *v9fs_session_init(struct v9fs_session_info *, const char *, char *); extern void v9fs_session_close(struct v9fs_session_info *v9ses); @@ -124,16 +151,15 @@ extern int v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry); extern void v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void *p); -extern struct inode *v9fs_inode(struct v9fs_session_info *v9ses, - struct p9_fid *fid, - struct super_block *sb); - +extern struct inode *v9fs_inode_from_fid(struct v9fs_session_info *v9ses, + struct p9_fid *fid, + struct super_block *sb); extern const struct inode_operations v9fs_dir_inode_operations_dotl; extern const struct inode_operations v9fs_file_inode_operations_dotl; extern const struct inode_operations v9fs_symlink_inode_operations_dotl; -extern struct inode *v9fs_inode_dotl(struct v9fs_session_info *v9ses, - struct p9_fid *fid, - struct super_block *sb); +extern struct inode *v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses, + struct p9_fid *fid, + struct super_block *sb); /* other default globals */ #define V9FS_PORT 564 @@ -158,7 +184,7 @@ static inline int v9fs_proto_dotl(struct v9fs_session_info *v9ses) } /** - * v9fs_inode_from_fid - Helper routine to populate an inode by + * v9fs_get_inode_from_fid - Helper routine to populate an inode by * issuing a attribute request * @v9ses: session information * @fid: fid to issue attribute request for @@ -166,11 +192,12 @@ static inline int v9fs_proto_dotl(struct v9fs_session_info *v9ses) * */ static inline struct inode * -v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, - struct super_block *sb) +v9fs_get_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, + struct super_block *sb) { if (v9fs_proto_dotl(v9ses)) - return v9fs_inode_dotl(v9ses, fid, sb); + return v9fs_inode_from_fid_dotl(v9ses, fid, sb); else - return v9fs_inode(v9ses, fid, sb); + return v9fs_inode_from_fid(v9ses, fid, sb); } +#endif diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h index b789f8e597e..4014160903a 100644 --- a/fs/9p/v9fs_vfs.h +++ b/fs/9p/v9fs_vfs.h @@ -20,6 +20,8 @@ * Boston, MA 02111-1301 USA * */ +#ifndef FS_9P_V9FS_VFS_H +#define FS_9P_V9FS_VFS_H /* plan9 semantics are that created files are implicitly opened. * But linux semantics are that you call create, then open. @@ -36,6 +38,7 @@ * unlink calls remove, which is an implicit clunk. So we have to track * that kind of thing so that we don't try to clunk a dead fid. */ +#define P9_LOCK_TIMEOUT (30*HZ) extern struct file_system_type v9fs_fs_type; extern const struct address_space_operations v9fs_addr_operations; @@ -45,13 +48,15 @@ extern const struct file_operations v9fs_dir_operations; extern const struct file_operations v9fs_dir_operations_dotl; extern const struct dentry_operations v9fs_dentry_operations; extern const struct dentry_operations v9fs_cached_dentry_operations; +extern const struct file_operations v9fs_cached_file_operations; +extern const struct file_operations v9fs_cached_file_operations_dotl; +extern struct kmem_cache *v9fs_inode_cache; -#ifdef CONFIG_9P_FSCACHE struct inode *v9fs_alloc_inode(struct super_block *sb); void v9fs_destroy_inode(struct inode *inode); -#endif - struct inode *v9fs_get_inode(struct super_block *sb, int mode); +int v9fs_init_inode(struct v9fs_session_info *v9ses, + struct inode *inode, int mode); void v9fs_evict_inode(struct inode *inode); ino_t v9fs_qid2ino(struct p9_qid *qid); void v9fs_stat2inode(struct p9_wstat *, struct inode *, struct super_block *); @@ -62,8 +67,19 @@ void v9fs_inode2stat(struct inode *inode, struct p9_wstat *stat); int v9fs_uflags2omode(int uflags, int extended); ssize_t v9fs_file_readn(struct file *, char *, char __user *, u32, u64); +ssize_t v9fs_fid_readn(struct p9_fid *, char *, char __user *, u32, u64); void v9fs_blank_wstat(struct p9_wstat *wstat); int v9fs_vfs_setattr_dotl(struct dentry *, struct iattr *); int v9fs_file_fsync_dotl(struct file *filp, int datasync); - -#define P9_LOCK_TIMEOUT (30*HZ) +ssize_t v9fs_file_write_internal(struct inode *, struct p9_fid *, + const char __user *, size_t, loff_t *, int); +int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode); +int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode); +static inline void v9fs_invalidate_inode_attr(struct inode *inode) +{ + struct v9fs_inode *v9inode; + v9inode = V9FS_I(inode); + v9inode->cache_validity |= V9FS_INO_INVALID_ATTR; + return; +} +#endif diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index b7f2a8e3863..2524e4cbb8e 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -39,16 +39,16 @@ #include "v9fs.h" #include "v9fs_vfs.h" #include "cache.h" +#include "fid.h" /** - * v9fs_vfs_readpage - read an entire page in from 9P + * v9fs_fid_readpage - read an entire page in from 9P * - * @filp: file being read + * @fid: fid being read * @page: structure to page * */ - -static int v9fs_vfs_readpage(struct file *filp, struct page *page) +static int v9fs_fid_readpage(struct p9_fid *fid, struct page *page) { int retval; loff_t offset; @@ -67,7 +67,7 @@ static int v9fs_vfs_readpage(struct file *filp, struct page *page) buffer = kmap(page); offset = page_offset(page); - retval = v9fs_file_readn(filp, buffer, NULL, PAGE_CACHE_SIZE, offset); + retval = v9fs_fid_readn(fid, buffer, NULL, PAGE_CACHE_SIZE, offset); if (retval < 0) { v9fs_uncache_page(inode, page); goto done; @@ -87,6 +87,19 @@ done: } /** + * v9fs_vfs_readpage - read an entire page in from 9P + * + * @filp: file being read + * @page: structure to page + * + */ + +static int v9fs_vfs_readpage(struct file *filp, struct page *page) +{ + return v9fs_fid_readpage(filp->private_data, page); +} + +/** * v9fs_vfs_readpages - read a set of pages from 9P * * @filp: file being read @@ -124,7 +137,6 @@ static int v9fs_release_page(struct page *page, gfp_t gfp) { if (PagePrivate(page)) return 0; - return v9fs_fscache_release_page(page, gfp); } @@ -137,20 +149,89 @@ static int v9fs_release_page(struct page *page, gfp_t gfp) static void v9fs_invalidate_page(struct page *page, unsigned long offset) { + /* + * If called with zero offset, we should release + * the private state assocated with the page + */ if (offset == 0) v9fs_fscache_invalidate_page(page); } +static int v9fs_vfs_writepage_locked(struct page *page) +{ + char *buffer; + int retval, len; + loff_t offset, size; + mm_segment_t old_fs; + struct v9fs_inode *v9inode; + struct inode *inode = page->mapping->host; + + v9inode = V9FS_I(inode); + size = i_size_read(inode); + if (page->index == size >> PAGE_CACHE_SHIFT) + len = size & ~PAGE_CACHE_MASK; + else + len = PAGE_CACHE_SIZE; + + set_page_writeback(page); + + buffer = kmap(page); + offset = page_offset(page); + + old_fs = get_fs(); + set_fs(get_ds()); + /* We should have writeback_fid always set */ + BUG_ON(!v9inode->writeback_fid); + + retval = v9fs_file_write_internal(inode, + v9inode->writeback_fid, + (__force const char __user *)buffer, + len, &offset, 0); + if (retval > 0) + retval = 0; + + set_fs(old_fs); + kunmap(page); + end_page_writeback(page); + return retval; +} + +static int v9fs_vfs_writepage(struct page *page, struct writeback_control *wbc) +{ + int retval; + + retval = v9fs_vfs_writepage_locked(page); + if (retval < 0) { + if (retval == -EAGAIN) { + redirty_page_for_writepage(wbc, page); + retval = 0; + } else { + SetPageError(page); + mapping_set_error(page->mapping, retval); + } + } else + retval = 0; + + unlock_page(page); + return retval; +} + /** * v9fs_launder_page - Writeback a dirty page - * Since the writes go directly to the server, we simply return a 0 - * here to indicate success. - * * Returns 0 on success. */ static int v9fs_launder_page(struct page *page) { + int retval; + struct inode *inode = page->mapping->host; + + v9fs_fscache_wait_on_page_write(inode, page); + if (clear_page_dirty_for_io(page)) { + retval = v9fs_vfs_writepage_locked(page); + if (retval) + return retval; + } return 0; } @@ -173,9 +254,15 @@ static int v9fs_launder_page(struct page *page) * with an error. * */ -ssize_t v9fs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, - loff_t pos, unsigned long nr_segs) +static ssize_t +v9fs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, + loff_t pos, unsigned long nr_segs) { + /* + * FIXME + * Now that we do caching with cache mode enabled, We need + * to support direct IO + */ P9_DPRINTK(P9_DEBUG_VFS, "v9fs_direct_IO: v9fs_direct_IO (%s) " "off/no(%lld/%lu) EINVAL\n", iocb->ki_filp->f_path.dentry->d_name.name, @@ -183,11 +270,84 @@ ssize_t v9fs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, return -EINVAL; } + +static int v9fs_write_begin(struct file *filp, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int retval = 0; + struct page *page; + struct v9fs_inode *v9inode; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + struct inode *inode = mapping->host; + + v9inode = V9FS_I(inode); +start: + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) { + retval = -ENOMEM; + goto out; + } + BUG_ON(!v9inode->writeback_fid); + if (PageUptodate(page)) + goto out; + + if (len == PAGE_CACHE_SIZE) + goto out; + + retval = v9fs_fid_readpage(v9inode->writeback_fid, page); + page_cache_release(page); + if (!retval) + goto start; +out: + *pagep = page; + return retval; +} + +static int v9fs_write_end(struct file *filp, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + loff_t last_pos = pos + copied; + struct inode *inode = page->mapping->host; + + if (unlikely(copied < len)) { + /* + * zero out the rest of the area + */ + unsigned from = pos & (PAGE_CACHE_SIZE - 1); + + zero_user(page, from + copied, len - copied); + flush_dcache_page(page); + } + + if (!PageUptodate(page)) + SetPageUptodate(page); + /* + * No need to use i_size_read() here, the i_size + * cannot change under us because we hold the i_mutex. + */ + if (last_pos > inode->i_size) { + inode_add_bytes(inode, last_pos - inode->i_size); + i_size_write(inode, last_pos); + } + set_page_dirty(page); + unlock_page(page); + page_cache_release(page); + + return copied; +} + + const struct address_space_operations v9fs_addr_operations = { - .readpage = v9fs_vfs_readpage, - .readpages = v9fs_vfs_readpages, - .releasepage = v9fs_release_page, - .invalidatepage = v9fs_invalidate_page, - .launder_page = v9fs_launder_page, - .direct_IO = v9fs_direct_IO, + .readpage = v9fs_vfs_readpage, + .readpages = v9fs_vfs_readpages, + .set_page_dirty = __set_page_dirty_nobuffers, + .writepage = v9fs_vfs_writepage, + .write_begin = v9fs_write_begin, + .write_end = v9fs_write_end, + .releasepage = v9fs_release_page, + .invalidatepage = v9fs_invalidate_page, + .launder_page = v9fs_launder_page, + .direct_IO = v9fs_direct_IO, }; diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c index 233b7d4ffe5..b6a3b9f7fe4 100644 --- a/fs/9p/vfs_dentry.c +++ b/fs/9p/vfs_dentry.c @@ -63,20 +63,15 @@ static int v9fs_dentry_delete(const struct dentry *dentry) * v9fs_cached_dentry_delete - called when dentry refcount equals 0 * @dentry: dentry in question * - * Only return 1 if our inode is invalid. Only non-synthetic files - * (ones without mtime == 0) should be calling this function. - * */ - static int v9fs_cached_dentry_delete(const struct dentry *dentry) { - struct inode *inode = dentry->d_inode; - P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_name.name, - dentry); + P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", + dentry->d_name.name, dentry); - if(!inode) + /* Don't cache negative dentries */ + if (!dentry->d_inode) return 1; - return 0; } @@ -105,7 +100,41 @@ static void v9fs_dentry_release(struct dentry *dentry) } } +static int v9fs_lookup_revalidate(struct dentry *dentry, struct nameidata *nd) +{ + struct p9_fid *fid; + struct inode *inode; + struct v9fs_inode *v9inode; + + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + + inode = dentry->d_inode; + if (!inode) + goto out_valid; + + v9inode = V9FS_I(inode); + if (v9inode->cache_validity & V9FS_INO_INVALID_ATTR) { + int retval; + struct v9fs_session_info *v9ses; + fid = v9fs_fid_lookup(dentry); + if (IS_ERR(fid)) + return PTR_ERR(fid); + + v9ses = v9fs_inode2v9ses(inode); + if (v9fs_proto_dotl(v9ses)) + retval = v9fs_refresh_inode_dotl(fid, inode); + else + retval = v9fs_refresh_inode(fid, inode); + if (retval <= 0) + return retval; + } +out_valid: + return 1; +} + const struct dentry_operations v9fs_cached_dentry_operations = { + .d_revalidate = v9fs_lookup_revalidate, .d_delete = v9fs_cached_dentry_delete, .d_release = v9fs_dentry_release, }; diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index b84ebe8cefe..9c2bdda5cd9 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -295,7 +295,6 @@ int v9fs_dir_release(struct inode *inode, struct file *filp) P9_DPRINTK(P9_DEBUG_VFS, "v9fs_dir_release: inode: %p filp: %p fid: %d\n", inode, filp, fid ? fid->fid : -1); - filemap_write_and_wait(inode->i_mapping); if (fid) p9_client_clunk(fid); return 0; diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 240c3067439..78bcb97c342 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -44,8 +44,7 @@ #include "fid.h" #include "cache.h" -static const struct file_operations v9fs_cached_file_operations; -static const struct file_operations v9fs_cached_file_operations_dotl; +static const struct vm_operations_struct v9fs_file_vm_ops; /** * v9fs_file_open - open a file (or directory) @@ -57,11 +56,13 @@ static const struct file_operations v9fs_cached_file_operations_dotl; int v9fs_file_open(struct inode *inode, struct file *file) { int err; + struct v9fs_inode *v9inode; struct v9fs_session_info *v9ses; struct p9_fid *fid; int omode; P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, file); + v9inode = V9FS_I(inode); v9ses = v9fs_inode2v9ses(inode); if (v9fs_proto_dotl(v9ses)) omode = file->f_flags; @@ -89,20 +90,30 @@ int v9fs_file_open(struct inode *inode, struct file *file) } file->private_data = fid; - if ((fid->qid.version) && (v9ses->cache)) { - P9_DPRINTK(P9_DEBUG_VFS, "cached"); - /* enable cached file options */ - if(file->f_op == &v9fs_file_operations) - file->f_op = &v9fs_cached_file_operations; - else if (file->f_op == &v9fs_file_operations_dotl) - file->f_op = &v9fs_cached_file_operations_dotl; - + if (v9ses->cache && !v9inode->writeback_fid) { + /* + * clone a fid and add it to writeback_fid + * we do it during open time instead of + * page dirty time via write_begin/page_mkwrite + * because we want write after unlink usecase + * to work. + */ + fid = v9fs_writeback_fid(file->f_path.dentry); + if (IS_ERR(fid)) { + err = PTR_ERR(fid); + goto out_error; + } + v9inode->writeback_fid = (void *) fid; + } #ifdef CONFIG_9P_FSCACHE + if (v9ses->cache) v9fs_cache_inode_set_cookie(inode, file); #endif - } - return 0; +out_error: + p9_client_clunk(file->private_data); + file->private_data = NULL; + return err; } /** @@ -335,25 +346,22 @@ out_err: } /** - * v9fs_file_readn - read from a file - * @filp: file pointer to read + * v9fs_fid_readn - read from a fid + * @fid: fid to read * @data: data buffer to read data into * @udata: user data buffer to read data into * @count: size of buffer * @offset: offset at which to read data * */ - ssize_t -v9fs_file_readn(struct file *filp, char *data, char __user *udata, u32 count, +v9fs_fid_readn(struct p9_fid *fid, char *data, char __user *udata, u32 count, u64 offset) { int n, total, size; - struct p9_fid *fid = filp->private_data; P9_DPRINTK(P9_DEBUG_VFS, "fid %d offset %llu count %d\n", fid->fid, - (long long unsigned) offset, count); - + (long long unsigned) offset, count); n = 0; total = 0; size = fid->iounit ? fid->iounit : fid->clnt->msize - P9_IOHDRSZ; @@ -379,6 +387,22 @@ v9fs_file_readn(struct file *filp, char *data, char __user *udata, u32 count, } /** + * v9fs_file_readn - read from a file + * @filp: file pointer to read + * @data: data buffer to read data into + * @udata: user data buffer to read data into + * @count: size of buffer + * @offset: offset at which to read data + * + */ +ssize_t +v9fs_file_readn(struct file *filp, char *data, char __user *udata, u32 count, + u64 offset) +{ + return v9fs_fid_readn(filp->private_data, data, udata, count, offset); +} + +/** * v9fs_file_read - read from a file * @filp: file pointer to read * @udata: user data buffer to read data into @@ -410,45 +434,22 @@ v9fs_file_read(struct file *filp, char __user *udata, size_t count, return ret; } -/** - * v9fs_file_write - write to a file - * @filp: file pointer to write - * @data: data buffer to write data from - * @count: size of buffer - * @offset: offset at which to write data - * - */ - -static ssize_t -v9fs_file_write(struct file *filp, const char __user * data, - size_t count, loff_t * offset) +ssize_t +v9fs_file_write_internal(struct inode *inode, struct p9_fid *fid, + const char __user *data, size_t count, + loff_t *offset, int invalidate) { - ssize_t retval; - size_t total = 0; int n; - struct p9_fid *fid; + loff_t i_size; + size_t total = 0; struct p9_client *clnt; - struct inode *inode = filp->f_path.dentry->d_inode; loff_t origin = *offset; unsigned long pg_start, pg_end; P9_DPRINTK(P9_DEBUG_VFS, "data %p count %d offset %x\n", data, (int)count, (int)*offset); - fid = filp->private_data; clnt = fid->clnt; - - retval = generic_write_checks(filp, &origin, &count, 0); - if (retval) - goto out; - - retval = -EINVAL; - if ((ssize_t) count < 0) - goto out; - retval = 0; - if (!count) - goto out; - do { n = p9_client_write(fid, NULL, data+total, origin+total, count); if (n <= 0) @@ -457,25 +458,60 @@ v9fs_file_write(struct file *filp, const char __user * data, total += n; } while (count > 0); - if (total > 0) { + if (invalidate && (total > 0)) { pg_start = origin >> PAGE_CACHE_SHIFT; pg_end = (origin + total - 1) >> PAGE_CACHE_SHIFT; if (inode->i_mapping && inode->i_mapping->nrpages) invalidate_inode_pages2_range(inode->i_mapping, pg_start, pg_end); *offset += total; - i_size_write(inode, i_size_read(inode) + total); - inode->i_blocks = (i_size_read(inode) + 512 - 1) >> 9; + i_size = i_size_read(inode); + if (*offset > i_size) { + inode_add_bytes(inode, *offset - i_size); + i_size_write(inode, *offset); + } } - if (n < 0) - retval = n; - else - retval = total; + return n; + + return total; +} + +/** + * v9fs_file_write - write to a file + * @filp: file pointer to write + * @data: data buffer to write data from + * @count: size of buffer + * @offset: offset at which to write data + * + */ +static ssize_t +v9fs_file_write(struct file *filp, const char __user * data, + size_t count, loff_t *offset) +{ + ssize_t retval = 0; + loff_t origin = *offset; + + + retval = generic_write_checks(filp, &origin, &count, 0); + if (retval) + goto out; + + retval = -EINVAL; + if ((ssize_t) count < 0) + goto out; + retval = 0; + if (!count) + goto out; + + return v9fs_file_write_internal(filp->f_path.dentry->d_inode, + filp->private_data, + data, count, offset, 1); out: return retval; } + static int v9fs_file_fsync(struct file *filp, int datasync) { struct p9_fid *fid; @@ -505,28 +541,182 @@ int v9fs_file_fsync_dotl(struct file *filp, int datasync) return retval; } -static const struct file_operations v9fs_cached_file_operations = { +static int +v9fs_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + int retval; + + retval = generic_file_mmap(file, vma); + if (!retval) + vma->vm_ops = &v9fs_file_vm_ops; + + return retval; +} + +static int +v9fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct v9fs_inode *v9inode; + struct page *page = vmf->page; + struct file *filp = vma->vm_file; + struct inode *inode = filp->f_path.dentry->d_inode; + + + P9_DPRINTK(P9_DEBUG_VFS, "page %p fid %lx\n", + page, (unsigned long)filp->private_data); + + v9inode = V9FS_I(inode); + /* make sure the cache has finished storing the page */ + v9fs_fscache_wait_on_page_write(inode, page); + BUG_ON(!v9inode->writeback_fid); + lock_page(page); + if (page->mapping != inode->i_mapping) + goto out_unlock; + + return VM_FAULT_LOCKED; +out_unlock: + unlock_page(page); + return VM_FAULT_NOPAGE; +} + +static ssize_t +v9fs_direct_read(struct file *filp, char __user *udata, size_t count, + loff_t *offsetp) +{ + loff_t size, offset; + struct inode *inode; + struct address_space *mapping; + + offset = *offsetp; + mapping = filp->f_mapping; + inode = mapping->host; + if (!count) + return 0; + size = i_size_read(inode); + if (offset < size) + filemap_write_and_wait_range(mapping, offset, + offset + count - 1); + + return v9fs_file_read(filp, udata, count, offsetp); +} + +/** + * v9fs_cached_file_read - read from a file + * @filp: file pointer to read + * @udata: user data buffer to read data into + * @count: size of buffer + * @offset: offset at which to read data + * + */ +static ssize_t +v9fs_cached_file_read(struct file *filp, char __user *data, size_t count, + loff_t *offset) +{ + if (filp->f_flags & O_DIRECT) + return v9fs_direct_read(filp, data, count, offset); + return do_sync_read(filp, data, count, offset); +} + +static ssize_t +v9fs_direct_write(struct file *filp, const char __user * data, + size_t count, loff_t *offsetp) +{ + loff_t offset; + ssize_t retval; + struct inode *inode; + struct address_space *mapping; + + offset = *offsetp; + mapping = filp->f_mapping; + inode = mapping->host; + if (!count) + return 0; + + mutex_lock(&inode->i_mutex); + retval = filemap_write_and_wait_range(mapping, offset, + offset + count - 1); + if (retval) + goto err_out; + /* + * After a write we want buffered reads to be sure to go to disk to get + * the new data. We invalidate clean cached page from the region we're + * about to write. We do this *before* the write so that if we fail + * here we fall back to buffered write + */ + if (mapping->nrpages) { + pgoff_t pg_start = offset >> PAGE_CACHE_SHIFT; + pgoff_t pg_end = (offset + count - 1) >> PAGE_CACHE_SHIFT; + + retval = invalidate_inode_pages2_range(mapping, + pg_start, pg_end); + /* + * If a page can not be invalidated, fall back + * to buffered write. + */ + if (retval) { + if (retval == -EBUSY) + goto buff_write; + goto err_out; + } + } + retval = v9fs_file_write(filp, data, count, offsetp); +err_out: + mutex_unlock(&inode->i_mutex); + return retval; + +buff_write: + mutex_unlock(&inode->i_mutex); + return do_sync_write(filp, data, count, offsetp); +} + +/** + * v9fs_cached_file_write - write to a file + * @filp: file pointer to write + * @data: data buffer to write data from + * @count: size of buffer + * @offset: offset at which to write data + * + */ +static ssize_t +v9fs_cached_file_write(struct file *filp, const char __user * data, + size_t count, loff_t *offset) +{ + + if (filp->f_flags & O_DIRECT) + return v9fs_direct_write(filp, data, count, offset); + return do_sync_write(filp, data, count, offset); +} + +static const struct vm_operations_struct v9fs_file_vm_ops = { + .fault = filemap_fault, + .page_mkwrite = v9fs_vm_page_mkwrite, +}; + + +const struct file_operations v9fs_cached_file_operations = { .llseek = generic_file_llseek, - .read = do_sync_read, + .read = v9fs_cached_file_read, + .write = v9fs_cached_file_write, .aio_read = generic_file_aio_read, - .write = v9fs_file_write, + .aio_write = generic_file_aio_write, .open = v9fs_file_open, .release = v9fs_dir_release, .lock = v9fs_file_lock, - .mmap = generic_file_readonly_mmap, + .mmap = v9fs_file_mmap, .fsync = v9fs_file_fsync, }; -static const struct file_operations v9fs_cached_file_operations_dotl = { +const struct file_operations v9fs_cached_file_operations_dotl = { .llseek = generic_file_llseek, - .read = do_sync_read, + .read = v9fs_cached_file_read, + .write = v9fs_cached_file_write, .aio_read = generic_file_aio_read, - .write = v9fs_file_write, + .aio_write = generic_file_aio_write, .open = v9fs_file_open, .release = v9fs_dir_release, .lock = v9fs_file_lock_dotl, .flock = v9fs_file_flock_dotl, - .mmap = generic_file_readonly_mmap, + .mmap = v9fs_file_mmap, .fsync = v9fs_file_fsync_dotl, }; diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index b76a40bdf4c..8a2c232f708 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -203,26 +203,25 @@ v9fs_blank_wstat(struct p9_wstat *wstat) wstat->extension = NULL; } -#ifdef CONFIG_9P_FSCACHE /** * v9fs_alloc_inode - helper function to allocate an inode - * This callback is executed before setting up the inode so that we - * can associate a vcookie with each inode. * */ - struct inode *v9fs_alloc_inode(struct super_block *sb) { - struct v9fs_cookie *vcookie; - vcookie = (struct v9fs_cookie *)kmem_cache_alloc(vcookie_cache, - GFP_KERNEL); - if (!vcookie) + struct v9fs_inode *v9inode; + v9inode = (struct v9fs_inode *)kmem_cache_alloc(v9fs_inode_cache, + GFP_KERNEL); + if (!v9inode) return NULL; - - vcookie->fscache = NULL; - vcookie->qid = NULL; - spin_lock_init(&vcookie->lock); - return &vcookie->inode; +#ifdef CONFIG_9P_FSCACHE + v9inode->fscache = NULL; + v9inode->fscache_key = NULL; + spin_lock_init(&v9inode->fscache_lock); +#endif + v9inode->writeback_fid = NULL; + v9inode->cache_validity = 0; + return &v9inode->vfs_inode; } /** @@ -234,35 +233,18 @@ static void v9fs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); INIT_LIST_HEAD(&inode->i_dentry); - kmem_cache_free(vcookie_cache, v9fs_inode2cookie(inode)); + kmem_cache_free(v9fs_inode_cache, V9FS_I(inode)); } void v9fs_destroy_inode(struct inode *inode) { call_rcu(&inode->i_rcu, v9fs_i_callback); } -#endif -/** - * v9fs_get_inode - helper function to setup an inode - * @sb: superblock - * @mode: mode to setup inode with - * - */ - -struct inode *v9fs_get_inode(struct super_block *sb, int mode) +int v9fs_init_inode(struct v9fs_session_info *v9ses, + struct inode *inode, int mode) { - int err; - struct inode *inode; - struct v9fs_session_info *v9ses = sb->s_fs_info; - - P9_DPRINTK(P9_DEBUG_VFS, "super block: %p mode: %o\n", sb, mode); - - inode = new_inode(sb); - if (!inode) { - P9_EPRINTK(KERN_WARNING, "Problem allocating inode\n"); - return ERR_PTR(-ENOMEM); - } + int err = 0; inode_init_owner(inode, NULL, mode); inode->i_blocks = 0; @@ -292,14 +274,20 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode) case S_IFREG: if (v9fs_proto_dotl(v9ses)) { inode->i_op = &v9fs_file_inode_operations_dotl; - inode->i_fop = &v9fs_file_operations_dotl; + if (v9ses->cache) + inode->i_fop = + &v9fs_cached_file_operations_dotl; + else + inode->i_fop = &v9fs_file_operations_dotl; } else { inode->i_op = &v9fs_file_inode_operations; - inode->i_fop = &v9fs_file_operations; + if (v9ses->cache) + inode->i_fop = &v9fs_cached_file_operations; + else + inode->i_fop = &v9fs_file_operations; } break; - case S_IFLNK: if (!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses)) { P9_DPRINTK(P9_DEBUG_ERROR, "extended modes used with " @@ -335,12 +323,37 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode) err = -EINVAL; goto error; } +error: + return err; - return inode; +} -error: - iput(inode); - return ERR_PTR(err); +/** + * v9fs_get_inode - helper function to setup an inode + * @sb: superblock + * @mode: mode to setup inode with + * + */ + +struct inode *v9fs_get_inode(struct super_block *sb, int mode) +{ + int err; + struct inode *inode; + struct v9fs_session_info *v9ses = sb->s_fs_info; + + P9_DPRINTK(P9_DEBUG_VFS, "super block: %p mode: %o\n", sb, mode); + + inode = new_inode(sb); + if (!inode) { + P9_EPRINTK(KERN_WARNING, "Problem allocating inode\n"); + return ERR_PTR(-ENOMEM); + } + err = v9fs_init_inode(v9ses, inode, mode); + if (err) { + iput(inode); + return ERR_PTR(err); + } + return inode; } /* @@ -403,6 +416,8 @@ error: */ void v9fs_evict_inode(struct inode *inode) { + struct v9fs_inode *v9inode = V9FS_I(inode); + truncate_inode_pages(inode->i_mapping, 0); end_writeback(inode); filemap_fdatawrite(inode->i_mapping); @@ -410,41 +425,67 @@ void v9fs_evict_inode(struct inode *inode) #ifdef CONFIG_9P_FSCACHE v9fs_cache_inode_put_cookie(inode); #endif + /* clunk the fid stashed in writeback_fid */ + if (v9inode->writeback_fid) { + p9_client_clunk(v9inode->writeback_fid); + v9inode->writeback_fid = NULL; + } } -struct inode * -v9fs_inode(struct v9fs_session_info *v9ses, struct p9_fid *fid, - struct super_block *sb) +static struct inode *v9fs_qid_iget(struct super_block *sb, + struct p9_qid *qid, + struct p9_wstat *st) { - int err, umode; - struct inode *ret = NULL; - struct p9_wstat *st; - - st = p9_client_stat(fid); - if (IS_ERR(st)) - return ERR_CAST(st); + int retval, umode; + unsigned long i_ino; + struct inode *inode; + struct v9fs_session_info *v9ses = sb->s_fs_info; + i_ino = v9fs_qid2ino(qid); + inode = iget_locked(sb, i_ino); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + /* + * initialize the inode with the stat info + * FIXME!! we may need support for stale inodes + * later. + */ umode = p9mode2unixmode(v9ses, st->mode); - ret = v9fs_get_inode(sb, umode); - if (IS_ERR(ret)) { - err = PTR_ERR(ret); + retval = v9fs_init_inode(v9ses, inode, umode); + if (retval) goto error; - } - - v9fs_stat2inode(st, ret, sb); - ret->i_ino = v9fs_qid2ino(&st->qid); + v9fs_stat2inode(st, inode, sb); #ifdef CONFIG_9P_FSCACHE - v9fs_vcookie_set_qid(ret, &st->qid); - v9fs_cache_inode_get_cookie(ret); + v9fs_fscache_set_key(inode, &st->qid); + v9fs_cache_inode_get_cookie(inode); #endif - p9stat_free(st); - kfree(st); - return ret; + unlock_new_inode(inode); + return inode; error: + unlock_new_inode(inode); + iput(inode); + return ERR_PTR(retval); + +} + +struct inode * +v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, + struct super_block *sb) +{ + struct p9_wstat *st; + struct inode *inode = NULL; + + st = p9_client_stat(fid); + if (IS_ERR(st)) + return ERR_CAST(st); + + inode = v9fs_qid_iget(sb, &st->qid, st); p9stat_free(st); kfree(st); - return ERR_PTR(err); + return inode; } /** @@ -458,8 +499,8 @@ error: static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir) { int retval; - struct inode *file_inode; struct p9_fid *v9fid; + struct inode *file_inode; P9_DPRINTK(P9_DEBUG_VFS, "inode: %p dentry: %p rmdir: %d\n", dir, file, rmdir); @@ -470,8 +511,20 @@ static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir) return PTR_ERR(v9fid); retval = p9_client_remove(v9fid); - if (!retval) - drop_nlink(file_inode); + if (!retval) { + /* + * directories on unlink should have zero + * link count + */ + if (rmdir) { + clear_nlink(file_inode); + drop_nlink(dir); + } else + drop_nlink(file_inode); + + v9fs_invalidate_inode_attr(file_inode); + v9fs_invalidate_inode_attr(dir); + } return retval; } @@ -531,7 +584,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir, } /* instantiate inode and assign the unopened fid to the dentry */ - inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); + inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb); if (IS_ERR(inode)) { err = PTR_ERR(inode); P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err); @@ -570,9 +623,10 @@ v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode, int err; u32 perm; int flags; - struct v9fs_session_info *v9ses; - struct p9_fid *fid; struct file *filp; + struct v9fs_inode *v9inode; + struct v9fs_session_info *v9ses; + struct p9_fid *fid, *inode_fid; err = 0; fid = NULL; @@ -592,8 +646,25 @@ v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode, goto error; } + v9fs_invalidate_inode_attr(dir); /* if we are opening a file, assign the open fid to the file */ if (nd && nd->flags & LOOKUP_OPEN) { + v9inode = V9FS_I(dentry->d_inode); + if (v9ses->cache && !v9inode->writeback_fid) { + /* + * clone a fid and add it to writeback_fid + * we do it during open time instead of + * page dirty time via write_begin/page_mkwrite + * because we want write after unlink usecase + * to work. + */ + inode_fid = v9fs_writeback_fid(dentry); + if (IS_ERR(inode_fid)) { + err = PTR_ERR(inode_fid); + goto error; + } + v9inode->writeback_fid = (void *) inode_fid; + } filp = lookup_instantiate_filp(nd, dentry, generic_file_open); if (IS_ERR(filp)) { err = PTR_ERR(filp); @@ -601,6 +672,10 @@ v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode, } filp->private_data = fid; +#ifdef CONFIG_9P_FSCACHE + if (v9ses->cache) + v9fs_cache_inode_set_cookie(dentry->d_inode, filp); +#endif } else p9_client_clunk(fid); @@ -625,8 +700,8 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) { int err; u32 perm; - struct v9fs_session_info *v9ses; struct p9_fid *fid; + struct v9fs_session_info *v9ses; P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name); err = 0; @@ -636,6 +711,9 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) if (IS_ERR(fid)) { err = PTR_ERR(fid); fid = NULL; + } else { + inc_nlink(dir); + v9fs_invalidate_inode_attr(dir); } if (fid) @@ -687,7 +765,7 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, return ERR_PTR(result); } - inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); + inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb); if (IS_ERR(inode)) { result = PTR_ERR(inode); inode = NULL; @@ -747,17 +825,19 @@ int v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { + int retval; struct inode *old_inode; + struct inode *new_inode; struct v9fs_session_info *v9ses; struct p9_fid *oldfid; struct p9_fid *olddirfid; struct p9_fid *newdirfid; struct p9_wstat wstat; - int retval; P9_DPRINTK(P9_DEBUG_VFS, "\n"); retval = 0; old_inode = old_dentry->d_inode; + new_inode = new_dentry->d_inode; v9ses = v9fs_inode2v9ses(old_inode); oldfid = v9fs_fid_lookup(old_dentry); if (IS_ERR(oldfid)) @@ -798,9 +878,30 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, retval = p9_client_wstat(oldfid, &wstat); clunk_newdir: - if (!retval) + if (!retval) { + if (new_inode) { + if (S_ISDIR(new_inode->i_mode)) + clear_nlink(new_inode); + else + drop_nlink(new_inode); + /* + * Work around vfs rename rehash bug with + * FS_RENAME_DOES_D_MOVE + */ + v9fs_invalidate_inode_attr(new_inode); + } + if (S_ISDIR(old_inode->i_mode)) { + if (!new_inode) + inc_nlink(new_dir); + drop_nlink(old_dir); + } + v9fs_invalidate_inode_attr(old_inode); + v9fs_invalidate_inode_attr(old_dir); + v9fs_invalidate_inode_attr(new_dir); + /* successful rename */ d_move(old_dentry, new_dentry); + } up_write(&v9ses->rename_sem); p9_client_clunk(newdirfid); @@ -831,9 +932,10 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry); err = -EPERM; v9ses = v9fs_inode2v9ses(dentry->d_inode); - if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) - return simple_getattr(mnt, dentry, stat); - + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { + generic_fillattr(dentry->d_inode, stat); + return 0; + } fid = v9fs_fid_lookup(dentry); if (IS_ERR(fid)) return PTR_ERR(fid); @@ -891,17 +993,20 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr) if (iattr->ia_valid & ATTR_GID) wstat.n_gid = iattr->ia_gid; } - - retval = p9_client_wstat(fid, &wstat); - if (retval < 0) - return retval; - if ((iattr->ia_valid & ATTR_SIZE) && iattr->ia_size != i_size_read(dentry->d_inode)) { retval = vmtruncate(dentry->d_inode, iattr->ia_size); if (retval) return retval; } + /* Write all dirty data */ + if (S_ISREG(dentry->d_inode->i_mode)) + filemap_write_and_wait(dentry->d_inode->i_mapping); + + retval = p9_client_wstat(fid, &wstat); + if (retval < 0) + return retval; + v9fs_invalidate_inode_attr(dentry->d_inode); setattr_copy(dentry->d_inode, iattr); mark_inode_dirty(dentry->d_inode); @@ -924,6 +1029,7 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, char tag_name[14]; unsigned int i_nlink; struct v9fs_session_info *v9ses = sb->s_fs_info; + struct v9fs_inode *v9inode = V9FS_I(inode); inode->i_nlink = 1; @@ -983,6 +1089,7 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, /* not real number of blocks, but 512 byte ones ... */ inode->i_blocks = (i_size_read(inode) + 512 - 1) >> 9; + v9inode->cache_validity &= ~V9FS_INO_INVALID_ATTR; } /** @@ -1115,8 +1222,8 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry, int mode, const char *extension) { u32 perm; - struct v9fs_session_info *v9ses; struct p9_fid *fid; + struct v9fs_session_info *v9ses; v9ses = v9fs_inode2v9ses(dir); if (!v9fs_proto_dotu(v9ses)) { @@ -1130,6 +1237,7 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry, if (IS_ERR(fid)) return PTR_ERR(fid); + v9fs_invalidate_inode_attr(dir); p9_client_clunk(fid); return 0; } @@ -1166,8 +1274,8 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { int retval; - struct p9_fid *oldfid; char *name; + struct p9_fid *oldfid; P9_DPRINTK(P9_DEBUG_VFS, " %lu,%s,%s\n", dir->i_ino, dentry->d_name.name, @@ -1186,7 +1294,10 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir, sprintf(name, "%d\n", oldfid->fid); retval = v9fs_vfs_mkspecial(dir, dentry, P9_DMLINK, name); __putname(name); - + if (!retval) { + v9fs_refresh_inode(oldfid, old_dentry->d_inode); + v9fs_invalidate_inode_attr(dir); + } clunk_fid: p9_client_clunk(oldfid); return retval; @@ -1237,6 +1348,32 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) return retval; } +int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode) +{ + loff_t i_size; + struct p9_wstat *st; + struct v9fs_session_info *v9ses; + + v9ses = v9fs_inode2v9ses(inode); + st = p9_client_stat(fid); + if (IS_ERR(st)) + return PTR_ERR(st); + + spin_lock(&inode->i_lock); + /* + * We don't want to refresh inode->i_size, + * because we may have cached data + */ + i_size = inode->i_size; + v9fs_stat2inode(st, inode, inode->i_sb); + if (v9ses->cache) + inode->i_size = i_size; + spin_unlock(&inode->i_lock); + p9stat_free(st); + kfree(st); + return 0; +} + static const struct inode_operations v9fs_dir_inode_operations_dotu = { .create = v9fs_vfs_create, .lookup = v9fs_vfs_lookup, diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index fe3ffa9aace..67c138e94fe 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -86,40 +86,63 @@ static struct dentry *v9fs_dentry_from_dir_inode(struct inode *inode) return dentry; } +static struct inode *v9fs_qid_iget_dotl(struct super_block *sb, + struct p9_qid *qid, + struct p9_fid *fid, + struct p9_stat_dotl *st) +{ + int retval; + unsigned long i_ino; + struct inode *inode; + struct v9fs_session_info *v9ses = sb->s_fs_info; + + i_ino = v9fs_qid2ino(qid); + inode = iget_locked(sb, i_ino); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + /* + * initialize the inode with the stat info + * FIXME!! we may need support for stale inodes + * later. + */ + retval = v9fs_init_inode(v9ses, inode, st->st_mode); + if (retval) + goto error; + + v9fs_stat2inode_dotl(st, inode); +#ifdef CONFIG_9P_FSCACHE + v9fs_fscache_set_key(inode, &st->qid); + v9fs_cache_inode_get_cookie(inode); +#endif + retval = v9fs_get_acl(inode, fid); + if (retval) + goto error; + + unlock_new_inode(inode); + return inode; +error: + unlock_new_inode(inode); + iput(inode); + return ERR_PTR(retval); + +} + struct inode * -v9fs_inode_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid, - struct super_block *sb) +v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid, + struct super_block *sb) { - struct inode *ret = NULL; - int err; struct p9_stat_dotl *st; + struct inode *inode = NULL; st = p9_client_getattr_dotl(fid, P9_STATS_BASIC); if (IS_ERR(st)) return ERR_CAST(st); - ret = v9fs_get_inode(sb, st->st_mode); - if (IS_ERR(ret)) { - err = PTR_ERR(ret); - goto error; - } - - v9fs_stat2inode_dotl(st, ret); - ret->i_ino = v9fs_qid2ino(&st->qid); -#ifdef CONFIG_9P_FSCACHE - v9fs_vcookie_set_qid(ret, &st->qid); - v9fs_cache_inode_get_cookie(ret); -#endif - err = v9fs_get_acl(ret, fid); - if (err) { - iput(ret); - goto error; - } - kfree(st); - return ret; -error: + inode = v9fs_qid_iget_dotl(sb, &st->qid, fid, st); kfree(st); - return ERR_PTR(err); + return inode; } /** @@ -136,16 +159,17 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode, struct nameidata *nd) { int err = 0; - char *name = NULL; gid_t gid; int flags; mode_t mode; - struct v9fs_session_info *v9ses; - struct p9_fid *fid = NULL; - struct p9_fid *dfid, *ofid; + char *name = NULL; struct file *filp; struct p9_qid qid; struct inode *inode; + struct p9_fid *fid = NULL; + struct v9fs_inode *v9inode; + struct p9_fid *dfid, *ofid, *inode_fid; + struct v9fs_session_info *v9ses; struct posix_acl *pacl = NULL, *dacl = NULL; v9ses = v9fs_inode2v9ses(dir); @@ -196,6 +220,7 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode, err); goto error; } + v9fs_invalidate_inode_attr(dir); /* instantiate inode and assign the unopened fid to the dentry */ fid = p9_client_walk(dfid, 1, &name, 1); @@ -205,7 +230,7 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode, fid = NULL; goto error; } - inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); + inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb); if (IS_ERR(inode)) { err = PTR_ERR(inode); P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err); @@ -219,6 +244,22 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode, /* Now set the ACL based on the default value */ v9fs_set_create_acl(dentry, dacl, pacl); + v9inode = V9FS_I(inode); + if (v9ses->cache && !v9inode->writeback_fid) { + /* + * clone a fid and add it to writeback_fid + * we do it during open time instead of + * page dirty time via write_begin/page_mkwrite + * because we want write after unlink usecase + * to work. + */ + inode_fid = v9fs_writeback_fid(dentry); + if (IS_ERR(inode_fid)) { + err = PTR_ERR(inode_fid); + goto error; + } + v9inode->writeback_fid = (void *) inode_fid; + } /* Since we are opening a file, assign the open fid to the file */ filp = lookup_instantiate_filp(nd, dentry, generic_file_open); if (IS_ERR(filp)) { @@ -226,6 +267,10 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode, return PTR_ERR(filp); } filp->private_data = ofid; +#ifdef CONFIG_9P_FSCACHE + if (v9ses->cache) + v9fs_cache_inode_set_cookie(inode, filp); +#endif return 0; error: @@ -300,7 +345,7 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir, goto error; } - inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); + inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb); if (IS_ERR(inode)) { err = PTR_ERR(inode); P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", @@ -327,7 +372,8 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir, } /* Now set the ACL based on the default value */ v9fs_set_create_acl(dentry, dacl, pacl); - + inc_nlink(dir); + v9fs_invalidate_inode_attr(dir); error: if (fid) p9_client_clunk(fid); @@ -346,9 +392,10 @@ v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry, P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry); err = -EPERM; v9ses = v9fs_inode2v9ses(dentry->d_inode); - if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) - return simple_getattr(mnt, dentry, stat); - + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { + generic_fillattr(dentry->d_inode, stat); + return 0; + } fid = v9fs_fid_lookup(dentry); if (IS_ERR(fid)) return PTR_ERR(fid); @@ -406,16 +453,20 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr) if (IS_ERR(fid)) return PTR_ERR(fid); - retval = p9_client_setattr(fid, &p9attr); - if (retval < 0) - return retval; - if ((iattr->ia_valid & ATTR_SIZE) && iattr->ia_size != i_size_read(dentry->d_inode)) { retval = vmtruncate(dentry->d_inode, iattr->ia_size); if (retval) return retval; } + /* Write all dirty data */ + if (S_ISREG(dentry->d_inode->i_mode)) + filemap_write_and_wait(dentry->d_inode->i_mapping); + + retval = p9_client_setattr(fid, &p9attr); + if (retval < 0) + return retval; + v9fs_invalidate_inode_attr(dentry->d_inode); setattr_copy(dentry->d_inode, iattr); mark_inode_dirty(dentry->d_inode); @@ -439,6 +490,7 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr) void v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode) { + struct v9fs_inode *v9inode = V9FS_I(inode); if ((stat->st_result_mask & P9_STATS_BASIC) == P9_STATS_BASIC) { inode->i_atime.tv_sec = stat->st_atime_sec; @@ -497,20 +549,21 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode) /* Currently we don't support P9_STATS_BTIME and P9_STATS_DATA_VERSION * because the inode structure does not have fields for them. */ + v9inode->cache_validity &= ~V9FS_INO_INVALID_ATTR; } static int v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry, const char *symname) { - struct v9fs_session_info *v9ses; - struct p9_fid *dfid; - struct p9_fid *fid = NULL; - struct inode *inode; - struct p9_qid qid; - char *name; int err; gid_t gid; + char *name; + struct p9_qid qid; + struct inode *inode; + struct p9_fid *dfid; + struct p9_fid *fid = NULL; + struct v9fs_session_info *v9ses; name = (char *) dentry->d_name.name; P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_symlink_dotl : %lu,%s,%s\n", @@ -534,6 +587,7 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry, goto error; } + v9fs_invalidate_inode_attr(dir); if (v9ses->cache) { /* Now walk from the parent so we can get an unopened fid. */ fid = p9_client_walk(dfid, 1, &name, 1); @@ -546,7 +600,7 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry, } /* instantiate inode and assign the unopened fid to dentry */ - inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); + inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb); if (IS_ERR(inode)) { err = PTR_ERR(inode); P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", @@ -588,10 +642,10 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { int err; - struct p9_fid *dfid, *oldfid; char *name; - struct v9fs_session_info *v9ses; struct dentry *dir_dentry; + struct p9_fid *dfid, *oldfid; + struct v9fs_session_info *v9ses; P9_DPRINTK(P9_DEBUG_VFS, "dir ino: %lu, old_name: %s, new_name: %s\n", dir->i_ino, old_dentry->d_name.name, @@ -616,29 +670,17 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir, return err; } + v9fs_invalidate_inode_attr(dir); if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { /* Get the latest stat info from server. */ struct p9_fid *fid; - struct p9_stat_dotl *st; - fid = v9fs_fid_lookup(old_dentry); if (IS_ERR(fid)) return PTR_ERR(fid); - st = p9_client_getattr_dotl(fid, P9_STATS_BASIC); - if (IS_ERR(st)) - return PTR_ERR(st); - - v9fs_stat2inode_dotl(st, old_dentry->d_inode); - - kfree(st); - } else { - /* Caching disabled. No need to get upto date stat info. - * This dentry will be released immediately. So, just hold the - * inode - */ - ihold(old_dentry->d_inode); + v9fs_refresh_inode_dotl(fid, old_dentry->d_inode); } + ihold(old_dentry->d_inode); d_instantiate(dentry, old_dentry->d_inode); return err; @@ -657,12 +699,12 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode, dev_t rdev) { int err; + gid_t gid; char *name; mode_t mode; struct v9fs_session_info *v9ses; struct p9_fid *fid = NULL, *dfid = NULL; struct inode *inode; - gid_t gid; struct p9_qid qid; struct dentry *dir_dentry; struct posix_acl *dacl = NULL, *pacl = NULL; @@ -699,6 +741,7 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode, if (err < 0) goto error; + v9fs_invalidate_inode_attr(dir); /* instantiate inode and assign the unopened fid to the dentry */ if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { fid = p9_client_walk(dfid, 1, &name, 1); @@ -710,7 +753,7 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode, goto error; } - inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); + inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb); if (IS_ERR(inode)) { err = PTR_ERR(inode); P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", @@ -782,6 +825,31 @@ ndset: return NULL; } +int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode) +{ + loff_t i_size; + struct p9_stat_dotl *st; + struct v9fs_session_info *v9ses; + + v9ses = v9fs_inode2v9ses(inode); + st = p9_client_getattr_dotl(fid, P9_STATS_ALL); + if (IS_ERR(st)) + return PTR_ERR(st); + + spin_lock(&inode->i_lock); + /* + * We don't want to refresh inode->i_size, + * because we may have cached data + */ + i_size = inode->i_size; + v9fs_stat2inode_dotl(st, inode); + if (v9ses->cache) + inode->i_size = i_size; + spin_unlock(&inode->i_lock); + kfree(st); + return 0; +} + const struct inode_operations v9fs_dir_inode_operations_dotl = { .create = v9fs_vfs_create_dotl, .lookup = v9fs_vfs_lookup, diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index dbaabe3b813..09fd08d1606 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -86,12 +86,15 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses, } else sb->s_op = &v9fs_super_ops; sb->s_bdi = &v9ses->bdi; + if (v9ses->cache) + sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024)/PAGE_CACHE_SIZE; - sb->s_flags = flags | MS_ACTIVE | MS_SYNCHRONOUS | MS_DIRSYNC | - MS_NOATIME; + sb->s_flags = flags | MS_ACTIVE | MS_DIRSYNC | MS_NOATIME; + if (!v9ses->cache) + sb->s_flags |= MS_SYNCHRONOUS; #ifdef CONFIG_9P_FS_POSIX_ACL - if ((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_CLIENT) + if ((v9ses->flags & V9FS_ACL_MASK) == V9FS_POSIX_ACL) sb->s_flags |= MS_POSIXACL; #endif @@ -151,7 +154,6 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, retval = PTR_ERR(inode); goto release_sb; } - root = d_alloc_root(inode); if (!root) { iput(inode); @@ -166,7 +168,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, retval = PTR_ERR(st); goto release_sb; } - + root->d_inode->i_ino = v9fs_qid2ino(&st->qid); v9fs_stat2inode_dotl(st, root->d_inode); kfree(st); } else { @@ -183,10 +185,21 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, p9stat_free(st); kfree(st); } + v9fs_fid_add(root, fid); retval = v9fs_get_acl(inode, fid); if (retval) goto release_sb; - v9fs_fid_add(root, fid); + /* + * Add the root fid to session info. This is used + * for file system sync. We want a cloned fid here + * so that we can do a sync_filesystem after a + * shrink_dcache_for_umount + */ + v9ses->root_fid = v9fs_fid_clone(root); + if (IS_ERR(v9ses->root_fid)) { + retval = PTR_ERR(v9ses->root_fid); + goto release_sb; + } P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n"); return dget(sb->s_root); @@ -197,15 +210,11 @@ close_session: v9fs_session_close(v9ses); kfree(v9ses); return ERR_PTR(retval); - release_sb: /* - * we will do the session_close and root dentry release - * in the below call. But we need to clunk fid, because we haven't - * attached the fid to dentry so it won't get clunked - * automatically. + * we will do the session_close and root dentry + * release in the below call. */ - p9_client_clunk(fid); deactivate_locked_super(sb); return ERR_PTR(retval); } @@ -223,7 +232,7 @@ static void v9fs_kill_super(struct super_block *s) P9_DPRINTK(P9_DEBUG_VFS, " %p\n", s); kill_anon_super(s); - + p9_client_clunk(v9ses->root_fid); v9fs_session_cancel(v9ses); v9fs_session_close(v9ses); kfree(v9ses); @@ -276,11 +285,31 @@ done: return res; } +static int v9fs_sync_fs(struct super_block *sb, int wait) +{ + struct v9fs_session_info *v9ses = sb->s_fs_info; + + P9_DPRINTK(P9_DEBUG_VFS, "v9fs_sync_fs: super_block %p\n", sb); + return p9_client_sync_fs(v9ses->root_fid); +} + +static int v9fs_drop_inode(struct inode *inode) +{ + struct v9fs_session_info *v9ses; + v9ses = v9fs_inode2v9ses(inode); + if (v9ses->cache) + return generic_drop_inode(inode); + /* + * in case of non cached mode always drop the + * the inode because we want the inode attribute + * to always match that on the server. + */ + return 1; +} + static const struct super_operations v9fs_super_ops = { -#ifdef CONFIG_9P_FSCACHE .alloc_inode = v9fs_alloc_inode, .destroy_inode = v9fs_destroy_inode, -#endif .statfs = simple_statfs, .evict_inode = v9fs_evict_inode, .show_options = generic_show_options, @@ -288,11 +317,11 @@ static const struct super_operations v9fs_super_ops = { }; static const struct super_operations v9fs_super_ops_dotl = { -#ifdef CONFIG_9P_FSCACHE .alloc_inode = v9fs_alloc_inode, .destroy_inode = v9fs_destroy_inode, -#endif + .sync_fs = v9fs_sync_fs, .statfs = v9fs_statfs, + .drop_inode = v9fs_drop_inode, .evict_inode = v9fs_evict_inode, .show_options = generic_show_options, .umount_begin = v9fs_umount_begin, @@ -303,5 +332,5 @@ struct file_system_type v9fs_fs_type = { .mount = v9fs_mount, .kill_sb = v9fs_kill_super, .owner = THIS_MODULE, - .fs_flags = FS_RENAME_DOES_D_MOVE, + .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT, }; diff --git a/fs/Kconfig b/fs/Kconfig index 3db9caa57ed..f3aa9b08b22 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -47,7 +47,7 @@ config FS_POSIX_ACL def_bool n config EXPORTFS - tristate + bool config FILE_LOCKING bool "Enable POSIX file locking API" if EXPERT @@ -187,6 +187,7 @@ source "fs/omfs/Kconfig" source "fs/hpfs/Kconfig" source "fs/qnx4/Kconfig" source "fs/romfs/Kconfig" +source "fs/pstore/Kconfig" source "fs/sysv/Kconfig" source "fs/ufs/Kconfig" source "fs/exofs/Kconfig" diff --git a/fs/Makefile b/fs/Makefile index a7f7cef0c0c..fb68c2b8cf8 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -48,6 +48,8 @@ obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o xattr_acl.o obj-$(CONFIG_NFS_COMMON) += nfs_common/ obj-$(CONFIG_GENERIC_ACL) += generic_acl.o +obj-$(CONFIG_FHANDLE) += fhandle.o + obj-y += quota/ obj-$(CONFIG_PROC_FS) += proc/ @@ -121,3 +123,4 @@ obj-$(CONFIG_BTRFS_FS) += btrfs/ obj-$(CONFIG_GFS2_FS) += gfs2/ obj-$(CONFIG_EXOFS_FS) += exofs/ obj-$(CONFIG_CEPH_FS) += ceph/ +obj-$(CONFIG_PSTORE) += pstore/ diff --git a/fs/adfs/Kconfig b/fs/adfs/Kconfig index 1dd5f34b3cf..e55182a7460 100644 --- a/fs/adfs/Kconfig +++ b/fs/adfs/Kconfig @@ -1,7 +1,6 @@ config ADFS_FS tristate "ADFS file system support (EXPERIMENTAL)" depends on BLOCK && EXPERIMENTAL - depends on BKL # need to fix help The Acorn Disc Filing System is the standard file system of the RiscOS operating system which runs on Acorn's ARM-based Risc PC diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c index 3b4a764ed78..3d83075aaa2 100644 --- a/fs/adfs/dir.c +++ b/fs/adfs/dir.c @@ -9,7 +9,6 @@ * * Common directory handling for ADFS */ -#include <linux/smp_lock.h> #include "adfs.h" /* @@ -27,8 +26,6 @@ adfs_readdir(struct file *filp, void *dirent, filldir_t filldir) struct adfs_dir dir; int ret = 0; - lock_kernel(); - if (filp->f_pos >> 32) goto out; @@ -70,7 +67,6 @@ free_out: ops->free(&dir); out: - unlock_kernel(); return ret; } @@ -276,7 +272,6 @@ adfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) struct object_info obj; int error; - lock_kernel(); error = adfs_dir_lookup_byname(dir, &dentry->d_name, &obj); if (error == 0) { error = -EACCES; @@ -288,7 +283,6 @@ adfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) if (inode) error = 0; } - unlock_kernel(); d_add(dentry, inode); return ERR_PTR(error); } diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c index 65794b8fe79..09fe40198d1 100644 --- a/fs/adfs/inode.c +++ b/fs/adfs/inode.c @@ -7,7 +7,6 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ -#include <linux/smp_lock.h> #include <linux/buffer_head.h> #include <linux/writeback.h> #include "adfs.h" @@ -316,8 +315,6 @@ adfs_notify_change(struct dentry *dentry, struct iattr *attr) unsigned int ia_valid = attr->ia_valid; int error; - lock_kernel(); - error = inode_change_ok(inode, attr); /* @@ -359,7 +356,6 @@ adfs_notify_change(struct dentry *dentry, struct iattr *attr) if (ia_valid & (ATTR_SIZE | ATTR_MTIME | ATTR_MODE)) mark_inode_dirty(inode); out: - unlock_kernel(); return error; } @@ -374,7 +370,6 @@ int adfs_write_inode(struct inode *inode, struct writeback_control *wbc) struct object_info obj; int ret; - lock_kernel(); obj.file_id = inode->i_ino; obj.name_len = 0; obj.parent_id = ADFS_I(inode)->parent_id; @@ -384,6 +379,5 @@ int adfs_write_inode(struct inode *inode, struct writeback_control *wbc) obj.size = inode->i_size; ret = adfs_dir_update(sb, &obj, wbc->sync_mode == WB_SYNC_ALL); - unlock_kernel(); return ret; } diff --git a/fs/adfs/super.c b/fs/adfs/super.c index 2d7954049fb..06d7388b477 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -14,7 +14,6 @@ #include <linux/mount.h> #include <linux/seq_file.h> #include <linux/slab.h> -#include <linux/smp_lock.h> #include <linux/statfs.h> #include "adfs.h" #include "dir_f.h" @@ -120,15 +119,11 @@ static void adfs_put_super(struct super_block *sb) int i; struct adfs_sb_info *asb = ADFS_SB(sb); - lock_kernel(); - for (i = 0; i < asb->s_map_size; i++) brelse(asb->s_map[i].dm_bh); kfree(asb->s_map); kfree(asb); sb->s_fs_info = NULL; - - unlock_kernel(); } static int adfs_show_options(struct seq_file *seq, struct vfsmount *mnt) @@ -359,15 +354,11 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent) struct adfs_sb_info *asb; struct inode *root; - lock_kernel(); - sb->s_flags |= MS_NODIRATIME; asb = kzalloc(sizeof(*asb), GFP_KERNEL); - if (!asb) { - unlock_kernel(); + if (!asb) return -ENOMEM; - } sb->s_fs_info = asb; /* set default options */ @@ -485,7 +476,6 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent) adfs_error(sb, "get root inode failed\n"); goto error; } - unlock_kernel(); return 0; error_free_bh: @@ -493,7 +483,6 @@ error_free_bh: error: sb->s_fs_info = NULL; kfree(asb); - unlock_kernel(); return -EINVAL; } @@ -85,7 +85,7 @@ static int __init aio_setup(void) kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC); kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC); - aio_wq = create_workqueue("aio"); + aio_wq = alloc_workqueue("aio", 0, 1); /* used to limit concurrency */ abe_pool = mempool_create_kmalloc_pool(1, sizeof(struct aio_batch_entry)); BUG_ON(!aio_wq || !abe_pool); @@ -577,7 +577,7 @@ static int __aio_put_req(struct kioctx *ctx, struct kiocb *req) spin_lock(&fput_lock); list_add(&req->ki_list, &fput_head); spin_unlock(&fput_lock); - queue_work(aio_wq, &fput_work); + schedule_work(&fput_work); } else { req->ki_filp = NULL; really_put_req(ctx, req); diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 014e7aba3b0..e6f84d26f4c 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -36,7 +36,7 @@ static long autofs4_root_compat_ioctl(struct file *,unsigned int,unsigned long); static int autofs4_dir_open(struct inode *inode, struct file *file); static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *); static struct vfsmount *autofs4_d_automount(struct path *); -static int autofs4_d_manage(struct dentry *, bool, bool); +static int autofs4_d_manage(struct dentry *, bool); static void autofs4_dentry_release(struct dentry *); const struct file_operations autofs4_root_operations = { @@ -446,7 +446,7 @@ done: return NULL; } -int autofs4_d_manage(struct dentry *dentry, bool mounting_here, bool rcu_walk) +int autofs4_d_manage(struct dentry *dentry, bool rcu_walk) { struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); @@ -454,7 +454,7 @@ int autofs4_d_manage(struct dentry *dentry, bool mounting_here, bool rcu_walk) dentry, dentry->d_name.len, dentry->d_name.name); /* The daemon never waits. */ - if (autofs4_oz_mode(sbi) || mounting_here) { + if (autofs4_oz_mode(sbi)) { if (!d_mountpoint(dentry)) return -EISDIR; return 0; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 6f820fa23df..7f78cc78fdd 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -729,6 +729,15 @@ struct btrfs_space_info { u64 disk_total; /* total bytes on disk, takes mirrors into account */ + /* + * we bump reservation progress every time we decrement + * bytes_reserved. This way people waiting for reservations + * know something good has happened and they can check + * for progress. The number here isn't to be trusted, it + * just shows reclaim activity + */ + unsigned long reservation_progress; + int full; /* indicates that we cannot allocate any more chunks for this space */ int force_alloc; /* set if we need to force a chunk alloc for diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e1aa8d607bc..100b07f021b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2493,7 +2493,7 @@ int close_ctree(struct btrfs_root *root) * ERROR state on disk. * * 2. when btrfs flips readonly just in btrfs_commit_super, - * and in such case, btrfs cannnot write sb via btrfs_commit_super, + * and in such case, btrfs cannot write sb via btrfs_commit_super, * and since fs_state has been set BTRFS_SUPER_FLAG_ERROR flag, * btrfs will cleanup all FS resources first and write sb then. */ diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index ff27d7a477b..b4ffad859ad 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -21,9 +21,13 @@ static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len, int len = *max_len; int type; - if ((len < BTRFS_FID_SIZE_NON_CONNECTABLE) || - (connectable && len < BTRFS_FID_SIZE_CONNECTABLE)) + if (connectable && (len < BTRFS_FID_SIZE_CONNECTABLE)) { + *max_len = BTRFS_FID_SIZE_CONNECTABLE; return 255; + } else if (len < BTRFS_FID_SIZE_NON_CONNECTABLE) { + *max_len = BTRFS_FID_SIZE_NON_CONNECTABLE; + return 255; + } len = BTRFS_FID_SIZE_NON_CONNECTABLE; type = FILEID_BTRFS_WITHOUT_PARENT; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 588ff984987..7b3089b5c2d 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3342,15 +3342,16 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, u64 max_reclaim; u64 reclaimed = 0; long time_left; - int pause = 1; int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; int loops = 0; + unsigned long progress; block_rsv = &root->fs_info->delalloc_block_rsv; space_info = block_rsv->space_info; smp_mb(); reserved = space_info->bytes_reserved; + progress = space_info->reservation_progress; if (reserved == 0) return 0; @@ -3365,31 +3366,36 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages); spin_lock(&space_info->lock); - if (reserved > space_info->bytes_reserved) { - loops = 0; + if (reserved > space_info->bytes_reserved) reclaimed += reserved - space_info->bytes_reserved; - } else { - loops++; - } reserved = space_info->bytes_reserved; spin_unlock(&space_info->lock); + loops++; + if (reserved == 0 || reclaimed >= max_reclaim) break; if (trans && trans->transaction->blocked) return -EAGAIN; - __set_current_state(TASK_INTERRUPTIBLE); - time_left = schedule_timeout(pause); + time_left = schedule_timeout_interruptible(1); /* We were interrupted, exit */ if (time_left) break; - pause <<= 1; - if (pause > HZ / 10) - pause = HZ / 10; + /* we've kicked the IO a few times, if anything has been freed, + * exit. There is no sense in looping here for a long time + * when we really need to commit the transaction, or there are + * just too many writers without enough free space + */ + + if (loops > 3) { + smp_mb(); + if (progress != space_info->reservation_progress) + break; + } } return reclaimed >= to_reclaim; @@ -3612,6 +3618,7 @@ void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv, if (num_bytes) { spin_lock(&space_info->lock); space_info->bytes_reserved -= num_bytes; + space_info->reservation_progress++; spin_unlock(&space_info->lock); } } @@ -3844,6 +3851,7 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info) if (block_rsv->reserved >= block_rsv->size) { num_bytes = block_rsv->reserved - block_rsv->size; sinfo->bytes_reserved -= num_bytes; + sinfo->reservation_progress++; block_rsv->reserved = block_rsv->size; block_rsv->full = 1; } @@ -4005,7 +4013,6 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) to_reserve = 0; } spin_unlock(&BTRFS_I(inode)->accounting_lock); - to_reserve += calc_csum_metadata_size(inode, num_bytes); ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1); if (ret) @@ -4133,6 +4140,7 @@ static int update_block_group(struct btrfs_trans_handle *trans, btrfs_set_block_group_used(&cache->item, old_val); cache->reserved -= num_bytes; cache->space_info->bytes_reserved -= num_bytes; + cache->space_info->reservation_progress++; cache->space_info->bytes_used += num_bytes; cache->space_info->disk_used += num_bytes * factor; spin_unlock(&cache->lock); @@ -4184,6 +4192,7 @@ static int pin_down_extent(struct btrfs_root *root, if (reserved) { cache->reserved -= num_bytes; cache->space_info->bytes_reserved -= num_bytes; + cache->space_info->reservation_progress++; } spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); @@ -4234,6 +4243,7 @@ static int update_reserved_bytes(struct btrfs_block_group_cache *cache, space_info->bytes_readonly += num_bytes; cache->reserved -= num_bytes; space_info->bytes_reserved -= num_bytes; + space_info->reservation_progress++; } spin_unlock(&cache->lock); spin_unlock(&space_info->lock); @@ -4712,6 +4722,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, if (ret) { spin_lock(&cache->space_info->lock); cache->space_info->bytes_reserved -= buf->len; + cache->space_info->reservation_progress++; spin_unlock(&cache->space_info->lock); } goto out; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index fd3f172e94e..714adc4ac4c 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3046,17 +3046,38 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, } while (!end) { - off = extent_map_end(em); - if (off >= max) - end = 1; + u64 offset_in_extent; + + /* break if the extent we found is outside the range */ + if (em->start >= max || extent_map_end(em) < off) + break; + + /* + * get_extent may return an extent that starts before our + * requested range. We have to make sure the ranges + * we return to fiemap always move forward and don't + * overlap, so adjust the offsets here + */ + em_start = max(em->start, off); - em_start = em->start; - em_len = em->len; + /* + * record the offset from the start of the extent + * for adjusting the disk offset below + */ + offset_in_extent = em_start - em->start; em_end = extent_map_end(em); + em_len = em_end - em_start; emflags = em->flags; disko = 0; flags = 0; + /* + * bump off for our next call to get_extent + */ + off = extent_map_end(em); + if (off >= max) + end = 1; + if (em->block_start == EXTENT_MAP_LAST_BYTE) { end = 1; flags |= FIEMAP_EXTENT_LAST; @@ -3067,7 +3088,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, flags |= (FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN); } else { - disko = em->block_start; + disko = em->block_start + offset_in_extent; } if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) flags |= FIEMAP_EXTENT_ENCODED; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 7084140d594..f447b783bb8 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -70,6 +70,19 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, /* Flush processor's dcache for this page */ flush_dcache_page(page); + + /* + * if we get a partial write, we can end up with + * partially up to date pages. These add + * a lot of complexity, so make sure they don't + * happen by forcing this copy to be retried. + * + * The rest of the btrfs_file_write code will fall + * back to page at a time copies after we return 0. + */ + if (!PageUptodate(page) && copied < count) + copied = 0; + iov_iter_advance(i, copied); write_bytes -= copied; total_copied += copied; @@ -763,6 +776,27 @@ out: } /* + * on error we return an unlocked page and the error value + * on success we return a locked page and 0 + */ +static int prepare_uptodate_page(struct page *page, u64 pos) +{ + int ret = 0; + + if ((pos & (PAGE_CACHE_SIZE - 1)) && !PageUptodate(page)) { + ret = btrfs_readpage(NULL, page); + if (ret) + return ret; + lock_page(page); + if (!PageUptodate(page)) { + unlock_page(page); + return -EIO; + } + } + return 0; +} + +/* * this gets pages into the page cache and locks them down, it also properly * waits for data=ordered extents to finish before allowing the pages to be * modified. @@ -777,6 +811,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file, unsigned long index = pos >> PAGE_CACHE_SHIFT; struct inode *inode = fdentry(file)->d_inode; int err = 0; + int faili = 0; u64 start_pos; u64 last_pos; @@ -794,15 +829,24 @@ again: for (i = 0; i < num_pages; i++) { pages[i] = grab_cache_page(inode->i_mapping, index + i); if (!pages[i]) { - int c; - for (c = i - 1; c >= 0; c--) { - unlock_page(pages[c]); - page_cache_release(pages[c]); - } - return -ENOMEM; + faili = i - 1; + err = -ENOMEM; + goto fail; + } + + if (i == 0) + err = prepare_uptodate_page(pages[i], pos); + if (i == num_pages - 1) + err = prepare_uptodate_page(pages[i], + pos + write_bytes); + if (err) { + page_cache_release(pages[i]); + faili = i - 1; + goto fail; } wait_on_page_writeback(pages[i]); } + err = 0; if (start_pos < inode->i_size) { struct btrfs_ordered_extent *ordered; lock_extent_bits(&BTRFS_I(inode)->io_tree, @@ -842,6 +886,14 @@ again: WARN_ON(!PageLocked(pages[i])); } return 0; +fail: + while (faili >= 0) { + unlock_page(pages[faili]); + page_cache_release(pages[faili]); + faili--; + } + return err; + } static ssize_t btrfs_file_aio_write(struct kiocb *iocb, @@ -851,7 +903,6 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, struct file *file = iocb->ki_filp; struct inode *inode = fdentry(file)->d_inode; struct btrfs_root *root = BTRFS_I(inode)->root; - struct page *pinned[2]; struct page **pages = NULL; struct iov_iter i; loff_t *ppos = &iocb->ki_pos; @@ -872,9 +923,6 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) || (file->f_flags & O_DIRECT)); - pinned[0] = NULL; - pinned[1] = NULL; - start_pos = pos; vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); @@ -962,32 +1010,6 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, first_index = pos >> PAGE_CACHE_SHIFT; last_index = (pos + iov_iter_count(&i)) >> PAGE_CACHE_SHIFT; - /* - * there are lots of better ways to do this, but this code - * makes sure the first and last page in the file range are - * up to date and ready for cow - */ - if ((pos & (PAGE_CACHE_SIZE - 1))) { - pinned[0] = grab_cache_page(inode->i_mapping, first_index); - if (!PageUptodate(pinned[0])) { - ret = btrfs_readpage(NULL, pinned[0]); - BUG_ON(ret); - wait_on_page_locked(pinned[0]); - } else { - unlock_page(pinned[0]); - } - } - if ((pos + iov_iter_count(&i)) & (PAGE_CACHE_SIZE - 1)) { - pinned[1] = grab_cache_page(inode->i_mapping, last_index); - if (!PageUptodate(pinned[1])) { - ret = btrfs_readpage(NULL, pinned[1]); - BUG_ON(ret); - wait_on_page_locked(pinned[1]); - } else { - unlock_page(pinned[1]); - } - } - while (iov_iter_count(&i) > 0) { size_t offset = pos & (PAGE_CACHE_SIZE - 1); size_t write_bytes = min(iov_iter_count(&i), @@ -1024,8 +1046,20 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, copied = btrfs_copy_from_user(pos, num_pages, write_bytes, pages, &i); - dirty_pages = (copied + offset + PAGE_CACHE_SIZE - 1) >> - PAGE_CACHE_SHIFT; + + /* + * if we have trouble faulting in the pages, fall + * back to one page at a time + */ + if (copied < write_bytes) + nrptrs = 1; + + if (copied == 0) + dirty_pages = 0; + else + dirty_pages = (copied + offset + + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; if (num_pages > dirty_pages) { if (copied > 0) @@ -1069,10 +1103,6 @@ out: err = ret; kfree(pages); - if (pinned[0]) - page_cache_release(pinned[0]); - if (pinned[1]) - page_cache_release(pinned[1]); *ppos = pos; /* diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 0efdb65953c..512c3d1da08 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -90,13 +90,14 @@ static noinline int cow_file_range(struct inode *inode, unsigned long *nr_written, int unlock); static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, - struct inode *inode, struct inode *dir) + struct inode *inode, struct inode *dir, + const struct qstr *qstr) { int err; err = btrfs_init_acl(trans, inode, dir); if (!err) - err = btrfs_xattr_security_init(trans, inode, dir); + err = btrfs_xattr_security_init(trans, inode, dir, qstr); return err; } @@ -4704,7 +4705,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, if (IS_ERR(inode)) goto out_unlock; - err = btrfs_init_inode_security(trans, inode, dir); + err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); if (err) { drop_inode = 1; goto out_unlock; @@ -4765,7 +4766,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, if (IS_ERR(inode)) goto out_unlock; - err = btrfs_init_inode_security(trans, inode, dir); + err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); if (err) { drop_inode = 1; goto out_unlock; @@ -4806,9 +4807,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, int err; int drop_inode = 0; - if (inode->i_nlink == 0) - return -ENOENT; - /* do not allow sys_link's with other subvols of the same device */ if (root->objectid != BTRFS_I(inode)->root->objectid) return -EPERM; @@ -4821,10 +4819,11 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, goto fail; /* - * 1 item for inode ref + * 2 items for inode and inode ref * 2 items for dir items + * 1 item for parent inode */ - trans = btrfs_start_transaction(root, 3); + trans = btrfs_start_transaction(root, 5); if (IS_ERR(trans)) { err = PTR_ERR(trans); goto fail; @@ -4893,7 +4892,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) drop_on_err = 1; - err = btrfs_init_inode_security(trans, inode, dir); + err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); if (err) goto out_fail; @@ -6056,6 +6055,7 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode, if (!skip_sum) { dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS); if (!dip->csums) { + kfree(dip); ret = -ENOMEM; goto free_ordered; } @@ -7104,7 +7104,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, if (IS_ERR(inode)) goto out_unlock; - err = btrfs_init_inode_security(trans, inode, dir); + err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); if (err) { drop_inode = 1; goto out_unlock; diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index a5776531dc2..d779cefcfd7 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -370,7 +370,8 @@ int btrfs_removexattr(struct dentry *dentry, const char *name) } int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, - struct inode *inode, struct inode *dir) + struct inode *inode, struct inode *dir, + const struct qstr *qstr) { int err; size_t len; @@ -378,7 +379,8 @@ int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, char *suffix; char *name; - err = security_inode_init_security(inode, dir, &suffix, &value, &len); + err = security_inode_init_security(inode, dir, qstr, &suffix, &value, + &len); if (err) { if (err == -EOPNOTSUPP) return 0; diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h index 7a43fd640bb..b3cc8039134 100644 --- a/fs/btrfs/xattr.h +++ b/fs/btrfs/xattr.h @@ -37,6 +37,7 @@ extern int btrfs_setxattr(struct dentry *dentry, const char *name, extern int btrfs_removexattr(struct dentry *dentry, const char *name); extern int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, - struct inode *inode, struct inode *dir); + struct inode *inode, struct inode *dir, + const struct qstr *qstr); #endif /* __XATTR__ */ diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 42c7fafc8bf..a0358c2189c 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -275,6 +275,7 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache, bool preemptive) { struct dentry *grave, *trap; + struct path path, path_to_graveyard; char nbuffer[8 + 8 + 1]; int ret; @@ -287,10 +288,18 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache, /* non-directories can just be unlinked */ if (!S_ISDIR(rep->d_inode->i_mode)) { _debug("unlink stale object"); - ret = vfs_unlink(dir->d_inode, rep); - if (preemptive) - cachefiles_mark_object_buried(cache, rep); + path.mnt = cache->mnt; + path.dentry = dir; + ret = security_path_unlink(&path, rep); + if (ret < 0) { + cachefiles_io_error(cache, "Unlink security error"); + } else { + ret = vfs_unlink(dir->d_inode, rep); + + if (preemptive) + cachefiles_mark_object_buried(cache, rep); + } mutex_unlock(&dir->d_inode->i_mutex); @@ -379,12 +388,23 @@ try_again: } /* attempt the rename */ - ret = vfs_rename(dir->d_inode, rep, cache->graveyard->d_inode, grave); - if (ret != 0 && ret != -ENOMEM) - cachefiles_io_error(cache, "Rename failed with error %d", ret); + path.mnt = cache->mnt; + path.dentry = dir; + path_to_graveyard.mnt = cache->mnt; + path_to_graveyard.dentry = cache->graveyard; + ret = security_path_rename(&path, rep, &path_to_graveyard, grave); + if (ret < 0) { + cachefiles_io_error(cache, "Rename security error %d", ret); + } else { + ret = vfs_rename(dir->d_inode, rep, + cache->graveyard->d_inode, grave); + if (ret != 0 && ret != -ENOMEM) + cachefiles_io_error(cache, + "Rename failed with error %d", ret); - if (preemptive) - cachefiles_mark_object_buried(cache, rep); + if (preemptive) + cachefiles_mark_object_buried(cache, rep); + } unlock_rename(cache->graveyard, dir); dput(grave); @@ -448,6 +468,7 @@ int cachefiles_walk_to_object(struct cachefiles_object *parent, { struct cachefiles_cache *cache; struct dentry *dir, *next = NULL; + struct path path; unsigned long start; const char *name; int ret, nlen; @@ -458,6 +479,7 @@ int cachefiles_walk_to_object(struct cachefiles_object *parent, cache = container_of(parent->fscache.cache, struct cachefiles_cache, cache); + path.mnt = cache->mnt; ASSERT(parent->dentry); ASSERT(parent->dentry->d_inode); @@ -511,6 +533,10 @@ lookup_again: if (ret < 0) goto create_error; + path.dentry = dir; + ret = security_path_mkdir(&path, next, 0); + if (ret < 0) + goto create_error; start = jiffies; ret = vfs_mkdir(dir->d_inode, next, 0); cachefiles_hist(cachefiles_mkdir_histogram, start); @@ -536,6 +562,10 @@ lookup_again: if (ret < 0) goto create_error; + path.dentry = dir; + ret = security_path_mknod(&path, next, S_IFREG, 0); + if (ret < 0) + goto create_error; start = jiffies; ret = vfs_create(dir->d_inode, next, S_IFREG, NULL); cachefiles_hist(cachefiles_create_histogram, start); @@ -692,6 +722,7 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, { struct dentry *subdir; unsigned long start; + struct path path; int ret; _enter(",,%s", dirname); @@ -719,6 +750,11 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, _debug("attempt mkdir"); + path.mnt = cache->mnt; + path.dentry = dir; + ret = security_path_mkdir(&path, subdir, 0700); + if (ret < 0) + goto mkdir_error; ret = vfs_mkdir(dir->d_inode, subdir, 0700); if (ret < 0) goto mkdir_error; diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 099a58615b9..ebafa65a29b 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -993,7 +993,7 @@ static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd) { struct inode *dir; - if (nd->flags & LOOKUP_RCU) + if (nd && nd->flags & LOOKUP_RCU) return -ECHILD; dir = dentry->d_parent->d_inode; diff --git a/fs/compat.c b/fs/compat.c index f6fd0a00e6c..c6d31a3bab8 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -262,35 +262,19 @@ static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs * */ asmlinkage long compat_sys_statfs(const char __user *pathname, struct compat_statfs __user *buf) { - struct path path; - int error; - - error = user_path(pathname, &path); - if (!error) { - struct kstatfs tmp; - error = vfs_statfs(&path, &tmp); - if (!error) - error = put_compat_statfs(buf, &tmp); - path_put(&path); - } + struct kstatfs tmp; + int error = user_statfs(pathname, &tmp); + if (!error) + error = put_compat_statfs(buf, &tmp); return error; } asmlinkage long compat_sys_fstatfs(unsigned int fd, struct compat_statfs __user *buf) { - struct file * file; struct kstatfs tmp; - int error; - - error = -EBADF; - file = fget(fd); - if (!file) - goto out; - error = vfs_statfs(&file->f_path, &tmp); + int error = fd_statfs(fd, &tmp); if (!error) error = put_compat_statfs(buf, &tmp); - fput(file); -out: return error; } @@ -329,41 +313,29 @@ static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstat asmlinkage long compat_sys_statfs64(const char __user *pathname, compat_size_t sz, struct compat_statfs64 __user *buf) { - struct path path; + struct kstatfs tmp; int error; if (sz != sizeof(*buf)) return -EINVAL; - error = user_path(pathname, &path); - if (!error) { - struct kstatfs tmp; - error = vfs_statfs(&path, &tmp); - if (!error) - error = put_compat_statfs64(buf, &tmp); - path_put(&path); - } + error = user_statfs(pathname, &tmp); + if (!error) + error = put_compat_statfs64(buf, &tmp); return error; } asmlinkage long compat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct compat_statfs64 __user *buf) { - struct file * file; struct kstatfs tmp; int error; if (sz != sizeof(*buf)) return -EINVAL; - error = -EBADF; - file = fget(fd); - if (!file) - goto out; - error = vfs_statfs(&file->f_path, &tmp); + error = fd_statfs(fd, &tmp); if (!error) error = put_compat_statfs64(buf, &tmp); - fput(file); -out: return error; } @@ -1228,7 +1200,9 @@ compat_sys_preadv(unsigned long fd, const struct compat_iovec __user *vec, file = fget_light(fd, &fput_needed); if (!file) return -EBADF; - ret = compat_readv(file, vec, vlen, &pos); + ret = -ESPIPE; + if (file->f_mode & FMODE_PREAD) + ret = compat_readv(file, vec, vlen, &pos); fput_light(file, fput_needed); return ret; } @@ -1285,7 +1259,9 @@ compat_sys_pwritev(unsigned long fd, const struct compat_iovec __user *vec, file = fget_light(fd, &fput_needed); if (!file) return -EBADF; - ret = compat_writev(file, vec, vlen, &pos); + ret = -ESPIPE; + if (file->f_mode & FMODE_PWRITE) + ret = compat_writev(file, vec, vlen, &pos); fput_light(file, fput_needed); return ret; } @@ -2308,3 +2284,16 @@ asmlinkage long compat_sys_timerfd_gettime(int ufd, } #endif /* CONFIG_TIMERFD */ + +#ifdef CONFIG_FHANDLE +/* + * Exactly like fs/open.c:sys_open_by_handle_at(), except that it + * doesn't set the O_LARGEFILE flag. + */ +asmlinkage long +compat_sys_open_by_handle_at(int mountdirfd, + struct file_handle __user *handle, int flags) +{ + return do_handle_open(mountdirfd, handle, flags); +} +#endif diff --git a/fs/dcache.c b/fs/dcache.c index 2a6bd9a4ae9..ad25c4cec7d 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -296,8 +296,12 @@ static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent) __releases(parent->d_lock) __releases(dentry->d_inode->i_lock) { - dentry->d_parent = NULL; list_del(&dentry->d_u.d_child); + /* + * Inform try_to_ascend() that we are no longer attached to the + * dentry tree + */ + dentry->d_flags |= DCACHE_DISCONNECTED; if (parent) spin_unlock(&parent->d_lock); dentry_iput(dentry); @@ -1012,6 +1016,35 @@ void shrink_dcache_for_umount(struct super_block *sb) } /* + * This tries to ascend one level of parenthood, but + * we can race with renaming, so we need to re-check + * the parenthood after dropping the lock and check + * that the sequence number still matches. + */ +static struct dentry *try_to_ascend(struct dentry *old, int locked, unsigned seq) +{ + struct dentry *new = old->d_parent; + + rcu_read_lock(); + spin_unlock(&old->d_lock); + spin_lock(&new->d_lock); + + /* + * might go back up the wrong parent if we have had a rename + * or deletion + */ + if (new != old->d_parent || + (old->d_flags & DCACHE_DISCONNECTED) || + (!locked && read_seqretry(&rename_lock, seq))) { + spin_unlock(&new->d_lock); + new = NULL; + } + rcu_read_unlock(); + return new; +} + + +/* * Search for at least 1 mount point in the dentry's subdirs. * We descend to the next level whenever the d_subdirs * list is non-empty and continue searching. @@ -1066,24 +1099,10 @@ resume: * All done at this level ... ascend and resume the search. */ if (this_parent != parent) { - struct dentry *tmp; - struct dentry *child; - - tmp = this_parent->d_parent; - rcu_read_lock(); - spin_unlock(&this_parent->d_lock); - child = this_parent; - this_parent = tmp; - spin_lock(&this_parent->d_lock); - /* might go back up the wrong parent if we have had a rename - * or deletion */ - if (this_parent != child->d_parent || - (!locked && read_seqretry(&rename_lock, seq))) { - spin_unlock(&this_parent->d_lock); - rcu_read_unlock(); + struct dentry *child = this_parent; + this_parent = try_to_ascend(this_parent, locked, seq); + if (!this_parent) goto rename_retry; - } - rcu_read_unlock(); next = child->d_u.d_child.next; goto resume; } @@ -1181,24 +1200,10 @@ resume: * All done at this level ... ascend and resume the search. */ if (this_parent != parent) { - struct dentry *tmp; - struct dentry *child; - - tmp = this_parent->d_parent; - rcu_read_lock(); - spin_unlock(&this_parent->d_lock); - child = this_parent; - this_parent = tmp; - spin_lock(&this_parent->d_lock); - /* might go back up the wrong parent if we have had a rename - * or deletion */ - if (this_parent != child->d_parent || - (!locked && read_seqretry(&rename_lock, seq))) { - spin_unlock(&this_parent->d_lock); - rcu_read_unlock(); + struct dentry *child = this_parent; + this_parent = try_to_ascend(this_parent, locked, seq); + if (!this_parent) goto rename_retry; - } - rcu_read_unlock(); next = child->d_u.d_child.next; goto resume; } @@ -1523,6 +1528,28 @@ struct dentry * d_alloc_root(struct inode * root_inode) } EXPORT_SYMBOL(d_alloc_root); +static struct dentry * __d_find_any_alias(struct inode *inode) +{ + struct dentry *alias; + + if (list_empty(&inode->i_dentry)) + return NULL; + alias = list_first_entry(&inode->i_dentry, struct dentry, d_alias); + __dget(alias); + return alias; +} + +static struct dentry * d_find_any_alias(struct inode *inode) +{ + struct dentry *de; + + spin_lock(&inode->i_lock); + de = __d_find_any_alias(inode); + spin_unlock(&inode->i_lock); + return de; +} + + /** * d_obtain_alias - find or allocate a dentry for a given inode * @inode: inode to allocate the dentry for @@ -1552,7 +1579,7 @@ struct dentry *d_obtain_alias(struct inode *inode) if (IS_ERR(inode)) return ERR_CAST(inode); - res = d_find_alias(inode); + res = d_find_any_alias(inode); if (res) goto out_iput; @@ -1565,7 +1592,7 @@ struct dentry *d_obtain_alias(struct inode *inode) spin_lock(&inode->i_lock); - res = __d_find_alias(inode, 0); + res = __d_find_any_alias(inode); if (res) { spin_unlock(&inode->i_lock); dput(tmp); @@ -1585,10 +1612,13 @@ struct dentry *d_obtain_alias(struct inode *inode) __bit_spin_unlock(0, (unsigned long *)&tmp->d_sb->s_anon.first); spin_unlock(&tmp->d_lock); spin_unlock(&inode->i_lock); + security_d_instantiate(tmp, inode); return tmp; out_iput: + if (res && !IS_ERR(res)) + security_d_instantiate(res, inode); iput(inode); return res; } @@ -1781,7 +1811,7 @@ struct dentry *__d_lookup_rcu(struct dentry *parent, struct qstr *name, * false-negative result. d_lookup() protects against concurrent * renames using rename_lock seqlock. * - * See Documentation/vfs/dcache-locking.txt for more details. + * See Documentation/filesystems/path-lookup.txt for more details. */ hlist_bl_for_each_entry_rcu(dentry, node, &b->head, d_hash) { struct inode *i; @@ -1901,7 +1931,7 @@ struct dentry *__d_lookup(struct dentry *parent, struct qstr *name) * false-negative result. d_lookup() protects against concurrent * renames using rename_lock seqlock. * - * See Documentation/vfs/dcache-locking.txt for more details. + * See Documentation/filesystems/path-lookup.txt for more details. */ rcu_read_lock(); @@ -2920,28 +2950,14 @@ resume: spin_unlock(&dentry->d_lock); } if (this_parent != root) { - struct dentry *tmp; - struct dentry *child; - - tmp = this_parent->d_parent; + struct dentry *child = this_parent; if (!(this_parent->d_flags & DCACHE_GENOCIDE)) { this_parent->d_flags |= DCACHE_GENOCIDE; this_parent->d_count--; } - rcu_read_lock(); - spin_unlock(&this_parent->d_lock); - child = this_parent; - this_parent = tmp; - spin_lock(&this_parent->d_lock); - /* might go back up the wrong parent if we have had a rename - * or deletion */ - if (this_parent != child->d_parent || - (!locked && read_seqretry(&rename_lock, seq))) { - spin_unlock(&this_parent->d_lock); - rcu_read_unlock(); + this_parent = try_to_ascend(this_parent, locked, seq); + if (!this_parent) goto rename_retry; - } - rcu_read_unlock(); next = child->d_u.d_child.next; goto resume; } diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 37a8ca7c122..e7a7a2f0732 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -13,9 +13,6 @@ * */ -/* uncomment to get debug messages from the debug filesystem, ah the irony. */ -/* #define DEBUG */ - #include <linux/module.h> #include <linux/fs.h> #include <linux/mount.h> @@ -310,7 +307,7 @@ struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent, } EXPORT_SYMBOL_GPL(debugfs_create_symlink); -static void __debugfs_remove(struct dentry *dentry, struct dentry *parent) +static int __debugfs_remove(struct dentry *dentry, struct dentry *parent) { int ret = 0; @@ -333,6 +330,7 @@ static void __debugfs_remove(struct dentry *dentry, struct dentry *parent) dput(dentry); } } + return ret; } /** @@ -351,7 +349,8 @@ static void __debugfs_remove(struct dentry *dentry, struct dentry *parent) void debugfs_remove(struct dentry *dentry) { struct dentry *parent; - + int ret; + if (!dentry) return; @@ -360,9 +359,10 @@ void debugfs_remove(struct dentry *dentry) return; mutex_lock(&parent->d_inode->i_mutex); - __debugfs_remove(dentry, parent); + ret = __debugfs_remove(dentry, parent); mutex_unlock(&parent->d_inode->i_mutex); - simple_release_fs(&debugfs_mount, &debugfs_mount_count); + if (!ret) + simple_release_fs(&debugfs_mount, &debugfs_mount_count); } EXPORT_SYMBOL_GPL(debugfs_remove); @@ -540,17 +540,5 @@ static int __init debugfs_init(void) return retval; } - -static void __exit debugfs_exit(void) -{ - debugfs_registered = false; - - simple_release_fs(&debugfs_mount, &debugfs_mount_count); - unregister_filesystem(&debug_fs_type); - kobject_put(debug_kobj); -} - core_initcall(debugfs_init); -module_exit(debugfs_exit); -MODULE_LICENSE("GPL"); diff --git a/fs/direct-io.c b/fs/direct-io.c index b044705eedd..dcb5577cde1 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -645,11 +645,11 @@ static int dio_send_cur_page(struct dio *dio) /* * See whether this new request is contiguous with the old. * - * Btrfs cannot handl having logically non-contiguous requests - * submitted. For exmple if you have + * Btrfs cannot handle having logically non-contiguous requests + * submitted. For example if you have * * Logical: [0-4095][HOLE][8192-12287] - * Phyiscal: [0-4095] [4096-8181] + * Physical: [0-4095] [4096-8191] * * We cannot submit those pages together as one BIO. So if our * current logical offset in the file does not equal what would diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c index 4314f0d48d8..abc49f29245 100644 --- a/fs/dlm/ast.c +++ b/fs/dlm/ast.c @@ -18,6 +18,7 @@ #define WAKE_ASTS 0 +static uint64_t ast_seq_count; static struct list_head ast_queue; static spinlock_t ast_queue_lock; static struct task_struct * astd_task; @@ -25,40 +26,186 @@ static unsigned long astd_wakeflags; static struct mutex astd_running; +static void dlm_dump_lkb_callbacks(struct dlm_lkb *lkb) +{ + int i; + + log_print("last_bast %x %llu flags %x mode %d sb %d %x", + lkb->lkb_id, + (unsigned long long)lkb->lkb_last_bast.seq, + lkb->lkb_last_bast.flags, + lkb->lkb_last_bast.mode, + lkb->lkb_last_bast.sb_status, + lkb->lkb_last_bast.sb_flags); + + log_print("last_cast %x %llu flags %x mode %d sb %d %x", + lkb->lkb_id, + (unsigned long long)lkb->lkb_last_cast.seq, + lkb->lkb_last_cast.flags, + lkb->lkb_last_cast.mode, + lkb->lkb_last_cast.sb_status, + lkb->lkb_last_cast.sb_flags); + + for (i = 0; i < DLM_CALLBACKS_SIZE; i++) { + log_print("cb %x %llu flags %x mode %d sb %d %x", + lkb->lkb_id, + (unsigned long long)lkb->lkb_callbacks[i].seq, + lkb->lkb_callbacks[i].flags, + lkb->lkb_callbacks[i].mode, + lkb->lkb_callbacks[i].sb_status, + lkb->lkb_callbacks[i].sb_flags); + } +} + void dlm_del_ast(struct dlm_lkb *lkb) { spin_lock(&ast_queue_lock); - if (lkb->lkb_ast_type & (AST_COMP | AST_BAST)) - list_del(&lkb->lkb_astqueue); + if (!list_empty(&lkb->lkb_astqueue)) + list_del_init(&lkb->lkb_astqueue); spin_unlock(&ast_queue_lock); } -void dlm_add_ast(struct dlm_lkb *lkb, int type, int mode) +int dlm_add_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode, + int status, uint32_t sbflags, uint64_t seq) { + struct dlm_ls *ls = lkb->lkb_resource->res_ls; + uint64_t prev_seq; + int prev_mode; + int i; + + for (i = 0; i < DLM_CALLBACKS_SIZE; i++) { + if (lkb->lkb_callbacks[i].seq) + continue; + + /* + * Suppress some redundant basts here, do more on removal. + * Don't even add a bast if the callback just before it + * is a bast for the same mode or a more restrictive mode. + * (the addional > PR check is needed for PR/CW inversion) + */ + + if ((i > 0) && (flags & DLM_CB_BAST) && + (lkb->lkb_callbacks[i-1].flags & DLM_CB_BAST)) { + + prev_seq = lkb->lkb_callbacks[i-1].seq; + prev_mode = lkb->lkb_callbacks[i-1].mode; + + if ((prev_mode == mode) || + (prev_mode > mode && prev_mode > DLM_LOCK_PR)) { + + log_debug(ls, "skip %x add bast %llu mode %d " + "for bast %llu mode %d", + lkb->lkb_id, + (unsigned long long)seq, + mode, + (unsigned long long)prev_seq, + prev_mode); + return 0; + } + } + + lkb->lkb_callbacks[i].seq = seq; + lkb->lkb_callbacks[i].flags = flags; + lkb->lkb_callbacks[i].mode = mode; + lkb->lkb_callbacks[i].sb_status = status; + lkb->lkb_callbacks[i].sb_flags = (sbflags & 0x000000FF); + break; + } + + if (i == DLM_CALLBACKS_SIZE) { + log_error(ls, "no callbacks %x %llu flags %x mode %d sb %d %x", + lkb->lkb_id, (unsigned long long)seq, + flags, mode, status, sbflags); + dlm_dump_lkb_callbacks(lkb); + return -1; + } + + return 0; +} + +int dlm_rem_lkb_callback(struct dlm_ls *ls, struct dlm_lkb *lkb, + struct dlm_callback *cb, int *resid) +{ + int i; + + *resid = 0; + + if (!lkb->lkb_callbacks[0].seq) + return -ENOENT; + + /* oldest undelivered cb is callbacks[0] */ + + memcpy(cb, &lkb->lkb_callbacks[0], sizeof(struct dlm_callback)); + memset(&lkb->lkb_callbacks[0], 0, sizeof(struct dlm_callback)); + + /* shift others down */ + + for (i = 1; i < DLM_CALLBACKS_SIZE; i++) { + if (!lkb->lkb_callbacks[i].seq) + break; + memcpy(&lkb->lkb_callbacks[i-1], &lkb->lkb_callbacks[i], + sizeof(struct dlm_callback)); + memset(&lkb->lkb_callbacks[i], 0, sizeof(struct dlm_callback)); + (*resid)++; + } + + /* if cb is a bast, it should be skipped if the blocking mode is + compatible with the last granted mode */ + + if ((cb->flags & DLM_CB_BAST) && lkb->lkb_last_cast.seq) { + if (dlm_modes_compat(cb->mode, lkb->lkb_last_cast.mode)) { + cb->flags |= DLM_CB_SKIP; + + log_debug(ls, "skip %x bast %llu mode %d " + "for cast %llu mode %d", + lkb->lkb_id, + (unsigned long long)cb->seq, + cb->mode, + (unsigned long long)lkb->lkb_last_cast.seq, + lkb->lkb_last_cast.mode); + return 0; + } + } + + if (cb->flags & DLM_CB_CAST) { + memcpy(&lkb->lkb_last_cast, cb, sizeof(struct dlm_callback)); + lkb->lkb_last_cast_time = ktime_get(); + } + + if (cb->flags & DLM_CB_BAST) { + memcpy(&lkb->lkb_last_bast, cb, sizeof(struct dlm_callback)); + lkb->lkb_last_bast_time = ktime_get(); + } + + return 0; +} + +void dlm_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode, int status, + uint32_t sbflags) +{ + uint64_t seq; + int rv; + + spin_lock(&ast_queue_lock); + + seq = ++ast_seq_count; + if (lkb->lkb_flags & DLM_IFL_USER) { - dlm_user_add_ast(lkb, type, mode); + spin_unlock(&ast_queue_lock); + dlm_user_add_ast(lkb, flags, mode, status, sbflags, seq); return; } - spin_lock(&ast_queue_lock); - if (!(lkb->lkb_ast_type & (AST_COMP | AST_BAST))) { + rv = dlm_add_lkb_callback(lkb, flags, mode, status, sbflags, seq); + if (rv < 0) { + spin_unlock(&ast_queue_lock); + return; + } + + if (list_empty(&lkb->lkb_astqueue)) { kref_get(&lkb->lkb_ref); list_add_tail(&lkb->lkb_astqueue, &ast_queue); - lkb->lkb_ast_first = type; } - - /* sanity check, this should not happen */ - - if ((type == AST_COMP) && (lkb->lkb_ast_type & AST_COMP)) - log_print("repeat cast %d castmode %d lock %x %s", - mode, lkb->lkb_castmode, - lkb->lkb_id, lkb->lkb_resource->res_name); - - lkb->lkb_ast_type |= type; - if (type == AST_BAST) - lkb->lkb_bastmode = mode; - else - lkb->lkb_castmode = mode; spin_unlock(&ast_queue_lock); set_bit(WAKE_ASTS, &astd_wakeflags); @@ -72,7 +219,8 @@ static void process_asts(void) struct dlm_lkb *lkb; void (*castfn) (void *astparam); void (*bastfn) (void *astparam, int mode); - int type, first, bastmode, castmode, do_bast, do_cast, last_castmode; + struct dlm_callback callbacks[DLM_CALLBACKS_SIZE]; + int i, rv, resid; repeat: spin_lock(&ast_queue_lock); @@ -83,54 +231,45 @@ repeat: if (dlm_locking_stopped(ls)) continue; - list_del(&lkb->lkb_astqueue); - type = lkb->lkb_ast_type; - lkb->lkb_ast_type = 0; - first = lkb->lkb_ast_first; - lkb->lkb_ast_first = 0; - bastmode = lkb->lkb_bastmode; - castmode = lkb->lkb_castmode; + /* we remove from astqueue list and remove everything in + lkb_callbacks before releasing the spinlock so empty + lkb_astqueue is always consistent with empty lkb_callbacks */ + + list_del_init(&lkb->lkb_astqueue); + castfn = lkb->lkb_astfn; bastfn = lkb->lkb_bastfn; - spin_unlock(&ast_queue_lock); - do_cast = (type & AST_COMP) && castfn; - do_bast = (type & AST_BAST) && bastfn; + memset(&callbacks, 0, sizeof(callbacks)); - /* Skip a bast if its blocking mode is compatible with the - granted mode of the preceding cast. */ + for (i = 0; i < DLM_CALLBACKS_SIZE; i++) { + rv = dlm_rem_lkb_callback(ls, lkb, &callbacks[i], &resid); + if (rv < 0) + break; + } + spin_unlock(&ast_queue_lock); - if (do_bast) { - if (first == AST_COMP) - last_castmode = castmode; - else - last_castmode = lkb->lkb_castmode_done; - if (dlm_modes_compat(bastmode, last_castmode)) - do_bast = 0; + if (resid) { + /* shouldn't happen, for loop should have removed all */ + log_error(ls, "callback resid %d lkb %x", + resid, lkb->lkb_id); } - if (first == AST_COMP) { - if (do_cast) - castfn(lkb->lkb_astparam); - if (do_bast) - bastfn(lkb->lkb_astparam, bastmode); - } else if (first == AST_BAST) { - if (do_bast) - bastfn(lkb->lkb_astparam, bastmode); - if (do_cast) + for (i = 0; i < DLM_CALLBACKS_SIZE; i++) { + if (!callbacks[i].seq) + break; + if (callbacks[i].flags & DLM_CB_SKIP) { + continue; + } else if (callbacks[i].flags & DLM_CB_BAST) { + bastfn(lkb->lkb_astparam, callbacks[i].mode); + } else if (callbacks[i].flags & DLM_CB_CAST) { + lkb->lkb_lksb->sb_status = callbacks[i].sb_status; + lkb->lkb_lksb->sb_flags = callbacks[i].sb_flags; castfn(lkb->lkb_astparam); - } else { - log_error(ls, "bad ast_first %d ast_type %d", - first, type); + } } - if (do_cast) - lkb->lkb_castmode_done = castmode; - if (do_bast) - lkb->lkb_bastmode_done = bastmode; - - /* this removes the reference added by dlm_add_ast - and may result in the lkb being freed */ + /* removes ref for ast_queue, may cause lkb to be freed */ dlm_put_lkb(lkb); cond_resched(); diff --git a/fs/dlm/ast.h b/fs/dlm/ast.h index bcb1aaba519..8aa89c9b561 100644 --- a/fs/dlm/ast.h +++ b/fs/dlm/ast.h @@ -13,8 +13,13 @@ #ifndef __ASTD_DOT_H__ #define __ASTD_DOT_H__ -void dlm_add_ast(struct dlm_lkb *lkb, int type, int mode); void dlm_del_ast(struct dlm_lkb *lkb); +int dlm_add_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode, + int status, uint32_t sbflags, uint64_t seq); +int dlm_rem_lkb_callback(struct dlm_ls *ls, struct dlm_lkb *lkb, + struct dlm_callback *cb, int *resid); +void dlm_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode, int status, + uint32_t sbflags); void dlm_astd_wake(void); int dlm_astd_start(void); diff --git a/fs/dlm/config.c b/fs/dlm/config.c index b54bca03d92..0d329ff8ed4 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -977,9 +977,9 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num) /* Config file defaults */ #define DEFAULT_TCP_PORT 21064 #define DEFAULT_BUFFER_SIZE 4096 -#define DEFAULT_RSBTBL_SIZE 256 +#define DEFAULT_RSBTBL_SIZE 1024 #define DEFAULT_LKBTBL_SIZE 1024 -#define DEFAULT_DIRTBL_SIZE 512 +#define DEFAULT_DIRTBL_SIZE 1024 #define DEFAULT_RECOVER_TIMER 5 #define DEFAULT_TOSS_SECS 10 #define DEFAULT_SCAN_SECS 5 diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c index 6b42ba807df..59779237e2b 100644 --- a/fs/dlm/debug_fs.c +++ b/fs/dlm/debug_fs.c @@ -257,12 +257,12 @@ static int print_format3_lock(struct seq_file *s, struct dlm_lkb *lkb, lkb->lkb_status, lkb->lkb_grmode, lkb->lkb_rqmode, - lkb->lkb_bastmode, + lkb->lkb_last_bast.mode, rsb_lookup, lkb->lkb_wait_type, lkb->lkb_lvbseq, (unsigned long long)ktime_to_ns(lkb->lkb_timestamp), - (unsigned long long)ktime_to_ns(lkb->lkb_time_bast)); + (unsigned long long)ktime_to_ns(lkb->lkb_last_bast_time)); return rv; } diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index f632b58cd22..b9420491301 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -192,11 +192,6 @@ struct dlm_args { * lkb is a process copy, the nodeid specifies the lock master. */ -/* lkb_ast_type */ - -#define AST_COMP 1 -#define AST_BAST 2 - /* lkb_status */ #define DLM_LKSTS_WAITING 1 @@ -217,6 +212,20 @@ struct dlm_args { #define DLM_IFL_USER 0x00000001 #define DLM_IFL_ORPHAN 0x00000002 +#define DLM_CALLBACKS_SIZE 6 + +#define DLM_CB_CAST 0x00000001 +#define DLM_CB_BAST 0x00000002 +#define DLM_CB_SKIP 0x00000004 + +struct dlm_callback { + uint64_t seq; + uint32_t flags; /* DLM_CBF_ */ + int sb_status; /* copy to lksb status */ + uint8_t sb_flags; /* copy to lksb flags */ + int8_t mode; /* rq mode of bast, gr mode of cast */ +}; + struct dlm_lkb { struct dlm_rsb *lkb_resource; /* the rsb */ struct kref lkb_ref; @@ -236,13 +245,6 @@ struct dlm_lkb { int8_t lkb_wait_type; /* type of reply waiting for */ int8_t lkb_wait_count; - int8_t lkb_ast_type; /* type of ast queued for */ - int8_t lkb_ast_first; /* type of first ast queued */ - - int8_t lkb_bastmode; /* req mode of queued bast */ - int8_t lkb_castmode; /* gr mode of queued cast */ - int8_t lkb_bastmode_done; /* last delivered bastmode */ - int8_t lkb_castmode_done; /* last delivered castmode */ struct list_head lkb_idtbl_list; /* lockspace lkbtbl */ struct list_head lkb_statequeue; /* rsb g/c/w list */ @@ -251,10 +253,15 @@ struct dlm_lkb { struct list_head lkb_astqueue; /* need ast to be sent */ struct list_head lkb_ownqueue; /* list of locks for a process */ struct list_head lkb_time_list; - ktime_t lkb_time_bast; /* for debugging */ ktime_t lkb_timestamp; unsigned long lkb_timeout_cs; + struct dlm_callback lkb_callbacks[DLM_CALLBACKS_SIZE]; + struct dlm_callback lkb_last_cast; + struct dlm_callback lkb_last_bast; + ktime_t lkb_last_cast_time; /* for debugging */ + ktime_t lkb_last_bast_time; /* for debugging */ + char *lkb_lvbptr; struct dlm_lksb *lkb_lksb; /* caller's status block */ void (*lkb_astfn) (void *astparam); @@ -544,8 +551,6 @@ struct dlm_user_args { (dlm_user_proc) on the struct file, the process's locks point back to it*/ struct dlm_lksb lksb; - int old_mode; - int update_user_lvb; struct dlm_lksb __user *user_lksb; void __user *castparam; void __user *castaddr; diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 64e5f3efdd8..04b8c449303 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -160,10 +160,10 @@ static const int __quecvt_compat_matrix[8][8] = { void dlm_print_lkb(struct dlm_lkb *lkb) { printk(KERN_ERR "lkb: nodeid %d id %x remid %x exflags %x flags %x\n" - " status %d rqmode %d grmode %d wait_type %d ast_type %d\n", + " status %d rqmode %d grmode %d wait_type %d\n", lkb->lkb_nodeid, lkb->lkb_id, lkb->lkb_remid, lkb->lkb_exflags, lkb->lkb_flags, lkb->lkb_status, lkb->lkb_rqmode, - lkb->lkb_grmode, lkb->lkb_wait_type, lkb->lkb_ast_type); + lkb->lkb_grmode, lkb->lkb_wait_type); } static void dlm_print_rsb(struct dlm_rsb *r) @@ -305,10 +305,7 @@ static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv) rv = -EDEADLK; } - lkb->lkb_lksb->sb_status = rv; - lkb->lkb_lksb->sb_flags = lkb->lkb_sbflags; - - dlm_add_ast(lkb, AST_COMP, lkb->lkb_grmode); + dlm_add_ast(lkb, DLM_CB_CAST, lkb->lkb_grmode, rv, lkb->lkb_sbflags); } static inline void queue_cast_overlap(struct dlm_rsb *r, struct dlm_lkb *lkb) @@ -319,13 +316,10 @@ static inline void queue_cast_overlap(struct dlm_rsb *r, struct dlm_lkb *lkb) static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode) { - lkb->lkb_time_bast = ktime_get(); - if (is_master_copy(lkb)) { - lkb->lkb_bastmode = rqmode; /* printed by debugfs */ send_bast(r, lkb, rqmode); } else { - dlm_add_ast(lkb, AST_BAST, rqmode); + dlm_add_ast(lkb, DLM_CB_BAST, rqmode, 0, 0); } } @@ -600,6 +594,7 @@ static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret) INIT_LIST_HEAD(&lkb->lkb_ownqueue); INIT_LIST_HEAD(&lkb->lkb_rsb_lookup); INIT_LIST_HEAD(&lkb->lkb_time_list); + INIT_LIST_HEAD(&lkb->lkb_astqueue); get_random_bytes(&bucket, sizeof(bucket)); bucket &= (ls->ls_lkbtbl_size - 1); @@ -2819,9 +2814,9 @@ static void send_args(struct dlm_rsb *r, struct dlm_lkb *lkb, not from lkb fields */ if (lkb->lkb_bastfn) - ms->m_asts |= AST_BAST; + ms->m_asts |= DLM_CB_BAST; if (lkb->lkb_astfn) - ms->m_asts |= AST_COMP; + ms->m_asts |= DLM_CB_CAST; /* compare with switch in create_message; send_remove() doesn't use send_args() */ @@ -3122,8 +3117,8 @@ static int receive_request_args(struct dlm_ls *ls, struct dlm_lkb *lkb, lkb->lkb_grmode = DLM_LOCK_IV; lkb->lkb_rqmode = ms->m_rqmode; - lkb->lkb_bastfn = (ms->m_asts & AST_BAST) ? &fake_bastfn : NULL; - lkb->lkb_astfn = (ms->m_asts & AST_COMP) ? &fake_astfn : NULL; + lkb->lkb_bastfn = (ms->m_asts & DLM_CB_BAST) ? &fake_bastfn : NULL; + lkb->lkb_astfn = (ms->m_asts & DLM_CB_CAST) ? &fake_astfn : NULL; if (lkb->lkb_exflags & DLM_LKF_VALBLK) { /* lkb was just created so there won't be an lvb yet */ @@ -4412,8 +4407,8 @@ static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, lkb->lkb_grmode = rl->rl_grmode; /* don't set lkb_status because add_lkb wants to itself */ - lkb->lkb_bastfn = (rl->rl_asts & AST_BAST) ? &fake_bastfn : NULL; - lkb->lkb_astfn = (rl->rl_asts & AST_COMP) ? &fake_astfn : NULL; + lkb->lkb_bastfn = (rl->rl_asts & DLM_CB_BAST) ? &fake_bastfn : NULL; + lkb->lkb_astfn = (rl->rl_asts & DLM_CB_CAST) ? &fake_astfn : NULL; if (lkb->lkb_exflags & DLM_LKF_VALBLK) { int lvblen = rc->rc_header.h_length - sizeof(struct dlm_rcom) - @@ -4589,7 +4584,6 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, error = set_lock_args(mode, &ua->lksb, flags, namelen, timeout_cs, fake_astfn, ua, fake_bastfn, &args); lkb->lkb_flags |= DLM_IFL_USER; - ua->old_mode = DLM_LOCK_IV; if (error) { __put_lkb(ls, lkb); @@ -4658,7 +4652,6 @@ int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, ua->bastparam = ua_tmp->bastparam; ua->bastaddr = ua_tmp->bastaddr; ua->user_lksb = ua_tmp->user_lksb; - ua->old_mode = lkb->lkb_grmode; error = set_lock_args(mode, &ua->lksb, flags, 0, timeout_cs, fake_astfn, ua, fake_bastfn, &args); @@ -4917,8 +4910,9 @@ void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc) } list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_astqueue) { - lkb->lkb_ast_type = 0; - list_del(&lkb->lkb_astqueue); + memset(&lkb->lkb_callbacks, 0, + sizeof(struct dlm_callback) * DLM_CALLBACKS_SIZE); + list_del_init(&lkb->lkb_astqueue); dlm_put_lkb(lkb); } @@ -4958,7 +4952,9 @@ static void purge_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc) spin_lock(&proc->asts_spin); list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_astqueue) { - list_del(&lkb->lkb_astqueue); + memset(&lkb->lkb_callbacks, 0, + sizeof(struct dlm_callback) * DLM_CALLBACKS_SIZE); + list_del_init(&lkb->lkb_astqueue); dlm_put_lkb(lkb); } spin_unlock(&proc->asts_spin); diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 2d8c87b951c..bffa1e73b9a 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1468,13 +1468,15 @@ static void work_stop(void) static int work_start(void) { - recv_workqueue = create_singlethread_workqueue("dlm_recv"); + recv_workqueue = alloc_workqueue("dlm_recv", + WQ_UNBOUND | WQ_MEM_RECLAIM, 1); if (!recv_workqueue) { log_print("can't start dlm_recv"); return -ENOMEM; } - send_workqueue = create_singlethread_workqueue("dlm_send"); + send_workqueue = alloc_workqueue("dlm_send", + WQ_UNBOUND | WQ_MEM_RECLAIM, 1); if (!send_workqueue) { log_print("can't start dlm_send"); destroy_workqueue(recv_workqueue); diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c index 3c83a49a48a..f10a50f24e8 100644 --- a/fs/dlm/rcom.c +++ b/fs/dlm/rcom.c @@ -321,9 +321,9 @@ static void pack_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb, rl->rl_wait_type = cpu_to_le16(lkb->lkb_wait_type); if (lkb->lkb_bastfn) - rl->rl_asts |= AST_BAST; + rl->rl_asts |= DLM_CB_BAST; if (lkb->lkb_astfn) - rl->rl_asts |= AST_COMP; + rl->rl_asts |= DLM_CB_CAST; rl->rl_namelen = cpu_to_le16(r->res_length); memcpy(rl->rl_name, r->res_name, r->res_length); diff --git a/fs/dlm/user.c b/fs/dlm/user.c index 66d6c16bf44..d5ab3fe7c19 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -24,6 +24,7 @@ #include "lock.h" #include "lvb_table.h" #include "user.h" +#include "ast.h" static const char name_prefix[] = "dlm"; static const struct file_operations device_fops; @@ -152,19 +153,16 @@ static void compat_output(struct dlm_lock_result *res, not related to the lifetime of the lkb struct which is managed entirely by refcount. */ -static int lkb_is_endoflife(struct dlm_lkb *lkb, int sb_status, int type) +static int lkb_is_endoflife(int mode, int status) { - switch (sb_status) { + switch (status) { case -DLM_EUNLOCK: return 1; case -DLM_ECANCEL: case -ETIMEDOUT: case -EDEADLK: - if (lkb->lkb_grmode == DLM_LOCK_IV) - return 1; - break; case -EAGAIN: - if (type == AST_COMP && lkb->lkb_grmode == DLM_LOCK_IV) + if (mode == DLM_LOCK_IV) return 1; break; } @@ -174,12 +172,13 @@ static int lkb_is_endoflife(struct dlm_lkb *lkb, int sb_status, int type) /* we could possibly check if the cancel of an orphan has resulted in the lkb being removed and then remove that lkb from the orphans list and free it */ -void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int mode) +void dlm_user_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode, + int status, uint32_t sbflags, uint64_t seq) { struct dlm_ls *ls; struct dlm_user_args *ua; struct dlm_user_proc *proc; - int eol = 0, ast_type; + int rv; if (lkb->lkb_flags & (DLM_IFL_ORPHAN | DLM_IFL_DEAD)) return; @@ -200,49 +199,29 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int mode) ua = lkb->lkb_ua; proc = ua->proc; - if (type == AST_BAST && ua->bastaddr == NULL) + if ((flags & DLM_CB_BAST) && ua->bastaddr == NULL) goto out; + if ((flags & DLM_CB_CAST) && lkb_is_endoflife(mode, status)) + lkb->lkb_flags |= DLM_IFL_ENDOFLIFE; + spin_lock(&proc->asts_spin); - ast_type = lkb->lkb_ast_type; - lkb->lkb_ast_type |= type; - if (type == AST_BAST) - lkb->lkb_bastmode = mode; - else - lkb->lkb_castmode = mode; + rv = dlm_add_lkb_callback(lkb, flags, mode, status, sbflags, seq); + if (rv < 0) { + spin_unlock(&proc->asts_spin); + goto out; + } - if (!ast_type) { + if (list_empty(&lkb->lkb_astqueue)) { kref_get(&lkb->lkb_ref); list_add_tail(&lkb->lkb_astqueue, &proc->asts); - lkb->lkb_ast_first = type; wake_up_interruptible(&proc->wait); } - if (type == AST_COMP && (ast_type & AST_COMP)) - log_debug(ls, "ast overlap %x status %x %x", - lkb->lkb_id, ua->lksb.sb_status, lkb->lkb_flags); - - eol = lkb_is_endoflife(lkb, ua->lksb.sb_status, type); - if (eol) { - lkb->lkb_flags |= DLM_IFL_ENDOFLIFE; - } - - /* We want to copy the lvb to userspace when the completion - ast is read if the status is 0, the lock has an lvb and - lvb_ops says we should. We could probably have set_lvb_lock() - set update_user_lvb instead and not need old_mode */ - - if ((lkb->lkb_ast_type & AST_COMP) && - (lkb->lkb_lksb->sb_status == 0) && - lkb->lkb_lksb->sb_lvbptr && - dlm_lvb_operations[ua->old_mode + 1][lkb->lkb_grmode + 1]) - ua->update_user_lvb = 1; - else - ua->update_user_lvb = 0; - spin_unlock(&proc->asts_spin); - if (eol) { + if (lkb->lkb_flags & DLM_IFL_ENDOFLIFE) { + /* N.B. spin_lock locks_spin, not asts_spin */ spin_lock(&proc->locks_spin); if (!list_empty(&lkb->lkb_ownqueue)) { list_del_init(&lkb->lkb_ownqueue); @@ -705,8 +684,9 @@ static int device_close(struct inode *inode, struct file *file) return 0; } -static int copy_result_to_user(struct dlm_user_args *ua, int compat, int type, - int mode, char __user *buf, size_t count) +static int copy_result_to_user(struct dlm_user_args *ua, int compat, + uint32_t flags, int mode, int copy_lvb, + char __user *buf, size_t count) { #ifdef CONFIG_COMPAT struct dlm_lock_result32 result32; @@ -730,7 +710,7 @@ static int copy_result_to_user(struct dlm_user_args *ua, int compat, int type, notes that a new blocking AST address and parameter are set even if the conversion fails, so maybe we should just do that. */ - if (type == AST_BAST) { + if (flags & DLM_CB_BAST) { result.user_astaddr = ua->bastaddr; result.user_astparam = ua->bastparam; result.bast_mode = mode; @@ -750,8 +730,7 @@ static int copy_result_to_user(struct dlm_user_args *ua, int compat, int type, /* copy lvb to userspace if there is one, it's been updated, and the user buffer has space for it */ - if (ua->update_user_lvb && ua->lksb.sb_lvbptr && - count >= len + DLM_USER_LVB_LEN) { + if (copy_lvb && ua->lksb.sb_lvbptr && count >= len + DLM_USER_LVB_LEN) { if (copy_to_user(buf+len, ua->lksb.sb_lvbptr, DLM_USER_LVB_LEN)) { error = -EFAULT; @@ -801,13 +780,12 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count, struct dlm_user_proc *proc = file->private_data; struct dlm_lkb *lkb; DECLARE_WAITQUEUE(wait, current); - int error = 0, removed; - int ret_type, ret_mode; - int bastmode, castmode, do_bast, do_cast; + struct dlm_callback cb; + int rv, resid, copy_lvb = 0; if (count == sizeof(struct dlm_device_version)) { - error = copy_version_to_user(buf, count); - return error; + rv = copy_version_to_user(buf, count); + return rv; } if (!proc) { @@ -854,92 +832,57 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count, } } - /* there may be both completion and blocking asts to return for - the lkb, don't remove lkb from asts list unless no asts remain */ + /* if we empty lkb_callbacks, we don't want to unlock the spinlock + without removing lkb_astqueue; so empty lkb_astqueue is always + consistent with empty lkb_callbacks */ lkb = list_entry(proc->asts.next, struct dlm_lkb, lkb_astqueue); - removed = 0; - ret_type = 0; - ret_mode = 0; - do_bast = lkb->lkb_ast_type & AST_BAST; - do_cast = lkb->lkb_ast_type & AST_COMP; - bastmode = lkb->lkb_bastmode; - castmode = lkb->lkb_castmode; - - /* when both are queued figure out which to do first and - switch first so the other goes in the next read */ - - if (do_cast && do_bast) { - if (lkb->lkb_ast_first == AST_COMP) { - ret_type = AST_COMP; - ret_mode = castmode; - lkb->lkb_ast_type &= ~AST_COMP; - lkb->lkb_ast_first = AST_BAST; - } else { - ret_type = AST_BAST; - ret_mode = bastmode; - lkb->lkb_ast_type &= ~AST_BAST; - lkb->lkb_ast_first = AST_COMP; - } - } else { - ret_type = lkb->lkb_ast_first; - ret_mode = (ret_type == AST_COMP) ? castmode : bastmode; - lkb->lkb_ast_type &= ~ret_type; - lkb->lkb_ast_first = 0; + rv = dlm_rem_lkb_callback(lkb->lkb_resource->res_ls, lkb, &cb, &resid); + if (rv < 0) { + /* this shouldn't happen; lkb should have been removed from + list when resid was zero */ + log_print("dlm_rem_lkb_callback empty %x", lkb->lkb_id); + list_del_init(&lkb->lkb_astqueue); + spin_unlock(&proc->asts_spin); + /* removes ref for proc->asts, may cause lkb to be freed */ + dlm_put_lkb(lkb); + goto try_another; } + if (!resid) + list_del_init(&lkb->lkb_astqueue); + spin_unlock(&proc->asts_spin); - /* if we're doing a bast but the bast is unnecessary, then - switch to do nothing or do a cast if that was needed next */ - - if ((ret_type == AST_BAST) && - dlm_modes_compat(bastmode, lkb->lkb_castmode_done)) { - ret_type = 0; - ret_mode = 0; - - if (do_cast) { - ret_type = AST_COMP; - ret_mode = castmode; - lkb->lkb_ast_type &= ~AST_COMP; - lkb->lkb_ast_first = 0; - } + if (cb.flags & DLM_CB_SKIP) { + /* removes ref for proc->asts, may cause lkb to be freed */ + if (!resid) + dlm_put_lkb(lkb); + goto try_another; } - if (lkb->lkb_ast_first != lkb->lkb_ast_type) { - log_print("device_read %x ast_first %x ast_type %x", - lkb->lkb_id, lkb->lkb_ast_first, lkb->lkb_ast_type); - } + if (cb.flags & DLM_CB_CAST) { + int old_mode, new_mode; - if (!lkb->lkb_ast_type) { - list_del(&lkb->lkb_astqueue); - removed = 1; - } - spin_unlock(&proc->asts_spin); + old_mode = lkb->lkb_last_cast.mode; + new_mode = cb.mode; - if (ret_type) { - error = copy_result_to_user(lkb->lkb_ua, - test_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags), - ret_type, ret_mode, buf, count); + if (!cb.sb_status && lkb->lkb_lksb->sb_lvbptr && + dlm_lvb_operations[old_mode + 1][new_mode + 1]) + copy_lvb = 1; - if (ret_type == AST_COMP) - lkb->lkb_castmode_done = castmode; - if (ret_type == AST_BAST) - lkb->lkb_bastmode_done = bastmode; + lkb->lkb_lksb->sb_status = cb.sb_status; + lkb->lkb_lksb->sb_flags = cb.sb_flags; } - /* removes reference for the proc->asts lists added by - dlm_user_add_ast() and may result in the lkb being freed */ + rv = copy_result_to_user(lkb->lkb_ua, + test_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags), + cb.flags, cb.mode, copy_lvb, buf, count); - if (removed) + /* removes ref for proc->asts, may cause lkb to be freed */ + if (!resid) dlm_put_lkb(lkb); - /* the bast that was queued was eliminated (see unnecessary above), - leaving nothing to return */ - - if (!ret_type) - goto try_another; - - return error; + return rv; } static unsigned int device_poll(struct file *file, poll_table *wait) diff --git a/fs/dlm/user.h b/fs/dlm/user.h index f196091dd7f..00499ab8835 100644 --- a/fs/dlm/user.h +++ b/fs/dlm/user.h @@ -9,7 +9,8 @@ #ifndef __USER_DOT_H__ #define __USER_DOT_H__ -void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int mode); +void dlm_user_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode, + int status, uint32_t sbflags, uint64_t seq); int dlm_user_init(void); void dlm_user_exit(void); int dlm_device_deregister(struct dlm_ls *ls); diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 4a09af9e9a6..ff12f7ac73e 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -62,7 +62,7 @@ * This mutex is acquired by ep_free() during the epoll file * cleanup path and it is also acquired by eventpoll_release_file() * if a file has been pushed inside an epoll set and it is then - * close()d without a previous call toepoll_ctl(EPOLL_CTL_DEL). + * close()d without a previous call to epoll_ctl(EPOLL_CTL_DEL). * It is also acquired when inserting an epoll fd onto another epoll * fd. We do this so that we walk the epoll tree and ensure that this * insertion does not create a cycle of epoll file descriptors, which @@ -152,11 +152,11 @@ struct epitem { /* * This structure is stored inside the "private_data" member of the file - * structure and rapresent the main data sructure for the eventpoll + * structure and represents the main data structure for the eventpoll * interface. */ struct eventpoll { - /* Protect the this structure access */ + /* Protect the access to this structure */ spinlock_t lock; /* @@ -793,7 +793,7 @@ static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd) /* * This is the callback that is passed to the wait queue wakeup - * machanism. It is called by the stored file descriptors when they + * mechanism. It is called by the stored file descriptors when they * have events to report. */ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key) @@ -824,9 +824,9 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k goto out_unlock; /* - * If we are trasfering events to userspace, we can hold no locks + * If we are transferring events to userspace, we can hold no locks * (because we're accessing user memory, and because of linux f_op->poll() - * semantics). All the events that happens during that period of time are + * semantics). All the events that happen during that period of time are * chained in ep->ovflist and requeued later on. */ if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) { diff --git a/fs/exec.c b/fs/exec.c index 52a447d9b6a..ba99e1abb1a 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -115,13 +115,16 @@ SYSCALL_DEFINE1(uselib, const char __user *, library) struct file *file; char *tmp = getname(library); int error = PTR_ERR(tmp); + static const struct open_flags uselib_flags = { + .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, + .acc_mode = MAY_READ | MAY_EXEC | MAY_OPEN, + .intent = LOOKUP_OPEN + }; if (IS_ERR(tmp)) goto out; - file = do_filp_open(AT_FDCWD, tmp, - O_LARGEFILE | O_RDONLY | __FMODE_EXEC, 0, - MAY_READ | MAY_EXEC | MAY_OPEN); + file = do_filp_open(AT_FDCWD, tmp, &uselib_flags, LOOKUP_FOLLOW); putname(tmp); error = PTR_ERR(file); if (IS_ERR(file)) @@ -721,10 +724,13 @@ struct file *open_exec(const char *name) { struct file *file; int err; + static const struct open_flags open_exec_flags = { + .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, + .acc_mode = MAY_EXEC | MAY_OPEN, + .intent = LOOKUP_OPEN + }; - file = do_filp_open(AT_FDCWD, name, - O_LARGEFILE | O_RDONLY | __FMODE_EXEC, 0, - MAY_EXEC | MAY_OPEN); + file = do_filp_open(AT_FDCWD, name, &open_exec_flags, LOOKUP_FOLLOW); if (IS_ERR(file)) goto out; diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 4b6825740dd..b05acb79613 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -320,9 +320,14 @@ static int export_encode_fh(struct dentry *dentry, struct fid *fid, struct inode * inode = dentry->d_inode; int len = *max_len; int type = FILEID_INO32_GEN; - - if (len < 2 || (connectable && len < 4)) + + if (connectable && (len < 4)) { + *max_len = 4; + return 255; + } else if (len < 2) { + *max_len = 2; return 255; + } len = 2; fid->i32.ino = inode->i_ino; @@ -369,6 +374,8 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid, /* * Try to get any dentry for the given file handle from the filesystem. */ + if (!nop || !nop->fh_to_dentry) + return ERR_PTR(-ESTALE); result = nop->fh_to_dentry(mnt->mnt_sb, fid, fh_len, fileid_type); if (!result) result = ERR_PTR(-ESTALE); diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index 6346a2acf32..1b48c337087 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -110,7 +110,7 @@ extern struct ext2_dir_entry_2 * ext2_dotdot (struct inode *, struct page **); extern void ext2_set_link(struct inode *, struct ext2_dir_entry_2 *, struct page *, struct inode *, int); /* ialloc.c */ -extern struct inode * ext2_new_inode (struct inode *, int); +extern struct inode * ext2_new_inode (struct inode *, int, const struct qstr *); extern void ext2_free_inode (struct inode *); extern unsigned long ext2_count_free_inodes (struct super_block *); extern void ext2_check_inodes_bitmap (struct super_block *); diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index ad70479aabf..ee9ed31948e 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -429,7 +429,8 @@ found: return group; } -struct inode *ext2_new_inode(struct inode *dir, int mode) +struct inode *ext2_new_inode(struct inode *dir, int mode, + const struct qstr *qstr) { struct super_block *sb; struct buffer_head *bitmap_bh = NULL; @@ -585,7 +586,7 @@ got: if (err) goto fail_free_drop; - err = ext2_init_security(inode,dir); + err = ext2_init_security(inode, dir, qstr); if (err) goto fail_free_drop; diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index adb91855ccd..ed5c5d496ee 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -104,7 +104,7 @@ static int ext2_create (struct inode * dir, struct dentry * dentry, int mode, st dquot_initialize(dir); - inode = ext2_new_inode(dir, mode); + inode = ext2_new_inode(dir, mode, &dentry->d_name); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -133,7 +133,7 @@ static int ext2_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_ dquot_initialize(dir); - inode = ext2_new_inode (dir, mode); + inode = ext2_new_inode (dir, mode, &dentry->d_name); err = PTR_ERR(inode); if (!IS_ERR(inode)) { init_special_inode(inode, inode->i_mode, rdev); @@ -159,7 +159,7 @@ static int ext2_symlink (struct inode * dir, struct dentry * dentry, dquot_initialize(dir); - inode = ext2_new_inode (dir, S_IFLNK | S_IRWXUGO); + inode = ext2_new_inode (dir, S_IFLNK | S_IRWXUGO, &dentry->d_name); err = PTR_ERR(inode); if (IS_ERR(inode)) goto out; @@ -230,7 +230,7 @@ static int ext2_mkdir(struct inode * dir, struct dentry * dentry, int mode) inode_inc_link_count(dir); - inode = ext2_new_inode (dir, S_IFDIR | mode); + inode = ext2_new_inode(dir, S_IFDIR | mode, &dentry->d_name); err = PTR_ERR(inode); if (IS_ERR(inode)) goto out_dir; diff --git a/fs/ext2/xattr.h b/fs/ext2/xattr.h index a1a1c218461..5e41cccff76 100644 --- a/fs/ext2/xattr.h +++ b/fs/ext2/xattr.h @@ -116,9 +116,11 @@ exit_ext2_xattr(void) # endif /* CONFIG_EXT2_FS_XATTR */ #ifdef CONFIG_EXT2_FS_SECURITY -extern int ext2_init_security(struct inode *inode, struct inode *dir); +extern int ext2_init_security(struct inode *inode, struct inode *dir, + const struct qstr *qstr); #else -static inline int ext2_init_security(struct inode *inode, struct inode *dir) +static inline int ext2_init_security(struct inode *inode, struct inode *dir, + const struct qstr *qstr) { return 0; } diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c index 3004e15d5da..5d979b4347b 100644 --- a/fs/ext2/xattr_security.c +++ b/fs/ext2/xattr_security.c @@ -47,14 +47,15 @@ ext2_xattr_security_set(struct dentry *dentry, const char *name, } int -ext2_init_security(struct inode *inode, struct inode *dir) +ext2_init_security(struct inode *inode, struct inode *dir, + const struct qstr *qstr) { int err; size_t len; void *value; char *name; - err = security_inode_init_security(inode, dir, &name, &value, &len); + err = security_inode_init_security(inode, dir, qstr, &name, &value, &len); if (err) { if (err == -EOPNOTSUPP) return 0; diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c index 045995c8ce5..153242187fc 100644 --- a/fs/ext3/balloc.c +++ b/fs/ext3/balloc.c @@ -1991,6 +1991,7 @@ ext3_grpblk_t ext3_trim_all_free(struct super_block *sb, unsigned int group, spin_unlock(sb_bgl_lock(sbi, group)); percpu_counter_sub(&sbi->s_freeblocks_counter, next - start); + free_blocks -= next - start; /* Do not issue a TRIM on extents smaller than minblocks */ if ((next - start) < minblocks) goto free_extent; @@ -2040,7 +2041,7 @@ free_extent: cond_resched(); /* No more suitable extents */ - if ((free_blocks - count) < minblocks) + if (free_blocks < minblocks) break; } @@ -2090,7 +2091,8 @@ int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range) ext3_fsblk_t max_blks = le32_to_cpu(es->s_blocks_count); int ret = 0; - start = range->start >> sb->s_blocksize_bits; + start = (range->start >> sb->s_blocksize_bits) + + le32_to_cpu(es->s_first_data_block); len = range->len >> sb->s_blocksize_bits; minlen = range->minlen >> sb->s_blocksize_bits; trimmed = 0; @@ -2099,10 +2101,6 @@ int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range) return -EINVAL; if (start >= max_blks) goto out; - if (start < le32_to_cpu(es->s_first_data_block)) { - len -= le32_to_cpu(es->s_first_data_block) - start; - start = le32_to_cpu(es->s_first_data_block); - } if (start + len > max_blks) len = max_blks - start; @@ -2129,10 +2127,15 @@ int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range) if (free_blocks < minlen) continue; - if (len >= EXT3_BLOCKS_PER_GROUP(sb)) - len -= (EXT3_BLOCKS_PER_GROUP(sb) - first_block); - else + /* + * For all the groups except the last one, last block will + * always be EXT3_BLOCKS_PER_GROUP(sb), so we only need to + * change it for the last group in which case first_block + + * len < EXT3_BLOCKS_PER_GROUP(sb). + */ + if (first_block + len < EXT3_BLOCKS_PER_GROUP(sb)) last_block = first_block + len; + len -= last_block - first_block; ret = ext3_trim_all_free(sb, group, first_block, last_block, minlen); diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c index 9724aef2246..bfc2dc43681 100644 --- a/fs/ext3/ialloc.c +++ b/fs/ext3/ialloc.c @@ -404,7 +404,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent) * For other inodes, search forward from the parent directory's block * group to find a free inode. */ -struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, int mode) +struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, + const struct qstr *qstr, int mode) { struct super_block *sb; struct buffer_head *bitmap_bh = NULL; @@ -589,7 +590,7 @@ got: if (err) goto fail_free_drop; - err = ext3_init_security(handle,inode, dir); + err = ext3_init_security(handle, inode, dir, qstr); if (err) goto fail_free_drop; diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index b27ba71810e..32f3b869585 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -1540,8 +1540,8 @@ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, goto cleanup; node2 = (struct dx_node *)(bh2->b_data); entries2 = node2->entries; + memset(&node2->fake, 0, sizeof(struct fake_dirent)); node2->fake.rec_len = ext3_rec_len_to_disk(sb->s_blocksize); - node2->fake.inode = 0; BUFFER_TRACE(frame->bh, "get_write_access"); err = ext3_journal_get_write_access(handle, frame->bh); if (err) @@ -1710,7 +1710,7 @@ retry: if (IS_DIRSYNC(dir)) handle->h_sync = 1; - inode = ext3_new_inode (handle, dir, mode); + inode = ext3_new_inode (handle, dir, &dentry->d_name, mode); err = PTR_ERR(inode); if (!IS_ERR(inode)) { inode->i_op = &ext3_file_inode_operations; @@ -1746,7 +1746,7 @@ retry: if (IS_DIRSYNC(dir)) handle->h_sync = 1; - inode = ext3_new_inode (handle, dir, mode); + inode = ext3_new_inode (handle, dir, &dentry->d_name, mode); err = PTR_ERR(inode); if (!IS_ERR(inode)) { init_special_inode(inode, inode->i_mode, rdev); @@ -1784,7 +1784,7 @@ retry: if (IS_DIRSYNC(dir)) handle->h_sync = 1; - inode = ext3_new_inode (handle, dir, S_IFDIR | mode); + inode = ext3_new_inode (handle, dir, &dentry->d_name, S_IFDIR | mode); err = PTR_ERR(inode); if (IS_ERR(inode)) goto out_stop; @@ -2206,7 +2206,7 @@ retry: if (IS_DIRSYNC(dir)) handle->h_sync = 1; - inode = ext3_new_inode (handle, dir, S_IFLNK|S_IRWXUGO); + inode = ext3_new_inode (handle, dir, &dentry->d_name, S_IFLNK|S_IRWXUGO); err = PTR_ERR(inode); if (IS_ERR(inode)) goto out_stop; @@ -2253,13 +2253,6 @@ static int ext3_link (struct dentry * old_dentry, dquot_initialize(dir); - /* - * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing - * otherwise has the potential to corrupt the orphan inode list. - */ - if (inode->i_nlink == 0) - return -ENOENT; - retry: handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + EXT3_INDEX_EXTRA_TRANS_BLOCKS); diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 85c8cc8f247..071689f86e1 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -1464,6 +1464,13 @@ static void ext3_orphan_cleanup (struct super_block * sb, return; } + /* Check if feature set allows readwrite operations */ + if (EXT3_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP)) { + ext3_msg(sb, KERN_INFO, "Skipping orphan cleanup due to " + "unknown ROCOMPAT features"); + return; + } + if (EXT3_SB(sb)->s_mount_state & EXT3_ERROR_FS) { if (es->s_last_orphan) jbd_debug(1, "Errors on filesystem, " @@ -1936,6 +1943,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) sb->s_qcop = &ext3_qctl_operations; sb->dq_op = &ext3_quota_operations; #endif + memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid)); INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ mutex_init(&sbi->s_orphan_lock); mutex_init(&sbi->s_resize_lock); diff --git a/fs/ext3/xattr.h b/fs/ext3/xattr.h index 377fe720116..2be4f69bfa6 100644 --- a/fs/ext3/xattr.h +++ b/fs/ext3/xattr.h @@ -128,10 +128,10 @@ exit_ext3_xattr(void) #ifdef CONFIG_EXT3_FS_SECURITY extern int ext3_init_security(handle_t *handle, struct inode *inode, - struct inode *dir); + struct inode *dir, const struct qstr *qstr); #else static inline int ext3_init_security(handle_t *handle, struct inode *inode, - struct inode *dir) + struct inode *dir, const struct qstr *qstr) { return 0; } diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c index 03a99bfc59f..b8d9f83aa5c 100644 --- a/fs/ext3/xattr_security.c +++ b/fs/ext3/xattr_security.c @@ -49,14 +49,15 @@ ext3_xattr_security_set(struct dentry *dentry, const char *name, } int -ext3_init_security(handle_t *handle, struct inode *inode, struct inode *dir) +ext3_init_security(handle_t *handle, struct inode *inode, struct inode *dir, + const struct qstr *qstr) { int err; size_t len; void *value; char *name; - err = security_inode_init_security(inode, dir, &name, &value, &len); + err = security_inode_init_security(inode, dir, qstr, &name, &value, &len); if (err) { if (err == -EOPNOTSUPP) return 0; diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index ccce8a7e94e..7516fb9c0bd 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -131,7 +131,7 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, * fragmenting the file system's free space. Maybe we * should have some hueristics or some way to allow * userspace to pass a hint to file system, - * especiially if the latter case turns out to be + * especially if the latter case turns out to be * common. */ ex = path[depth].p_ext; @@ -2844,7 +2844,7 @@ fix_extent_len: * ext4_get_blocks_dio_write() when DIO to write * to an uninitialized extent. * - * Writing to an uninitized extent may result in splitting the uninitialized + * Writing to an uninitialized extent may result in splitting the uninitialized * extent into multiple /initialized uninitialized extents (up to three) * There are three possibilities: * a> There is no split required: Entire extent should be uninitialized diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index eb9097aec6f..78b79e1bd7e 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1042,7 +1042,7 @@ got: if (err) goto fail_free_drop; - err = ext4_init_security(handle, inode, dir); + err = ext4_init_security(handle, inode, dir, qstr); if (err) goto fail_free_drop; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 5485390d32c..e781b7ea563 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2304,13 +2304,6 @@ static int ext4_link(struct dentry *old_dentry, dquot_initialize(dir); - /* - * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing - * otherwise has the potential to corrupt the orphan inode list. - */ - if (inode->i_nlink == 0) - return -ENOENT; - retry: handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index f6a318f836b..203f9e4a70b 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3415,6 +3415,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sb->s_qcop = &ext4_qctl_operations; sb->dq_op = &ext4_quota_operations; #endif + memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid)); + INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ mutex_init(&sbi->s_orphan_lock); mutex_init(&sbi->s_resize_lock); @@ -3509,7 +3511,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) percpu_counter_set(&sbi->s_dirtyblocks_counter, 0); no_journal: - EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten"); + /* + * The maximum number of concurrent works can be high and + * concurrency isn't really necessary. Limit it to 1. + */ + EXT4_SB(sb)->dio_unwritten_wq = + alloc_workqueue("ext4-dio-unwritten", WQ_MEM_RECLAIM, 1); if (!EXT4_SB(sb)->dio_unwritten_wq) { printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n"); goto failed_mount_wq; diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 1ef16520b95..25b7387ff18 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -145,10 +145,10 @@ ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, #ifdef CONFIG_EXT4_FS_SECURITY extern int ext4_init_security(handle_t *handle, struct inode *inode, - struct inode *dir); + struct inode *dir, const struct qstr *qstr); #else static inline int ext4_init_security(handle_t *handle, struct inode *inode, - struct inode *dir) + struct inode *dir, const struct qstr *qstr) { return 0; } diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c index 9b21268e121..007c3bfbf09 100644 --- a/fs/ext4/xattr_security.c +++ b/fs/ext4/xattr_security.c @@ -49,14 +49,15 @@ ext4_xattr_security_set(struct dentry *dentry, const char *name, } int -ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir) +ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir, + const struct qstr *qstr) { int err; size_t len; void *value; char *name; - err = security_inode_init_security(inode, dir, &name, &value, &len); + err = security_inode_init_security(inode, dir, qstr, &name, &value, &len); if (err) { if (err == -EOPNOTSUPP) return 0; diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 86753fe10bd..0e277ec4b61 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -757,8 +757,10 @@ fat_encode_fh(struct dentry *de, __u32 *fh, int *lenp, int connectable) struct inode *inode = de->d_inode; u32 ipos_h, ipos_m, ipos_l; - if (len < 5) + if (len < 5) { + *lenp = 5; return 255; /* no room */ + } ipos_h = MSDOS_I(inode)->i_pos >> 8; ipos_m = (MSDOS_I(inode)->i_pos & 0xf0) << 24; diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index f88f752babd..adae3fb7451 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -43,7 +43,7 @@ static int vfat_revalidate_shortname(struct dentry *dentry) static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd) { - if (nd->flags & LOOKUP_RCU) + if (nd && nd->flags & LOOKUP_RCU) return -ECHILD; /* This is not negative dentry. Always valid. */ @@ -54,7 +54,7 @@ static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd) static int vfat_revalidate_ci(struct dentry *dentry, struct nameidata *nd) { - if (nd->flags & LOOKUP_RCU) + if (nd && nd->flags & LOOKUP_RCU) return -ECHILD; /* diff --git a/fs/fcntl.c b/fs/fcntl.c index cb1026181bd..6c82e5bac03 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -131,7 +131,7 @@ SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd) SYSCALL_DEFINE1(dup, unsigned int, fildes) { int ret = -EBADF; - struct file *file = fget(fildes); + struct file *file = fget_raw(fildes); if (file) { ret = get_unused_fd(); @@ -426,15 +426,35 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, return err; } +static int check_fcntl_cmd(unsigned cmd) +{ + switch (cmd) { + case F_DUPFD: + case F_DUPFD_CLOEXEC: + case F_GETFD: + case F_SETFD: + case F_GETFL: + return 1; + } + return 0; +} + SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, unsigned long, arg) { struct file *filp; long err = -EBADF; - filp = fget(fd); + filp = fget_raw(fd); if (!filp) goto out; + if (unlikely(filp->f_mode & FMODE_PATH)) { + if (!check_fcntl_cmd(cmd)) { + fput(filp); + goto out; + } + } + err = security_file_fcntl(filp, cmd, arg); if (err) { fput(filp); @@ -456,10 +476,17 @@ SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, long err; err = -EBADF; - filp = fget(fd); + filp = fget_raw(fd); if (!filp) goto out; + if (unlikely(filp->f_mode & FMODE_PATH)) { + if (!check_fcntl_cmd(cmd)) { + fput(filp); + goto out; + } + } + err = security_file_fcntl(filp, cmd, arg); if (err) { fput(filp); @@ -808,14 +835,14 @@ static int __init fcntl_init(void) * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY * is defined as O_NONBLOCK on some platforms and not on others. */ - BUILD_BUG_ON(18 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32( + BUILD_BUG_ON(19 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32( O_RDONLY | O_WRONLY | O_RDWR | O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC | O_APPEND | /* O_NONBLOCK | */ __O_SYNC | O_DSYNC | FASYNC | O_DIRECT | O_LARGEFILE | O_DIRECTORY | O_NOFOLLOW | O_NOATIME | O_CLOEXEC | - __FMODE_EXEC + __FMODE_EXEC | O_PATH )); fasync_cache = kmem_cache_create("fasync_cache", diff --git a/fs/fhandle.c b/fs/fhandle.c new file mode 100644 index 00000000000..bf93ad2bee0 --- /dev/null +++ b/fs/fhandle.c @@ -0,0 +1,265 @@ +#include <linux/syscalls.h> +#include <linux/slab.h> +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/mount.h> +#include <linux/namei.h> +#include <linux/exportfs.h> +#include <linux/fs_struct.h> +#include <linux/fsnotify.h> +#include <asm/uaccess.h> +#include "internal.h" + +static long do_sys_name_to_handle(struct path *path, + struct file_handle __user *ufh, + int __user *mnt_id) +{ + long retval; + struct file_handle f_handle; + int handle_dwords, handle_bytes; + struct file_handle *handle = NULL; + + /* + * We need t make sure wether the file system + * support decoding of the file handle + */ + if (!path->mnt->mnt_sb->s_export_op || + !path->mnt->mnt_sb->s_export_op->fh_to_dentry) + return -EOPNOTSUPP; + + if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) + return -EFAULT; + + if (f_handle.handle_bytes > MAX_HANDLE_SZ) + return -EINVAL; + + handle = kmalloc(sizeof(struct file_handle) + f_handle.handle_bytes, + GFP_KERNEL); + if (!handle) + return -ENOMEM; + + /* convert handle size to multiple of sizeof(u32) */ + handle_dwords = f_handle.handle_bytes >> 2; + + /* we ask for a non connected handle */ + retval = exportfs_encode_fh(path->dentry, + (struct fid *)handle->f_handle, + &handle_dwords, 0); + handle->handle_type = retval; + /* convert handle size to bytes */ + handle_bytes = handle_dwords * sizeof(u32); + handle->handle_bytes = handle_bytes; + if ((handle->handle_bytes > f_handle.handle_bytes) || + (retval == 255) || (retval == -ENOSPC)) { + /* As per old exportfs_encode_fh documentation + * we could return ENOSPC to indicate overflow + * But file system returned 255 always. So handle + * both the values + */ + /* + * set the handle size to zero so we copy only + * non variable part of the file_handle + */ + handle_bytes = 0; + retval = -EOVERFLOW; + } else + retval = 0; + /* copy the mount id */ + if (copy_to_user(mnt_id, &path->mnt->mnt_id, sizeof(*mnt_id)) || + copy_to_user(ufh, handle, + sizeof(struct file_handle) + handle_bytes)) + retval = -EFAULT; + kfree(handle); + return retval; +} + +/** + * sys_name_to_handle_at: convert name to handle + * @dfd: directory relative to which name is interpreted if not absolute + * @name: name that should be converted to handle. + * @handle: resulting file handle + * @mnt_id: mount id of the file system containing the file + * @flag: flag value to indicate whether to follow symlink or not + * + * @handle->handle_size indicate the space available to store the + * variable part of the file handle in bytes. If there is not + * enough space, the field is updated to return the minimum + * value required. + */ +SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name, + struct file_handle __user *, handle, int __user *, mnt_id, + int, flag) +{ + struct path path; + int lookup_flags; + int err; + + if ((flag & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0) + return -EINVAL; + + lookup_flags = (flag & AT_SYMLINK_FOLLOW) ? LOOKUP_FOLLOW : 0; + if (flag & AT_EMPTY_PATH) + lookup_flags |= LOOKUP_EMPTY; + err = user_path_at(dfd, name, lookup_flags, &path); + if (!err) { + err = do_sys_name_to_handle(&path, handle, mnt_id); + path_put(&path); + } + return err; +} + +static struct vfsmount *get_vfsmount_from_fd(int fd) +{ + struct path path; + + if (fd == AT_FDCWD) { + struct fs_struct *fs = current->fs; + spin_lock(&fs->lock); + path = fs->pwd; + mntget(path.mnt); + spin_unlock(&fs->lock); + } else { + int fput_needed; + struct file *file = fget_light(fd, &fput_needed); + if (!file) + return ERR_PTR(-EBADF); + path = file->f_path; + mntget(path.mnt); + fput_light(file, fput_needed); + } + return path.mnt; +} + +static int vfs_dentry_acceptable(void *context, struct dentry *dentry) +{ + return 1; +} + +static int do_handle_to_path(int mountdirfd, struct file_handle *handle, + struct path *path) +{ + int retval = 0; + int handle_dwords; + + path->mnt = get_vfsmount_from_fd(mountdirfd); + if (IS_ERR(path->mnt)) { + retval = PTR_ERR(path->mnt); + goto out_err; + } + /* change the handle size to multiple of sizeof(u32) */ + handle_dwords = handle->handle_bytes >> 2; + path->dentry = exportfs_decode_fh(path->mnt, + (struct fid *)handle->f_handle, + handle_dwords, handle->handle_type, + vfs_dentry_acceptable, NULL); + if (IS_ERR(path->dentry)) { + retval = PTR_ERR(path->dentry); + goto out_mnt; + } + return 0; +out_mnt: + mntput(path->mnt); +out_err: + return retval; +} + +static int handle_to_path(int mountdirfd, struct file_handle __user *ufh, + struct path *path) +{ + int retval = 0; + struct file_handle f_handle; + struct file_handle *handle = NULL; + + /* + * With handle we don't look at the execute bit on the + * the directory. Ideally we would like CAP_DAC_SEARCH. + * But we don't have that + */ + if (!capable(CAP_DAC_READ_SEARCH)) { + retval = -EPERM; + goto out_err; + } + if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) { + retval = -EFAULT; + goto out_err; + } + if ((f_handle.handle_bytes > MAX_HANDLE_SZ) || + (f_handle.handle_bytes == 0)) { + retval = -EINVAL; + goto out_err; + } + handle = kmalloc(sizeof(struct file_handle) + f_handle.handle_bytes, + GFP_KERNEL); + if (!handle) { + retval = -ENOMEM; + goto out_err; + } + /* copy the full handle */ + if (copy_from_user(handle, ufh, + sizeof(struct file_handle) + + f_handle.handle_bytes)) { + retval = -EFAULT; + goto out_handle; + } + + retval = do_handle_to_path(mountdirfd, handle, path); + +out_handle: + kfree(handle); +out_err: + return retval; +} + +long do_handle_open(int mountdirfd, + struct file_handle __user *ufh, int open_flag) +{ + long retval = 0; + struct path path; + struct file *file; + int fd; + + retval = handle_to_path(mountdirfd, ufh, &path); + if (retval) + return retval; + + fd = get_unused_fd_flags(open_flag); + if (fd < 0) { + path_put(&path); + return fd; + } + file = file_open_root(path.dentry, path.mnt, "", open_flag); + if (IS_ERR(file)) { + put_unused_fd(fd); + retval = PTR_ERR(file); + } else { + retval = fd; + fsnotify_open(file); + fd_install(fd, file); + } + path_put(&path); + return retval; +} + +/** + * sys_open_by_handle_at: Open the file handle + * @mountdirfd: directory file descriptor + * @handle: file handle to be opened + * @flag: open flags. + * + * @mountdirfd indicate the directory file descriptor + * of the mount point. file handle is decoded relative + * to the vfsmount pointed by the @mountdirfd. @flags + * value is same as the open(2) flags. + */ +SYSCALL_DEFINE3(open_by_handle_at, int, mountdirfd, + struct file_handle __user *, handle, + int, flags) +{ + long ret; + + if (force_o_largefile()) + flags |= O_LARGEFILE; + + ret = do_handle_open(mountdirfd, handle, flags); + return ret; +} diff --git a/fs/file_table.c b/fs/file_table.c index eb36b6b17e2..01e4c1e8e6b 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -190,7 +190,8 @@ struct file *alloc_file(struct path *path, fmode_t mode, file_take_write(file); WARN_ON(mnt_clone_write(path->mnt)); } - ima_counts_get(file); + if ((mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) + i_readcount_inc(path->dentry->d_inode); return file; } EXPORT_SYMBOL(alloc_file); @@ -246,11 +247,15 @@ static void __fput(struct file *file) file->f_op->release(inode, file); security_file_free(file); ima_file_free(file); - if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL)) + if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL && + !(file->f_mode & FMODE_PATH))) { cdev_put(inode->i_cdev); + } fops_put(file->f_op); put_pid(file->f_owner.pid); file_sb_list_del(file); + if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) + i_readcount_dec(inode); if (file->f_mode & FMODE_WRITE) drop_file_write_access(file); file->f_path.dentry = NULL; @@ -276,11 +281,10 @@ struct file *fget(unsigned int fd) rcu_read_lock(); file = fcheck_files(files, fd); if (file) { - if (!atomic_long_inc_not_zero(&file->f_count)) { - /* File object ref couldn't be taken */ - rcu_read_unlock(); - return NULL; - } + /* File object ref couldn't be taken */ + if (file->f_mode & FMODE_PATH || + !atomic_long_inc_not_zero(&file->f_count)) + file = NULL; } rcu_read_unlock(); @@ -289,6 +293,25 @@ struct file *fget(unsigned int fd) EXPORT_SYMBOL(fget); +struct file *fget_raw(unsigned int fd) +{ + struct file *file; + struct files_struct *files = current->files; + + rcu_read_lock(); + file = fcheck_files(files, fd); + if (file) { + /* File object ref couldn't be taken */ + if (!atomic_long_inc_not_zero(&file->f_count)) + file = NULL; + } + rcu_read_unlock(); + + return file; +} + +EXPORT_SYMBOL(fget_raw); + /* * Lightweight file lookup - no refcnt increment if fd table isn't shared. * @@ -313,6 +336,33 @@ struct file *fget_light(unsigned int fd, int *fput_needed) *fput_needed = 0; if (atomic_read(&files->count) == 1) { file = fcheck_files(files, fd); + if (file && (file->f_mode & FMODE_PATH)) + file = NULL; + } else { + rcu_read_lock(); + file = fcheck_files(files, fd); + if (file) { + if (!(file->f_mode & FMODE_PATH) && + atomic_long_inc_not_zero(&file->f_count)) + *fput_needed = 1; + else + /* Didn't get the reference, someone's freed */ + file = NULL; + } + rcu_read_unlock(); + } + + return file; +} + +struct file *fget_raw_light(unsigned int fd, int *fput_needed) +{ + struct file *file; + struct files_struct *files = current->files; + + *fput_needed = 0; + if (atomic_read(&files->count) == 1) { + file = fcheck_files(files, fd); } else { rcu_read_lock(); file = fcheck_files(files, fd); diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index 3e87cce5837..7c39b885f96 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -458,7 +458,7 @@ static void cuse_fc_release(struct fuse_conn *fc) * @file: file struct being opened * * Userland CUSE server can create a CUSE device by opening /dev/cuse - * and replying to the initilaization request kernel sends. This + * and replying to the initialization request kernel sends. This * function is responsible for handling CUSE device initialization. * Because the fd opened by this function is used during * initialization, this function only creates cuse_conn and sends diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 83543b5ff94..8bd0ef9286c 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -158,7 +158,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd) { struct inode *inode; - if (nd->flags & LOOKUP_RCU) + if (nd && nd->flags & LOOKUP_RCU) return -ECHILD; inode = entry->d_inode; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 9e3f68cc1bd..051b1a08452 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -637,8 +637,10 @@ static int fuse_encode_fh(struct dentry *dentry, u32 *fh, int *max_len, u64 nodeid; u32 generation; - if (*max_len < len) + if (*max_len < len) { + *max_len = len; return 255; + } nodeid = get_fuse_inode(inode)->nodeid; generation = inode->i_generation; diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index 7118f1a780a..cbc07155b1a 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -80,8 +80,11 @@ int gfs2_check_acl(struct inode *inode, int mask, unsigned int flags) struct posix_acl *acl; int error; - if (flags & IPERM_FLAG_RCU) - return -ECHILD; + if (flags & IPERM_FLAG_RCU) { + if (!negative_cached_acl(inode, ACL_TYPE_ACCESS)) + return -ECHILD; + return -EAGAIN; + } acl = gfs2_acl_get(GFS2_I(inode), ACL_TYPE_ACCESS); if (IS_ERR(acl)) diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 4f36f8832b9..aad77e4f61b 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -695,6 +695,7 @@ out: if (error == 0) return 0; + unlock_page(page); page_cache_release(page); gfs2_trans_end(sdp); diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 3c4039d5eef..ef3dc4b9fae 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -21,6 +21,7 @@ #include "meta_io.h" #include "quota.h" #include "rgrp.h" +#include "super.h" #include "trans.h" #include "dir.h" #include "util.h" @@ -757,7 +758,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_rgrp_list rlist; u64 bn, bstart; - u32 blen; + u32 blen, btotal; __be64 *p; unsigned int rg_blocks = 0; int metadata; @@ -839,6 +840,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, bstart = 0; blen = 0; + btotal = 0; for (p = top; p < bottom; p++) { if (!*p) @@ -851,9 +853,11 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, else { if (bstart) { if (metadata) - gfs2_free_meta(ip, bstart, blen); + __gfs2_free_meta(ip, bstart, blen); else - gfs2_free_data(ip, bstart, blen); + __gfs2_free_data(ip, bstart, blen); + + btotal += blen; } bstart = bn; @@ -865,11 +869,17 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, } if (bstart) { if (metadata) - gfs2_free_meta(ip, bstart, blen); + __gfs2_free_meta(ip, bstart, blen); else - gfs2_free_data(ip, bstart, blen); + __gfs2_free_data(ip, bstart, blen); + + btotal += blen; } + gfs2_statfs_change(sdp, 0, +btotal, 0); + gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid, + ip->i_inode.i_gid); + ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; gfs2_dinode_out(ip, dibh->b_data); diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c index 4a456338b87..0da8da2c991 100644 --- a/fs/gfs2/dentry.c +++ b/fs/gfs2/dentry.c @@ -44,7 +44,7 @@ static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd) int error; int had_lock = 0; - if (nd->flags & LOOKUP_RCU) + if (nd && nd->flags & LOOKUP_RCU) return -ECHILD; parent = dget_parent(dentry); diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c index 9023db8184f..b5a5e60df0d 100644 --- a/fs/gfs2/export.c +++ b/fs/gfs2/export.c @@ -36,9 +36,13 @@ static int gfs2_encode_fh(struct dentry *dentry, __u32 *p, int *len, struct super_block *sb = inode->i_sb; struct gfs2_inode *ip = GFS2_I(inode); - if (*len < GFS2_SMALL_FH_SIZE || - (connectable && *len < GFS2_LARGE_FH_SIZE)) + if (connectable && (*len < GFS2_LARGE_FH_SIZE)) { + *len = GFS2_LARGE_FH_SIZE; return 255; + } else if (*len < GFS2_SMALL_FH_SIZE) { + *len = GFS2_SMALL_FH_SIZE; + return 255; + } fh[0] = cpu_to_be32(ip->i_no_formal_ino >> 32); fh[1] = cpu_to_be32(ip->i_no_formal_ino & 0xFFFFFFFF); diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 7cfdcb91336..4074b952b05 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -448,15 +448,20 @@ static int gfs2_mmap(struct file *file, struct vm_area_struct *vma) { struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); - if (!(file->f_flags & O_NOATIME)) { + if (!(file->f_flags & O_NOATIME) && + !IS_NOATIME(&ip->i_inode)) { struct gfs2_holder i_gh; int error; - gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh); + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); error = gfs2_glock_nq(&i_gh); - file_accessed(file); - if (error == 0) - gfs2_glock_dq_uninit(&i_gh); + if (error == 0) { + file_accessed(file); + gfs2_glock_dq(&i_gh); + } + gfs2_holder_uninit(&i_gh); + if (error) + return error; } vma->vm_ops = &gfs2_vm_ops; vma->vm_flags |= VM_CAN_NONLINEAR; @@ -617,8 +622,7 @@ static void empty_write_end(struct page *page, unsigned from, { struct gfs2_inode *ip = GFS2_I(page->mapping->host); - page_zero_new_buffers(page, from, to); - flush_dcache_page(page); + zero_user(page, from, to-from); mark_page_accessed(page); if (!gfs2_is_writeback(ip)) @@ -627,36 +631,43 @@ static void empty_write_end(struct page *page, unsigned from, block_commit_write(page, from, to); } -static int write_empty_blocks(struct page *page, unsigned from, unsigned to) +static int needs_empty_write(sector_t block, struct inode *inode) { - unsigned start, end, next; - struct buffer_head *bh, *head; int error; + struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 }; - if (!page_has_buffers(page)) { - error = __block_write_begin(page, from, to - from, gfs2_block_map); - if (unlikely(error)) - return error; + bh_map.b_size = 1 << inode->i_blkbits; + error = gfs2_block_map(inode, block, &bh_map, 0); + if (unlikely(error)) + return error; + return !buffer_mapped(&bh_map); +} - empty_write_end(page, from, to); - return 0; - } +static int write_empty_blocks(struct page *page, unsigned from, unsigned to) +{ + struct inode *inode = page->mapping->host; + unsigned start, end, next, blksize; + sector_t block = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + int ret; - bh = head = page_buffers(page); + blksize = 1 << inode->i_blkbits; next = end = 0; while (next < from) { - next += bh->b_size; - bh = bh->b_this_page; + next += blksize; + block++; } start = next; do { - next += bh->b_size; - if (buffer_mapped(bh)) { + next += blksize; + ret = needs_empty_write(block, inode); + if (unlikely(ret < 0)) + return ret; + if (ret == 0) { if (end) { - error = __block_write_begin(page, start, end - start, - gfs2_block_map); - if (unlikely(error)) - return error; + ret = __block_write_begin(page, start, end - start, + gfs2_block_map); + if (unlikely(ret)) + return ret; empty_write_end(page, start, end); end = 0; } @@ -664,13 +675,13 @@ static int write_empty_blocks(struct page *page, unsigned from, unsigned to) } else end = next; - bh = bh->b_this_page; + block++; } while (next < to); if (end) { - error = __block_write_begin(page, start, end - start, gfs2_block_map); - if (unlikely(error)) - return error; + ret = __block_write_begin(page, start, end - start, gfs2_block_map); + if (unlikely(ret)) + return ret; empty_write_end(page, start, end); } @@ -976,8 +987,10 @@ static void do_unflock(struct file *file, struct file_lock *fl) mutex_lock(&fp->f_fl_mutex); flock_lock_file_wait(file, fl); - if (fl_gh->gh_gl) - gfs2_glock_dq_uninit(fl_gh); + if (fl_gh->gh_gl) { + gfs2_glock_dq_wait(fl_gh); + gfs2_holder_uninit(fl_gh); + } mutex_unlock(&fp->f_fl_mutex); } diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 7cd9a5a68d5..e2431313491 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -26,6 +26,9 @@ #include <linux/freezer.h> #include <linux/workqueue.h> #include <linux/jiffies.h> +#include <linux/rcupdate.h> +#include <linux/rculist_bl.h> +#include <linux/bit_spinlock.h> #include "gfs2.h" #include "incore.h" @@ -41,10 +44,6 @@ #define CREATE_TRACE_POINTS #include "trace_gfs2.h" -struct gfs2_gl_hash_bucket { - struct hlist_head hb_list; -}; - struct gfs2_glock_iter { int hash; /* hash bucket index */ struct gfs2_sbd *sdp; /* incore superblock */ @@ -54,7 +53,6 @@ struct gfs2_glock_iter { typedef void (*glock_examiner) (struct gfs2_glock * gl); -static int gfs2_dump_lockstate(struct gfs2_sbd *sdp); static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl); #define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { __dump_glock(NULL, gl); BUG(); } } while(0) static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target); @@ -70,57 +68,9 @@ static DEFINE_SPINLOCK(lru_lock); #define GFS2_GL_HASH_SIZE (1 << GFS2_GL_HASH_SHIFT) #define GFS2_GL_HASH_MASK (GFS2_GL_HASH_SIZE - 1) -static struct gfs2_gl_hash_bucket gl_hash_table[GFS2_GL_HASH_SIZE]; +static struct hlist_bl_head gl_hash_table[GFS2_GL_HASH_SIZE]; static struct dentry *gfs2_root; -/* - * Despite what you might think, the numbers below are not arbitrary :-) - * They are taken from the ipv4 routing hash code, which is well tested - * and thus should be nearly optimal. Later on we might tweek the numbers - * but for now this should be fine. - * - * The reason for putting the locks in a separate array from the list heads - * is that we can have fewer locks than list heads and save memory. We use - * the same hash function for both, but with a different hash mask. - */ -#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \ - defined(CONFIG_PROVE_LOCKING) - -#ifdef CONFIG_LOCKDEP -# define GL_HASH_LOCK_SZ 256 -#else -# if NR_CPUS >= 32 -# define GL_HASH_LOCK_SZ 4096 -# elif NR_CPUS >= 16 -# define GL_HASH_LOCK_SZ 2048 -# elif NR_CPUS >= 8 -# define GL_HASH_LOCK_SZ 1024 -# elif NR_CPUS >= 4 -# define GL_HASH_LOCK_SZ 512 -# else -# define GL_HASH_LOCK_SZ 256 -# endif -#endif - -/* We never want more locks than chains */ -#if GFS2_GL_HASH_SIZE < GL_HASH_LOCK_SZ -# undef GL_HASH_LOCK_SZ -# define GL_HASH_LOCK_SZ GFS2_GL_HASH_SIZE -#endif - -static rwlock_t gl_hash_locks[GL_HASH_LOCK_SZ]; - -static inline rwlock_t *gl_lock_addr(unsigned int x) -{ - return &gl_hash_locks[x & (GL_HASH_LOCK_SZ-1)]; -} -#else /* not SMP, so no spinlocks required */ -static inline rwlock_t *gl_lock_addr(unsigned int x) -{ - return NULL; -} -#endif - /** * gl_hash() - Turn glock number into hash bucket number * @lock: The glock number @@ -141,25 +91,35 @@ static unsigned int gl_hash(const struct gfs2_sbd *sdp, return h; } -/** - * glock_free() - Perform a few checks and then release struct gfs2_glock - * @gl: The glock to release - * - * Also calls lock module to release its internal structure for this glock. - * - */ +static inline void spin_lock_bucket(unsigned int hash) +{ + struct hlist_bl_head *bl = &gl_hash_table[hash]; + bit_spin_lock(0, (unsigned long *)bl); +} -static void glock_free(struct gfs2_glock *gl) +static inline void spin_unlock_bucket(unsigned int hash) +{ + struct hlist_bl_head *bl = &gl_hash_table[hash]; + __bit_spin_unlock(0, (unsigned long *)bl); +} + +static void gfs2_glock_dealloc(struct rcu_head *rcu) +{ + struct gfs2_glock *gl = container_of(rcu, struct gfs2_glock, gl_rcu); + + if (gl->gl_ops->go_flags & GLOF_ASPACE) + kmem_cache_free(gfs2_glock_aspace_cachep, gl); + else + kmem_cache_free(gfs2_glock_cachep, gl); +} + +void gfs2_glock_free(struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_sbd; - struct address_space *mapping = gfs2_glock2aspace(gl); - struct kmem_cache *cachep = gfs2_glock_cachep; - GLOCK_BUG_ON(gl, mapping && mapping->nrpages); - trace_gfs2_glock_put(gl); - if (mapping) - cachep = gfs2_glock_aspace_cachep; - sdp->sd_lockstruct.ls_ops->lm_put_lock(cachep, gl); + call_rcu(&gl->gl_rcu, gfs2_glock_dealloc); + if (atomic_dec_and_test(&sdp->sd_glock_disposal)) + wake_up(&sdp->sd_glock_wait); } /** @@ -185,34 +145,49 @@ static int demote_ok(const struct gfs2_glock *gl) { const struct gfs2_glock_operations *glops = gl->gl_ops; + /* assert_spin_locked(&gl->gl_spin); */ + if (gl->gl_state == LM_ST_UNLOCKED) return 0; - if (!list_empty(&gl->gl_holders)) + if (test_bit(GLF_LFLUSH, &gl->gl_flags)) + return 0; + if ((gl->gl_name.ln_type != LM_TYPE_INODE) && + !list_empty(&gl->gl_holders)) return 0; if (glops->go_demote_ok) return glops->go_demote_ok(gl); return 1; } + /** - * gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list + * __gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list * @gl: the glock * + * If the glock is demotable, then we add it (or move it) to the end + * of the glock LRU list. */ -static void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl) +static void __gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl) { - int may_reclaim; - may_reclaim = (demote_ok(gl) && - (atomic_read(&gl->gl_ref) == 1 || - (gl->gl_name.ln_type == LM_TYPE_INODE && - atomic_read(&gl->gl_ref) <= 2))); - spin_lock(&lru_lock); - if (list_empty(&gl->gl_lru) && may_reclaim) { + if (demote_ok(gl)) { + spin_lock(&lru_lock); + + if (!list_empty(&gl->gl_lru)) + list_del_init(&gl->gl_lru); + else + atomic_inc(&lru_count); + list_add_tail(&gl->gl_lru, &lru_list); - atomic_inc(&lru_count); + spin_unlock(&lru_lock); } - spin_unlock(&lru_lock); +} + +void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl) +{ + spin_lock(&gl->gl_spin); + __gfs2_glock_schedule_for_reclaim(gl); + spin_unlock(&gl->gl_spin); } /** @@ -227,7 +202,6 @@ void gfs2_glock_put_nolock(struct gfs2_glock *gl) { if (atomic_dec_and_test(&gl->gl_ref)) GLOCK_BUG_ON(gl, 1); - gfs2_glock_schedule_for_reclaim(gl); } /** @@ -236,30 +210,26 @@ void gfs2_glock_put_nolock(struct gfs2_glock *gl) * */ -int gfs2_glock_put(struct gfs2_glock *gl) +void gfs2_glock_put(struct gfs2_glock *gl) { - int rv = 0; + struct gfs2_sbd *sdp = gl->gl_sbd; + struct address_space *mapping = gfs2_glock2aspace(gl); - write_lock(gl_lock_addr(gl->gl_hash)); - if (atomic_dec_and_lock(&gl->gl_ref, &lru_lock)) { - hlist_del(&gl->gl_list); + if (atomic_dec_and_test(&gl->gl_ref)) { + spin_lock_bucket(gl->gl_hash); + hlist_bl_del_rcu(&gl->gl_list); + spin_unlock_bucket(gl->gl_hash); + spin_lock(&lru_lock); if (!list_empty(&gl->gl_lru)) { list_del_init(&gl->gl_lru); atomic_dec(&lru_count); } spin_unlock(&lru_lock); - write_unlock(gl_lock_addr(gl->gl_hash)); GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders)); - glock_free(gl); - rv = 1; - goto out; + GLOCK_BUG_ON(gl, mapping && mapping->nrpages); + trace_gfs2_glock_put(gl); + sdp->sd_lockstruct.ls_ops->lm_put_lock(gl); } - spin_lock(&gl->gl_spin); - gfs2_glock_schedule_for_reclaim(gl); - spin_unlock(&gl->gl_spin); - write_unlock(gl_lock_addr(gl->gl_hash)); -out: - return rv; } /** @@ -275,17 +245,15 @@ static struct gfs2_glock *search_bucket(unsigned int hash, const struct lm_lockname *name) { struct gfs2_glock *gl; - struct hlist_node *h; + struct hlist_bl_node *h; - hlist_for_each_entry(gl, h, &gl_hash_table[hash].hb_list, gl_list) { + hlist_bl_for_each_entry_rcu(gl, h, &gl_hash_table[hash], gl_list) { if (!lm_name_equal(&gl->gl_name, name)) continue; if (gl->gl_sbd != sdp) continue; - - atomic_inc(&gl->gl_ref); - - return gl; + if (atomic_inc_not_zero(&gl->gl_ref)) + return gl; } return NULL; @@ -743,10 +711,11 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, struct gfs2_glock *gl, *tmp; unsigned int hash = gl_hash(sdp, &name); struct address_space *mapping; + struct kmem_cache *cachep; - read_lock(gl_lock_addr(hash)); + rcu_read_lock(); gl = search_bucket(hash, sdp, &name); - read_unlock(gl_lock_addr(hash)); + rcu_read_unlock(); *glp = gl; if (gl) @@ -755,9 +724,10 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, return -ENOENT; if (glops->go_flags & GLOF_ASPACE) - gl = kmem_cache_alloc(gfs2_glock_aspace_cachep, GFP_KERNEL); + cachep = gfs2_glock_aspace_cachep; else - gl = kmem_cache_alloc(gfs2_glock_cachep, GFP_KERNEL); + cachep = gfs2_glock_cachep; + gl = kmem_cache_alloc(cachep, GFP_KERNEL); if (!gl) return -ENOMEM; @@ -790,15 +760,16 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, mapping->writeback_index = 0; } - write_lock(gl_lock_addr(hash)); + spin_lock_bucket(hash); tmp = search_bucket(hash, sdp, &name); if (tmp) { - write_unlock(gl_lock_addr(hash)); - glock_free(gl); + spin_unlock_bucket(hash); + kmem_cache_free(cachep, gl); + atomic_dec(&sdp->sd_glock_disposal); gl = tmp; } else { - hlist_add_head(&gl->gl_list, &gl_hash_table[hash].hb_list); - write_unlock(gl_lock_addr(hash)); + hlist_bl_add_head_rcu(&gl->gl_list, &gl_hash_table[hash]); + spin_unlock_bucket(hash); } *glp = gl; @@ -1007,13 +978,13 @@ fail: insert_pt = &gh2->gh_list; } set_bit(GLF_QUEUED, &gl->gl_flags); + trace_gfs2_glock_queue(gh, 1); if (likely(insert_pt == NULL)) { list_add_tail(&gh->gh_list, &gl->gl_holders); if (unlikely(gh->gh_flags & LM_FLAG_PRIORITY)) goto do_cancel; return; } - trace_gfs2_glock_queue(gh, 1); list_add_tail(&gh->gh_list, insert_pt); do_cancel: gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list); @@ -1113,6 +1084,7 @@ void gfs2_glock_dq(struct gfs2_holder *gh) !test_bit(GLF_DEMOTE, &gl->gl_flags)) fast_path = 1; } + __gfs2_glock_schedule_for_reclaim(gl); trace_gfs2_glock_queue(gh, 0); spin_unlock(&gl->gl_spin); if (likely(fast_path)) @@ -1276,10 +1248,8 @@ int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs) void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs) { - unsigned int x; - - for (x = 0; x < num_gh; x++) - gfs2_glock_dq(&ghs[x]); + while (num_gh--) + gfs2_glock_dq(&ghs[num_gh]); } /** @@ -1291,10 +1261,8 @@ void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs) void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs) { - unsigned int x; - - for (x = 0; x < num_gh; x++) - gfs2_glock_dq_uninit(&ghs[x]); + while (num_gh--) + gfs2_glock_dq_uninit(&ghs[num_gh]); } void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state) @@ -1440,42 +1408,30 @@ static struct shrinker glock_shrinker = { * @sdp: the filesystem * @bucket: the bucket * - * Returns: 1 if the bucket has entries */ -static int examine_bucket(glock_examiner examiner, struct gfs2_sbd *sdp, +static void examine_bucket(glock_examiner examiner, const struct gfs2_sbd *sdp, unsigned int hash) { - struct gfs2_glock *gl, *prev = NULL; - int has_entries = 0; - struct hlist_head *head = &gl_hash_table[hash].hb_list; + struct gfs2_glock *gl; + struct hlist_bl_head *head = &gl_hash_table[hash]; + struct hlist_bl_node *pos; - read_lock(gl_lock_addr(hash)); - /* Can't use hlist_for_each_entry - don't want prefetch here */ - if (hlist_empty(head)) - goto out; - gl = list_entry(head->first, struct gfs2_glock, gl_list); - while(1) { - if (!sdp || gl->gl_sbd == sdp) { - gfs2_glock_hold(gl); - read_unlock(gl_lock_addr(hash)); - if (prev) - gfs2_glock_put(prev); - prev = gl; + rcu_read_lock(); + hlist_bl_for_each_entry_rcu(gl, pos, head, gl_list) { + if ((gl->gl_sbd == sdp) && atomic_read(&gl->gl_ref)) examiner(gl); - has_entries = 1; - read_lock(gl_lock_addr(hash)); - } - if (gl->gl_list.next == NULL) - break; - gl = list_entry(gl->gl_list.next, struct gfs2_glock, gl_list); } -out: - read_unlock(gl_lock_addr(hash)); - if (prev) - gfs2_glock_put(prev); + rcu_read_unlock(); cond_resched(); - return has_entries; +} + +static void glock_hash_walk(glock_examiner examiner, const struct gfs2_sbd *sdp) +{ + unsigned x; + + for (x = 0; x < GFS2_GL_HASH_SIZE; x++) + examine_bucket(examiner, sdp, x); } @@ -1529,10 +1485,21 @@ static void clear_glock(struct gfs2_glock *gl) void gfs2_glock_thaw(struct gfs2_sbd *sdp) { - unsigned x; + glock_hash_walk(thaw_glock, sdp); +} - for (x = 0; x < GFS2_GL_HASH_SIZE; x++) - examine_bucket(thaw_glock, sdp, x); +static int dump_glock(struct seq_file *seq, struct gfs2_glock *gl) +{ + int ret; + spin_lock(&gl->gl_spin); + ret = __dump_glock(seq, gl); + spin_unlock(&gl->gl_spin); + return ret; +} + +static void dump_glock_func(struct gfs2_glock *gl) +{ + dump_glock(NULL, gl); } /** @@ -1545,13 +1512,10 @@ void gfs2_glock_thaw(struct gfs2_sbd *sdp) void gfs2_gl_hash_clear(struct gfs2_sbd *sdp) { - unsigned int x; - - for (x = 0; x < GFS2_GL_HASH_SIZE; x++) - examine_bucket(clear_glock, sdp, x); + glock_hash_walk(clear_glock, sdp); flush_workqueue(glock_workqueue); wait_event(sdp->sd_glock_wait, atomic_read(&sdp->sd_glock_disposal) == 0); - gfs2_dump_lockstate(sdp); + glock_hash_walk(dump_glock_func, sdp); } void gfs2_glock_finish_truncate(struct gfs2_inode *ip) @@ -1717,66 +1681,15 @@ out: return error; } -static int dump_glock(struct seq_file *seq, struct gfs2_glock *gl) -{ - int ret; - spin_lock(&gl->gl_spin); - ret = __dump_glock(seq, gl); - spin_unlock(&gl->gl_spin); - return ret; -} -/** - * gfs2_dump_lockstate - print out the current lockstate - * @sdp: the filesystem - * @ub: the buffer to copy the information into - * - * If @ub is NULL, dump the lockstate to the console. - * - */ - -static int gfs2_dump_lockstate(struct gfs2_sbd *sdp) -{ - struct gfs2_glock *gl; - struct hlist_node *h; - unsigned int x; - int error = 0; - - for (x = 0; x < GFS2_GL_HASH_SIZE; x++) { - - read_lock(gl_lock_addr(x)); - - hlist_for_each_entry(gl, h, &gl_hash_table[x].hb_list, gl_list) { - if (gl->gl_sbd != sdp) - continue; - - error = dump_glock(NULL, gl); - if (error) - break; - } - - read_unlock(gl_lock_addr(x)); - - if (error) - break; - } - - - return error; -} int __init gfs2_glock_init(void) { unsigned i; for(i = 0; i < GFS2_GL_HASH_SIZE; i++) { - INIT_HLIST_HEAD(&gl_hash_table[i].hb_list); - } -#ifdef GL_HASH_LOCK_SZ - for(i = 0; i < GL_HASH_LOCK_SZ; i++) { - rwlock_init(&gl_hash_locks[i]); + INIT_HLIST_BL_HEAD(&gl_hash_table[i]); } -#endif glock_workqueue = alloc_workqueue("glock_workqueue", WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_FREEZABLE, 0); @@ -1802,62 +1715,54 @@ void gfs2_glock_exit(void) destroy_workqueue(gfs2_delete_workqueue); } +static inline struct gfs2_glock *glock_hash_chain(unsigned hash) +{ + return hlist_bl_entry(hlist_bl_first_rcu(&gl_hash_table[hash]), + struct gfs2_glock, gl_list); +} + +static inline struct gfs2_glock *glock_hash_next(struct gfs2_glock *gl) +{ + return hlist_bl_entry(rcu_dereference(gl->gl_list.next), + struct gfs2_glock, gl_list); +} + static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi) { struct gfs2_glock *gl; -restart: - read_lock(gl_lock_addr(gi->hash)); - gl = gi->gl; - if (gl) { - gi->gl = hlist_entry(gl->gl_list.next, - struct gfs2_glock, gl_list); - } else { - gi->gl = hlist_entry(gl_hash_table[gi->hash].hb_list.first, - struct gfs2_glock, gl_list); - } - if (gi->gl) - gfs2_glock_hold(gi->gl); - read_unlock(gl_lock_addr(gi->hash)); - if (gl) - gfs2_glock_put(gl); - while (gi->gl == NULL) { - gi->hash++; - if (gi->hash >= GFS2_GL_HASH_SIZE) - return 1; - read_lock(gl_lock_addr(gi->hash)); - gi->gl = hlist_entry(gl_hash_table[gi->hash].hb_list.first, - struct gfs2_glock, gl_list); - if (gi->gl) - gfs2_glock_hold(gi->gl); - read_unlock(gl_lock_addr(gi->hash)); - } - - if (gi->sdp != gi->gl->gl_sbd) - goto restart; + do { + gl = gi->gl; + if (gl) { + gi->gl = glock_hash_next(gl); + } else { + gi->gl = glock_hash_chain(gi->hash); + } + while (gi->gl == NULL) { + gi->hash++; + if (gi->hash >= GFS2_GL_HASH_SIZE) { + rcu_read_unlock(); + return 1; + } + gi->gl = glock_hash_chain(gi->hash); + } + /* Skip entries for other sb and dead entries */ + } while (gi->sdp != gi->gl->gl_sbd || atomic_read(&gi->gl->gl_ref) == 0); return 0; } -static void gfs2_glock_iter_free(struct gfs2_glock_iter *gi) -{ - if (gi->gl) - gfs2_glock_put(gi->gl); - gi->gl = NULL; -} - static void *gfs2_glock_seq_start(struct seq_file *seq, loff_t *pos) { struct gfs2_glock_iter *gi = seq->private; loff_t n = *pos; gi->hash = 0; + rcu_read_lock(); do { - if (gfs2_glock_iter_next(gi)) { - gfs2_glock_iter_free(gi); + if (gfs2_glock_iter_next(gi)) return NULL; - } } while (n--); return gi->gl; @@ -1870,10 +1775,8 @@ static void *gfs2_glock_seq_next(struct seq_file *seq, void *iter_ptr, (*pos)++; - if (gfs2_glock_iter_next(gi)) { - gfs2_glock_iter_free(gi); + if (gfs2_glock_iter_next(gi)) return NULL; - } return gi->gl; } @@ -1881,7 +1784,10 @@ static void *gfs2_glock_seq_next(struct seq_file *seq, void *iter_ptr, static void gfs2_glock_seq_stop(struct seq_file *seq, void *iter_ptr) { struct gfs2_glock_iter *gi = seq->private; - gfs2_glock_iter_free(gi); + + if (gi->gl) + rcu_read_unlock(); + gi->gl = NULL; } static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr) diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index 691851ceb61..aea160690e9 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -118,7 +118,7 @@ struct lm_lockops { int (*lm_mount) (struct gfs2_sbd *sdp, const char *fsname); void (*lm_unmount) (struct gfs2_sbd *sdp); void (*lm_withdraw) (struct gfs2_sbd *sdp); - void (*lm_put_lock) (struct kmem_cache *cachep, struct gfs2_glock *gl); + void (*lm_put_lock) (struct gfs2_glock *gl); int (*lm_lock) (struct gfs2_glock *gl, unsigned int req_state, unsigned int flags); void (*lm_cancel) (struct gfs2_glock *gl); @@ -174,7 +174,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, int create, struct gfs2_glock **glp); void gfs2_glock_hold(struct gfs2_glock *gl); void gfs2_glock_put_nolock(struct gfs2_glock *gl); -int gfs2_glock_put(struct gfs2_glock *gl); +void gfs2_glock_put(struct gfs2_glock *gl); void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags, struct gfs2_holder *gh); void gfs2_holder_reinit(unsigned int state, unsigned flags, @@ -223,25 +223,22 @@ static inline int gfs2_glock_nq_init(struct gfs2_glock *gl, return error; } -/* Lock Value Block functions */ - -int gfs2_lvb_hold(struct gfs2_glock *gl); -void gfs2_lvb_unhold(struct gfs2_glock *gl); - -void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state); -void gfs2_glock_complete(struct gfs2_glock *gl, int ret); -void gfs2_reclaim_glock(struct gfs2_sbd *sdp); -void gfs2_gl_hash_clear(struct gfs2_sbd *sdp); -void gfs2_glock_finish_truncate(struct gfs2_inode *ip); -void gfs2_glock_thaw(struct gfs2_sbd *sdp); - -int __init gfs2_glock_init(void); -void gfs2_glock_exit(void); - -int gfs2_create_debugfs_file(struct gfs2_sbd *sdp); -void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp); -int gfs2_register_debugfs(void); -void gfs2_unregister_debugfs(void); +extern void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state); +extern void gfs2_glock_complete(struct gfs2_glock *gl, int ret); +extern void gfs2_reclaim_glock(struct gfs2_sbd *sdp); +extern void gfs2_gl_hash_clear(struct gfs2_sbd *sdp); +extern void gfs2_glock_finish_truncate(struct gfs2_inode *ip); +extern void gfs2_glock_thaw(struct gfs2_sbd *sdp); +extern void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl); +extern void gfs2_glock_free(struct gfs2_glock *gl); + +extern int __init gfs2_glock_init(void); +extern void gfs2_glock_exit(void); + +extern int gfs2_create_debugfs_file(struct gfs2_sbd *sdp); +extern void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp); +extern int gfs2_register_debugfs(void); +extern void gfs2_unregister_debugfs(void); extern const struct lm_lockops gfs2_dlm_ops; diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 263561bf1a5..3754e3cbf02 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -56,20 +56,26 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl) BUG_ON(current->journal_info); current->journal_info = &tr; - gfs2_log_lock(sdp); + spin_lock(&sdp->sd_ail_lock); while (!list_empty(head)) { bd = list_entry(head->next, struct gfs2_bufdata, bd_ail_gl_list); bh = bd->bd_bh; gfs2_remove_from_ail(bd); + spin_unlock(&sdp->sd_ail_lock); + bd->bd_bh = NULL; bh->b_private = NULL; bd->bd_blkno = bh->b_blocknr; + gfs2_log_lock(sdp); gfs2_assert_withdraw(sdp, !buffer_busy(bh)); gfs2_trans_add_revoke(sdp, bd); + gfs2_log_unlock(sdp); + + spin_lock(&sdp->sd_ail_lock); } gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count)); - gfs2_log_unlock(sdp); + spin_unlock(&sdp->sd_ail_lock); gfs2_trans_end(sdp); gfs2_log_flush(sdp, NULL); @@ -206,8 +212,17 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags) static int inode_go_demote_ok(const struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_sbd; + struct gfs2_holder *gh; + if (sdp->sd_jindex == gl->gl_object || sdp->sd_rindex == gl->gl_object) return 0; + + if (!list_empty(&gl->gl_holders)) { + gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list); + if (gh->gh_list.next != &gl->gl_holders) + return 0; + } + return 1; } @@ -272,19 +287,6 @@ static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl) } /** - * rgrp_go_demote_ok - Check to see if it's ok to unlock a RG's glock - * @gl: the glock - * - * Returns: 1 if it's ok - */ - -static int rgrp_go_demote_ok(const struct gfs2_glock *gl) -{ - const struct address_space *mapping = (const struct address_space *)(gl + 1); - return !mapping->nrpages; -} - -/** * rgrp_go_lock - operation done after an rgrp lock is locked by * a first holder on this node. * @gl: the glock @@ -410,7 +412,6 @@ const struct gfs2_glock_operations gfs2_inode_glops = { const struct gfs2_glock_operations gfs2_rgrp_glops = { .go_xmote_th = rgrp_go_sync, .go_inval = rgrp_go_inval, - .go_demote_ok = rgrp_go_demote_ok, .go_lock = rgrp_go_lock, .go_unlock = rgrp_go_unlock, .go_dump = gfs2_rgrp_dump, diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index a79790c0627..870a89d6d4d 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -15,6 +15,8 @@ #include <linux/workqueue.h> #include <linux/dlm.h> #include <linux/buffer_head.h> +#include <linux/rcupdate.h> +#include <linux/rculist_bl.h> #define DIO_WAIT 0x00000010 #define DIO_METADATA 0x00000020 @@ -201,7 +203,7 @@ enum { }; struct gfs2_glock { - struct hlist_node gl_list; + struct hlist_bl_node gl_list; unsigned long gl_flags; /* GLF_... */ struct lm_lockname gl_name; atomic_t gl_ref; @@ -234,6 +236,7 @@ struct gfs2_glock { atomic_t gl_ail_count; struct delayed_work gl_work; struct work_struct gl_delete; + struct rcu_head gl_rcu; }; #define GFS2_MIN_LVB_SIZE 32 /* Min size of LVB that gfs2 supports */ @@ -314,6 +317,7 @@ enum { QDF_USER = 0, QDF_CHANGE = 1, QDF_LOCKED = 2, + QDF_REFRESH = 3, }; struct gfs2_quota_data { @@ -647,6 +651,7 @@ struct gfs2_sbd { unsigned int sd_log_flush_head; u64 sd_log_flush_wrapped; + spinlock_t sd_ail_lock; struct list_head sd_ail1_list; struct list_head sd_ail2_list; u64 sd_ail_sync_gen; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 7aa7d4f8984..97d54a28776 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -763,14 +763,15 @@ fail: return error; } -static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip) +static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip, + const struct qstr *qstr) { int err; size_t len; void *value; char *name; - err = security_inode_init_security(&ip->i_inode, &dip->i_inode, + err = security_inode_init_security(&ip->i_inode, &dip->i_inode, qstr, &name, &value, &len); if (err) { @@ -854,7 +855,7 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name, if (error) goto fail_gunlock2; - error = gfs2_security_init(dip, GFS2_I(inode)); + error = gfs2_security_init(dip, GFS2_I(inode), name); if (error) goto fail_gunlock2; diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 6e493aee28f..98c80d8c2a6 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -22,7 +22,6 @@ static void gdlm_ast(void *arg) { struct gfs2_glock *gl = arg; unsigned ret = gl->gl_state; - struct gfs2_sbd *sdp = gl->gl_sbd; BUG_ON(gl->gl_lksb.sb_flags & DLM_SBF_DEMOTED); @@ -31,12 +30,7 @@ static void gdlm_ast(void *arg) switch (gl->gl_lksb.sb_status) { case -DLM_EUNLOCK: /* Unlocked, so glock can be freed */ - if (gl->gl_ops->go_flags & GLOF_ASPACE) - kmem_cache_free(gfs2_glock_aspace_cachep, gl); - else - kmem_cache_free(gfs2_glock_cachep, gl); - if (atomic_dec_and_test(&sdp->sd_glock_disposal)) - wake_up(&sdp->sd_glock_wait); + gfs2_glock_free(gl); return; case -DLM_ECANCEL: /* Cancel while getting lock */ ret |= LM_OUT_CANCELED; @@ -164,16 +158,14 @@ static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state, GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast); } -static void gdlm_put_lock(struct kmem_cache *cachep, struct gfs2_glock *gl) +static void gdlm_put_lock(struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_sbd; struct lm_lockstruct *ls = &sdp->sd_lockstruct; int error; if (gl->gl_lksb.sb_lkid == 0) { - kmem_cache_free(cachep, gl); - if (atomic_dec_and_test(&sdp->sd_glock_disposal)) - wake_up(&sdp->sd_glock_wait); + gfs2_glock_free(gl); return; } diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index eb01f3575e1..e7ed31f858d 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -67,7 +67,7 @@ unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct, * @mapping: The associated mapping (maybe NULL) * @bd: The gfs2_bufdata to remove * - * The log lock _must_ be held when calling this function + * The ail lock _must_ be held when calling this function * */ @@ -88,8 +88,8 @@ void gfs2_remove_from_ail(struct gfs2_bufdata *bd) */ static void gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai) -__releases(&sdp->sd_log_lock) -__acquires(&sdp->sd_log_lock) +__releases(&sdp->sd_ail_lock) +__acquires(&sdp->sd_ail_lock) { struct gfs2_bufdata *bd, *s; struct buffer_head *bh; @@ -117,7 +117,7 @@ __acquires(&sdp->sd_log_lock) list_move(&bd->bd_ail_st_list, &ai->ai_ail1_list); get_bh(bh); - gfs2_log_unlock(sdp); + spin_unlock(&sdp->sd_ail_lock); lock_buffer(bh); if (test_clear_buffer_dirty(bh)) { bh->b_end_io = end_buffer_write_sync; @@ -126,7 +126,7 @@ __acquires(&sdp->sd_log_lock) unlock_buffer(bh); brelse(bh); } - gfs2_log_lock(sdp); + spin_lock(&sdp->sd_ail_lock); retry = 1; break; @@ -175,10 +175,10 @@ static void gfs2_ail1_start(struct gfs2_sbd *sdp) struct gfs2_ail *ai; int done = 0; - gfs2_log_lock(sdp); + spin_lock(&sdp->sd_ail_lock); head = &sdp->sd_ail1_list; if (list_empty(head)) { - gfs2_log_unlock(sdp); + spin_unlock(&sdp->sd_ail_lock); return; } sync_gen = sdp->sd_ail_sync_gen++; @@ -189,13 +189,13 @@ static void gfs2_ail1_start(struct gfs2_sbd *sdp) if (ai->ai_sync_gen >= sync_gen) continue; ai->ai_sync_gen = sync_gen; - gfs2_ail1_start_one(sdp, ai); /* This may drop log lock */ + gfs2_ail1_start_one(sdp, ai); /* This may drop ail lock */ done = 0; break; } } - gfs2_log_unlock(sdp); + spin_unlock(&sdp->sd_ail_lock); } static int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags) @@ -203,7 +203,7 @@ static int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags) struct gfs2_ail *ai, *s; int ret; - gfs2_log_lock(sdp); + spin_lock(&sdp->sd_ail_lock); list_for_each_entry_safe_reverse(ai, s, &sdp->sd_ail1_list, ai_list) { if (gfs2_ail1_empty_one(sdp, ai, flags)) @@ -214,7 +214,7 @@ static int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags) ret = list_empty(&sdp->sd_ail1_list); - gfs2_log_unlock(sdp); + spin_unlock(&sdp->sd_ail_lock); return ret; } @@ -247,7 +247,7 @@ static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail) int wrap = (new_tail < old_tail); int a, b, rm; - gfs2_log_lock(sdp); + spin_lock(&sdp->sd_ail_lock); list_for_each_entry_safe(ai, safe, &sdp->sd_ail2_list, ai_list) { a = (old_tail <= ai->ai_first); @@ -263,7 +263,7 @@ static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail) kfree(ai); } - gfs2_log_unlock(sdp); + spin_unlock(&sdp->sd_ail_lock); } /** @@ -421,7 +421,7 @@ static unsigned int current_tail(struct gfs2_sbd *sdp) struct gfs2_ail *ai; unsigned int tail; - gfs2_log_lock(sdp); + spin_lock(&sdp->sd_ail_lock); if (list_empty(&sdp->sd_ail1_list)) { tail = sdp->sd_log_head; @@ -430,7 +430,7 @@ static unsigned int current_tail(struct gfs2_sbd *sdp) tail = ai->ai_first; } - gfs2_log_unlock(sdp); + spin_unlock(&sdp->sd_ail_lock); return tail; } @@ -743,10 +743,12 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl) sdp->sd_log_commited_databuf = 0; sdp->sd_log_commited_revoke = 0; + spin_lock(&sdp->sd_ail_lock); if (!list_empty(&ai->ai_ail1_list)) { list_add(&ai->ai_list, &sdp->sd_ail1_list); ai = NULL; } + spin_unlock(&sdp->sd_ail_lock); gfs2_log_unlock(sdp); trace_gfs2_log_flush(sdp, 0); up_write(&sdp->sd_log_flush_lock); diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index bf33f822058..e919abf25ec 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -51,8 +51,10 @@ static void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh) /* If this buffer is in the AIL and it has already been written * to in-place disk block, remove it from the AIL. */ + spin_lock(&sdp->sd_ail_lock); if (bd->bd_ail) list_move(&bd->bd_ail_st_list, &bd->bd_ail->ai_ail2_list); + spin_unlock(&sdp->sd_ail_lock); get_bh(bh); atomic_inc(&sdp->sd_log_pinned); trace_gfs2_pin(bd, 1); @@ -80,7 +82,7 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh, mark_buffer_dirty(bh); clear_buffer_pinned(bh); - gfs2_log_lock(sdp); + spin_lock(&sdp->sd_ail_lock); if (bd->bd_ail) { list_del(&bd->bd_ail_st_list); brelse(bh); @@ -91,9 +93,11 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh, } bd->bd_ail = ai; list_add(&bd->bd_ail_st_list, &ai->ai_ail1_list); - clear_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags); + spin_unlock(&sdp->sd_ail_lock); + + if (test_and_clear_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags)) + gfs2_glock_schedule_for_reclaim(bd->bd_gl); trace_gfs2_pin(bd, 0); - gfs2_log_unlock(sdp); unlock_buffer(bh); atomic_dec(&sdp->sd_log_pinned); } diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index 72c31a315d9..888a5f5a1a5 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -14,6 +14,8 @@ #include <linux/module.h> #include <linux/init.h> #include <linux/gfs2_ondisk.h> +#include <linux/rcupdate.h> +#include <linux/rculist_bl.h> #include <asm/atomic.h> #include "gfs2.h" @@ -45,7 +47,7 @@ static void gfs2_init_glock_once(void *foo) { struct gfs2_glock *gl = foo; - INIT_HLIST_NODE(&gl->gl_list); + INIT_HLIST_BL_NODE(&gl->gl_list); spin_lock_init(&gl->gl_spin); INIT_LIST_HEAD(&gl->gl_holders); INIT_LIST_HEAD(&gl->gl_lru); @@ -191,6 +193,8 @@ static void __exit exit_gfs2_fs(void) unregister_filesystem(&gfs2meta_fs_type); destroy_workqueue(gfs_recovery_wq); + rcu_barrier(); + kmem_cache_destroy(gfs2_quotad_cachep); kmem_cache_destroy(gfs2_rgrpd_cachep); kmem_cache_destroy(gfs2_bufdata_cachep); diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 939739c7b3f..01d97f48655 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -326,6 +326,7 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int brelse(bh); } if (bd) { + spin_lock(&sdp->sd_ail_lock); if (bd->bd_ail) { gfs2_remove_from_ail(bd); bh->b_private = NULL; @@ -333,6 +334,7 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int bd->bd_blkno = bh->b_blocknr; gfs2_trans_add_revoke(sdp, bd); } + spin_unlock(&sdp->sd_ail_lock); } clear_buffer_dirty(bh); clear_buffer_uptodate(bh); diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 777927ce6f7..42ef24355af 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -99,6 +99,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) init_waitqueue_head(&sdp->sd_log_waitq); init_waitqueue_head(&sdp->sd_logd_waitq); + spin_lock_init(&sdp->sd_ail_lock); INIT_LIST_HEAD(&sdp->sd_ail1_list); INIT_LIST_HEAD(&sdp->sd_ail2_list); @@ -928,17 +929,9 @@ static const match_table_t nolock_tokens = { { Opt_err, NULL }, }; -static void nolock_put_lock(struct kmem_cache *cachep, struct gfs2_glock *gl) -{ - struct gfs2_sbd *sdp = gl->gl_sbd; - kmem_cache_free(cachep, gl); - if (atomic_dec_and_test(&sdp->sd_glock_disposal)) - wake_up(&sdp->sd_glock_wait); -} - static const struct lm_lockops nolock_ops = { .lm_proto_name = "lock_nolock", - .lm_put_lock = nolock_put_lock, + .lm_put_lock = gfs2_glock_free, .lm_tokens = &nolock_tokens, }; diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c index d8b26ac2e20..09e436a5072 100644 --- a/fs/gfs2/ops_inode.c +++ b/fs/gfs2/ops_inode.c @@ -1026,9 +1026,9 @@ static void gfs2_put_link(struct dentry *dentry, struct nameidata *nd, void *p) /** * gfs2_permission - - * @inode: - * @mask: - * @nd: passed from Linux VFS, ignored by us + * @inode: The inode + * @mask: The mask to be tested + * @flags: Indicates whether this is an RCU path walk or not * * This may be called from the VFS directly, or from within GFS2 with the * inode locked, so we look to see if the glock is already locked and only @@ -1044,11 +1044,11 @@ int gfs2_permission(struct inode *inode, int mask, unsigned int flags) int error; int unlock = 0; - if (flags & IPERM_FLAG_RCU) - return -ECHILD; ip = GFS2_I(inode); if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) { + if (flags & IPERM_FLAG_RCU) + return -ECHILD; error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); if (error) return error; diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index a689901963d..e23d9864c41 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -834,6 +834,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) goto out_end_trans; do_qc(qd, -qd->qd_change_sync); + set_bit(QDF_REFRESH, &qd->qd_flags); } error = 0; @@ -929,6 +930,7 @@ int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_alloc *al = ip->i_alloc; + struct gfs2_quota_data *qd; unsigned int x; int error = 0; @@ -942,7 +944,11 @@ int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid) sort_qd, NULL); for (x = 0; x < al->al_qd_num; x++) { - error = do_glock(al->al_qd[x], NO_FORCE, &al->al_qd_ghs[x]); + int force = NO_FORCE; + qd = al->al_qd[x]; + if (test_and_clear_bit(QDF_REFRESH, &qd->qd_flags)) + force = FORCE; + error = do_glock(qd, force, &al->al_qd_ghs[x]); if (error) break; } @@ -1587,6 +1593,8 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id, offset = qd2offset(qd); alloc_required = gfs2_write_alloc_required(ip, offset, sizeof(struct gfs2_quota)); + if (gfs2_is_stuffed(ip)) + alloc_required = 1; if (alloc_required) { al = gfs2_alloc_get(ip); if (al == NULL) @@ -1600,7 +1608,9 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id, blocks += gfs2_rg_blocks(al); } - error = gfs2_trans_begin(sdp, blocks + RES_DINODE + 1, 0); + /* Some quotas span block boundaries and can update two blocks, + adding an extra block to the transaction to handle such quotas */ + error = gfs2_trans_begin(sdp, blocks + RES_DINODE + 2, 0); if (error) goto out_release; diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 7293ea27020..cf930cd9664 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1602,7 +1602,7 @@ rgrp_error: * */ -void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen) +void __gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_rgrpd *rgd; @@ -1617,7 +1617,21 @@ void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen) gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); gfs2_trans_add_rg(rgd); +} +/** + * gfs2_free_data - free a contiguous run of data block(s) + * @ip: the inode these blocks are being freed from + * @bstart: first block of a run of contiguous blocks + * @blen: the length of the block run + * + */ + +void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + + __gfs2_free_data(ip, bstart, blen); gfs2_statfs_change(sdp, 0, +blen, 0); gfs2_quota_change(ip, -(s64)blen, ip->i_inode.i_uid, ip->i_inode.i_gid); } @@ -1630,7 +1644,7 @@ void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen) * */ -void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen) +void __gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_rgrpd *rgd; @@ -1645,10 +1659,24 @@ void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen) gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); gfs2_trans_add_rg(rgd); + gfs2_meta_wipe(ip, bstart, blen); +} +/** + * gfs2_free_meta - free a contiguous run of data block(s) + * @ip: the inode these blocks are being freed from + * @bstart: first block of a run of contiguous blocks + * @blen: the length of the block run + * + */ + +void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + + __gfs2_free_meta(ip, bstart, blen); gfs2_statfs_change(sdp, 0, +blen, 0); gfs2_quota_change(ip, -(s64)blen, ip->i_inode.i_uid, ip->i_inode.i_gid); - gfs2_meta_wipe(ip, bstart, blen); } void gfs2_unlink_di(struct inode *inode) diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index 50c2bb04369..a80e3034ac4 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -52,7 +52,9 @@ extern int gfs2_ri_update(struct gfs2_inode *ip); extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n); extern int gfs2_alloc_di(struct gfs2_inode *ip, u64 *bn, u64 *generation); +extern void __gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen); extern void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen); +extern void __gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen); extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen); extern void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip); extern void gfs2_unlink_di(struct inode *inode); diff --git a/fs/hpfs/Kconfig b/fs/hpfs/Kconfig index 63b6f563231..0c39dc3ef7d 100644 --- a/fs/hpfs/Kconfig +++ b/fs/hpfs/Kconfig @@ -1,7 +1,7 @@ config HPFS_FS tristate "OS/2 HPFS file system support" depends on BLOCK - depends on BKL # nontrivial to fix + depends on BROKEN || !PREEMPT help OS/2 is IBM's operating system for PC's, the same as Warp, and HPFS is the file system used for organizing files on OS/2 hard disk diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c index d32f63a569f..b3d7c0ddb60 100644 --- a/fs/hpfs/dir.c +++ b/fs/hpfs/dir.c @@ -6,16 +6,15 @@ * directory VFS functions */ -#include <linux/smp_lock.h> #include <linux/slab.h> #include "hpfs_fn.h" static int hpfs_dir_release(struct inode *inode, struct file *filp) { - lock_kernel(); + hpfs_lock(inode->i_sb); hpfs_del_pos(inode, &filp->f_pos); /*hpfs_write_if_changed(inode);*/ - unlock_kernel(); + hpfs_unlock(inode->i_sb); return 0; } @@ -30,7 +29,7 @@ static loff_t hpfs_dir_lseek(struct file *filp, loff_t off, int whence) struct hpfs_inode_info *hpfs_inode = hpfs_i(i); struct super_block *s = i->i_sb; - lock_kernel(); + hpfs_lock(s); /*printk("dir lseek\n");*/ if (new_off == 0 || new_off == 1 || new_off == 11 || new_off == 12 || new_off == 13) goto ok; @@ -43,12 +42,12 @@ static loff_t hpfs_dir_lseek(struct file *filp, loff_t off, int whence) } mutex_unlock(&i->i_mutex); ok: - unlock_kernel(); + hpfs_unlock(s); return filp->f_pos = new_off; fail: mutex_unlock(&i->i_mutex); /*printk("illegal lseek: %016llx\n", new_off);*/ - unlock_kernel(); + hpfs_unlock(s); return -ESPIPE; } @@ -64,7 +63,7 @@ static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir) int c1, c2 = 0; int ret = 0; - lock_kernel(); + hpfs_lock(inode->i_sb); if (hpfs_sb(inode->i_sb)->sb_chk) { if (hpfs_chk_sectors(inode->i_sb, inode->i_ino, 1, "dir_fnode")) { @@ -167,7 +166,7 @@ static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir) hpfs_brelse4(&qbh); } out: - unlock_kernel(); + hpfs_unlock(inode->i_sb); return ret; } @@ -197,10 +196,10 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct name struct inode *result = NULL; struct hpfs_inode_info *hpfs_result; - lock_kernel(); + hpfs_lock(dir->i_sb); if ((err = hpfs_chk_name(name, &len))) { if (err == -ENAMETOOLONG) { - unlock_kernel(); + hpfs_unlock(dir->i_sb); return ERR_PTR(-ENAMETOOLONG); } goto end_add; @@ -298,7 +297,7 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct name end: end_add: - unlock_kernel(); + hpfs_unlock(dir->i_sb); d_add(dentry, result); return NULL; @@ -311,7 +310,7 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct name /*bail:*/ - unlock_kernel(); + hpfs_unlock(dir->i_sb); return ERR_PTR(-ENOENT); } diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index c0340887c7e..2dbae20450f 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -6,16 +6,15 @@ * file VFS functions */ -#include <linux/smp_lock.h> #include "hpfs_fn.h" #define BLOCKS(size) (((size) + 511) >> 9) static int hpfs_file_release(struct inode *inode, struct file *file) { - lock_kernel(); + hpfs_lock(inode->i_sb); hpfs_write_if_changed(inode); - unlock_kernel(); + hpfs_unlock(inode->i_sb); return 0; } @@ -49,14 +48,14 @@ static secno hpfs_bmap(struct inode *inode, unsigned file_secno) static void hpfs_truncate(struct inode *i) { if (IS_IMMUTABLE(i)) return /*-EPERM*/; - lock_kernel(); + hpfs_lock(i->i_sb); hpfs_i(i)->i_n_secs = 0; i->i_blocks = 1 + ((i->i_size + 511) >> 9); hpfs_i(i)->mmu_private = i->i_size; hpfs_truncate_btree(i->i_sb, i->i_ino, 1, ((i->i_size + 511) >> 9)); hpfs_write_inode(i); hpfs_i(i)->i_n_secs = 0; - unlock_kernel(); + hpfs_unlock(i->i_sb); } static int hpfs_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h index 1c43dbea55e..c15adbca07f 100644 --- a/fs/hpfs/hpfs_fn.h +++ b/fs/hpfs/hpfs_fn.h @@ -342,3 +342,25 @@ static inline time32_t gmt_to_local(struct super_block *s, time_t t) extern struct timezone sys_tz; return t - sys_tz.tz_minuteswest * 60 - hpfs_sb(s)->sb_timeshift; } + +/* + * Locking: + * + * hpfs_lock() is a leftover from the big kernel lock. + * Right now, these functions are empty and only left + * for documentation purposes. The file system no longer + * works on SMP systems, so the lock is not needed + * any more. + * + * If someone is interested in making it work again, this + * would be the place to start by adding a per-superblock + * mutex and fixing all the bugs and performance issues + * caused by that. + */ +static inline void hpfs_lock(struct super_block *s) +{ +} + +static inline void hpfs_unlock(struct super_block *s) +{ +} diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c index 1ae35baa539..87f1f787e76 100644 --- a/fs/hpfs/inode.c +++ b/fs/hpfs/inode.c @@ -6,7 +6,6 @@ * inode VFS functions */ -#include <linux/smp_lock.h> #include <linux/slab.h> #include "hpfs_fn.h" @@ -267,7 +266,7 @@ int hpfs_setattr(struct dentry *dentry, struct iattr *attr) struct inode *inode = dentry->d_inode; int error = -EINVAL; - lock_kernel(); + hpfs_lock(inode->i_sb); if (inode->i_ino == hpfs_sb(inode->i_sb)->sb_root) goto out_unlock; if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size > inode->i_size) @@ -290,7 +289,7 @@ int hpfs_setattr(struct dentry *dentry, struct iattr *attr) hpfs_write_inode(inode); out_unlock: - unlock_kernel(); + hpfs_unlock(inode->i_sb); return error; } @@ -307,8 +306,8 @@ void hpfs_evict_inode(struct inode *inode) truncate_inode_pages(&inode->i_data, 0); end_writeback(inode); if (!inode->i_nlink) { - lock_kernel(); + hpfs_lock(inode->i_sb); hpfs_remove_fnode(inode->i_sb, inode->i_ino); - unlock_kernel(); + hpfs_unlock(inode->i_sb); } } diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c index f4ad9e31ddc..d5f8c8a1902 100644 --- a/fs/hpfs/namei.c +++ b/fs/hpfs/namei.c @@ -6,7 +6,6 @@ * adding & removing files & directories */ #include <linux/sched.h> -#include <linux/smp_lock.h> #include "hpfs_fn.h" static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) @@ -25,7 +24,7 @@ static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) struct hpfs_dirent dee; int err; if ((err = hpfs_chk_name(name, &len))) return err==-ENOENT ? -EINVAL : err; - lock_kernel(); + hpfs_lock(dir->i_sb); err = -ENOSPC; fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh); if (!fnode) @@ -103,7 +102,7 @@ static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) } d_instantiate(dentry, result); mutex_unlock(&hpfs_i(dir)->i_mutex); - unlock_kernel(); + hpfs_unlock(dir->i_sb); return 0; bail3: mutex_unlock(&hpfs_i(dir)->i_mutex); @@ -115,7 +114,7 @@ bail1: brelse(bh); hpfs_free_sectors(dir->i_sb, fno, 1); bail: - unlock_kernel(); + hpfs_unlock(dir->i_sb); return err; } @@ -132,7 +131,7 @@ static int hpfs_create(struct inode *dir, struct dentry *dentry, int mode, struc int err; if ((err = hpfs_chk_name(name, &len))) return err==-ENOENT ? -EINVAL : err; - lock_kernel(); + hpfs_lock(dir->i_sb); err = -ENOSPC; fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh); if (!fnode) @@ -195,7 +194,7 @@ static int hpfs_create(struct inode *dir, struct dentry *dentry, int mode, struc } d_instantiate(dentry, result); mutex_unlock(&hpfs_i(dir)->i_mutex); - unlock_kernel(); + hpfs_unlock(dir->i_sb); return 0; bail2: @@ -205,7 +204,7 @@ bail1: brelse(bh); hpfs_free_sectors(dir->i_sb, fno, 1); bail: - unlock_kernel(); + hpfs_unlock(dir->i_sb); return err; } @@ -224,7 +223,7 @@ static int hpfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t if (hpfs_sb(dir->i_sb)->sb_eas < 2) return -EPERM; if (!new_valid_dev(rdev)) return -EINVAL; - lock_kernel(); + hpfs_lock(dir->i_sb); err = -ENOSPC; fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh); if (!fnode) @@ -274,7 +273,7 @@ static int hpfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t d_instantiate(dentry, result); mutex_unlock(&hpfs_i(dir)->i_mutex); brelse(bh); - unlock_kernel(); + hpfs_unlock(dir->i_sb); return 0; bail2: mutex_unlock(&hpfs_i(dir)->i_mutex); @@ -283,7 +282,7 @@ bail1: brelse(bh); hpfs_free_sectors(dir->i_sb, fno, 1); bail: - unlock_kernel(); + hpfs_unlock(dir->i_sb); return err; } @@ -299,9 +298,9 @@ static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *sy struct inode *result; int err; if ((err = hpfs_chk_name(name, &len))) return err==-ENOENT ? -EINVAL : err; - lock_kernel(); + hpfs_lock(dir->i_sb); if (hpfs_sb(dir->i_sb)->sb_eas < 2) { - unlock_kernel(); + hpfs_unlock(dir->i_sb); return -EPERM; } err = -ENOSPC; @@ -354,7 +353,7 @@ static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *sy hpfs_write_inode_nolock(result); d_instantiate(dentry, result); mutex_unlock(&hpfs_i(dir)->i_mutex); - unlock_kernel(); + hpfs_unlock(dir->i_sb); return 0; bail2: mutex_unlock(&hpfs_i(dir)->i_mutex); @@ -363,7 +362,7 @@ bail1: brelse(bh); hpfs_free_sectors(dir->i_sb, fno, 1); bail: - unlock_kernel(); + hpfs_unlock(dir->i_sb); return err; } @@ -380,7 +379,7 @@ static int hpfs_unlink(struct inode *dir, struct dentry *dentry) int rep = 0; int err; - lock_kernel(); + hpfs_lock(dir->i_sb); hpfs_adjust_length(name, &len); again: mutex_lock(&hpfs_i(inode)->i_parent_mutex); @@ -416,7 +415,7 @@ again: dentry_unhash(dentry); if (!d_unhashed(dentry)) { dput(dentry); - unlock_kernel(); + hpfs_unlock(dir->i_sb); return -ENOSPC; } if (generic_permission(inode, MAY_WRITE, 0, NULL) || @@ -435,7 +434,7 @@ again: if (!err) goto again; } - unlock_kernel(); + hpfs_unlock(dir->i_sb); return -ENOSPC; default: drop_nlink(inode); @@ -448,7 +447,7 @@ out1: out: mutex_unlock(&hpfs_i(dir)->i_mutex); mutex_unlock(&hpfs_i(inode)->i_parent_mutex); - unlock_kernel(); + hpfs_unlock(dir->i_sb); return err; } @@ -466,7 +465,7 @@ static int hpfs_rmdir(struct inode *dir, struct dentry *dentry) int r; hpfs_adjust_length(name, &len); - lock_kernel(); + hpfs_lock(dir->i_sb); mutex_lock(&hpfs_i(inode)->i_parent_mutex); mutex_lock(&hpfs_i(dir)->i_mutex); err = -ENOENT; @@ -508,7 +507,7 @@ out1: out: mutex_unlock(&hpfs_i(dir)->i_mutex); mutex_unlock(&hpfs_i(inode)->i_parent_mutex); - unlock_kernel(); + hpfs_unlock(dir->i_sb); return err; } @@ -521,21 +520,21 @@ static int hpfs_symlink_readpage(struct file *file, struct page *page) int err; err = -EIO; - lock_kernel(); + hpfs_lock(i->i_sb); if (!(fnode = hpfs_map_fnode(i->i_sb, i->i_ino, &bh))) goto fail; err = hpfs_read_ea(i->i_sb, fnode, "SYMLINK", link, PAGE_SIZE); brelse(bh); if (err) goto fail; - unlock_kernel(); + hpfs_unlock(i->i_sb); SetPageUptodate(page); kunmap(page); unlock_page(page); return 0; fail: - unlock_kernel(); + hpfs_unlock(i->i_sb); SetPageError(page); kunmap(page); unlock_page(page); @@ -567,7 +566,7 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry, err = 0; hpfs_adjust_length(old_name, &old_len); - lock_kernel(); + hpfs_lock(i->i_sb); /* order doesn't matter, due to VFS exclusion */ mutex_lock(&hpfs_i(i)->i_parent_mutex); if (new_inode) @@ -659,7 +658,7 @@ end1: mutex_unlock(&hpfs_i(i)->i_parent_mutex); if (new_inode) mutex_unlock(&hpfs_i(new_inode)->i_parent_mutex); - unlock_kernel(); + hpfs_unlock(i->i_sb); return err; } diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index b30426b1fc9..c89b4080858 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -13,7 +13,6 @@ #include <linux/statfs.h> #include <linux/magic.h> #include <linux/sched.h> -#include <linux/smp_lock.h> #include <linux/bitmap.h> #include <linux/slab.h> @@ -103,15 +102,11 @@ static void hpfs_put_super(struct super_block *s) { struct hpfs_sb_info *sbi = hpfs_sb(s); - lock_kernel(); - kfree(sbi->sb_cp_table); kfree(sbi->sb_bmp_dir); unmark_dirty(s); s->s_fs_info = NULL; kfree(sbi); - - unlock_kernel(); } unsigned hpfs_count_one_bitmap(struct super_block *s, secno secno) @@ -143,7 +138,7 @@ static int hpfs_statfs(struct dentry *dentry, struct kstatfs *buf) struct super_block *s = dentry->d_sb; struct hpfs_sb_info *sbi = hpfs_sb(s); u64 id = huge_encode_dev(s->s_bdev->bd_dev); - lock_kernel(); + hpfs_lock(s); /*if (sbi->sb_n_free == -1) {*/ sbi->sb_n_free = count_bitmaps(s); @@ -160,7 +155,7 @@ static int hpfs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_fsid.val[1] = (u32)(id >> 32); buf->f_namelen = 254; - unlock_kernel(); + hpfs_unlock(s); return 0; } @@ -406,7 +401,7 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data) *flags |= MS_NOATIME; - lock_kernel(); + hpfs_lock(s); lock_super(s); uid = sbi->sb_uid; gid = sbi->sb_gid; umask = 0777 & ~sbi->sb_mode; @@ -441,12 +436,12 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data) replace_mount_options(s, new_opts); unlock_super(s); - unlock_kernel(); + hpfs_unlock(s); return 0; out_err: unlock_super(s); - unlock_kernel(); + hpfs_unlock(s); kfree(new_opts); return -EINVAL; } @@ -484,13 +479,15 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) int o; - lock_kernel(); + if (num_possible_cpus() > 1) { + printk(KERN_ERR "HPFS is not SMP safe\n"); + return -EINVAL; + } save_mount_options(s, options); sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) { - unlock_kernel(); return -ENOMEM; } s->s_fs_info = sbi; @@ -677,7 +674,6 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) root->i_blocks = 5; hpfs_brelse4(&qbh); } - unlock_kernel(); return 0; bail4: brelse(bh2); @@ -689,7 +685,6 @@ bail0: kfree(sbi->sb_cp_table); s->s_fs_info = NULL; kfree(sbi); - unlock_kernel(); return -EINVAL; } diff --git a/fs/inode.c b/fs/inode.c index 0647d80accf..9910c039f02 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -84,16 +84,13 @@ static struct hlist_head *inode_hashtable __read_mostly; DEFINE_SPINLOCK(inode_lock); /* - * iprune_sem provides exclusion between the kswapd or try_to_free_pages - * icache shrinking path, and the umount path. Without this exclusion, - * by the time prune_icache calls iput for the inode whose pages it has - * been invalidating, or by the time it calls clear_inode & destroy_inode - * from its final dispose_list, the struct super_block they refer to - * (for inode->i_sb->s_op) may already have been freed and reused. + * iprune_sem provides exclusion between the icache shrinking and the + * umount path. * - * We make this an rwsem because the fastpath is icache shrinking. In - * some cases a filesystem may be doing a significant amount of work in - * its inode reclaim code, so this should improve parallelism. + * We don't actually need it to protect anything in the umount path, + * but only need to cycle through it to make sure any inode that + * prune_icache took off the LRU list has been fully torn down by the + * time we are past evict_inodes. */ static DECLARE_RWSEM(iprune_sem); @@ -516,17 +513,12 @@ void evict_inodes(struct super_block *sb) struct inode *inode, *next; LIST_HEAD(dispose); - down_write(&iprune_sem); - spin_lock(&inode_lock); list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { if (atomic_read(&inode->i_count)) continue; - - if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { - WARN_ON(1); + if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) continue; - } inode->i_state |= I_FREEING; @@ -542,6 +534,13 @@ void evict_inodes(struct super_block *sb) spin_unlock(&inode_lock); dispose_list(&dispose); + + /* + * Cycle through iprune_sem to make sure any inode that prune_icache + * moved off the list before we took the lock has been fully torn + * down. + */ + down_write(&iprune_sem); up_write(&iprune_sem); } @@ -561,8 +560,6 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty) struct inode *inode, *next; LIST_HEAD(dispose); - down_write(&iprune_sem); - spin_lock(&inode_lock); list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) @@ -590,7 +587,6 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty) spin_unlock(&inode_lock); dispose_list(&dispose); - up_write(&iprune_sem); return busy; } diff --git a/fs/internal.h b/fs/internal.h index 9b976b57d7f..17191546d52 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -12,6 +12,7 @@ #include <linux/lglock.h> struct super_block; +struct file_system_type; struct linux_binprm; struct path; @@ -61,8 +62,6 @@ extern int check_unsafe_exec(struct linux_binprm *); extern int copy_mount_options(const void __user *, unsigned long *); extern int copy_mount_string(const void __user *, char **); -extern void free_vfsmnt(struct vfsmount *); -extern struct vfsmount *alloc_vfsmnt(const char *); extern unsigned int mnt_get_count(struct vfsmount *mnt); extern struct vfsmount *__lookup_mnt(struct vfsmount *, struct dentry *, int); extern void mnt_set_mountpoint(struct vfsmount *, struct dentry *, @@ -99,6 +98,8 @@ extern struct file *get_empty_filp(void); extern int do_remount_sb(struct super_block *, int, void *, int); extern void __put_super(struct super_block *sb); extern void put_super(struct super_block *sb); +extern struct dentry *mount_fs(struct file_system_type *, + int, const char *, void *); /* * open.c @@ -106,6 +107,19 @@ extern void put_super(struct super_block *sb); struct nameidata; extern struct file *nameidata_to_filp(struct nameidata *); extern void release_open_intent(struct nameidata *); +struct open_flags { + int open_flag; + int mode; + int acc_mode; + int intent; +}; +extern struct file *do_filp_open(int dfd, const char *pathname, + const struct open_flags *op, int lookup_flags); +extern struct file *do_file_open_root(struct dentry *, struct vfsmount *, + const char *, const struct open_flags *, int lookup_flags); + +extern long do_handle_open(int mountdirfd, + struct file_handle __user *ufh, int open_flag); /* * inode.c diff --git a/fs/isofs/export.c b/fs/isofs/export.c index ed752cb3847..dd4687ff30d 100644 --- a/fs/isofs/export.c +++ b/fs/isofs/export.c @@ -124,9 +124,13 @@ isofs_export_encode_fh(struct dentry *dentry, * offset of the inode and the upper 16 bits of fh32[1] to * hold the offset of the parent. */ - - if (len < 3 || (connectable && len < 5)) + if (connectable && (len < 5)) { + *max_len = 5; + return 255; + } else if (len < 3) { + *max_len = 3; return 255; + } len = 3; fh32[0] = ei->i_iget5_block; diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index da1b5e4ffce..eb11601f2e0 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -839,7 +839,7 @@ journal_t * journal_init_inode (struct inode *inode) err = journal_bmap(journal, 0, &blocknr); /* If that failed, give up */ if (err) { - printk(KERN_ERR "%s: Cannnot locate journal superblock\n", + printk(KERN_ERR "%s: Cannot locate journal superblock\n", __func__); goto out_err; } diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 97e73469b2c..90407b8fece 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -991,7 +991,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode) err = jbd2_journal_bmap(journal, 0, &blocknr); /* If that failed, give up */ if (err) { - printk(KERN_ERR "%s: Cannnot locate journal superblock\n", + printk(KERN_ERR "%s: Cannot locate journal superblock\n", __func__); goto out_err; } diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index 92978658ed1..82faddd1f32 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -215,8 +215,7 @@ static int jffs2_create(struct inode *dir_i, struct dentry *dentry, int mode, no chance of AB-BA deadlock involving its f->sem). */ mutex_unlock(&f->sem); - ret = jffs2_do_create(c, dir_f, f, ri, - dentry->d_name.name, dentry->d_name.len); + ret = jffs2_do_create(c, dir_f, f, ri, &dentry->d_name); if (ret) goto fail; @@ -386,7 +385,7 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char jffs2_complete_reservation(c); - ret = jffs2_init_security(inode, dir_i); + ret = jffs2_init_security(inode, dir_i, &dentry->d_name); if (ret) goto fail; @@ -530,7 +529,7 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode) jffs2_complete_reservation(c); - ret = jffs2_init_security(inode, dir_i); + ret = jffs2_init_security(inode, dir_i, &dentry->d_name); if (ret) goto fail; @@ -703,7 +702,7 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de jffs2_complete_reservation(c); - ret = jffs2_init_security(inode, dir_i); + ret = jffs2_init_security(inode, dir_i, &dentry->d_name); if (ret) goto fail; diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h index 5a53d9bdb2b..e4619b00f7c 100644 --- a/fs/jffs2/nodelist.h +++ b/fs/jffs2/nodelist.h @@ -401,7 +401,7 @@ int jffs2_write_inode_range(struct jffs2_sb_info *c, struct jffs2_inode_info *f, struct jffs2_raw_inode *ri, unsigned char *buf, uint32_t offset, uint32_t writelen, uint32_t *retlen); int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, struct jffs2_inode_info *f, - struct jffs2_raw_inode *ri, const char *name, int namelen); + struct jffs2_raw_inode *ri, const struct qstr *qstr); int jffs2_do_unlink(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, const char *name, int namelen, struct jffs2_inode_info *dead_f, uint32_t time); int jffs2_do_link(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, uint32_t ino, diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c index 239f51216a6..cfeb7164b08 100644 --- a/fs/jffs2/security.c +++ b/fs/jffs2/security.c @@ -23,14 +23,15 @@ #include "nodelist.h" /* ---- Initial Security Label Attachment -------------- */ -int jffs2_init_security(struct inode *inode, struct inode *dir) +int jffs2_init_security(struct inode *inode, struct inode *dir, + const struct qstr *qstr) { int rc; size_t len; void *value; char *name; - rc = security_inode_init_security(inode, dir, &name, &value, &len); + rc = security_inode_init_security(inode, dir, qstr, &name, &value, &len); if (rc) { if (rc == -EOPNOTSUPP) return 0; diff --git a/fs/jffs2/write.c b/fs/jffs2/write.c index c819eb0e982..30d175b6d29 100644 --- a/fs/jffs2/write.c +++ b/fs/jffs2/write.c @@ -424,7 +424,9 @@ int jffs2_write_inode_range(struct jffs2_sb_info *c, struct jffs2_inode_info *f, return ret; } -int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, struct jffs2_inode_info *f, struct jffs2_raw_inode *ri, const char *name, int namelen) +int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, + struct jffs2_inode_info *f, struct jffs2_raw_inode *ri, + const struct qstr *qstr) { struct jffs2_raw_dirent *rd; struct jffs2_full_dnode *fn; @@ -466,15 +468,15 @@ int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, str mutex_unlock(&f->sem); jffs2_complete_reservation(c); - ret = jffs2_init_security(&f->vfs_inode, &dir_f->vfs_inode); + ret = jffs2_init_security(&f->vfs_inode, &dir_f->vfs_inode, qstr); if (ret) return ret; ret = jffs2_init_acl_post(&f->vfs_inode); if (ret) return ret; - ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen, - ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen)); + ret = jffs2_reserve_space(c, sizeof(*rd)+qstr->len, &alloclen, + ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(qstr->len)); if (ret) { /* Eep. */ @@ -493,19 +495,19 @@ int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, str rd->magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); rd->nodetype = cpu_to_je16(JFFS2_NODETYPE_DIRENT); - rd->totlen = cpu_to_je32(sizeof(*rd) + namelen); + rd->totlen = cpu_to_je32(sizeof(*rd) + qstr->len); rd->hdr_crc = cpu_to_je32(crc32(0, rd, sizeof(struct jffs2_unknown_node)-4)); rd->pino = cpu_to_je32(dir_f->inocache->ino); rd->version = cpu_to_je32(++dir_f->highest_version); rd->ino = ri->ino; rd->mctime = ri->ctime; - rd->nsize = namelen; + rd->nsize = qstr->len; rd->type = DT_REG; rd->node_crc = cpu_to_je32(crc32(0, rd, sizeof(*rd)-8)); - rd->name_crc = cpu_to_je32(crc32(0, name, namelen)); + rd->name_crc = cpu_to_je32(crc32(0, qstr->name, qstr->len)); - fd = jffs2_write_dirent(c, dir_f, rd, name, namelen, ALLOC_NORMAL); + fd = jffs2_write_dirent(c, dir_f, rd, qstr->name, qstr->len, ALLOC_NORMAL); jffs2_free_raw_dirent(rd); diff --git a/fs/jffs2/xattr.h b/fs/jffs2/xattr.h index cf4f5759b42..7be4beb306f 100644 --- a/fs/jffs2/xattr.h +++ b/fs/jffs2/xattr.h @@ -121,10 +121,11 @@ extern ssize_t jffs2_listxattr(struct dentry *, char *, size_t); #endif /* CONFIG_JFFS2_FS_XATTR */ #ifdef CONFIG_JFFS2_FS_SECURITY -extern int jffs2_init_security(struct inode *inode, struct inode *dir); +extern int jffs2_init_security(struct inode *inode, struct inode *dir, + const struct qstr *qstr); extern const struct xattr_handler jffs2_security_xattr_handler; #else -#define jffs2_init_security(inode,dir) (0) +#define jffs2_init_security(inode,dir,qstr) (0) #endif /* CONFIG_JFFS2_FS_SECURITY */ #endif /* _JFFS2_FS_XATTR_H_ */ diff --git a/fs/jfs/jfs_xattr.h b/fs/jfs/jfs_xattr.h index 88b6cc535bf..e9e100fd7c0 100644 --- a/fs/jfs/jfs_xattr.h +++ b/fs/jfs/jfs_xattr.h @@ -62,10 +62,11 @@ extern ssize_t jfs_listxattr(struct dentry *, char *, size_t); extern int jfs_removexattr(struct dentry *, const char *); #ifdef CONFIG_JFS_SECURITY -extern int jfs_init_security(tid_t, struct inode *, struct inode *); +extern int jfs_init_security(tid_t, struct inode *, struct inode *, + const struct qstr *); #else static inline int jfs_init_security(tid_t tid, struct inode *inode, - struct inode *dir) + struct inode *dir, const struct qstr *qstr) { return 0; } diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index 81ead850ddb..eaaf2b511e8 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -115,7 +115,7 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, int mode, if (rc) goto out3; - rc = jfs_init_security(tid, ip, dip); + rc = jfs_init_security(tid, ip, dip, &dentry->d_name); if (rc) { txAbort(tid, 0); goto out3; @@ -253,7 +253,7 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode) if (rc) goto out3; - rc = jfs_init_security(tid, ip, dip); + rc = jfs_init_security(tid, ip, dip, &dentry->d_name); if (rc) { txAbort(tid, 0); goto out3; @@ -809,9 +809,6 @@ static int jfs_link(struct dentry *old_dentry, if (ip->i_nlink == JFS_LINK_MAX) return -EMLINK; - if (ip->i_nlink == 0) - return -ENOENT; - dquot_initialize(dir); tid = txBegin(ip->i_sb, 0); @@ -932,7 +929,7 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry, mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT); mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD); - rc = jfs_init_security(tid, ip, dip); + rc = jfs_init_security(tid, ip, dip, &dentry->d_name); if (rc) goto out3; @@ -1395,7 +1392,7 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry, if (rc) goto out3; - rc = jfs_init_security(tid, ip, dir); + rc = jfs_init_security(tid, ip, dir, &dentry->d_name); if (rc) { txAbort(tid, 0); goto out3; @@ -1600,7 +1597,7 @@ out: static int jfs_ci_revalidate(struct dentry *dentry, struct nameidata *nd) { - if (nd->flags & LOOKUP_RCU) + if (nd && nd->flags & LOOKUP_RCU) return -ECHILD; /* * This is not negative dentry. Always valid. diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c index 2d7f165d0f1..3fa4c32272d 100644 --- a/fs/jfs/xattr.c +++ b/fs/jfs/xattr.c @@ -1091,7 +1091,8 @@ int jfs_removexattr(struct dentry *dentry, const char *name) } #ifdef CONFIG_JFS_SECURITY -int jfs_init_security(tid_t tid, struct inode *inode, struct inode *dir) +int jfs_init_security(tid_t tid, struct inode *inode, struct inode *dir, + const struct qstr *qstr) { int rc; size_t len; @@ -1099,7 +1100,8 @@ int jfs_init_security(tid_t tid, struct inode *inode, struct inode *dir) char *suffix; char *name; - rc = security_inode_init_security(inode, dir, &suffix, &value, &len); + rc = security_inode_init_security(inode, dir, qstr, &suffix, &value, + &len); if (rc) { if (rc == -EOPNOTSUPP) return 0; diff --git a/fs/locks.c b/fs/locks.c index 0f3998291f7..822c3d1843a 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -145,7 +145,6 @@ static DEFINE_SPINLOCK(file_lock_lock); /* * Protects the two list heads above, plus the inode->i_flock list - * FIXME: should use a spinlock, once lockd and ceph are ready. */ void lock_flocks(void) { diff --git a/fs/namei.c b/fs/namei.c index 0087cf9c2c6..5a9a6c3094d 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -136,7 +136,7 @@ static int do_getname(const char __user *filename, char *page) return retval; } -char * getname(const char __user * filename) +static char *getname_flags(const char __user * filename, int flags) { char *tmp, *result; @@ -147,14 +147,21 @@ char * getname(const char __user * filename) result = tmp; if (retval < 0) { - __putname(tmp); - result = ERR_PTR(retval); + if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) { + __putname(tmp); + result = ERR_PTR(retval); + } } } audit_getname(result); return result; } +char *getname(const char __user * filename) +{ + return getname_flags(filename, 0); +} + #ifdef CONFIG_AUDITSYSCALL void putname(const char *name) { @@ -401,9 +408,11 @@ static int nameidata_drop_rcu(struct nameidata *nd) { struct fs_struct *fs = current->fs; struct dentry *dentry = nd->path.dentry; + int want_root = 0; BUG_ON(!(nd->flags & LOOKUP_RCU)); - if (nd->root.mnt) { + if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) { + want_root = 1; spin_lock(&fs->lock); if (nd->root.mnt != fs->root.mnt || nd->root.dentry != fs->root.dentry) @@ -414,7 +423,7 @@ static int nameidata_drop_rcu(struct nameidata *nd) goto err; BUG_ON(nd->inode != dentry->d_inode); spin_unlock(&dentry->d_lock); - if (nd->root.mnt) { + if (want_root) { path_get(&nd->root); spin_unlock(&fs->lock); } @@ -427,7 +436,7 @@ static int nameidata_drop_rcu(struct nameidata *nd) err: spin_unlock(&dentry->d_lock); err_root: - if (nd->root.mnt) + if (want_root) spin_unlock(&fs->lock); return -ECHILD; } @@ -454,9 +463,11 @@ static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry { struct fs_struct *fs = current->fs; struct dentry *parent = nd->path.dentry; + int want_root = 0; BUG_ON(!(nd->flags & LOOKUP_RCU)); - if (nd->root.mnt) { + if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) { + want_root = 1; spin_lock(&fs->lock); if (nd->root.mnt != fs->root.mnt || nd->root.dentry != fs->root.dentry) @@ -476,7 +487,7 @@ static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry parent->d_count++; spin_unlock(&dentry->d_lock); spin_unlock(&parent->d_lock); - if (nd->root.mnt) { + if (want_root) { path_get(&nd->root); spin_unlock(&fs->lock); } @@ -490,7 +501,7 @@ err: spin_unlock(&dentry->d_lock); spin_unlock(&parent->d_lock); err_root: - if (nd->root.mnt) + if (want_root) spin_unlock(&fs->lock); return -ECHILD; } @@ -498,8 +509,16 @@ err_root: /* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */ static inline int nameidata_dentry_drop_rcu_maybe(struct nameidata *nd, struct dentry *dentry) { - if (nd->flags & LOOKUP_RCU) - return nameidata_dentry_drop_rcu(nd, dentry); + if (nd->flags & LOOKUP_RCU) { + if (unlikely(nameidata_dentry_drop_rcu(nd, dentry))) { + nd->flags &= ~LOOKUP_RCU; + if (!(nd->flags & LOOKUP_ROOT)) + nd->root.mnt = NULL; + rcu_read_unlock(); + br_read_unlock(vfsmount_lock); + return -ECHILD; + } + } return 0; } @@ -518,7 +537,8 @@ static int nameidata_drop_rcu_last(struct nameidata *nd) BUG_ON(!(nd->flags & LOOKUP_RCU)); nd->flags &= ~LOOKUP_RCU; - nd->root.mnt = NULL; + if (!(nd->flags & LOOKUP_ROOT)) + nd->root.mnt = NULL; spin_lock(&dentry->d_lock); if (!__d_rcu_to_refcount(dentry, nd->seq)) goto err_unlock; @@ -539,14 +559,6 @@ err_unlock: return -ECHILD; } -/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */ -static inline int nameidata_drop_rcu_last_maybe(struct nameidata *nd) -{ - if (likely(nd->flags & LOOKUP_RCU)) - return nameidata_drop_rcu_last(nd); - return 0; -} - /** * release_open_intent - free up open intent resources * @nd: pointer to nameidata @@ -590,42 +602,8 @@ do_revalidate(struct dentry *dentry, struct nameidata *nd) return dentry; } -static inline struct dentry * -do_revalidate_rcu(struct dentry *dentry, struct nameidata *nd) -{ - int status = d_revalidate(dentry, nd); - if (likely(status > 0)) - return dentry; - if (status == -ECHILD) { - if (nameidata_dentry_drop_rcu(nd, dentry)) - return ERR_PTR(-ECHILD); - return do_revalidate(dentry, nd); - } - if (status < 0) - return ERR_PTR(status); - /* Don't d_invalidate in rcu-walk mode */ - if (nameidata_dentry_drop_rcu(nd, dentry)) - return ERR_PTR(-ECHILD); - if (!d_invalidate(dentry)) { - dput(dentry); - dentry = NULL; - } - return dentry; -} - -static inline int need_reval_dot(struct dentry *dentry) -{ - if (likely(!(dentry->d_flags & DCACHE_OP_REVALIDATE))) - return 0; - - if (likely(!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT))) - return 0; - - return 1; -} - /* - * force_reval_path - force revalidation of a dentry + * handle_reval_path - force revalidation of a dentry * * In some situations the path walking code will trust dentries without * revalidating them. This causes problems for filesystems that depend on @@ -639,27 +617,28 @@ static inline int need_reval_dot(struct dentry *dentry) * invalidate the dentry. It's up to the caller to handle putting references * to the path if necessary. */ -static int -force_reval_path(struct path *path, struct nameidata *nd) +static inline int handle_reval_path(struct nameidata *nd) { + struct dentry *dentry = nd->path.dentry; int status; - struct dentry *dentry = path->dentry; - /* - * only check on filesystems where it's possible for the dentry to - * become stale. - */ - if (!need_reval_dot(dentry)) + if (likely(!(nd->flags & LOOKUP_JUMPED))) + return 0; + + if (likely(!(dentry->d_flags & DCACHE_OP_REVALIDATE))) + return 0; + + if (likely(!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT))) return 0; + /* Note: we do not d_invalidate() */ status = d_revalidate(dentry, nd); if (status > 0) return 0; - if (!status) { - d_invalidate(dentry); + if (!status) status = -ESTALE; - } + return status; } @@ -728,6 +707,7 @@ static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *l path_put(&nd->path); nd->path = nd->root; path_get(&nd->root); + nd->flags |= LOOKUP_JUMPED; } nd->inode = nd->path.dentry->d_inode; @@ -757,19 +737,42 @@ static inline void path_to_nameidata(const struct path *path, nd->path.dentry = path->dentry; } +static inline void put_link(struct nameidata *nd, struct path *link, void *cookie) +{ + struct inode *inode = link->dentry->d_inode; + if (!IS_ERR(cookie) && inode->i_op->put_link) + inode->i_op->put_link(link->dentry, nd, cookie); + path_put(link); +} + static __always_inline int -__do_follow_link(const struct path *link, struct nameidata *nd, void **p) +follow_link(struct path *link, struct nameidata *nd, void **p) { int error; struct dentry *dentry = link->dentry; BUG_ON(nd->flags & LOOKUP_RCU); + if (link->mnt == nd->path.mnt) + mntget(link->mnt); + + if (unlikely(current->total_link_count >= 40)) { + *p = ERR_PTR(-ELOOP); /* no ->put_link(), please */ + path_put(&nd->path); + return -ELOOP; + } + cond_resched(); + current->total_link_count++; + touch_atime(link->mnt, dentry); nd_set_link(nd, NULL); - if (link->mnt == nd->path.mnt) - mntget(link->mnt); + error = security_inode_follow_link(link->dentry, nd); + if (error) { + *p = ERR_PTR(error); /* no ->put_link(), please */ + path_put(&nd->path); + return error; + } nd->last_type = LAST_BIND; *p = dentry->d_inode->i_op->follow_link(dentry, nd); @@ -780,56 +783,18 @@ __do_follow_link(const struct path *link, struct nameidata *nd, void **p) if (s) error = __vfs_follow_link(nd, s); else if (nd->last_type == LAST_BIND) { - error = force_reval_path(&nd->path, nd); - if (error) + nd->flags |= LOOKUP_JUMPED; + nd->inode = nd->path.dentry->d_inode; + if (nd->inode->i_op->follow_link) { + /* stepped on a _really_ weird one */ path_put(&nd->path); + error = -ELOOP; + } } } return error; } -/* - * This limits recursive symlink follows to 8, while - * limiting consecutive symlinks to 40. - * - * Without that kind of total limit, nasty chains of consecutive - * symlinks can cause almost arbitrarily long lookups. - */ -static inline int do_follow_link(struct inode *inode, struct path *path, struct nameidata *nd) -{ - void *cookie; - int err = -ELOOP; - - /* We drop rcu-walk here */ - if (nameidata_dentry_drop_rcu_maybe(nd, path->dentry)) - return -ECHILD; - BUG_ON(inode != path->dentry->d_inode); - - if (current->link_count >= MAX_NESTED_LINKS) - goto loop; - if (current->total_link_count >= 40) - goto loop; - BUG_ON(nd->depth >= MAX_NESTED_LINKS); - cond_resched(); - err = security_inode_follow_link(path->dentry, nd); - if (err) - goto loop; - current->link_count++; - current->total_link_count++; - nd->depth++; - err = __do_follow_link(path, nd, &cookie); - if (!IS_ERR(cookie) && path->dentry->d_inode->i_op->put_link) - path->dentry->d_inode->i_op->put_link(path->dentry, nd, cookie); - path_put(path); - current->link_count--; - nd->depth--; - return err; -loop: - path_put_conditional(path, nd); - path_put(&nd->path); - return err; -} - static int follow_up_rcu(struct path *path) { struct vfsmount *parent; @@ -968,8 +933,7 @@ static int follow_managed(struct path *path, unsigned flags) if (managed & DCACHE_MANAGE_TRANSIT) { BUG_ON(!path->dentry->d_op); BUG_ON(!path->dentry->d_op->d_manage); - ret = path->dentry->d_op->d_manage(path->dentry, - false, false); + ret = path->dentry->d_op->d_manage(path->dentry, false); if (ret < 0) return ret == -EISDIR ? 0 : ret; } @@ -1034,7 +998,7 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, struct vfsmount *mounted; if (unlikely(path->dentry->d_flags & DCACHE_MANAGE_TRANSIT) && !reverse_transit && - path->dentry->d_op->d_manage(path->dentry, false, true) < 0) + path->dentry->d_op->d_manage(path->dentry, true) < 0) return false; mounted = __lookup_mnt(path->mnt, path->dentry, 1); if (!mounted) @@ -1068,7 +1032,7 @@ static int follow_dotdot_rcu(struct nameidata *nd) seq = read_seqcount_begin(&parent->d_seq); if (read_seqcount_retry(&old->d_seq, nd->seq)) - return -ECHILD; + goto failed; inode = parent->d_inode; nd->path.dentry = parent; nd->seq = seq; @@ -1081,8 +1045,15 @@ static int follow_dotdot_rcu(struct nameidata *nd) } __follow_mount_rcu(nd, &nd->path, &inode, true); nd->inode = inode; - return 0; + +failed: + nd->flags &= ~LOOKUP_RCU; + if (!(nd->flags & LOOKUP_ROOT)) + nd->root.mnt = NULL; + rcu_read_unlock(); + br_read_unlock(vfsmount_lock); + return -ECHILD; } /* @@ -1093,7 +1064,7 @@ static int follow_dotdot_rcu(struct nameidata *nd) * Care must be taken as namespace_sem may be held (indicated by mounting_here * being true). */ -int follow_down(struct path *path, bool mounting_here) +int follow_down(struct path *path) { unsigned managed; int ret; @@ -1114,7 +1085,7 @@ int follow_down(struct path *path, bool mounting_here) BUG_ON(!path->dentry->d_op); BUG_ON(!path->dentry->d_op->d_manage); ret = path->dentry->d_op->d_manage( - path->dentry, mounting_here, false); + path->dentry, false); if (ret < 0) return ret == -EISDIR ? 0 : ret; } @@ -1216,68 +1187,85 @@ static int do_lookup(struct nameidata *nd, struct qstr *name, { struct vfsmount *mnt = nd->path.mnt; struct dentry *dentry, *parent = nd->path.dentry; - struct inode *dir; + int need_reval = 1; + int status = 1; int err; /* - * See if the low-level filesystem might want - * to use its own hash.. - */ - if (unlikely(parent->d_flags & DCACHE_OP_HASH)) { - err = parent->d_op->d_hash(parent, nd->inode, name); - if (err < 0) - return err; - } - - /* * Rename seqlock is not required here because in the off chance * of a false negative due to a concurrent rename, we're going to * do the non-racy lookup, below. */ if (nd->flags & LOOKUP_RCU) { unsigned seq; - *inode = nd->inode; dentry = __d_lookup_rcu(parent, name, &seq, inode); - if (!dentry) { - if (nameidata_drop_rcu(nd)) - return -ECHILD; - goto need_lookup; - } + if (!dentry) + goto unlazy; + /* Memory barrier in read_seqcount_begin of child is enough */ if (__read_seqcount_retry(&parent->d_seq, nd->seq)) return -ECHILD; - nd->seq = seq; + if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) { - dentry = do_revalidate_rcu(dentry, nd); - if (!dentry) - goto need_lookup; - if (IS_ERR(dentry)) - goto fail; - if (!(nd->flags & LOOKUP_RCU)) - goto done; + status = d_revalidate(dentry, nd); + if (unlikely(status <= 0)) { + if (status != -ECHILD) + need_reval = 0; + goto unlazy; + } } path->mnt = mnt; path->dentry = dentry; if (likely(__follow_mount_rcu(nd, path, inode, false))) return 0; - if (nameidata_drop_rcu(nd)) - return -ECHILD; - /* fallthru */ +unlazy: + if (dentry) { + if (nameidata_dentry_drop_rcu(nd, dentry)) + return -ECHILD; + } else { + if (nameidata_drop_rcu(nd)) + return -ECHILD; + } + } else { + dentry = __d_lookup(parent, name); } - dentry = __d_lookup(parent, name); - if (!dentry) - goto need_lookup; -found: - if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) { - dentry = do_revalidate(dentry, nd); - if (!dentry) - goto need_lookup; - if (IS_ERR(dentry)) - goto fail; + +retry: + if (unlikely(!dentry)) { + struct inode *dir = parent->d_inode; + BUG_ON(nd->inode != dir); + + mutex_lock(&dir->i_mutex); + dentry = d_lookup(parent, name); + if (likely(!dentry)) { + dentry = d_alloc_and_lookup(parent, name, nd); + if (IS_ERR(dentry)) { + mutex_unlock(&dir->i_mutex); + return PTR_ERR(dentry); + } + /* known good */ + need_reval = 0; + status = 1; + } + mutex_unlock(&dir->i_mutex); } -done: + if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE) && need_reval) + status = d_revalidate(dentry, nd); + if (unlikely(status <= 0)) { + if (status < 0) { + dput(dentry); + return status; + } + if (!d_invalidate(dentry)) { + dput(dentry); + dentry = NULL; + need_reval = 1; + goto retry; + } + } + path->mnt = mnt; path->dentry = dentry; err = follow_managed(path, nd->flags); @@ -1287,39 +1275,113 @@ done: } *inode = path->dentry->d_inode; return 0; +} + +static inline int may_lookup(struct nameidata *nd) +{ + if (nd->flags & LOOKUP_RCU) { + int err = exec_permission(nd->inode, IPERM_FLAG_RCU); + if (err != -ECHILD) + return err; + if (nameidata_drop_rcu(nd)) + return -ECHILD; + } + return exec_permission(nd->inode, 0); +} -need_lookup: - dir = parent->d_inode; - BUG_ON(nd->inode != dir); +static inline int handle_dots(struct nameidata *nd, int type) +{ + if (type == LAST_DOTDOT) { + if (nd->flags & LOOKUP_RCU) { + if (follow_dotdot_rcu(nd)) + return -ECHILD; + } else + follow_dotdot(nd); + } + return 0; +} - mutex_lock(&dir->i_mutex); - /* - * First re-do the cached lookup just in case it was created - * while we waited for the directory semaphore, or the first - * lookup failed due to an unrelated rename. - * - * This could use version numbering or similar to avoid unnecessary - * cache lookups, but then we'd have to do the first lookup in the - * non-racy way. However in the common case here, everything should - * be hot in cache, so would it be a big win? - */ - dentry = d_lookup(parent, name); - if (likely(!dentry)) { - dentry = d_alloc_and_lookup(parent, name, nd); - mutex_unlock(&dir->i_mutex); - if (IS_ERR(dentry)) - goto fail; - goto done; +static void terminate_walk(struct nameidata *nd) +{ + if (!(nd->flags & LOOKUP_RCU)) { + path_put(&nd->path); + } else { + nd->flags &= ~LOOKUP_RCU; + if (!(nd->flags & LOOKUP_ROOT)) + nd->root.mnt = NULL; + rcu_read_unlock(); + br_read_unlock(vfsmount_lock); } +} + +static inline int walk_component(struct nameidata *nd, struct path *path, + struct qstr *name, int type, int follow) +{ + struct inode *inode; + int err; /* - * Uhhuh! Nasty case: the cache was re-populated while - * we waited on the semaphore. Need to revalidate. + * "." and ".." are special - ".." especially so because it has + * to be able to know about the current root directory and + * parent relationships. */ - mutex_unlock(&dir->i_mutex); - goto found; + if (unlikely(type != LAST_NORM)) + return handle_dots(nd, type); + err = do_lookup(nd, name, path, &inode); + if (unlikely(err)) { + terminate_walk(nd); + return err; + } + if (!inode) { + path_to_nameidata(path, nd); + terminate_walk(nd); + return -ENOENT; + } + if (unlikely(inode->i_op->follow_link) && follow) { + if (nameidata_dentry_drop_rcu_maybe(nd, path->dentry)) + return -ECHILD; + BUG_ON(inode != path->dentry->d_inode); + return 1; + } + path_to_nameidata(path, nd); + nd->inode = inode; + return 0; +} -fail: - return PTR_ERR(dentry); +/* + * This limits recursive symlink follows to 8, while + * limiting consecutive symlinks to 40. + * + * Without that kind of total limit, nasty chains of consecutive + * symlinks can cause almost arbitrarily long lookups. + */ +static inline int nested_symlink(struct path *path, struct nameidata *nd) +{ + int res; + + BUG_ON(nd->depth >= MAX_NESTED_LINKS); + if (unlikely(current->link_count >= MAX_NESTED_LINKS)) { + path_put_conditional(path, nd); + path_put(&nd->path); + return -ELOOP; + } + + nd->depth++; + current->link_count++; + + do { + struct path link = *path; + void *cookie; + + res = follow_link(&link, nd, &cookie); + if (!res) + res = walk_component(nd, path, &nd->last, + nd->last_type, LOOKUP_FOLLOW); + put_link(nd, &link, cookie); + } while (res > 0); + + current->link_count--; + nd->depth--; + return res; } /* @@ -1339,30 +1401,18 @@ static int link_path_walk(const char *name, struct nameidata *nd) while (*name=='/') name++; if (!*name) - goto return_reval; - - if (nd->depth) - lookup_flags = LOOKUP_FOLLOW | (nd->flags & LOOKUP_CONTINUE); + return 0; /* At this point we know we have a real path component. */ for(;;) { - struct inode *inode; unsigned long hash; struct qstr this; unsigned int c; + int type; nd->flags |= LOOKUP_CONTINUE; - if (nd->flags & LOOKUP_RCU) { - err = exec_permission(nd->inode, IPERM_FLAG_RCU); - if (err == -ECHILD) { - if (nameidata_drop_rcu(nd)) - return -ECHILD; - goto exec_again; - } - } else { -exec_again: - err = exec_permission(nd->inode, 0); - } + + err = may_lookup(nd); if (err) break; @@ -1378,52 +1428,43 @@ exec_again: this.len = name - (const char *) this.name; this.hash = end_name_hash(hash); + type = LAST_NORM; + if (this.name[0] == '.') switch (this.len) { + case 2: + if (this.name[1] == '.') { + type = LAST_DOTDOT; + nd->flags |= LOOKUP_JUMPED; + } + break; + case 1: + type = LAST_DOT; + } + if (likely(type == LAST_NORM)) { + struct dentry *parent = nd->path.dentry; + nd->flags &= ~LOOKUP_JUMPED; + if (unlikely(parent->d_flags & DCACHE_OP_HASH)) { + err = parent->d_op->d_hash(parent, nd->inode, + &this); + if (err < 0) + break; + } + } + /* remove trailing slashes? */ if (!c) goto last_component; while (*++name == '/'); if (!*name) - goto last_with_slashes; + goto last_component; - /* - * "." and ".." are special - ".." especially so because it has - * to be able to know about the current root directory and - * parent relationships. - */ - if (this.name[0] == '.') switch (this.len) { - default: - break; - case 2: - if (this.name[1] != '.') - break; - if (nd->flags & LOOKUP_RCU) { - if (follow_dotdot_rcu(nd)) - return -ECHILD; - } else - follow_dotdot(nd); - /* fallthrough */ - case 1: - continue; - } - /* This does the actual lookups.. */ - err = do_lookup(nd, &this, &next, &inode); - if (err) - break; - err = -ENOENT; - if (!inode) - goto out_dput; + err = walk_component(nd, &next, &this, type, LOOKUP_FOLLOW); + if (err < 0) + return err; - if (inode->i_op->follow_link) { - err = do_follow_link(inode, &next, nd); + if (err) { + err = nested_symlink(&next, nd); if (err) - goto return_err; - nd->inode = nd->path.dentry->d_inode; - err = -ENOENT; - if (!nd->inode) - break; - } else { - path_to_nameidata(&next, nd); - nd->inode = inode; + return err; } err = -ENOTDIR; if (!nd->inode->i_op->lookup) @@ -1431,209 +1472,109 @@ exec_again: continue; /* here ends the main loop */ -last_with_slashes: - lookup_flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; last_component: /* Clear LOOKUP_CONTINUE iff it was previously unset */ nd->flags &= lookup_flags | ~LOOKUP_CONTINUE; - if (lookup_flags & LOOKUP_PARENT) - goto lookup_parent; - if (this.name[0] == '.') switch (this.len) { - default: - break; - case 2: - if (this.name[1] != '.') - break; - if (nd->flags & LOOKUP_RCU) { - if (follow_dotdot_rcu(nd)) - return -ECHILD; - } else - follow_dotdot(nd); - /* fallthrough */ - case 1: - goto return_reval; - } - err = do_lookup(nd, &this, &next, &inode); - if (err) - break; - if (inode && unlikely(inode->i_op->follow_link) && - (lookup_flags & LOOKUP_FOLLOW)) { - err = do_follow_link(inode, &next, nd); - if (err) - goto return_err; - nd->inode = nd->path.dentry->d_inode; - } else { - path_to_nameidata(&next, nd); - nd->inode = inode; - } - err = -ENOENT; - if (!nd->inode) - break; - if (lookup_flags & LOOKUP_DIRECTORY) { - err = -ENOTDIR; - if (!nd->inode->i_op->lookup) - break; - } - goto return_base; -lookup_parent: nd->last = this; - nd->last_type = LAST_NORM; - if (this.name[0] != '.') - goto return_base; - if (this.len == 1) - nd->last_type = LAST_DOT; - else if (this.len == 2 && this.name[1] == '.') - nd->last_type = LAST_DOTDOT; - else - goto return_base; -return_reval: - /* - * We bypassed the ordinary revalidation routines. - * We may need to check the cached dentry for staleness. - */ - if (need_reval_dot(nd->path.dentry)) { - if (nameidata_drop_rcu_last_maybe(nd)) - return -ECHILD; - /* Note: we do not d_invalidate() */ - err = d_revalidate(nd->path.dentry, nd); - if (!err) - err = -ESTALE; - if (err < 0) - break; - return 0; - } -return_base: - if (nameidata_drop_rcu_last_maybe(nd)) - return -ECHILD; + nd->last_type = type; return 0; -out_dput: - if (!(nd->flags & LOOKUP_RCU)) - path_put_conditional(&next, nd); - break; } - if (!(nd->flags & LOOKUP_RCU)) - path_put(&nd->path); -return_err: + terminate_walk(nd); return err; } -static inline int path_walk_rcu(const char *name, struct nameidata *nd) -{ - current->total_link_count = 0; - - return link_path_walk(name, nd); -} - -static inline int path_walk_simple(const char *name, struct nameidata *nd) -{ - current->total_link_count = 0; - - return link_path_walk(name, nd); -} - -static int path_walk(const char *name, struct nameidata *nd) -{ - struct path save = nd->path; - int result; - - current->total_link_count = 0; - - /* make sure the stuff we saved doesn't go away */ - path_get(&save); - - result = link_path_walk(name, nd); - if (result == -ESTALE) { - /* nd->path had been dropped */ - current->total_link_count = 0; - nd->path = save; - path_get(&nd->path); - nd->flags |= LOOKUP_REVAL; - result = link_path_walk(name, nd); - } - - path_put(&save); - - return result; -} - -static void path_finish_rcu(struct nameidata *nd) -{ - if (nd->flags & LOOKUP_RCU) { - /* RCU dangling. Cancel it. */ - nd->flags &= ~LOOKUP_RCU; - nd->root.mnt = NULL; - rcu_read_unlock(); - br_read_unlock(vfsmount_lock); - } - if (nd->file) - fput(nd->file); -} - -static int path_init_rcu(int dfd, const char *name, unsigned int flags, struct nameidata *nd) +static int path_init(int dfd, const char *name, unsigned int flags, + struct nameidata *nd, struct file **fp) { int retval = 0; int fput_needed; struct file *file; nd->last_type = LAST_ROOT; /* if there are only slashes... */ - nd->flags = flags | LOOKUP_RCU; + nd->flags = flags | LOOKUP_JUMPED; nd->depth = 0; + if (flags & LOOKUP_ROOT) { + struct inode *inode = nd->root.dentry->d_inode; + if (*name) { + if (!inode->i_op->lookup) + return -ENOTDIR; + retval = inode_permission(inode, MAY_EXEC); + if (retval) + return retval; + } + nd->path = nd->root; + nd->inode = inode; + if (flags & LOOKUP_RCU) { + br_read_lock(vfsmount_lock); + rcu_read_lock(); + nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); + } else { + path_get(&nd->path); + } + return 0; + } + nd->root.mnt = NULL; - nd->file = NULL; if (*name=='/') { - struct fs_struct *fs = current->fs; - unsigned seq; - - br_read_lock(vfsmount_lock); - rcu_read_lock(); - - do { - seq = read_seqcount_begin(&fs->seq); - nd->root = fs->root; - nd->path = nd->root; - nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); - } while (read_seqcount_retry(&fs->seq, seq)); - + if (flags & LOOKUP_RCU) { + br_read_lock(vfsmount_lock); + rcu_read_lock(); + set_root_rcu(nd); + } else { + set_root(nd); + path_get(&nd->root); + } + nd->path = nd->root; } else if (dfd == AT_FDCWD) { - struct fs_struct *fs = current->fs; - unsigned seq; - - br_read_lock(vfsmount_lock); - rcu_read_lock(); + if (flags & LOOKUP_RCU) { + struct fs_struct *fs = current->fs; + unsigned seq; - do { - seq = read_seqcount_begin(&fs->seq); - nd->path = fs->pwd; - nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); - } while (read_seqcount_retry(&fs->seq, seq)); + br_read_lock(vfsmount_lock); + rcu_read_lock(); + do { + seq = read_seqcount_begin(&fs->seq); + nd->path = fs->pwd; + nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); + } while (read_seqcount_retry(&fs->seq, seq)); + } else { + get_fs_pwd(current->fs, &nd->path); + } } else { struct dentry *dentry; - file = fget_light(dfd, &fput_needed); + file = fget_raw_light(dfd, &fput_needed); retval = -EBADF; if (!file) goto out_fail; dentry = file->f_path.dentry; - retval = -ENOTDIR; - if (!S_ISDIR(dentry->d_inode->i_mode)) - goto fput_fail; + if (*name) { + retval = -ENOTDIR; + if (!S_ISDIR(dentry->d_inode->i_mode)) + goto fput_fail; - retval = file_permission(file, MAY_EXEC); - if (retval) - goto fput_fail; + retval = file_permission(file, MAY_EXEC); + if (retval) + goto fput_fail; + } nd->path = file->f_path; - if (fput_needed) - nd->file = file; - - nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); - br_read_lock(vfsmount_lock); - rcu_read_lock(); + if (flags & LOOKUP_RCU) { + if (fput_needed) + *fp = file; + nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); + br_read_lock(vfsmount_lock); + rcu_read_lock(); + } else { + path_get(&file->f_path); + fput_light(file, fput_needed); + } } + nd->inode = nd->path.dentry->d_inode; return 0; @@ -1643,60 +1584,23 @@ out_fail: return retval; } -static int path_init(int dfd, const char *name, unsigned int flags, struct nameidata *nd) +static inline int lookup_last(struct nameidata *nd, struct path *path) { - int retval = 0; - int fput_needed; - struct file *file; - - nd->last_type = LAST_ROOT; /* if there are only slashes... */ - nd->flags = flags; - nd->depth = 0; - nd->root.mnt = NULL; - - if (*name=='/') { - set_root(nd); - nd->path = nd->root; - path_get(&nd->root); - } else if (dfd == AT_FDCWD) { - get_fs_pwd(current->fs, &nd->path); - } else { - struct dentry *dentry; - - file = fget_light(dfd, &fput_needed); - retval = -EBADF; - if (!file) - goto out_fail; - - dentry = file->f_path.dentry; - - retval = -ENOTDIR; - if (!S_ISDIR(dentry->d_inode->i_mode)) - goto fput_fail; + if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len]) + nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; - retval = file_permission(file, MAY_EXEC); - if (retval) - goto fput_fail; - - nd->path = file->f_path; - path_get(&file->f_path); - - fput_light(file, fput_needed); - } - nd->inode = nd->path.dentry->d_inode; - return 0; - -fput_fail: - fput_light(file, fput_needed); -out_fail: - return retval; + nd->flags &= ~LOOKUP_PARENT; + return walk_component(nd, path, &nd->last, nd->last_type, + nd->flags & LOOKUP_FOLLOW); } /* Returns 0 and nd will be valid on success; Retuns error, otherwise. */ -static int do_path_lookup(int dfd, const char *name, +static int path_lookupat(int dfd, const char *name, unsigned int flags, struct nameidata *nd) { - int retval; + struct file *base = NULL; + struct path path; + int err; /* * Path walking is largely split up into 2 different synchronisation @@ -1712,44 +1616,75 @@ static int do_path_lookup(int dfd, const char *name, * be handled by restarting a traditional ref-walk (which will always * be able to complete). */ - retval = path_init_rcu(dfd, name, flags, nd); - if (unlikely(retval)) - return retval; - retval = path_walk_rcu(name, nd); - path_finish_rcu(nd); - if (nd->root.mnt) { - path_put(&nd->root); - nd->root.mnt = NULL; + err = path_init(dfd, name, flags | LOOKUP_PARENT, nd, &base); + + if (unlikely(err)) + return err; + + current->total_link_count = 0; + err = link_path_walk(name, nd); + + if (!err && !(flags & LOOKUP_PARENT)) { + err = lookup_last(nd, &path); + while (err > 0) { + void *cookie; + struct path link = path; + nd->flags |= LOOKUP_PARENT; + err = follow_link(&link, nd, &cookie); + if (!err) + err = lookup_last(nd, &path); + put_link(nd, &link, cookie); + } } - if (unlikely(retval == -ECHILD || retval == -ESTALE)) { - /* slower, locked walk */ - if (retval == -ESTALE) - flags |= LOOKUP_REVAL; - retval = path_init(dfd, name, flags, nd); - if (unlikely(retval)) - return retval; - retval = path_walk(name, nd); - if (nd->root.mnt) { - path_put(&nd->root); - nd->root.mnt = NULL; + if (nd->flags & LOOKUP_RCU) { + /* went all way through without dropping RCU */ + BUG_ON(err); + if (nameidata_drop_rcu_last(nd)) + err = -ECHILD; + } + + if (!err) + err = handle_reval_path(nd); + + if (!err && nd->flags & LOOKUP_DIRECTORY) { + if (!nd->inode->i_op->lookup) { + path_put(&nd->path); + return -ENOTDIR; } } + if (base) + fput(base); + + if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) { + path_put(&nd->root); + nd->root.mnt = NULL; + } + return err; +} + +static int do_path_lookup(int dfd, const char *name, + unsigned int flags, struct nameidata *nd) +{ + int retval = path_lookupat(dfd, name, flags | LOOKUP_RCU, nd); + if (unlikely(retval == -ECHILD)) + retval = path_lookupat(dfd, name, flags, nd); + if (unlikely(retval == -ESTALE)) + retval = path_lookupat(dfd, name, flags | LOOKUP_REVAL, nd); + if (likely(!retval)) { if (unlikely(!audit_dummy_context())) { if (nd->path.dentry && nd->inode) audit_inode(name, nd->path.dentry); } } - return retval; } -int path_lookup(const char *name, unsigned int flags, - struct nameidata *nd) +int kern_path_parent(const char *name, struct nameidata *nd) { - return do_path_lookup(AT_FDCWD, name, flags, nd); + return do_path_lookup(AT_FDCWD, name, LOOKUP_PARENT, nd); } int kern_path(const char *name, unsigned int flags, struct path *path) @@ -1773,29 +1708,10 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, const char *name, unsigned int flags, struct nameidata *nd) { - int retval; - - /* same as do_path_lookup */ - nd->last_type = LAST_ROOT; - nd->flags = flags; - nd->depth = 0; - - nd->path.dentry = dentry; - nd->path.mnt = mnt; - path_get(&nd->path); - nd->root = nd->path; - path_get(&nd->root); - nd->inode = nd->path.dentry->d_inode; - - retval = path_walk(name, nd); - if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry && - nd->inode)) - audit_inode(name, nd->path.dentry); - - path_put(&nd->root); - nd->root.mnt = NULL; - - return retval; + nd->root.dentry = dentry; + nd->root.mnt = mnt; + /* the first argument of do_path_lookup() is ignored with LOOKUP_ROOT */ + return do_path_lookup(AT_FDCWD, name, flags | LOOKUP_ROOT, nd); } static struct dentry *__lookup_hash(struct qstr *name, @@ -1810,17 +1726,6 @@ static struct dentry *__lookup_hash(struct qstr *name, return ERR_PTR(err); /* - * See if the low-level filesystem might want - * to use its own hash.. - */ - if (base->d_flags & DCACHE_OP_HASH) { - err = base->d_op->d_hash(base, inode, name); - dentry = ERR_PTR(err); - if (err < 0) - goto out; - } - - /* * Don't bother with __d_lookup: callers are for creat as * well as unlink, so a lot of the time it would cost * a double lookup. @@ -1832,7 +1737,7 @@ static struct dentry *__lookup_hash(struct qstr *name, if (!dentry) dentry = d_alloc_and_lookup(base, name, nd); -out: + return dentry; } @@ -1846,28 +1751,6 @@ static struct dentry *lookup_hash(struct nameidata *nd) return __lookup_hash(&nd->last, nd->path.dentry, nd); } -static int __lookup_one_len(const char *name, struct qstr *this, - struct dentry *base, int len) -{ - unsigned long hash; - unsigned int c; - - this->name = name; - this->len = len; - if (!len) - return -EACCES; - - hash = init_name_hash(); - while (len--) { - c = *(const unsigned char *)name++; - if (c == '/' || c == '\0') - return -EACCES; - hash = partial_name_hash(c, hash); - } - this->hash = end_name_hash(hash); - return 0; -} - /** * lookup_one_len - filesystem helper to lookup single pathname component * @name: pathname component to lookup @@ -1881,14 +1764,34 @@ static int __lookup_one_len(const char *name, struct qstr *this, */ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) { - int err; struct qstr this; + unsigned long hash; + unsigned int c; WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex)); - err = __lookup_one_len(name, &this, base, len); - if (err) - return ERR_PTR(err); + this.name = name; + this.len = len; + if (!len) + return ERR_PTR(-EACCES); + + hash = init_name_hash(); + while (len--) { + c = *(const unsigned char *)name++; + if (c == '/' || c == '\0') + return ERR_PTR(-EACCES); + hash = partial_name_hash(c, hash); + } + this.hash = end_name_hash(hash); + /* + * See if the low-level filesystem might want + * to use its own hash.. + */ + if (base->d_flags & DCACHE_OP_HASH) { + int err = base->d_op->d_hash(base, base->d_inode, &this); + if (err < 0) + return ERR_PTR(err); + } return __lookup_hash(&this, base, NULL); } @@ -1897,7 +1800,7 @@ int user_path_at(int dfd, const char __user *name, unsigned flags, struct path *path) { struct nameidata nd; - char *tmp = getname(name); + char *tmp = getname_flags(name, flags); int err = PTR_ERR(tmp); if (!IS_ERR(tmp)) { @@ -2077,12 +1980,16 @@ int vfs_create(struct inode *dir, struct dentry *dentry, int mode, return error; } -int may_open(struct path *path, int acc_mode, int flag) +static int may_open(struct path *path, int acc_mode, int flag) { struct dentry *dentry = path->dentry; struct inode *inode = dentry->d_inode; int error; + /* O_PATH? */ + if (!acc_mode) + return 0; + if (!inode) return -ENOENT; @@ -2151,34 +2058,6 @@ static int handle_truncate(struct file *filp) } /* - * Be careful about ever adding any more callers of this - * function. Its flags must be in the namei format, not - * what get passed to sys_open(). - */ -static int __open_namei_create(struct nameidata *nd, struct path *path, - int open_flag, int mode) -{ - int error; - struct dentry *dir = nd->path.dentry; - - if (!IS_POSIXACL(dir->d_inode)) - mode &= ~current_umask(); - error = security_path_mknod(&nd->path, path->dentry, mode, 0); - if (error) - goto out_unlock; - error = vfs_create(dir->d_inode, path->dentry, mode, nd); -out_unlock: - mutex_unlock(&dir->d_inode->i_mutex); - dput(nd->path.dentry); - nd->path.dentry = path->dentry; - - if (error) - return error; - /* Don't check for write permission, don't truncate */ - return may_open(&nd->path, 0, open_flag & ~O_TRUNC); -} - -/* * Note that while the flag value (low two bits) for sys_open means: * 00 - read-only * 01 - write-only @@ -2202,126 +2081,115 @@ static inline int open_to_namei_flags(int flag) return flag; } -static int open_will_truncate(int flag, struct inode *inode) -{ - /* - * We'll never write to the fs underlying - * a device file. - */ - if (special_file(inode->i_mode)) - return 0; - return (flag & O_TRUNC); -} - -static struct file *finish_open(struct nameidata *nd, - int open_flag, int acc_mode) -{ - struct file *filp; - int will_truncate; - int error; - - will_truncate = open_will_truncate(open_flag, nd->path.dentry->d_inode); - if (will_truncate) { - error = mnt_want_write(nd->path.mnt); - if (error) - goto exit; - } - error = may_open(&nd->path, acc_mode, open_flag); - if (error) { - if (will_truncate) - mnt_drop_write(nd->path.mnt); - goto exit; - } - filp = nameidata_to_filp(nd); - if (!IS_ERR(filp)) { - error = ima_file_check(filp, acc_mode); - if (error) { - fput(filp); - filp = ERR_PTR(error); - } - } - if (!IS_ERR(filp)) { - if (will_truncate) { - error = handle_truncate(filp); - if (error) { - fput(filp); - filp = ERR_PTR(error); - } - } - } - /* - * It is now safe to drop the mnt write - * because the filp has had a write taken - * on its behalf. - */ - if (will_truncate) - mnt_drop_write(nd->path.mnt); - path_put(&nd->path); - return filp; - -exit: - path_put(&nd->path); - return ERR_PTR(error); -} - /* - * Handle O_CREAT case for do_filp_open + * Handle the last step of open() */ static struct file *do_last(struct nameidata *nd, struct path *path, - int open_flag, int acc_mode, - int mode, const char *pathname) + const struct open_flags *op, const char *pathname) { struct dentry *dir = nd->path.dentry; + struct dentry *dentry; + int open_flag = op->open_flag; + int will_truncate = open_flag & O_TRUNC; + int want_write = 0; + int acc_mode = op->acc_mode; struct file *filp; - int error = -EISDIR; + int error; + + nd->flags &= ~LOOKUP_PARENT; + nd->flags |= op->intent; switch (nd->last_type) { case LAST_DOTDOT: - follow_dotdot(nd); - dir = nd->path.dentry; case LAST_DOT: - if (need_reval_dot(dir)) { - int status = d_revalidate(nd->path.dentry, nd); - if (!status) - status = -ESTALE; - if (status < 0) { - error = status; - goto exit; - } - } + error = handle_dots(nd, nd->last_type); + if (error) + return ERR_PTR(error); /* fallthrough */ case LAST_ROOT: - goto exit; + if (nd->flags & LOOKUP_RCU) { + if (nameidata_drop_rcu_last(nd)) + return ERR_PTR(-ECHILD); + } + error = handle_reval_path(nd); + if (error) + goto exit; + audit_inode(pathname, nd->path.dentry); + if (open_flag & O_CREAT) { + error = -EISDIR; + goto exit; + } + goto ok; case LAST_BIND: + /* can't be RCU mode here */ + error = handle_reval_path(nd); + if (error) + goto exit; audit_inode(pathname, dir); goto ok; } + if (!(open_flag & O_CREAT)) { + int symlink_ok = 0; + if (nd->last.name[nd->last.len]) + nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; + if (open_flag & O_PATH && !(nd->flags & LOOKUP_FOLLOW)) + symlink_ok = 1; + /* we _can_ be in RCU mode here */ + error = walk_component(nd, path, &nd->last, LAST_NORM, + !symlink_ok); + if (error < 0) + return ERR_PTR(error); + if (error) /* symlink */ + return NULL; + /* sayonara */ + if (nd->flags & LOOKUP_RCU) { + if (nameidata_drop_rcu_last(nd)) + return ERR_PTR(-ECHILD); + } + + error = -ENOTDIR; + if (nd->flags & LOOKUP_DIRECTORY) { + if (!nd->inode->i_op->lookup) + goto exit; + } + audit_inode(pathname, nd->path.dentry); + goto ok; + } + + /* create side of things */ + + if (nd->flags & LOOKUP_RCU) { + if (nameidata_drop_rcu_last(nd)) + return ERR_PTR(-ECHILD); + } + + audit_inode(pathname, dir); + error = -EISDIR; /* trailing slashes? */ if (nd->last.name[nd->last.len]) goto exit; mutex_lock(&dir->d_inode->i_mutex); - path->dentry = lookup_hash(nd); - path->mnt = nd->path.mnt; - - error = PTR_ERR(path->dentry); - if (IS_ERR(path->dentry)) { + dentry = lookup_hash(nd); + error = PTR_ERR(dentry); + if (IS_ERR(dentry)) { mutex_unlock(&dir->d_inode->i_mutex); goto exit; } - if (IS_ERR(nd->intent.open.file)) { - error = PTR_ERR(nd->intent.open.file); - goto exit_mutex_unlock; - } + path->dentry = dentry; + path->mnt = nd->path.mnt; /* Negative dentry, just create the file */ - if (!path->dentry->d_inode) { + if (!dentry->d_inode) { + int mode = op->mode; + if (!IS_POSIXACL(dir->d_inode)) + mode &= ~current_umask(); /* * This write is needed to ensure that a - * ro->rw transition does not occur between + * rw->ro transition does not occur between * the time when the file is created and when * a permanent write count is taken through * the 'struct file' in nameidata_to_filp(). @@ -2329,22 +2197,21 @@ static struct file *do_last(struct nameidata *nd, struct path *path, error = mnt_want_write(nd->path.mnt); if (error) goto exit_mutex_unlock; - error = __open_namei_create(nd, path, open_flag, mode); - if (error) { - mnt_drop_write(nd->path.mnt); - goto exit; - } - filp = nameidata_to_filp(nd); - mnt_drop_write(nd->path.mnt); - path_put(&nd->path); - if (!IS_ERR(filp)) { - error = ima_file_check(filp, acc_mode); - if (error) { - fput(filp); - filp = ERR_PTR(error); - } - } - return filp; + want_write = 1; + /* Don't check for write permission, don't truncate */ + open_flag &= ~O_TRUNC; + will_truncate = 0; + acc_mode = MAY_OPEN; + error = security_path_mknod(&nd->path, dentry, mode, 0); + if (error) + goto exit_mutex_unlock; + error = vfs_create(dir->d_inode, dentry, mode, nd); + if (error) + goto exit_mutex_unlock; + mutex_unlock(&dir->d_inode->i_mutex); + dput(nd->path.dentry); + nd->path.dentry = dentry; + goto common; } /* @@ -2374,7 +2241,40 @@ static struct file *do_last(struct nameidata *nd, struct path *path, if (S_ISDIR(nd->inode->i_mode)) goto exit; ok: - filp = finish_open(nd, open_flag, acc_mode); + if (!S_ISREG(nd->inode->i_mode)) + will_truncate = 0; + + if (will_truncate) { + error = mnt_want_write(nd->path.mnt); + if (error) + goto exit; + want_write = 1; + } +common: + error = may_open(&nd->path, acc_mode, open_flag); + if (error) + goto exit; + filp = nameidata_to_filp(nd); + if (!IS_ERR(filp)) { + error = ima_file_check(filp, op->acc_mode); + if (error) { + fput(filp); + filp = ERR_PTR(error); + } + } + if (!IS_ERR(filp)) { + if (will_truncate) { + error = handle_truncate(filp); + if (error) { + fput(filp); + filp = ERR_PTR(error); + } + } + } +out: + if (want_write) + mnt_drop_write(nd->path.mnt); + path_put(&nd->path); return filp; exit_mutex_unlock: @@ -2382,197 +2282,103 @@ exit_mutex_unlock: exit_dput: path_put_conditional(path, nd); exit: - path_put(&nd->path); - return ERR_PTR(error); + filp = ERR_PTR(error); + goto out; } -/* - * Note that the low bits of the passed in "open_flag" - * are not the same as in the local variable "flag". See - * open_to_namei_flags() for more details. - */ -struct file *do_filp_open(int dfd, const char *pathname, - int open_flag, int mode, int acc_mode) +static struct file *path_openat(int dfd, const char *pathname, + struct nameidata *nd, const struct open_flags *op, int flags) { + struct file *base = NULL; struct file *filp; - struct nameidata nd; - int error; struct path path; - int count = 0; - int flag = open_to_namei_flags(open_flag); - int flags; - - if (!(open_flag & O_CREAT)) - mode = 0; - - /* Must never be set by userspace */ - open_flag &= ~FMODE_NONOTIFY; - - /* - * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only - * check for O_DSYNC if the need any syncing at all we enforce it's - * always set instead of having to deal with possibly weird behaviour - * for malicious applications setting only __O_SYNC. - */ - if (open_flag & __O_SYNC) - open_flag |= O_DSYNC; - - if (!acc_mode) - acc_mode = MAY_OPEN | ACC_MODE(open_flag); - - /* O_TRUNC implies we need access checks for write permissions */ - if (open_flag & O_TRUNC) - acc_mode |= MAY_WRITE; - - /* Allow the LSM permission hook to distinguish append - access from general write access. */ - if (open_flag & O_APPEND) - acc_mode |= MAY_APPEND; - - flags = LOOKUP_OPEN; - if (open_flag & O_CREAT) { - flags |= LOOKUP_CREATE; - if (open_flag & O_EXCL) - flags |= LOOKUP_EXCL; - } - if (open_flag & O_DIRECTORY) - flags |= LOOKUP_DIRECTORY; - if (!(open_flag & O_NOFOLLOW)) - flags |= LOOKUP_FOLLOW; + int error; filp = get_empty_filp(); if (!filp) return ERR_PTR(-ENFILE); - filp->f_flags = open_flag; - nd.intent.open.file = filp; - nd.intent.open.flags = flag; - nd.intent.open.create_mode = mode; - - if (open_flag & O_CREAT) - goto creat; + filp->f_flags = op->open_flag; + nd->intent.open.file = filp; + nd->intent.open.flags = open_to_namei_flags(op->open_flag); + nd->intent.open.create_mode = op->mode; - /* !O_CREAT, simple open */ - error = do_path_lookup(dfd, pathname, flags, &nd); + error = path_init(dfd, pathname, flags | LOOKUP_PARENT, nd, &base); if (unlikely(error)) goto out_filp; - error = -ELOOP; - if (!(nd.flags & LOOKUP_FOLLOW)) { - if (nd.inode->i_op->follow_link) - goto out_path; - } - error = -ENOTDIR; - if (nd.flags & LOOKUP_DIRECTORY) { - if (!nd.inode->i_op->lookup) - goto out_path; - } - audit_inode(pathname, nd.path.dentry); - filp = finish_open(&nd, open_flag, acc_mode); - release_open_intent(&nd); - return filp; -creat: - /* OK, have to create the file. Find the parent. */ - error = path_init_rcu(dfd, pathname, - LOOKUP_PARENT | (flags & LOOKUP_REVAL), &nd); - if (error) - goto out_filp; - error = path_walk_rcu(pathname, &nd); - path_finish_rcu(&nd); - if (unlikely(error == -ECHILD || error == -ESTALE)) { - /* slower, locked walk */ - if (error == -ESTALE) { -reval: - flags |= LOOKUP_REVAL; - } - error = path_init(dfd, pathname, - LOOKUP_PARENT | (flags & LOOKUP_REVAL), &nd); - if (error) - goto out_filp; - - error = path_walk_simple(pathname, &nd); - } + current->total_link_count = 0; + error = link_path_walk(pathname, nd); if (unlikely(error)) goto out_filp; - if (unlikely(!audit_dummy_context())) - audit_inode(pathname, nd.path.dentry); - /* - * We have the parent and last component. - */ - nd.flags = flags; - filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname); + filp = do_last(nd, &path, op, pathname); while (unlikely(!filp)) { /* trailing symlink */ struct path link = path; - struct inode *linki = link.dentry->d_inode; void *cookie; - error = -ELOOP; - if (!(nd.flags & LOOKUP_FOLLOW)) - goto exit_dput; - if (count++ == 32) - goto exit_dput; - /* - * This is subtle. Instead of calling do_follow_link() we do - * the thing by hands. The reason is that this way we have zero - * link_count and path_walk() (called from ->follow_link) - * honoring LOOKUP_PARENT. After that we have the parent and - * last component, i.e. we are in the same situation as after - * the first path_walk(). Well, almost - if the last component - * is normal we get its copy stored in nd->last.name and we will - * have to putname() it when we are done. Procfs-like symlinks - * just set LAST_BIND. - */ - nd.flags |= LOOKUP_PARENT; - error = security_inode_follow_link(link.dentry, &nd); - if (error) - goto exit_dput; - error = __do_follow_link(&link, &nd, &cookie); - if (unlikely(error)) { - if (!IS_ERR(cookie) && linki->i_op->put_link) - linki->i_op->put_link(link.dentry, &nd, cookie); - /* nd.path had been dropped */ - nd.path = link; - goto out_path; + if (!(nd->flags & LOOKUP_FOLLOW)) { + path_put_conditional(&path, nd); + path_put(&nd->path); + filp = ERR_PTR(-ELOOP); + break; } - nd.flags &= ~LOOKUP_PARENT; - filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname); - if (linki->i_op->put_link) - linki->i_op->put_link(link.dentry, &nd, cookie); - path_put(&link); + nd->flags |= LOOKUP_PARENT; + nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL); + error = follow_link(&link, nd, &cookie); + if (unlikely(error)) + filp = ERR_PTR(error); + else + filp = do_last(nd, &path, op, pathname); + put_link(nd, &link, cookie); } out: - if (nd.root.mnt) - path_put(&nd.root); - if (filp == ERR_PTR(-ESTALE) && !(flags & LOOKUP_REVAL)) - goto reval; - release_open_intent(&nd); + if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) + path_put(&nd->root); + if (base) + fput(base); + release_open_intent(nd); return filp; -exit_dput: - path_put_conditional(&path, &nd); -out_path: - path_put(&nd.path); out_filp: filp = ERR_PTR(error); goto out; } -/** - * filp_open - open file and return file pointer - * - * @filename: path to open - * @flags: open flags as per the open(2) second argument - * @mode: mode for the new file if O_CREAT is set, else ignored - * - * This is the helper to open a file from kernelspace if you really - * have to. But in generally you should not do this, so please move - * along, nothing to see here.. - */ -struct file *filp_open(const char *filename, int flags, int mode) +struct file *do_filp_open(int dfd, const char *pathname, + const struct open_flags *op, int flags) +{ + struct nameidata nd; + struct file *filp; + + filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_RCU); + if (unlikely(filp == ERR_PTR(-ECHILD))) + filp = path_openat(dfd, pathname, &nd, op, flags); + if (unlikely(filp == ERR_PTR(-ESTALE))) + filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_REVAL); + return filp; +} + +struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt, + const char *name, const struct open_flags *op, int flags) { - return do_filp_open(AT_FDCWD, filename, flags, mode, 0); + struct nameidata nd; + struct file *file; + + nd.root.mnt = mnt; + nd.root.dentry = dentry; + + flags |= LOOKUP_ROOT; + + if (dentry->d_inode->i_op->follow_link && op->intent & LOOKUP_OPEN) + return ERR_PTR(-ELOOP); + + file = path_openat(-1, name, &nd, op, flags | LOOKUP_RCU); + if (unlikely(file == ERR_PTR(-ECHILD))) + file = path_openat(-1, name, &nd, op, flags); + if (unlikely(file == ERR_PTR(-ESTALE))) + file = path_openat(-1, name, &nd, op, flags | LOOKUP_REVAL); + return file; } -EXPORT_SYMBOL(filp_open); /** * lookup_create - lookup a dentry, creating it if it doesn't exist @@ -3111,7 +2917,11 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de return error; mutex_lock(&inode->i_mutex); - error = dir->i_op->link(old_dentry, dir, new_dentry); + /* Make sure we don't allow creating hardlink to an unlinked file */ + if (inode->i_nlink == 0) + error = -ENOENT; + else + error = dir->i_op->link(old_dentry, dir, new_dentry); mutex_unlock(&inode->i_mutex); if (!error) fsnotify_link(dir, inode, new_dentry); @@ -3133,15 +2943,27 @@ SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname, struct dentry *new_dentry; struct nameidata nd; struct path old_path; + int how = 0; int error; char *to; - if ((flags & ~AT_SYMLINK_FOLLOW) != 0) + if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0) return -EINVAL; + /* + * To use null names we require CAP_DAC_READ_SEARCH + * This ensures that not everyone will be able to create + * handlink using the passed filedescriptor. + */ + if (flags & AT_EMPTY_PATH) { + if (!capable(CAP_DAC_READ_SEARCH)) + return -ENOENT; + how = LOOKUP_EMPTY; + } + + if (flags & AT_SYMLINK_FOLLOW) + how |= LOOKUP_FOLLOW; - error = user_path_at(olddfd, oldname, - flags & AT_SYMLINK_FOLLOW ? LOOKUP_FOLLOW : 0, - &old_path); + error = user_path_at(olddfd, oldname, how, &old_path); if (error) return error; @@ -3578,7 +3400,7 @@ EXPORT_SYMBOL(page_readlink); EXPORT_SYMBOL(__page_symlink); EXPORT_SYMBOL(page_symlink); EXPORT_SYMBOL(page_symlink_inode_operations); -EXPORT_SYMBOL(path_lookup); +EXPORT_SYMBOL(kern_path_parent); EXPORT_SYMBOL(kern_path); EXPORT_SYMBOL(vfs_path_lookup); EXPORT_SYMBOL(inode_permission); diff --git a/fs/namespace.c b/fs/namespace.c index d1edf26025d..9263995bf6a 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -196,7 +196,7 @@ unsigned int mnt_get_count(struct vfsmount *mnt) #endif } -struct vfsmount *alloc_vfsmnt(const char *name) +static struct vfsmount *alloc_vfsmnt(const char *name) { struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL); if (mnt) { @@ -466,15 +466,7 @@ static void __mnt_unmake_readonly(struct vfsmount *mnt) br_write_unlock(vfsmount_lock); } -void simple_set_mnt(struct vfsmount *mnt, struct super_block *sb) -{ - mnt->mnt_sb = sb; - mnt->mnt_root = dget(sb->s_root); -} - -EXPORT_SYMBOL(simple_set_mnt); - -void free_vfsmnt(struct vfsmount *mnt) +static void free_vfsmnt(struct vfsmount *mnt) { kfree(mnt->mnt_devname); mnt_free_id(mnt); @@ -678,6 +670,36 @@ static struct vfsmount *skip_mnt_tree(struct vfsmount *p) return p; } +struct vfsmount * +vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data) +{ + struct vfsmount *mnt; + struct dentry *root; + + if (!type) + return ERR_PTR(-ENODEV); + + mnt = alloc_vfsmnt(name); + if (!mnt) + return ERR_PTR(-ENOMEM); + + if (flags & MS_KERNMOUNT) + mnt->mnt_flags = MNT_INTERNAL; + + root = mount_fs(type, flags, name, data); + if (IS_ERR(root)) { + free_vfsmnt(mnt); + return ERR_CAST(root); + } + + mnt->mnt_root = root; + mnt->mnt_sb = root->d_sb; + mnt->mnt_mountpoint = mnt->mnt_root; + mnt->mnt_parent = mnt; + return mnt; +} +EXPORT_SYMBOL_GPL(vfs_kern_mount); + static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root, int flag) { @@ -978,7 +1000,13 @@ static int show_vfsmnt(struct seq_file *m, void *v) int err = 0; struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; - mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none"); + if (mnt->mnt_sb->s_op->show_devname) { + err = mnt->mnt_sb->s_op->show_devname(m, mnt); + if (err) + goto out; + } else { + mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none"); + } seq_putc(m, ' '); seq_path(m, &mnt_path, " \t\n\\"); seq_putc(m, ' '); @@ -1002,6 +1030,18 @@ const struct seq_operations mounts_op = { .show = show_vfsmnt }; +static int uuid_is_nil(u8 *uuid) +{ + int i; + u8 *cp = (u8 *)uuid; + + for (i = 0; i < 16; i++) { + if (*cp++) + return 0; + } + return 1; +} + static int show_mountinfo(struct seq_file *m, void *v) { struct proc_mounts *p = m->private; @@ -1013,7 +1053,12 @@ static int show_mountinfo(struct seq_file *m, void *v) seq_printf(m, "%i %i %u:%u ", mnt->mnt_id, mnt->mnt_parent->mnt_id, MAJOR(sb->s_dev), MINOR(sb->s_dev)); - seq_dentry(m, mnt->mnt_root, " \t\n\\"); + if (sb->s_op->show_path) + err = sb->s_op->show_path(m, mnt); + else + seq_dentry(m, mnt->mnt_root, " \t\n\\"); + if (err) + goto out; seq_putc(m, ' '); seq_path_root(m, &mnt_path, &root, " \t\n\\"); if (root.mnt != p->root.mnt || root.dentry != p->root.dentry) { @@ -1040,11 +1085,20 @@ static int show_mountinfo(struct seq_file *m, void *v) if (IS_MNT_UNBINDABLE(mnt)) seq_puts(m, " unbindable"); + if (!uuid_is_nil(mnt->mnt_sb->s_uuid)) + /* print the uuid */ + seq_printf(m, " uuid:%pU", mnt->mnt_sb->s_uuid); + /* Filesystem specific data */ seq_puts(m, " - "); show_type(m, sb); seq_putc(m, ' '); - mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none"); + if (sb->s_op->show_devname) + err = sb->s_op->show_devname(m, mnt); + else + mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none"); + if (err) + goto out; seq_puts(m, sb->s_flags & MS_RDONLY ? " ro" : " rw"); err = show_sb_opts(m, sb); if (err) @@ -1070,11 +1124,15 @@ static int show_vfsstat(struct seq_file *m, void *v) int err = 0; /* device */ - if (mnt->mnt_devname) { - seq_puts(m, "device "); - mangle(m, mnt->mnt_devname); - } else - seq_puts(m, "no device"); + if (mnt->mnt_sb->s_op->show_devname) { + err = mnt->mnt_sb->s_op->show_devname(m, mnt); + } else { + if (mnt->mnt_devname) { + seq_puts(m, "device "); + mangle(m, mnt->mnt_devname); + } else + seq_puts(m, "no device"); + } /* mount point */ seq_puts(m, " mounted on "); @@ -1088,7 +1146,8 @@ static int show_vfsstat(struct seq_file *m, void *v) /* optional statistics */ if (mnt->mnt_sb->s_op->show_stats) { seq_putc(m, ' '); - err = mnt->mnt_sb->s_op->show_stats(m, mnt); + if (!err) + err = mnt->mnt_sb->s_op->show_stats(m, mnt); } seq_putc(m, '\n'); @@ -1604,9 +1663,35 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt, return err; } +static int lock_mount(struct path *path) +{ + struct vfsmount *mnt; +retry: + mutex_lock(&path->dentry->d_inode->i_mutex); + if (unlikely(cant_mount(path->dentry))) { + mutex_unlock(&path->dentry->d_inode->i_mutex); + return -ENOENT; + } + down_write(&namespace_sem); + mnt = lookup_mnt(path); + if (likely(!mnt)) + return 0; + up_write(&namespace_sem); + mutex_unlock(&path->dentry->d_inode->i_mutex); + path_put(path); + path->mnt = mnt; + path->dentry = dget(mnt->mnt_root); + goto retry; +} + +static void unlock_mount(struct path *path) +{ + up_write(&namespace_sem); + mutex_unlock(&path->dentry->d_inode->i_mutex); +} + static int graft_tree(struct vfsmount *mnt, struct path *path) { - int err; if (mnt->mnt_sb->s_flags & MS_NOUSER) return -EINVAL; @@ -1614,16 +1699,10 @@ static int graft_tree(struct vfsmount *mnt, struct path *path) S_ISDIR(mnt->mnt_root->d_inode->i_mode)) return -ENOTDIR; - err = -ENOENT; - mutex_lock(&path->dentry->d_inode->i_mutex); - if (cant_mount(path->dentry)) - goto out_unlock; + if (d_unlinked(path->dentry)) + return -ENOENT; - if (!d_unlinked(path->dentry)) - err = attach_recursive_mnt(mnt, path, NULL); -out_unlock: - mutex_unlock(&path->dentry->d_inode->i_mutex); - return err; + return attach_recursive_mnt(mnt, path, NULL); } /* @@ -1686,6 +1765,7 @@ static int do_change_type(struct path *path, int flag) static int do_loopback(struct path *path, char *old_name, int recurse) { + LIST_HEAD(umount_list); struct path old_path; struct vfsmount *mnt = NULL; int err = mount_is_safe(path); @@ -1697,13 +1777,16 @@ static int do_loopback(struct path *path, char *old_name, if (err) return err; - down_write(&namespace_sem); + err = lock_mount(path); + if (err) + goto out; + err = -EINVAL; if (IS_MNT_UNBINDABLE(old_path.mnt)) - goto out; + goto out2; if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt)) - goto out; + goto out2; err = -ENOMEM; if (recurse) @@ -1712,20 +1795,18 @@ static int do_loopback(struct path *path, char *old_name, mnt = clone_mnt(old_path.mnt, old_path.dentry, 0); if (!mnt) - goto out; + goto out2; err = graft_tree(mnt, path); if (err) { - LIST_HEAD(umount_list); - br_write_lock(vfsmount_lock); umount_tree(mnt, 0, &umount_list); br_write_unlock(vfsmount_lock); - release_mounts(&umount_list); } - +out2: + unlock_mount(path); + release_mounts(&umount_list); out: - up_write(&namespace_sem); path_put(&old_path); return err; } @@ -1767,6 +1848,10 @@ static int do_remount(struct path *path, int flags, int mnt_flags, if (path->dentry != path->mnt->mnt_root) return -EINVAL; + err = security_sb_remount(sb, data); + if (err) + return err; + down_write(&sb->s_umount); if (flags & MS_BIND) err = change_mount_flags(path->mnt, flags); @@ -1810,18 +1895,12 @@ static int do_move_mount(struct path *path, char *old_name) if (err) return err; - down_write(&namespace_sem); - err = follow_down(path, true); + err = lock_mount(path); if (err < 0) goto out; err = -EINVAL; if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt)) - goto out; - - err = -ENOENT; - mutex_lock(&path->dentry->d_inode->i_mutex); - if (cant_mount(path->dentry)) goto out1; if (d_unlinked(path->dentry)) @@ -1863,16 +1942,87 @@ static int do_move_mount(struct path *path, char *old_name) * automatically */ list_del_init(&old_path.mnt->mnt_expire); out1: - mutex_unlock(&path->dentry->d_inode->i_mutex); + unlock_mount(path); out: - up_write(&namespace_sem); if (!err) path_put(&parent_path); path_put(&old_path); return err; } -static int do_add_mount(struct vfsmount *, struct path *, int); +static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype) +{ + int err; + const char *subtype = strchr(fstype, '.'); + if (subtype) { + subtype++; + err = -EINVAL; + if (!subtype[0]) + goto err; + } else + subtype = ""; + + mnt->mnt_sb->s_subtype = kstrdup(subtype, GFP_KERNEL); + err = -ENOMEM; + if (!mnt->mnt_sb->s_subtype) + goto err; + return mnt; + + err: + mntput(mnt); + return ERR_PTR(err); +} + +struct vfsmount * +do_kern_mount(const char *fstype, int flags, const char *name, void *data) +{ + struct file_system_type *type = get_fs_type(fstype); + struct vfsmount *mnt; + if (!type) + return ERR_PTR(-ENODEV); + mnt = vfs_kern_mount(type, flags, name, data); + if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) && + !mnt->mnt_sb->s_subtype) + mnt = fs_set_subtype(mnt, fstype); + put_filesystem(type); + return mnt; +} +EXPORT_SYMBOL_GPL(do_kern_mount); + +/* + * add a mount into a namespace's mount tree + */ +static int do_add_mount(struct vfsmount *newmnt, struct path *path, int mnt_flags) +{ + int err; + + mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL); + + err = lock_mount(path); + if (err) + return err; + + err = -EINVAL; + if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(path->mnt)) + goto unlock; + + /* Refuse the same filesystem on the same mount point */ + err = -EBUSY; + if (path->mnt->mnt_sb == newmnt->mnt_sb && + path->mnt->mnt_root == path->dentry) + goto unlock; + + err = -EINVAL; + if (S_ISLNK(newmnt->mnt_root->d_inode->i_mode)) + goto unlock; + + newmnt->mnt_flags = mnt_flags; + err = graft_tree(newmnt, path); + +unlock: + unlock_mount(path); + return err; +} /* * create a new mount for userspace and request it to be added into the @@ -1932,43 +2082,6 @@ fail: return err; } -/* - * add a mount into a namespace's mount tree - */ -static int do_add_mount(struct vfsmount *newmnt, struct path *path, int mnt_flags) -{ - int err; - - mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL); - - down_write(&namespace_sem); - /* Something was mounted here while we slept */ - err = follow_down(path, true); - if (err < 0) - goto unlock; - - err = -EINVAL; - if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(path->mnt)) - goto unlock; - - /* Refuse the same filesystem on the same mount point */ - err = -EBUSY; - if (path->mnt->mnt_sb == newmnt->mnt_sb && - path->mnt->mnt_root == path->dentry) - goto unlock; - - err = -EINVAL; - if (S_ISLNK(newmnt->mnt_root->d_inode->i_mode)) - goto unlock; - - newmnt->mnt_flags = mnt_flags; - err = graft_tree(newmnt, path); - -unlock: - up_write(&namespace_sem); - return err; -} - /** * mnt_set_expiry - Put a mount on an expiration list * @mnt: The mount to list. @@ -2469,65 +2582,60 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, error = user_path_dir(new_root, &new); if (error) goto out0; - error = -EINVAL; - if (!check_mnt(new.mnt)) - goto out1; error = user_path_dir(put_old, &old); if (error) goto out1; error = security_sb_pivotroot(&old, &new); - if (error) { - path_put(&old); - goto out1; - } + if (error) + goto out2; get_fs_root(current->fs, &root); - down_write(&namespace_sem); - mutex_lock(&old.dentry->d_inode->i_mutex); + error = lock_mount(&old); + if (error) + goto out3; + error = -EINVAL; if (IS_MNT_SHARED(old.mnt) || IS_MNT_SHARED(new.mnt->mnt_parent) || IS_MNT_SHARED(root.mnt->mnt_parent)) - goto out2; - if (!check_mnt(root.mnt)) - goto out2; + goto out4; + if (!check_mnt(root.mnt) || !check_mnt(new.mnt)) + goto out4; error = -ENOENT; - if (cant_mount(old.dentry)) - goto out2; if (d_unlinked(new.dentry)) - goto out2; + goto out4; if (d_unlinked(old.dentry)) - goto out2; + goto out4; error = -EBUSY; if (new.mnt == root.mnt || old.mnt == root.mnt) - goto out2; /* loop, on the same file system */ + goto out4; /* loop, on the same file system */ error = -EINVAL; if (root.mnt->mnt_root != root.dentry) - goto out2; /* not a mountpoint */ + goto out4; /* not a mountpoint */ if (root.mnt->mnt_parent == root.mnt) - goto out2; /* not attached */ + goto out4; /* not attached */ if (new.mnt->mnt_root != new.dentry) - goto out2; /* not a mountpoint */ + goto out4; /* not a mountpoint */ if (new.mnt->mnt_parent == new.mnt) - goto out2; /* not attached */ + goto out4; /* not attached */ /* make sure we can reach put_old from new_root */ tmp = old.mnt; - br_write_lock(vfsmount_lock); if (tmp != new.mnt) { for (;;) { if (tmp->mnt_parent == tmp) - goto out3; /* already mounted on put_old */ + goto out4; /* already mounted on put_old */ if (tmp->mnt_parent == new.mnt) break; tmp = tmp->mnt_parent; } if (!is_subdir(tmp->mnt_mountpoint, new.dentry)) - goto out3; + goto out4; } else if (!is_subdir(old.dentry, new.dentry)) - goto out3; + goto out4; + br_write_lock(vfsmount_lock); detach_mnt(new.mnt, &parent_path); detach_mnt(root.mnt, &root_parent); /* mount old root on put_old */ @@ -2537,22 +2645,21 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, touch_mnt_namespace(current->nsproxy->mnt_ns); br_write_unlock(vfsmount_lock); chroot_fs_refs(&root, &new); - error = 0; - path_put(&root_parent); - path_put(&parent_path); -out2: - mutex_unlock(&old.dentry->d_inode->i_mutex); - up_write(&namespace_sem); +out4: + unlock_mount(&old); + if (!error) { + path_put(&root_parent); + path_put(&parent_path); + } +out3: path_put(&root); +out2: path_put(&old); out1: path_put(&new); out0: return error; -out3: - br_write_unlock(vfsmount_lock); - goto out2; } static void __init init_mount_tree(void) @@ -2627,3 +2734,9 @@ void put_mnt_ns(struct mnt_namespace *ns) kfree(ns); } EXPORT_SYMBOL(put_mnt_ns); + +struct vfsmount *kern_mount_data(struct file_system_type *type, void *data) +{ + return vfs_kern_mount(type, MS_KERNMOUNT, type->name, data); +} +EXPORT_SYMBOL_GPL(kern_mount_data); diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 89587573fe5..2f41dccea18 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -188,10 +188,10 @@ static u32 initiate_bulk_draining(struct nfs_client *clp, rv = NFS4ERR_DELAY; list_del_init(&lo->plh_bulk_recall); spin_unlock(&ino->i_lock); + pnfs_free_lseg_list(&free_me_list); put_layout_hdr(lo); iput(ino); } - pnfs_free_lseg_list(&free_me_list); return rv; } diff --git a/fs/nfs/client.c b/fs/nfs/client.c index bd3ca32879e..139be9647d8 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -82,6 +82,11 @@ retry: #endif /* CONFIG_NFS_V4 */ /* + * Turn off NFSv4 uid/gid mapping when using AUTH_SYS + */ +static int nfs4_disable_idmapping = 0; + +/* * RPC cruft for NFS */ static struct rpc_version *nfs_version[5] = { @@ -481,7 +486,12 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat * Look up a client by IP address and protocol version * - creates a new record if one doesn't yet exist */ -static struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init) +static struct nfs_client * +nfs_get_client(const struct nfs_client_initdata *cl_init, + const struct rpc_timeout *timeparms, + const char *ip_addr, + rpc_authflavor_t authflavour, + int noresvport) { struct nfs_client *clp, *new = NULL; int error; @@ -512,6 +522,13 @@ install_client: clp = new; list_add(&clp->cl_share_link, &nfs_client_list); spin_unlock(&nfs_client_lock); + + error = cl_init->rpc_ops->init_client(clp, timeparms, ip_addr, + authflavour, noresvport); + if (error < 0) { + nfs_put_client(clp); + return ERR_PTR(error); + } dprintk("--> nfs_get_client() = %p [new]\n", clp); return clp; @@ -767,9 +784,9 @@ static int nfs_init_server_rpcclient(struct nfs_server *server, /* * Initialise an NFS2 or NFS3 client */ -static int nfs_init_client(struct nfs_client *clp, - const struct rpc_timeout *timeparms, - const struct nfs_parsed_mount_data *data) +int nfs_init_client(struct nfs_client *clp, const struct rpc_timeout *timeparms, + const char *ip_addr, rpc_authflavor_t authflavour, + int noresvport) { int error; @@ -784,7 +801,7 @@ static int nfs_init_client(struct nfs_client *clp, * - RFC 2623, sec 2.3.2 */ error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX, - 0, data->flags & NFS_MOUNT_NORESVPORT); + 0, noresvport); if (error < 0) goto error; nfs_mark_client_ready(clp, NFS_CS_READY); @@ -820,19 +837,17 @@ static int nfs_init_server(struct nfs_server *server, cl_init.rpc_ops = &nfs_v3_clientops; #endif + nfs_init_timeout_values(&timeparms, data->nfs_server.protocol, + data->timeo, data->retrans); + /* Allocate or find a client reference we can use */ - clp = nfs_get_client(&cl_init); + clp = nfs_get_client(&cl_init, &timeparms, NULL, RPC_AUTH_UNIX, + data->flags & NFS_MOUNT_NORESVPORT); if (IS_ERR(clp)) { dprintk("<-- nfs_init_server() = error %ld\n", PTR_ERR(clp)); return PTR_ERR(clp); } - nfs_init_timeout_values(&timeparms, data->nfs_server.protocol, - data->timeo, data->retrans); - error = nfs_init_client(clp, &timeparms, data); - if (error < 0) - goto error; - server->nfs_client = clp; /* Initialise the client representation from the mount data */ @@ -1009,14 +1024,19 @@ static void nfs_server_insert_lists(struct nfs_server *server) spin_lock(&nfs_client_lock); list_add_tail_rcu(&server->client_link, &clp->cl_superblocks); list_add_tail(&server->master_link, &nfs_volume_list); + clear_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state); spin_unlock(&nfs_client_lock); } static void nfs_server_remove_lists(struct nfs_server *server) { + struct nfs_client *clp = server->nfs_client; + spin_lock(&nfs_client_lock); list_del_rcu(&server->client_link); + if (clp && list_empty(&clp->cl_superblocks)) + set_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state); list_del(&server->master_link); spin_unlock(&nfs_client_lock); @@ -1307,11 +1327,11 @@ static int nfs4_init_client_minor_version(struct nfs_client *clp) /* * Initialise an NFS4 client record */ -static int nfs4_init_client(struct nfs_client *clp, - const struct rpc_timeout *timeparms, - const char *ip_addr, - rpc_authflavor_t authflavour, - int flags) +int nfs4_init_client(struct nfs_client *clp, + const struct rpc_timeout *timeparms, + const char *ip_addr, + rpc_authflavor_t authflavour, + int noresvport) { int error; @@ -1325,7 +1345,7 @@ static int nfs4_init_client(struct nfs_client *clp, clp->rpc_ops = &nfs_v4_clientops; error = nfs_create_rpc_client(clp, timeparms, authflavour, - 1, flags & NFS_MOUNT_NORESVPORT); + 1, noresvport); if (error < 0) goto error; strlcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr)); @@ -1378,27 +1398,71 @@ static int nfs4_set_client(struct nfs_server *server, dprintk("--> nfs4_set_client()\n"); /* Allocate or find a client reference we can use */ - clp = nfs_get_client(&cl_init); + clp = nfs_get_client(&cl_init, timeparms, ip_addr, authflavour, + server->flags & NFS_MOUNT_NORESVPORT); if (IS_ERR(clp)) { error = PTR_ERR(clp); goto error; } - error = nfs4_init_client(clp, timeparms, ip_addr, authflavour, - server->flags); - if (error < 0) - goto error_put; + + /* + * Query for the lease time on clientid setup or renewal + * + * Note that this will be set on nfs_clients that were created + * only for the DS role and did not set this bit, but now will + * serve a dual role. + */ + set_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state); server->nfs_client = clp; dprintk("<-- nfs4_set_client() = 0 [new %p]\n", clp); return 0; - -error_put: - nfs_put_client(clp); error: dprintk("<-- nfs4_set_client() = xerror %d\n", error); return error; } +/* + * Set up a pNFS Data Server client. + * + * Return any existing nfs_client that matches server address,port,version + * and minorversion. + * + * For a new nfs_client, use a soft mount (default), a low retrans and a + * low timeout interval so that if a connection is lost, we retry through + * the MDS. + */ +struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp, + const struct sockaddr *ds_addr, + int ds_addrlen, int ds_proto) +{ + struct nfs_client_initdata cl_init = { + .addr = ds_addr, + .addrlen = ds_addrlen, + .rpc_ops = &nfs_v4_clientops, + .proto = ds_proto, + .minorversion = mds_clp->cl_minorversion, + }; + struct rpc_timeout ds_timeout = { + .to_initval = 15 * HZ, + .to_maxval = 15 * HZ, + .to_retries = 1, + .to_exponential = 1, + }; + struct nfs_client *clp; + + /* + * Set an authflavor equual to the MDS value. Use the MDS nfs_client + * cl_ipaddr so as to use the same EXCHANGE_ID co_ownerid as the MDS + * (section 13.1 RFC 5661). + */ + clp = nfs_get_client(&cl_init, &ds_timeout, mds_clp->cl_ipaddr, + mds_clp->cl_rpcclient->cl_auth->au_flavor, 0); + + dprintk("<-- %s %p\n", __func__, clp); + return clp; +} +EXPORT_SYMBOL(nfs4_set_ds_client); /* * Session has been established, and the client marked ready. @@ -1435,6 +1499,10 @@ static int nfs4_server_common_setup(struct nfs_server *server, BUG_ON(!server->nfs_client->rpc_ops); BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops); + /* data servers support only a subset of NFSv4.1 */ + if (is_ds_only_client(server->nfs_client)) + return -EPROTONOSUPPORT; + fattr = nfs_alloc_fattr(); if (fattr == NULL) return -ENOMEM; @@ -1504,6 +1572,13 @@ static int nfs4_init_server(struct nfs_server *server, if (error < 0) goto error; + /* + * Don't use NFS uid/gid mapping if we're using AUTH_SYS or lower + * authentication. + */ + if (nfs4_disable_idmapping && data->auth_flavors[0] == RPC_AUTH_UNIX) + server->caps |= NFS_CAP_UIDGID_NOMAP; + if (data->rsize) server->rsize = nfs_block_size(data->rsize, NULL); if (data->wsize) @@ -1921,3 +1996,7 @@ void nfs_fs_proc_exit(void) } #endif /* CONFIG_PROC_FS */ + +module_param(nfs4_disable_idmapping, bool, 0644); +MODULE_PARM_DESC(nfs4_disable_idmapping, + "Turn off NFSv4 idmapping when using 'sec=sys'"); diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 2c3eb33b904..abdf38d5971 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1169,11 +1169,23 @@ static void nfs_dentry_iput(struct dentry *dentry, struct inode *inode) iput(inode); } +static void nfs_d_release(struct dentry *dentry) +{ + /* free cached devname value, if it survived that far */ + if (unlikely(dentry->d_fsdata)) { + if (dentry->d_flags & DCACHE_NFSFS_RENAMED) + WARN_ON(1); + else + kfree(dentry->d_fsdata); + } +} + const struct dentry_operations nfs_dentry_operations = { .d_revalidate = nfs_lookup_revalidate, .d_delete = nfs_dentry_delete, .d_iput = nfs_dentry_iput, .d_automount = nfs_d_automount, + .d_release = nfs_d_release, }; static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) @@ -1248,6 +1260,7 @@ const struct dentry_operations nfs4_dentry_operations = { .d_delete = nfs_dentry_delete, .d_iput = nfs_dentry_iput, .d_automount = nfs_d_automount, + .d_release = nfs_d_release, }; /* diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 9943a75bb6d..8eea2536671 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -45,6 +45,7 @@ #include <linux/pagemap.h> #include <linux/kref.h> #include <linux/slab.h> +#include <linux/task_io_accounting_ops.h> #include <linux/nfs_fs.h> #include <linux/nfs_page.h> @@ -649,8 +650,7 @@ static void nfs_direct_write_result(struct rpc_task *task, void *calldata) { struct nfs_write_data *data = calldata; - if (nfs_writeback_done(task, data) != 0) - return; + nfs_writeback_done(task, data); } /* @@ -938,6 +938,8 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov, if (retval) goto out; + task_io_account_read(count); + retval = nfs_direct_read(iocb, iov, nr_segs, pos); if (retval > 0) iocb->ki_pos = pos + retval; @@ -999,6 +1001,8 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov, if (retval) goto out; + task_io_account_write(count); + retval = nfs_direct_write(iocb, iov, nr_segs, pos, count); if (retval > 0) diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 7bf029ef408..d85a534b15c 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -387,10 +387,6 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping, file->f_path.dentry->d_name.name, mapping->host->i_ino, len, (long long) pos); - pnfs_update_layout(mapping->host, - nfs_file_open_context(file), - IOMODE_RW); - start: /* * Prevent starvation issues if someone is doing a consistency diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c index b5ffe8fa291..1084792bc0f 100644 --- a/fs/nfs/getroot.c +++ b/fs/nfs/getroot.c @@ -75,18 +75,25 @@ static int nfs_superblock_set_dummy_root(struct super_block *sb, struct inode *i /* * get an NFS2/NFS3 root dentry from the root filehandle */ -struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh) +struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh, + const char *devname) { struct nfs_server *server = NFS_SB(sb); struct nfs_fsinfo fsinfo; struct dentry *ret; struct inode *inode; + void *name = kstrdup(devname, GFP_KERNEL); int error; + if (!name) + return ERR_PTR(-ENOMEM); + /* get the actual root for this mount */ fsinfo.fattr = nfs_alloc_fattr(); - if (fsinfo.fattr == NULL) + if (fsinfo.fattr == NULL) { + kfree(name); return ERR_PTR(-ENOMEM); + } error = server->nfs_client->rpc_ops->getroot(server, mntfh, &fsinfo); if (error < 0) { @@ -119,7 +126,15 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh) } security_d_instantiate(ret, inode); + spin_lock(&ret->d_lock); + if (IS_ROOT(ret) && !(ret->d_flags & DCACHE_NFSFS_RENAMED)) { + ret->d_fsdata = name; + name = NULL; + } + spin_unlock(&ret->d_lock); out: + if (name) + kfree(name); nfs_free_fattr(fsinfo.fattr); return ret; } @@ -169,27 +184,35 @@ out: /* * get an NFS4 root dentry from the root filehandle */ -struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh) +struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh, + const char *devname) { struct nfs_server *server = NFS_SB(sb); struct nfs_fattr *fattr = NULL; struct dentry *ret; struct inode *inode; + void *name = kstrdup(devname, GFP_KERNEL); int error; dprintk("--> nfs4_get_root()\n"); + if (!name) + return ERR_PTR(-ENOMEM); + /* get the info about the server and filesystem */ error = nfs4_server_capabilities(server, mntfh); if (error < 0) { dprintk("nfs_get_root: getcaps error = %d\n", -error); + kfree(name); return ERR_PTR(error); } fattr = nfs_alloc_fattr(); - if (fattr == NULL) - return ERR_PTR(-ENOMEM);; + if (fattr == NULL) { + kfree(name); + return ERR_PTR(-ENOMEM); + } /* get the actual root for this mount */ error = server->nfs_client->rpc_ops->getattr(server, mntfh, fattr); @@ -223,8 +246,15 @@ struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh) } security_d_instantiate(ret, inode); - + spin_lock(&ret->d_lock); + if (IS_ROOT(ret) && !(ret->d_flags & DCACHE_NFSFS_RENAMED)) { + ret->d_fsdata = name; + name = NULL; + } + spin_unlock(&ret->d_lock); out: + if (name) + kfree(name); nfs_free_fattr(fattr); dprintk("<-- nfs4_get_root()\n"); return ret; diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c index 18696882f1c..79664a1025a 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c @@ -33,16 +33,41 @@ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +#include <linux/types.h> +#include <linux/string.h> +#include <linux/kernel.h> + +static int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *res) +{ + unsigned long val; + char buf[16]; + + if (memchr(name, '@', namelen) != NULL || namelen >= sizeof(buf)) + return 0; + memcpy(buf, name, namelen); + buf[namelen] = '\0'; + if (strict_strtoul(buf, 0, &val) != 0) + return 0; + *res = val; + return 1; +} + +static int nfs_map_numeric_to_string(__u32 id, char *buf, size_t buflen) +{ + return snprintf(buf, buflen, "%u", id); +} #ifdef CONFIG_NFS_USE_NEW_IDMAPPER #include <linux/slab.h> #include <linux/cred.h> +#include <linux/sunrpc/sched.h> +#include <linux/nfs4.h> +#include <linux/nfs_fs_sb.h> #include <linux/nfs_idmap.h> #include <linux/keyctl.h> #include <linux/key-type.h> #include <linux/rcupdate.h> -#include <linux/kernel.h> #include <linux/err.h> #include <keys/user-type.h> @@ -219,23 +244,39 @@ static int nfs_idmap_lookup_id(const char *name, size_t namelen, return ret; } -int nfs_map_name_to_uid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *uid) +int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid) { + if (nfs_map_string_to_numeric(name, namelen, uid)) + return 0; return nfs_idmap_lookup_id(name, namelen, "uid", uid); } -int nfs_map_group_to_gid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *gid) +int nfs_map_group_to_gid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *gid) { + if (nfs_map_string_to_numeric(name, namelen, gid)) + return 0; return nfs_idmap_lookup_id(name, namelen, "gid", gid); } -int nfs_map_uid_to_name(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen) +int nfs_map_uid_to_name(const struct nfs_server *server, __u32 uid, char *buf, size_t buflen) { - return nfs_idmap_lookup_name(uid, "user", buf, buflen); + int ret = -EINVAL; + + if (!(server->caps & NFS_CAP_UIDGID_NOMAP)) + ret = nfs_idmap_lookup_name(uid, "user", buf, buflen); + if (ret < 0) + ret = nfs_map_numeric_to_string(uid, buf, buflen); + return ret; } -int nfs_map_gid_to_group(struct nfs_client *clp, __u32 gid, char *buf, size_t buflen) +int nfs_map_gid_to_group(const struct nfs_server *server, __u32 gid, char *buf, size_t buflen) { - return nfs_idmap_lookup_name(gid, "group", buf, buflen); + int ret = -EINVAL; + + if (!(server->caps & NFS_CAP_UIDGID_NOMAP)) + ret = nfs_idmap_lookup_name(gid, "group", buf, buflen); + if (ret < 0) + ret = nfs_map_numeric_to_string(gid, buf, buflen); + return ret; } #else /* CONFIG_NFS_USE_NEW_IDMAPPER not defined */ @@ -243,7 +284,6 @@ int nfs_map_gid_to_group(struct nfs_client *clp, __u32 gid, char *buf, size_t bu #include <linux/module.h> #include <linux/mutex.h> #include <linux/init.h> -#include <linux/types.h> #include <linux/slab.h> #include <linux/socket.h> #include <linux/in.h> @@ -695,31 +735,45 @@ static unsigned int fnvhash32(const void *buf, size_t buflen) return hash; } -int nfs_map_name_to_uid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *uid) +int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid) { - struct idmap *idmap = clp->cl_idmap; + struct idmap *idmap = server->nfs_client->cl_idmap; + if (nfs_map_string_to_numeric(name, namelen, uid)) + return 0; return nfs_idmap_id(idmap, &idmap->idmap_user_hash, name, namelen, uid); } -int nfs_map_group_to_gid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *uid) +int nfs_map_group_to_gid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid) { - struct idmap *idmap = clp->cl_idmap; + struct idmap *idmap = server->nfs_client->cl_idmap; + if (nfs_map_string_to_numeric(name, namelen, uid)) + return 0; return nfs_idmap_id(idmap, &idmap->idmap_group_hash, name, namelen, uid); } -int nfs_map_uid_to_name(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen) +int nfs_map_uid_to_name(const struct nfs_server *server, __u32 uid, char *buf, size_t buflen) { - struct idmap *idmap = clp->cl_idmap; + struct idmap *idmap = server->nfs_client->cl_idmap; + int ret = -EINVAL; - return nfs_idmap_name(idmap, &idmap->idmap_user_hash, uid, buf); + if (!(server->caps & NFS_CAP_UIDGID_NOMAP)) + ret = nfs_idmap_name(idmap, &idmap->idmap_user_hash, uid, buf); + if (ret < 0) + ret = nfs_map_numeric_to_string(uid, buf, buflen); + return ret; } -int nfs_map_gid_to_group(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen) +int nfs_map_gid_to_group(const struct nfs_server *server, __u32 uid, char *buf, size_t buflen) { - struct idmap *idmap = clp->cl_idmap; + struct idmap *idmap = server->nfs_client->cl_idmap; + int ret = -EINVAL; - return nfs_idmap_name(idmap, &idmap->idmap_group_hash, uid, buf); + if (!(server->caps & NFS_CAP_UIDGID_NOMAP)) + ret = nfs_idmap_name(idmap, &idmap->idmap_group_hash, uid, buf); + if (ret < 0) + ret = nfs_map_numeric_to_string(uid, buf, buflen); + return ret; } #endif /* CONFIG_NFS_USE_NEW_IDMAPPER */ diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 1cc600e77bb..01768e5e2c9 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -37,6 +37,7 @@ #include <linux/inet.h> #include <linux/nfs_xdr.h> #include <linux/slab.h> +#include <linux/compat.h> #include <asm/system.h> #include <asm/uaccess.h> @@ -89,7 +90,11 @@ int nfs_wait_bit_killable(void *word) */ u64 nfs_compat_user_ino64(u64 fileid) { - int ino; +#ifdef CONFIG_COMPAT + compat_ulong_t ino; +#else + unsigned long ino; +#endif if (enable_ino64) return fileid; @@ -1513,7 +1518,7 @@ static int nfsiod_start(void) { struct workqueue_struct *wq; dprintk("RPC: creating workqueue nfsiod\n"); - wq = alloc_workqueue("nfsiod", WQ_RESCUER, 0); + wq = alloc_workqueue("nfsiod", WQ_MEM_RECLAIM, 0); if (wq == NULL) return -ENOMEM; nfsiod_workqueue = wq; diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index cf9fdbdabc6..72e0bddf7a2 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -148,6 +148,9 @@ extern struct nfs_server *nfs_clone_server(struct nfs_server *, struct nfs_fattr *); extern void nfs_mark_client_ready(struct nfs_client *clp, int state); extern int nfs4_check_client_ready(struct nfs_client *clp); +extern struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp, + const struct sockaddr *ds_addr, + int ds_addrlen, int ds_proto); #ifdef CONFIG_PROC_FS extern int __init nfs_fs_proc_init(void); extern void nfs_fs_proc_exit(void); @@ -163,10 +166,10 @@ static inline void nfs_fs_proc_exit(void) /* nfs4namespace.c */ #ifdef CONFIG_NFS_V4 -extern struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry); +extern struct vfsmount *nfs_do_refmount(struct dentry *dentry); #else static inline -struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry) +struct vfsmount *nfs_do_refmount(struct dentry *dentry) { return ERR_PTR(-ENOENT); } @@ -213,8 +216,14 @@ extern const u32 nfs41_maxwrite_overhead; extern struct rpc_procinfo nfs4_procedures[]; #endif +extern int nfs4_init_ds_session(struct nfs_client *clp); + /* proc.c */ void nfs_close_context(struct nfs_open_context *ctx, int is_sync); +extern int nfs_init_client(struct nfs_client *clp, + const struct rpc_timeout *timeparms, + const char *ip_addr, rpc_authflavor_t authflavour, + int noresvport); /* dir.c */ extern int nfs_access_cache_shrinker(struct shrinker *shrink, @@ -247,24 +256,30 @@ extern void nfs_sb_active(struct super_block *sb); extern void nfs_sb_deactive(struct super_block *sb); /* namespace.c */ -extern char *nfs_path(const char *base, - const struct dentry *droot, - const struct dentry *dentry, +extern char *nfs_path(char **p, struct dentry *dentry, char *buffer, ssize_t buflen); extern struct vfsmount *nfs_d_automount(struct path *path); /* getroot.c */ -extern struct dentry *nfs_get_root(struct super_block *, struct nfs_fh *); +extern struct dentry *nfs_get_root(struct super_block *, struct nfs_fh *, + const char *); #ifdef CONFIG_NFS_V4 -extern struct dentry *nfs4_get_root(struct super_block *, struct nfs_fh *); +extern struct dentry *nfs4_get_root(struct super_block *, struct nfs_fh *, + const char *); extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh); #endif /* read.c */ +extern int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt, + const struct rpc_call_ops *call_ops); extern void nfs_read_prepare(struct rpc_task *task, void *calldata); /* write.c */ +extern int nfs_initiate_write(struct nfs_write_data *data, + struct rpc_clnt *clnt, + const struct rpc_call_ops *call_ops, + int how); extern void nfs_write_prepare(struct rpc_task *task, void *calldata); #ifdef CONFIG_MIGRATION extern int nfs_migrate_page(struct address_space *, @@ -274,6 +289,13 @@ extern int nfs_migrate_page(struct address_space *, #endif /* nfs4proc.c */ +extern void nfs4_reset_read(struct rpc_task *task, struct nfs_read_data *data); +extern int nfs4_init_client(struct nfs_client *clp, + const struct rpc_timeout *timeparms, + const char *ip_addr, + rpc_authflavor_t authflavour, + int noresvport); +extern void nfs4_reset_write(struct rpc_task *task, struct nfs_write_data *data); extern int _nfs4_call_sync(struct nfs_server *server, struct rpc_message *msg, struct nfs4_sequence_args *args, @@ -288,12 +310,11 @@ extern int _nfs4_call_sync_session(struct nfs_server *server, /* * Determine the device name as a string */ -static inline char *nfs_devname(const struct vfsmount *mnt_parent, - const struct dentry *dentry, +static inline char *nfs_devname(struct dentry *dentry, char *buffer, ssize_t buflen) { - return nfs_path(mnt_parent->mnt_devname, mnt_parent->mnt_root, - dentry, buffer, buflen); + char *dummy; + return nfs_path(&dummy, dentry, buffer, buflen); } /* diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index f32b8603dca..c0b8344db0c 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -25,33 +25,30 @@ static LIST_HEAD(nfs_automount_list); static DECLARE_DELAYED_WORK(nfs_automount_task, nfs_expire_automounts); int nfs_mountpoint_expiry_timeout = 500 * HZ; -static struct vfsmount *nfs_do_submount(const struct vfsmount *mnt_parent, - const struct dentry *dentry, +static struct vfsmount *nfs_do_submount(struct dentry *dentry, struct nfs_fh *fh, struct nfs_fattr *fattr); /* * nfs_path - reconstruct the path given an arbitrary dentry - * @base - arbitrary string to prepend to the path - * @droot - pointer to root dentry for mountpoint + * @base - used to return pointer to the end of devname part of path * @dentry - pointer to dentry * @buffer - result buffer * @buflen - length of buffer * - * Helper function for constructing the path from the - * root dentry to an arbitrary hashed dentry. + * Helper function for constructing the server pathname + * by arbitrary hashed dentry. * * This is mainly for use in figuring out the path on the - * server side when automounting on top of an existing partition. + * server side when automounting on top of an existing partition + * and in generating /proc/mounts and friends. */ -char *nfs_path(const char *base, - const struct dentry *droot, - const struct dentry *dentry, - char *buffer, ssize_t buflen) +char *nfs_path(char **p, struct dentry *dentry, char *buffer, ssize_t buflen) { char *end; int namelen; unsigned seq; + const char *base; rename_retry: end = buffer+buflen; @@ -60,7 +57,10 @@ rename_retry: seq = read_seqbegin(&rename_lock); rcu_read_lock(); - while (!IS_ROOT(dentry) && dentry != droot) { + while (1) { + spin_lock(&dentry->d_lock); + if (IS_ROOT(dentry)) + break; namelen = dentry->d_name.len; buflen -= namelen + 1; if (buflen < 0) @@ -68,27 +68,47 @@ rename_retry: end -= namelen; memcpy(end, dentry->d_name.name, namelen); *--end = '/'; + spin_unlock(&dentry->d_lock); dentry = dentry->d_parent; } - rcu_read_unlock(); - if (read_seqretry(&rename_lock, seq)) + if (read_seqretry(&rename_lock, seq)) { + spin_unlock(&dentry->d_lock); + rcu_read_unlock(); goto rename_retry; + } if (*end != '/') { - if (--buflen < 0) + if (--buflen < 0) { + spin_unlock(&dentry->d_lock); + rcu_read_unlock(); goto Elong; + } *--end = '/'; } + *p = end; + base = dentry->d_fsdata; + if (!base) { + spin_unlock(&dentry->d_lock); + rcu_read_unlock(); + WARN_ON(1); + return end; + } namelen = strlen(base); /* Strip off excess slashes in base string */ while (namelen > 0 && base[namelen - 1] == '/') namelen--; buflen -= namelen; - if (buflen < 0) + if (buflen < 0) { + spin_lock(&dentry->d_lock); + rcu_read_unlock(); goto Elong; + } end -= namelen; memcpy(end, base, namelen); + spin_unlock(&dentry->d_lock); + rcu_read_unlock(); return end; Elong_unlock: + spin_lock(&dentry->d_lock); rcu_read_unlock(); if (read_seqretry(&rename_lock, seq)) goto rename_retry; @@ -143,9 +163,9 @@ struct vfsmount *nfs_d_automount(struct path *path) } if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) - mnt = nfs_do_refmount(path->mnt, path->dentry); + mnt = nfs_do_refmount(path->dentry); else - mnt = nfs_do_submount(path->mnt, path->dentry, fh, fattr); + mnt = nfs_do_submount(path->dentry, fh, fattr); if (IS_ERR(mnt)) goto out; @@ -209,19 +229,17 @@ static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server, /** * nfs_do_submount - set up mountpoint when crossing a filesystem boundary - * @mnt_parent - mountpoint of parent directory * @dentry - parent directory * @fh - filehandle for new root dentry * @fattr - attributes for new root inode * */ -static struct vfsmount *nfs_do_submount(const struct vfsmount *mnt_parent, - const struct dentry *dentry, +static struct vfsmount *nfs_do_submount(struct dentry *dentry, struct nfs_fh *fh, struct nfs_fattr *fattr) { struct nfs_clone_mount mountdata = { - .sb = mnt_parent->mnt_sb, + .sb = dentry->d_sb, .dentry = dentry, .fh = fh, .fattr = fattr, @@ -237,11 +255,11 @@ static struct vfsmount *nfs_do_submount(const struct vfsmount *mnt_parent, dentry->d_name.name); if (page == NULL) goto out; - devname = nfs_devname(mnt_parent, dentry, page, PAGE_SIZE); + devname = nfs_devname(dentry, page, PAGE_SIZE); mnt = (struct vfsmount *)devname; if (IS_ERR(devname)) goto free_page; - mnt = nfs_do_clone_mount(NFS_SB(mnt_parent->mnt_sb), devname, &mountdata); + mnt = nfs_do_clone_mount(NFS_SB(dentry->d_sb), devname, &mountdata); free_page: free_page((unsigned long)page); out: diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index ce939c062a5..d0c80d8b3f9 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -885,4 +885,5 @@ const struct nfs_rpc_ops nfs_v3_clientops = { .lock = nfs3_proc_lock, .clear_acl_cache = nfs3_forget_cached_acls, .close_context = nfs_close_context, + .init_client = nfs_init_client, }; diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 7a747407314..c64be1cff08 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -252,6 +252,9 @@ static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *ser extern int nfs4_setup_sequence(const struct nfs_server *server, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, int cache_reply, struct rpc_task *task); +extern int nfs41_setup_sequence(struct nfs4_session *session, + struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, + int cache_reply, struct rpc_task *task); extern void nfs4_destroy_session(struct nfs4_session *session); extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp); extern int nfs4_proc_create_session(struct nfs_client *); @@ -259,6 +262,19 @@ extern int nfs4_proc_destroy_session(struct nfs4_session *); extern int nfs4_init_session(struct nfs_server *server); extern int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo); + +static inline bool +is_ds_only_client(struct nfs_client *clp) +{ + return (clp->cl_exchange_flags & EXCHGID4_FLAG_MASK_PNFS) == + EXCHGID4_FLAG_USE_PNFS_DS; +} + +static inline bool +is_ds_client(struct nfs_client *clp) +{ + return clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_DS; +} #else /* CONFIG_NFS_v4_1 */ static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server) { @@ -276,6 +292,18 @@ static inline int nfs4_init_session(struct nfs_server *server) { return 0; } + +static inline bool +is_ds_only_client(struct nfs_client *clp) +{ + return false; +} + +static inline bool +is_ds_client(struct nfs_client *clp) +{ + return false; +} #endif /* CONFIG_NFS_V4_1 */ extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[]; @@ -298,6 +326,11 @@ struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp); #if defined(CONFIG_NFS_V4_1) struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp); struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp); +extern void nfs4_schedule_session_recovery(struct nfs4_session *); +#else +static inline void nfs4_schedule_session_recovery(struct nfs4_session *session) +{ +} #endif /* CONFIG_NFS_V4_1 */ extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *); @@ -307,10 +340,9 @@ extern void nfs4_put_open_state(struct nfs4_state *); extern void nfs4_close_state(struct path *, struct nfs4_state *, fmode_t); extern void nfs4_close_sync(struct path *, struct nfs4_state *, fmode_t); extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t); -extern void nfs4_schedule_state_recovery(struct nfs_client *); +extern void nfs4_schedule_lease_recovery(struct nfs_client *); extern void nfs4_schedule_state_manager(struct nfs_client *); -extern int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state); -extern int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state); +extern void nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs4_state *); extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags); extern void nfs41_handle_recall_slot(struct nfs_client *clp); extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index 23f930caf1e..42855846481 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -40,32 +40,309 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Dean Hildebrand <dhildebz@umich.edu>"); MODULE_DESCRIPTION("The NFSv4 file layout driver"); -static int -filelayout_set_layoutdriver(struct nfs_server *nfss) +#define FILELAYOUT_POLL_RETRY_MAX (15*HZ) + +static loff_t +filelayout_get_dense_offset(struct nfs4_filelayout_segment *flseg, + loff_t offset) { - int status = pnfs_alloc_init_deviceid_cache(nfss->nfs_client, - nfs4_fl_free_deviceid_callback); - if (status) { - printk(KERN_WARNING "%s: deviceid cache could not be " - "initialized\n", __func__); - return status; + u32 stripe_width = flseg->stripe_unit * flseg->dsaddr->stripe_count; + u64 tmp; + + offset -= flseg->pattern_offset; + tmp = offset; + do_div(tmp, stripe_width); + + return tmp * flseg->stripe_unit + do_div(offset, flseg->stripe_unit); +} + +/* This function is used by the layout driver to calculate the + * offset of the file on the dserver based on whether the + * layout type is STRIPE_DENSE or STRIPE_SPARSE + */ +static loff_t +filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset) +{ + struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg); + + switch (flseg->stripe_type) { + case STRIPE_SPARSE: + return offset; + + case STRIPE_DENSE: + return filelayout_get_dense_offset(flseg, offset); } - dprintk("%s: deviceid cache has been initialized successfully\n", - __func__); + + BUG(); +} + +/* For data server errors we don't recover from */ +static void +filelayout_set_lo_fail(struct pnfs_layout_segment *lseg) +{ + if (lseg->pls_range.iomode == IOMODE_RW) { + dprintk("%s Setting layout IOMODE_RW fail bit\n", __func__); + set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags); + } else { + dprintk("%s Setting layout IOMODE_READ fail bit\n", __func__); + set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags); + } +} + +static int filelayout_async_handle_error(struct rpc_task *task, + struct nfs4_state *state, + struct nfs_client *clp, + int *reset) +{ + if (task->tk_status >= 0) + return 0; + + *reset = 0; + + switch (task->tk_status) { + case -NFS4ERR_BADSESSION: + case -NFS4ERR_BADSLOT: + case -NFS4ERR_BAD_HIGH_SLOT: + case -NFS4ERR_DEADSESSION: + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + case -NFS4ERR_SEQ_FALSE_RETRY: + case -NFS4ERR_SEQ_MISORDERED: + dprintk("%s ERROR %d, Reset session. Exchangeid " + "flags 0x%x\n", __func__, task->tk_status, + clp->cl_exchange_flags); + nfs4_schedule_session_recovery(clp->cl_session); + break; + case -NFS4ERR_DELAY: + case -NFS4ERR_GRACE: + case -EKEYEXPIRED: + rpc_delay(task, FILELAYOUT_POLL_RETRY_MAX); + break; + default: + dprintk("%s DS error. Retry through MDS %d\n", __func__, + task->tk_status); + *reset = 1; + break; + } + task->tk_status = 0; + return -EAGAIN; +} + +/* NFS_PROTO call done callback routines */ + +static int filelayout_read_done_cb(struct rpc_task *task, + struct nfs_read_data *data) +{ + struct nfs_client *clp = data->ds_clp; + int reset = 0; + + dprintk("%s DS read\n", __func__); + + if (filelayout_async_handle_error(task, data->args.context->state, + data->ds_clp, &reset) == -EAGAIN) { + dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n", + __func__, data->ds_clp, data->ds_clp->cl_session); + if (reset) { + filelayout_set_lo_fail(data->lseg); + nfs4_reset_read(task, data); + clp = NFS_SERVER(data->inode)->nfs_client; + } + nfs_restart_rpc(task, clp); + return -EAGAIN; + } + return 0; } -/* Clear out the layout by destroying its device list */ -static int -filelayout_clear_layoutdriver(struct nfs_server *nfss) +/* + * Call ops for the async read/write cases + * In the case of dense layouts, the offset needs to be reset to its + * original value. + */ +static void filelayout_read_prepare(struct rpc_task *task, void *data) { - dprintk("--> %s\n", __func__); + struct nfs_read_data *rdata = (struct nfs_read_data *)data; + + rdata->read_done_cb = filelayout_read_done_cb; + + if (nfs41_setup_sequence(rdata->ds_clp->cl_session, + &rdata->args.seq_args, &rdata->res.seq_res, + 0, task)) + return; + + rpc_call_start(task); +} + +static void filelayout_read_call_done(struct rpc_task *task, void *data) +{ + struct nfs_read_data *rdata = (struct nfs_read_data *)data; + + dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status); + + /* Note this may cause RPC to be resent */ + rdata->mds_ops->rpc_call_done(task, data); +} + +static void filelayout_read_release(void *data) +{ + struct nfs_read_data *rdata = (struct nfs_read_data *)data; + + rdata->mds_ops->rpc_release(data); +} + +static int filelayout_write_done_cb(struct rpc_task *task, + struct nfs_write_data *data) +{ + int reset = 0; + + if (filelayout_async_handle_error(task, data->args.context->state, + data->ds_clp, &reset) == -EAGAIN) { + struct nfs_client *clp; + + dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n", + __func__, data->ds_clp, data->ds_clp->cl_session); + if (reset) { + filelayout_set_lo_fail(data->lseg); + nfs4_reset_write(task, data); + clp = NFS_SERVER(data->inode)->nfs_client; + } else + clp = data->ds_clp; + nfs_restart_rpc(task, clp); + return -EAGAIN; + } - if (nfss->nfs_client->cl_devid_cache) - pnfs_put_deviceid_cache(nfss->nfs_client); return 0; } +static void filelayout_write_prepare(struct rpc_task *task, void *data) +{ + struct nfs_write_data *wdata = (struct nfs_write_data *)data; + + if (nfs41_setup_sequence(wdata->ds_clp->cl_session, + &wdata->args.seq_args, &wdata->res.seq_res, + 0, task)) + return; + + rpc_call_start(task); +} + +static void filelayout_write_call_done(struct rpc_task *task, void *data) +{ + struct nfs_write_data *wdata = (struct nfs_write_data *)data; + + /* Note this may cause RPC to be resent */ + wdata->mds_ops->rpc_call_done(task, data); +} + +static void filelayout_write_release(void *data) +{ + struct nfs_write_data *wdata = (struct nfs_write_data *)data; + + wdata->mds_ops->rpc_release(data); +} + +struct rpc_call_ops filelayout_read_call_ops = { + .rpc_call_prepare = filelayout_read_prepare, + .rpc_call_done = filelayout_read_call_done, + .rpc_release = filelayout_read_release, +}; + +struct rpc_call_ops filelayout_write_call_ops = { + .rpc_call_prepare = filelayout_write_prepare, + .rpc_call_done = filelayout_write_call_done, + .rpc_release = filelayout_write_release, +}; + +static enum pnfs_try_status +filelayout_read_pagelist(struct nfs_read_data *data) +{ + struct pnfs_layout_segment *lseg = data->lseg; + struct nfs4_pnfs_ds *ds; + loff_t offset = data->args.offset; + u32 j, idx; + struct nfs_fh *fh; + int status; + + dprintk("--> %s ino %lu pgbase %u req %Zu@%llu\n", + __func__, data->inode->i_ino, + data->args.pgbase, (size_t)data->args.count, offset); + + /* Retrieve the correct rpc_client for the byte range */ + j = nfs4_fl_calc_j_index(lseg, offset); + idx = nfs4_fl_calc_ds_index(lseg, j); + ds = nfs4_fl_prepare_ds(lseg, idx); + if (!ds) { + /* Either layout fh index faulty, or ds connect failed */ + set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags); + set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags); + return PNFS_NOT_ATTEMPTED; + } + dprintk("%s USE DS:ip %x %hu\n", __func__, + ntohl(ds->ds_ip_addr), ntohs(ds->ds_port)); + + /* No multipath support. Use first DS */ + data->ds_clp = ds->ds_clp; + fh = nfs4_fl_select_ds_fh(lseg, j); + if (fh) + data->args.fh = fh; + + data->args.offset = filelayout_get_dserver_offset(lseg, offset); + data->mds_offset = offset; + + /* Perform an asynchronous read to ds */ + status = nfs_initiate_read(data, ds->ds_clp->cl_rpcclient, + &filelayout_read_call_ops); + BUG_ON(status != 0); + return PNFS_ATTEMPTED; +} + +/* Perform async writes. */ +static enum pnfs_try_status +filelayout_write_pagelist(struct nfs_write_data *data, int sync) +{ + struct pnfs_layout_segment *lseg = data->lseg; + struct nfs4_pnfs_ds *ds; + loff_t offset = data->args.offset; + u32 j, idx; + struct nfs_fh *fh; + int status; + + /* Retrieve the correct rpc_client for the byte range */ + j = nfs4_fl_calc_j_index(lseg, offset); + idx = nfs4_fl_calc_ds_index(lseg, j); + ds = nfs4_fl_prepare_ds(lseg, idx); + if (!ds) { + printk(KERN_ERR "%s: prepare_ds failed, use MDS\n", __func__); + set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags); + set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags); + return PNFS_NOT_ATTEMPTED; + } + dprintk("%s ino %lu sync %d req %Zu@%llu DS:%x:%hu\n", __func__, + data->inode->i_ino, sync, (size_t) data->args.count, offset, + ntohl(ds->ds_ip_addr), ntohs(ds->ds_port)); + + /* We can't handle commit to ds yet */ + if (!FILELAYOUT_LSEG(lseg)->commit_through_mds) + data->args.stable = NFS_FILE_SYNC; + + data->write_done_cb = filelayout_write_done_cb; + data->ds_clp = ds->ds_clp; + fh = nfs4_fl_select_ds_fh(lseg, j); + if (fh) + data->args.fh = fh; + /* + * Get the file offset on the dserver. Set the write offset to + * this offset and save the original offset. + */ + data->args.offset = filelayout_get_dserver_offset(lseg, offset); + data->mds_offset = offset; + + /* Perform an asynchronous write */ + status = nfs_initiate_write(data, ds->ds_clp->cl_rpcclient, + &filelayout_write_call_ops, sync); + BUG_ON(status != 0); + return PNFS_ATTEMPTED; +} + /* * filelayout_check_layout() * @@ -92,14 +369,14 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo, goto out; } - if (fl->stripe_unit % PAGE_SIZE) { - dprintk("%s Stripe unit (%u) not page aligned\n", + if (!fl->stripe_unit || fl->stripe_unit % PAGE_SIZE) { + dprintk("%s Invalid stripe unit (%u)\n", __func__, fl->stripe_unit); goto out; } /* find and reference the deviceid */ - dsaddr = nfs4_fl_find_get_deviceid(nfss->nfs_client, id); + dsaddr = nfs4_fl_find_get_deviceid(id); if (dsaddr == NULL) { dsaddr = get_device_info(lo->plh_inode, id); if (dsaddr == NULL) @@ -134,7 +411,7 @@ out: dprintk("--> %s returns %d\n", __func__, status); return status; out_put: - pnfs_put_deviceid(nfss->nfs_client->cl_devid_cache, &dsaddr->deviceid); + nfs4_fl_put_deviceid(dsaddr); goto out; } @@ -243,23 +520,47 @@ filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid, static void filelayout_free_lseg(struct pnfs_layout_segment *lseg) { - struct nfs_server *nfss = NFS_SERVER(lseg->pls_layout->plh_inode); struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg); dprintk("--> %s\n", __func__); - pnfs_put_deviceid(nfss->nfs_client->cl_devid_cache, - &fl->dsaddr->deviceid); + nfs4_fl_put_deviceid(fl->dsaddr); _filelayout_free_lseg(fl); } +/* + * filelayout_pg_test(). Called by nfs_can_coalesce_requests() + * + * return 1 : coalesce page + * return 0 : don't coalesce page + */ +int +filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, + struct nfs_page *req) +{ + u64 p_stripe, r_stripe; + u32 stripe_unit; + + if (!pgio->pg_lseg) + return 1; + p_stripe = (u64)prev->wb_index << PAGE_CACHE_SHIFT; + r_stripe = (u64)req->wb_index << PAGE_CACHE_SHIFT; + stripe_unit = FILELAYOUT_LSEG(pgio->pg_lseg)->stripe_unit; + + do_div(p_stripe, stripe_unit); + do_div(r_stripe, stripe_unit); + + return (p_stripe == r_stripe); +} + static struct pnfs_layoutdriver_type filelayout_type = { - .id = LAYOUT_NFSV4_1_FILES, - .name = "LAYOUT_NFSV4_1_FILES", - .owner = THIS_MODULE, - .set_layoutdriver = filelayout_set_layoutdriver, - .clear_layoutdriver = filelayout_clear_layoutdriver, - .alloc_lseg = filelayout_alloc_lseg, - .free_lseg = filelayout_free_lseg, + .id = LAYOUT_NFSV4_1_FILES, + .name = "LAYOUT_NFSV4_1_FILES", + .owner = THIS_MODULE, + .alloc_lseg = filelayout_alloc_lseg, + .free_lseg = filelayout_free_lseg, + .pg_test = filelayout_pg_test, + .read_pagelist = filelayout_read_pagelist, + .write_pagelist = filelayout_write_pagelist, }; static int __init nfs4filelayout_init(void) diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h index bbf60dd2ab9..ee0c907742b 100644 --- a/fs/nfs/nfs4filelayout.h +++ b/fs/nfs/nfs4filelayout.h @@ -55,8 +55,14 @@ struct nfs4_pnfs_ds { atomic_t ds_count; }; +/* nfs4_file_layout_dsaddr flags */ +#define NFS4_DEVICE_ID_NEG_ENTRY 0x00000001 + struct nfs4_file_layout_dsaddr { - struct pnfs_deviceid_node deviceid; + struct hlist_node node; + struct nfs4_deviceid deviceid; + atomic_t ref; + unsigned long flags; u32 stripe_count; u8 *stripe_indices; u32 ds_num; @@ -83,11 +89,18 @@ FILELAYOUT_LSEG(struct pnfs_layout_segment *lseg) generic_hdr); } -extern void nfs4_fl_free_deviceid_callback(struct pnfs_deviceid_node *); +extern struct nfs_fh * +nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j); + extern void print_ds(struct nfs4_pnfs_ds *ds); extern void print_deviceid(struct nfs4_deviceid *dev_id); +u32 nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset); +u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j); +struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, + u32 ds_idx); extern struct nfs4_file_layout_dsaddr * -nfs4_fl_find_get_deviceid(struct nfs_client *, struct nfs4_deviceid *dev_id); +nfs4_fl_find_get_deviceid(struct nfs4_deviceid *dev_id); +extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr); struct nfs4_file_layout_dsaddr * get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id); diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c index f5c9b125e8c..68143c162e3 100644 --- a/fs/nfs/nfs4filelayoutdev.c +++ b/fs/nfs/nfs4filelayoutdev.c @@ -37,6 +37,30 @@ #define NFSDBG_FACILITY NFSDBG_PNFS_LD /* + * Device ID RCU cache. A device ID is unique per client ID and layout type. + */ +#define NFS4_FL_DEVICE_ID_HASH_BITS 5 +#define NFS4_FL_DEVICE_ID_HASH_SIZE (1 << NFS4_FL_DEVICE_ID_HASH_BITS) +#define NFS4_FL_DEVICE_ID_HASH_MASK (NFS4_FL_DEVICE_ID_HASH_SIZE - 1) + +static inline u32 +nfs4_fl_deviceid_hash(struct nfs4_deviceid *id) +{ + unsigned char *cptr = (unsigned char *)id->data; + unsigned int nbytes = NFS4_DEVICEID4_SIZE; + u32 x = 0; + + while (nbytes--) { + x *= 37; + x += *cptr++; + } + return x & NFS4_FL_DEVICE_ID_HASH_MASK; +} + +static struct hlist_head filelayout_deviceid_cache[NFS4_FL_DEVICE_ID_HASH_SIZE]; +static DEFINE_SPINLOCK(filelayout_deviceid_lock); + +/* * Data server cache * * Data servers can be mapped to different device ids. @@ -104,6 +128,67 @@ _data_server_lookup_locked(u32 ip_addr, u32 port) return NULL; } +/* + * Create an rpc connection to the nfs4_pnfs_ds data server + * Currently only support IPv4 + */ +static int +nfs4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds) +{ + struct nfs_client *clp; + struct sockaddr_in sin; + int status = 0; + + dprintk("--> %s ip:port %x:%hu au_flavor %d\n", __func__, + ntohl(ds->ds_ip_addr), ntohs(ds->ds_port), + mds_srv->nfs_client->cl_rpcclient->cl_auth->au_flavor); + + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = ds->ds_ip_addr; + sin.sin_port = ds->ds_port; + + clp = nfs4_set_ds_client(mds_srv->nfs_client, (struct sockaddr *)&sin, + sizeof(sin), IPPROTO_TCP); + if (IS_ERR(clp)) { + status = PTR_ERR(clp); + goto out; + } + + if ((clp->cl_exchange_flags & EXCHGID4_FLAG_MASK_PNFS) != 0) { + if (!is_ds_client(clp)) { + status = -ENODEV; + goto out_put; + } + ds->ds_clp = clp; + dprintk("%s [existing] ip=%x, port=%hu\n", __func__, + ntohl(ds->ds_ip_addr), ntohs(ds->ds_port)); + goto out; + } + + /* + * Do not set NFS_CS_CHECK_LEASE_TIME instead set the DS lease to + * be equal to the MDS lease. Renewal is scheduled in create_session. + */ + spin_lock(&mds_srv->nfs_client->cl_lock); + clp->cl_lease_time = mds_srv->nfs_client->cl_lease_time; + spin_unlock(&mds_srv->nfs_client->cl_lock); + clp->cl_last_renewal = jiffies; + + /* New nfs_client */ + status = nfs4_init_ds_session(clp); + if (status) + goto out_put; + + ds->ds_clp = clp; + dprintk("%s [new] ip=%x, port=%hu\n", __func__, ntohl(ds->ds_ip_addr), + ntohs(ds->ds_port)); +out: + return status; +out_put: + nfs_put_client(clp); + goto out; +} + static void destroy_ds(struct nfs4_pnfs_ds *ds) { @@ -122,7 +207,7 @@ nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr) struct nfs4_pnfs_ds *ds; int i; - print_deviceid(&dsaddr->deviceid.de_id); + print_deviceid(&dsaddr->deviceid); for (i = 0; i < dsaddr->ds_num; i++) { ds = dsaddr->ds_list[i]; @@ -139,15 +224,6 @@ nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr) kfree(dsaddr); } -void -nfs4_fl_free_deviceid_callback(struct pnfs_deviceid_node *device) -{ - struct nfs4_file_layout_dsaddr *dsaddr = - container_of(device, struct nfs4_file_layout_dsaddr, deviceid); - - nfs4_fl_free_deviceid(dsaddr); -} - static struct nfs4_pnfs_ds * nfs4_pnfs_ds_add(struct inode *inode, u32 ip_addr, u32 port) { @@ -219,6 +295,10 @@ decode_and_add_ds(__be32 **pp, struct inode *inode) goto out_err; } buf = kmalloc(rlen + 1, GFP_KERNEL); + if (!buf) { + dprintk("%s: Not enough memory\n", __func__); + goto out_err; + } buf[rlen] = '\0'; memcpy(buf, r_addr, rlen); @@ -296,7 +376,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev) dsaddr->stripe_count = cnt; dsaddr->ds_num = num; - memcpy(&dsaddr->deviceid.de_id, &pdev->dev_id, sizeof(pdev->dev_id)); + memcpy(&dsaddr->deviceid, &pdev->dev_id, sizeof(pdev->dev_id)); /* Go back an read stripe indices */ p = indicesp; @@ -346,28 +426,37 @@ out_err: } /* - * Decode the opaque device specified in 'dev' - * and add it to the list of available devices. - * If the deviceid is already cached, nfs4_add_deviceid will return - * a pointer to the cached struct and throw away the new. + * Decode the opaque device specified in 'dev' and add it to the cache of + * available devices. */ -static struct nfs4_file_layout_dsaddr* +static struct nfs4_file_layout_dsaddr * decode_and_add_device(struct inode *inode, struct pnfs_device *dev) { - struct nfs4_file_layout_dsaddr *dsaddr; - struct pnfs_deviceid_node *d; + struct nfs4_file_layout_dsaddr *d, *new; + long hash; - dsaddr = decode_device(inode, dev); - if (!dsaddr) { + new = decode_device(inode, dev); + if (!new) { printk(KERN_WARNING "%s: Could not decode or add device\n", __func__); return NULL; } - d = pnfs_add_deviceid(NFS_SERVER(inode)->nfs_client->cl_devid_cache, - &dsaddr->deviceid); + spin_lock(&filelayout_deviceid_lock); + d = nfs4_fl_find_get_deviceid(&new->deviceid); + if (d) { + spin_unlock(&filelayout_deviceid_lock); + nfs4_fl_free_deviceid(new); + return d; + } + + INIT_HLIST_NODE(&new->node); + atomic_set(&new->ref, 1); + hash = nfs4_fl_deviceid_hash(&new->deviceid); + hlist_add_head_rcu(&new->node, &filelayout_deviceid_cache[hash]); + spin_unlock(&filelayout_deviceid_lock); - return container_of(d, struct nfs4_file_layout_dsaddr, deviceid); + return new; } /* @@ -442,12 +531,123 @@ out_free: return dsaddr; } +void +nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr) +{ + if (atomic_dec_and_lock(&dsaddr->ref, &filelayout_deviceid_lock)) { + hlist_del_rcu(&dsaddr->node); + spin_unlock(&filelayout_deviceid_lock); + + synchronize_rcu(); + nfs4_fl_free_deviceid(dsaddr); + } +} + struct nfs4_file_layout_dsaddr * -nfs4_fl_find_get_deviceid(struct nfs_client *clp, struct nfs4_deviceid *id) +nfs4_fl_find_get_deviceid(struct nfs4_deviceid *id) +{ + struct nfs4_file_layout_dsaddr *d; + struct hlist_node *n; + long hash = nfs4_fl_deviceid_hash(id); + + + rcu_read_lock(); + hlist_for_each_entry_rcu(d, n, &filelayout_deviceid_cache[hash], node) { + if (!memcmp(&d->deviceid, id, sizeof(*id))) { + if (!atomic_inc_not_zero(&d->ref)) + goto fail; + rcu_read_unlock(); + return d; + } + } +fail: + rcu_read_unlock(); + return NULL; +} + +/* + * Want res = (offset - layout->pattern_offset)/ layout->stripe_unit + * Then: ((res + fsi) % dsaddr->stripe_count) + */ +u32 +nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset) +{ + struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg); + u64 tmp; + + tmp = offset - flseg->pattern_offset; + do_div(tmp, flseg->stripe_unit); + tmp += flseg->first_stripe_index; + return do_div(tmp, flseg->dsaddr->stripe_count); +} + +u32 +nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j) { - struct pnfs_deviceid_node *d; + return FILELAYOUT_LSEG(lseg)->dsaddr->stripe_indices[j]; +} - d = pnfs_find_get_deviceid(clp->cl_devid_cache, id); - return (d == NULL) ? NULL : - container_of(d, struct nfs4_file_layout_dsaddr, deviceid); +struct nfs_fh * +nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j) +{ + struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg); + u32 i; + + if (flseg->stripe_type == STRIPE_SPARSE) { + if (flseg->num_fh == 1) + i = 0; + else if (flseg->num_fh == 0) + /* Use the MDS OPEN fh set in nfs_read_rpcsetup */ + return NULL; + else + i = nfs4_fl_calc_ds_index(lseg, j); + } else + i = j; + return flseg->fh_array[i]; +} + +static void +filelayout_mark_devid_negative(struct nfs4_file_layout_dsaddr *dsaddr, + int err, u32 ds_addr) +{ + u32 *p = (u32 *)&dsaddr->deviceid; + + printk(KERN_ERR "NFS: data server %x connection error %d." + " Deviceid [%x%x%x%x] marked out of use.\n", + ds_addr, err, p[0], p[1], p[2], p[3]); + + spin_lock(&filelayout_deviceid_lock); + dsaddr->flags |= NFS4_DEVICE_ID_NEG_ENTRY; + spin_unlock(&filelayout_deviceid_lock); +} + +struct nfs4_pnfs_ds * +nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx) +{ + struct nfs4_file_layout_dsaddr *dsaddr = FILELAYOUT_LSEG(lseg)->dsaddr; + struct nfs4_pnfs_ds *ds = dsaddr->ds_list[ds_idx]; + + if (ds == NULL) { + printk(KERN_ERR "%s: No data server for offset index %d\n", + __func__, ds_idx); + return NULL; + } + + if (!ds->ds_clp) { + struct nfs_server *s = NFS_SERVER(lseg->pls_layout->plh_inode); + int err; + + if (dsaddr->flags & NFS4_DEVICE_ID_NEG_ENTRY) { + /* Already tried to connect, don't try again */ + dprintk("%s Deviceid marked out of use\n", __func__); + return NULL; + } + err = nfs4_ds_connect(s, ds); + if (err) { + filelayout_mark_devid_negative(dsaddr, err, + ntohl(ds->ds_ip_addr)); + return NULL; + } + } + return ds; } diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index 3c2a1724fbd..bb80c49b653 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -54,33 +54,29 @@ Elong: /* * Determine the mount path as a string */ -static char *nfs4_path(const struct vfsmount *mnt_parent, - const struct dentry *dentry, - char *buffer, ssize_t buflen) +static char *nfs4_path(struct dentry *dentry, char *buffer, ssize_t buflen) { - const char *srvpath; - - srvpath = strchr(mnt_parent->mnt_devname, ':'); - if (srvpath) - srvpath++; - else - srvpath = mnt_parent->mnt_devname; - - return nfs_path(srvpath, mnt_parent->mnt_root, dentry, buffer, buflen); + char *limit; + char *path = nfs_path(&limit, dentry, buffer, buflen); + if (!IS_ERR(path)) { + char *colon = strchr(path, ':'); + if (colon && colon < limit) + path = colon + 1; + } + return path; } /* * Check that fs_locations::fs_root [RFC3530 6.3] is a prefix for what we * believe to be the server path to this dentry */ -static int nfs4_validate_fspath(const struct vfsmount *mnt_parent, - const struct dentry *dentry, +static int nfs4_validate_fspath(struct dentry *dentry, const struct nfs4_fs_locations *locations, char *page, char *page2) { const char *path, *fs_path; - path = nfs4_path(mnt_parent, dentry, page, PAGE_SIZE); + path = nfs4_path(dentry, page, PAGE_SIZE); if (IS_ERR(path)) return PTR_ERR(path); @@ -165,20 +161,18 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, /** * nfs_follow_referral - set up mountpoint when hitting a referral on moved error - * @mnt_parent - mountpoint of parent directory * @dentry - parent directory * @locations - array of NFSv4 server location information * */ -static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent, - const struct dentry *dentry, +static struct vfsmount *nfs_follow_referral(struct dentry *dentry, const struct nfs4_fs_locations *locations) { struct vfsmount *mnt = ERR_PTR(-ENOENT); struct nfs_clone_mount mountdata = { - .sb = mnt_parent->mnt_sb, + .sb = dentry->d_sb, .dentry = dentry, - .authflavor = NFS_SB(mnt_parent->mnt_sb)->client->cl_auth->au_flavor, + .authflavor = NFS_SB(dentry->d_sb)->client->cl_auth->au_flavor, }; char *page = NULL, *page2 = NULL; int loc, error; @@ -198,7 +192,7 @@ static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent, goto out; /* Ensure fs path is a prefix of current dentry path */ - error = nfs4_validate_fspath(mnt_parent, dentry, locations, page, page2); + error = nfs4_validate_fspath(dentry, locations, page, page2); if (error < 0) { mnt = ERR_PTR(error); goto out; @@ -225,11 +219,10 @@ out: /* * nfs_do_refmount - handle crossing a referral on server - * @mnt_parent - mountpoint of referral * @dentry - dentry of referral * */ -struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry) +struct vfsmount *nfs_do_refmount(struct dentry *dentry) { struct vfsmount *mnt = ERR_PTR(-ENOMEM); struct dentry *parent; @@ -262,7 +255,7 @@ struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentr fs_locations->fs_path.ncomponents <= 0) goto out_free; - mnt = nfs_follow_referral(mnt_parent, dentry, fs_locations); + mnt = nfs_follow_referral(dentry, fs_locations); out_free: __free_page(page); kfree(fs_locations); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 1ff76acc7e9..1d84e7088af 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -51,7 +51,6 @@ #include <linux/sunrpc/bc_xprt.h> #include <linux/xattr.h> #include <linux/utsname.h> -#include <linux/mm.h> #include "nfs4_fs.h" #include "delegation.h" @@ -86,6 +85,9 @@ static int nfs4_map_errors(int err) switch (err) { case -NFS4ERR_RESOURCE: return -EREMOTEIO; + case -NFS4ERR_BADOWNER: + case -NFS4ERR_BADNAME: + return -EINVAL; default: dprintk("%s could not handle NFSv4 error %d\n", __func__, -err); @@ -242,7 +244,7 @@ static int nfs4_delay(struct rpc_clnt *clnt, long *timeout) /* This is the error handling routine for processes that are allowed * to sleep. */ -static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct nfs4_exception *exception) +static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_exception *exception) { struct nfs_client *clp = server->nfs_client; struct nfs4_state *state = exception->state; @@ -257,12 +259,13 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, case -NFS4ERR_OPENMODE: if (state == NULL) break; - nfs4_state_mark_reclaim_nograce(clp, state); - goto do_state_recovery; + nfs4_schedule_stateid_recovery(server, state); + goto wait_on_recovery; case -NFS4ERR_STALE_STATEID: case -NFS4ERR_STALE_CLIENTID: case -NFS4ERR_EXPIRED: - goto do_state_recovery; + nfs4_schedule_lease_recovery(clp); + goto wait_on_recovery; #if defined(CONFIG_NFS_V4_1) case -NFS4ERR_BADSESSION: case -NFS4ERR_BADSLOT: @@ -273,7 +276,7 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, case -NFS4ERR_SEQ_MISORDERED: dprintk("%s ERROR: %d Reset session\n", __func__, errorcode); - nfs4_schedule_state_recovery(clp); + nfs4_schedule_session_recovery(clp->cl_session); exception->retry = 1; break; #endif /* defined(CONFIG_NFS_V4_1) */ @@ -293,11 +296,23 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, break; case -NFS4ERR_OLD_STATEID: exception->retry = 1; + break; + case -NFS4ERR_BADOWNER: + /* The following works around a Linux server bug! */ + case -NFS4ERR_BADNAME: + if (server->caps & NFS_CAP_UIDGID_NOMAP) { + server->caps &= ~NFS_CAP_UIDGID_NOMAP; + exception->retry = 1; + printk(KERN_WARNING "NFS: v4 server %s " + "does not accept raw " + "uid/gids. " + "Reenabling the idmapper.\n", + server->nfs_client->cl_hostname); + } } /* We failed to handle the error */ return nfs4_map_errors(ret); -do_state_recovery: - nfs4_schedule_state_recovery(clp); +wait_on_recovery: ret = nfs4_wait_clnt_recover(clp); if (ret == 0) exception->retry = 1; @@ -436,8 +451,8 @@ static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res * clp = res->sr_session->clp; do_renew_lease(clp, timestamp); /* Check sequence flags */ - if (atomic_read(&clp->cl_count) > 1) - nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags); + if (res->sr_status_flags != 0) + nfs4_schedule_lease_recovery(clp); break; case -NFS4ERR_DELAY: /* The server detected a resend of the RPC call and @@ -506,7 +521,7 @@ out: return ret_id; } -static int nfs41_setup_sequence(struct nfs4_session *session, +int nfs41_setup_sequence(struct nfs4_session *session, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, int cache_reply, @@ -572,6 +587,7 @@ static int nfs41_setup_sequence(struct nfs4_session *session, res->sr_status = 1; return 0; } +EXPORT_SYMBOL_GPL(nfs41_setup_sequence); int nfs4_setup_sequence(const struct nfs_server *server, struct nfs4_sequence_args *args, @@ -1256,14 +1272,13 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state case -NFS4ERR_BAD_HIGH_SLOT: case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: case -NFS4ERR_DEADSESSION: - nfs4_schedule_state_recovery( - server->nfs_client); + nfs4_schedule_session_recovery(server->nfs_client->cl_session); goto out; case -NFS4ERR_STALE_CLIENTID: case -NFS4ERR_STALE_STATEID: case -NFS4ERR_EXPIRED: /* Don't recall a delegation if it was lost */ - nfs4_schedule_state_recovery(server->nfs_client); + nfs4_schedule_lease_recovery(server->nfs_client); goto out; case -ERESTARTSYS: /* @@ -1272,7 +1287,7 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state */ case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_BAD_STATEID: - nfs4_state_mark_reclaim_nograce(server->nfs_client, state); + nfs4_schedule_stateid_recovery(server, state); case -EKEYEXPIRED: /* * User RPCSEC_GSS context has expired. @@ -1575,9 +1590,8 @@ static int _nfs4_proc_open(struct nfs4_opendata *data) return 0; } -static int nfs4_recover_expired_lease(struct nfs_server *server) +static int nfs4_client_recover_expired_lease(struct nfs_client *clp) { - struct nfs_client *clp = server->nfs_client; unsigned int loop; int ret; @@ -1588,12 +1602,17 @@ static int nfs4_recover_expired_lease(struct nfs_server *server) if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) && !test_bit(NFS4CLNT_CHECK_LEASE,&clp->cl_state)) break; - nfs4_schedule_state_recovery(clp); + nfs4_schedule_state_manager(clp); ret = -EIO; } return ret; } +static int nfs4_recover_expired_lease(struct nfs_server *server) +{ + return nfs4_client_recover_expired_lease(server->nfs_client); +} + /* * OPEN_EXPIRED: * reclaim state on the server after a network partition. @@ -3071,15 +3090,10 @@ static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, return err; } -static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data) +static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data) { struct nfs_server *server = NFS_SERVER(data->inode); - dprintk("--> %s\n", __func__); - - if (!nfs4_sequence_done(task, &data->res.seq_res)) - return -EAGAIN; - if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) { nfs_restart_rpc(task, server->nfs_client); return -EAGAIN; @@ -3091,19 +3105,44 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data) return 0; } +static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data) +{ + + dprintk("--> %s\n", __func__); + + if (!nfs4_sequence_done(task, &data->res.seq_res)) + return -EAGAIN; + + return data->read_done_cb(task, data); +} + static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg) { data->timestamp = jiffies; + data->read_done_cb = nfs4_read_done_cb; msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ]; } -static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data) +/* Reset the the nfs_read_data to send the read to the MDS. */ +void nfs4_reset_read(struct rpc_task *task, struct nfs_read_data *data) +{ + dprintk("%s Reset task for i/o through\n", __func__); + put_lseg(data->lseg); + data->lseg = NULL; + /* offsets will differ in the dense stripe case */ + data->args.offset = data->mds_offset; + data->ds_clp = NULL; + data->args.fh = NFS_FH(data->inode); + data->read_done_cb = nfs4_read_done_cb; + task->tk_ops = data->mds_ops; + rpc_task_reset_client(task, NFS_CLIENT(data->inode)); +} +EXPORT_SYMBOL_GPL(nfs4_reset_read); + +static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data) { struct inode *inode = data->inode; - if (!nfs4_sequence_done(task, &data->res.seq_res)) - return -EAGAIN; - if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) { nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client); return -EAGAIN; @@ -3115,11 +3154,41 @@ static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data) return 0; } +static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data) +{ + if (!nfs4_sequence_done(task, &data->res.seq_res)) + return -EAGAIN; + return data->write_done_cb(task, data); +} + +/* Reset the the nfs_write_data to send the write to the MDS. */ +void nfs4_reset_write(struct rpc_task *task, struct nfs_write_data *data) +{ + dprintk("%s Reset task for i/o through\n", __func__); + put_lseg(data->lseg); + data->lseg = NULL; + data->ds_clp = NULL; + data->write_done_cb = nfs4_write_done_cb; + data->args.fh = NFS_FH(data->inode); + data->args.bitmask = data->res.server->cache_consistency_bitmask; + data->args.offset = data->mds_offset; + data->res.fattr = &data->fattr; + task->tk_ops = data->mds_ops; + rpc_task_reset_client(task, NFS_CLIENT(data->inode)); +} +EXPORT_SYMBOL_GPL(nfs4_reset_write); + static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_message *msg) { struct nfs_server *server = NFS_SERVER(data->inode); - data->args.bitmask = server->cache_consistency_bitmask; + if (data->lseg) { + data->args.bitmask = NULL; + data->res.fattr = NULL; + } else + data->args.bitmask = server->cache_consistency_bitmask; + if (!data->write_done_cb) + data->write_done_cb = nfs4_write_done_cb; data->res.server = server; data->timestamp = jiffies; @@ -3179,7 +3248,7 @@ static void nfs4_renew_done(struct rpc_task *task, void *calldata) if (task->tk_status < 0) { /* Unless we're shutting down, schedule state recovery! */ if (test_bit(NFS_CS_RENEWD, &clp->cl_res_state) != 0) - nfs4_schedule_state_recovery(clp); + nfs4_schedule_lease_recovery(clp); return; } do_renew_lease(clp, timestamp); @@ -3262,7 +3331,7 @@ static int buf_to_pages_noslab(const void *buf, size_t buflen, spages = pages; do { - len = min(PAGE_CACHE_SIZE, buflen); + len = min_t(size_t, PAGE_CACHE_SIZE, buflen); newpage = alloc_page(GFP_KERNEL); if (newpage == NULL) @@ -3504,12 +3573,13 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, case -NFS4ERR_OPENMODE: if (state == NULL) break; - nfs4_state_mark_reclaim_nograce(clp, state); - goto do_state_recovery; + nfs4_schedule_stateid_recovery(server, state); + goto wait_on_recovery; case -NFS4ERR_STALE_STATEID: case -NFS4ERR_STALE_CLIENTID: case -NFS4ERR_EXPIRED: - goto do_state_recovery; + nfs4_schedule_lease_recovery(clp); + goto wait_on_recovery; #if defined(CONFIG_NFS_V4_1) case -NFS4ERR_BADSESSION: case -NFS4ERR_BADSLOT: @@ -3520,7 +3590,7 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, case -NFS4ERR_SEQ_MISORDERED: dprintk("%s ERROR %d, Reset session\n", __func__, task->tk_status); - nfs4_schedule_state_recovery(clp); + nfs4_schedule_session_recovery(clp->cl_session); task->tk_status = 0; return -EAGAIN; #endif /* CONFIG_NFS_V4_1 */ @@ -3537,9 +3607,8 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, } task->tk_status = nfs4_map_errors(task->tk_status); return 0; -do_state_recovery: +wait_on_recovery: rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL); - nfs4_schedule_state_recovery(clp); if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0) rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task); task->tk_status = 0; @@ -4150,7 +4219,7 @@ static void nfs4_lock_release(void *calldata) task = nfs4_do_unlck(&data->fl, data->ctx, data->lsp, data->arg.lock_seqid); if (!IS_ERR(task)) - rpc_put_task(task); + rpc_put_task_async(task); dprintk("%s: cancelling lock!\n", __func__); } else nfs_free_seqid(data->arg.lock_seqid); @@ -4174,23 +4243,18 @@ static const struct rpc_call_ops nfs4_recover_lock_ops = { static void nfs4_handle_setlk_error(struct nfs_server *server, struct nfs4_lock_state *lsp, int new_lock_owner, int error) { - struct nfs_client *clp = server->nfs_client; - struct nfs4_state *state = lsp->ls_state; - switch (error) { case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_BAD_STATEID: - case -NFS4ERR_EXPIRED: + lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED; if (new_lock_owner != 0 || (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0) - nfs4_state_mark_reclaim_nograce(clp, state); - lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED; + nfs4_schedule_stateid_recovery(server, lsp->ls_state); break; case -NFS4ERR_STALE_STATEID: - if (new_lock_owner != 0 || - (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0) - nfs4_state_mark_reclaim_reboot(clp, state); lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED; + case -NFS4ERR_EXPIRED: + nfs4_schedule_lease_recovery(server->nfs_client); }; } @@ -4406,12 +4470,14 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl) case -NFS4ERR_EXPIRED: case -NFS4ERR_STALE_CLIENTID: case -NFS4ERR_STALE_STATEID: + nfs4_schedule_lease_recovery(server->nfs_client); + goto out; case -NFS4ERR_BADSESSION: case -NFS4ERR_BADSLOT: case -NFS4ERR_BAD_HIGH_SLOT: case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: case -NFS4ERR_DEADSESSION: - nfs4_schedule_state_recovery(server->nfs_client); + nfs4_schedule_session_recovery(server->nfs_client->cl_session); goto out; case -ERESTARTSYS: /* @@ -4421,7 +4487,7 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl) case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_BAD_STATEID: case -NFS4ERR_OPENMODE: - nfs4_state_mark_reclaim_nograce(server->nfs_client, state); + nfs4_schedule_stateid_recovery(server, state); err = 0; goto out; case -EKEYEXPIRED: @@ -5028,10 +5094,20 @@ int nfs4_proc_create_session(struct nfs_client *clp) int status; unsigned *ptr; struct nfs4_session *session = clp->cl_session; + long timeout = 0; + int err; dprintk("--> %s clp=%p session=%p\n", __func__, clp, session); - status = _nfs4_proc_create_session(clp); + do { + status = _nfs4_proc_create_session(clp); + if (status == -NFS4ERR_DELAY) { + err = nfs4_delay(clp->cl_rpcclient, &timeout); + if (err) + status = err; + } + } while (status == -NFS4ERR_DELAY); + if (status) goto out; @@ -5113,6 +5189,27 @@ int nfs4_init_session(struct nfs_server *server) return ret; } +int nfs4_init_ds_session(struct nfs_client *clp) +{ + struct nfs4_session *session = clp->cl_session; + int ret; + + if (!test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state)) + return 0; + + ret = nfs4_client_recover_expired_lease(clp); + if (!ret) + /* Test for the DS role */ + if (!is_ds_client(clp)) + ret = -ENODEV; + if (!ret) + ret = nfs4_check_client_ready(clp); + return ret; + +} +EXPORT_SYMBOL_GPL(nfs4_init_ds_session); + + /* * Renew the cl_session lease. */ @@ -5140,7 +5237,7 @@ static int nfs41_sequence_handle_errors(struct rpc_task *task, struct nfs_client rpc_delay(task, NFS4_POLL_RETRY_MAX); return -EAGAIN; default: - nfs4_schedule_state_recovery(clp); + nfs4_schedule_lease_recovery(clp); } return 0; } @@ -5227,7 +5324,7 @@ static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cr if (IS_ERR(task)) ret = PTR_ERR(task); else - rpc_put_task(task); + rpc_put_task_async(task); dprintk("<-- %s status=%d\n", __func__, ret); return ret; } @@ -5243,8 +5340,13 @@ static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred) goto out; } ret = rpc_wait_for_completion_task(task); - if (!ret) + if (!ret) { + struct nfs4_sequence_res *res = task->tk_msg.rpc_resp; + + if (task->tk_status == 0) + nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags); ret = task->tk_status; + } rpc_put_task(task); out: dprintk("<-- %s status=%d\n", __func__, ret); @@ -5281,7 +5383,7 @@ static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nf rpc_delay(task, NFS4_POLL_RETRY_MAX); return -EAGAIN; default: - nfs4_schedule_state_recovery(clp); + nfs4_schedule_lease_recovery(clp); } return 0; } @@ -5349,6 +5451,9 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp) status = PTR_ERR(task); goto out; } + status = nfs4_wait_for_completion_rpc_task(task); + if (status == 0) + status = task->tk_status; rpc_put_task(task); return 0; out: @@ -5635,6 +5740,7 @@ const struct nfs_rpc_ops nfs_v4_clientops = { .clear_acl_cache = nfs4_zap_acl_attr, .close_context = nfs4_close_context, .open_context = nfs4_atomic_open, + .init_client = nfs4_init_client, }; static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = { diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c index 402143d75fc..df8e7f3ca56 100644 --- a/fs/nfs/nfs4renewd.c +++ b/fs/nfs/nfs4renewd.c @@ -64,12 +64,8 @@ nfs4_renew_state(struct work_struct *work) ops = clp->cl_mvops->state_renewal_ops; dprintk("%s: start\n", __func__); - rcu_read_lock(); - if (list_empty(&clp->cl_superblocks)) { - rcu_read_unlock(); + if (test_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state)) goto out; - } - rcu_read_unlock(); spin_lock(&clp->cl_lock); lease = clp->cl_lease_time; diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index e6742b57a04..ab1bf5bb021 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -153,6 +153,11 @@ static int nfs41_setup_state_renewal(struct nfs_client *clp) int status; struct nfs_fsinfo fsinfo; + if (!test_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state)) { + nfs4_schedule_state_renewal(clp); + return 0; + } + status = nfs4_proc_get_lease_time(clp, &fsinfo); if (status == 0) { /* Update lease time and schedule renewal */ @@ -1007,9 +1012,9 @@ void nfs4_schedule_state_manager(struct nfs_client *clp) } /* - * Schedule a state recovery attempt + * Schedule a lease recovery attempt */ -void nfs4_schedule_state_recovery(struct nfs_client *clp) +void nfs4_schedule_lease_recovery(struct nfs_client *clp) { if (!clp) return; @@ -1018,7 +1023,7 @@ void nfs4_schedule_state_recovery(struct nfs_client *clp) nfs4_schedule_state_manager(clp); } -int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state) +static int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state) { set_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags); @@ -1032,7 +1037,7 @@ int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *st return 1; } -int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state) +static int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state) { set_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags); clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags); @@ -1041,6 +1046,14 @@ int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *s return 1; } +void nfs4_schedule_stateid_recovery(const struct nfs_server *server, struct nfs4_state *state) +{ + struct nfs_client *clp = server->nfs_client; + + nfs4_state_mark_reclaim_nograce(clp, state); + nfs4_schedule_state_manager(clp); +} + static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_recovery_ops *ops) { struct inode *inode = state->inode; @@ -1436,10 +1449,16 @@ static int nfs4_reclaim_lease(struct nfs_client *clp) } #ifdef CONFIG_NFS_V4_1 +void nfs4_schedule_session_recovery(struct nfs4_session *session) +{ + nfs4_schedule_lease_recovery(session->clp); +} +EXPORT_SYMBOL_GPL(nfs4_schedule_session_recovery); + void nfs41_handle_recall_slot(struct nfs_client *clp) { set_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state); - nfs4_schedule_state_recovery(clp); + nfs4_schedule_state_manager(clp); } static void nfs4_reset_all_state(struct nfs_client *clp) @@ -1447,7 +1466,7 @@ static void nfs4_reset_all_state(struct nfs_client *clp) if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) { clp->cl_boot_time = CURRENT_TIME; nfs4_state_start_reclaim_nograce(clp); - nfs4_schedule_state_recovery(clp); + nfs4_schedule_state_manager(clp); } } @@ -1455,7 +1474,7 @@ static void nfs41_handle_server_reboot(struct nfs_client *clp) { if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) { nfs4_state_start_reclaim_reboot(clp); - nfs4_schedule_state_recovery(clp); + nfs4_schedule_state_manager(clp); } } @@ -1475,7 +1494,7 @@ static void nfs41_handle_cb_path_down(struct nfs_client *clp) { nfs_expire_all_delegations(clp); if (test_and_set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) == 0) - nfs4_schedule_state_recovery(clp); + nfs4_schedule_state_manager(clp); } void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags) diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 4e2c168b6ee..0cf560f7788 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -844,7 +844,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const if (iap->ia_valid & ATTR_MODE) len += 4; if (iap->ia_valid & ATTR_UID) { - owner_namelen = nfs_map_uid_to_name(server->nfs_client, iap->ia_uid, owner_name, IDMAP_NAMESZ); + owner_namelen = nfs_map_uid_to_name(server, iap->ia_uid, owner_name, IDMAP_NAMESZ); if (owner_namelen < 0) { dprintk("nfs: couldn't resolve uid %d to string\n", iap->ia_uid); @@ -856,7 +856,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const len += 4 + (XDR_QUADLEN(owner_namelen) << 2); } if (iap->ia_valid & ATTR_GID) { - owner_grouplen = nfs_map_gid_to_group(server->nfs_client, iap->ia_gid, owner_group, IDMAP_NAMESZ); + owner_grouplen = nfs_map_gid_to_group(server, iap->ia_gid, owner_group, IDMAP_NAMESZ); if (owner_grouplen < 0) { dprintk("nfs: couldn't resolve gid %d to string\n", iap->ia_gid); @@ -1384,7 +1384,7 @@ static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr) hdr->replen += decode_putrootfh_maxsz; } -static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx, const struct nfs_lock_context *l_ctx) +static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx, const struct nfs_lock_context *l_ctx, int zero_seqid) { nfs4_stateid stateid; __be32 *p; @@ -1392,6 +1392,8 @@ static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context p = reserve_space(xdr, NFS4_STATEID_SIZE); if (ctx->state != NULL) { nfs4_copy_stateid(&stateid, ctx->state, l_ctx->lockowner, l_ctx->pid); + if (zero_seqid) + stateid.stateid.seqid = 0; xdr_encode_opaque_fixed(p, stateid.data, NFS4_STATEID_SIZE); } else xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE); @@ -1404,7 +1406,8 @@ static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args, p = reserve_space(xdr, 4); *p = cpu_to_be32(OP_READ); - encode_stateid(xdr, args->context, args->lock_context); + encode_stateid(xdr, args->context, args->lock_context, + hdr->minorversion); p = reserve_space(xdr, 12); p = xdr_encode_hyper(p, args->offset); @@ -1592,7 +1595,8 @@ static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *arg p = reserve_space(xdr, 4); *p = cpu_to_be32(OP_WRITE); - encode_stateid(xdr, args->context, args->lock_context); + encode_stateid(xdr, args->context, args->lock_context, + hdr->minorversion); p = reserve_space(xdr, 16); p = xdr_encode_hyper(p, args->offset); @@ -1660,7 +1664,7 @@ static void encode_create_session(struct xdr_stream *xdr, p = reserve_space(xdr, 20 + 2*28 + 20 + len + 12); *p++ = cpu_to_be32(OP_CREATE_SESSION); - p = xdr_encode_hyper(p, clp->cl_ex_clid); + p = xdr_encode_hyper(p, clp->cl_clientid); *p++ = cpu_to_be32(clp->cl_seqid); /*Sequence id */ *p++ = cpu_to_be32(args->flags); /*flags */ @@ -2271,7 +2275,8 @@ static void nfs4_xdr_enc_write(struct rpc_rqst *req, struct xdr_stream *xdr, encode_putfh(xdr, args->fh, &hdr); encode_write(xdr, args, &hdr); req->rq_snd_buf.flags |= XDRBUF_WRITE; - encode_getfattr(xdr, args->bitmask, &hdr); + if (args->bitmask) + encode_getfattr(xdr, args->bitmask, &hdr); encode_nops(&hdr); } @@ -3382,7 +3387,7 @@ out_overflow: } static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, - struct nfs_client *clp, uint32_t *uid, int may_sleep) + const struct nfs_server *server, uint32_t *uid, int may_sleep) { uint32_t len; __be32 *p; @@ -3402,7 +3407,7 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, if (!may_sleep) { /* do nothing */ } else if (len < XDR_MAX_NETOBJ) { - if (nfs_map_name_to_uid(clp, (char *)p, len, uid) == 0) + if (nfs_map_name_to_uid(server, (char *)p, len, uid) == 0) ret = NFS_ATTR_FATTR_OWNER; else dprintk("%s: nfs_map_name_to_uid failed!\n", @@ -3420,7 +3425,7 @@ out_overflow: } static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, - struct nfs_client *clp, uint32_t *gid, int may_sleep) + const struct nfs_server *server, uint32_t *gid, int may_sleep) { uint32_t len; __be32 *p; @@ -3440,7 +3445,7 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, if (!may_sleep) { /* do nothing */ } else if (len < XDR_MAX_NETOBJ) { - if (nfs_map_group_to_gid(clp, (char *)p, len, gid) == 0) + if (nfs_map_group_to_gid(server, (char *)p, len, gid) == 0) ret = NFS_ATTR_FATTR_GROUP; else dprintk("%s: nfs_map_group_to_gid failed!\n", @@ -3939,14 +3944,12 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap, goto xdr_error; fattr->valid |= status; - status = decode_attr_owner(xdr, bitmap, server->nfs_client, - &fattr->uid, may_sleep); + status = decode_attr_owner(xdr, bitmap, server, &fattr->uid, may_sleep); if (status < 0) goto xdr_error; fattr->valid |= status; - status = decode_attr_group(xdr, bitmap, server->nfs_client, - &fattr->gid, may_sleep); + status = decode_attr_group(xdr, bitmap, server, &fattr->gid, may_sleep); if (status < 0) goto xdr_error; fattr->valid |= status; @@ -4694,7 +4697,7 @@ static int decode_exchange_id(struct xdr_stream *xdr, p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) goto out_overflow; - xdr_decode_hyper(p, &clp->cl_ex_clid); + xdr_decode_hyper(p, &clp->cl_clientid); p = xdr_inline_decode(xdr, 12); if (unlikely(!p)) goto out_overflow; @@ -5690,8 +5693,9 @@ static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, struct xdr_stream *xdr, status = decode_write(xdr, res); if (status) goto out; - decode_getfattr(xdr, res->fattr, res->server, - !RPC_IS_ASYNC(rqstp->rq_task)); + if (res->fattr) + decode_getfattr(xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); if (!status) status = res->count; out: @@ -6167,8 +6171,6 @@ static struct { { NFS4ERR_DQUOT, -EDQUOT }, { NFS4ERR_STALE, -ESTALE }, { NFS4ERR_BADHANDLE, -EBADHANDLE }, - { NFS4ERR_BADOWNER, -EINVAL }, - { NFS4ERR_BADNAME, -EINVAL }, { NFS4ERR_BAD_COOKIE, -EBADCOOKIE }, { NFS4ERR_NOTSUPP, -ENOTSUPP }, { NFS4ERR_TOOSMALL, -ETOOSMALL }, diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c index 903908a2002..c541093a5bf 100644 --- a/fs/nfs/nfsroot.c +++ b/fs/nfs/nfsroot.c @@ -86,11 +86,14 @@ /* Default path we try to mount. "%s" gets replaced by our IP address */ #define NFS_ROOT "/tftpboot/%s" +/* Default NFSROOT mount options. */ +#define NFS_DEF_OPTIONS "udp" + /* Parameters passed from the kernel command line */ static char nfs_root_parms[256] __initdata = ""; /* Text-based mount options passed to super.c */ -static char nfs_root_options[256] __initdata = ""; +static char nfs_root_options[256] __initdata = NFS_DEF_OPTIONS; /* Address of NFS server */ static __be32 servaddr __initdata = htonl(INADDR_NONE); @@ -160,8 +163,14 @@ static int __init root_nfs_copy(char *dest, const char *src, } static int __init root_nfs_cat(char *dest, const char *src, - const size_t destlen) + const size_t destlen) { + size_t len = strlen(dest); + + if (len && dest[len - 1] != ',') + if (strlcat(dest, ",", destlen) > destlen) + return -1; + if (strlcat(dest, src, destlen) > destlen) return -1; return 0; @@ -194,16 +203,6 @@ static int __init root_nfs_parse_options(char *incoming, char *exppath, if (root_nfs_cat(nfs_root_options, incoming, sizeof(nfs_root_options))) return -1; - - /* - * Possibly prepare for more options to be appended - */ - if (nfs_root_options[0] != '\0' && - nfs_root_options[strlen(nfs_root_options)] != ',') - if (root_nfs_cat(nfs_root_options, ",", - sizeof(nfs_root_options))) - return -1; - return 0; } @@ -217,7 +216,7 @@ static int __init root_nfs_parse_options(char *incoming, char *exppath, */ static int __init root_nfs_data(char *cmdline) { - char addr_option[sizeof("nolock,addr=") + INET_ADDRSTRLEN + 1]; + char mand_options[sizeof("nolock,addr=") + INET_ADDRSTRLEN + 1]; int len, retval = -1; char *tmp = NULL; const size_t tmplen = sizeof(nfs_export_path); @@ -244,9 +243,9 @@ static int __init root_nfs_data(char *cmdline) * Append mandatory options for nfsroot so they override * what has come before */ - snprintf(addr_option, sizeof(addr_option), "nolock,addr=%pI4", + snprintf(mand_options, sizeof(mand_options), "nolock,addr=%pI4", &servaddr); - if (root_nfs_cat(nfs_root_options, addr_option, + if (root_nfs_cat(nfs_root_options, mand_options, sizeof(nfs_root_options))) goto out_optionstoolong; diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index e1164e3f9e6..23e79441066 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -20,6 +20,7 @@ #include <linux/nfs_mount.h> #include "internal.h" +#include "pnfs.h" static struct kmem_cache *nfs_page_cachep; @@ -213,7 +214,7 @@ nfs_wait_on_request(struct nfs_page *req) */ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, struct inode *inode, - int (*doio)(struct inode *, struct list_head *, unsigned int, size_t, int), + int (*doio)(struct nfs_pageio_descriptor *), size_t bsize, int io_flags) { @@ -226,6 +227,7 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, desc->pg_doio = doio; desc->pg_ioflags = io_flags; desc->pg_error = 0; + desc->pg_lseg = NULL; } /** @@ -240,7 +242,8 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, * Return 'true' if this is the case, else return 'false'. */ static int nfs_can_coalesce_requests(struct nfs_page *prev, - struct nfs_page *req) + struct nfs_page *req, + struct nfs_pageio_descriptor *pgio) { if (req->wb_context->cred != prev->wb_context->cred) return 0; @@ -254,6 +257,12 @@ static int nfs_can_coalesce_requests(struct nfs_page *prev, return 0; if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE) return 0; + /* + * Non-whole file layouts need to check that req is inside of + * pgio->pg_lseg. + */ + if (pgio->pg_test && !pgio->pg_test(pgio, prev, req)) + return 0; return 1; } @@ -286,7 +295,7 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc, if (newlen > desc->pg_bsize) return 0; prev = nfs_list_entry(desc->pg_list.prev); - if (!nfs_can_coalesce_requests(prev, req)) + if (!nfs_can_coalesce_requests(prev, req, desc)) return 0; } else desc->pg_base = req->wb_pgbase; @@ -302,12 +311,7 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc, static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc) { if (!list_empty(&desc->pg_list)) { - int error = desc->pg_doio(desc->pg_inode, - &desc->pg_list, - nfs_page_array_len(desc->pg_base, - desc->pg_count), - desc->pg_count, - desc->pg_ioflags); + int error = desc->pg_doio(desc); if (error < 0) desc->pg_error = error; else diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 1b1bc1a0fb0..f38813a0a29 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -30,6 +30,7 @@ #include <linux/nfs_fs.h> #include "internal.h" #include "pnfs.h" +#include "iostat.h" #define NFSDBG_FACILITY NFSDBG_PNFS @@ -74,10 +75,8 @@ find_pnfs_driver(u32 id) void unset_pnfs_layoutdriver(struct nfs_server *nfss) { - if (nfss->pnfs_curr_ld) { - nfss->pnfs_curr_ld->clear_layoutdriver(nfss); + if (nfss->pnfs_curr_ld) module_put(nfss->pnfs_curr_ld->owner); - } nfss->pnfs_curr_ld = NULL; } @@ -115,13 +114,7 @@ set_pnfs_layoutdriver(struct nfs_server *server, u32 id) goto out_no_driver; } server->pnfs_curr_ld = ld_type; - if (ld_type->set_layoutdriver(server)) { - printk(KERN_ERR - "%s: Error initializing mount point for layout driver %u.\n", - __func__, id); - module_put(ld_type->owner); - goto out_no_driver; - } + dprintk("%s: pNFS module for %u set\n", __func__, id); return; @@ -230,37 +223,41 @@ static void free_lseg(struct pnfs_layout_segment *lseg) put_layout_hdr(NFS_I(ino)->layout); } -/* The use of tmp_list is necessary because pnfs_curr_ld->free_lseg - * could sleep, so must be called outside of the lock. - * Returns 1 if object was removed, otherwise return 0. - */ -static int -put_lseg_locked(struct pnfs_layout_segment *lseg, - struct list_head *tmp_list) +static void +put_lseg_common(struct pnfs_layout_segment *lseg) +{ + struct inode *inode = lseg->pls_layout->plh_inode; + + BUG_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); + list_del_init(&lseg->pls_list); + if (list_empty(&lseg->pls_layout->plh_segs)) { + set_bit(NFS_LAYOUT_DESTROYED, &lseg->pls_layout->plh_flags); + /* Matched by initial refcount set in alloc_init_layout_hdr */ + put_layout_hdr_locked(lseg->pls_layout); + } + rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq); +} + +void +put_lseg(struct pnfs_layout_segment *lseg) { + struct inode *inode; + + if (!lseg) + return; + dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg, atomic_read(&lseg->pls_refcount), test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); - if (atomic_dec_and_test(&lseg->pls_refcount)) { - struct inode *ino = lseg->pls_layout->plh_inode; + inode = lseg->pls_layout->plh_inode; + if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) { + LIST_HEAD(free_me); - BUG_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); - list_del(&lseg->pls_list); - if (list_empty(&lseg->pls_layout->plh_segs)) { - struct nfs_client *clp; - - clp = NFS_SERVER(ino)->nfs_client; - spin_lock(&clp->cl_lock); - /* List does not take a reference, so no need for put here */ - list_del_init(&lseg->pls_layout->plh_layouts); - spin_unlock(&clp->cl_lock); - clear_bit(NFS_LAYOUT_BULK_RECALL, &lseg->pls_layout->plh_flags); - } - rpc_wake_up(&NFS_SERVER(ino)->roc_rpcwaitq); - list_add(&lseg->pls_list, tmp_list); - return 1; + put_lseg_common(lseg); + list_add(&lseg->pls_list, &free_me); + spin_unlock(&inode->i_lock); + pnfs_free_lseg_list(&free_me); } - return 0; } static bool @@ -281,7 +278,13 @@ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg, * list. It will now be removed when all * outstanding io is finished. */ - rv = put_lseg_locked(lseg, tmp_list); + dprintk("%s: lseg %p ref %d\n", __func__, lseg, + atomic_read(&lseg->pls_refcount)); + if (atomic_dec_and_test(&lseg->pls_refcount)) { + put_lseg_common(lseg); + list_add(&lseg->pls_list, tmp_list); + rv = 1; + } } return rv; } @@ -299,6 +302,11 @@ mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, dprintk("%s:Begin lo %p\n", __func__, lo); + if (list_empty(&lo->plh_segs)) { + if (!test_and_set_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags)) + put_layout_hdr_locked(lo); + return 0; + } list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) if (should_free_lseg(lseg->pls_range.iomode, iomode)) { dprintk("%s: freeing lseg %p iomode %d " @@ -312,11 +320,27 @@ mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, return invalid - removed; } +/* note free_me must contain lsegs from a single layout_hdr */ void pnfs_free_lseg_list(struct list_head *free_me) { struct pnfs_layout_segment *lseg, *tmp; + struct pnfs_layout_hdr *lo; + + if (list_empty(free_me)) + return; + lo = list_first_entry(free_me, struct pnfs_layout_segment, + pls_list)->pls_layout; + + if (test_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags)) { + struct nfs_client *clp; + + clp = NFS_SERVER(lo->plh_inode)->nfs_client; + spin_lock(&clp->cl_lock); + list_del_init(&lo->plh_layouts); + spin_unlock(&clp->cl_lock); + } list_for_each_entry_safe(lseg, tmp, free_me, pls_list) { list_del(&lseg->pls_list); free_lseg(lseg); @@ -332,10 +356,8 @@ pnfs_destroy_layout(struct nfs_inode *nfsi) spin_lock(&nfsi->vfs_inode.i_lock); lo = nfsi->layout; if (lo) { - set_bit(NFS_LAYOUT_DESTROYED, &nfsi->layout->plh_flags); + lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */ mark_matching_lsegs_invalid(lo, &tmp_list, IOMODE_ANY); - /* Matched by refcount set to 1 in alloc_init_layout_hdr */ - put_layout_hdr_locked(lo); } spin_unlock(&nfsi->vfs_inode.i_lock); pnfs_free_lseg_list(&tmp_list); @@ -403,6 +425,7 @@ pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid, (int)(lo->plh_barrier - be32_to_cpu(stateid->stateid.seqid)) >= 0) return true; return lo->plh_block_lgets || + test_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags) || test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) || (list_empty(&lo->plh_segs) && (atomic_read(&lo->plh_outstanding) > lget)); @@ -674,7 +697,7 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo, u32 iomode) list_for_each_entry(lseg, &lo->plh_segs, pls_list) { if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) && is_matching_lseg(lseg, iomode)) { - ret = lseg; + ret = get_lseg(lseg); break; } if (cmp_layout(iomode, lseg->pls_range.iomode) > 0) @@ -699,6 +722,7 @@ pnfs_update_layout(struct inode *ino, struct nfs_client *clp = NFS_SERVER(ino)->nfs_client; struct pnfs_layout_hdr *lo; struct pnfs_layout_segment *lseg = NULL; + bool first = false; if (!pnfs_enabled_sb(NFS_SERVER(ino))) return NULL; @@ -715,21 +739,25 @@ pnfs_update_layout(struct inode *ino, dprintk("%s matches recall, use MDS\n", __func__); goto out_unlock; } - /* Check to see if the layout for the given range already exists */ - lseg = pnfs_find_lseg(lo, iomode); - if (lseg) - goto out_unlock; /* if LAYOUTGET already failed once we don't try again */ if (test_bit(lo_fail_bit(iomode), &nfsi->layout->plh_flags)) goto out_unlock; + /* Check to see if the layout for the given range already exists */ + lseg = pnfs_find_lseg(lo, iomode); + if (lseg) + goto out_unlock; + if (pnfs_layoutgets_blocked(lo, NULL, 0)) goto out_unlock; atomic_inc(&lo->plh_outstanding); get_layout_hdr(lo); - if (list_empty(&lo->plh_segs)) { + if (list_empty(&lo->plh_segs)) + first = true; + spin_unlock(&ino->i_lock); + if (first) { /* The lo must be on the clp list if there is any * chance of a CB_LAYOUTRECALL(FILE) coming in. */ @@ -738,24 +766,18 @@ pnfs_update_layout(struct inode *ino, list_add_tail(&lo->plh_layouts, &clp->cl_layouts); spin_unlock(&clp->cl_lock); } - spin_unlock(&ino->i_lock); lseg = send_layoutget(lo, ctx, iomode); - if (!lseg) { - spin_lock(&ino->i_lock); - if (list_empty(&lo->plh_segs)) { - spin_lock(&clp->cl_lock); - list_del_init(&lo->plh_layouts); - spin_unlock(&clp->cl_lock); - clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); - } - spin_unlock(&ino->i_lock); + if (!lseg && first) { + spin_lock(&clp->cl_lock); + list_del_init(&lo->plh_layouts); + spin_unlock(&clp->cl_lock); } atomic_dec(&lo->plh_outstanding); put_layout_hdr(lo); out: dprintk("%s end, state 0x%lx lseg %p\n", __func__, - nfsi->layout->plh_flags, lseg); + nfsi->layout ? nfsi->layout->plh_flags : -1, lseg); return lseg; out_unlock: spin_unlock(&ino->i_lock); @@ -808,7 +830,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) } init_lseg(lo, lseg); lseg->pls_range = res->range; - *lgp->lsegpp = lseg; + *lgp->lsegpp = get_lseg(lseg); pnfs_insert_layout(lo, lseg); if (res->return_on_close) { @@ -829,137 +851,97 @@ out_forget_reply: goto out; } -/* - * Device ID cache. Currently supports one layout type per struct nfs_client. - * Add layout type to the lookup key to expand to support multiple types. - */ -int -pnfs_alloc_init_deviceid_cache(struct nfs_client *clp, - void (*free_callback)(struct pnfs_deviceid_node *)) +static int pnfs_read_pg_test(struct nfs_pageio_descriptor *pgio, + struct nfs_page *prev, + struct nfs_page *req) { - struct pnfs_deviceid_cache *c; - - c = kzalloc(sizeof(struct pnfs_deviceid_cache), GFP_KERNEL); - if (!c) - return -ENOMEM; - spin_lock(&clp->cl_lock); - if (clp->cl_devid_cache != NULL) { - atomic_inc(&clp->cl_devid_cache->dc_ref); - dprintk("%s [kref [%d]]\n", __func__, - atomic_read(&clp->cl_devid_cache->dc_ref)); - kfree(c); - } else { - /* kzalloc initializes hlists */ - spin_lock_init(&c->dc_lock); - atomic_set(&c->dc_ref, 1); - c->dc_free_callback = free_callback; - clp->cl_devid_cache = c; - dprintk("%s [new]\n", __func__); + if (pgio->pg_count == prev->wb_bytes) { + /* This is first coelesce call for a series of nfs_pages */ + pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, + prev->wb_context, + IOMODE_READ); } - spin_unlock(&clp->cl_lock); - return 0; + return NFS_SERVER(pgio->pg_inode)->pnfs_curr_ld->pg_test(pgio, prev, req); } -EXPORT_SYMBOL_GPL(pnfs_alloc_init_deviceid_cache); -/* - * Called from pnfs_layoutdriver_type->free_lseg - * last layout segment reference frees deviceid - */ void -pnfs_put_deviceid(struct pnfs_deviceid_cache *c, - struct pnfs_deviceid_node *devid) +pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode) { - struct nfs4_deviceid *id = &devid->de_id; - struct pnfs_deviceid_node *d; - struct hlist_node *n; - long h = nfs4_deviceid_hash(id); + struct pnfs_layoutdriver_type *ld; - dprintk("%s [%d]\n", __func__, atomic_read(&devid->de_ref)); - if (!atomic_dec_and_lock(&devid->de_ref, &c->dc_lock)) - return; + ld = NFS_SERVER(inode)->pnfs_curr_ld; + pgio->pg_test = (ld && ld->pg_test) ? pnfs_read_pg_test : NULL; +} - hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[h], de_node) - if (!memcmp(&d->de_id, id, sizeof(*id))) { - hlist_del_rcu(&d->de_node); - spin_unlock(&c->dc_lock); - synchronize_rcu(); - c->dc_free_callback(devid); - return; - } - spin_unlock(&c->dc_lock); - /* Why wasn't it found in the list? */ - BUG(); -} -EXPORT_SYMBOL_GPL(pnfs_put_deviceid); - -/* Find and reference a deviceid */ -struct pnfs_deviceid_node * -pnfs_find_get_deviceid(struct pnfs_deviceid_cache *c, struct nfs4_deviceid *id) -{ - struct pnfs_deviceid_node *d; - struct hlist_node *n; - long hash = nfs4_deviceid_hash(id); - - dprintk("--> %s hash %ld\n", __func__, hash); - rcu_read_lock(); - hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) { - if (!memcmp(&d->de_id, id, sizeof(*id))) { - if (!atomic_inc_not_zero(&d->de_ref)) { - goto fail; - } else { - rcu_read_unlock(); - return d; - } - } +static int pnfs_write_pg_test(struct nfs_pageio_descriptor *pgio, + struct nfs_page *prev, + struct nfs_page *req) +{ + if (pgio->pg_count == prev->wb_bytes) { + /* This is first coelesce call for a series of nfs_pages */ + pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, + prev->wb_context, + IOMODE_RW); } -fail: - rcu_read_unlock(); - return NULL; + return NFS_SERVER(pgio->pg_inode)->pnfs_curr_ld->pg_test(pgio, prev, req); +} + +void +pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode) +{ + struct pnfs_layoutdriver_type *ld; + + ld = NFS_SERVER(inode)->pnfs_curr_ld; + pgio->pg_test = (ld && ld->pg_test) ? pnfs_write_pg_test : NULL; +} + +enum pnfs_try_status +pnfs_try_to_write_data(struct nfs_write_data *wdata, + const struct rpc_call_ops *call_ops, int how) +{ + struct inode *inode = wdata->inode; + enum pnfs_try_status trypnfs; + struct nfs_server *nfss = NFS_SERVER(inode); + + wdata->mds_ops = call_ops; + + dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__, + inode->i_ino, wdata->args.count, wdata->args.offset, how); + + trypnfs = nfss->pnfs_curr_ld->write_pagelist(wdata, how); + if (trypnfs == PNFS_NOT_ATTEMPTED) { + put_lseg(wdata->lseg); + wdata->lseg = NULL; + } else + nfs_inc_stats(inode, NFSIOS_PNFS_WRITE); + + dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs); + return trypnfs; } -EXPORT_SYMBOL_GPL(pnfs_find_get_deviceid); /* - * Add a deviceid to the cache. - * GETDEVICEINFOs for same deviceid can race. If deviceid is found, discard new + * Call the appropriate parallel I/O subsystem read function. */ -struct pnfs_deviceid_node * -pnfs_add_deviceid(struct pnfs_deviceid_cache *c, struct pnfs_deviceid_node *new) -{ - struct pnfs_deviceid_node *d; - long hash = nfs4_deviceid_hash(&new->de_id); - - dprintk("--> %s hash %ld\n", __func__, hash); - spin_lock(&c->dc_lock); - d = pnfs_find_get_deviceid(c, &new->de_id); - if (d) { - spin_unlock(&c->dc_lock); - dprintk("%s [discard]\n", __func__); - c->dc_free_callback(new); - return d; - } - INIT_HLIST_NODE(&new->de_node); - atomic_set(&new->de_ref, 1); - hlist_add_head_rcu(&new->de_node, &c->dc_deviceids[hash]); - spin_unlock(&c->dc_lock); - dprintk("%s [new]\n", __func__); - return new; -} -EXPORT_SYMBOL_GPL(pnfs_add_deviceid); - -void -pnfs_put_deviceid_cache(struct nfs_client *clp) +enum pnfs_try_status +pnfs_try_to_read_data(struct nfs_read_data *rdata, + const struct rpc_call_ops *call_ops) { - struct pnfs_deviceid_cache *local = clp->cl_devid_cache; + struct inode *inode = rdata->inode; + struct nfs_server *nfss = NFS_SERVER(inode); + enum pnfs_try_status trypnfs; - dprintk("--> %s ({%d})\n", __func__, atomic_read(&local->dc_ref)); - if (atomic_dec_and_lock(&local->dc_ref, &clp->cl_lock)) { - int i; - /* Verify cache is empty */ - for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE; i++) - BUG_ON(!hlist_empty(&local->dc_deviceids[i])); - clp->cl_devid_cache = NULL; - spin_unlock(&clp->cl_lock); - kfree(local); + rdata->mds_ops = call_ops; + + dprintk("%s: Reading ino:%lu %u@%llu\n", + __func__, inode->i_ino, rdata->args.count, rdata->args.offset); + + trypnfs = nfss->pnfs_curr_ld->read_pagelist(rdata); + if (trypnfs == PNFS_NOT_ATTEMPTED) { + put_lseg(rdata->lseg); + rdata->lseg = NULL; + } else { + nfs_inc_stats(inode, NFSIOS_PNFS_READ); } + dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs); + return trypnfs; } -EXPORT_SYMBOL_GPL(pnfs_put_deviceid_cache); diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index e2612ea0cbe..6380b9405bc 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -30,6 +30,8 @@ #ifndef FS_NFS_PNFS_H #define FS_NFS_PNFS_H +#include <linux/nfs_page.h> + enum { NFS_LSEG_VALID = 0, /* cleared when lseg is recalled/returned */ NFS_LSEG_ROC, /* roc bit received from server */ @@ -43,6 +45,11 @@ struct pnfs_layout_segment { struct pnfs_layout_hdr *pls_layout; }; +enum pnfs_try_status { + PNFS_ATTEMPTED = 0, + PNFS_NOT_ATTEMPTED = 1, +}; + #ifdef CONFIG_NFS_V4_1 #define LAYOUT_NFSV4_1_MODULE_PREFIX "nfs-layouttype4" @@ -61,10 +68,18 @@ struct pnfs_layoutdriver_type { const u32 id; const char *name; struct module *owner; - int (*set_layoutdriver) (struct nfs_server *); - int (*clear_layoutdriver) (struct nfs_server *); struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr); void (*free_lseg) (struct pnfs_layout_segment *lseg); + + /* test for nfs page cache coalescing */ + int (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *); + + /* + * Return PNFS_ATTEMPTED to indicate the layout code has attempted + * I/O, else return PNFS_NOT_ATTEMPTED to fall back to normal NFS + */ + enum pnfs_try_status (*read_pagelist) (struct nfs_read_data *nfs_data); + enum pnfs_try_status (*write_pagelist) (struct nfs_write_data *nfs_data, int how); }; struct pnfs_layout_hdr { @@ -90,52 +105,6 @@ struct pnfs_device { unsigned int pglen; }; -/* - * Device ID RCU cache. A device ID is unique per client ID and layout type. - */ -#define NFS4_DEVICE_ID_HASH_BITS 5 -#define NFS4_DEVICE_ID_HASH_SIZE (1 << NFS4_DEVICE_ID_HASH_BITS) -#define NFS4_DEVICE_ID_HASH_MASK (NFS4_DEVICE_ID_HASH_SIZE - 1) - -static inline u32 -nfs4_deviceid_hash(struct nfs4_deviceid *id) -{ - unsigned char *cptr = (unsigned char *)id->data; - unsigned int nbytes = NFS4_DEVICEID4_SIZE; - u32 x = 0; - - while (nbytes--) { - x *= 37; - x += *cptr++; - } - return x & NFS4_DEVICE_ID_HASH_MASK; -} - -struct pnfs_deviceid_node { - struct hlist_node de_node; - struct nfs4_deviceid de_id; - atomic_t de_ref; -}; - -struct pnfs_deviceid_cache { - spinlock_t dc_lock; - atomic_t dc_ref; - void (*dc_free_callback)(struct pnfs_deviceid_node *); - struct hlist_head dc_deviceids[NFS4_DEVICE_ID_HASH_SIZE]; -}; - -extern int pnfs_alloc_init_deviceid_cache(struct nfs_client *, - void (*free_callback)(struct pnfs_deviceid_node *)); -extern void pnfs_put_deviceid_cache(struct nfs_client *); -extern struct pnfs_deviceid_node *pnfs_find_get_deviceid( - struct pnfs_deviceid_cache *, - struct nfs4_deviceid *); -extern struct pnfs_deviceid_node *pnfs_add_deviceid( - struct pnfs_deviceid_cache *, - struct pnfs_deviceid_node *); -extern void pnfs_put_deviceid(struct pnfs_deviceid_cache *c, - struct pnfs_deviceid_node *devid); - extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *); extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *); @@ -146,11 +115,18 @@ extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp); /* pnfs.c */ void get_layout_hdr(struct pnfs_layout_hdr *lo); +void put_lseg(struct pnfs_layout_segment *lseg); struct pnfs_layout_segment * pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, enum pnfs_iomode access_type); void set_pnfs_layoutdriver(struct nfs_server *, u32 id); void unset_pnfs_layoutdriver(struct nfs_server *); +enum pnfs_try_status pnfs_try_to_write_data(struct nfs_write_data *, + const struct rpc_call_ops *, int); +enum pnfs_try_status pnfs_try_to_read_data(struct nfs_read_data *, + const struct rpc_call_ops *); +void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *); +void pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *); int pnfs_layout_process(struct nfs4_layoutget *lgp); void pnfs_free_lseg_list(struct list_head *tmp_list); void pnfs_destroy_layout(struct nfs_inode *); @@ -177,6 +153,16 @@ static inline int lo_fail_bit(u32 iomode) NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED; } +static inline struct pnfs_layout_segment * +get_lseg(struct pnfs_layout_segment *lseg) +{ + if (lseg) { + atomic_inc(&lseg->pls_refcount); + smp_mb__after_atomic_inc(); + } + return lseg; +} + /* Return true if a layout driver is being used for this mountpoint */ static inline int pnfs_enabled_sb(struct nfs_server *nfss) { @@ -194,12 +180,36 @@ static inline void pnfs_destroy_layout(struct nfs_inode *nfsi) } static inline struct pnfs_layout_segment * +get_lseg(struct pnfs_layout_segment *lseg) +{ + return NULL; +} + +static inline void put_lseg(struct pnfs_layout_segment *lseg) +{ +} + +static inline struct pnfs_layout_segment * pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, enum pnfs_iomode access_type) { return NULL; } +static inline enum pnfs_try_status +pnfs_try_to_read_data(struct nfs_read_data *data, + const struct rpc_call_ops *call_ops) +{ + return PNFS_NOT_ATTEMPTED; +} + +static inline enum pnfs_try_status +pnfs_try_to_write_data(struct nfs_write_data *data, + const struct rpc_call_ops *call_ops, int how) +{ + return PNFS_NOT_ATTEMPTED; +} + static inline bool pnfs_roc(struct inode *ino) { @@ -230,6 +240,18 @@ static inline void unset_pnfs_layoutdriver(struct nfs_server *s) { } +static inline void +pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *ino) +{ + pgio->pg_test = NULL; +} + +static inline void +pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *ino) +{ + pgio->pg_test = NULL; +} + #endif /* CONFIG_NFS_V4_1 */ #endif /* FS_NFS_PNFS_H */ diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index 77d5e21c4ad..b8ec170f2a0 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -741,4 +741,5 @@ const struct nfs_rpc_ops nfs_v2_clientops = { .lock = nfs_proc_lock, .lock_check_bounds = nfs_lock_check_bounds, .close_context = nfs_close_context, + .init_client = nfs_init_client, }; diff --git a/fs/nfs/read.c b/fs/nfs/read.c index aedcaa7f291..7cded2b12a0 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -18,19 +18,20 @@ #include <linux/sunrpc/clnt.h> #include <linux/nfs_fs.h> #include <linux/nfs_page.h> +#include <linux/module.h> #include <asm/system.h> +#include "pnfs.h" #include "nfs4_fs.h" #include "internal.h" #include "iostat.h" #include "fscache.h" -#include "pnfs.h" #define NFSDBG_FACILITY NFSDBG_PAGECACHE -static int nfs_pagein_multi(struct inode *, struct list_head *, unsigned int, size_t, int); -static int nfs_pagein_one(struct inode *, struct list_head *, unsigned int, size_t, int); +static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc); +static int nfs_pagein_one(struct nfs_pageio_descriptor *desc); static const struct rpc_call_ops nfs_read_partial_ops; static const struct rpc_call_ops nfs_read_full_ops; @@ -69,6 +70,7 @@ void nfs_readdata_free(struct nfs_read_data *p) static void nfs_readdata_release(struct nfs_read_data *rdata) { + put_lseg(rdata->lseg); put_nfs_open_context(rdata->args.context); nfs_readdata_free(rdata); } @@ -114,14 +116,13 @@ static void nfs_readpage_truncate_uninitialised_page(struct nfs_read_data *data) int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, struct page *page) { - LIST_HEAD(one_request); struct nfs_page *new; unsigned int len; + struct nfs_pageio_descriptor pgio; len = nfs_page_length(page); if (len == 0) return nfs_return_empty_page(page); - pnfs_update_layout(inode, ctx, IOMODE_READ); new = nfs_create_request(ctx, inode, page, 0, len); if (IS_ERR(new)) { unlock_page(page); @@ -130,11 +131,14 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, if (len < PAGE_CACHE_SIZE) zero_user_segment(page, len, PAGE_CACHE_SIZE); - nfs_list_add_request(new, &one_request); + nfs_pageio_init(&pgio, inode, NULL, 0, 0); + nfs_list_add_request(new, &pgio.pg_list); + pgio.pg_count = len; + if (NFS_SERVER(inode)->rsize < PAGE_CACHE_SIZE) - nfs_pagein_multi(inode, &one_request, 1, len, 0); + nfs_pagein_multi(&pgio); else - nfs_pagein_one(inode, &one_request, 1, len, 0); + nfs_pagein_one(&pgio); return 0; } @@ -155,24 +159,20 @@ static void nfs_readpage_release(struct nfs_page *req) nfs_release_request(req); } -/* - * Set up the NFS read request struct - */ -static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, - const struct rpc_call_ops *call_ops, - unsigned int count, unsigned int offset) +int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt, + const struct rpc_call_ops *call_ops) { - struct inode *inode = req->wb_context->path.dentry->d_inode; + struct inode *inode = data->inode; int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0; struct rpc_task *task; struct rpc_message msg = { .rpc_argp = &data->args, .rpc_resp = &data->res, - .rpc_cred = req->wb_context->cred, + .rpc_cred = data->cred, }; struct rpc_task_setup task_setup_data = { .task = &data->task, - .rpc_client = NFS_CLIENT(inode), + .rpc_client = clnt, .rpc_message = &msg, .callback_ops = call_ops, .callback_data = data, @@ -180,9 +180,39 @@ static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, .flags = RPC_TASK_ASYNC | swap_flags, }; + /* Set up the initial task struct. */ + NFS_PROTO(inode)->read_setup(data, &msg); + + dprintk("NFS: %5u initiated read call (req %s/%lld, %u bytes @ " + "offset %llu)\n", + data->task.tk_pid, + inode->i_sb->s_id, + (long long)NFS_FILEID(inode), + data->args.count, + (unsigned long long)data->args.offset); + + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) + return PTR_ERR(task); + rpc_put_task(task); + return 0; +} +EXPORT_SYMBOL_GPL(nfs_initiate_read); + +/* + * Set up the NFS read request struct + */ +static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, + const struct rpc_call_ops *call_ops, + unsigned int count, unsigned int offset, + struct pnfs_layout_segment *lseg) +{ + struct inode *inode = req->wb_context->path.dentry->d_inode; + data->req = req; data->inode = inode; - data->cred = msg.rpc_cred; + data->cred = req->wb_context->cred; + data->lseg = get_lseg(lseg); data->args.fh = NFS_FH(inode); data->args.offset = req_offset(req) + offset; @@ -197,21 +227,11 @@ static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, data->res.eof = 0; nfs_fattr_init(&data->fattr); - /* Set up the initial task struct. */ - NFS_PROTO(inode)->read_setup(data, &msg); - - dprintk("NFS: %5u initiated read call (req %s/%Ld, %u bytes @ offset %Lu)\n", - data->task.tk_pid, - inode->i_sb->s_id, - (long long)NFS_FILEID(inode), - count, - (unsigned long long)data->args.offset); + if (data->lseg && + (pnfs_try_to_read_data(data, call_ops) == PNFS_ATTEMPTED)) + return 0; - task = rpc_run_task(&task_setup_data); - if (IS_ERR(task)) - return PTR_ERR(task); - rpc_put_task(task); - return 0; + return nfs_initiate_read(data, NFS_CLIENT(inode), call_ops); } static void @@ -240,20 +260,21 @@ nfs_async_read_error(struct list_head *head) * won't see the new data until our attribute cache is updated. This is more * or less conventional NFS client behavior. */ -static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int flags) +static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc) { - struct nfs_page *req = nfs_list_entry(head->next); + struct nfs_page *req = nfs_list_entry(desc->pg_list.next); struct page *page = req->wb_page; struct nfs_read_data *data; - size_t rsize = NFS_SERVER(inode)->rsize, nbytes; + size_t rsize = NFS_SERVER(desc->pg_inode)->rsize, nbytes; unsigned int offset; int requests = 0; int ret = 0; + struct pnfs_layout_segment *lseg; LIST_HEAD(list); nfs_list_remove_request(req); - nbytes = count; + nbytes = desc->pg_count; do { size_t len = min(nbytes,rsize); @@ -266,9 +287,11 @@ static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigne } while(nbytes != 0); atomic_set(&req->wb_complete, requests); + BUG_ON(desc->pg_lseg != NULL); + lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_READ); ClearPageError(page); offset = 0; - nbytes = count; + nbytes = desc->pg_count; do { int ret2; @@ -280,12 +303,14 @@ static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigne if (nbytes < rsize) rsize = nbytes; ret2 = nfs_read_rpcsetup(req, data, &nfs_read_partial_ops, - rsize, offset); + rsize, offset, lseg); if (ret == 0) ret = ret2; offset += rsize; nbytes -= rsize; } while (nbytes != 0); + put_lseg(lseg); + desc->pg_lseg = NULL; return ret; @@ -300,16 +325,21 @@ out_bad: return -ENOMEM; } -static int nfs_pagein_one(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int flags) +static int nfs_pagein_one(struct nfs_pageio_descriptor *desc) { struct nfs_page *req; struct page **pages; struct nfs_read_data *data; + struct list_head *head = &desc->pg_list; + struct pnfs_layout_segment *lseg = desc->pg_lseg; int ret = -ENOMEM; - data = nfs_readdata_alloc(npages); - if (!data) - goto out_bad; + data = nfs_readdata_alloc(nfs_page_array_len(desc->pg_base, + desc->pg_count)); + if (!data) { + nfs_async_read_error(head); + goto out; + } pages = data->pagevec; while (!list_empty(head)) { @@ -320,10 +350,14 @@ static int nfs_pagein_one(struct inode *inode, struct list_head *head, unsigned *pages++ = req->wb_page; } req = nfs_list_entry(data->pages.next); + if ((!lseg) && list_is_singular(&data->pages)) + lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_READ); - return nfs_read_rpcsetup(req, data, &nfs_read_full_ops, count, 0); -out_bad: - nfs_async_read_error(head); + ret = nfs_read_rpcsetup(req, data, &nfs_read_full_ops, desc->pg_count, + 0, lseg); +out: + put_lseg(lseg); + desc->pg_lseg = NULL; return ret; } @@ -366,6 +400,7 @@ static void nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data return; /* Yes, so retry the read at the end of the data */ + data->mds_offset += resp->count; argp->offset += resp->count; argp->pgbase += resp->count; argp->count -= resp->count; @@ -625,7 +660,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping, if (ret == 0) goto read_complete; /* all pages were read */ - pnfs_update_layout(inode, desc.ctx, IOMODE_READ); + pnfs_pageio_init_read(&pgio, inode); if (rsize < PAGE_CACHE_SIZE) nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0); else diff --git a/fs/nfs/super.c b/fs/nfs/super.c index b68c8607770..2b8e9a5e366 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -263,8 +263,11 @@ static match_table_t nfs_local_lock_tokens = { static void nfs_umount_begin(struct super_block *); static int nfs_statfs(struct dentry *, struct kstatfs *); static int nfs_show_options(struct seq_file *, struct vfsmount *); +static int nfs_show_devname(struct seq_file *, struct vfsmount *); +static int nfs_show_path(struct seq_file *, struct vfsmount *); static int nfs_show_stats(struct seq_file *, struct vfsmount *); -static int nfs_get_sb(struct file_system_type *, int, const char *, void *, struct vfsmount *); +static struct dentry *nfs_fs_mount(struct file_system_type *, + int, const char *, void *); static struct dentry *nfs_xdev_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *raw_data); static void nfs_put_super(struct super_block *); @@ -274,7 +277,7 @@ static int nfs_remount(struct super_block *sb, int *flags, char *raw_data); static struct file_system_type nfs_fs_type = { .owner = THIS_MODULE, .name = "nfs", - .get_sb = nfs_get_sb, + .mount = nfs_fs_mount, .kill_sb = nfs_kill_super, .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, }; @@ -296,6 +299,8 @@ static const struct super_operations nfs_sops = { .evict_inode = nfs_evict_inode, .umount_begin = nfs_umount_begin, .show_options = nfs_show_options, + .show_devname = nfs_show_devname, + .show_path = nfs_show_path, .show_stats = nfs_show_stats, .remount_fs = nfs_remount, }; @@ -303,16 +308,16 @@ static const struct super_operations nfs_sops = { #ifdef CONFIG_NFS_V4 static int nfs4_validate_text_mount_data(void *options, struct nfs_parsed_mount_data *args, const char *dev_name); -static int nfs4_try_mount(int flags, const char *dev_name, - struct nfs_parsed_mount_data *data, struct vfsmount *mnt); -static int nfs4_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); +static struct dentry *nfs4_try_mount(int flags, const char *dev_name, + struct nfs_parsed_mount_data *data); +static struct dentry *nfs4_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *raw_data); static struct dentry *nfs4_remote_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *raw_data); static struct dentry *nfs4_xdev_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *raw_data); -static int nfs4_referral_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); +static struct dentry *nfs4_referral_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *raw_data); static struct dentry *nfs4_remote_referral_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *raw_data); static void nfs4_kill_super(struct super_block *sb); @@ -320,7 +325,7 @@ static void nfs4_kill_super(struct super_block *sb); static struct file_system_type nfs4_fs_type = { .owner = THIS_MODULE, .name = "nfs4", - .get_sb = nfs4_get_sb, + .mount = nfs4_mount, .kill_sb = nfs4_kill_super, .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, }; @@ -352,7 +357,7 @@ static struct file_system_type nfs4_remote_referral_fs_type = { struct file_system_type nfs4_referral_fs_type = { .owner = THIS_MODULE, .name = "nfs4", - .get_sb = nfs4_referral_get_sb, + .mount = nfs4_referral_mount, .kill_sb = nfs4_kill_super, .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, }; @@ -366,6 +371,8 @@ static const struct super_operations nfs4_sops = { .evict_inode = nfs4_evict_inode, .umount_begin = nfs_umount_begin, .show_options = nfs_show_options, + .show_devname = nfs_show_devname, + .show_path = nfs_show_path, .show_stats = nfs_show_stats, .remount_fs = nfs_remount, }; @@ -726,6 +733,28 @@ static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt) return 0; } +static int nfs_show_devname(struct seq_file *m, struct vfsmount *mnt) +{ + char *page = (char *) __get_free_page(GFP_KERNEL); + char *devname, *dummy; + int err = 0; + if (!page) + return -ENOMEM; + devname = nfs_path(&dummy, mnt->mnt_root, page, PAGE_SIZE); + if (IS_ERR(devname)) + err = PTR_ERR(devname); + else + seq_escape(m, devname, " \t\n\\"); + free_page((unsigned long)page); + return err; +} + +static int nfs_show_path(struct seq_file *m, struct vfsmount *mnt) +{ + seq_puts(m, "/"); + return 0; +} + /* * Present statistical information for this VFS mountpoint */ @@ -979,6 +1008,27 @@ static int nfs_parse_security_flavors(char *value, return 1; } +static int nfs_get_option_str(substring_t args[], char **option) +{ + kfree(*option); + *option = match_strdup(args); + return !option; +} + +static int nfs_get_option_ul(substring_t args[], unsigned long *option) +{ + int rc; + char *string; + + string = match_strdup(args); + if (string == NULL) + return -ENOMEM; + rc = strict_strtoul(string, 10, option); + kfree(string); + + return rc; +} + /* * Error-check and convert a string of mount options from user space into * a data structure. The whole mount string is processed; bad options are @@ -1127,155 +1177,82 @@ static int nfs_parse_mount_options(char *raw, * options that take numeric values */ case Opt_port: - string = match_strdup(args); - if (string == NULL) - goto out_nomem; - rc = strict_strtoul(string, 10, &option); - kfree(string); - if (rc != 0 || option > USHRT_MAX) + if (nfs_get_option_ul(args, &option) || + option > USHRT_MAX) goto out_invalid_value; mnt->nfs_server.port = option; break; case Opt_rsize: - string = match_strdup(args); - if (string == NULL) - goto out_nomem; - rc = strict_strtoul(string, 10, &option); - kfree(string); - if (rc != 0) + if (nfs_get_option_ul(args, &option)) goto out_invalid_value; mnt->rsize = option; break; case Opt_wsize: - string = match_strdup(args); - if (string == NULL) - goto out_nomem; - rc = strict_strtoul(string, 10, &option); - kfree(string); - if (rc != 0) + if (nfs_get_option_ul(args, &option)) goto out_invalid_value; mnt->wsize = option; break; case Opt_bsize: - string = match_strdup(args); - if (string == NULL) - goto out_nomem; - rc = strict_strtoul(string, 10, &option); - kfree(string); - if (rc != 0) + if (nfs_get_option_ul(args, &option)) goto out_invalid_value; mnt->bsize = option; break; case Opt_timeo: - string = match_strdup(args); - if (string == NULL) - goto out_nomem; - rc = strict_strtoul(string, 10, &option); - kfree(string); - if (rc != 0 || option == 0) + if (nfs_get_option_ul(args, &option) || option == 0) goto out_invalid_value; mnt->timeo = option; break; case Opt_retrans: - string = match_strdup(args); - if (string == NULL) - goto out_nomem; - rc = strict_strtoul(string, 10, &option); - kfree(string); - if (rc != 0 || option == 0) + if (nfs_get_option_ul(args, &option) || option == 0) goto out_invalid_value; mnt->retrans = option; break; case Opt_acregmin: - string = match_strdup(args); - if (string == NULL) - goto out_nomem; - rc = strict_strtoul(string, 10, &option); - kfree(string); - if (rc != 0) + if (nfs_get_option_ul(args, &option)) goto out_invalid_value; mnt->acregmin = option; break; case Opt_acregmax: - string = match_strdup(args); - if (string == NULL) - goto out_nomem; - rc = strict_strtoul(string, 10, &option); - kfree(string); - if (rc != 0) + if (nfs_get_option_ul(args, &option)) goto out_invalid_value; mnt->acregmax = option; break; case Opt_acdirmin: - string = match_strdup(args); - if (string == NULL) - goto out_nomem; - rc = strict_strtoul(string, 10, &option); - kfree(string); - if (rc != 0) + if (nfs_get_option_ul(args, &option)) goto out_invalid_value; mnt->acdirmin = option; break; case Opt_acdirmax: - string = match_strdup(args); - if (string == NULL) - goto out_nomem; - rc = strict_strtoul(string, 10, &option); - kfree(string); - if (rc != 0) + if (nfs_get_option_ul(args, &option)) goto out_invalid_value; mnt->acdirmax = option; break; case Opt_actimeo: - string = match_strdup(args); - if (string == NULL) - goto out_nomem; - rc = strict_strtoul(string, 10, &option); - kfree(string); - if (rc != 0) + if (nfs_get_option_ul(args, &option)) goto out_invalid_value; mnt->acregmin = mnt->acregmax = mnt->acdirmin = mnt->acdirmax = option; break; case Opt_namelen: - string = match_strdup(args); - if (string == NULL) - goto out_nomem; - rc = strict_strtoul(string, 10, &option); - kfree(string); - if (rc != 0) + if (nfs_get_option_ul(args, &option)) goto out_invalid_value; mnt->namlen = option; break; case Opt_mountport: - string = match_strdup(args); - if (string == NULL) - goto out_nomem; - rc = strict_strtoul(string, 10, &option); - kfree(string); - if (rc != 0 || option > USHRT_MAX) + if (nfs_get_option_ul(args, &option) || + option > USHRT_MAX) goto out_invalid_value; mnt->mount_server.port = option; break; case Opt_mountvers: - string = match_strdup(args); - if (string == NULL) - goto out_nomem; - rc = strict_strtoul(string, 10, &option); - kfree(string); - if (rc != 0 || + if (nfs_get_option_ul(args, &option) || option < NFS_MNT_VERSION || option > NFS_MNT3_VERSION) goto out_invalid_value; mnt->mount_server.version = option; break; case Opt_nfsvers: - string = match_strdup(args); - if (string == NULL) - goto out_nomem; - rc = strict_strtoul(string, 10, &option); - kfree(string); - if (rc != 0) + if (nfs_get_option_ul(args, &option)) goto out_invalid_value; switch (option) { case NFS2_VERSION: @@ -1295,12 +1272,7 @@ static int nfs_parse_mount_options(char *raw, } break; case Opt_minorversion: - string = match_strdup(args); - if (string == NULL) - goto out_nomem; - rc = strict_strtoul(string, 10, &option); - kfree(string); - if (rc != 0) + if (nfs_get_option_ul(args, &option)) goto out_invalid_value; if (option > NFS4_MAX_MINOR_VERSION) goto out_invalid_value; @@ -1336,21 +1308,18 @@ static int nfs_parse_mount_options(char *raw, case Opt_xprt_udp: mnt->flags &= ~NFS_MOUNT_TCP; mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP; - kfree(string); break; case Opt_xprt_tcp6: protofamily = AF_INET6; case Opt_xprt_tcp: mnt->flags |= NFS_MOUNT_TCP; mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP; - kfree(string); break; case Opt_xprt_rdma: /* vector side protocols to TCP */ mnt->flags |= NFS_MOUNT_TCP; mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA; xprt_load_transport(string); - kfree(string); break; default: dfprintk(MOUNT, "NFS: unrecognized " @@ -1358,6 +1327,7 @@ static int nfs_parse_mount_options(char *raw, kfree(string); return 0; } + kfree(string); break; case Opt_mountproto: string = match_strdup(args); @@ -1400,18 +1370,13 @@ static int nfs_parse_mount_options(char *raw, goto out_invalid_address; break; case Opt_clientaddr: - string = match_strdup(args); - if (string == NULL) + if (nfs_get_option_str(args, &mnt->client_address)) goto out_nomem; - kfree(mnt->client_address); - mnt->client_address = string; break; case Opt_mounthost: - string = match_strdup(args); - if (string == NULL) + if (nfs_get_option_str(args, + &mnt->mount_server.hostname)) goto out_nomem; - kfree(mnt->mount_server.hostname); - mnt->mount_server.hostname = string; break; case Opt_mountaddr: string = match_strdup(args); @@ -1451,11 +1416,8 @@ static int nfs_parse_mount_options(char *raw, }; break; case Opt_fscache_uniq: - string = match_strdup(args); - if (string == NULL) + if (nfs_get_option_str(args, &mnt->fscache_uniq)) goto out_nomem; - kfree(mnt->fscache_uniq); - mnt->fscache_uniq = string; mnt->options |= NFS_OPTION_FSCACHE; break; case Opt_local_lock: @@ -1665,99 +1627,59 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args, return nfs_walk_authlist(args, &request); } -static int nfs_parse_simple_hostname(const char *dev_name, - char **hostname, size_t maxnamlen, - char **export_path, size_t maxpathlen) +/* + * Split "dev_name" into "hostname:export_path". + * + * The leftmost colon demarks the split between the server's hostname + * and the export path. If the hostname starts with a left square + * bracket, then it may contain colons. + * + * Note: caller frees hostname and export path, even on error. + */ +static int nfs_parse_devname(const char *dev_name, + char **hostname, size_t maxnamlen, + char **export_path, size_t maxpathlen) { size_t len; - char *colon, *comma; - - colon = strchr(dev_name, ':'); - if (colon == NULL) - goto out_bad_devname; - - len = colon - dev_name; - if (len > maxnamlen) - goto out_hostname; + char *end; - /* N.B. caller will free nfs_server.hostname in all cases */ - *hostname = kstrndup(dev_name, len, GFP_KERNEL); - if (!*hostname) - goto out_nomem; - - /* kill possible hostname list: not supported */ - comma = strchr(*hostname, ','); - if (comma != NULL) { - if (comma == *hostname) + /* Is the host name protected with square brakcets? */ + if (*dev_name == '[') { + end = strchr(++dev_name, ']'); + if (end == NULL || end[1] != ':') goto out_bad_devname; - *comma = '\0'; - } - colon++; - len = strlen(colon); - if (len > maxpathlen) - goto out_path; - *export_path = kstrndup(colon, len, GFP_KERNEL); - if (!*export_path) - goto out_nomem; - - dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", *export_path); - return 0; - -out_bad_devname: - dfprintk(MOUNT, "NFS: device name not in host:path format\n"); - return -EINVAL; - -out_nomem: - dfprintk(MOUNT, "NFS: not enough memory to parse device name\n"); - return -ENOMEM; - -out_hostname: - dfprintk(MOUNT, "NFS: server hostname too long\n"); - return -ENAMETOOLONG; - -out_path: - dfprintk(MOUNT, "NFS: export pathname too long\n"); - return -ENAMETOOLONG; -} - -/* - * Hostname has square brackets around it because it contains one or - * more colons. We look for the first closing square bracket, and a - * colon must follow it. - */ -static int nfs_parse_protected_hostname(const char *dev_name, - char **hostname, size_t maxnamlen, - char **export_path, size_t maxpathlen) -{ - size_t len; - char *start, *end; + len = end - dev_name; + end++; + } else { + char *comma; - start = (char *)(dev_name + 1); + end = strchr(dev_name, ':'); + if (end == NULL) + goto out_bad_devname; + len = end - dev_name; - end = strchr(start, ']'); - if (end == NULL) - goto out_bad_devname; - if (*(end + 1) != ':') - goto out_bad_devname; + /* kill possible hostname list: not supported */ + comma = strchr(dev_name, ','); + if (comma != NULL && comma < end) + *comma = 0; + } - len = end - start; if (len > maxnamlen) goto out_hostname; /* N.B. caller will free nfs_server.hostname in all cases */ - *hostname = kstrndup(start, len, GFP_KERNEL); + *hostname = kstrndup(dev_name, len, GFP_KERNEL); if (*hostname == NULL) goto out_nomem; - - end += 2; - len = strlen(end); + len = strlen(++end); if (len > maxpathlen) goto out_path; *export_path = kstrndup(end, len, GFP_KERNEL); if (!*export_path) goto out_nomem; + dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", *export_path); return 0; out_bad_devname: @@ -1778,29 +1700,6 @@ out_path: } /* - * Split "dev_name" into "hostname:export_path". - * - * The leftmost colon demarks the split between the server's hostname - * and the export path. If the hostname starts with a left square - * bracket, then it may contain colons. - * - * Note: caller frees hostname and export path, even on error. - */ -static int nfs_parse_devname(const char *dev_name, - char **hostname, size_t maxnamlen, - char **export_path, size_t maxpathlen) -{ - if (*dev_name == '[') - return nfs_parse_protected_hostname(dev_name, - hostname, maxnamlen, - export_path, maxpathlen); - - return nfs_parse_simple_hostname(dev_name, - hostname, maxnamlen, - export_path, maxpathlen); -} - -/* * Validate the NFS2/NFS3 mount data * - fills in the mount root filehandle * @@ -2267,19 +2166,19 @@ static int nfs_bdi_register(struct nfs_server *server) return bdi_register_dev(&server->backing_dev_info, server->s_dev); } -static int nfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt) +static struct dentry *nfs_fs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *raw_data) { struct nfs_server *server = NULL; struct super_block *s; struct nfs_parsed_mount_data *data; struct nfs_fh *mntfh; - struct dentry *mntroot; + struct dentry *mntroot = ERR_PTR(-ENOMEM); int (*compare_super)(struct super_block *, void *) = nfs_compare_super; struct nfs_sb_mountdata sb_mntdata = { .mntflags = flags, }; - int error = -ENOMEM; + int error; data = nfs_alloc_parsed_mount_data(NFS_DEFAULT_VERSION); mntfh = nfs_alloc_fhandle(); @@ -2290,12 +2189,14 @@ static int nfs_get_sb(struct file_system_type *fs_type, /* Validate the mount data */ error = nfs_validate_mount_data(raw_data, data, mntfh, dev_name); - if (error < 0) + if (error < 0) { + mntroot = ERR_PTR(error); goto out; + } #ifdef CONFIG_NFS_V4 if (data->version == 4) { - error = nfs4_try_mount(flags, dev_name, data, mnt); + mntroot = nfs4_try_mount(flags, dev_name, data); kfree(data->client_address); kfree(data->nfs_server.export_path); goto out; @@ -2305,7 +2206,7 @@ static int nfs_get_sb(struct file_system_type *fs_type, /* Get a volume representation */ server = nfs_create_server(data, mntfh); if (IS_ERR(server)) { - error = PTR_ERR(server); + mntroot = ERR_CAST(server); goto out; } sb_mntdata.server = server; @@ -2316,7 +2217,7 @@ static int nfs_get_sb(struct file_system_type *fs_type, /* Get a superblock - note that we may end up sharing one that already exists */ s = sget(fs_type, compare_super, nfs_set_super, &sb_mntdata); if (IS_ERR(s)) { - error = PTR_ERR(s); + mntroot = ERR_CAST(s); goto out_err_nosb; } @@ -2325,8 +2226,10 @@ static int nfs_get_sb(struct file_system_type *fs_type, server = NULL; } else { error = nfs_bdi_register(server); - if (error) + if (error) { + mntroot = ERR_PTR(error); goto error_splat_bdi; + } } if (!s->s_root) { @@ -2336,20 +2239,15 @@ static int nfs_get_sb(struct file_system_type *fs_type, s, data ? data->fscache_uniq : NULL, NULL); } - mntroot = nfs_get_root(s, mntfh); - if (IS_ERR(mntroot)) { - error = PTR_ERR(mntroot); + mntroot = nfs_get_root(s, mntfh, dev_name); + if (IS_ERR(mntroot)) goto error_splat_super; - } error = security_sb_set_mnt_opts(s, &data->lsm_opts); if (error) goto error_splat_root; s->s_flags |= MS_ACTIVE; - mnt->mnt_sb = s; - mnt->mnt_root = mntroot; - error = 0; out: kfree(data->nfs_server.hostname); @@ -2359,7 +2257,7 @@ out: out_free_fh: nfs_free_fhandle(mntfh); kfree(data); - return error; + return mntroot; out_err_nosb: nfs_free_server(server); @@ -2367,6 +2265,7 @@ out_err_nosb: error_splat_root: dput(mntroot); + mntroot = ERR_PTR(error); error_splat_super: if (server && !s->s_root) bdi_unregister(&server->backing_dev_info); @@ -2450,7 +2349,7 @@ nfs_xdev_mount(struct file_system_type *fs_type, int flags, nfs_fscache_get_super_cookie(s, NULL, data); } - mntroot = nfs_get_root(s, data->fh); + mntroot = nfs_get_root(s, data->fh, dev_name); if (IS_ERR(mntroot)) { error = PTR_ERR(mntroot); goto error_splat_super; @@ -2718,7 +2617,7 @@ nfs4_remote_mount(struct file_system_type *fs_type, int flags, s, data ? data->fscache_uniq : NULL, NULL); } - mntroot = nfs4_get_root(s, mntfh); + mntroot = nfs4_get_root(s, mntfh, dev_name); if (IS_ERR(mntroot)) { error = PTR_ERR(mntroot); goto error_splat_super; @@ -2771,27 +2670,6 @@ static struct vfsmount *nfs_do_root_mount(struct file_system_type *fs_type, return root_mnt; } -static void nfs_fix_devname(const struct path *path, struct vfsmount *mnt) -{ - char *page = (char *) __get_free_page(GFP_KERNEL); - char *devname, *tmp; - - if (page == NULL) - return; - devname = nfs_path(path->mnt->mnt_devname, - path->mnt->mnt_root, path->dentry, - page, PAGE_SIZE); - if (IS_ERR(devname)) - goto out_freepage; - tmp = kstrdup(devname, GFP_KERNEL); - if (tmp == NULL) - goto out_freepage; - kfree(mnt->mnt_devname); - mnt->mnt_devname = tmp; -out_freepage: - free_page((unsigned long)page); -} - struct nfs_referral_count { struct list_head list; const struct task_struct *task; @@ -2858,17 +2736,18 @@ static void nfs_referral_loop_unprotect(void) kfree(p); } -static int nfs_follow_remote_path(struct vfsmount *root_mnt, - const char *export_path, struct vfsmount *mnt_target) +static struct dentry *nfs_follow_remote_path(struct vfsmount *root_mnt, + const char *export_path) { struct nameidata *nd = NULL; struct mnt_namespace *ns_private; struct super_block *s; + struct dentry *dentry; int ret; nd = kmalloc(sizeof(*nd), GFP_KERNEL); if (nd == NULL) - return -ENOMEM; + return ERR_PTR(-ENOMEM); ns_private = create_mnt_ns(root_mnt); ret = PTR_ERR(ns_private); @@ -2890,32 +2769,27 @@ static int nfs_follow_remote_path(struct vfsmount *root_mnt, s = nd->path.mnt->mnt_sb; atomic_inc(&s->s_active); - mnt_target->mnt_sb = s; - mnt_target->mnt_root = dget(nd->path.dentry); - - /* Correct the device pathname */ - nfs_fix_devname(&nd->path, mnt_target); + dentry = dget(nd->path.dentry); path_put(&nd->path); kfree(nd); down_write(&s->s_umount); - return 0; + return dentry; out_put_mnt_ns: put_mnt_ns(ns_private); out_mntput: mntput(root_mnt); out_err: kfree(nd); - return ret; + return ERR_PTR(ret); } -static int nfs4_try_mount(int flags, const char *dev_name, - struct nfs_parsed_mount_data *data, - struct vfsmount *mnt) +static struct dentry *nfs4_try_mount(int flags, const char *dev_name, + struct nfs_parsed_mount_data *data) { char *export_path; struct vfsmount *root_mnt; - int error; + struct dentry *res; dfprintk(MOUNT, "--> nfs4_try_mount()\n"); @@ -2925,26 +2799,25 @@ static int nfs4_try_mount(int flags, const char *dev_name, data->nfs_server.hostname); data->nfs_server.export_path = export_path; - error = PTR_ERR(root_mnt); - if (IS_ERR(root_mnt)) - goto out; - - error = nfs_follow_remote_path(root_mnt, export_path, mnt); + res = ERR_CAST(root_mnt); + if (!IS_ERR(root_mnt)) + res = nfs_follow_remote_path(root_mnt, export_path); -out: - dfprintk(MOUNT, "<-- nfs4_try_mount() = %d%s\n", error, - error != 0 ? " [error]" : ""); - return error; + dfprintk(MOUNT, "<-- nfs4_try_mount() = %ld%s\n", + IS_ERR(res) ? PTR_ERR(res) : 0, + IS_ERR(res) ? " [error]" : ""); + return res; } /* * Get the superblock for an NFS4 mountpoint */ -static int nfs4_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt) +static struct dentry *nfs4_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *raw_data) { struct nfs_parsed_mount_data *data; int error = -ENOMEM; + struct dentry *res = ERR_PTR(-ENOMEM); data = nfs_alloc_parsed_mount_data(4); if (data == NULL) @@ -2952,10 +2825,14 @@ static int nfs4_get_sb(struct file_system_type *fs_type, /* Validate the mount data */ error = nfs4_validate_mount_data(raw_data, data, dev_name); - if (error < 0) + if (error < 0) { + res = ERR_PTR(error); goto out; + } - error = nfs4_try_mount(flags, dev_name, data, mnt); + res = nfs4_try_mount(flags, dev_name, data); + if (IS_ERR(res)) + error = PTR_ERR(res); out: kfree(data->client_address); @@ -2964,9 +2841,9 @@ out: kfree(data->fscache_uniq); out_free_data: kfree(data); - dprintk("<-- nfs4_get_sb() = %d%s\n", error, + dprintk("<-- nfs4_mount() = %d%s\n", error, error != 0 ? " [error]" : ""); - return error; + return res; } static void nfs4_kill_super(struct super_block *sb) @@ -3033,7 +2910,7 @@ nfs4_xdev_mount(struct file_system_type *fs_type, int flags, nfs_fscache_get_super_cookie(s, NULL, data); } - mntroot = nfs4_get_root(s, data->fh); + mntroot = nfs4_get_root(s, data->fh, dev_name); if (IS_ERR(mntroot)) { error = PTR_ERR(mntroot); goto error_splat_super; @@ -3120,7 +2997,7 @@ nfs4_remote_referral_mount(struct file_system_type *fs_type, int flags, nfs_fscache_get_super_cookie(s, NULL, data); } - mntroot = nfs4_get_root(s, mntfh); + mntroot = nfs4_get_root(s, mntfh, dev_name); if (IS_ERR(mntroot)) { error = PTR_ERR(mntroot); goto error_splat_super; @@ -3160,16 +3037,15 @@ error_splat_bdi: /* * Create an NFS4 server record on referral traversal */ -static int nfs4_referral_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *raw_data, - struct vfsmount *mnt) +static struct dentry *nfs4_referral_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *raw_data) { struct nfs_clone_mount *data = raw_data; char *export_path; struct vfsmount *root_mnt; - int error; + struct dentry *res; - dprintk("--> nfs4_referral_get_sb()\n"); + dprintk("--> nfs4_referral_mount()\n"); export_path = data->mnt_path; data->mnt_path = "/"; @@ -3178,15 +3054,13 @@ static int nfs4_referral_get_sb(struct file_system_type *fs_type, flags, data, data->hostname); data->mnt_path = export_path; - error = PTR_ERR(root_mnt); - if (IS_ERR(root_mnt)) - goto out; - - error = nfs_follow_remote_path(root_mnt, export_path, mnt); -out: - dprintk("<-- nfs4_referral_get_sb() = %d%s\n", error, - error != 0 ? " [error]" : ""); - return error; + res = ERR_CAST(root_mnt); + if (!IS_ERR(root_mnt)) + res = nfs_follow_remote_path(root_mnt, export_path); + dprintk("<-- nfs4_referral_mount() = %ld%s\n", + IS_ERR(res) ? PTR_ERR(res) : 0, + IS_ERR(res) ? " [error]" : ""); + return res; } #endif /* CONFIG_NFS_V4 */ diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index e313a51acdd..8d6864c2a5f 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -148,6 +148,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n alias = d_lookup(parent, &data->args.name); if (alias != NULL) { int ret = 0; + void *devname_garbage = NULL; /* * Hey, we raced with lookup... See if we need to transfer @@ -157,6 +158,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n spin_lock(&alias->d_lock); if (alias->d_inode != NULL && !(alias->d_flags & DCACHE_NFSFS_RENAMED)) { + devname_garbage = alias->d_fsdata; alias->d_fsdata = data; alias->d_flags |= DCACHE_NFSFS_RENAMED; ret = 1; @@ -164,6 +166,13 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n spin_unlock(&alias->d_lock); nfs_dec_sillycount(dir); dput(alias); + /* + * If we'd displaced old cached devname, free it. At that + * point dentry is definitely not a root, so we won't need + * that anymore. + */ + if (devname_garbage) + kfree(devname_garbage); return ret; } data->dir = igrab(dir); @@ -180,7 +189,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n task_setup_data.rpc_client = NFS_CLIENT(dir); task = rpc_run_task(&task_setup_data); if (!IS_ERR(task)) - rpc_put_task(task); + rpc_put_task_async(task); return 1; } @@ -252,6 +261,7 @@ nfs_async_unlink(struct inode *dir, struct dentry *dentry) { struct nfs_unlinkdata *data; int status = -ENOMEM; + void *devname_garbage = NULL; data = kzalloc(sizeof(*data), GFP_KERNEL); if (data == NULL) @@ -269,8 +279,16 @@ nfs_async_unlink(struct inode *dir, struct dentry *dentry) if (dentry->d_flags & DCACHE_NFSFS_RENAMED) goto out_unlock; dentry->d_flags |= DCACHE_NFSFS_RENAMED; + devname_garbage = dentry->d_fsdata; dentry->d_fsdata = data; spin_unlock(&dentry->d_lock); + /* + * If we'd displaced old cached devname, free it. At that + * point dentry is definitely not a root, so we won't need + * that anymore. + */ + if (devname_garbage) + kfree(devname_garbage); return 0; out_unlock: spin_unlock(&dentry->d_lock); @@ -299,6 +317,7 @@ nfs_complete_unlink(struct dentry *dentry, struct inode *inode) if (dentry->d_flags & DCACHE_NFSFS_RENAMED) { dentry->d_flags &= ~DCACHE_NFSFS_RENAMED; data = dentry->d_fsdata; + dentry->d_fsdata = NULL; } spin_unlock(&dentry->d_lock); @@ -315,6 +334,7 @@ nfs_cancel_async_unlink(struct dentry *dentry) struct nfs_unlinkdata *data = dentry->d_fsdata; dentry->d_flags &= ~DCACHE_NFSFS_RENAMED; + dentry->d_fsdata = NULL; spin_unlock(&dentry->d_lock); nfs_free_unlinkdata(data); return; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index c8278f4046c..47a3ad63e0d 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -28,6 +28,7 @@ #include "iostat.h" #include "nfs4_fs.h" #include "fscache.h" +#include "pnfs.h" #define NFSDBG_FACILITY NFSDBG_PAGECACHE @@ -96,6 +97,7 @@ void nfs_writedata_free(struct nfs_write_data *p) static void nfs_writedata_release(struct nfs_write_data *wdata) { + put_lseg(wdata->lseg); put_nfs_open_context(wdata->args.context); nfs_writedata_free(wdata); } @@ -781,25 +783,21 @@ static int flush_task_priority(int how) return RPC_PRIORITY_NORMAL; } -/* - * Set up the argument/result storage required for the RPC call. - */ -static int nfs_write_rpcsetup(struct nfs_page *req, - struct nfs_write_data *data, - const struct rpc_call_ops *call_ops, - unsigned int count, unsigned int offset, - int how) +int nfs_initiate_write(struct nfs_write_data *data, + struct rpc_clnt *clnt, + const struct rpc_call_ops *call_ops, + int how) { - struct inode *inode = req->wb_context->path.dentry->d_inode; + struct inode *inode = data->inode; int priority = flush_task_priority(how); struct rpc_task *task; struct rpc_message msg = { .rpc_argp = &data->args, .rpc_resp = &data->res, - .rpc_cred = req->wb_context->cred, + .rpc_cred = data->cred, }; struct rpc_task_setup task_setup_data = { - .rpc_client = NFS_CLIENT(inode), + .rpc_client = clnt, .task = &data->task, .rpc_message = &msg, .callback_ops = call_ops, @@ -810,12 +808,52 @@ static int nfs_write_rpcsetup(struct nfs_page *req, }; int ret = 0; + /* Set up the initial task struct. */ + NFS_PROTO(inode)->write_setup(data, &msg); + + dprintk("NFS: %5u initiated write call " + "(req %s/%lld, %u bytes @ offset %llu)\n", + data->task.tk_pid, + inode->i_sb->s_id, + (long long)NFS_FILEID(inode), + data->args.count, + (unsigned long long)data->args.offset); + + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) { + ret = PTR_ERR(task); + goto out; + } + if (how & FLUSH_SYNC) { + ret = rpc_wait_for_completion_task(task); + if (ret == 0) + ret = task->tk_status; + } + rpc_put_task(task); +out: + return ret; +} +EXPORT_SYMBOL_GPL(nfs_initiate_write); + +/* + * Set up the argument/result storage required for the RPC call. + */ +static int nfs_write_rpcsetup(struct nfs_page *req, + struct nfs_write_data *data, + const struct rpc_call_ops *call_ops, + unsigned int count, unsigned int offset, + struct pnfs_layout_segment *lseg, + int how) +{ + struct inode *inode = req->wb_context->path.dentry->d_inode; + /* Set up the RPC argument and reply structs * NB: take care not to mess about with data->commit et al. */ data->req = req; data->inode = inode = req->wb_context->path.dentry->d_inode; - data->cred = msg.rpc_cred; + data->cred = req->wb_context->cred; + data->lseg = get_lseg(lseg); data->args.fh = NFS_FH(inode); data->args.offset = req_offset(req) + offset; @@ -836,30 +874,11 @@ static int nfs_write_rpcsetup(struct nfs_page *req, data->res.verf = &data->verf; nfs_fattr_init(&data->fattr); - /* Set up the initial task struct. */ - NFS_PROTO(inode)->write_setup(data, &msg); - - dprintk("NFS: %5u initiated write call " - "(req %s/%lld, %u bytes @ offset %llu)\n", - data->task.tk_pid, - inode->i_sb->s_id, - (long long)NFS_FILEID(inode), - count, - (unsigned long long)data->args.offset); + if (data->lseg && + (pnfs_try_to_write_data(data, call_ops, how) == PNFS_ATTEMPTED)) + return 0; - task = rpc_run_task(&task_setup_data); - if (IS_ERR(task)) { - ret = PTR_ERR(task); - goto out; - } - if (how & FLUSH_SYNC) { - ret = rpc_wait_for_completion_task(task); - if (ret == 0) - ret = task->tk_status; - } - rpc_put_task(task); -out: - return ret; + return nfs_initiate_write(data, NFS_CLIENT(inode), call_ops, how); } /* If a nfs_flush_* function fails, it should remove reqs from @head and @@ -879,20 +898,21 @@ static void nfs_redirty_request(struct nfs_page *req) * Generate multiple small requests to write out a single * contiguous dirty area on one page. */ -static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int how) +static int nfs_flush_multi(struct nfs_pageio_descriptor *desc) { - struct nfs_page *req = nfs_list_entry(head->next); + struct nfs_page *req = nfs_list_entry(desc->pg_list.next); struct page *page = req->wb_page; struct nfs_write_data *data; - size_t wsize = NFS_SERVER(inode)->wsize, nbytes; + size_t wsize = NFS_SERVER(desc->pg_inode)->wsize, nbytes; unsigned int offset; int requests = 0; int ret = 0; + struct pnfs_layout_segment *lseg; LIST_HEAD(list); nfs_list_remove_request(req); - nbytes = count; + nbytes = desc->pg_count; do { size_t len = min(nbytes, wsize); @@ -905,9 +925,11 @@ static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned } while (nbytes != 0); atomic_set(&req->wb_complete, requests); + BUG_ON(desc->pg_lseg); + lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_RW); ClearPageError(page); offset = 0; - nbytes = count; + nbytes = desc->pg_count; do { int ret2; @@ -919,13 +941,15 @@ static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned if (nbytes < wsize) wsize = nbytes; ret2 = nfs_write_rpcsetup(req, data, &nfs_write_partial_ops, - wsize, offset, how); + wsize, offset, lseg, desc->pg_ioflags); if (ret == 0) ret = ret2; offset += wsize; nbytes -= wsize; } while (nbytes != 0); + put_lseg(lseg); + desc->pg_lseg = NULL; return ret; out_bad: @@ -946,16 +970,26 @@ out_bad: * This is the case if nfs_updatepage detects a conflicting request * that has been written but not committed. */ -static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int how) +static int nfs_flush_one(struct nfs_pageio_descriptor *desc) { struct nfs_page *req; struct page **pages; struct nfs_write_data *data; + struct list_head *head = &desc->pg_list; + struct pnfs_layout_segment *lseg = desc->pg_lseg; + int ret; - data = nfs_writedata_alloc(npages); - if (!data) - goto out_bad; - + data = nfs_writedata_alloc(nfs_page_array_len(desc->pg_base, + desc->pg_count)); + if (!data) { + while (!list_empty(head)) { + req = nfs_list_entry(head->next); + nfs_list_remove_request(req); + nfs_redirty_request(req); + } + ret = -ENOMEM; + goto out; + } pages = data->pagevec; while (!list_empty(head)) { req = nfs_list_entry(head->next); @@ -965,16 +999,15 @@ static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned i *pages++ = req->wb_page; } req = nfs_list_entry(data->pages.next); + if ((!lseg) && list_is_singular(&data->pages)) + lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_RW); /* Set up the argument struct */ - return nfs_write_rpcsetup(req, data, &nfs_write_full_ops, count, 0, how); - out_bad: - while (!list_empty(head)) { - req = nfs_list_entry(head->next); - nfs_list_remove_request(req); - nfs_redirty_request(req); - } - return -ENOMEM; + ret = nfs_write_rpcsetup(req, data, &nfs_write_full_ops, desc->pg_count, 0, lseg, desc->pg_ioflags); +out: + put_lseg(lseg); /* Cleans any gotten in ->pg_test */ + desc->pg_lseg = NULL; + return ret; } static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, @@ -982,6 +1015,8 @@ static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, { size_t wsize = NFS_SERVER(inode)->wsize; + pnfs_pageio_init_write(pgio, inode); + if (wsize < PAGE_CACHE_SIZE) nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags); else @@ -1132,7 +1167,7 @@ static const struct rpc_call_ops nfs_write_full_ops = { /* * This function is called when the WRITE call is complete. */ -int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) +void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) { struct nfs_writeargs *argp = &data->args; struct nfs_writeres *resp = &data->res; @@ -1151,7 +1186,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) */ status = NFS_PROTO(data->inode)->write_done(task, data); if (status != 0) - return status; + return; nfs_add_stats(data->inode, NFSIOS_SERVERWRITTENBYTES, resp->count); #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) @@ -1166,6 +1201,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) */ static unsigned long complain; + /* Note this will print the MDS for a DS write */ if (time_before(complain, jiffies)) { dprintk("NFS: faulty NFS server %s:" " (committed = %d) != (stable = %d)\n", @@ -1186,6 +1222,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) /* Was this an NFSv2 write or an NFSv3 stable write? */ if (resp->verf->committed != NFS_UNSTABLE) { /* Resend from where the server left off */ + data->mds_offset += resp->count; argp->offset += resp->count; argp->pgbase += resp->count; argp->count -= resp->count; @@ -1196,7 +1233,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) argp->stable = NFS_FILE_SYNC; } nfs_restart_rpc(task, server->nfs_client); - return -EAGAIN; + return; } if (time_before(complain, jiffies)) { printk(KERN_WARNING @@ -1207,7 +1244,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) /* Can't do anything about it except throw an error. */ task->tk_status = -EIO; } - return 0; + return; } @@ -1292,6 +1329,8 @@ static int nfs_commit_rpcsetup(struct list_head *head, task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) return PTR_ERR(task); + if (how & FLUSH_SYNC) + rpc_wait_for_completion_task(task); rpc_put_task(task); return 0; } diff --git a/fs/nfsctl.c b/fs/nfsctl.c index bf9cbd242dd..124e8fcb0dd 100644 --- a/fs/nfsctl.c +++ b/fs/nfsctl.c @@ -22,30 +22,17 @@ static struct file *do_open(char *name, int flags) { - struct nameidata nd; struct vfsmount *mnt; - int error; + struct file *file; mnt = do_kern_mount("nfsd", 0, "nfsd", NULL); if (IS_ERR(mnt)) return (struct file *)mnt; - error = vfs_path_lookup(mnt->mnt_root, mnt, name, 0, &nd); - mntput(mnt); /* drop do_kern_mount reference */ - if (error) - return ERR_PTR(error); - - if (flags == O_RDWR) - error = may_open(&nd.path, MAY_READ|MAY_WRITE, flags); - else - error = may_open(&nd.path, MAY_WRITE, flags); + file = file_open_root(mnt->mnt_root, mnt, name, flags); - if (!error) - return dentry_open(nd.path.dentry, nd.path.mnt, flags, - current_cred()); - - path_put(&nd.path); - return ERR_PTR(error); + mntput(mnt); /* drop do_kern_mount reference */ + return file; } static struct { diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index cde36cb0f34..02eb4edf0ec 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -432,7 +432,7 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr, * If the server returns different values for sessionID, slotID or * sequence number, the server is looney tunes. */ - p = xdr_inline_decode(xdr, NFS4_MAX_SESSIONID_LEN + 4 + 4); + p = xdr_inline_decode(xdr, NFS4_MAX_SESSIONID_LEN + 4 + 4 + 4 + 4); if (unlikely(p == NULL)) goto out_overflow; memcpy(id.data, p, NFS4_MAX_SESSIONID_LEN); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 54b60bfceb8..7b566ec14e1 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2445,15 +2445,16 @@ nfs4_check_delegmode(struct nfs4_delegation *dp, int flags) static struct nfs4_delegation * find_delegation_file(struct nfs4_file *fp, stateid_t *stid) { - struct nfs4_delegation *dp = NULL; + struct nfs4_delegation *dp; spin_lock(&recall_lock); - list_for_each_entry(dp, &fp->fi_delegations, dl_perfile) { - if (dp->dl_stateid.si_stateownerid == stid->si_stateownerid) - break; - } + list_for_each_entry(dp, &fp->fi_delegations, dl_perfile) + if (dp->dl_stateid.si_stateownerid == stid->si_stateownerid) { + spin_unlock(&recall_lock); + return dp; + } spin_unlock(&recall_lock); - return dp; + return NULL; } int share_access_to_flags(u32 share_access) diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 1275b865507..615f0a9f060 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1142,7 +1142,7 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp, u32 dummy; char *machine_name; - int i; + int i, j; int nr_secflavs; READ_BUF(16); @@ -1215,7 +1215,7 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp, READ_BUF(4); READ32(dummy); READ_BUF(dummy * 4); - for (i = 0; i < dummy; ++i) + for (j = 0; j < dummy; ++j) READ32(dummy); break; case RPC_AUTH_GSS: diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index da1d9701f8e..ff93025ae2f 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -87,7 +87,7 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp, .dentry = dget(dentry)}; int err = 0; - err = follow_down(&path, false); + err = follow_down(&path); if (err < 0) goto out; diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 8b61220cffc..6b1305dc26c 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -876,7 +876,7 @@ SYSCALL_ALIAS(sys_fanotify_mark, SyS_fanotify_mark); #endif /* - * fanotify_user_setup - Our initialization function. Note that we cannnot return + * fanotify_user_setup - Our initialization function. Note that we cannot return * error because we have compiled-in VFS hooks. So an (unlikely) failure here * must result in panic(). */ diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 4cd5d5d78f9..bd46e7c8a0e 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -841,7 +841,7 @@ out: } /* - * inotify_user_setup - Our initialization function. Note that we cannnot return + * inotify_user_setup - Our initialization function. Note that we cannot return * error because we have compiled-in VFS hooks. So an (unlikely) failure here * must result in panic(). */ diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c index 6d80ecc7834..7eb90403fc8 100644 --- a/fs/ocfs2/dcache.c +++ b/fs/ocfs2/dcache.c @@ -56,7 +56,7 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry, int ret = 0; /* if all else fails, just return false */ struct ocfs2_super *osb; - if (nd->flags & LOOKUP_RCU) + if (nd && nd->flags & LOOKUP_RCU) return -ECHILD; inode = dentry->d_inode; diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index d417b3f9b0c..f97b6f1c61d 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -354,7 +354,7 @@ static inline int ocfs2_match(int len, /* * Returns 0 if not found, -1 on failure, and 1 on success */ -static int inline ocfs2_search_dirblock(struct buffer_head *bh, +static inline int ocfs2_search_dirblock(struct buffer_head *bh, struct inode *dir, const char *name, int namelen, unsigned long offset, diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c index 5dbc3062b4f..254652a9b54 100644 --- a/fs/ocfs2/export.c +++ b/fs/ocfs2/export.c @@ -197,8 +197,12 @@ static int ocfs2_encode_fh(struct dentry *dentry, u32 *fh_in, int *max_len, dentry->d_name.len, dentry->d_name.name, fh, len, connectable); - if (len < 3 || (connectable && len < 6)) { - mlog(ML_ERROR, "fh buffer is too small for encoding\n"); + if (connectable && (len < 6)) { + *max_len = 6; + type = 255; + goto bail; + } else if (len < 3) { + *max_len = 3; type = 255; goto bail; } diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 849fb4a2e81..d6c25d76b53 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -293,7 +293,7 @@ static int ocfs2_mknod(struct inode *dir, } /* get security xattr */ - status = ocfs2_init_security_get(inode, dir, &si); + status = ocfs2_init_security_get(inode, dir, &dentry->d_name, &si); if (status) { if (status == -EOPNOTSUPP) si.enable = 0; @@ -1665,7 +1665,7 @@ static int ocfs2_symlink(struct inode *dir, } /* get security xattr */ - status = ocfs2_init_security_get(inode, dir, &si); + status = ocfs2_init_security_get(inode, dir, &dentry->d_name, &si); if (status) { if (status == -EOPNOTSUPP) si.enable = 0; diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h index 196fcb52d95..d5ab56cbe5c 100644 --- a/fs/ocfs2/quota.h +++ b/fs/ocfs2/quota.h @@ -114,7 +114,4 @@ int ocfs2_local_write_dquot(struct dquot *dquot); extern const struct dquot_operations ocfs2_quota_operations; extern struct quota_format_type ocfs2_quota_format; -int ocfs2_quota_setup(void); -void ocfs2_quota_shutdown(void); - #endif /* _OCFS2_QUOTA_H */ diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index 4607923eb24..a73f6416648 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -63,8 +63,6 @@ * write to gf */ -static struct workqueue_struct *ocfs2_quota_wq = NULL; - static void qsync_work_fn(struct work_struct *work); static void ocfs2_global_disk2memdqb(struct dquot *dquot, void *dp) @@ -400,8 +398,8 @@ int ocfs2_global_read_info(struct super_block *sb, int type) OCFS2_QBLK_RESERVED_SPACE; oinfo->dqi_gi.dqi_qtree_depth = qtree_depth(&oinfo->dqi_gi); INIT_DELAYED_WORK(&oinfo->dqi_sync_work, qsync_work_fn); - queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work, - msecs_to_jiffies(oinfo->dqi_syncms)); + schedule_delayed_work(&oinfo->dqi_sync_work, + msecs_to_jiffies(oinfo->dqi_syncms)); out_err: mlog_exit(status); @@ -635,8 +633,8 @@ static void qsync_work_fn(struct work_struct *work) struct super_block *sb = oinfo->dqi_gqinode->i_sb; dquot_scan_active(sb, ocfs2_sync_dquot_helper, oinfo->dqi_type); - queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work, - msecs_to_jiffies(oinfo->dqi_syncms)); + schedule_delayed_work(&oinfo->dqi_sync_work, + msecs_to_jiffies(oinfo->dqi_syncms)); } /* @@ -923,20 +921,3 @@ const struct dquot_operations ocfs2_quota_operations = { .alloc_dquot = ocfs2_alloc_dquot, .destroy_dquot = ocfs2_destroy_dquot, }; - -int ocfs2_quota_setup(void) -{ - ocfs2_quota_wq = create_workqueue("o2quot"); - if (!ocfs2_quota_wq) - return -ENOMEM; - return 0; -} - -void ocfs2_quota_shutdown(void) -{ - if (ocfs2_quota_wq) { - flush_workqueue(ocfs2_quota_wq); - destroy_workqueue(ocfs2_quota_wq); - ocfs2_quota_wq = NULL; - } -} diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 19ebc5aad39..c384d634872 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4328,7 +4328,8 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, /* If the security isn't preserved, we need to re-initialize them. */ if (!preserve) { - error = ocfs2_init_security_and_acl(dir, new_orphan_inode); + error = ocfs2_init_security_and_acl(dir, new_orphan_inode, + &new_dentry->d_name); if (error) mlog_errno(error); } @@ -4379,7 +4380,7 @@ static int ocfs2_user_path_parent(const char __user *path, if (IS_ERR(s)) return PTR_ERR(s); - error = path_lookup(s, LOOKUP_PARENT, nd); + error = kern_path_parent(s, nd); if (error) putname(s); else diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 36c423fb063..236ed1bdca2 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1657,16 +1657,11 @@ static int __init ocfs2_init(void) mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n"); } - status = ocfs2_quota_setup(); - if (status) - goto leave; - ocfs2_set_locking_protocol(); status = register_quota_format(&ocfs2_quota_format); leave: if (status < 0) { - ocfs2_quota_shutdown(); ocfs2_free_mem_caches(); exit_ocfs2_uptodate_cache(); } @@ -1683,8 +1678,6 @@ static void __exit ocfs2_exit(void) { mlog_entry_void(); - ocfs2_quota_shutdown(); - if (ocfs2_wq) { flush_workqueue(ocfs2_wq); destroy_workqueue(ocfs2_wq); diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 67cd4391464..6bb602486c6 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -7185,7 +7185,8 @@ out: * must not hold any lock expect i_mutex. */ int ocfs2_init_security_and_acl(struct inode *dir, - struct inode *inode) + struct inode *inode, + const struct qstr *qstr) { int ret = 0; struct buffer_head *dir_bh = NULL; @@ -7193,7 +7194,7 @@ int ocfs2_init_security_and_acl(struct inode *dir, .enable = 1, }; - ret = ocfs2_init_security_get(inode, dir, &si); + ret = ocfs2_init_security_get(inode, dir, qstr, &si); if (!ret) { ret = ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY, si.name, si.value, si.value_len, @@ -7261,13 +7262,14 @@ static int ocfs2_xattr_security_set(struct dentry *dentry, const char *name, int ocfs2_init_security_get(struct inode *inode, struct inode *dir, + const struct qstr *qstr, struct ocfs2_security_xattr_info *si) { /* check whether ocfs2 support feature xattr */ if (!ocfs2_supports_xattr(OCFS2_SB(dir->i_sb))) return -EOPNOTSUPP; - return security_inode_init_security(inode, dir, &si->name, &si->value, - &si->value_len); + return security_inode_init_security(inode, dir, qstr, &si->name, + &si->value, &si->value_len); } int ocfs2_init_security_set(handle_t *handle, diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h index aa64bb37a65..d63cfb72316 100644 --- a/fs/ocfs2/xattr.h +++ b/fs/ocfs2/xattr.h @@ -57,6 +57,7 @@ int ocfs2_has_inline_xattr_value_outside(struct inode *inode, struct ocfs2_dinode *di); int ocfs2_xattr_remove(struct inode *, struct buffer_head *); int ocfs2_init_security_get(struct inode *, struct inode *, + const struct qstr *, struct ocfs2_security_xattr_info *); int ocfs2_init_security_set(handle_t *, struct inode *, struct buffer_head *, @@ -94,5 +95,6 @@ int ocfs2_reflink_xattrs(struct inode *old_inode, struct buffer_head *new_bh, bool preserve_security); int ocfs2_init_security_and_acl(struct inode *dir, - struct inode *inode); + struct inode *inode, + const struct qstr *qstr); #endif /* OCFS2_XATTR_H */ diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c index 393f3f659da..de4ff29f1e0 100644 --- a/fs/omfs/dir.c +++ b/fs/omfs/dir.c @@ -235,33 +235,22 @@ static int omfs_dir_is_empty(struct inode *inode) return *ptr != ~0; } -static int omfs_unlink(struct inode *dir, struct dentry *dentry) +static int omfs_remove(struct inode *dir, struct dentry *dentry) { - int ret; struct inode *inode = dentry->d_inode; + int ret; + + if (S_ISDIR(inode->i_mode) && !omfs_dir_is_empty(inode)) + return -ENOTEMPTY; ret = omfs_delete_entry(dentry); if (ret) - goto end_unlink; - - inode_dec_link_count(inode); + return ret; + + clear_nlink(inode); + mark_inode_dirty(inode); mark_inode_dirty(dir); - -end_unlink: - return ret; -} - -static int omfs_rmdir(struct inode *dir, struct dentry *dentry) -{ - int err = -ENOTEMPTY; - struct inode *inode = dentry->d_inode; - - if (omfs_dir_is_empty(inode)) { - err = omfs_unlink(dir, dentry); - if (!err) - inode_dec_link_count(inode); - } - return err; + return 0; } static int omfs_add_node(struct inode *dir, struct dentry *dentry, int mode) @@ -372,9 +361,10 @@ static int omfs_fill_chain(struct file *filp, void *dirent, filldir_t filldir, res = filldir(dirent, oi->i_name, strnlen(oi->i_name, OMFS_NAMELEN), filp->f_pos, self, d_type); - if (res == 0) - filp->f_pos++; brelse(bh); + if (res < 0) + break; + filp->f_pos++; } out: return res; @@ -385,44 +375,28 @@ static int omfs_rename(struct inode *old_dir, struct dentry *old_dentry, { struct inode *new_inode = new_dentry->d_inode; struct inode *old_inode = old_dentry->d_inode; - struct buffer_head *bh; - int is_dir; int err; - is_dir = S_ISDIR(old_inode->i_mode); - if (new_inode) { /* overwriting existing file/dir */ - err = -ENOTEMPTY; - if (is_dir && !omfs_dir_is_empty(new_inode)) - goto out; - - err = -ENOENT; - bh = omfs_find_entry(new_dir, new_dentry->d_name.name, - new_dentry->d_name.len); - if (IS_ERR(bh)) - goto out; - brelse(bh); - - err = omfs_unlink(new_dir, new_dentry); + err = omfs_remove(new_dir, new_dentry); if (err) goto out; } /* since omfs locates files by name, we need to unlink _before_ * adding the new link or we won't find the old one */ - inode_inc_link_count(old_inode); - err = omfs_unlink(old_dir, old_dentry); - if (err) { - inode_dec_link_count(old_inode); + err = omfs_delete_entry(old_dentry); + if (err) goto out; - } + mark_inode_dirty(old_dir); err = omfs_add_link(new_dentry, old_inode); if (err) goto out; old_inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(old_inode); out: return err; } @@ -488,8 +462,8 @@ const struct inode_operations omfs_dir_inops = { .mkdir = omfs_mkdir, .rename = omfs_rename, .create = omfs_create, - .unlink = omfs_unlink, - .rmdir = omfs_rmdir, + .unlink = omfs_remove, + .rmdir = omfs_remove, }; const struct file_operations omfs_dir_operations = { diff --git a/fs/open.c b/fs/open.c index 5a2c6ebc22b..f83ca80cc59 100644 --- a/fs/open.c +++ b/fs/open.c @@ -233,6 +233,14 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (!(file->f_mode & FMODE_WRITE)) return -EBADF; + + /* It's not possible punch hole on append only file */ + if (mode & FALLOC_FL_PUNCH_HOLE && IS_APPEND(inode)) + return -EPERM; + + if (IS_IMMUTABLE(inode)) + return -EPERM; + /* * Revalidate the write permissions, in case security policy has * changed since the files were opened. @@ -565,13 +573,15 @@ SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user, { struct path path; int error = -EINVAL; - int follow; + int lookup_flags; - if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0) + if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) goto out; - follow = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW; - error = user_path_at(dfd, filename, follow, &path); + lookup_flags = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW; + if (flag & AT_EMPTY_PATH) + lookup_flags |= LOOKUP_EMPTY; + error = user_path_at(dfd, filename, lookup_flags, &path); if (error) goto out; error = mnt_want_write(path.mnt); @@ -661,11 +671,16 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt, int (*open)(struct inode *, struct file *), const struct cred *cred) { + static const struct file_operations empty_fops = {}; struct inode *inode; int error; f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; + + if (unlikely(f->f_flags & O_PATH)) + f->f_mode = FMODE_PATH; + inode = dentry->d_inode; if (f->f_mode & FMODE_WRITE) { error = __get_file_write_access(inode, mnt); @@ -679,9 +694,15 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt, f->f_path.dentry = dentry; f->f_path.mnt = mnt; f->f_pos = 0; - f->f_op = fops_get(inode->i_fop); file_sb_list_add(f, inode->i_sb); + if (unlikely(f->f_mode & FMODE_PATH)) { + f->f_op = &empty_fops; + return f; + } + + f->f_op = fops_get(inode->i_fop); + error = security_dentry_open(f, cred); if (error) goto cleanup_all; @@ -693,7 +714,8 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt, if (error) goto cleanup_all; } - ima_counts_get(f); + if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) + i_readcount_inc(inode); f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); @@ -882,15 +904,110 @@ void fd_install(unsigned int fd, struct file *file) EXPORT_SYMBOL(fd_install); +static inline int build_open_flags(int flags, int mode, struct open_flags *op) +{ + int lookup_flags = 0; + int acc_mode; + + if (!(flags & O_CREAT)) + mode = 0; + op->mode = mode; + + /* Must never be set by userspace */ + flags &= ~FMODE_NONOTIFY; + + /* + * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only + * check for O_DSYNC if the need any syncing at all we enforce it's + * always set instead of having to deal with possibly weird behaviour + * for malicious applications setting only __O_SYNC. + */ + if (flags & __O_SYNC) + flags |= O_DSYNC; + + /* + * If we have O_PATH in the open flag. Then we + * cannot have anything other than the below set of flags + */ + if (flags & O_PATH) { + flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH; + acc_mode = 0; + } else { + acc_mode = MAY_OPEN | ACC_MODE(flags); + } + + op->open_flag = flags; + + /* O_TRUNC implies we need access checks for write permissions */ + if (flags & O_TRUNC) + acc_mode |= MAY_WRITE; + + /* Allow the LSM permission hook to distinguish append + access from general write access. */ + if (flags & O_APPEND) + acc_mode |= MAY_APPEND; + + op->acc_mode = acc_mode; + + op->intent = flags & O_PATH ? 0 : LOOKUP_OPEN; + + if (flags & O_CREAT) { + op->intent |= LOOKUP_CREATE; + if (flags & O_EXCL) + op->intent |= LOOKUP_EXCL; + } + + if (flags & O_DIRECTORY) + lookup_flags |= LOOKUP_DIRECTORY; + if (!(flags & O_NOFOLLOW)) + lookup_flags |= LOOKUP_FOLLOW; + return lookup_flags; +} + +/** + * filp_open - open file and return file pointer + * + * @filename: path to open + * @flags: open flags as per the open(2) second argument + * @mode: mode for the new file if O_CREAT is set, else ignored + * + * This is the helper to open a file from kernelspace if you really + * have to. But in generally you should not do this, so please move + * along, nothing to see here.. + */ +struct file *filp_open(const char *filename, int flags, int mode) +{ + struct open_flags op; + int lookup = build_open_flags(flags, mode, &op); + return do_filp_open(AT_FDCWD, filename, &op, lookup); +} +EXPORT_SYMBOL(filp_open); + +struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt, + const char *filename, int flags) +{ + struct open_flags op; + int lookup = build_open_flags(flags, 0, &op); + if (flags & O_CREAT) + return ERR_PTR(-EINVAL); + if (!filename && (flags & O_DIRECTORY)) + if (!dentry->d_inode->i_op->lookup) + return ERR_PTR(-ENOTDIR); + return do_file_open_root(dentry, mnt, filename, &op, lookup); +} +EXPORT_SYMBOL(file_open_root); + long do_sys_open(int dfd, const char __user *filename, int flags, int mode) { + struct open_flags op; + int lookup = build_open_flags(flags, mode, &op); char *tmp = getname(filename); int fd = PTR_ERR(tmp); if (!IS_ERR(tmp)) { fd = get_unused_fd_flags(flags); if (fd >= 0) { - struct file *f = do_filp_open(dfd, tmp, flags, mode, 0); + struct file *f = do_filp_open(dfd, tmp, &op, lookup); if (IS_ERR(f)) { put_unused_fd(fd); fd = PTR_ERR(f); @@ -960,8 +1077,10 @@ int filp_close(struct file *filp, fl_owner_t id) if (filp->f_op && filp->f_op->flush) retval = filp->f_op->flush(filp, id); - dnotify_flush(filp, id); - locks_remove_posix(filp, id); + if (likely(!(filp->f_mode & FMODE_PATH))) { + dnotify_flush(filp, id); + locks_remove_posix(filp, id); + } fput(filp); return retval; } diff --git a/fs/partitions/osf.c b/fs/partitions/osf.c index 48cec7cbca1..764b86a0196 100644 --- a/fs/partitions/osf.c +++ b/fs/partitions/osf.c @@ -10,10 +10,13 @@ #include "check.h" #include "osf.h" +#define MAX_OSF_PARTITIONS 18 + int osf_partition(struct parsed_partitions *state) { int i; int slot = 1; + unsigned int npartitions; Sector sect; unsigned char *data; struct disklabel { @@ -45,7 +48,7 @@ int osf_partition(struct parsed_partitions *state) u8 p_fstype; u8 p_frag; __le16 p_cpg; - } d_partitions[8]; + } d_partitions[MAX_OSF_PARTITIONS]; } * label; struct d_partition * partition; @@ -63,7 +66,12 @@ int osf_partition(struct parsed_partitions *state) put_dev_sector(sect); return 0; } - for (i = 0 ; i < le16_to_cpu(label->d_npartitions); i++, partition++) { + npartitions = le16_to_cpu(label->d_npartitions); + if (npartitions > MAX_OSF_PARTITIONS) { + put_dev_sector(sect); + return 0; + } + for (i = 0 ; i < npartitions; i++, partition++) { if (slot == state->limit) break; if (le32_to_cpu(partition->p_size)) diff --git a/fs/proc/base.c b/fs/proc/base.c index 9d096e82b20..d49c4b5d2c3 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2620,35 +2620,6 @@ static const struct pid_entry proc_base_stuff[] = { &proc_self_inode_operations, NULL, {}), }; -/* - * Exceptional case: normally we are not allowed to unhash a busy - * directory. In this case, however, we can do it - no aliasing problems - * due to the way we treat inodes. - */ -static int proc_base_revalidate(struct dentry *dentry, struct nameidata *nd) -{ - struct inode *inode; - struct task_struct *task; - - if (nd->flags & LOOKUP_RCU) - return -ECHILD; - - inode = dentry->d_inode; - task = get_proc_task(inode); - if (task) { - put_task_struct(task); - return 1; - } - d_drop(dentry); - return 0; -} - -static const struct dentry_operations proc_base_dentry_operations = -{ - .d_revalidate = proc_base_revalidate, - .d_delete = pid_delete_dentry, -}; - static struct dentry *proc_base_instantiate(struct inode *dir, struct dentry *dentry, struct task_struct *task, const void *ptr) { @@ -2685,7 +2656,6 @@ static struct dentry *proc_base_instantiate(struct inode *dir, if (p->fop) inode->i_fop = p->fop; ei->op = p->op; - d_set_d_op(dentry, &proc_base_dentry_operations); d_add(dentry, inode); error = NULL; out: diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 176ce4cda68..d6a7ca1fdac 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -27,6 +27,7 @@ static void proc_evict_inode(struct inode *inode) { struct proc_dir_entry *de; + struct ctl_table_header *head; truncate_inode_pages(&inode->i_data, 0); end_writeback(inode); @@ -38,8 +39,11 @@ static void proc_evict_inode(struct inode *inode) de = PROC_I(inode)->pde; if (de) pde_put(de); - if (PROC_I(inode)->sysctl) - sysctl_head_put(PROC_I(inode)->sysctl); + head = PROC_I(inode)->sysctl; + if (head) { + rcu_assign_pointer(PROC_I(inode)->sysctl, NULL); + sysctl_head_put(head); + } } struct vfsmount *proc_mnt; diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 09a1f92a34e..f50133c11c2 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -32,7 +32,6 @@ static struct inode *proc_sys_make_inode(struct super_block *sb, ei->sysctl_entry = table; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; - inode->i_flags |= S_PRIVATE; /* tell selinux to ignore this inode */ inode->i_mode = table->mode; if (!table->child) { inode->i_mode |= S_IFREG; @@ -408,15 +407,18 @@ static int proc_sys_compare(const struct dentry *parent, const struct dentry *dentry, const struct inode *inode, unsigned int len, const char *str, const struct qstr *name) { + struct ctl_table_header *head; /* Although proc doesn't have negative dentries, rcu-walk means * that inode here can be NULL */ + /* AV: can it, indeed? */ if (!inode) - return 0; + return 1; if (name->len != len) return 1; if (memcmp(name->name, str, len)) return 1; - return !sysctl_is_seen(PROC_I(inode)->sysctl); + head = rcu_dereference(PROC_I(inode)->sysctl); + return !head || !sysctl_is_seen(head); } static const struct dentry_operations proc_sys_dentry_operations = { diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig new file mode 100644 index 00000000000..867d0ac026c --- /dev/null +++ b/fs/pstore/Kconfig @@ -0,0 +1,13 @@ +config PSTORE + bool "Persistant store support" + default n + help + This option enables generic access to platform level + persistent storage via "pstore" filesystem that can + be mounted as /dev/pstore. Only useful if you have + a platform level driver that registers with pstore to + provide the data, so you probably should just go say "Y" + (or "M") to a platform specific persistent store driver + (e.g. ACPI_APEI on X86) which will select this for you. + If you don't have a platform persistent store driver, + say N. diff --git a/fs/pstore/Makefile b/fs/pstore/Makefile new file mode 100644 index 00000000000..760f4bce7d1 --- /dev/null +++ b/fs/pstore/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the linux pstorefs routines. +# + +obj-y += pstore.o + +pstore-objs += inode.o platform.o diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c new file mode 100644 index 00000000000..08342232cb1 --- /dev/null +++ b/fs/pstore/inode.c @@ -0,0 +1,281 @@ +/* + * Persistent Storage - ramfs parts. + * + * Copyright (C) 2010 Intel Corporation <tony.luck@intel.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/fsnotify.h> +#include <linux/pagemap.h> +#include <linux/highmem.h> +#include <linux/time.h> +#include <linux/init.h> +#include <linux/string.h> +#include <linux/mount.h> +#include <linux/ramfs.h> +#include <linux/sched.h> +#include <linux/magic.h> +#include <linux/pstore.h> +#include <linux/slab.h> +#include <linux/uaccess.h> + +#include "internal.h" + +#define PSTORE_NAMELEN 64 + +struct pstore_private { + u64 id; + int (*erase)(u64); + ssize_t size; + char data[]; +}; + +static int pstore_file_open(struct inode *inode, struct file *file) +{ + file->private_data = inode->i_private; + return 0; +} + +static ssize_t pstore_file_read(struct file *file, char __user *userbuf, + size_t count, loff_t *ppos) +{ + struct pstore_private *ps = file->private_data; + + return simple_read_from_buffer(userbuf, count, ppos, ps->data, ps->size); +} + +static const struct file_operations pstore_file_operations = { + .open = pstore_file_open, + .read = pstore_file_read, + .llseek = default_llseek, +}; + +/* + * When a file is unlinked from our file system we call the + * platform driver to erase the record from persistent store. + */ +static int pstore_unlink(struct inode *dir, struct dentry *dentry) +{ + struct pstore_private *p = dentry->d_inode->i_private; + + p->erase(p->id); + kfree(p); + + return simple_unlink(dir, dentry); +} + +static const struct inode_operations pstore_dir_inode_operations = { + .lookup = simple_lookup, + .unlink = pstore_unlink, +}; + +static struct inode *pstore_get_inode(struct super_block *sb, + const struct inode *dir, int mode, dev_t dev) +{ + struct inode *inode = new_inode(sb); + + if (inode) { + inode->i_ino = get_next_ino(); + inode->i_uid = inode->i_gid = 0; + inode->i_mode = mode; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + switch (mode & S_IFMT) { + case S_IFREG: + inode->i_fop = &pstore_file_operations; + break; + case S_IFDIR: + inode->i_op = &pstore_dir_inode_operations; + inode->i_fop = &simple_dir_operations; + inc_nlink(inode); + break; + } + } + return inode; +} + +static const struct super_operations pstore_ops = { + .statfs = simple_statfs, + .drop_inode = generic_delete_inode, + .show_options = generic_show_options, +}; + +static struct super_block *pstore_sb; + +int pstore_is_mounted(void) +{ + return pstore_sb != NULL; +} + +/* + * Make a regular file in the root directory of our file system. + * Load it up with "size" bytes of data from "buf". + * Set the mtime & ctime to the date that this record was originally stored. + */ +int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, + char *data, size_t size, + struct timespec time, int (*erase)(u64)) +{ + struct dentry *root = pstore_sb->s_root; + struct dentry *dentry; + struct inode *inode; + int rc; + char name[PSTORE_NAMELEN]; + struct pstore_private *private; + + rc = -ENOMEM; + inode = pstore_get_inode(pstore_sb, root->d_inode, S_IFREG | 0444, 0); + if (!inode) + goto fail; + private = kmalloc(sizeof *private + size, GFP_KERNEL); + if (!private) + goto fail_alloc; + private->id = id; + private->erase = erase; + + switch (type) { + case PSTORE_TYPE_DMESG: + sprintf(name, "dmesg-%s-%lld", psname, id); + break; + case PSTORE_TYPE_MCE: + sprintf(name, "mce-%s-%lld", psname, id); + break; + case PSTORE_TYPE_UNKNOWN: + sprintf(name, "unknown-%s-%lld", psname, id); + break; + default: + sprintf(name, "type%d-%s-%lld", type, psname, id); + break; + } + + mutex_lock(&root->d_inode->i_mutex); + + rc = -ENOSPC; + dentry = d_alloc_name(root, name); + if (IS_ERR(dentry)) + goto fail_lockedalloc; + + memcpy(private->data, data, size); + inode->i_size = private->size = size; + + inode->i_private = private; + + if (time.tv_sec) + inode->i_mtime = inode->i_ctime = time; + + d_add(dentry, inode); + + mutex_unlock(&root->d_inode->i_mutex); + + return 0; + +fail_lockedalloc: + mutex_unlock(&root->d_inode->i_mutex); + kfree(private); +fail_alloc: + iput(inode); + +fail: + return rc; +} + +int pstore_fill_super(struct super_block *sb, void *data, int silent) +{ + struct inode *inode = NULL; + struct dentry *root; + int err; + + save_mount_options(sb, data); + + pstore_sb = sb; + + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_blocksize = PAGE_CACHE_SIZE; + sb->s_blocksize_bits = PAGE_CACHE_SHIFT; + sb->s_magic = PSTOREFS_MAGIC; + sb->s_op = &pstore_ops; + sb->s_time_gran = 1; + + inode = pstore_get_inode(sb, NULL, S_IFDIR | 0755, 0); + if (!inode) { + err = -ENOMEM; + goto fail; + } + /* override ramfs "dir" options so we catch unlink(2) */ + inode->i_op = &pstore_dir_inode_operations; + + root = d_alloc_root(inode); + sb->s_root = root; + if (!root) { + err = -ENOMEM; + goto fail; + } + + pstore_get_records(); + + return 0; +fail: + iput(inode); + return err; +} + +static struct dentry *pstore_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_single(fs_type, flags, data, pstore_fill_super); +} + +static void pstore_kill_sb(struct super_block *sb) +{ + kill_litter_super(sb); + pstore_sb = NULL; +} + +static struct file_system_type pstore_fs_type = { + .name = "pstore", + .mount = pstore_mount, + .kill_sb = pstore_kill_sb, +}; + +static int __init init_pstore_fs(void) +{ + int rc = 0; + struct kobject *pstorefs_kobj; + + pstorefs_kobj = kobject_create_and_add("pstore", fs_kobj); + if (!pstorefs_kobj) { + rc = -ENOMEM; + goto done; + } + + rc = sysfs_create_file(pstorefs_kobj, &pstore_kmsg_bytes_attr.attr); + if (rc) + goto done1; + + rc = register_filesystem(&pstore_fs_type); + if (rc == 0) + goto done; + + sysfs_remove_file(pstorefs_kobj, &pstore_kmsg_bytes_attr.attr); +done1: + kobject_put(pstorefs_kobj); +done: + return rc; +} +module_init(init_pstore_fs) + +MODULE_AUTHOR("Tony Luck <tony.luck@intel.com>"); +MODULE_LICENSE("GPL"); diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h new file mode 100644 index 00000000000..76c26d2fab2 --- /dev/null +++ b/fs/pstore/internal.h @@ -0,0 +1,7 @@ +extern void pstore_get_records(void); +extern int pstore_mkfile(enum pstore_type_id, char *psname, u64 id, + char *data, size_t size, + struct timespec time, int (*erase)(u64)); +extern int pstore_is_mounted(void); + +extern struct kobj_attribute pstore_kmsg_bytes_attr; diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c new file mode 100644 index 00000000000..705fdf8abf6 --- /dev/null +++ b/fs/pstore/platform.c @@ -0,0 +1,202 @@ +/* + * Persistent Storage - platform driver interface parts. + * + * Copyright (C) 2010 Intel Corporation <tony.luck@intel.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/atomic.h> +#include <linux/types.h> +#include <linux/errno.h> +#include <linux/init.h> +#include <linux/kmsg_dump.h> +#include <linux/module.h> +#include <linux/pstore.h> +#include <linux/string.h> +#include <linux/slab.h> +#include <linux/uaccess.h> + +#include "internal.h" + +/* + * pstore_lock just protects "psinfo" during + * calls to pstore_register() + */ +static DEFINE_SPINLOCK(pstore_lock); +static struct pstore_info *psinfo; + +/* How much of the console log to snapshot. /sys/fs/pstore/kmsg_bytes */ +static unsigned long kmsg_bytes = 10240; + +static ssize_t b_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%lu\n", kmsg_bytes); +} + +static ssize_t b_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + return (sscanf(buf, "%lu", &kmsg_bytes) > 0) ? count : 0; +} + +struct kobj_attribute pstore_kmsg_bytes_attr = + __ATTR(kmsg_bytes, S_IRUGO | S_IWUSR, b_show, b_store); + +/* Tag each group of saved records with a sequence number */ +static int oopscount; + +/* + * callback from kmsg_dump. (s2,l2) has the most recently + * written bytes, older bytes are in (s1,l1). Save as much + * as we can from the end of the buffer. + */ +static void pstore_dump(struct kmsg_dumper *dumper, + enum kmsg_dump_reason reason, + const char *s1, unsigned long l1, + const char *s2, unsigned long l2) +{ + unsigned long s1_start, s2_start; + unsigned long l1_cpy, l2_cpy; + unsigned long size, total = 0; + char *dst; + u64 id; + int hsize, part = 1; + + mutex_lock(&psinfo->buf_mutex); + oopscount++; + while (total < kmsg_bytes) { + dst = psinfo->buf; + hsize = sprintf(dst, "Oops#%d Part%d\n", oopscount, part++); + size = psinfo->bufsize - hsize; + dst += hsize; + + l2_cpy = min(l2, size); + l1_cpy = min(l1, size - l2_cpy); + + if (l1_cpy + l2_cpy == 0) + break; + + s2_start = l2 - l2_cpy; + s1_start = l1 - l1_cpy; + + memcpy(dst, s1 + s1_start, l1_cpy); + memcpy(dst + l1_cpy, s2 + s2_start, l2_cpy); + + id = psinfo->write(PSTORE_TYPE_DMESG, hsize + l1_cpy + l2_cpy); + if (pstore_is_mounted()) + pstore_mkfile(PSTORE_TYPE_DMESG, psinfo->name, id, + psinfo->buf, hsize + l1_cpy + l2_cpy, + CURRENT_TIME, psinfo->erase); + l1 -= l1_cpy; + l2 -= l2_cpy; + total += l1_cpy + l2_cpy; + } + mutex_unlock(&psinfo->buf_mutex); +} + +static struct kmsg_dumper pstore_dumper = { + .dump = pstore_dump, +}; + +/* + * platform specific persistent storage driver registers with + * us here. If pstore is already mounted, call the platform + * read function right away to populate the file system. If not + * then the pstore mount code will call us later to fill out + * the file system. + * + * Register with kmsg_dump to save last part of console log on panic. + */ +int pstore_register(struct pstore_info *psi) +{ + struct module *owner = psi->owner; + + spin_lock(&pstore_lock); + if (psinfo) { + spin_unlock(&pstore_lock); + return -EBUSY; + } + psinfo = psi; + spin_unlock(&pstore_lock); + + if (owner && !try_module_get(owner)) { + psinfo = NULL; + return -EINVAL; + } + + if (pstore_is_mounted()) + pstore_get_records(); + + kmsg_dump_register(&pstore_dumper); + + return 0; +} +EXPORT_SYMBOL_GPL(pstore_register); + +/* + * Read all the records from the persistent store. Create and + * file files in our filesystem. + */ +void pstore_get_records(void) +{ + struct pstore_info *psi = psinfo; + size_t size; + u64 id; + enum pstore_type_id type; + struct timespec time; + int failed = 0; + + if (!psi) + return; + + mutex_lock(&psinfo->buf_mutex); + while ((size = psi->read(&id, &type, &time)) > 0) { + if (pstore_mkfile(type, psi->name, id, psi->buf, size, + time, psi->erase)) + failed++; + } + mutex_unlock(&psinfo->buf_mutex); + + if (failed) + printk(KERN_WARNING "pstore: failed to load %d record(s) from '%s'\n", + failed, psi->name); +} + +/* + * Call platform driver to write a record to the + * persistent store. + */ +int pstore_write(enum pstore_type_id type, char *buf, size_t size) +{ + u64 id; + + if (!psinfo) + return -ENODEV; + + if (size > psinfo->bufsize) + return -EFBIG; + + mutex_lock(&psinfo->buf_mutex); + memcpy(psinfo->buf, buf, size); + id = psinfo->write(type, size); + if (pstore_is_mounted()) + pstore_mkfile(PSTORE_TYPE_DMESG, psinfo->name, id, psinfo->buf, + size, CURRENT_TIME, psinfo->erase); + mutex_unlock(&psinfo->buf_mutex); + + return 0; +} +EXPORT_SYMBOL_GPL(pstore_write); diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c index 65444d29406..f1ab3604db5 100644 --- a/fs/quota/quota_v2.c +++ b/fs/quota/quota_v2.c @@ -112,7 +112,7 @@ static int v2_read_file_info(struct super_block *sb, int type) if (!info->dqi_priv) { printk(KERN_WARNING "Not enough memory for quota information structure.\n"); - return -1; + return -ENOMEM; } qinfo = info->dqi_priv; if (version == 0) { diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 0bae036831e..1bba24bad82 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -1593,8 +1593,13 @@ int reiserfs_encode_fh(struct dentry *dentry, __u32 * data, int *lenp, struct inode *inode = dentry->d_inode; int maxlen = *lenp; - if (maxlen < 3) + if (need_parent && (maxlen < 5)) { + *lenp = 5; return 255; + } else if (maxlen < 3) { + *lenp = 3; + return 255; + } data[0] = inode->i_ino; data[1] = le32_to_cpu(INODE_PKEY(inode)->k_dir_id); diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 3eea859e699..c77514bd577 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -2876,7 +2876,7 @@ int journal_init(struct super_block *sb, const char *j_dev_name, reiserfs_mounted_fs_count++; if (reiserfs_mounted_fs_count <= 1) { reiserfs_write_unlock(sb); - commit_wq = create_workqueue("reiserfs"); + commit_wq = alloc_workqueue("reiserfs", WQ_MEM_RECLAIM, 0); reiserfs_write_lock(sb); } diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index 68fdf45cc6c..118662690cd 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -593,7 +593,7 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, int mode, new_inode_init(inode, dir, mode); jbegin_count += reiserfs_cache_default_acl(dir); - retval = reiserfs_security_init(dir, inode, &security); + retval = reiserfs_security_init(dir, inode, &dentry->d_name, &security); if (retval < 0) { drop_new_inode(inode); return retval; @@ -667,7 +667,7 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, int mode, new_inode_init(inode, dir, mode); jbegin_count += reiserfs_cache_default_acl(dir); - retval = reiserfs_security_init(dir, inode, &security); + retval = reiserfs_security_init(dir, inode, &dentry->d_name, &security); if (retval < 0) { drop_new_inode(inode); return retval; @@ -747,7 +747,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) new_inode_init(inode, dir, mode); jbegin_count += reiserfs_cache_default_acl(dir); - retval = reiserfs_security_init(dir, inode, &security); + retval = reiserfs_security_init(dir, inode, &dentry->d_name, &security); if (retval < 0) { drop_new_inode(inode); return retval; @@ -1032,7 +1032,8 @@ static int reiserfs_symlink(struct inode *parent_dir, } new_inode_init(inode, parent_dir, mode); - retval = reiserfs_security_init(parent_dir, inode, &security); + retval = reiserfs_security_init(parent_dir, inode, &dentry->d_name, + &security); if (retval < 0) { drop_new_inode(inode); return retval; @@ -1122,10 +1123,6 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir, reiserfs_write_unlock(dir->i_sb); return -EMLINK; } - if (inode->i_nlink == 0) { - reiserfs_write_unlock(dir->i_sb); - return -ENOENT; - } /* inc before scheduling so reiserfs_unlink knows we are here */ inc_nlink(inode); diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 3cfb2e93364..5c11ca82b78 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -978,8 +978,6 @@ int reiserfs_permission(struct inode *inode, int mask, unsigned int flags) static int xattr_hide_revalidate(struct dentry *dentry, struct nameidata *nd) { - if (nd->flags & LOOKUP_RCU) - return -ECHILD; return -EPERM; } diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c index 237c6928d3c..ef66c18a933 100644 --- a/fs/reiserfs/xattr_security.c +++ b/fs/reiserfs/xattr_security.c @@ -54,6 +54,7 @@ static size_t security_list(struct dentry *dentry, char *list, size_t list_len, * of blocks needed for the transaction. If successful, reiserfs_security * must be released using reiserfs_security_free when the caller is done. */ int reiserfs_security_init(struct inode *dir, struct inode *inode, + const struct qstr *qstr, struct reiserfs_security_handle *sec) { int blocks = 0; @@ -65,7 +66,7 @@ int reiserfs_security_init(struct inode *dir, struct inode *inode, if (IS_PRIVATE(dir)) return 0; - error = security_inode_init_security(inode, dir, &sec->name, + error = security_inode_init_security(inode, dir, qstr, &sec->name, &sec->value, &sec->length); if (error) { if (error == -EOPNOTSUPP) diff --git a/fs/stat.c b/fs/stat.c index d5c61cf2b70..961039121cb 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -75,13 +75,16 @@ int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat, int error = -EINVAL; int lookup_flags = 0; - if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT)) != 0) + if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT | + AT_EMPTY_PATH)) != 0) goto out; if (!(flag & AT_SYMLINK_NOFOLLOW)) lookup_flags |= LOOKUP_FOLLOW; if (flag & AT_NO_AUTOMOUNT) lookup_flags |= LOOKUP_NO_AUTOMOUNT; + if (flag & AT_EMPTY_PATH) + lookup_flags |= LOOKUP_EMPTY; error = user_path_at(dfd, filename, lookup_flags, &path); if (error) @@ -297,7 +300,7 @@ SYSCALL_DEFINE4(readlinkat, int, dfd, const char __user *, pathname, if (bufsiz <= 0) return -EINVAL; - error = user_path_at(dfd, pathname, 0, &path); + error = user_path_at(dfd, pathname, LOOKUP_EMPTY, &path); if (!error) { struct inode *inode = path.dentry->d_inode; diff --git a/fs/statfs.c b/fs/statfs.c index 30ea8c8a996..8244924dec5 100644 --- a/fs/statfs.c +++ b/fs/statfs.c @@ -73,149 +73,135 @@ int vfs_statfs(struct path *path, struct kstatfs *buf) } EXPORT_SYMBOL(vfs_statfs); -static int do_statfs_native(struct path *path, struct statfs *buf) +int user_statfs(const char __user *pathname, struct kstatfs *st) { - struct kstatfs st; - int retval; + struct path path; + int error = user_path(pathname, &path); + if (!error) { + error = vfs_statfs(&path, st); + path_put(&path); + } + return error; +} - retval = vfs_statfs(path, &st); - if (retval) - return retval; +int fd_statfs(int fd, struct kstatfs *st) +{ + struct file *file = fget(fd); + int error = -EBADF; + if (file) { + error = vfs_statfs(&file->f_path, st); + fput(file); + } + return error; +} - if (sizeof(*buf) == sizeof(st)) - memcpy(buf, &st, sizeof(st)); +static int do_statfs_native(struct kstatfs *st, struct statfs __user *p) +{ + struct statfs buf; + + if (sizeof(buf) == sizeof(*st)) + memcpy(&buf, st, sizeof(*st)); else { - if (sizeof buf->f_blocks == 4) { - if ((st.f_blocks | st.f_bfree | st.f_bavail | - st.f_bsize | st.f_frsize) & + if (sizeof buf.f_blocks == 4) { + if ((st->f_blocks | st->f_bfree | st->f_bavail | + st->f_bsize | st->f_frsize) & 0xffffffff00000000ULL) return -EOVERFLOW; /* * f_files and f_ffree may be -1; it's okay to stuff * that into 32 bits */ - if (st.f_files != -1 && - (st.f_files & 0xffffffff00000000ULL)) + if (st->f_files != -1 && + (st->f_files & 0xffffffff00000000ULL)) return -EOVERFLOW; - if (st.f_ffree != -1 && - (st.f_ffree & 0xffffffff00000000ULL)) + if (st->f_ffree != -1 && + (st->f_ffree & 0xffffffff00000000ULL)) return -EOVERFLOW; } - buf->f_type = st.f_type; - buf->f_bsize = st.f_bsize; - buf->f_blocks = st.f_blocks; - buf->f_bfree = st.f_bfree; - buf->f_bavail = st.f_bavail; - buf->f_files = st.f_files; - buf->f_ffree = st.f_ffree; - buf->f_fsid = st.f_fsid; - buf->f_namelen = st.f_namelen; - buf->f_frsize = st.f_frsize; - buf->f_flags = st.f_flags; - memset(buf->f_spare, 0, sizeof(buf->f_spare)); + buf.f_type = st->f_type; + buf.f_bsize = st->f_bsize; + buf.f_blocks = st->f_blocks; + buf.f_bfree = st->f_bfree; + buf.f_bavail = st->f_bavail; + buf.f_files = st->f_files; + buf.f_ffree = st->f_ffree; + buf.f_fsid = st->f_fsid; + buf.f_namelen = st->f_namelen; + buf.f_frsize = st->f_frsize; + buf.f_flags = st->f_flags; + memset(buf.f_spare, 0, sizeof(buf.f_spare)); } + if (copy_to_user(p, &buf, sizeof(buf))) + return -EFAULT; return 0; } -static int do_statfs64(struct path *path, struct statfs64 *buf) +static int do_statfs64(struct kstatfs *st, struct statfs64 __user *p) { - struct kstatfs st; - int retval; - - retval = vfs_statfs(path, &st); - if (retval) - return retval; - - if (sizeof(*buf) == sizeof(st)) - memcpy(buf, &st, sizeof(st)); + struct statfs64 buf; + if (sizeof(buf) == sizeof(*st)) + memcpy(&buf, st, sizeof(*st)); else { - buf->f_type = st.f_type; - buf->f_bsize = st.f_bsize; - buf->f_blocks = st.f_blocks; - buf->f_bfree = st.f_bfree; - buf->f_bavail = st.f_bavail; - buf->f_files = st.f_files; - buf->f_ffree = st.f_ffree; - buf->f_fsid = st.f_fsid; - buf->f_namelen = st.f_namelen; - buf->f_frsize = st.f_frsize; - buf->f_flags = st.f_flags; - memset(buf->f_spare, 0, sizeof(buf->f_spare)); + buf.f_type = st->f_type; + buf.f_bsize = st->f_bsize; + buf.f_blocks = st->f_blocks; + buf.f_bfree = st->f_bfree; + buf.f_bavail = st->f_bavail; + buf.f_files = st->f_files; + buf.f_ffree = st->f_ffree; + buf.f_fsid = st->f_fsid; + buf.f_namelen = st->f_namelen; + buf.f_frsize = st->f_frsize; + buf.f_flags = st->f_flags; + memset(buf.f_spare, 0, sizeof(buf.f_spare)); } + if (copy_to_user(p, &buf, sizeof(buf))) + return -EFAULT; return 0; } SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct statfs __user *, buf) { - struct path path; - int error; - - error = user_path(pathname, &path); - if (!error) { - struct statfs tmp; - error = do_statfs_native(&path, &tmp); - if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) - error = -EFAULT; - path_put(&path); - } + struct kstatfs st; + int error = user_statfs(pathname, &st); + if (!error) + error = do_statfs_native(&st, buf); return error; } SYSCALL_DEFINE3(statfs64, const char __user *, pathname, size_t, sz, struct statfs64 __user *, buf) { - struct path path; - long error; - + struct kstatfs st; + int error; if (sz != sizeof(*buf)) return -EINVAL; - error = user_path(pathname, &path); - if (!error) { - struct statfs64 tmp; - error = do_statfs64(&path, &tmp); - if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) - error = -EFAULT; - path_put(&path); - } + error = user_statfs(pathname, &st); + if (!error) + error = do_statfs64(&st, buf); return error; } SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct statfs __user *, buf) { - struct file *file; - struct statfs tmp; - int error; - - error = -EBADF; - file = fget(fd); - if (!file) - goto out; - error = do_statfs_native(&file->f_path, &tmp); - if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) - error = -EFAULT; - fput(file); -out: + struct kstatfs st; + int error = fd_statfs(fd, &st); + if (!error) + error = do_statfs_native(&st, buf); return error; } SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user *, buf) { - struct file *file; - struct statfs64 tmp; + struct kstatfs st; int error; if (sz != sizeof(*buf)) return -EINVAL; - error = -EBADF; - file = fget(fd); - if (!file) - goto out; - error = do_statfs64(&file->f_path, &tmp); - if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) - error = -EFAULT; - fput(file); -out: + error = fd_statfs(fd, &st); + if (!error) + error = do_statfs64(&st, buf); return error; } diff --git a/fs/super.c b/fs/super.c index 7e9dd4cc2c0..e8486490826 100644 --- a/fs/super.c +++ b/fs/super.c @@ -843,23 +843,6 @@ error: } EXPORT_SYMBOL(mount_bdev); -int get_sb_bdev(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, - int (*fill_super)(struct super_block *, void *, int), - struct vfsmount *mnt) -{ - struct dentry *root; - - root = mount_bdev(fs_type, flags, dev_name, data, fill_super); - if (IS_ERR(root)) - return PTR_ERR(root); - mnt->mnt_root = root; - mnt->mnt_sb = root->d_sb; - return 0; -} - -EXPORT_SYMBOL(get_sb_bdev); - void kill_block_super(struct super_block *sb) { struct block_device *bdev = sb->s_bdev; @@ -897,22 +880,6 @@ struct dentry *mount_nodev(struct file_system_type *fs_type, } EXPORT_SYMBOL(mount_nodev); -int get_sb_nodev(struct file_system_type *fs_type, - int flags, void *data, - int (*fill_super)(struct super_block *, void *, int), - struct vfsmount *mnt) -{ - struct dentry *root; - - root = mount_nodev(fs_type, flags, data, fill_super); - if (IS_ERR(root)) - return PTR_ERR(root); - mnt->mnt_root = root; - mnt->mnt_sb = root->d_sb; - return 0; -} -EXPORT_SYMBOL(get_sb_nodev); - static int compare_single(struct super_block *s, void *p) { return 1; @@ -943,69 +910,35 @@ struct dentry *mount_single(struct file_system_type *fs_type, } EXPORT_SYMBOL(mount_single); -int get_sb_single(struct file_system_type *fs_type, - int flags, void *data, - int (*fill_super)(struct super_block *, void *, int), - struct vfsmount *mnt) -{ - struct dentry *root; - root = mount_single(fs_type, flags, data, fill_super); - if (IS_ERR(root)) - return PTR_ERR(root); - mnt->mnt_root = root; - mnt->mnt_sb = root->d_sb; - return 0; -} - -EXPORT_SYMBOL(get_sb_single); - -struct vfsmount * -vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data) +struct dentry * +mount_fs(struct file_system_type *type, int flags, const char *name, void *data) { - struct vfsmount *mnt; struct dentry *root; + struct super_block *sb; char *secdata = NULL; - int error; - - if (!type) - return ERR_PTR(-ENODEV); - - error = -ENOMEM; - mnt = alloc_vfsmnt(name); - if (!mnt) - goto out; - - if (flags & MS_KERNMOUNT) - mnt->mnt_flags = MNT_INTERNAL; + int error = -ENOMEM; if (data && !(type->fs_flags & FS_BINARY_MOUNTDATA)) { secdata = alloc_secdata(); if (!secdata) - goto out_mnt; + goto out; error = security_sb_copy_data(data, secdata); if (error) goto out_free_secdata; } - if (type->mount) { - root = type->mount(type, flags, name, data); - if (IS_ERR(root)) { - error = PTR_ERR(root); - goto out_free_secdata; - } - mnt->mnt_root = root; - mnt->mnt_sb = root->d_sb; - } else { - error = type->get_sb(type, flags, name, data, mnt); - if (error < 0) - goto out_free_secdata; + root = type->mount(type, flags, name, data); + if (IS_ERR(root)) { + error = PTR_ERR(root); + goto out_free_secdata; } - BUG_ON(!mnt->mnt_sb); - WARN_ON(!mnt->mnt_sb->s_bdi); - mnt->mnt_sb->s_flags |= MS_BORN; + sb = root->d_sb; + BUG_ON(!sb); + WARN_ON(!sb->s_bdi); + sb->s_flags |= MS_BORN; - error = security_sb_kern_mount(mnt->mnt_sb, flags, secdata); + error = security_sb_kern_mount(sb, flags, secdata); if (error) goto out_sb; @@ -1016,27 +949,21 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void * violate this rule. This warning should be either removed or * converted to a BUG() in 2.6.34. */ - WARN((mnt->mnt_sb->s_maxbytes < 0), "%s set sb->s_maxbytes to " - "negative value (%lld)\n", type->name, mnt->mnt_sb->s_maxbytes); + WARN((sb->s_maxbytes < 0), "%s set sb->s_maxbytes to " + "negative value (%lld)\n", type->name, sb->s_maxbytes); - mnt->mnt_mountpoint = mnt->mnt_root; - mnt->mnt_parent = mnt; - up_write(&mnt->mnt_sb->s_umount); + up_write(&sb->s_umount); free_secdata(secdata); - return mnt; + return root; out_sb: - dput(mnt->mnt_root); - deactivate_locked_super(mnt->mnt_sb); + dput(root); + deactivate_locked_super(sb); out_free_secdata: free_secdata(secdata); -out_mnt: - free_vfsmnt(mnt); out: return ERR_PTR(error); } -EXPORT_SYMBOL_GPL(vfs_kern_mount); - /** * freeze_super - lock the filesystem and force it into a consistent state * @sb: the super to lock @@ -1126,49 +1053,3 @@ out: return 0; } EXPORT_SYMBOL(thaw_super); - -static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype) -{ - int err; - const char *subtype = strchr(fstype, '.'); - if (subtype) { - subtype++; - err = -EINVAL; - if (!subtype[0]) - goto err; - } else - subtype = ""; - - mnt->mnt_sb->s_subtype = kstrdup(subtype, GFP_KERNEL); - err = -ENOMEM; - if (!mnt->mnt_sb->s_subtype) - goto err; - return mnt; - - err: - mntput(mnt); - return ERR_PTR(err); -} - -struct vfsmount * -do_kern_mount(const char *fstype, int flags, const char *name, void *data) -{ - struct file_system_type *type = get_fs_type(fstype); - struct vfsmount *mnt; - if (!type) - return ERR_PTR(-ENODEV); - mnt = vfs_kern_mount(type, flags, name, data); - if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) && - !mnt->mnt_sb->s_subtype) - mnt = fs_set_subtype(mnt, fstype); - put_filesystem(type); - return mnt; -} -EXPORT_SYMBOL_GPL(do_kern_mount); - -struct vfsmount *kern_mount_data(struct file_system_type *type, void *data) -{ - return vfs_kern_mount(type, MS_KERNMOUNT, type->name, data); -} - -EXPORT_SYMBOL_GPL(kern_mount_data); diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig index 830e3f76f44..1d1859dc3de 100644 --- a/fs/ubifs/Kconfig +++ b/fs/ubifs/Kconfig @@ -44,23 +44,20 @@ config UBIFS_FS_ZLIB # Debugging-related stuff config UBIFS_FS_DEBUG - bool "Enable debugging" + bool "Enable debugging support" depends on UBIFS_FS select DEBUG_FS select KALLSYMS_ALL help - This option enables UBIFS debugging. - -config UBIFS_FS_DEBUG_MSG_LVL - int "Default message level (0 = no extra messages, 3 = lots)" - depends on UBIFS_FS_DEBUG - default "0" - help - This controls the amount of debugging messages produced by UBIFS. - If reporting bugs, please try to have available a full dump of the - messages at level 1 while the misbehaviour was occurring. Level 2 - may become necessary if level 1 messages were not enough to find the - bug. Generally Level 3 should be avoided. + This option enables UBIFS debugging support. It makes sure various + assertions, self-checks, debugging messages and test modes are compiled + in (this all is compiled out otherwise). Assertions are light-weight + and this option also enables them. Self-checks, debugging messages and + test modes are switched off by default. Thus, it is safe and actually + recommended to have debugging support enabled, and it should not slow + down UBIFS. You can then further enable / disable individual debugging + features using UBIFS module parameters and the corresponding sysfs + interfaces. config UBIFS_FS_DEBUG_CHKS bool "Enable extra checks" diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c index 02429d81ca3..b148fbc80f8 100644 --- a/fs/ubifs/commit.c +++ b/fs/ubifs/commit.c @@ -48,6 +48,56 @@ #include <linux/slab.h> #include "ubifs.h" +/* + * nothing_to_commit - check if there is nothing to commit. + * @c: UBIFS file-system description object + * + * This is a helper function which checks if there is anything to commit. It is + * used as an optimization to avoid starting the commit if it is not really + * necessary. Indeed, the commit operation always assumes flash I/O (e.g., + * writing the commit start node to the log), and it is better to avoid doing + * this unnecessarily. E.g., 'ubifs_sync_fs()' runs the commit, but if there is + * nothing to commit, it is more optimal to avoid any flash I/O. + * + * This function has to be called with @c->commit_sem locked for writing - + * this function does not take LPT/TNC locks because the @c->commit_sem + * guarantees that we have exclusive access to the TNC and LPT data structures. + * + * This function returns %1 if there is nothing to commit and %0 otherwise. + */ +static int nothing_to_commit(struct ubifs_info *c) +{ + /* + * During mounting or remounting from R/O mode to R/W mode we may + * commit for various recovery-related reasons. + */ + if (c->mounting || c->remounting_rw) + return 0; + + /* + * If the root TNC node is dirty, we definitely have something to + * commit. + */ + if (c->zroot.znode && test_bit(DIRTY_ZNODE, &c->zroot.znode->flags)) + return 0; + + /* + * Even though the TNC is clean, the LPT tree may have dirty nodes. For + * example, this may happen if the budgeting subsystem invoked GC to + * make some free space, and the GC found an LEB with only dirty and + * free space. In this case GC would just change the lprops of this + * LEB (by turning all space into free space) and unmap it. + */ + if (c->nroot && test_bit(DIRTY_CNODE, &c->nroot->flags)) + return 0; + + ubifs_assert(atomic_long_read(&c->dirty_zn_cnt) == 0); + ubifs_assert(c->dirty_pn_cnt == 0); + ubifs_assert(c->dirty_nn_cnt == 0); + + return 1; +} + /** * do_commit - commit the journal. * @c: UBIFS file-system description object @@ -70,6 +120,12 @@ static int do_commit(struct ubifs_info *c) goto out_up; } + if (nothing_to_commit(c)) { + up_write(&c->commit_sem); + err = 0; + goto out_cancel; + } + /* Sync all write buffers (necessary for recovery) */ for (i = 0; i < c->jhead_cnt; i++) { err = ubifs_wbuf_sync(&c->jheads[i].wbuf); @@ -162,12 +218,12 @@ static int do_commit(struct ubifs_info *c) if (err) goto out; +out_cancel: spin_lock(&c->cs_lock); c->cmt_state = COMMIT_RESTING; wake_up(&c->cmt_wq); dbg_cmt("commit end"); spin_unlock(&c->cs_lock); - return 0; out_up: diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 0bee4dbffc3..01c2b028e52 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -43,8 +43,8 @@ DEFINE_SPINLOCK(dbg_lock); static char dbg_key_buf0[128]; static char dbg_key_buf1[128]; -unsigned int ubifs_msg_flags = UBIFS_MSG_FLAGS_DEFAULT; -unsigned int ubifs_chk_flags = UBIFS_CHK_FLAGS_DEFAULT; +unsigned int ubifs_msg_flags; +unsigned int ubifs_chk_flags; unsigned int ubifs_tst_flags; module_param_named(debug_msgs, ubifs_msg_flags, uint, S_IRUGO | S_IWUSR); @@ -810,16 +810,24 @@ void dbg_dump_leb(const struct ubifs_info *c, int lnum) { struct ubifs_scan_leb *sleb; struct ubifs_scan_node *snod; + void *buf; if (dbg_failure_mode) return; printk(KERN_DEBUG "(pid %d) start dumping LEB %d\n", current->pid, lnum); - sleb = ubifs_scan(c, lnum, 0, c->dbg->buf, 0); + + buf = __vmalloc(c->leb_size, GFP_KERNEL | GFP_NOFS, PAGE_KERNEL); + if (!buf) { + ubifs_err("cannot allocate memory for dumping LEB %d", lnum); + return; + } + + sleb = ubifs_scan(c, lnum, 0, buf, 0); if (IS_ERR(sleb)) { ubifs_err("scan error %d", (int)PTR_ERR(sleb)); - return; + goto out; } printk(KERN_DEBUG "LEB %d has %d nodes ending at %d\n", lnum, @@ -835,6 +843,9 @@ void dbg_dump_leb(const struct ubifs_info *c, int lnum) printk(KERN_DEBUG "(pid %d) finish dumping LEB %d\n", current->pid, lnum); ubifs_scan_destroy(sleb); + +out: + vfree(buf); return; } @@ -2690,16 +2701,8 @@ int ubifs_debugging_init(struct ubifs_info *c) if (!c->dbg) return -ENOMEM; - c->dbg->buf = vmalloc(c->leb_size); - if (!c->dbg->buf) - goto out; - failure_mode_init(c); return 0; - -out: - kfree(c->dbg); - return -ENOMEM; } /** @@ -2709,7 +2712,6 @@ out: void ubifs_debugging_exit(struct ubifs_info *c) { failure_mode_exit(c); - vfree(c->dbg->buf); kfree(c->dbg); } @@ -2813,19 +2815,19 @@ int dbg_debugfs_init_fs(struct ubifs_info *c) } fname = "dump_lprops"; - dent = debugfs_create_file(fname, S_IWUGO, d->dfs_dir, c, &dfs_fops); + dent = debugfs_create_file(fname, S_IWUSR, d->dfs_dir, c, &dfs_fops); if (IS_ERR(dent)) goto out_remove; d->dfs_dump_lprops = dent; fname = "dump_budg"; - dent = debugfs_create_file(fname, S_IWUGO, d->dfs_dir, c, &dfs_fops); + dent = debugfs_create_file(fname, S_IWUSR, d->dfs_dir, c, &dfs_fops); if (IS_ERR(dent)) goto out_remove; d->dfs_dump_budg = dent; fname = "dump_tnc"; - dent = debugfs_create_file(fname, S_IWUGO, d->dfs_dir, c, &dfs_fops); + dent = debugfs_create_file(fname, S_IWUSR, d->dfs_dir, c, &dfs_fops); if (IS_ERR(dent)) goto out_remove; d->dfs_dump_tnc = dent; diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h index 69ebe472915..919f0de29d8 100644 --- a/fs/ubifs/debug.h +++ b/fs/ubifs/debug.h @@ -27,7 +27,6 @@ /** * ubifs_debug_info - per-FS debugging information. - * @buf: a buffer of LEB size, used for various purposes * @old_zroot: old index root - used by 'dbg_check_old_index()' * @old_zroot_level: old index root level - used by 'dbg_check_old_index()' * @old_zroot_sqnum: old index root sqnum - used by 'dbg_check_old_index()' @@ -54,7 +53,6 @@ * dfs_dump_tnc: "dump TNC" debugfs knob */ struct ubifs_debug_info { - void *buf; struct ubifs_zbranch old_zroot; int old_zroot_level; unsigned long long old_zroot_sqnum; @@ -173,7 +171,7 @@ const char *dbg_key_str1(const struct ubifs_info *c, #define dbg_rcvry(fmt, ...) dbg_do_msg(UBIFS_MSG_RCVRY, fmt, ##__VA_ARGS__) /* - * Debugging message type flags (must match msg_type_names in debug.c). + * Debugging message type flags. * * UBIFS_MSG_GEN: general messages * UBIFS_MSG_JNL: journal messages @@ -205,14 +203,8 @@ enum { UBIFS_MSG_RCVRY = 0x1000, }; -/* Debugging message type flags for each default debug message level */ -#define UBIFS_MSG_LVL_0 0 -#define UBIFS_MSG_LVL_1 0x1 -#define UBIFS_MSG_LVL_2 0x7f -#define UBIFS_MSG_LVL_3 0xffff - /* - * Debugging check flags (must match chk_names in debug.c). + * Debugging check flags. * * UBIFS_CHK_GEN: general checks * UBIFS_CHK_TNC: check TNC @@ -233,7 +225,7 @@ enum { }; /* - * Special testing flags (must match tst_names in debug.c). + * Special testing flags. * * UBIFS_TST_FORCE_IN_THE_GAPS: force the use of in-the-gaps method * UBIFS_TST_RCVRY: failure mode for recovery testing @@ -243,22 +235,6 @@ enum { UBIFS_TST_RCVRY = 0x4, }; -#if CONFIG_UBIFS_FS_DEBUG_MSG_LVL == 1 -#define UBIFS_MSG_FLAGS_DEFAULT UBIFS_MSG_LVL_1 -#elif CONFIG_UBIFS_FS_DEBUG_MSG_LVL == 2 -#define UBIFS_MSG_FLAGS_DEFAULT UBIFS_MSG_LVL_2 -#elif CONFIG_UBIFS_FS_DEBUG_MSG_LVL == 3 -#define UBIFS_MSG_FLAGS_DEFAULT UBIFS_MSG_LVL_3 -#else -#define UBIFS_MSG_FLAGS_DEFAULT UBIFS_MSG_LVL_0 -#endif - -#ifdef CONFIG_UBIFS_FS_DEBUG_CHKS -#define UBIFS_CHK_FLAGS_DEFAULT 0xffffffff -#else -#define UBIFS_CHK_FLAGS_DEFAULT 0 -#endif - extern spinlock_t dbg_lock; extern unsigned int ubifs_msg_flags; diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 14f64b689d7..7217d67a80a 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -522,24 +522,6 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir, ubifs_assert(mutex_is_locked(&dir->i_mutex)); ubifs_assert(mutex_is_locked(&inode->i_mutex)); - /* - * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing - * otherwise has the potential to corrupt the orphan inode list. - * - * Indeed, consider a scenario when 'vfs_link(dirA/fileA)' and - * 'vfs_unlink(dirA/fileA, dirB/fileB)' race. 'vfs_link()' does not - * lock 'dirA->i_mutex', so this is possible. Both of the functions - * lock 'fileA->i_mutex' though. Suppose 'vfs_unlink()' wins, and takes - * 'fileA->i_mutex' mutex first. Suppose 'fileA->i_nlink' is 1. In this - * case 'ubifs_unlink()' will drop the last reference, and put 'inodeA' - * to the list of orphans. After this, 'vfs_link()' will link - * 'dirB/fileB' to 'inodeA'. This is a problem because, for example, - * the subsequent 'vfs_unlink(dirB/fileB)' will add the same inode - * to the list of orphans. - */ - if (inode->i_nlink == 0) - return -ENOENT; - err = dbg_check_synced_i_size(inode); if (err) return err; diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c index d82173182ee..dfd168b7807 100644 --- a/fs/ubifs/io.c +++ b/fs/ubifs/io.c @@ -31,6 +31,26 @@ * buffer is full or when it is not used for some time (by timer). This is * similar to the mechanism is used by JFFS2. * + * UBIFS distinguishes between minimum write size (@c->min_io_size) and maximum + * write size (@c->max_write_size). The latter is the maximum amount of bytes + * the underlying flash is able to program at a time, and writing in + * @c->max_write_size units should presumably be faster. Obviously, + * @c->min_io_size <= @c->max_write_size. Write-buffers are of + * @c->max_write_size bytes in size for maximum performance. However, when a + * write-buffer is flushed, only the portion of it (aligned to @c->min_io_size + * boundary) which contains data is written, not the whole write-buffer, + * because this is more space-efficient. + * + * This optimization adds few complications to the code. Indeed, on the one + * hand, we want to write in optimal @c->max_write_size bytes chunks, which + * also means aligning writes at the @c->max_write_size bytes offsets. On the + * other hand, we do not want to waste space when synchronizing the write + * buffer, so during synchronization we writes in smaller chunks. And this makes + * the next write offset to be not aligned to @c->max_write_size bytes. So the + * have to make sure that the write-buffer offset (@wbuf->offs) becomes aligned + * to @c->max_write_size bytes again. We do this by temporarily shrinking + * write-buffer size (@wbuf->size). + * * Write-buffers are defined by 'struct ubifs_wbuf' objects and protected by * mutexes defined inside these objects. Since sometimes upper-level code * has to lock the write-buffer (e.g. journal space reservation code), many @@ -46,8 +66,8 @@ * UBIFS uses padding when it pads to the next min. I/O unit. In this case it * uses padding nodes or padding bytes, if the padding node does not fit. * - * All UBIFS nodes are protected by CRC checksums and UBIFS checks all nodes - * every time they are read from the flash media. + * All UBIFS nodes are protected by CRC checksums and UBIFS checks CRC when + * they are read from the flash media. */ #include <linux/crc32.h> @@ -88,8 +108,12 @@ void ubifs_ro_mode(struct ubifs_info *c, int err) * This function may skip data nodes CRC checking if @c->no_chk_data_crc is * true, which is controlled by corresponding UBIFS mount option. However, if * @must_chk_crc is true, then @c->no_chk_data_crc is ignored and CRC is - * checked. Similarly, if @c->always_chk_crc is true, @c->no_chk_data_crc is - * ignored and CRC is checked. + * checked. Similarly, if @c->mounting or @c->remounting_rw is true (we are + * mounting or re-mounting to R/W mode), @c->no_chk_data_crc is ignored and CRC + * is checked. This is because during mounting or re-mounting from R/O mode to + * R/W mode we may read journal nodes (when replying the journal or doing the + * recovery) and the journal nodes may potentially be corrupted, so checking is + * required. * * This function returns zero in case of success and %-EUCLEAN in case of bad * CRC or magic. @@ -131,8 +155,8 @@ int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum, node_len > c->ranges[type].max_len) goto out_len; - if (!must_chk_crc && type == UBIFS_DATA_NODE && !c->always_chk_crc && - c->no_chk_data_crc) + if (!must_chk_crc && type == UBIFS_DATA_NODE && !c->mounting && + !c->remounting_rw && c->no_chk_data_crc) return 0; crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8); @@ -343,11 +367,17 @@ static void cancel_wbuf_timer_nolock(struct ubifs_wbuf *wbuf) * * This function synchronizes write-buffer @buf and returns zero in case of * success or a negative error code in case of failure. + * + * Note, although write-buffers are of @c->max_write_size, this function does + * not necessarily writes all @c->max_write_size bytes to the flash. Instead, + * if the write-buffer is only partially filled with data, only the used part + * of the write-buffer (aligned on @c->min_io_size boundary) is synchronized. + * This way we waste less space. */ int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf) { struct ubifs_info *c = wbuf->c; - int err, dirt; + int err, dirt, sync_len; cancel_wbuf_timer_nolock(wbuf); if (!wbuf->used || wbuf->lnum == -1) @@ -357,27 +387,53 @@ int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf) dbg_io("LEB %d:%d, %d bytes, jhead %s", wbuf->lnum, wbuf->offs, wbuf->used, dbg_jhead(wbuf->jhead)); ubifs_assert(!(wbuf->avail & 7)); - ubifs_assert(wbuf->offs + c->min_io_size <= c->leb_size); + ubifs_assert(wbuf->offs + wbuf->size <= c->leb_size); + ubifs_assert(wbuf->size >= c->min_io_size); + ubifs_assert(wbuf->size <= c->max_write_size); + ubifs_assert(wbuf->size % c->min_io_size == 0); ubifs_assert(!c->ro_media && !c->ro_mount); + if (c->leb_size - wbuf->offs >= c->max_write_size) + ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size )); if (c->ro_error) return -EROFS; - ubifs_pad(c, wbuf->buf + wbuf->used, wbuf->avail); + /* + * Do not write whole write buffer but write only the minimum necessary + * amount of min. I/O units. + */ + sync_len = ALIGN(wbuf->used, c->min_io_size); + dirt = sync_len - wbuf->used; + if (dirt) + ubifs_pad(c, wbuf->buf + wbuf->used, dirt); err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs, - c->min_io_size, wbuf->dtype); + sync_len, wbuf->dtype); if (err) { ubifs_err("cannot write %d bytes to LEB %d:%d", - c->min_io_size, wbuf->lnum, wbuf->offs); + sync_len, wbuf->lnum, wbuf->offs); dbg_dump_stack(); return err; } - dirt = wbuf->avail; - spin_lock(&wbuf->lock); - wbuf->offs += c->min_io_size; - wbuf->avail = c->min_io_size; + wbuf->offs += sync_len; + /* + * Now @wbuf->offs is not necessarily aligned to @c->max_write_size. + * But our goal is to optimize writes and make sure we write in + * @c->max_write_size chunks and to @c->max_write_size-aligned offset. + * Thus, if @wbuf->offs is not aligned to @c->max_write_size now, make + * sure that @wbuf->offs + @wbuf->size is aligned to + * @c->max_write_size. This way we make sure that after next + * write-buffer flush we are again at the optimal offset (aligned to + * @c->max_write_size). + */ + if (c->leb_size - wbuf->offs < c->max_write_size) + wbuf->size = c->leb_size - wbuf->offs; + else if (wbuf->offs & (c->max_write_size - 1)) + wbuf->size = ALIGN(wbuf->offs, c->max_write_size) - wbuf->offs; + else + wbuf->size = c->max_write_size; + wbuf->avail = wbuf->size; wbuf->used = 0; wbuf->next_ino = 0; spin_unlock(&wbuf->lock); @@ -420,7 +476,13 @@ int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs, spin_lock(&wbuf->lock); wbuf->lnum = lnum; wbuf->offs = offs; - wbuf->avail = c->min_io_size; + if (c->leb_size - wbuf->offs < c->max_write_size) + wbuf->size = c->leb_size - wbuf->offs; + else if (wbuf->offs & (c->max_write_size - 1)) + wbuf->size = ALIGN(wbuf->offs, c->max_write_size) - wbuf->offs; + else + wbuf->size = c->max_write_size; + wbuf->avail = wbuf->size; wbuf->used = 0; spin_unlock(&wbuf->lock); wbuf->dtype = dtype; @@ -500,8 +562,9 @@ out_timers: * * This function writes data to flash via write-buffer @wbuf. This means that * the last piece of the node won't reach the flash media immediately if it - * does not take whole minimal I/O unit. Instead, the node will sit in RAM - * until the write-buffer is synchronized (e.g., by timer). + * does not take whole max. write unit (@c->max_write_size). Instead, the node + * will sit in RAM until the write-buffer is synchronized (e.g., by timer, or + * because more data are appended to the write-buffer). * * This function returns zero in case of success and a negative error code in * case of failure. If the node cannot be written because there is no more @@ -518,9 +581,14 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) ubifs_assert(len > 0 && wbuf->lnum >= 0 && wbuf->lnum < c->leb_cnt); ubifs_assert(wbuf->offs >= 0 && wbuf->offs % c->min_io_size == 0); ubifs_assert(!(wbuf->offs & 7) && wbuf->offs <= c->leb_size); - ubifs_assert(wbuf->avail > 0 && wbuf->avail <= c->min_io_size); + ubifs_assert(wbuf->avail > 0 && wbuf->avail <= wbuf->size); + ubifs_assert(wbuf->size >= c->min_io_size); + ubifs_assert(wbuf->size <= c->max_write_size); + ubifs_assert(wbuf->size % c->min_io_size == 0); ubifs_assert(mutex_is_locked(&wbuf->io_mutex)); ubifs_assert(!c->ro_media && !c->ro_mount); + if (c->leb_size - wbuf->offs >= c->max_write_size) + ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size )); if (c->leb_size - wbuf->offs - wbuf->used < aligned_len) { err = -ENOSPC; @@ -543,14 +611,18 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) dbg_io("flush jhead %s wbuf to LEB %d:%d", dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs); err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, - wbuf->offs, c->min_io_size, + wbuf->offs, wbuf->size, wbuf->dtype); if (err) goto out; spin_lock(&wbuf->lock); - wbuf->offs += c->min_io_size; - wbuf->avail = c->min_io_size; + wbuf->offs += wbuf->size; + if (c->leb_size - wbuf->offs >= c->max_write_size) + wbuf->size = c->max_write_size; + else + wbuf->size = c->leb_size - wbuf->offs; + wbuf->avail = wbuf->size; wbuf->used = 0; wbuf->next_ino = 0; spin_unlock(&wbuf->lock); @@ -564,33 +636,57 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) goto exit; } - /* - * The node is large enough and does not fit entirely within current - * minimal I/O unit. We have to fill and flush write-buffer and switch - * to the next min. I/O unit. - */ - dbg_io("flush jhead %s wbuf to LEB %d:%d", - dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs); - memcpy(wbuf->buf + wbuf->used, buf, wbuf->avail); - err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs, - c->min_io_size, wbuf->dtype); - if (err) - goto out; + offs = wbuf->offs; + written = 0; - offs = wbuf->offs + c->min_io_size; - len -= wbuf->avail; - aligned_len -= wbuf->avail; - written = wbuf->avail; + if (wbuf->used) { + /* + * The node is large enough and does not fit entirely within + * current available space. We have to fill and flush + * write-buffer and switch to the next max. write unit. + */ + dbg_io("flush jhead %s wbuf to LEB %d:%d", + dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs); + memcpy(wbuf->buf + wbuf->used, buf, wbuf->avail); + err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs, + wbuf->size, wbuf->dtype); + if (err) + goto out; + + offs += wbuf->size; + len -= wbuf->avail; + aligned_len -= wbuf->avail; + written += wbuf->avail; + } else if (wbuf->offs & (c->max_write_size - 1)) { + /* + * The write-buffer offset is not aligned to + * @c->max_write_size and @wbuf->size is less than + * @c->max_write_size. Write @wbuf->size bytes to make sure the + * following writes are done in optimal @c->max_write_size + * chunks. + */ + dbg_io("write %d bytes to LEB %d:%d", + wbuf->size, wbuf->lnum, wbuf->offs); + err = ubi_leb_write(c->ubi, wbuf->lnum, buf, wbuf->offs, + wbuf->size, wbuf->dtype); + if (err) + goto out; + + offs += wbuf->size; + len -= wbuf->size; + aligned_len -= wbuf->size; + written += wbuf->size; + } /* - * The remaining data may take more whole min. I/O units, so write the - * remains multiple to min. I/O unit size directly to the flash media. + * The remaining data may take more whole max. write units, so write the + * remains multiple to max. write unit size directly to the flash media. * We align node length to 8-byte boundary because we anyway flash wbuf * if the remaining space is less than 8 bytes. */ - n = aligned_len >> c->min_io_shift; + n = aligned_len >> c->max_write_shift; if (n) { - n <<= c->min_io_shift; + n <<= c->max_write_shift; dbg_io("write %d bytes to LEB %d:%d", n, wbuf->lnum, offs); err = ubi_leb_write(c->ubi, wbuf->lnum, buf + written, offs, n, wbuf->dtype); @@ -606,14 +702,18 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) if (aligned_len) /* * And now we have what's left and what does not take whole - * min. I/O unit, so write it to the write-buffer and we are + * max. write unit, so write it to the write-buffer and we are * done. */ memcpy(wbuf->buf, buf + written, len); wbuf->offs = offs; + if (c->leb_size - wbuf->offs >= c->max_write_size) + wbuf->size = c->max_write_size; + else + wbuf->size = c->leb_size - wbuf->offs; + wbuf->avail = wbuf->size - aligned_len; wbuf->used = aligned_len; - wbuf->avail = c->min_io_size - aligned_len; wbuf->next_ino = 0; spin_unlock(&wbuf->lock); @@ -837,11 +937,11 @@ int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf) { size_t size; - wbuf->buf = kmalloc(c->min_io_size, GFP_KERNEL); + wbuf->buf = kmalloc(c->max_write_size, GFP_KERNEL); if (!wbuf->buf) return -ENOMEM; - size = (c->min_io_size / UBIFS_CH_SZ + 1) * sizeof(ino_t); + size = (c->max_write_size / UBIFS_CH_SZ + 1) * sizeof(ino_t); wbuf->inodes = kmalloc(size, GFP_KERNEL); if (!wbuf->inodes) { kfree(wbuf->buf); @@ -851,7 +951,14 @@ int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf) wbuf->used = 0; wbuf->lnum = wbuf->offs = -1; - wbuf->avail = c->min_io_size; + /* + * If the LEB starts at the max. write size aligned address, then + * write-buffer size has to be set to @c->max_write_size. Otherwise, + * set it to something smaller so that it ends at the closest max. + * write size boundary. + */ + size = c->max_write_size - (c->leb_start % c->max_write_size); + wbuf->avail = wbuf->size = size; wbuf->dtype = UBI_UNKNOWN; wbuf->sync_callback = NULL; mutex_init(&wbuf->io_mutex); diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index 914f1bd89e5..aed25e86422 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -690,7 +690,7 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode, { struct ubifs_data_node *data; int err, lnum, offs, compr_type, out_len; - int dlen = UBIFS_DATA_NODE_SZ + UBIFS_BLOCK_SIZE * WORST_COMPR_FACTOR; + int dlen = COMPRESSED_DATA_NODE_BUF_SZ, allocated = 1; struct ubifs_inode *ui = ubifs_inode(inode); dbg_jnl("ino %lu, blk %u, len %d, key %s", @@ -698,9 +698,19 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode, DBGKEY(key)); ubifs_assert(len <= UBIFS_BLOCK_SIZE); - data = kmalloc(dlen, GFP_NOFS); - if (!data) - return -ENOMEM; + data = kmalloc(dlen, GFP_NOFS | __GFP_NOWARN); + if (!data) { + /* + * Fall-back to the write reserve buffer. Note, we might be + * currently on the memory reclaim path, when the kernel is + * trying to free some memory by writing out dirty pages. The + * write reserve buffer helps us to guarantee that we are + * always able to write the data. + */ + allocated = 0; + mutex_lock(&c->write_reserve_mutex); + data = c->write_reserve_buf; + } data->ch.node_type = UBIFS_DATA_NODE; key_write(c, key, &data->key); @@ -736,7 +746,10 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode, goto out_ro; finish_reservation(c); - kfree(data); + if (!allocated) + mutex_unlock(&c->write_reserve_mutex); + else + kfree(data); return 0; out_release: @@ -745,7 +758,10 @@ out_ro: ubifs_ro_mode(c, err); finish_reservation(c); out_free: - kfree(data); + if (!allocated) + mutex_unlock(&c->write_reserve_mutex); + else + kfree(data); return err; } diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c index 4d4ca388889..c7b25e2f776 100644 --- a/fs/ubifs/lprops.c +++ b/fs/ubifs/lprops.c @@ -1035,7 +1035,8 @@ static int scan_check_cb(struct ubifs_info *c, struct ubifs_scan_leb *sleb; struct ubifs_scan_node *snod; struct ubifs_lp_stats *lst = &data->lst; - int cat, lnum = lp->lnum, is_idx = 0, used = 0, free, dirty; + int cat, lnum = lp->lnum, is_idx = 0, used = 0, free, dirty, ret; + void *buf = NULL; cat = lp->flags & LPROPS_CAT_MASK; if (cat != LPROPS_UNCAT) { @@ -1093,7 +1094,13 @@ static int scan_check_cb(struct ubifs_info *c, } } - sleb = ubifs_scan(c, lnum, 0, c->dbg->buf, 0); + buf = __vmalloc(c->leb_size, GFP_KERNEL | GFP_NOFS, PAGE_KERNEL); + if (!buf) { + ubifs_err("cannot allocate memory to scan LEB %d", lnum); + goto out; + } + + sleb = ubifs_scan(c, lnum, 0, buf, 0); if (IS_ERR(sleb)) { /* * After an unclean unmount, empty and freeable LEBs @@ -1105,7 +1112,8 @@ static int scan_check_cb(struct ubifs_info *c, lst->empty_lebs += 1; lst->total_free += c->leb_size; lst->total_dark += ubifs_calc_dark(c, c->leb_size); - return LPT_SCAN_CONTINUE; + ret = LPT_SCAN_CONTINUE; + goto exit; } if (lp->free + lp->dirty == c->leb_size && @@ -1115,10 +1123,12 @@ static int scan_check_cb(struct ubifs_info *c, lst->total_free += lp->free; lst->total_dirty += lp->dirty; lst->total_dark += ubifs_calc_dark(c, c->leb_size); - return LPT_SCAN_CONTINUE; + ret = LPT_SCAN_CONTINUE; + goto exit; } data->err = PTR_ERR(sleb); - return LPT_SCAN_STOP; + ret = LPT_SCAN_STOP; + goto exit; } is_idx = -1; @@ -1236,7 +1246,10 @@ static int scan_check_cb(struct ubifs_info *c, } ubifs_scan_destroy(sleb); - return LPT_SCAN_CONTINUE; + ret = LPT_SCAN_CONTINUE; +exit: + vfree(buf); + return ret; out_print: ubifs_err("bad accounting of LEB %d: free %d, dirty %d flags %#x, " @@ -1246,6 +1259,7 @@ out_print: out_destroy: ubifs_scan_destroy(sleb); out: + vfree(buf); data->err = -EINVAL; return LPT_SCAN_STOP; } diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c index 5c90dec5db0..0a3c2c3f5c4 100644 --- a/fs/ubifs/lpt_commit.c +++ b/fs/ubifs/lpt_commit.c @@ -1628,29 +1628,35 @@ static int dbg_check_ltab_lnum(struct ubifs_info *c, int lnum) { int err, len = c->leb_size, dirty = 0, node_type, node_num, node_len; int ret; - void *buf = c->dbg->buf; + void *buf, *p; if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS)) return 0; + buf = p = __vmalloc(c->leb_size, GFP_KERNEL | GFP_NOFS, PAGE_KERNEL); + if (!buf) { + ubifs_err("cannot allocate memory for ltab checking"); + return 0; + } + dbg_lp("LEB %d", lnum); err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size); if (err) { dbg_msg("ubi_read failed, LEB %d, error %d", lnum, err); - return err; + goto out; } while (1) { - if (!is_a_node(c, buf, len)) { + if (!is_a_node(c, p, len)) { int i, pad_len; - pad_len = get_pad_len(c, buf, len); + pad_len = get_pad_len(c, p, len); if (pad_len) { - buf += pad_len; + p += pad_len; len -= pad_len; dirty += pad_len; continue; } - if (!dbg_is_all_ff(buf, len)) { + if (!dbg_is_all_ff(p, len)) { dbg_msg("invalid empty space in LEB %d at %d", lnum, c->leb_size - len); err = -EINVAL; @@ -1668,16 +1674,21 @@ static int dbg_check_ltab_lnum(struct ubifs_info *c, int lnum) lnum, dirty, c->ltab[i].dirty); err = -EINVAL; } - return err; + goto out; } - node_type = get_lpt_node_type(c, buf, &node_num); + node_type = get_lpt_node_type(c, p, &node_num); node_len = get_lpt_node_len(c, node_type); ret = dbg_is_node_dirty(c, node_type, lnum, c->leb_size - len); if (ret == 1) dirty += node_len; - buf += node_len; + p += node_len; len -= node_len; } + + err = 0; +out: + vfree(buf); + return err; } /** @@ -1870,25 +1881,31 @@ int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len) static void dump_lpt_leb(const struct ubifs_info *c, int lnum) { int err, len = c->leb_size, node_type, node_num, node_len, offs; - void *buf = c->dbg->buf; + void *buf, *p; printk(KERN_DEBUG "(pid %d) start dumping LEB %d\n", current->pid, lnum); + buf = p = __vmalloc(c->leb_size, GFP_KERNEL | GFP_NOFS, PAGE_KERNEL); + if (!buf) { + ubifs_err("cannot allocate memory to dump LPT"); + return; + } + err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size); if (err) { ubifs_err("cannot read LEB %d, error %d", lnum, err); - return; + goto out; } while (1) { offs = c->leb_size - len; - if (!is_a_node(c, buf, len)) { + if (!is_a_node(c, p, len)) { int pad_len; - pad_len = get_pad_len(c, buf, len); + pad_len = get_pad_len(c, p, len); if (pad_len) { printk(KERN_DEBUG "LEB %d:%d, pad %d bytes\n", lnum, offs, pad_len); - buf += pad_len; + p += pad_len; len -= pad_len; continue; } @@ -1898,7 +1915,7 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum) break; } - node_type = get_lpt_node_type(c, buf, &node_num); + node_type = get_lpt_node_type(c, p, &node_num); switch (node_type) { case UBIFS_LPT_PNODE: { @@ -1923,7 +1940,7 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum) else printk(KERN_DEBUG "LEB %d:%d, nnode, ", lnum, offs); - err = ubifs_unpack_nnode(c, buf, &nnode); + err = ubifs_unpack_nnode(c, p, &nnode); for (i = 0; i < UBIFS_LPT_FANOUT; i++) { printk(KERN_CONT "%d:%d", nnode.nbranch[i].lnum, nnode.nbranch[i].offs); @@ -1944,15 +1961,18 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum) break; default: ubifs_err("LPT node type %d not recognized", node_type); - return; + goto out; } - buf += node_len; + p += node_len; len -= node_len; } printk(KERN_DEBUG "(pid %d) finish dumping LEB %d\n", current->pid, lnum); +out: + vfree(buf); + return; } /** diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c index 82009c74b6a..2cdbd31641d 100644 --- a/fs/ubifs/orphan.c +++ b/fs/ubifs/orphan.c @@ -892,15 +892,22 @@ static int dbg_read_orphans(struct check_info *ci, struct ubifs_scan_leb *sleb) static int dbg_scan_orphans(struct ubifs_info *c, struct check_info *ci) { int lnum, err = 0; + void *buf; /* Check no-orphans flag and skip this if no orphans */ if (c->no_orphs) return 0; + buf = __vmalloc(c->leb_size, GFP_KERNEL | GFP_NOFS, PAGE_KERNEL); + if (!buf) { + ubifs_err("cannot allocate memory to check orphans"); + return 0; + } + for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) { struct ubifs_scan_leb *sleb; - sleb = ubifs_scan(c, lnum, 0, c->dbg->buf, 0); + sleb = ubifs_scan(c, lnum, 0, buf, 0); if (IS_ERR(sleb)) { err = PTR_ERR(sleb); break; @@ -912,6 +919,7 @@ static int dbg_scan_orphans(struct ubifs_info *c, struct check_info *ci) break; } + vfree(buf); return err; } diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c index 77e9b874b6c..936f2cbfe6b 100644 --- a/fs/ubifs/recovery.c +++ b/fs/ubifs/recovery.c @@ -28,6 +28,23 @@ * UBIFS always cleans away all remnants of an unclean un-mount, so that * errors do not accumulate. However UBIFS defers recovery if it is mounted * read-only, and the flash is not modified in that case. + * + * The general UBIFS approach to the recovery is that it recovers from + * corruptions which could be caused by power cuts, but it refuses to recover + * from corruption caused by other reasons. And UBIFS tries to distinguish + * between these 2 reasons of corruptions and silently recover in the former + * case and loudly complain in the latter case. + * + * UBIFS writes only to erased LEBs, so it writes only to the flash space + * containing only 0xFFs. UBIFS also always writes strictly from the beginning + * of the LEB to the end. And UBIFS assumes that the underlying flash media + * writes in @c->max_write_size bytes at a time. + * + * Hence, if UBIFS finds a corrupted node at offset X, it expects only the min. + * I/O unit corresponding to offset X to contain corrupted data, all the + * following min. I/O units have to contain empty space (all 0xFFs). If this is + * not true, the corruption cannot be the result of a power cut, and UBIFS + * refuses to mount. */ #include <linux/crc32.h> @@ -362,8 +379,9 @@ int ubifs_write_rcvrd_mst_node(struct ubifs_info *c) * @offs: offset to check * * This function returns %1 if @offs was in the last write to the LEB whose data - * is in @buf, otherwise %0 is returned. The determination is made by checking - * for subsequent empty space starting from the next @c->min_io_size boundary. + * is in @buf, otherwise %0 is returned. The determination is made by checking + * for subsequent empty space starting from the next @c->max_write_size + * boundary. */ static int is_last_write(const struct ubifs_info *c, void *buf, int offs) { @@ -371,10 +389,10 @@ static int is_last_write(const struct ubifs_info *c, void *buf, int offs) uint8_t *p; /* - * Round up to the next @c->min_io_size boundary i.e. @offs is in the - * last wbuf written. After that should be empty space. + * Round up to the next @c->max_write_size boundary i.e. @offs is in + * the last wbuf written. After that should be empty space. */ - empty_offs = ALIGN(offs + 1, c->min_io_size); + empty_offs = ALIGN(offs + 1, c->max_write_size); check_len = c->leb_size - empty_offs; p = buf + empty_offs - offs; return is_empty(p, check_len); @@ -429,7 +447,7 @@ static int no_more_nodes(const struct ubifs_info *c, void *buf, int len, int skip, dlen = le32_to_cpu(ch->len); /* Check for empty space after the corrupt node's common header */ - skip = ALIGN(offs + UBIFS_CH_SZ, c->min_io_size) - offs; + skip = ALIGN(offs + UBIFS_CH_SZ, c->max_write_size) - offs; if (is_empty(buf + skip, len - skip)) return 1; /* @@ -441,7 +459,7 @@ static int no_more_nodes(const struct ubifs_info *c, void *buf, int len, return 0; } /* Now we know the corrupt node's length we can skip over it */ - skip = ALIGN(offs + dlen, c->min_io_size) - offs; + skip = ALIGN(offs + dlen, c->max_write_size) - offs; /* After which there should be empty space */ if (is_empty(buf + skip, len - skip)) return 1; @@ -671,10 +689,14 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum, } else { int corruption = first_non_ff(buf, len); + /* + * See header comment for this file for more + * explanations about the reasons we have this check. + */ ubifs_err("corrupt empty space LEB %d:%d, corruption " "starts at %d", lnum, offs, corruption); /* Make sure we dump interesting non-0xFF data */ - offs = corruption; + offs += corruption; buf += corruption; goto corrupted; } @@ -836,12 +858,8 @@ struct ubifs_scan_leb *ubifs_recover_log_leb(struct ubifs_info *c, int lnum, static int recover_head(const struct ubifs_info *c, int lnum, int offs, void *sbuf) { - int len, err; + int len = c->max_write_size, err; - if (c->min_io_size > 1) - len = c->min_io_size; - else - len = 512; if (offs + len > c->leb_size) len = c->leb_size - offs; diff --git a/fs/ubifs/scan.c b/fs/ubifs/scan.c index 3e1ee57dbea..36216b46f77 100644 --- a/fs/ubifs/scan.c +++ b/fs/ubifs/scan.c @@ -328,7 +328,7 @@ struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum, if (!quiet) ubifs_err("empty space starts at non-aligned offset %d", offs); - goto corrupted;; + goto corrupted; } ubifs_end_scan(c, sleb, lnum, offs); diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 6e11c2975dc..e5dc1e120e8 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -512,9 +512,12 @@ static int init_constants_early(struct ubifs_info *c) c->leb_cnt = c->vi.size; c->leb_size = c->vi.usable_leb_size; + c->leb_start = c->di.leb_start; c->half_leb_size = c->leb_size / 2; c->min_io_size = c->di.min_io_size; c->min_io_shift = fls(c->min_io_size) - 1; + c->max_write_size = c->di.max_write_size; + c->max_write_shift = fls(c->max_write_size) - 1; if (c->leb_size < UBIFS_MIN_LEB_SZ) { ubifs_err("too small LEBs (%d bytes), min. is %d bytes", @@ -534,6 +537,18 @@ static int init_constants_early(struct ubifs_info *c) } /* + * Maximum write size has to be greater or equivalent to min. I/O + * size, and be multiple of min. I/O size. + */ + if (c->max_write_size < c->min_io_size || + c->max_write_size % c->min_io_size || + !is_power_of_2(c->max_write_size)) { + ubifs_err("bad write buffer size %d for %d min. I/O unit", + c->max_write_size, c->min_io_size); + return -EINVAL; + } + + /* * UBIFS aligns all node to 8-byte boundary, so to make function in * io.c simpler, assume minimum I/O unit size to be 8 bytes if it is * less than 8. @@ -541,6 +556,10 @@ static int init_constants_early(struct ubifs_info *c) if (c->min_io_size < 8) { c->min_io_size = 8; c->min_io_shift = 3; + if (c->max_write_size < c->min_io_size) { + c->max_write_size = c->min_io_size; + c->max_write_shift = c->min_io_shift; + } } c->ref_node_alsz = ALIGN(UBIFS_REF_NODE_SZ, c->min_io_size); @@ -1202,11 +1221,14 @@ static int mount_ubifs(struct ubifs_info *c) if (c->bulk_read == 1) bu_init(c); - /* - * We have to check all CRCs, even for data nodes, when we mount the FS - * (specifically, when we are replaying). - */ - c->always_chk_crc = 1; + if (!c->ro_mount) { + c->write_reserve_buf = kmalloc(COMPRESSED_DATA_NODE_BUF_SZ, + GFP_KERNEL); + if (!c->write_reserve_buf) + goto out_free; + } + + c->mounting = 1; err = ubifs_read_superblock(c); if (err) @@ -1382,7 +1404,7 @@ static int mount_ubifs(struct ubifs_info *c) if (err) goto out_infos; - c->always_chk_crc = 0; + c->mounting = 0; ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"", c->vi.ubi_num, c->vi.vol_id, c->vi.name); @@ -1403,6 +1425,7 @@ static int mount_ubifs(struct ubifs_info *c) dbg_msg("compiled on: " __DATE__ " at " __TIME__); dbg_msg("min. I/O unit size: %d bytes", c->min_io_size); + dbg_msg("max. write size: %d bytes", c->max_write_size); dbg_msg("LEB size: %d bytes (%d KiB)", c->leb_size, c->leb_size >> 10); dbg_msg("data journal heads: %d", @@ -1432,9 +1455,9 @@ static int mount_ubifs(struct ubifs_info *c) UBIFS_TRUN_NODE_SZ, UBIFS_SB_NODE_SZ, UBIFS_MST_NODE_SZ); dbg_msg("node sizes: ref %zu, cmt. start %zu, orph %zu", UBIFS_REF_NODE_SZ, UBIFS_CS_NODE_SZ, UBIFS_ORPH_NODE_SZ); - dbg_msg("max. node sizes: data %zu, inode %zu dentry %zu", + dbg_msg("max. node sizes: data %zu, inode %zu dentry %zu, idx %d", UBIFS_MAX_DATA_NODE_SZ, UBIFS_MAX_INO_NODE_SZ, - UBIFS_MAX_DENT_NODE_SZ); + UBIFS_MAX_DENT_NODE_SZ, ubifs_idx_node_sz(c, c->fanout)); dbg_msg("dead watermark: %d", c->dead_wm); dbg_msg("dark watermark: %d", c->dark_wm); dbg_msg("LEB overhead: %d", c->leb_overhead); @@ -1474,6 +1497,7 @@ out_wbufs: out_cbuf: kfree(c->cbuf); out_free: + kfree(c->write_reserve_buf); kfree(c->bu.buf); vfree(c->ileb_buf); vfree(c->sbuf); @@ -1512,6 +1536,7 @@ static void ubifs_umount(struct ubifs_info *c) kfree(c->cbuf); kfree(c->rcvrd_mst_node); kfree(c->mst_node); + kfree(c->write_reserve_buf); kfree(c->bu.buf); vfree(c->ileb_buf); vfree(c->sbuf); @@ -1543,7 +1568,6 @@ static int ubifs_remount_rw(struct ubifs_info *c) mutex_lock(&c->umount_mutex); dbg_save_space_info(c); c->remounting_rw = 1; - c->always_chk_crc = 1; err = check_free_space(c); if (err) @@ -1598,6 +1622,10 @@ static int ubifs_remount_rw(struct ubifs_info *c) goto out; } + c->write_reserve_buf = kmalloc(COMPRESSED_DATA_NODE_BUF_SZ, GFP_KERNEL); + if (!c->write_reserve_buf) + goto out; + err = ubifs_lpt_init(c, 0, 1); if (err) goto out; @@ -1650,7 +1678,6 @@ static int ubifs_remount_rw(struct ubifs_info *c) dbg_gen("re-mounted read-write"); c->ro_mount = 0; c->remounting_rw = 0; - c->always_chk_crc = 0; err = dbg_check_space_info(c); mutex_unlock(&c->umount_mutex); return err; @@ -1663,11 +1690,12 @@ out: c->bgt = NULL; } free_wbufs(c); + kfree(c->write_reserve_buf); + c->write_reserve_buf = NULL; vfree(c->ileb_buf); c->ileb_buf = NULL; ubifs_lpt_free(c, 1); c->remounting_rw = 0; - c->always_chk_crc = 0; mutex_unlock(&c->umount_mutex); return err; } @@ -1707,6 +1735,8 @@ static void ubifs_remount_ro(struct ubifs_info *c) free_wbufs(c); vfree(c->orph_buf); c->orph_buf = NULL; + kfree(c->write_reserve_buf); + c->write_reserve_buf = NULL; vfree(c->ileb_buf); c->ileb_buf = NULL; ubifs_lpt_free(c, 1); @@ -1937,6 +1967,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent) mutex_init(&c->mst_mutex); mutex_init(&c->umount_mutex); mutex_init(&c->bu_mutex); + mutex_init(&c->write_reserve_mutex); init_waitqueue_head(&c->cmt_wq); c->buds = RB_ROOT; c->old_idx = RB_ROOT; @@ -1954,6 +1985,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent) INIT_LIST_HEAD(&c->old_buds); INIT_LIST_HEAD(&c->orph_list); INIT_LIST_HEAD(&c->orph_new); + c->no_chk_data_crc = 1; c->vfs_sb = sb; c->highest_inum = UBIFS_FIRST_INO; diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c index ad9cf013362..de485979ca3 100644 --- a/fs/ubifs/tnc.c +++ b/fs/ubifs/tnc.c @@ -447,8 +447,11 @@ static int tnc_read_node_nm(struct ubifs_info *c, struct ubifs_zbranch *zbr, * * Note, this function does not check CRC of data nodes if @c->no_chk_data_crc * is true (it is controlled by corresponding mount option). However, if - * @c->always_chk_crc is true, @c->no_chk_data_crc is ignored and CRC is always - * checked. + * @c->mounting or @c->remounting_rw is true (we are mounting or re-mounting to + * R/W mode), @c->no_chk_data_crc is ignored and CRC is checked. This is + * because during mounting or re-mounting from R/O mode to R/W mode we may read + * journal nodes (when replying the journal or doing the recovery) and the + * journal nodes may potentially be corrupted, so checking is required. */ static int try_read_node(const struct ubifs_info *c, void *buf, int type, int len, int lnum, int offs) @@ -476,7 +479,8 @@ static int try_read_node(const struct ubifs_info *c, void *buf, int type, if (node_len != len) return 0; - if (type == UBIFS_DATA_NODE && !c->always_chk_crc && c->no_chk_data_crc) + if (type == UBIFS_DATA_NODE && c->no_chk_data_crc && !c->mounting && + !c->remounting_rw) return 1; crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8); diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 381d6b207a5..8c40ad3c672 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -151,6 +151,12 @@ */ #define WORST_COMPR_FACTOR 2 +/* + * How much memory is needed for a buffer where we comress a data node. + */ +#define COMPRESSED_DATA_NODE_BUF_SZ \ + (UBIFS_DATA_NODE_SZ + UBIFS_BLOCK_SIZE * WORST_COMPR_FACTOR) + /* Maximum expected tree height for use by bottom_up_buf */ #define BOTTOM_UP_HEIGHT 64 @@ -646,6 +652,7 @@ typedef int (*ubifs_lpt_scan_callback)(struct ubifs_info *c, * @offs: write-buffer offset in this logical eraseblock * @avail: number of bytes available in the write-buffer * @used: number of used bytes in the write-buffer + * @size: write-buffer size (in [@c->min_io_size, @c->max_write_size] range) * @dtype: type of data stored in this LEB (%UBI_LONGTERM, %UBI_SHORTTERM, * %UBI_UNKNOWN) * @jhead: journal head the mutex belongs to (note, needed only to shut lockdep @@ -680,6 +687,7 @@ struct ubifs_wbuf { int offs; int avail; int used; + int size; int dtype; int jhead; int (*sync_callback)(struct ubifs_info *c, int lnum, int free, int pad); @@ -1003,6 +1011,11 @@ struct ubifs_debug_info; * @bu_mutex: protects the pre-allocated bulk-read buffer and @c->bu * @bu: pre-allocated bulk-read information * + * @write_reserve_mutex: protects @write_reserve_buf + * @write_reserve_buf: on the write path we allocate memory, which might + * sometimes be unavailable, in which case we use this + * write reserve buffer + * * @log_lebs: number of logical eraseblocks in the log * @log_bytes: log size in bytes * @log_last: last LEB of the log @@ -1024,7 +1037,12 @@ struct ubifs_debug_info; * * @min_io_size: minimal input/output unit size * @min_io_shift: number of bits in @min_io_size minus one + * @max_write_size: maximum amount of bytes the underlying flash can write at a + * time (MTD write buffer size) + * @max_write_shift: number of bits in @max_write_size minus one * @leb_size: logical eraseblock size in bytes + * @leb_start: starting offset of logical eraseblocks within physical + * eraseblocks * @half_leb_size: half LEB size * @idx_leb_size: how many bytes of an LEB are effectively available when it is * used to store indexing nodes (@leb_size - @max_idx_node_sz) @@ -1166,22 +1184,21 @@ struct ubifs_debug_info; * @rp_uid: reserved pool user ID * @rp_gid: reserved pool group ID * - * @empty: if the UBI device is empty + * @empty: %1 if the UBI device is empty + * @need_recovery: %1 if the file-system needs recovery + * @replaying: %1 during journal replay + * @mounting: %1 while mounting + * @remounting_rw: %1 while re-mounting from R/O mode to R/W mode * @replay_tree: temporary tree used during journal replay * @replay_list: temporary list used during journal replay * @replay_buds: list of buds to replay * @cs_sqnum: sequence number of first node in the log (commit start node) * @replay_sqnum: sequence number of node currently being replayed - * @need_recovery: file-system needs recovery - * @replaying: set to %1 during journal replay * @unclean_leb_list: LEBs to recover when re-mounting R/O mounted FS to R/W * mode * @rcvrd_mst_node: recovered master node to write when re-mounting R/O mounted * FS to R/W mode * @size_tree: inode size information for recovery - * @remounting_rw: set while re-mounting from R/O mode to R/W mode - * @always_chk_crc: always check CRCs (while mounting and remounting to R/W - * mode) * @mount_opts: UBIFS-specific mount options * * @dbg: debugging-related information @@ -1250,6 +1267,9 @@ struct ubifs_info { struct mutex bu_mutex; struct bu_info bu; + struct mutex write_reserve_mutex; + void *write_reserve_buf; + int log_lebs; long long log_bytes; int log_last; @@ -1271,7 +1291,10 @@ struct ubifs_info { int min_io_size; int min_io_shift; + int max_write_size; + int max_write_shift; int leb_size; + int leb_start; int half_leb_size; int idx_leb_size; int leb_cnt; @@ -1402,19 +1425,19 @@ struct ubifs_info { gid_t rp_gid; /* The below fields are used only during mounting and re-mounting */ - int empty; + unsigned int empty:1; + unsigned int need_recovery:1; + unsigned int replaying:1; + unsigned int mounting:1; + unsigned int remounting_rw:1; struct rb_root replay_tree; struct list_head replay_list; struct list_head replay_buds; unsigned long long cs_sqnum; unsigned long long replay_sqnum; - int need_recovery; - int replaying; struct list_head unclean_leb_list; struct ubifs_mst_node *rcvrd_mst_node; struct rb_root size_tree; - int remounting_rw; - int always_chk_crc; struct ubifs_mount_opts mount_opts; #ifdef CONFIG_UBIFS_FS_DEBUG diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c index 306ee39ef2c..8994dd04166 100644 --- a/fs/udf/balloc.c +++ b/fs/udf/balloc.c @@ -31,7 +31,7 @@ #define udf_set_bit(nr, addr) ext2_set_bit(nr, addr) #define udf_test_bit(nr, addr) ext2_test_bit(nr, addr) #define udf_find_next_one_bit(addr, size, offset) \ - ext2_find_next_bit(addr, size, offset) + ext2_find_next_bit((unsigned long *)(addr), size, offset) static int read_block_bitmap(struct super_block *sb, struct udf_bitmap *bitmap, unsigned int block, @@ -297,7 +297,7 @@ repeat: break; } } else { - bit = udf_find_next_one_bit((char *)bh->b_data, + bit = udf_find_next_one_bit(bh->b_data, sb->s_blocksize << 3, group_start << 3); if (bit < sb->s_blocksize << 3) diff --git a/fs/udf/file.c b/fs/udf/file.c index 89c78486cbb..f391a2adc69 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -123,8 +123,8 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov, if (inode->i_sb->s_blocksize < (udf_file_entry_alloc_offset(inode) + pos + count)) { - udf_expand_file_adinicb(inode, pos + count, &err); - if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { + err = udf_expand_file_adinicb(inode); + if (err) { udf_debug("udf_expand_adinicb: err=%d\n", err); up_write(&iinfo->i_data_sem); return err; @@ -237,7 +237,7 @@ static int udf_setattr(struct dentry *dentry, struct iattr *attr) if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size != i_size_read(inode)) { - error = vmtruncate(inode, attr->ia_size); + error = udf_setsize(inode, attr->ia_size); if (error) return error; } @@ -249,5 +249,4 @@ static int udf_setattr(struct dentry *dentry, struct iattr *attr) const struct inode_operations udf_file_inode_operations = { .setattr = udf_setattr, - .truncate = udf_truncate, }; diff --git a/fs/udf/inode.c b/fs/udf/inode.c index c6a2e782b97..ccc81432141 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -73,14 +73,12 @@ void udf_evict_inode(struct inode *inode) struct udf_inode_info *iinfo = UDF_I(inode); int want_delete = 0; - truncate_inode_pages(&inode->i_data, 0); - if (!inode->i_nlink && !is_bad_inode(inode)) { want_delete = 1; - inode->i_size = 0; - udf_truncate(inode); + udf_setsize(inode, 0); udf_update_inode(inode, IS_SYNC(inode)); - } + } else + truncate_inode_pages(&inode->i_data, 0); invalidate_inode_buffers(inode); end_writeback(inode); if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB && @@ -117,9 +115,18 @@ static int udf_write_begin(struct file *file, struct address_space *mapping, ret = block_write_begin(mapping, pos, len, flags, pagep, udf_get_block); if (unlikely(ret)) { - loff_t isize = mapping->host->i_size; - if (pos + len > isize) - vmtruncate(mapping->host, isize); + struct inode *inode = mapping->host; + struct udf_inode_info *iinfo = UDF_I(inode); + loff_t isize = inode->i_size; + + if (pos + len > isize) { + truncate_pagecache(inode, pos + len, isize); + if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { + down_write(&iinfo->i_data_sem); + udf_truncate_extents(inode); + up_write(&iinfo->i_data_sem); + } + } } return ret; @@ -139,30 +146,31 @@ const struct address_space_operations udf_aops = { .bmap = udf_bmap, }; -void udf_expand_file_adinicb(struct inode *inode, int newsize, int *err) +int udf_expand_file_adinicb(struct inode *inode) { struct page *page; char *kaddr; struct udf_inode_info *iinfo = UDF_I(inode); + int err; struct writeback_control udf_wbc = { .sync_mode = WB_SYNC_NONE, .nr_to_write = 1, }; - /* from now on we have normal address_space methods */ - inode->i_data.a_ops = &udf_aops; - if (!iinfo->i_lenAlloc) { if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT; else iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG; + /* from now on we have normal address_space methods */ + inode->i_data.a_ops = &udf_aops; mark_inode_dirty(inode); - return; + return 0; } - page = grab_cache_page(inode->i_mapping, 0); - BUG_ON(!PageLocked(page)); + page = find_or_create_page(inode->i_mapping, 0, GFP_NOFS); + if (!page) + return -ENOMEM; if (!PageUptodate(page)) { kaddr = kmap(page); @@ -181,11 +189,24 @@ void udf_expand_file_adinicb(struct inode *inode, int newsize, int *err) iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT; else iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG; - - inode->i_data.a_ops->writepage(page, &udf_wbc); + /* from now on we have normal address_space methods */ + inode->i_data.a_ops = &udf_aops; + err = inode->i_data.a_ops->writepage(page, &udf_wbc); + if (err) { + /* Restore everything back so that we don't lose data... */ + lock_page(page); + kaddr = kmap(page); + memcpy(iinfo->i_ext.i_data + iinfo->i_lenEAttr, kaddr, + inode->i_size); + kunmap(page); + unlock_page(page); + iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB; + inode->i_data.a_ops = &udf_adinicb_aops; + } page_cache_release(page); - mark_inode_dirty(inode); + + return err; } struct buffer_head *udf_expand_dir_adinicb(struct inode *inode, int *block, @@ -348,8 +369,10 @@ static struct buffer_head *udf_getblk(struct inode *inode, long block, } /* Extend the file by 'blocks' blocks, return the number of extents added */ -int udf_extend_file(struct inode *inode, struct extent_position *last_pos, - struct kernel_long_ad *last_ext, sector_t blocks) +static int udf_do_extend_file(struct inode *inode, + struct extent_position *last_pos, + struct kernel_long_ad *last_ext, + sector_t blocks) { sector_t add; int count = 0, fake = !(last_ext->extLength & UDF_EXTENT_LENGTH_MASK); @@ -357,6 +380,7 @@ int udf_extend_file(struct inode *inode, struct extent_position *last_pos, struct kernel_lb_addr prealloc_loc = {}; int prealloc_len = 0; struct udf_inode_info *iinfo; + int err; /* The previous extent is fake and we should not extend by anything * - there's nothing to do... */ @@ -422,26 +446,29 @@ int udf_extend_file(struct inode *inode, struct extent_position *last_pos, /* Create enough extents to cover the whole hole */ while (blocks > add) { blocks -= add; - if (udf_add_aext(inode, last_pos, &last_ext->extLocation, - last_ext->extLength, 1) == -1) - return -1; + err = udf_add_aext(inode, last_pos, &last_ext->extLocation, + last_ext->extLength, 1); + if (err) + return err; count++; } if (blocks) { last_ext->extLength = EXT_NOT_RECORDED_NOT_ALLOCATED | (blocks << sb->s_blocksize_bits); - if (udf_add_aext(inode, last_pos, &last_ext->extLocation, - last_ext->extLength, 1) == -1) - return -1; + err = udf_add_aext(inode, last_pos, &last_ext->extLocation, + last_ext->extLength, 1); + if (err) + return err; count++; } out: /* Do we have some preallocated blocks saved? */ if (prealloc_len) { - if (udf_add_aext(inode, last_pos, &prealloc_loc, - prealloc_len, 1) == -1) - return -1; + err = udf_add_aext(inode, last_pos, &prealloc_loc, + prealloc_len, 1); + if (err) + return err; last_ext->extLocation = prealloc_loc; last_ext->extLength = prealloc_len; count++; @@ -453,11 +480,68 @@ out: else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) last_pos->offset -= sizeof(struct long_ad); else - return -1; + return -EIO; return count; } +static int udf_extend_file(struct inode *inode, loff_t newsize) +{ + + struct extent_position epos; + struct kernel_lb_addr eloc; + uint32_t elen; + int8_t etype; + struct super_block *sb = inode->i_sb; + sector_t first_block = newsize >> sb->s_blocksize_bits, offset; + int adsize; + struct udf_inode_info *iinfo = UDF_I(inode); + struct kernel_long_ad extent; + int err; + + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) + adsize = sizeof(struct short_ad); + else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) + adsize = sizeof(struct long_ad); + else + BUG(); + + etype = inode_bmap(inode, first_block, &epos, &eloc, &elen, &offset); + + /* File has extent covering the new size (could happen when extending + * inside a block)? */ + if (etype != -1) + return 0; + if (newsize & (sb->s_blocksize - 1)) + offset++; + /* Extended file just to the boundary of the last file block? */ + if (offset == 0) + return 0; + + /* Truncate is extending the file by 'offset' blocks */ + if ((!epos.bh && epos.offset == udf_file_entry_alloc_offset(inode)) || + (epos.bh && epos.offset == sizeof(struct allocExtDesc))) { + /* File has no extents at all or has empty last + * indirect extent! Create a fake extent... */ + extent.extLocation.logicalBlockNum = 0; + extent.extLocation.partitionReferenceNum = 0; + extent.extLength = EXT_NOT_RECORDED_NOT_ALLOCATED; + } else { + epos.offset -= adsize; + etype = udf_next_aext(inode, &epos, &extent.extLocation, + &extent.extLength, 0); + extent.extLength |= etype << 30; + } + err = udf_do_extend_file(inode, &epos, &extent, offset); + if (err < 0) + goto out; + err = 0; + iinfo->i_lenExtents = newsize; +out: + brelse(epos.bh); + return err; +} + static struct buffer_head *inode_getblk(struct inode *inode, sector_t block, int *err, sector_t *phys, int *new) { @@ -540,7 +624,7 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block, elen = EXT_RECORDED_ALLOCATED | ((elen + inode->i_sb->s_blocksize - 1) & ~(inode->i_sb->s_blocksize - 1)); - etype = udf_write_aext(inode, &cur_epos, &eloc, elen, 1); + udf_write_aext(inode, &cur_epos, &eloc, elen, 1); } brelse(prev_epos.bh); brelse(cur_epos.bh); @@ -564,19 +648,17 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block, memset(&laarr[0].extLocation, 0x00, sizeof(struct kernel_lb_addr)); laarr[0].extLength = EXT_NOT_RECORDED_NOT_ALLOCATED; - /* Will udf_extend_file() create real extent from + /* Will udf_do_extend_file() create real extent from a fake one? */ startnum = (offset > 0); } /* Create extents for the hole between EOF and offset */ - ret = udf_extend_file(inode, &prev_epos, laarr, offset); - if (ret == -1) { + ret = udf_do_extend_file(inode, &prev_epos, laarr, offset); + if (ret < 0) { brelse(prev_epos.bh); brelse(cur_epos.bh); brelse(next_epos.bh); - /* We don't really know the error here so we just make - * something up */ - *err = -ENOSPC; + *err = ret; return NULL; } c = 0; @@ -1005,52 +1087,66 @@ struct buffer_head *udf_bread(struct inode *inode, int block, return NULL; } -void udf_truncate(struct inode *inode) +int udf_setsize(struct inode *inode, loff_t newsize) { - int offset; int err; struct udf_inode_info *iinfo; + int bsize = 1 << inode->i_blkbits; if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) - return; + return -EINVAL; if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) - return; + return -EPERM; iinfo = UDF_I(inode); - if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { + if (newsize > inode->i_size) { down_write(&iinfo->i_data_sem); - if (inode->i_sb->s_blocksize < - (udf_file_entry_alloc_offset(inode) + - inode->i_size)) { - udf_expand_file_adinicb(inode, inode->i_size, &err); - if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { - inode->i_size = iinfo->i_lenAlloc; - up_write(&iinfo->i_data_sem); - return; + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { + if (bsize < + (udf_file_entry_alloc_offset(inode) + newsize)) { + err = udf_expand_file_adinicb(inode); + if (err) { + up_write(&iinfo->i_data_sem); + return err; + } } else - udf_truncate_extents(inode); - } else { - offset = inode->i_size & (inode->i_sb->s_blocksize - 1); - memset(iinfo->i_ext.i_data + iinfo->i_lenEAttr + offset, - 0x00, inode->i_sb->s_blocksize - - offset - udf_file_entry_alloc_offset(inode)); - iinfo->i_lenAlloc = inode->i_size; + iinfo->i_lenAlloc = newsize; + } + err = udf_extend_file(inode, newsize); + if (err) { + up_write(&iinfo->i_data_sem); + return err; } + truncate_setsize(inode, newsize); up_write(&iinfo->i_data_sem); } else { - block_truncate_page(inode->i_mapping, inode->i_size, - udf_get_block); + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { + down_write(&iinfo->i_data_sem); + memset(iinfo->i_ext.i_data + iinfo->i_lenEAttr + newsize, + 0x00, bsize - newsize - + udf_file_entry_alloc_offset(inode)); + iinfo->i_lenAlloc = newsize; + truncate_setsize(inode, newsize); + up_write(&iinfo->i_data_sem); + goto update_time; + } + err = block_truncate_page(inode->i_mapping, newsize, + udf_get_block); + if (err) + return err; down_write(&iinfo->i_data_sem); + truncate_setsize(inode, newsize); udf_truncate_extents(inode); up_write(&iinfo->i_data_sem); } - +update_time: inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb); if (IS_SYNC(inode)) udf_sync_inode(inode); else mark_inode_dirty(inode); + return 0; } static void __udf_read_inode(struct inode *inode) @@ -1637,14 +1733,13 @@ struct inode *udf_iget(struct super_block *sb, struct kernel_lb_addr *ino) return NULL; } -int8_t udf_add_aext(struct inode *inode, struct extent_position *epos, - struct kernel_lb_addr *eloc, uint32_t elen, int inc) +int udf_add_aext(struct inode *inode, struct extent_position *epos, + struct kernel_lb_addr *eloc, uint32_t elen, int inc) { int adsize; struct short_ad *sad = NULL; struct long_ad *lad = NULL; struct allocExtDesc *aed; - int8_t etype; uint8_t *ptr; struct udf_inode_info *iinfo = UDF_I(inode); @@ -1660,7 +1755,7 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos, else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) adsize = sizeof(struct long_ad); else - return -1; + return -EIO; if (epos->offset + (2 * adsize) > inode->i_sb->s_blocksize) { unsigned char *sptr, *dptr; @@ -1672,12 +1767,12 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos, obloc.partitionReferenceNum, obloc.logicalBlockNum, &err); if (!epos->block.logicalBlockNum) - return -1; + return -ENOSPC; nbh = udf_tgetblk(inode->i_sb, udf_get_lb_pblock(inode->i_sb, &epos->block, 0)); if (!nbh) - return -1; + return -EIO; lock_buffer(nbh); memset(nbh->b_data, 0x00, inode->i_sb->s_blocksize); set_buffer_uptodate(nbh); @@ -1746,7 +1841,7 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos, epos->bh = nbh; } - etype = udf_write_aext(inode, epos, eloc, elen, inc); + udf_write_aext(inode, epos, eloc, elen, inc); if (!epos->bh) { iinfo->i_lenAlloc += adsize; @@ -1764,11 +1859,11 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos, mark_buffer_dirty_inode(epos->bh, inode); } - return etype; + return 0; } -int8_t udf_write_aext(struct inode *inode, struct extent_position *epos, - struct kernel_lb_addr *eloc, uint32_t elen, int inc) +void udf_write_aext(struct inode *inode, struct extent_position *epos, + struct kernel_lb_addr *eloc, uint32_t elen, int inc) { int adsize; uint8_t *ptr; @@ -1798,7 +1893,7 @@ int8_t udf_write_aext(struct inode *inode, struct extent_position *epos, adsize = sizeof(struct long_ad); break; default: - return -1; + return; } if (epos->bh) { @@ -1817,8 +1912,6 @@ int8_t udf_write_aext(struct inode *inode, struct extent_position *epos, if (inc) epos->offset += adsize; - - return (elen >> 30); } int8_t udf_next_aext(struct inode *inode, struct extent_position *epos, diff --git a/fs/udf/namei.c b/fs/udf/namei.c index b7c338d5e9d..f1dce848ef9 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -1286,8 +1286,13 @@ static int udf_encode_fh(struct dentry *de, __u32 *fh, int *lenp, struct fid *fid = (struct fid *)fh; int type = FILEID_UDF_WITHOUT_PARENT; - if (len < 3 || (connectable && len < 5)) + if (connectable && (len < 5)) { + *lenp = 5; + return 255; + } else if (len < 3) { + *lenp = 3; return 255; + } *lenp = 3; fid->udf.block = location.logicalBlockNum; diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c index 225527cdc88..8424308db4b 100644 --- a/fs/udf/truncate.c +++ b/fs/udf/truncate.c @@ -197,6 +197,11 @@ static void udf_update_alloc_ext_desc(struct inode *inode, mark_buffer_dirty_inode(epos->bh, inode); } +/* + * Truncate extents of inode to inode->i_size. This function can be used only + * for making file shorter. For making file longer, udf_extend_file() has to + * be used. + */ void udf_truncate_extents(struct inode *inode) { struct extent_position epos; @@ -219,96 +224,65 @@ void udf_truncate_extents(struct inode *inode) etype = inode_bmap(inode, first_block, &epos, &eloc, &elen, &offset); byte_offset = (offset << sb->s_blocksize_bits) + (inode->i_size & (sb->s_blocksize - 1)); - if (etype != -1) { - epos.offset -= adsize; - extent_trunc(inode, &epos, &eloc, etype, elen, byte_offset); - epos.offset += adsize; - if (byte_offset) - lenalloc = epos.offset; - else - lenalloc = epos.offset - adsize; - - if (!epos.bh) - lenalloc -= udf_file_entry_alloc_offset(inode); - else - lenalloc -= sizeof(struct allocExtDesc); - - while ((etype = udf_current_aext(inode, &epos, &eloc, - &elen, 0)) != -1) { - if (etype == (EXT_NEXT_EXTENT_ALLOCDECS >> 30)) { - udf_write_aext(inode, &epos, &neloc, nelen, 0); - if (indirect_ext_len) { - /* We managed to free all extents in the - * indirect extent - free it too */ - BUG_ON(!epos.bh); - udf_free_blocks(sb, inode, &epos.block, - 0, indirect_ext_len); - } else if (!epos.bh) { - iinfo->i_lenAlloc = lenalloc; - mark_inode_dirty(inode); - } else - udf_update_alloc_ext_desc(inode, - &epos, lenalloc); - brelse(epos.bh); - epos.offset = sizeof(struct allocExtDesc); - epos.block = eloc; - epos.bh = udf_tread(sb, - udf_get_lb_pblock(sb, &eloc, 0)); - if (elen) - indirect_ext_len = - (elen + sb->s_blocksize - 1) >> - sb->s_blocksize_bits; - else - indirect_ext_len = 1; - } else { - extent_trunc(inode, &epos, &eloc, etype, - elen, 0); - epos.offset += adsize; - } - } + if (etype == -1) { + /* We should extend the file? */ + WARN_ON(byte_offset); + return; + } + epos.offset -= adsize; + extent_trunc(inode, &epos, &eloc, etype, elen, byte_offset); + epos.offset += adsize; + if (byte_offset) + lenalloc = epos.offset; + else + lenalloc = epos.offset - adsize; - if (indirect_ext_len) { - BUG_ON(!epos.bh); - udf_free_blocks(sb, inode, &epos.block, 0, - indirect_ext_len); - } else if (!epos.bh) { - iinfo->i_lenAlloc = lenalloc; - mark_inode_dirty(inode); - } else - udf_update_alloc_ext_desc(inode, &epos, lenalloc); - } else if (inode->i_size) { - if (byte_offset) { - struct kernel_long_ad extent; + if (!epos.bh) + lenalloc -= udf_file_entry_alloc_offset(inode); + else + lenalloc -= sizeof(struct allocExtDesc); - /* - * OK, there is not extent covering inode->i_size and - * no extent above inode->i_size => truncate is - * extending the file by 'offset' blocks. - */ - if ((!epos.bh && - epos.offset == - udf_file_entry_alloc_offset(inode)) || - (epos.bh && epos.offset == - sizeof(struct allocExtDesc))) { - /* File has no extents at all or has empty last - * indirect extent! Create a fake extent... */ - extent.extLocation.logicalBlockNum = 0; - extent.extLocation.partitionReferenceNum = 0; - extent.extLength = - EXT_NOT_RECORDED_NOT_ALLOCATED; - } else { - epos.offset -= adsize; - etype = udf_next_aext(inode, &epos, - &extent.extLocation, - &extent.extLength, 0); - extent.extLength |= etype << 30; - } - udf_extend_file(inode, &epos, &extent, - offset + - ((inode->i_size & - (sb->s_blocksize - 1)) != 0)); + while ((etype = udf_current_aext(inode, &epos, &eloc, + &elen, 0)) != -1) { + if (etype == (EXT_NEXT_EXTENT_ALLOCDECS >> 30)) { + udf_write_aext(inode, &epos, &neloc, nelen, 0); + if (indirect_ext_len) { + /* We managed to free all extents in the + * indirect extent - free it too */ + BUG_ON(!epos.bh); + udf_free_blocks(sb, inode, &epos.block, + 0, indirect_ext_len); + } else if (!epos.bh) { + iinfo->i_lenAlloc = lenalloc; + mark_inode_dirty(inode); + } else + udf_update_alloc_ext_desc(inode, + &epos, lenalloc); + brelse(epos.bh); + epos.offset = sizeof(struct allocExtDesc); + epos.block = eloc; + epos.bh = udf_tread(sb, + udf_get_lb_pblock(sb, &eloc, 0)); + if (elen) + indirect_ext_len = + (elen + sb->s_blocksize - 1) >> + sb->s_blocksize_bits; + else + indirect_ext_len = 1; + } else { + extent_trunc(inode, &epos, &eloc, etype, elen, 0); + epos.offset += adsize; } } + + if (indirect_ext_len) { + BUG_ON(!epos.bh); + udf_free_blocks(sb, inode, &epos.block, 0, indirect_ext_len); + } else if (!epos.bh) { + iinfo->i_lenAlloc = lenalloc; + mark_inode_dirty(inode); + } else + udf_update_alloc_ext_desc(inode, &epos, lenalloc); iinfo->i_lenExtents = inode->i_size; brelse(epos.bh); diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index eba48209f9f..dbd52d4b5ee 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h @@ -136,22 +136,20 @@ extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *, extern long udf_ioctl(struct file *, unsigned int, unsigned long); /* inode.c */ extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *); -extern void udf_expand_file_adinicb(struct inode *, int, int *); +extern int udf_expand_file_adinicb(struct inode *); extern struct buffer_head *udf_expand_dir_adinicb(struct inode *, int *, int *); extern struct buffer_head *udf_bread(struct inode *, int, int, int *); -extern void udf_truncate(struct inode *); +extern int udf_setsize(struct inode *, loff_t); extern void udf_read_inode(struct inode *); extern void udf_evict_inode(struct inode *); extern int udf_write_inode(struct inode *, struct writeback_control *wbc); extern long udf_block_map(struct inode *, sector_t); -extern int udf_extend_file(struct inode *, struct extent_position *, - struct kernel_long_ad *, sector_t); extern int8_t inode_bmap(struct inode *, sector_t, struct extent_position *, struct kernel_lb_addr *, uint32_t *, sector_t *); -extern int8_t udf_add_aext(struct inode *, struct extent_position *, +extern int udf_add_aext(struct inode *, struct extent_position *, + struct kernel_lb_addr *, uint32_t, int); +extern void udf_write_aext(struct inode *, struct extent_position *, struct kernel_lb_addr *, uint32_t, int); -extern int8_t udf_write_aext(struct inode *, struct extent_position *, - struct kernel_lb_addr *, uint32_t, int); extern int8_t udf_delete_aext(struct inode *, struct extent_position, struct kernel_lb_addr, uint32_t); extern int8_t udf_next_aext(struct inode *, struct extent_position *, diff --git a/fs/ufs/Kconfig b/fs/ufs/Kconfig index 30c8f223253..e4f10a40768 100644 --- a/fs/ufs/Kconfig +++ b/fs/ufs/Kconfig @@ -1,7 +1,6 @@ config UFS_FS tristate "UFS file system support (read only)" depends on BLOCK - depends on BKL # probably fixable help BSD and derivate versions of Unix (such as SunOS, FreeBSD, NetBSD, OpenBSD and NeXTstep) use a file system called UFS. Some System V diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 2b251f2093a..03c255f12df 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -34,7 +34,6 @@ #include <linux/stat.h> #include <linux/string.h> #include <linux/mm.h> -#include <linux/smp_lock.h> #include <linux/buffer_head.h> #include <linux/writeback.h> @@ -43,7 +42,7 @@ #include "swab.h" #include "util.h" -static u64 ufs_frag_map(struct inode *inode, sector_t frag); +static u64 ufs_frag_map(struct inode *inode, sector_t frag, bool needs_lock); static int ufs_block_to_path(struct inode *inode, sector_t i_block, sector_t offsets[4]) { @@ -82,7 +81,7 @@ static int ufs_block_to_path(struct inode *inode, sector_t i_block, sector_t off * the begining of the filesystem. */ -static u64 ufs_frag_map(struct inode *inode, sector_t frag) +static u64 ufs_frag_map(struct inode *inode, sector_t frag, bool needs_lock) { struct ufs_inode_info *ufsi = UFS_I(inode); struct super_block *sb = inode->i_sb; @@ -107,7 +106,8 @@ static u64 ufs_frag_map(struct inode *inode, sector_t frag) p = offsets; - lock_kernel(); + if (needs_lock) + lock_ufs(sb); if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) goto ufs2; @@ -152,7 +152,8 @@ ufs2: ret = temp + (u64) (frag & uspi->s_fpbmask); out: - unlock_kernel(); + if (needs_lock) + unlock_ufs(sb); return ret; } @@ -415,14 +416,16 @@ out: int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create) { struct super_block * sb = inode->i_sb; - struct ufs_sb_private_info * uspi = UFS_SB(sb)->s_uspi; + struct ufs_sb_info * sbi = UFS_SB(sb); + struct ufs_sb_private_info * uspi = sbi->s_uspi; struct buffer_head * bh; int ret, err, new; unsigned long ptr,phys; u64 phys64 = 0; + bool needs_lock = (sbi->mutex_owner != current); if (!create) { - phys64 = ufs_frag_map(inode, fragment); + phys64 = ufs_frag_map(inode, fragment, needs_lock); UFSD("phys64 = %llu\n", (unsigned long long)phys64); if (phys64) map_bh(bh_result, sb, phys64); @@ -436,7 +439,8 @@ int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buffer_head ret = 0; bh = NULL; - lock_kernel(); + if (needs_lock) + lock_ufs(sb); UFSD("ENTER, ino %lu, fragment %llu\n", inode->i_ino, (unsigned long long)fragment); if (fragment > @@ -498,7 +502,9 @@ out: set_buffer_new(bh_result); map_bh(bh_result, sb, phys); abort: - unlock_kernel(); + if (needs_lock) + unlock_ufs(sb); + return err; abort_too_big: @@ -506,48 +512,6 @@ abort_too_big: goto abort; } -static struct buffer_head *ufs_getfrag(struct inode *inode, - unsigned int fragment, - int create, int *err) -{ - struct buffer_head dummy; - int error; - - dummy.b_state = 0; - dummy.b_blocknr = -1000; - error = ufs_getfrag_block(inode, fragment, &dummy, create); - *err = error; - if (!error && buffer_mapped(&dummy)) { - struct buffer_head *bh; - bh = sb_getblk(inode->i_sb, dummy.b_blocknr); - if (buffer_new(&dummy)) { - memset(bh->b_data, 0, inode->i_sb->s_blocksize); - set_buffer_uptodate(bh); - mark_buffer_dirty(bh); - } - return bh; - } - return NULL; -} - -struct buffer_head * ufs_bread (struct inode * inode, unsigned fragment, - int create, int * err) -{ - struct buffer_head * bh; - - UFSD("ENTER, ino %lu, fragment %u\n", inode->i_ino, fragment); - bh = ufs_getfrag (inode, fragment, create, err); - if (!bh || buffer_uptodate(bh)) - return bh; - ll_rw_block (READ, 1, &bh); - wait_on_buffer (bh); - if (buffer_uptodate(bh)) - return bh; - brelse (bh); - *err = -EIO; - return NULL; -} - static int ufs_writepage(struct page *page, struct writeback_control *wbc) { return block_write_full_page(page,ufs_getfrag_block,wbc); @@ -900,9 +864,9 @@ static int ufs_update_inode(struct inode * inode, int do_sync) int ufs_write_inode(struct inode *inode, struct writeback_control *wbc) { int ret; - lock_kernel(); + lock_ufs(inode->i_sb); ret = ufs_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL); - unlock_kernel(); + unlock_ufs(inode->i_sb); return ret; } @@ -922,22 +886,22 @@ void ufs_evict_inode(struct inode * inode) if (want_delete) { loff_t old_i_size; /*UFS_I(inode)->i_dtime = CURRENT_TIME;*/ - lock_kernel(); + lock_ufs(inode->i_sb); mark_inode_dirty(inode); ufs_update_inode(inode, IS_SYNC(inode)); old_i_size = inode->i_size; inode->i_size = 0; if (inode->i_blocks && ufs_truncate(inode, old_i_size)) ufs_warning(inode->i_sb, __func__, "ufs_truncate failed\n"); - unlock_kernel(); + unlock_ufs(inode->i_sb); } invalidate_inode_buffers(inode); end_writeback(inode); if (want_delete) { - lock_kernel(); + lock_ufs(inode->i_sb); ufs_free_inode (inode); - unlock_kernel(); + unlock_ufs(inode->i_sb); } } diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index d6f681535eb..29309e25417 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c @@ -29,7 +29,6 @@ #include <linux/time.h> #include <linux/fs.h> -#include <linux/smp_lock.h> #include "ufs_fs.h" #include "ufs.h" @@ -55,16 +54,16 @@ static struct dentry *ufs_lookup(struct inode * dir, struct dentry *dentry, stru if (dentry->d_name.len > UFS_MAXNAMLEN) return ERR_PTR(-ENAMETOOLONG); - lock_kernel(); + lock_ufs(dir->i_sb); ino = ufs_inode_by_name(dir, &dentry->d_name); if (ino) { inode = ufs_iget(dir->i_sb, ino); if (IS_ERR(inode)) { - unlock_kernel(); + unlock_ufs(dir->i_sb); return ERR_CAST(inode); } } - unlock_kernel(); + unlock_ufs(dir->i_sb); d_add(dentry, inode); return NULL; } @@ -93,9 +92,9 @@ static int ufs_create (struct inode * dir, struct dentry * dentry, int mode, inode->i_fop = &ufs_file_operations; inode->i_mapping->a_ops = &ufs_aops; mark_inode_dirty(inode); - lock_kernel(); + lock_ufs(dir->i_sb); err = ufs_add_nondir(dentry, inode); - unlock_kernel(); + unlock_ufs(dir->i_sb); } UFSD("END: err=%d\n", err); return err; @@ -115,9 +114,9 @@ static int ufs_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_t init_special_inode(inode, mode, rdev); ufs_set_inode_dev(inode->i_sb, UFS_I(inode), rdev); mark_inode_dirty(inode); - lock_kernel(); + lock_ufs(dir->i_sb); err = ufs_add_nondir(dentry, inode); - unlock_kernel(); + unlock_ufs(dir->i_sb); } return err; } @@ -133,7 +132,7 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry, if (l > sb->s_blocksize) goto out_notlocked; - lock_kernel(); + lock_ufs(dir->i_sb); inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO); err = PTR_ERR(inode); if (IS_ERR(inode)) @@ -156,7 +155,7 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry, err = ufs_add_nondir(dentry, inode); out: - unlock_kernel(); + unlock_ufs(dir->i_sb); out_notlocked: return err; @@ -172,9 +171,9 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir, struct inode *inode = old_dentry->d_inode; int error; - lock_kernel(); + lock_ufs(dir->i_sb); if (inode->i_nlink >= UFS_LINK_MAX) { - unlock_kernel(); + unlock_ufs(dir->i_sb); return -EMLINK; } @@ -183,7 +182,7 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir, ihold(inode); error = ufs_add_nondir(dentry, inode); - unlock_kernel(); + unlock_ufs(dir->i_sb); return error; } @@ -195,7 +194,7 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, int mode) if (dir->i_nlink >= UFS_LINK_MAX) goto out; - lock_kernel(); + lock_ufs(dir->i_sb); inode_inc_link_count(dir); inode = ufs_new_inode(dir, S_IFDIR|mode); @@ -216,7 +215,7 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, int mode) err = ufs_add_link(dentry, inode); if (err) goto out_fail; - unlock_kernel(); + unlock_ufs(dir->i_sb); d_instantiate(dentry, inode); out: @@ -228,7 +227,7 @@ out_fail: iput (inode); out_dir: inode_dec_link_count(dir); - unlock_kernel(); + unlock_ufs(dir->i_sb); goto out; } @@ -259,7 +258,7 @@ static int ufs_rmdir (struct inode * dir, struct dentry *dentry) struct inode * inode = dentry->d_inode; int err= -ENOTEMPTY; - lock_kernel(); + lock_ufs(dir->i_sb); if (ufs_empty_dir (inode)) { err = ufs_unlink(dir, dentry); if (!err) { @@ -268,7 +267,7 @@ static int ufs_rmdir (struct inode * dir, struct dentry *dentry) inode_dec_link_count(dir); } } - unlock_kernel(); + unlock_ufs(dir->i_sb); return err; } diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 2c61ac5d4e4..7693d629340 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -84,7 +84,6 @@ #include <linux/blkdev.h> #include <linux/init.h> #include <linux/parser.h> -#include <linux/smp_lock.h> #include <linux/buffer_head.h> #include <linux/vfs.h> #include <linux/log2.h> @@ -96,6 +95,26 @@ #include "swab.h" #include "util.h" +void lock_ufs(struct super_block *sb) +{ +#if defined(CONFIG_SMP) || defined (CONFIG_PREEMPT) + struct ufs_sb_info *sbi = UFS_SB(sb); + + mutex_lock(&sbi->mutex); + sbi->mutex_owner = current; +#endif +} + +void unlock_ufs(struct super_block *sb) +{ +#if defined(CONFIG_SMP) || defined (CONFIG_PREEMPT) + struct ufs_sb_info *sbi = UFS_SB(sb); + + sbi->mutex_owner = NULL; + mutex_unlock(&sbi->mutex); +#endif +} + static struct inode *ufs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation) { struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; @@ -313,7 +332,6 @@ void ufs_panic (struct super_block * sb, const char * function, struct ufs_super_block_first * usb1; va_list args; - lock_kernel(); uspi = UFS_SB(sb)->s_uspi; usb1 = ubh_get_usb_first(uspi); @@ -521,7 +539,7 @@ static int ufs_read_cylinder_structures(struct super_block *sb) */ size = uspi->s_cssize; blks = (size + uspi->s_fsize - 1) >> uspi->s_fshift; - base = space = kmalloc(size, GFP_KERNEL); + base = space = kmalloc(size, GFP_NOFS); if (!base) goto failed; sbi->s_csp = (struct ufs_csum *)space; @@ -546,7 +564,7 @@ static int ufs_read_cylinder_structures(struct super_block *sb) * Read cylinder group (we read only first fragment from block * at this time) and prepare internal data structures for cg caching. */ - if (!(sbi->s_ucg = kmalloc (sizeof(struct buffer_head *) * uspi->s_ncg, GFP_KERNEL))) + if (!(sbi->s_ucg = kmalloc (sizeof(struct buffer_head *) * uspi->s_ncg, GFP_NOFS))) goto failed; for (i = 0; i < uspi->s_ncg; i++) sbi->s_ucg[i] = NULL; @@ -564,7 +582,7 @@ static int ufs_read_cylinder_structures(struct super_block *sb) ufs_print_cylinder_stuff(sb, (struct ufs_cylinder_group *) sbi->s_ucg[i]->b_data); } for (i = 0; i < UFS_MAX_GROUP_LOADED; i++) { - if (!(sbi->s_ucpi[i] = kmalloc (sizeof(struct ufs_cg_private_info), GFP_KERNEL))) + if (!(sbi->s_ucpi[i] = kmalloc (sizeof(struct ufs_cg_private_info), GFP_NOFS))) goto failed; sbi->s_cgno[i] = UFS_CGNO_EMPTY; } @@ -646,8 +664,6 @@ static void ufs_put_super_internal(struct super_block *sb) UFSD("ENTER\n"); - lock_kernel(); - ufs_put_cstotal(sb); size = uspi->s_cssize; blks = (size + uspi->s_fsize - 1) >> uspi->s_fshift; @@ -676,8 +692,6 @@ static void ufs_put_super_internal(struct super_block *sb) kfree (sbi->s_ucg); kfree (base); - unlock_kernel(); - UFSD("EXIT\n"); } @@ -696,8 +710,6 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) unsigned maxsymlen; int ret = -EINVAL; - lock_kernel(); - uspi = NULL; ubh = NULL; flags = 0; @@ -718,6 +730,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) goto failed; } #endif + mutex_init(&sbi->mutex); /* * Set default mount options * Parse mount options @@ -1165,7 +1178,6 @@ magic_found: goto failed; UFSD("EXIT\n"); - unlock_kernel(); return 0; dalloc_failed: @@ -1177,12 +1189,10 @@ failed: kfree(sbi); sb->s_fs_info = NULL; UFSD("EXIT (FAILED)\n"); - unlock_kernel(); return ret; failed_nomem: UFSD("EXIT (NOMEM)\n"); - unlock_kernel(); return -ENOMEM; } @@ -1193,8 +1203,8 @@ static int ufs_sync_fs(struct super_block *sb, int wait) struct ufs_super_block_third * usb3; unsigned flags; + lock_ufs(sb); lock_super(sb); - lock_kernel(); UFSD("ENTER\n"); @@ -1213,8 +1223,8 @@ static int ufs_sync_fs(struct super_block *sb, int wait) sb->s_dirt = 0; UFSD("EXIT\n"); - unlock_kernel(); unlock_super(sb); + unlock_ufs(sb); return 0; } @@ -1256,7 +1266,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) unsigned new_mount_opt, ufstype; unsigned flags; - lock_kernel(); + lock_ufs(sb); lock_super(sb); uspi = UFS_SB(sb)->s_uspi; flags = UFS_SB(sb)->s_flags; @@ -1272,7 +1282,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) ufs_set_opt (new_mount_opt, ONERROR_LOCK); if (!ufs_parse_options (data, &new_mount_opt)) { unlock_super(sb); - unlock_kernel(); + unlock_ufs(sb); return -EINVAL; } if (!(new_mount_opt & UFS_MOUNT_UFSTYPE)) { @@ -1280,14 +1290,14 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) } else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) { printk("ufstype can't be changed during remount\n"); unlock_super(sb); - unlock_kernel(); + unlock_ufs(sb); return -EINVAL; } if ((*mount_flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { UFS_SB(sb)->s_mount_opt = new_mount_opt; unlock_super(sb); - unlock_kernel(); + unlock_ufs(sb); return 0; } @@ -1313,7 +1323,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) printk("ufs was compiled with read-only support, " "can't be mounted as read-write\n"); unlock_super(sb); - unlock_kernel(); + unlock_ufs(sb); return -EINVAL; #else if (ufstype != UFS_MOUNT_UFSTYPE_SUN && @@ -1323,13 +1333,13 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) ufstype != UFS_MOUNT_UFSTYPE_UFS2) { printk("this ufstype is read-only supported\n"); unlock_super(sb); - unlock_kernel(); + unlock_ufs(sb); return -EINVAL; } if (!ufs_read_cylinder_structures(sb)) { printk("failed during remounting\n"); unlock_super(sb); - unlock_kernel(); + unlock_ufs(sb); return -EPERM; } sb->s_flags &= ~MS_RDONLY; @@ -1337,7 +1347,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) } UFS_SB(sb)->s_mount_opt = new_mount_opt; unlock_super(sb); - unlock_kernel(); + unlock_ufs(sb); return 0; } @@ -1371,7 +1381,7 @@ static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf) struct ufs_super_block_third *usb3; u64 id = huge_encode_dev(sb->s_bdev->bd_dev); - lock_kernel(); + lock_ufs(sb); usb1 = ubh_get_usb_first(uspi); usb2 = ubh_get_usb_second(uspi); @@ -1395,7 +1405,7 @@ static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_fsid.val[0] = (u32)id; buf->f_fsid.val[1] = (u32)(id >> 32); - unlock_kernel(); + unlock_ufs(sb); return 0; } @@ -1405,7 +1415,7 @@ static struct kmem_cache * ufs_inode_cachep; static struct inode *ufs_alloc_inode(struct super_block *sb) { struct ufs_inode_info *ei; - ei = (struct ufs_inode_info *)kmem_cache_alloc(ufs_inode_cachep, GFP_KERNEL); + ei = (struct ufs_inode_info *)kmem_cache_alloc(ufs_inode_cachep, GFP_NOFS); if (!ei) return NULL; ei->vfs_inode.i_version = 1; diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c index a58f9155fc9..e56a4f56721 100644 --- a/fs/ufs/truncate.c +++ b/fs/ufs/truncate.c @@ -40,7 +40,6 @@ #include <linux/time.h> #include <linux/stat.h> #include <linux/string.h> -#include <linux/smp_lock.h> #include <linux/buffer_head.h> #include <linux/blkdev.h> #include <linux/sched.h> @@ -467,7 +466,6 @@ int ufs_truncate(struct inode *inode, loff_t old_i_size) block_truncate_page(inode->i_mapping, inode->i_size, ufs_getfrag_block); - lock_kernel(); while (1) { retry = ufs_trunc_direct(inode); retry |= ufs_trunc_indirect(inode, UFS_IND_BLOCK, @@ -487,7 +485,6 @@ int ufs_truncate(struct inode *inode, loff_t old_i_size) inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; ufsi->i_lastfrag = DIRECT_FRAGMENT; - unlock_kernel(); mark_inode_dirty(inode); out: UFSD("EXIT: err %d\n", err); @@ -510,7 +507,9 @@ int ufs_setattr(struct dentry *dentry, struct iattr *attr) /* XXX(truncate): truncate_setsize should be called last */ truncate_setsize(inode, attr->ia_size); + lock_ufs(inode->i_sb); error = ufs_truncate(inode, old_i_size); + unlock_ufs(inode->i_sb); if (error) return error; } diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h index c08782e1b48..5be2755dd71 100644 --- a/fs/ufs/ufs.h +++ b/fs/ufs/ufs.h @@ -18,6 +18,8 @@ struct ufs_sb_info { unsigned s_cgno[UFS_MAX_GROUP_LOADED]; unsigned short s_cg_loaded; unsigned s_mount_opt; + struct mutex mutex; + struct task_struct *mutex_owner; }; struct ufs_inode_info { @@ -109,7 +111,6 @@ extern struct inode *ufs_iget(struct super_block *, unsigned long); extern int ufs_write_inode (struct inode *, struct writeback_control *); extern int ufs_sync_inode (struct inode *); extern void ufs_evict_inode (struct inode *); -extern struct buffer_head * ufs_bread (struct inode *, unsigned, int, int *); extern int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create); /* namei.c */ @@ -154,4 +155,7 @@ static inline u32 ufs_dtogd(struct ufs_sb_private_info * uspi, u64 b) return do_div(b, uspi->s_fpg); } +extern void lock_ufs(struct super_block *sb); +extern void unlock_ufs(struct super_block *sb); + #endif /* _UFS_UFS_H */ diff --git a/fs/ufs/util.c b/fs/ufs/util.c index d2c36d53fe6..95425b59ce0 100644 --- a/fs/ufs/util.c +++ b/fs/ufs/util.c @@ -27,7 +27,7 @@ struct ufs_buffer_head * _ubh_bread_ (struct ufs_sb_private_info * uspi, if (count > UFS_MAXFRAG) return NULL; ubh = (struct ufs_buffer_head *) - kmalloc (sizeof (struct ufs_buffer_head), GFP_KERNEL); + kmalloc (sizeof (struct ufs_buffer_head), GFP_NOFS); if (!ubh) return NULL; ubh->fragment = fragment; diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index ac1c7e8378d..f83a4c830a6 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -2022,11 +2022,12 @@ xfs_buf_init(void) if (!xfslogd_workqueue) goto out_free_buf_zone; - xfsdatad_workqueue = create_workqueue("xfsdatad"); + xfsdatad_workqueue = alloc_workqueue("xfsdatad", WQ_MEM_RECLAIM, 1); if (!xfsdatad_workqueue) goto out_destroy_xfslogd_workqueue; - xfsconvertd_workqueue = create_workqueue("xfsconvertd"); + xfsconvertd_workqueue = alloc_workqueue("xfsconvertd", + WQ_MEM_RECLAIM, 1); if (!xfsconvertd_workqueue) goto out_destroy_xfsdatad_workqueue; diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c index fc0114da7fd..f4f878fc008 100644 --- a/fs/xfs/linux-2.6/xfs_export.c +++ b/fs/xfs/linux-2.6/xfs_export.c @@ -89,8 +89,10 @@ xfs_fs_encode_fh( * seven combinations work. The real answer is "don't use v2". */ len = xfs_fileid_length(fileid_type); - if (*max_len < len) + if (*max_len < len) { + *max_len = len; return 255; + } *max_len = len; switch (fileid_type) { diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index bd5727852fd..9ff7fc603d2 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -102,7 +102,8 @@ xfs_mark_inode_dirty( STATIC int xfs_init_security( struct inode *inode, - struct inode *dir) + struct inode *dir, + const struct qstr *qstr) { struct xfs_inode *ip = XFS_I(inode); size_t length; @@ -110,7 +111,7 @@ xfs_init_security( unsigned char *name; int error; - error = security_inode_init_security(inode, dir, (char **)&name, + error = security_inode_init_security(inode, dir, qstr, (char **)&name, &value, &length); if (error) { if (error == -EOPNOTSUPP) @@ -194,7 +195,7 @@ xfs_vn_mknod( inode = VFS_I(ip); - error = xfs_init_security(inode, dir); + error = xfs_init_security(inode, dir, &dentry->d_name); if (unlikely(error)) goto out_cleanup_inode; @@ -367,7 +368,7 @@ xfs_vn_symlink( inode = VFS_I(cip); - error = xfs_init_security(inode, dir); + error = xfs_init_security(inode, dir, &dentry->d_name); if (unlikely(error)) goto out_cleanup_inode; diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c index edfa178bafb..4aff5639573 100644 --- a/fs/xfs/xfs_mru_cache.c +++ b/fs/xfs/xfs_mru_cache.c @@ -309,7 +309,7 @@ xfs_mru_cache_init(void) if (!xfs_mru_elem_zone) goto out; - xfs_mru_reap_wq = create_singlethread_workqueue("xfs_mru_cache"); + xfs_mru_reap_wq = alloc_workqueue("xfs_mru_cache", WQ_MEM_RECLAIM, 1); if (!xfs_mru_reap_wq) goto out_destroy_mru_elem_zone; |