diff options
author | Dmitry Torokhov <dtor@insightbb.com> | 2007-02-10 01:26:32 -0500 |
---|---|---|
committer | Dmitry Torokhov <dtor@insightbb.com> | 2007-02-10 01:26:32 -0500 |
commit | b22364c8eec89e6b0c081a237f3b6348df87796f (patch) | |
tree | 233a923281fb640106465d076997ff511efb6edf /fs | |
parent | 2c8dc071517ec2843869024dc82be2e246f41064 (diff) | |
parent | 66efc5a7e3061c3597ac43a8bb1026488d57e66b (diff) |
Merge rsync://rsync.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6
Diffstat (limited to 'fs')
339 files changed, 6349 insertions, 4297 deletions
diff --git a/fs/9p/error.c b/fs/9p/error.c index ae91555c155..0d7fa4e0881 100644 --- a/fs/9p/error.c +++ b/fs/9p/error.c @@ -83,6 +83,7 @@ int v9fs_errstr2errno(char *errstr, int len) if (errno == 0) { /* TODO: if error isn't found, add it dynamically */ + errstr[len] = 0; printk(KERN_ERR "%s: errstr :%s: not found\n", __FUNCTION__, errstr); errno = 1; diff --git a/fs/9p/fid.c b/fs/9p/fid.c index 27507201f9e..a9b6301a04f 100644 --- a/fs/9p/fid.c +++ b/fs/9p/fid.c @@ -25,6 +25,7 @@ #include <linux/fs.h> #include <linux/sched.h> #include <linux/idr.h> +#include <asm/semaphore.h> #include "debug.h" #include "v9fs.h" @@ -84,6 +85,7 @@ struct v9fs_fid *v9fs_fid_create(struct v9fs_session_info *v9ses, int fid) new->iounit = 0; new->rdir_pos = 0; new->rdir_fcall = NULL; + init_MUTEX(&new->lock); INIT_LIST_HEAD(&new->list); return new; @@ -102,11 +104,11 @@ void v9fs_fid_destroy(struct v9fs_fid *fid) } /** - * v9fs_fid_lookup - retrieve the right fid from a particular dentry + * v9fs_fid_lookup - return a locked fid from a dentry * @dentry: dentry to look for fid in - * @type: intent of lookup (operation or traversal) * - * find a fid in the dentry + * find a fid in the dentry, obtain its semaphore and return a reference to it. 
+ * code calling lookup is responsible for releasing lock * * TODO: only match fids that have the same uid as current user * @@ -124,7 +126,68 @@ struct v9fs_fid *v9fs_fid_lookup(struct dentry *dentry) if (!return_fid) { dprintk(DEBUG_ERROR, "Couldn't find a fid in dentry\n"); + return_fid = ERR_PTR(-EBADF); } + if(down_interruptible(&return_fid->lock)) + return ERR_PTR(-EINTR); + return return_fid; } + +/** + * v9fs_fid_clone - lookup the fid for a dentry, clone a private copy and release it + * @dentry: dentry to look for fid in + * + * find a fid in the dentry and then clone to a new private fid + * + * TODO: only match fids that have the same uid as current user + * + */ + +struct v9fs_fid *v9fs_fid_clone(struct dentry *dentry) +{ + struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dentry->d_inode); + struct v9fs_fid *base_fid, *new_fid = ERR_PTR(-EBADF); + struct v9fs_fcall *fcall = NULL; + int fid, err; + + base_fid = v9fs_fid_lookup(dentry); + + if(IS_ERR(base_fid)) + return base_fid; + + if(base_fid) { /* clone fid */ + fid = v9fs_get_idpool(&v9ses->fidpool); + if (fid < 0) { + eprintk(KERN_WARNING, "newfid fails!\n"); + new_fid = ERR_PTR(-ENOSPC); + goto Release_Fid; + } + + err = v9fs_t_walk(v9ses, base_fid->fid, fid, NULL, &fcall); + if (err < 0) { + dprintk(DEBUG_ERROR, "clone walk didn't work\n"); + v9fs_put_idpool(fid, &v9ses->fidpool); + new_fid = ERR_PTR(err); + goto Free_Fcall; + } + new_fid = v9fs_fid_create(v9ses, fid); + if (new_fid == NULL) { + dprintk(DEBUG_ERROR, "out of memory\n"); + new_fid = ERR_PTR(-ENOMEM); + } +Free_Fcall: + kfree(fcall); + } + +Release_Fid: + up(&base_fid->lock); + return new_fid; +} + +void v9fs_fid_clunk(struct v9fs_session_info *v9ses, struct v9fs_fid *fid) +{ + v9fs_t_clunk(v9ses, fid->fid); + v9fs_fid_destroy(fid); +} diff --git a/fs/9p/fid.h b/fs/9p/fid.h index aa974d6875c..48fc170c26c 100644 --- a/fs/9p/fid.h +++ b/fs/9p/fid.h @@ -30,6 +30,8 @@ struct v9fs_fid { struct list_head list; /* list of fids associated 
with a dentry */ struct list_head active; /* XXX - debug */ + struct semaphore lock; + u32 fid; unsigned char fidopen; /* set when fid is opened */ unsigned char fidclunked; /* set when fid has already been clunked */ @@ -55,3 +57,6 @@ struct v9fs_fid *v9fs_fid_get_created(struct dentry *); void v9fs_fid_destroy(struct v9fs_fid *fid); struct v9fs_fid *v9fs_fid_create(struct v9fs_session_info *, int fid); int v9fs_fid_insert(struct v9fs_fid *fid, struct dentry *dentry); +struct v9fs_fid *v9fs_fid_clone(struct dentry *dentry); +void v9fs_fid_clunk(struct v9fs_session_info *v9ses, struct v9fs_fid *fid); + diff --git a/fs/9p/mux.c b/fs/9p/mux.c index 944273c3dbf..147ceef8e53 100644 --- a/fs/9p/mux.c +++ b/fs/9p/mux.c @@ -132,8 +132,10 @@ int v9fs_mux_global_init(void) v9fs_mux_poll_tasks[i].task = NULL; v9fs_mux_wq = create_workqueue("v9fs"); - if (!v9fs_mux_wq) + if (!v9fs_mux_wq) { + printk(KERN_WARNING "v9fs: mux: creating workqueue failed\n"); return -ENOMEM; + } return 0; } diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 0b96fae8b47..d9b561ba5e5 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -457,14 +457,19 @@ static int __init init_v9fs(void) v9fs_error_init(); - printk(KERN_INFO "Installing v9fs 9P2000 file system support\n"); + printk(KERN_INFO "Installing v9fs 9p2000 file system support\n"); ret = v9fs_mux_global_init(); - if (!ret) + if (ret) { + printk(KERN_WARNING "v9fs: starting mux failed\n"); return ret; + } ret = register_filesystem(&v9fs_fs_type); - if (!ret) + if (ret) { + printk(KERN_WARNING "v9fs: registering file system failed\n"); v9fs_mux_global_exit(); + } + return ret; } diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index 9dfd259a70b..cc24abf232d 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -54,7 +54,7 @@ static int v9fs_vfs_readpage(struct file *filp, struct page *page) int retval = -EIO; loff_t offset = page_offset(page); int count = PAGE_CACHE_SIZE; - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = 
filp->f_path.dentry->d_inode; struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode); int rsize = v9ses->maxdata - V9FS_IOHDRSZ; struct v9fs_fid *v9f = filp->private_data; diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index 905c882f4e2..3129688143e 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -71,7 +71,7 @@ static inline int dt_type(struct v9fs_stat *mistat) static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) { struct v9fs_fcall *fcall = NULL; - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = filp->f_path.dentry->d_inode; struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode); struct v9fs_fid *file = filp->private_data; unsigned int i, n, s; @@ -80,7 +80,7 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) struct v9fs_stat stat; int over = 0; - dprintk(DEBUG_VFS, "name %s\n", filp->f_dentry->d_name.name); + dprintk(DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name); fid = file->fid; diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 79e6f9cd734..9f17b0cacdd 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -55,53 +55,22 @@ int v9fs_file_open(struct inode *inode, struct file *file) struct v9fs_fid *vfid; struct v9fs_fcall *fcall = NULL; int omode; - int fid = V9FS_NOFID; int err; dprintk(DEBUG_VFS, "inode: %p file: %p \n", inode, file); - vfid = v9fs_fid_lookup(file->f_dentry); - if (!vfid) { - dprintk(DEBUG_ERROR, "Couldn't resolve fid from dentry\n"); - return -EBADF; - } - - fid = v9fs_get_idpool(&v9ses->fidpool); - if (fid < 0) { - eprintk(KERN_WARNING, "newfid fails!\n"); - return -ENOSPC; - } + vfid = v9fs_fid_clone(file->f_path.dentry); + if (IS_ERR(vfid)) + return PTR_ERR(vfid); - err = v9fs_t_walk(v9ses, vfid->fid, fid, NULL, &fcall); - if (err < 0) { - dprintk(DEBUG_ERROR, "rewalk didn't work\n"); - if (fcall && fcall->id == RWALK) - goto clunk_fid; - else { - v9fs_put_idpool(fid, &v9ses->fidpool); - goto free_fcall; - } - } - kfree(fcall); - 
- /* TODO: do special things for O_EXCL, O_NOFOLLOW, O_SYNC */ - /* translate open mode appropriately */ omode = v9fs_uflags2omode(file->f_flags); - err = v9fs_t_open(v9ses, fid, omode, &fcall); + err = v9fs_t_open(v9ses, vfid->fid, omode, &fcall); if (err < 0) { PRINT_FCALL_ERROR("open failed", fcall); - goto clunk_fid; - } - - vfid = kmalloc(sizeof(struct v9fs_fid), GFP_KERNEL); - if (vfid == NULL) { - dprintk(DEBUG_ERROR, "out of memory\n"); - err = -ENOMEM; - goto clunk_fid; + goto Clunk_Fid; } file->private_data = vfid; - vfid->fid = fid; vfid->fidopen = 1; vfid->fidclunked = 0; vfid->iounit = fcall->params.ropen.iounit; @@ -112,10 +81,8 @@ int v9fs_file_open(struct inode *inode, struct file *file) return 0; -clunk_fid: - v9fs_t_clunk(v9ses, fid); - -free_fcall: +Clunk_Fid: + v9fs_fid_clunk(v9ses, vfid); kfree(fcall); return err; @@ -133,7 +100,7 @@ free_fcall: static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl) { int res = 0; - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = filp->f_path.dentry->d_inode; dprintk(DEBUG_VFS, "filp: %p lock: %p\n", filp, fl); @@ -161,7 +128,7 @@ static ssize_t v9fs_file_read(struct file *filp, char __user * data, size_t count, loff_t * offset) { - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = filp->f_path.dentry->d_inode; struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode); struct v9fs_fid *v9f = filp->private_data; struct v9fs_fcall *fcall = NULL; @@ -225,7 +192,7 @@ static ssize_t v9fs_file_write(struct file *filp, const char __user * data, size_t count, loff_t * offset) { - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = filp->f_path.dentry->d_inode; struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode); struct v9fs_fid *v9fid = filp->private_data; struct v9fs_fcall *fcall; diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 18f26cdfd88..378767c07bf 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -416,12 
+416,8 @@ static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir) sb = file_inode->i_sb; v9ses = v9fs_inode2v9ses(file_inode); v9fid = v9fs_fid_lookup(file); - - if (!v9fid) { - dprintk(DEBUG_ERROR, - "no v9fs_fid\n"); - return -EBADF; - } + if(IS_ERR(v9fid)) + return PTR_ERR(v9fid); fid = v9fid->fid; if (fid < 0) { @@ -433,11 +429,13 @@ static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir) result = v9fs_t_remove(v9ses, fid, &fcall); if (result < 0) { PRINT_FCALL_ERROR("remove fails", fcall); + goto Error; } v9fs_put_idpool(fid, &v9ses->fidpool); v9fs_fid_destroy(v9fid); +Error: kfree(fcall); return result; } @@ -473,9 +471,13 @@ v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode, inode = NULL; vfid = NULL; v9ses = v9fs_inode2v9ses(dir); - dfid = v9fs_fid_lookup(dentry->d_parent); - perm = unixmode2p9mode(v9ses, mode); + dfid = v9fs_fid_clone(dentry->d_parent); + if(IS_ERR(dfid)) { + err = PTR_ERR(dfid); + goto error; + } + perm = unixmode2p9mode(v9ses, mode); if (nd && nd->flags & LOOKUP_OPEN) flags = nd->intent.open.flags - 1; else @@ -485,9 +487,10 @@ v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode, perm, v9fs_uflags2omode(flags), NULL, &fid, &qid, &iounit); if (err) - goto error; + goto clunk_dfid; vfid = v9fs_clone_walk(v9ses, dfid->fid, dentry); + v9fs_fid_clunk(v9ses, dfid); if (IS_ERR(vfid)) { err = PTR_ERR(vfid); vfid = NULL; @@ -525,6 +528,9 @@ v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode, return 0; +clunk_dfid: + v9fs_fid_clunk(v9ses, dfid); + error: if (vfid) v9fs_fid_destroy(vfid); @@ -551,7 +557,12 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) inode = NULL; vfid = NULL; v9ses = v9fs_inode2v9ses(dir); - dfid = v9fs_fid_lookup(dentry->d_parent); + dfid = v9fs_fid_clone(dentry->d_parent); + if(IS_ERR(dfid)) { + err = PTR_ERR(dfid); + goto error; + } + perm = unixmode2p9mode(v9ses, mode | S_IFDIR); err = v9fs_create(v9ses, 
dfid->fid, (char *) dentry->d_name.name, @@ -559,26 +570,22 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) if (err) { dprintk(DEBUG_ERROR, "create error %d\n", err); - goto error; - } - - err = v9fs_t_clunk(v9ses, fid); - if (err) { - dprintk(DEBUG_ERROR, "clunk error %d\n", err); - goto error; + goto clean_up_dfid; } vfid = v9fs_clone_walk(v9ses, dfid->fid, dentry); if (IS_ERR(vfid)) { err = PTR_ERR(vfid); vfid = NULL; - goto error; + goto clean_up_dfid; } + v9fs_fid_clunk(v9ses, dfid); inode = v9fs_inode_from_fid(v9ses, vfid->fid, dir->i_sb); if (IS_ERR(inode)) { err = PTR_ERR(inode); inode = NULL; + v9fs_fid_destroy(vfid); goto error; } @@ -586,10 +593,10 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) d_instantiate(dentry, inode); return 0; -error: - if (vfid) - v9fs_fid_destroy(vfid); +clean_up_dfid: + v9fs_fid_clunk(v9ses, dfid); +error: return err; } @@ -622,28 +629,23 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, dentry->d_op = &v9fs_dentry_operations; dirfid = v9fs_fid_lookup(dentry->d_parent); - if (!dirfid) { - dprintk(DEBUG_ERROR, "no dirfid\n"); - return ERR_PTR(-EINVAL); - } + if(IS_ERR(dirfid)) + return ERR_PTR(PTR_ERR(dirfid)); dirfidnum = dirfid->fid; - if (dirfidnum < 0) { - dprintk(DEBUG_ERROR, "no dirfid for inode %p, #%lu\n", - dir, dir->i_ino); - return ERR_PTR(-EBADF); - } - newfid = v9fs_get_idpool(&v9ses->fidpool); if (newfid < 0) { eprintk(KERN_WARNING, "newfid fails!\n"); - return ERR_PTR(-ENOSPC); + result = -ENOSPC; + goto Release_Dirfid; } result = v9fs_t_walk(v9ses, dirfidnum, newfid, (char *)dentry->d_name.name, &fcall); + up(&dirfid->lock); + if (result < 0) { if (fcall && fcall->id == RWALK) v9fs_t_clunk(v9ses, newfid); @@ -701,8 +703,12 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, return NULL; - FreeFcall: +Release_Dirfid: + up(&dirfid->lock); + +FreeFcall: kfree(fcall); + return 
ERR_PTR(result); } @@ -746,10 +752,8 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *old_inode = old_dentry->d_inode; struct v9fs_session_info *v9ses = v9fs_inode2v9ses(old_inode); struct v9fs_fid *oldfid = v9fs_fid_lookup(old_dentry); - struct v9fs_fid *olddirfid = - v9fs_fid_lookup(old_dentry->d_parent); - struct v9fs_fid *newdirfid = - v9fs_fid_lookup(new_dentry->d_parent); + struct v9fs_fid *olddirfid; + struct v9fs_fid *newdirfid; struct v9fs_wstat wstat; struct v9fs_fcall *fcall = NULL; int fid = -1; @@ -759,16 +763,26 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, dprintk(DEBUG_VFS, "\n"); - if ((!oldfid) || (!olddirfid) || (!newdirfid)) { - dprintk(DEBUG_ERROR, "problem with arguments\n"); - return -EBADF; + if(IS_ERR(oldfid)) + return PTR_ERR(oldfid); + + olddirfid = v9fs_fid_clone(old_dentry->d_parent); + if(IS_ERR(olddirfid)) { + retval = PTR_ERR(olddirfid); + goto Release_lock; + } + + newdirfid = v9fs_fid_clone(new_dentry->d_parent); + if(IS_ERR(newdirfid)) { + retval = PTR_ERR(newdirfid); + goto Clunk_olddir; } /* 9P can only handle file rename in the same directory */ if (memcmp(&olddirfid->qid, &newdirfid->qid, sizeof(newdirfid->qid))) { dprintk(DEBUG_ERROR, "old dir and new dir are different\n"); - retval = -EPERM; - goto FreeFcallnBail; + retval = -EXDEV; + goto Clunk_newdir; } fid = oldfid->fid; @@ -779,7 +793,7 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, dprintk(DEBUG_ERROR, "no fid for old file #%lu\n", old_inode->i_ino); retval = -EBADF; - goto FreeFcallnBail; + goto Clunk_newdir; } v9fs_blank_wstat(&wstat); @@ -788,11 +802,20 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, retval = v9fs_t_wstat(v9ses, fid, &wstat, &fcall); - FreeFcallnBail: if (retval < 0) PRINT_FCALL_ERROR("wstat error", fcall); kfree(fcall); + +Clunk_newdir: + v9fs_fid_clunk(v9ses, newdirfid); + +Clunk_olddir: + v9fs_fid_clunk(v9ses, olddirfid); + +Release_lock: + 
up(&oldfid->lock); + return retval; } @@ -810,15 +833,12 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, { struct v9fs_fcall *fcall = NULL; struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dentry->d_inode); - struct v9fs_fid *fid = v9fs_fid_lookup(dentry); + struct v9fs_fid *fid = v9fs_fid_clone(dentry); int err = -EPERM; dprintk(DEBUG_VFS, "dentry: %p\n", dentry); - if (!fid) { - dprintk(DEBUG_ERROR, - "couldn't find fid associated with dentry\n"); - return -EBADF; - } + if(IS_ERR(fid)) + return PTR_ERR(fid); err = v9fs_t_stat(v9ses, fid->fid, &fcall); @@ -831,6 +851,7 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, } kfree(fcall); + v9fs_fid_clunk(v9ses, fid); return err; } @@ -844,18 +865,14 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr) { struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dentry->d_inode); - struct v9fs_fid *fid = v9fs_fid_lookup(dentry); + struct v9fs_fid *fid = v9fs_fid_clone(dentry); struct v9fs_fcall *fcall = NULL; struct v9fs_wstat wstat; int res = -EPERM; dprintk(DEBUG_VFS, "\n"); - - if (!fid) { - dprintk(DEBUG_ERROR, - "Couldn't find fid associated with dentry\n"); - return -EBADF; - } + if(IS_ERR(fid)) + return PTR_ERR(fid); v9fs_blank_wstat(&wstat); if (iattr->ia_valid & ATTR_MODE) @@ -887,6 +904,7 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr) if (res >= 0) res = inode_setattr(dentry->d_inode, iattr); + v9fs_fid_clunk(v9ses, fid); return res; } @@ -987,18 +1005,15 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen) struct v9fs_fcall *fcall = NULL; struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dentry->d_inode); - struct v9fs_fid *fid = v9fs_fid_lookup(dentry); + struct v9fs_fid *fid = v9fs_fid_clone(dentry); - if (!fid) { - dprintk(DEBUG_ERROR, "could not resolve fid from dentry\n"); - retval = -EBADF; - goto FreeFcall; - } + if(IS_ERR(fid)) + return 
PTR_ERR(fid); if (!v9ses->extended) { retval = -EBADF; dprintk(DEBUG_ERROR, "not extended\n"); - goto FreeFcall; + goto ClunkFid; } dprintk(DEBUG_VFS, " %s\n", dentry->d_name.name); @@ -1009,8 +1024,10 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen) goto FreeFcall; } - if (!fcall) - return -EIO; + if (!fcall) { + retval = -EIO; + goto ClunkFid; + } if (!(fcall->params.rstat.stat.mode & V9FS_DMSYMLINK)) { retval = -EINVAL; @@ -1028,9 +1045,12 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen) fcall->params.rstat.stat.extension.str, buffer); retval = buflen; - FreeFcall: +FreeFcall: kfree(fcall); +ClunkFid: + v9fs_fid_clunk(v9ses, fid); + return retval; } @@ -1123,52 +1143,58 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry, int err; u32 fid, perm; struct v9fs_session_info *v9ses; - struct v9fs_fid *dfid, *vfid; - struct inode *inode; + struct v9fs_fid *dfid, *vfid = NULL; + struct inode *inode = NULL; - inode = NULL; - vfid = NULL; v9ses = v9fs_inode2v9ses(dir); - dfid = v9fs_fid_lookup(dentry->d_parent); - perm = unixmode2p9mode(v9ses, mode); - if (!v9ses->extended) { dprintk(DEBUG_ERROR, "not extended\n"); return -EPERM; } + dfid = v9fs_fid_clone(dentry->d_parent); + if(IS_ERR(dfid)) { + err = PTR_ERR(dfid); + goto error; + } + + perm = unixmode2p9mode(v9ses, mode); + err = v9fs_create(v9ses, dfid->fid, (char *) dentry->d_name.name, perm, V9FS_OREAD, (char *) extension, &fid, NULL, NULL); if (err) - goto error; + goto clunk_dfid; err = v9fs_t_clunk(v9ses, fid); if (err) - goto error; + goto clunk_dfid; vfid = v9fs_clone_walk(v9ses, dfid->fid, dentry); if (IS_ERR(vfid)) { err = PTR_ERR(vfid); vfid = NULL; - goto error; + goto clunk_dfid; } inode = v9fs_inode_from_fid(v9ses, vfid->fid, dir->i_sb); if (IS_ERR(inode)) { err = PTR_ERR(inode); inode = NULL; - goto error; + goto free_vfid; } dentry->d_op = &v9fs_dentry_operations; d_instantiate(dentry, inode); return 0; -error: - if (vfid) - 
v9fs_fid_destroy(vfid); +free_vfid: + v9fs_fid_destroy(vfid); + +clunk_dfid: + v9fs_fid_clunk(v9ses, dfid); +error: return err; } @@ -1209,26 +1235,29 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { int retval; + struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dir); struct v9fs_fid *oldfid; char *name; dprintk(DEBUG_VFS, " %lu,%s,%s\n", dir->i_ino, dentry->d_name.name, old_dentry->d_name.name); - oldfid = v9fs_fid_lookup(old_dentry); - if (!oldfid) { - dprintk(DEBUG_ERROR, "can't find oldfid\n"); - return -EPERM; - } + oldfid = v9fs_fid_clone(old_dentry); + if(IS_ERR(oldfid)) + return PTR_ERR(oldfid); name = __getname(); - if (unlikely(!name)) - return -ENOMEM; + if (unlikely(!name)) { + retval = -ENOMEM; + goto clunk_fid; + } sprintf(name, "%d\n", oldfid->fid); retval = v9fs_vfs_mkspecial(dir, dentry, V9FS_DMLINK, name); __putname(name); +clunk_fid: + v9fs_fid_clunk(v9ses, oldfid); return retval; } diff --git a/fs/Kconfig b/fs/Kconfig index b3b5aa0edff..5e8e9d9ccb3 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -12,9 +12,7 @@ config EXT2_FS Ext2 is a standard Linux file system for hard disks. To compile this file system support as a module, choose M here: the - module will be called ext2. Be aware however that the file system - of your root partition (the one containing the directory /) cannot - be compiled as a module, and so this could be dangerous. + module will be called ext2. If unsure, say Y. @@ -98,9 +96,7 @@ config EXT3_FS (available at <http://sourceforge.net/projects/e2fsprogs/>). To compile this file system support as a module, choose M here: the - module will be called ext3. Be aware however that the file system - of your root partition (the one containing the directory /) cannot - be compiled as a module, and so this may be dangerous. + module will be called ext3. config EXT3_FS_XATTR bool "Ext3 extended attributes" @@ -163,9 +159,7 @@ config EXT4DEV_FS features will be added to ext4dev gradually. 
To compile this file system support as a module, choose M here. The - module will be called ext4dev. Be aware, however, that the filesystem - of your root partition (the one containing the directory /) cannot - be compiled as a module, and so this could be dangerous. + module will be called ext4dev. If unsure, say N. @@ -432,7 +426,6 @@ config OCFS2_FS select CONFIGFS_FS select JBD select CRC32 - select INET help OCFS2 is a general purpose extent based shared disk cluster file system with many similarities to ext3. It supports 64 bit inode @@ -1008,7 +1001,7 @@ config TMPFS_POSIX_ACL config HUGETLBFS bool "HugeTLB file system support" - depends X86 || IA64 || PPC64 || SPARC64 || SUPERH || BROKEN + depends on X86 || IA64 || PPC64 || SPARC64 || SUPERH || BROKEN help hugetlbfs is a filesystem backing for HugeTLB pages, based on ramfs. For architectures that support it, say Y here and read @@ -1204,13 +1197,16 @@ config EFS_FS config JFFS_FS tristate "Journalling Flash File System (JFFS) support" - depends on MTD && BLOCK + depends on MTD && BLOCK && BROKEN help JFFS is the Journalling Flash File System developed by Axis Communications in Sweden, aimed at providing a crash/powerdown-safe file system for disk-less embedded devices. Further information is available at (<http://developer.axis.com/software/jffs/>). + NOTE: This filesystem is deprecated and is scheduled for removal in + 2.6.21. 
See Documentation/feature-removal-schedule.txt + config JFFS_FS_VERBOSE int "JFFS debugging verbosity (0 = quiet, 3 = noisy)" depends on JFFS_FS diff --git a/fs/Makefile b/fs/Makefile index 9a5ce9323bf..b9ffa63f77f 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -10,7 +10,8 @@ obj-y := open.o read_write.o file_table.o super.o \ ioctl.o readdir.o select.o fifo.o locks.o dcache.o inode.o \ attr.o bad_inode.o file.o filesystems.o namespace.o aio.o \ seq_file.o xattr.o libfs.o fs-writeback.o \ - pnode.o drop_caches.o splice.o sync.o utimes.o + pnode.o drop_caches.o splice.o sync.o utimes.o \ + stack.o ifeq ($(CONFIG_BLOCK),y) obj-y += buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c index d3c7905b2dd..2b8903893d3 100644 --- a/fs/adfs/dir.c +++ b/fs/adfs/dir.c @@ -28,7 +28,7 @@ static DEFINE_RWLOCK(adfs_dir_lock); static int adfs_readdir(struct file *filp, void *dirent, filldir_t filldir) { - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = filp->f_path.dentry->d_inode; struct super_block *sb = inode->i_sb; struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir; struct object_info obj; diff --git a/fs/adfs/dir_f.c b/fs/adfs/dir_f.c index bbfc8625927..b9b2b27b68c 100644 --- a/fs/adfs/dir_f.c +++ b/fs/adfs/dir_f.c @@ -53,7 +53,7 @@ static inline int adfs_readname(char *buf, char *ptr, int maxlen) { char *old_buf = buf; - while (*ptr >= ' ' && maxlen--) { + while ((unsigned char)*ptr >= ' ' && maxlen--) { if (*ptr == '/') *buf++ = '.'; else diff --git a/fs/affs/dir.c b/fs/affs/dir.c index 5d9649fa181..cad3ee34006 100644 --- a/fs/affs/dir.c +++ b/fs/affs/dir.c @@ -41,7 +41,7 @@ struct inode_operations affs_dir_inode_operations = { static int affs_readdir(struct file *filp, void *dirent, filldir_t filldir) { - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = filp->f_path.dentry->d_inode; struct super_block *sb = inode->i_sb; struct buffer_head *dir_bh; struct buffer_head *fh_bh; @@ -71,7 +71,7 
@@ affs_readdir(struct file *filp, void *dirent, filldir_t filldir) stored++; } if (f_pos == 1) { - if (filldir(dirent, "..", 2, f_pos, parent_ino(filp->f_dentry), DT_DIR) < 0) + if (filldir(dirent, "..", 2, f_pos, parent_ino(filp->f_path.dentry), DT_DIR) < 0) return stored; filp->f_pos = f_pos = 2; stored++; diff --git a/fs/afs/dir.c b/fs/afs/dir.c index a6ec75c56fc..4acd0413405 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -392,10 +392,10 @@ static int afs_dir_readdir(struct file *file, void *cookie, filldir_t filldir) unsigned fpos; int ret; - _enter("{%Ld,{%lu}}", file->f_pos, file->f_dentry->d_inode->i_ino); + _enter("{%Ld,{%lu}}", file->f_pos, file->f_path.dentry->d_inode->i_ino); fpos = file->f_pos; - ret = afs_dir_iterate(file->f_dentry->d_inode, &fpos, cookie, filldir); + ret = afs_dir_iterate(file->f_path.dentry->d_inode, &fpos, cookie, filldir); file->f_pos = fpos; _leave(" = %d", ret); diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index 99785a79d04..8f74e845082 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -18,7 +18,7 @@ #include <linux/pagemap.h> #include <linux/mount.h> #include <linux/namei.h> -#include <linux/namespace.h> +#include <linux/mnt_namespace.h> #include "super.h" #include "cell.h" #include "volume.h" @@ -136,11 +136,11 @@ static int afs_mntpt_open(struct inode *inode, struct file *file) { kenter("%p,%p{%p{%s},%s}", inode, file, - file->f_dentry->d_parent, - file->f_dentry->d_parent ? - file->f_dentry->d_parent->d_name.name : + file->f_path.dentry->d_parent, + file->f_path.dentry->d_parent ? 
+ file->f_path.dentry->d_parent->d_name.name : (const unsigned char *) "", - file->f_dentry->d_name.name); + file->f_path.dentry->d_name.name); return -EREMOTE; } /* end afs_mntpt_open() */ @@ -298,17 +298,23 @@ static void wait_for_all_aios(struct kioctx *ctx) struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); + spin_lock_irq(&ctx->ctx_lock); if (!ctx->reqs_active) - return; + goto out; add_wait_queue(&ctx->wait, &wait); set_task_state(tsk, TASK_UNINTERRUPTIBLE); while (ctx->reqs_active) { + spin_unlock_irq(&ctx->ctx_lock); schedule(); set_task_state(tsk, TASK_UNINTERRUPTIBLE); + spin_lock_irq(&ctx->ctx_lock); } __set_task_state(tsk, TASK_RUNNING); remove_wait_queue(&ctx->wait, &wait); + +out: + spin_unlock_irq(&ctx->ctx_lock); } /* wait_on_sync_kiocb: @@ -424,7 +430,6 @@ static struct kiocb fastcall *__aio_get_req(struct kioctx *ctx) ring = kmap_atomic(ctx->ring_info.ring_pages[0], KM_USER0); if (ctx->reqs_active < aio_ring_avail(&ctx->ring_info, ring)) { list_add(&req->ki_list, &ctx->active_reqs); - get_ioctx(ctx); ctx->reqs_active++; okay = 1; } @@ -536,8 +541,6 @@ int fastcall aio_put_req(struct kiocb *req) spin_lock_irq(&ctx->ctx_lock); ret = __aio_put_req(ctx, req); spin_unlock_irq(&ctx->ctx_lock); - if (ret) - put_ioctx(ctx); return ret; } @@ -586,7 +589,7 @@ static void use_mm(struct mm_struct *mm) * Note that on UML this *requires* PF_BORROWED_MM to be set, otherwise * it won't work. Update it accordingly if you change it here */ - activate_mm(active_mm, mm); + switch_mm(active_mm, mm, tsk); task_unlock(tsk); mmdrop(active_mm); @@ -599,9 +602,6 @@ static void use_mm(struct mm_struct *mm) * by the calling kernel thread * (Note: this routine is intended to be called only * from a kernel thread context) - * - * Comments: Called with ctx->ctx_lock held. This nests - * task_lock instead ctx_lock. 
*/ static void unuse_mm(struct mm_struct *mm) { @@ -782,8 +782,7 @@ static int __aio_run_iocbs(struct kioctx *ctx) */ iocb->ki_users++; /* grab extra reference */ aio_run_iocb(iocb); - if (__aio_put_req(ctx, iocb)) /* drop extra ref */ - put_ioctx(ctx); + __aio_put_req(ctx, iocb); } if (!list_empty(&ctx->run_list)) return 1; @@ -850,14 +849,16 @@ static void aio_kick_handler(struct work_struct *work) { struct kioctx *ctx = container_of(work, struct kioctx, wq.work); mm_segment_t oldfs = get_fs(); + struct mm_struct *mm; int requeue; set_fs(USER_DS); use_mm(ctx->mm); spin_lock_irq(&ctx->ctx_lock); requeue =__aio_run_iocbs(ctx); - unuse_mm(ctx->mm); + mm = ctx->mm; spin_unlock_irq(&ctx->ctx_lock); + unuse_mm(mm); set_fs(oldfs); /* * we're in a worker thread already, don't use queue_delayed_work, @@ -998,14 +999,10 @@ put_rq: /* everything turned out well, dispose of the aiocb. */ ret = __aio_put_req(ctx, iocb); - spin_unlock_irqrestore(&ctx->ctx_lock, flags); - if (waitqueue_active(&ctx->wait)) wake_up(&ctx->wait); - if (ret) - put_ioctx(ctx); - + spin_unlock_irqrestore(&ctx->ctx_lock, flags); return ret; } diff --git a/fs/autofs/root.c b/fs/autofs/root.c index 368a1c33a3c..e698c51d2b0 100644 --- a/fs/autofs/root.c +++ b/fs/autofs/root.c @@ -45,7 +45,7 @@ static int autofs_root_readdir(struct file *filp, void *dirent, filldir_t filldi struct autofs_dir_ent *ent = NULL; struct autofs_dirhash *dirhash; struct autofs_sb_info *sbi; - struct inode * inode = filp->f_dentry->d_inode; + struct inode * inode = filp->f_path.dentry->d_inode; off_t onr, nr; lock_kernel(); @@ -557,7 +557,7 @@ static int autofs_root_ioctl(struct inode *inode, struct file *filp, case AUTOFS_IOC_SETTIMEOUT: return autofs_get_set_timeout(sbi, argp); case AUTOFS_IOC_EXPIRE: - return autofs_expire_run(inode->i_sb, sbi, filp->f_vfsmnt, + return autofs_expire_run(inode->i_sb, sbi, filp->f_path.mnt, argp); default: return -ENOSYS; diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index 
b13f32c8aee..216b1a364cc 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -150,7 +150,8 @@ static inline int autofs4_ispending(struct dentry *dentry) static inline void autofs4_copy_atime(struct file *src, struct file *dst) { - dst->f_dentry->d_inode->i_atime = src->f_dentry->d_inode->i_atime; + dst->f_path.dentry->d_inode->i_atime = + src->f_path.dentry->d_inode->i_atime; return; } diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index 9c48250fd72..e8f6c5ad3e9 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -313,7 +313,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) struct autofs_sb_info *sbi; struct autofs_info *ino; - sbi = (struct autofs_sb_info *) kmalloc(sizeof(*sbi), GFP_KERNEL); + sbi = kmalloc(sizeof(*sbi), GFP_KERNEL); if ( !sbi ) goto fail_unlock; DPRINTK("starting up, sbi = %p",sbi); diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index c1493524da4..8d05b9f7578 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -74,7 +74,7 @@ struct inode_operations autofs4_dir_inode_operations = { static int autofs4_root_readdir(struct file *file, void *dirent, filldir_t filldir) { - struct autofs_sb_info *sbi = autofs4_sbi(file->f_dentry->d_sb); + struct autofs_sb_info *sbi = autofs4_sbi(file->f_path.dentry->d_sb); int oz_mode = autofs4_oz_mode(sbi); DPRINTK("called, filp->f_pos = %lld", file->f_pos); @@ -95,8 +95,8 @@ static int autofs4_root_readdir(struct file *file, void *dirent, static int autofs4_dir_open(struct inode *inode, struct file *file) { - struct dentry *dentry = file->f_dentry; - struct vfsmount *mnt = file->f_vfsmnt; + struct dentry *dentry = file->f_path.dentry; + struct vfsmount *mnt = file->f_path.mnt; struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); struct dentry *cursor; int status; @@ -172,7 +172,7 @@ out: static int autofs4_dir_close(struct inode *inode, struct file *file) { - struct dentry *dentry = file->f_dentry; + struct dentry *dentry = file->f_path.dentry; struct 
autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); struct dentry *cursor = file->private_data; int status = 0; @@ -204,7 +204,7 @@ out: static int autofs4_dir_readdir(struct file *file, void *dirent, filldir_t filldir) { - struct dentry *dentry = file->f_dentry; + struct dentry *dentry = file->f_path.dentry; struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); struct dentry *cursor = file->private_data; int status; @@ -858,14 +858,14 @@ static int autofs4_root_ioctl(struct inode *inode, struct file *filp, return autofs4_ask_reghost(sbi, p); case AUTOFS_IOC_ASKUMOUNT: - return autofs4_ask_umount(filp->f_vfsmnt, p); + return autofs4_ask_umount(filp->f_path.mnt, p); /* return a single thing to expire */ case AUTOFS_IOC_EXPIRE: - return autofs4_expire_run(inode->i_sb,filp->f_vfsmnt,sbi, p); + return autofs4_expire_run(inode->i_sb,filp->f_path.mnt,sbi, p); /* same as above, but can send multiple expires through pipe */ case AUTOFS_IOC_EXPIRE_MULTI: - return autofs4_expire_multi(inode->i_sb,filp->f_vfsmnt,sbi, p); + return autofs4_expire_multi(inode->i_sb,filp->f_path.mnt,sbi, p); default: return -ENOSYS; diff --git a/fs/bad_inode.c b/fs/bad_inode.c index 34e6d7b220c..869f5193ecc 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -14,59 +14,307 @@ #include <linux/time.h> #include <linux/smp_lock.h> #include <linux/namei.h> +#include <linux/poll.h> -static int return_EIO(void) + +static loff_t bad_file_llseek(struct file *file, loff_t offset, int origin) +{ + return -EIO; +} + +static ssize_t bad_file_read(struct file *filp, char __user *buf, + size_t size, loff_t *ppos) +{ + return -EIO; +} + +static ssize_t bad_file_write(struct file *filp, const char __user *buf, + size_t siz, loff_t *ppos) +{ + return -EIO; +} + +static ssize_t bad_file_aio_read(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + return -EIO; +} + +static ssize_t bad_file_aio_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ 
+ return -EIO; +} + +static int bad_file_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + return -EIO; +} + +static unsigned int bad_file_poll(struct file *filp, poll_table *wait) +{ + return POLLERR; +} + +static int bad_file_ioctl (struct inode *inode, struct file *filp, + unsigned int cmd, unsigned long arg) +{ + return -EIO; +} + +static long bad_file_unlocked_ioctl(struct file *file, unsigned cmd, + unsigned long arg) +{ + return -EIO; +} + +static long bad_file_compat_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + return -EIO; +} + +static int bad_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + return -EIO; +} + +static int bad_file_open(struct inode *inode, struct file *filp) +{ + return -EIO; +} + +static int bad_file_flush(struct file *file, fl_owner_t id) +{ + return -EIO; +} + +static int bad_file_release(struct inode *inode, struct file *filp) +{ + return -EIO; +} + +static int bad_file_fsync(struct file *file, struct dentry *dentry, + int datasync) +{ + return -EIO; +} + +static int bad_file_aio_fsync(struct kiocb *iocb, int datasync) +{ + return -EIO; +} + +static int bad_file_fasync(int fd, struct file *filp, int on) +{ + return -EIO; +} + +static int bad_file_lock(struct file *file, int cmd, struct file_lock *fl) +{ + return -EIO; +} + +static ssize_t bad_file_sendfile(struct file *in_file, loff_t *ppos, + size_t count, read_actor_t actor, void *target) +{ + return -EIO; +} + +static ssize_t bad_file_sendpage(struct file *file, struct page *page, + int off, size_t len, loff_t *pos, int more) +{ + return -EIO; +} + +static unsigned long bad_file_get_unmapped_area(struct file *file, + unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags) +{ + return -EIO; +} + +static int bad_file_check_flags(int flags) { return -EIO; } -#define EIO_ERROR ((void *) (return_EIO)) +static int bad_file_dir_notify(struct file *file, unsigned long arg) +{ + return -EIO; +} + +static int 
bad_file_flock(struct file *filp, int cmd, struct file_lock *fl) +{ + return -EIO; +} + +static ssize_t bad_file_splice_write(struct pipe_inode_info *pipe, + struct file *out, loff_t *ppos, size_t len, + unsigned int flags) +{ + return -EIO; +} + +static ssize_t bad_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + return -EIO; +} static const struct file_operations bad_file_ops = { - .llseek = EIO_ERROR, - .aio_read = EIO_ERROR, - .read = EIO_ERROR, - .write = EIO_ERROR, - .aio_write = EIO_ERROR, - .readdir = EIO_ERROR, - .poll = EIO_ERROR, - .ioctl = EIO_ERROR, - .mmap = EIO_ERROR, - .open = EIO_ERROR, - .flush = EIO_ERROR, - .release = EIO_ERROR, - .fsync = EIO_ERROR, - .aio_fsync = EIO_ERROR, - .fasync = EIO_ERROR, - .lock = EIO_ERROR, - .sendfile = EIO_ERROR, - .sendpage = EIO_ERROR, - .get_unmapped_area = EIO_ERROR, + .llseek = bad_file_llseek, + .read = bad_file_read, + .write = bad_file_write, + .aio_read = bad_file_aio_read, + .aio_write = bad_file_aio_write, + .readdir = bad_file_readdir, + .poll = bad_file_poll, + .ioctl = bad_file_ioctl, + .unlocked_ioctl = bad_file_unlocked_ioctl, + .compat_ioctl = bad_file_compat_ioctl, + .mmap = bad_file_mmap, + .open = bad_file_open, + .flush = bad_file_flush, + .release = bad_file_release, + .fsync = bad_file_fsync, + .aio_fsync = bad_file_aio_fsync, + .fasync = bad_file_fasync, + .lock = bad_file_lock, + .sendfile = bad_file_sendfile, + .sendpage = bad_file_sendpage, + .get_unmapped_area = bad_file_get_unmapped_area, + .check_flags = bad_file_check_flags, + .dir_notify = bad_file_dir_notify, + .flock = bad_file_flock, + .splice_write = bad_file_splice_write, + .splice_read = bad_file_splice_read, }; +static int bad_inode_create (struct inode *dir, struct dentry *dentry, + int mode, struct nameidata *nd) +{ + return -EIO; +} + +static struct dentry *bad_inode_lookup(struct inode *dir, + struct dentry *dentry, struct nameidata *nd) +{ + return 
ERR_PTR(-EIO); +} + +static int bad_inode_link (struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + return -EIO; +} + +static int bad_inode_unlink(struct inode *dir, struct dentry *dentry) +{ + return -EIO; +} + +static int bad_inode_symlink (struct inode *dir, struct dentry *dentry, + const char *symname) +{ + return -EIO; +} + +static int bad_inode_mkdir(struct inode *dir, struct dentry *dentry, + int mode) +{ + return -EIO; +} + +static int bad_inode_rmdir (struct inode *dir, struct dentry *dentry) +{ + return -EIO; +} + +static int bad_inode_mknod (struct inode *dir, struct dentry *dentry, + int mode, dev_t rdev) +{ + return -EIO; +} + +static int bad_inode_rename (struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + return -EIO; +} + +static int bad_inode_readlink(struct dentry *dentry, char __user *buffer, + int buflen) +{ + return -EIO; +} + +static int bad_inode_permission(struct inode *inode, int mask, + struct nameidata *nd) +{ + return -EIO; +} + +static int bad_inode_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + return -EIO; +} + +static int bad_inode_setattr(struct dentry *direntry, struct iattr *attrs) +{ + return -EIO; +} + +static int bad_inode_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + return -EIO; +} + +static ssize_t bad_inode_getxattr(struct dentry *dentry, const char *name, + void *buffer, size_t size) +{ + return -EIO; +} + +static ssize_t bad_inode_listxattr(struct dentry *dentry, char *buffer, + size_t buffer_size) +{ + return -EIO; +} + +static int bad_inode_removexattr(struct dentry *dentry, const char *name) +{ + return -EIO; +} + static struct inode_operations bad_inode_ops = { - .create = EIO_ERROR, - .lookup = EIO_ERROR, - .link = EIO_ERROR, - .unlink = EIO_ERROR, - .symlink = EIO_ERROR, - .mkdir = EIO_ERROR, - .rmdir = EIO_ERROR, - .mknod = EIO_ERROR, - .rename = 
EIO_ERROR, - .readlink = EIO_ERROR, + .create = bad_inode_create, + .lookup = bad_inode_lookup, + .link = bad_inode_link, + .unlink = bad_inode_unlink, + .symlink = bad_inode_symlink, + .mkdir = bad_inode_mkdir, + .rmdir = bad_inode_rmdir, + .mknod = bad_inode_mknod, + .rename = bad_inode_rename, + .readlink = bad_inode_readlink, /* follow_link must be no-op, otherwise unmounting this inode won't work */ - .truncate = EIO_ERROR, - .permission = EIO_ERROR, - .getattr = EIO_ERROR, - .setattr = EIO_ERROR, - .setxattr = EIO_ERROR, - .getxattr = EIO_ERROR, - .listxattr = EIO_ERROR, - .removexattr = EIO_ERROR, + /* put_link returns void */ + /* truncate returns void */ + .permission = bad_inode_permission, + .getattr = bad_inode_getattr, + .setattr = bad_inode_setattr, + .setxattr = bad_inode_setxattr, + .getxattr = bad_inode_getxattr, + .listxattr = bad_inode_listxattr, + .removexattr = bad_inode_removexattr, + /* truncate_range returns void */ }; @@ -88,7 +336,7 @@ static struct inode_operations bad_inode_ops = * on it to fail from this point on. */ -void make_bad_inode(struct inode * inode) +void make_bad_inode(struct inode *inode) { remove_inode_hash(inode); @@ -113,7 +361,7 @@ EXPORT_SYMBOL(make_bad_inode); * Returns true if the inode in question has been marked as bad. 
*/ -int is_bad_inode(struct inode * inode) +int is_bad_inode(struct inode *inode) { return (inode->i_op == &bad_inode_ops); } diff --git a/fs/befs/btree.c b/fs/befs/btree.c index 81b042ee24e..af5bb93276f 100644 --- a/fs/befs/btree.c +++ b/fs/befs/btree.c @@ -260,7 +260,7 @@ befs_btree_find(struct super_block *sb, befs_data_stream * ds, goto error; } - this_node = (befs_btree_node *) kmalloc(sizeof (befs_btree_node), + this_node = kmalloc(sizeof (befs_btree_node), GFP_NOFS); if (!this_node) { befs_error(sb, "befs_btree_find() failed to allocate %u " diff --git a/fs/befs/debug.c b/fs/befs/debug.c index e831a8f3084..b8e304a0661 100644 --- a/fs/befs/debug.c +++ b/fs/befs/debug.c @@ -28,7 +28,7 @@ void befs_error(const struct super_block *sb, const char *fmt, ...) { va_list args; - char *err_buf = (char *) kmalloc(ERRBUFSIZE, GFP_KERNEL); + char *err_buf = kmalloc(ERRBUFSIZE, GFP_KERNEL); if (err_buf == NULL) { printk(KERN_ERR "could not allocate %d bytes\n", ERRBUFSIZE); return; @@ -46,7 +46,7 @@ void befs_warning(const struct super_block *sb, const char *fmt, ...) { va_list args; - char *err_buf = (char *) kmalloc(ERRBUFSIZE, GFP_KERNEL); + char *err_buf = kmalloc(ERRBUFSIZE, GFP_KERNEL); if (err_buf == NULL) { printk(KERN_ERR "could not allocate %d bytes\n", ERRBUFSIZE); return; @@ -70,7 +70,7 @@ befs_debug(const struct super_block *sb, const char *fmt, ...) 
char *err_buf = NULL; if (BEFS_SB(sb)->mount_opts.debug) { - err_buf = (char *) kmalloc(ERRBUFSIZE, GFP_KERNEL); + err_buf = kmalloc(ERRBUFSIZE, GFP_KERNEL); if (err_buf == NULL) { printk(KERN_ERR "could not allocate %d bytes\n", ERRBUFSIZE); diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index bce402eee55..481e59b9d91 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -212,7 +212,7 @@ befs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) static int befs_readdir(struct file *filp, void *dirent, filldir_t filldir) { - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = filp->f_path.dentry->d_inode; struct super_block *sb = inode->i_sb; befs_data_stream *ds = &BEFS_I(inode)->i_data.ds; befs_off_t value; @@ -222,7 +222,7 @@ befs_readdir(struct file *filp, void *dirent, filldir_t filldir) char keybuf[BEFS_NAME_LEN + 1]; char *nlsname; int nlsnamelen; - const char *dirname = filp->f_dentry->d_name.name; + const char *dirname = filp->f_path.dentry->d_name.name; befs_debug(sb, "---> befs_readdir() " "name %s, inode %ld, filp->f_pos %Ld", diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c index a650f1d0b85..2a746e688df 100644 --- a/fs/bfs/dir.c +++ b/fs/bfs/dir.c @@ -27,7 +27,7 @@ static struct buffer_head * bfs_find_entry(struct inode * dir, static int bfs_readdir(struct file * f, void * dirent, filldir_t filldir) { - struct inode * dir = f->f_dentry->d_inode; + struct inode * dir = f->f_path.dentry->d_inode; struct buffer_head * bh; struct bfs_dirent * de; unsigned int offset; diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index eac175ed9f4..134c99941a6 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -1,7 +1,7 @@ /* * fs/bfs/inode.c * BFS superblock and inode operations. - * Copyright (C) 1999,2000 Tigran Aivazian <tigran@veritas.com> + * Copyright (C) 1999-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk> * From fs/minix, Copyright (C) 1991, 1992 Linus Torvalds. 
* * Made endianness-clean by Andrew Stribblehill <ads@wompom.org>, 2005. @@ -18,7 +18,7 @@ #include <asm/uaccess.h> #include "bfs.h" -MODULE_AUTHOR("Tigran A. Aivazian <tigran@veritas.com>"); +MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>"); MODULE_DESCRIPTION("SCO UnixWare BFS filesystem for Linux"); MODULE_LICENSE("GPL"); diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index 517e111bb7e..813a887cd2b 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -274,7 +274,7 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != OMAGIC && N_MAGIC(ex) != QMAGIC && N_MAGIC(ex) != NMAGIC) || N_TRSIZE(ex) || N_DRSIZE(ex) || - i_size_read(bprm->file->f_dentry->d_inode) < ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) { + i_size_read(bprm->file->f_path.dentry->d_inode) < ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) { return -ENOEXEC; } @@ -389,7 +389,7 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) { printk(KERN_WARNING "fd_offset is not page aligned. Please convert program: %s\n", - bprm->file->f_dentry->d_name.name); + bprm->file->f_path.dentry->d_name.name); error_time = jiffies; } @@ -469,7 +469,7 @@ static int load_aout_library(struct file *file) int retval; struct exec ex; - inode = file->f_dentry->d_inode; + inode = file->f_path.dentry->d_inode; retval = -ENOEXEC; error = kernel_read(file, 0, (char *) &ex, sizeof(ex)); @@ -506,7 +506,7 @@ static int load_aout_library(struct file *file) { printk(KERN_WARNING "N_TXTOFF is not page aligned. 
Please convert library: %s\n", - file->f_dentry->d_name.name); + file->f_path.dentry->d_name.name); error_time = jiffies; } down_write(&current->mm->mmap_sem); diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index be5869d3499..669dbe5b031 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -682,6 +682,15 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) retval = PTR_ERR(interpreter); if (IS_ERR(interpreter)) goto out_free_interp; + + /* + * If the binary is not readable then enforce + * mm->dumpable = 0 regardless of the interpreter's + * permissions. + */ + if (file_permission(interpreter, MAY_READ) < 0) + bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP; + retval = kernel_read(interpreter, 0, bprm->buf, BINPRM_BUF_SIZE); if (retval != BINPRM_BUF_SIZE) { @@ -854,13 +863,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) * default mmap base, as well as whatever program they * might try to exec. This is because the brk will * follow the loader, and is not movable. */ - if (current->flags & PF_RANDOMIZE) - load_bias = randomize_range(0x10000, - ELF_ET_DYN_BASE, - 0); - else - load_bias = ELF_ET_DYN_BASE; - load_bias = ELF_PAGESTART(load_bias - vaddr); + load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr); } error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt, @@ -1184,13 +1187,17 @@ static int dump_seek(struct file *file, loff_t off) */ static int maydump(struct vm_area_struct *vma) { + /* The vma can be set up to tell us the answer directly. */ + if (vma->vm_flags & VM_ALWAYSDUMP) + return 1; + /* Do not dump I/O mapped devices or special mappings */ if (vma->vm_flags & (VM_IO | VM_RESERVED)) return 0; /* Dump shared memory only if mapped from an anonymous file. 
*/ if (vma->vm_flags & VM_SHARED) - return vma->vm_file->f_dentry->d_inode->i_nlink == 0; + return vma->vm_file->f_path.dentry->d_inode->i_nlink == 0; /* If it hasn't been written to, don't write it out */ if (!vma->anon_vma) @@ -1317,7 +1324,7 @@ static void fill_prstatus(struct elf_prstatus *prstatus, prstatus->pr_pid = p->pid; prstatus->pr_ppid = p->parent->pid; prstatus->pr_pgrp = process_group(p); - prstatus->pr_sid = p->signal->session; + prstatus->pr_sid = process_session(p); if (thread_group_leader(p)) { /* * This is the record for the group leader. Add in the @@ -1363,7 +1370,7 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, psinfo->pr_pid = p->pid; psinfo->pr_ppid = p->parent->pid; psinfo->pr_pgrp = process_group(p); - psinfo->pr_sid = p->signal->session; + psinfo->pr_sid = process_session(p); i = p->state ? ffz(~p->state) + 1 : 0; psinfo->pr_state = i; @@ -1430,6 +1437,32 @@ static int elf_dump_thread_status(long signr, struct elf_thread_status *t) return sz; } +static struct vm_area_struct *first_vma(struct task_struct *tsk, + struct vm_area_struct *gate_vma) +{ + struct vm_area_struct *ret = tsk->mm->mmap; + + if (ret) + return ret; + return gate_vma; +} +/* + * Helper function for iterating across a vma list. It ensures that the caller + * will visit `gate_vma' prior to terminating the search. 
+ */ +static struct vm_area_struct *next_vma(struct vm_area_struct *this_vma, + struct vm_area_struct *gate_vma) +{ + struct vm_area_struct *ret; + + ret = this_vma->vm_next; + if (ret) + return ret; + if (this_vma == gate_vma) + return NULL; + return gate_vma; +} + /* * Actual dumper * @@ -1445,7 +1478,7 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file) int segs; size_t size = 0; int i; - struct vm_area_struct *vma; + struct vm_area_struct *vma, *gate_vma; struct elfhdr *elf = NULL; loff_t offset = 0, dataoff, foffset; unsigned long limit = current->signal->rlim[RLIMIT_CORE].rlim_cur; @@ -1531,6 +1564,10 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file) segs += ELF_CORE_EXTRA_PHDRS; #endif + gate_vma = get_gate_vma(current); + if (gate_vma != NULL) + segs++; + /* Set up header */ fill_elf_header(elf, segs + 1); /* including notes section */ @@ -1598,7 +1635,8 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file) dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE); /* Write program headers for segments dump */ - for (vma = current->mm->mmap; vma != NULL; vma = vma->vm_next) { + for (vma = first_vma(current, gate_vma); vma != NULL; + vma = next_vma(vma, gate_vma)) { struct elf_phdr phdr; size_t sz; @@ -1647,7 +1685,8 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file) /* Align to page */ DUMP_SEEK(dataoff - foffset); - for (vma = current->mm->mmap; vma != NULL; vma = vma->vm_next) { + for (vma = first_vma(current, gate_vma); vma != NULL; + vma = next_vma(vma, gate_vma)) { unsigned long addr; if (!maydump(vma)) diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index ed9a61c6beb..a4d933a5120 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -234,6 +234,14 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, goto error; } + /* + * If the binary is not readable then enforce + * mm->dumpable = 0 regardless of the 
interpreter's + * permissions. + */ + if (file_permission(interpreter, MAY_READ) < 0) + bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP; + retval = kernel_read(interpreter, 0, bprm->buf, BINPRM_BUF_SIZE); if (retval < 0) @@ -706,12 +714,11 @@ static int elf_fdpic_map_file(struct elf_fdpic_params *params, return -ELIBBAD; size = sizeof(*loadmap) + nloads * sizeof(*seg); - loadmap = kmalloc(size, GFP_KERNEL); + loadmap = kzalloc(size, GFP_KERNEL); if (!loadmap) return -ENOMEM; params->loadmap = loadmap; - memset(loadmap, 0, size); loadmap->version = ELF32_FDPIC_LOADMAP_VERSION; loadmap->nsegs = nloads; @@ -855,7 +862,7 @@ static int elf_fdpic_map_file(struct elf_fdpic_params *params, dynamic_error: printk("ELF FDPIC %s with invalid DYNAMIC section (inode=%lu)\n", - what, file->f_dentry->d_inode->i_ino); + what, file->f_path.dentry->d_inode->i_ino); return -ELIBBAD; } @@ -1186,7 +1193,7 @@ static int maydump(struct vm_area_struct *vma) /* Dump shared memory only if mapped from an anonymous file. */ if (vma->vm_flags & VM_SHARED) { - if (vma->vm_file->f_dentry->d_inode->i_nlink == 0) { + if (vma->vm_file->f_path.dentry->d_inode->i_nlink == 0) { kdcore("%08lx: %08lx: no (share)", vma->vm_start, vma->vm_flags); return 1; } @@ -1322,7 +1329,7 @@ static void fill_prstatus(struct elf_prstatus *prstatus, prstatus->pr_pid = p->pid; prstatus->pr_ppid = p->parent->pid; prstatus->pr_pgrp = process_group(p); - prstatus->pr_sid = p->signal->session; + prstatus->pr_sid = process_session(p); if (thread_group_leader(p)) { /* * This is the record for the group leader. Add in the @@ -1371,7 +1378,7 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, psinfo->pr_pid = p->pid; psinfo->pr_ppid = p->parent->pid; psinfo->pr_pgrp = process_group(p); - psinfo->pr_sid = p->signal->session; + psinfo->pr_sid = process_session(p); i = p->state ? 
ffz(~p->state) + 1 : 0; psinfo->pr_state = i; diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index a62fd4018a2..7b0265d7f3a 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -419,7 +419,7 @@ static int load_flat_file(struct linux_binprm * bprm, unsigned long textpos = 0, datapos = 0, result; unsigned long realdatastart = 0; unsigned long text_len, data_len, bss_len, stack_len, flags; - unsigned long memp = 0; /* for finding the brk area */ + unsigned long len, reallen, memp = 0; unsigned long extra, rlim; unsigned long *reloc = 0, *rp; struct inode *inode; @@ -429,7 +429,7 @@ static int load_flat_file(struct linux_binprm * bprm, int ret; hdr = ((struct flat_hdr *) bprm->buf); /* exec-header */ - inode = bprm->file->f_dentry->d_inode; + inode = bprm->file->f_path.dentry->d_inode; text_len = ntohl(hdr->data_start); data_len = ntohl(hdr->data_end) - ntohl(hdr->data_start); @@ -540,10 +540,18 @@ static int load_flat_file(struct linux_binprm * bprm, goto err; } + len = data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long); down_write(&current->mm->mmap_sem); - realdatastart = do_mmap(0, 0, data_len + extra + - MAX_SHARED_LIBS * sizeof(unsigned long), - PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, 0); + realdatastart = do_mmap(0, 0, len, + PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, 0); + /* Remap to use all availabe slack region space */ + if (realdatastart && (realdatastart < (unsigned long)-4096)) { + reallen = ksize(realdatastart); + if (reallen > len) { + realdatastart = do_mremap(realdatastart, len, + reallen, MREMAP_FIXED, realdatastart); + } + } up_write(&current->mm->mmap_sem); if (realdatastart == 0 || realdatastart >= (unsigned long)-4096) { @@ -584,11 +592,20 @@ static int load_flat_file(struct linux_binprm * bprm, } else { + len = text_len + data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long); down_write(&current->mm->mmap_sem); - textpos = do_mmap(0, 0, text_len + data_len + extra + - MAX_SHARED_LIBS * sizeof(unsigned long), - PROT_READ | PROT_EXEC | 
PROT_WRITE, MAP_PRIVATE, 0); + textpos = do_mmap(0, 0, len, + PROT_READ | PROT_EXEC | PROT_WRITE, MAP_PRIVATE, 0); + /* Remap to use all availabe slack region space */ + if (textpos && (textpos < (unsigned long) -4096)) { + reallen = ksize(textpos); + if (reallen > len) { + textpos = do_mremap(textpos, len, reallen, + MREMAP_FIXED, textpos); + } + } up_write(&current->mm->mmap_sem); + if (!textpos || textpos >= (unsigned long) -4096) { if (!textpos) textpos = (unsigned long) -ENOMEM; diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 1713c48fef5..c2e08252af3 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -311,7 +311,7 @@ static Node *create_entry(const char __user *buffer, size_t count) err = -ENOMEM; memsize = sizeof(Node) + count + 8; - e = (Node *) kmalloc(memsize, GFP_USER); + e = kmalloc(memsize, GFP_USER); if (!e) goto out; @@ -542,7 +542,7 @@ static void kill_node(Node *e) static ssize_t bm_entry_read(struct file * file, char __user * buf, size_t nbytes, loff_t *ppos) { - Node *e = file->f_dentry->d_inode->i_private; + Node *e = file->f_path.dentry->d_inode->i_private; loff_t pos = *ppos; ssize_t res; char *page; @@ -576,7 +576,7 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { struct dentry *root; - Node *e = file->f_dentry->d_inode->i_private; + Node *e = file->f_path.dentry->d_inode->i_private; int res = parse_command(buffer, count); switch (res) { @@ -584,7 +584,7 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer, break; case 2: set_bit(Enabled, &e->flags); break; - case 3: root = dget(file->f_vfsmnt->mnt_sb->s_root); + case 3: root = dget(file->f_path.mnt->mnt_sb->s_root); mutex_lock(&root->d_inode->i_mutex); kill_node(e); @@ -610,7 +610,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer, Node *e; struct inode *inode; struct dentry *root, *dentry; - struct super_block *sb = file->f_vfsmnt->mnt_sb; + struct super_block *sb = 
file->f_path.mnt->mnt_sb; int err = 0; e = create_entry(buffer, count); @@ -699,7 +699,7 @@ static ssize_t bm_status_write(struct file * file, const char __user * buffer, switch (res) { case 1: enabled = 0; break; case 2: enabled = 1; break; - case 3: root = dget(file->f_vfsmnt->mnt_sb->s_root); + case 3: root = dget(file->f_path.mnt->mnt_sb->s_root); mutex_lock(&root->d_inode->i_mutex); while (!list_empty(&entries)) @@ -916,7 +916,7 @@ void bio_set_pages_dirty(struct bio *bio) } } -static void bio_release_pages(struct bio *bio) +void bio_release_pages(struct bio *bio) { struct bio_vec *bvec = bio->bi_io_vec; int i; diff --git a/fs/block_dev.c b/fs/block_dev.c index 13816b4d76f..fc7028b685f 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -168,6 +168,203 @@ blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, iov, offset, nr_segs, blkdev_get_blocks, NULL); } +#if 0 +static int blk_end_aio(struct bio *bio, unsigned int bytes_done, int error) +{ + struct kiocb *iocb = bio->bi_private; + atomic_t *bio_count = &iocb->ki_bio_count; + + if (bio_data_dir(bio) == READ) + bio_check_pages_dirty(bio); + else { + bio_release_pages(bio); + bio_put(bio); + } + + /* iocb->ki_nbytes stores error code from LLDD */ + if (error) + iocb->ki_nbytes = -EIO; + + if (atomic_dec_and_test(bio_count)) { + if ((long)iocb->ki_nbytes < 0) + aio_complete(iocb, iocb->ki_nbytes, 0); + else + aio_complete(iocb, iocb->ki_left, 0); + } + + return 0; +} + +#define VEC_SIZE 16 +struct pvec { + unsigned short nr; + unsigned short idx; + struct page *page[VEC_SIZE]; +}; + +#define PAGES_SPANNED(addr, len) \ + (DIV_ROUND_UP((addr) + (len), PAGE_SIZE) - (addr) / PAGE_SIZE); + +/* + * get page pointer for user addr, we internally cache struct page array for + * (addr, count) range in pvec to avoid frequent call to get_user_pages. If + * internal page list is exhausted, a batch count of up to VEC_SIZE is used + * to get next set of page struct. 
+ */ +static struct page *blk_get_page(unsigned long addr, size_t count, int rw, + struct pvec *pvec) +{ + int ret, nr_pages; + if (pvec->idx == pvec->nr) { + nr_pages = PAGES_SPANNED(addr, count); + nr_pages = min(nr_pages, VEC_SIZE); + down_read(&current->mm->mmap_sem); + ret = get_user_pages(current, current->mm, addr, nr_pages, + rw == READ, 0, pvec->page, NULL); + up_read(&current->mm->mmap_sem); + if (ret < 0) + return ERR_PTR(ret); + pvec->nr = ret; + pvec->idx = 0; + } + return pvec->page[pvec->idx++]; +} + +/* return a page back to pvec array */ +static void blk_unget_page(struct page *page, struct pvec *pvec) +{ + pvec->page[--pvec->idx] = page; +} + +static ssize_t +blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, + loff_t pos, unsigned long nr_segs) +{ + struct inode *inode = iocb->ki_filp->f_mapping->host; + unsigned blkbits = blksize_bits(bdev_hardsect_size(I_BDEV(inode))); + unsigned blocksize_mask = (1 << blkbits) - 1; + unsigned long seg = 0; /* iov segment iterator */ + unsigned long nvec; /* number of bio vec needed */ + unsigned long cur_off; /* offset into current page */ + unsigned long cur_len; /* I/O len of current page, up to PAGE_SIZE */ + + unsigned long addr; /* user iovec address */ + size_t count; /* user iovec len */ + size_t nbytes = iocb->ki_nbytes = iocb->ki_left; /* total xfer size */ + loff_t size; /* size of block device */ + struct bio *bio; + atomic_t *bio_count = &iocb->ki_bio_count; + struct page *page; + struct pvec pvec; + + pvec.nr = 0; + pvec.idx = 0; + + if (pos & blocksize_mask) + return -EINVAL; + + size = i_size_read(inode); + if (pos + nbytes > size) { + nbytes = size - pos; + iocb->ki_left = nbytes; + } + + /* + * check first non-zero iov alignment, the remaining + * iov alignment is checked inside bio loop below. 
+ */ + do { + addr = (unsigned long) iov[seg].iov_base; + count = min(iov[seg].iov_len, nbytes); + if (addr & blocksize_mask || count & blocksize_mask) + return -EINVAL; + } while (!count && ++seg < nr_segs); + atomic_set(bio_count, 1); + + while (nbytes) { + /* roughly estimate number of bio vec needed */ + nvec = (nbytes + PAGE_SIZE - 1) / PAGE_SIZE; + nvec = max(nvec, nr_segs - seg); + nvec = min(nvec, (unsigned long) BIO_MAX_PAGES); + + /* bio_alloc should not fail with GFP_KERNEL flag */ + bio = bio_alloc(GFP_KERNEL, nvec); + bio->bi_bdev = I_BDEV(inode); + bio->bi_end_io = blk_end_aio; + bio->bi_private = iocb; + bio->bi_sector = pos >> blkbits; +same_bio: + cur_off = addr & ~PAGE_MASK; + cur_len = PAGE_SIZE - cur_off; + if (count < cur_len) + cur_len = count; + + page = blk_get_page(addr, count, rw, &pvec); + if (unlikely(IS_ERR(page))) + goto backout; + + if (bio_add_page(bio, page, cur_len, cur_off)) { + pos += cur_len; + addr += cur_len; + count -= cur_len; + nbytes -= cur_len; + + if (count) + goto same_bio; + while (++seg < nr_segs) { + addr = (unsigned long) iov[seg].iov_base; + count = iov[seg].iov_len; + if (!count) + continue; + if (unlikely(addr & blocksize_mask || + count & blocksize_mask)) { + page = ERR_PTR(-EINVAL); + goto backout; + } + count = min(count, nbytes); + goto same_bio; + } + } else { + blk_unget_page(page, &pvec); + } + + /* bio is ready, submit it */ + if (rw == READ) + bio_set_pages_dirty(bio); + atomic_inc(bio_count); + submit_bio(rw, bio); + } + +completion: + iocb->ki_left -= nbytes; + nbytes = iocb->ki_left; + iocb->ki_pos += nbytes; + + blk_run_address_space(inode->i_mapping); + if (atomic_dec_and_test(bio_count)) + aio_complete(iocb, nbytes, 0); + + return -EIOCBQUEUED; + +backout: + /* + * back out nbytes count constructed so far for this bio, + * we will throw away current bio. + */ + nbytes += bio->bi_size; + bio_release_pages(bio); + bio_put(bio); + + /* + * if no bio was submmitted, return the error code. 
+ * otherwise, proceed with pending I/O completion. + */ + if (atomic_read(bio_count) == 1) + return PTR_ERR(page); + goto completion; +} +#endif + static int blkdev_writepage(struct page *page, struct writeback_control *wbc) { return block_write_full_page(page, blkdev_get_block, wbc); @@ -190,7 +387,7 @@ static int blkdev_commit_write(struct file *file, struct page *page, unsigned fr /* * private llseek: - * for a block special file file->f_dentry->d_inode->i_size is zero + * for a block special file file->f_path.dentry->d_inode->i_size is zero * so we compute the size by hand (just as in block_read/write above) */ static loff_t block_llseek(struct file *file, loff_t offset, int origin) @@ -263,7 +460,7 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag { memset(bdev, 0, sizeof(*bdev)); mutex_init(&bdev->bd_mutex); - mutex_init(&bdev->bd_mount_mutex); + sema_init(&bdev->bd_mount_sem, 1); INIT_LIST_HEAD(&bdev->bd_inodes); INIT_LIST_HEAD(&bdev->bd_list); #ifdef CONFIG_SYSFS @@ -762,7 +959,7 @@ static int bd_claim_by_kobject(struct block_device *bdev, void *holder, if (!bo) return -ENOMEM; - mutex_lock_nested(&bdev->bd_mutex, BD_MUTEX_PARTITION); + mutex_lock(&bdev->bd_mutex); res = bd_claim(bdev, holder); if (res == 0) { found = find_bd_holder(bdev, bo); @@ -796,7 +993,7 @@ static void bd_release_from_kobject(struct block_device *bdev, if (!kobj) return; - mutex_lock_nested(&bdev->bd_mutex, BD_MUTEX_PARTITION); + mutex_lock(&bdev->bd_mutex); bd_release(bdev); if ((bo = del_bd_holder(bdev, kobj))) free_bd_holder(bo); @@ -854,22 +1051,6 @@ struct block_device *open_by_devnum(dev_t dev, unsigned mode) EXPORT_SYMBOL(open_by_devnum); -static int -blkdev_get_partition(struct block_device *bdev, mode_t mode, unsigned flags); - -struct block_device *open_partition_by_devnum(dev_t dev, unsigned mode) -{ - struct block_device *bdev = bdget(dev); - int err = -ENOMEM; - int flags = mode & FMODE_WRITE ? 
O_RDWR : O_RDONLY; - if (bdev) - err = blkdev_get_partition(bdev, mode, flags); - return err ? ERR_PTR(err) : bdev; -} - -EXPORT_SYMBOL(open_partition_by_devnum); - - /* * This routine checks whether a removable media has been changed, * and invalidates all buffer-cache-entries in that case. This @@ -916,66 +1097,11 @@ void bd_set_size(struct block_device *bdev, loff_t size) } EXPORT_SYMBOL(bd_set_size); -static int __blkdev_put(struct block_device *bdev, unsigned int subclass) -{ - int ret = 0; - struct inode *bd_inode = bdev->bd_inode; - struct gendisk *disk = bdev->bd_disk; - - mutex_lock_nested(&bdev->bd_mutex, subclass); - lock_kernel(); - if (!--bdev->bd_openers) { - sync_blockdev(bdev); - kill_bdev(bdev); - } - if (bdev->bd_contains == bdev) { - if (disk->fops->release) - ret = disk->fops->release(bd_inode, NULL); - } else { - mutex_lock_nested(&bdev->bd_contains->bd_mutex, - subclass + 1); - bdev->bd_contains->bd_part_count--; - mutex_unlock(&bdev->bd_contains->bd_mutex); - } - if (!bdev->bd_openers) { - struct module *owner = disk->fops->owner; - - put_disk(disk); - module_put(owner); - - if (bdev->bd_contains != bdev) { - kobject_put(&bdev->bd_part->kobj); - bdev->bd_part = NULL; - } - bdev->bd_disk = NULL; - bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info; - if (bdev != bdev->bd_contains) - __blkdev_put(bdev->bd_contains, subclass + 1); - bdev->bd_contains = NULL; - } - unlock_kernel(); - mutex_unlock(&bdev->bd_mutex); - bdput(bdev); - return ret; -} - -int blkdev_put(struct block_device *bdev) -{ - return __blkdev_put(bdev, BD_MUTEX_NORMAL); -} -EXPORT_SYMBOL(blkdev_put); +static int __blkdev_get(struct block_device *bdev, mode_t mode, unsigned flags, + int for_part); +static int __blkdev_put(struct block_device *bdev, int for_part); -int blkdev_put_partition(struct block_device *bdev) -{ - return __blkdev_put(bdev, BD_MUTEX_PARTITION); -} -EXPORT_SYMBOL(blkdev_put_partition); - -static int -blkdev_get_whole(struct block_device 
*bdev, mode_t mode, unsigned flags); - -static int -do_open(struct block_device *bdev, struct file *file, unsigned int subclass) +static int do_open(struct block_device *bdev, struct file *file, int for_part) { struct module *owner = NULL; struct gendisk *disk; @@ -992,8 +1118,7 @@ do_open(struct block_device *bdev, struct file *file, unsigned int subclass) } owner = disk->fops->owner; - mutex_lock_nested(&bdev->bd_mutex, subclass); - + mutex_lock_nested(&bdev->bd_mutex, for_part); if (!bdev->bd_openers) { bdev->bd_disk = disk; bdev->bd_contains = bdev; @@ -1020,25 +1145,21 @@ do_open(struct block_device *bdev, struct file *file, unsigned int subclass) ret = -ENOMEM; if (!whole) goto out_first; - ret = blkdev_get_whole(whole, file->f_mode, file->f_flags); + BUG_ON(for_part); + ret = __blkdev_get(whole, file->f_mode, file->f_flags, 1); if (ret) goto out_first; bdev->bd_contains = whole; - mutex_lock_nested(&whole->bd_mutex, BD_MUTEX_WHOLE); - whole->bd_part_count++; p = disk->part[part - 1]; bdev->bd_inode->i_data.backing_dev_info = whole->bd_inode->i_data.backing_dev_info; if (!(disk->flags & GENHD_FL_UP) || !p || !p->nr_sects) { - whole->bd_part_count--; - mutex_unlock(&whole->bd_mutex); ret = -ENXIO; goto out_first; } kobject_get(&p->kobj); bdev->bd_part = p; bd_set_size(bdev, (loff_t) p->nr_sects << 9); - mutex_unlock(&whole->bd_mutex); } } else { put_disk(disk); @@ -1051,14 +1172,11 @@ do_open(struct block_device *bdev, struct file *file, unsigned int subclass) } if (bdev->bd_invalidated) rescan_partitions(bdev->bd_disk, bdev); - } else { - mutex_lock_nested(&bdev->bd_contains->bd_mutex, - BD_MUTEX_WHOLE); - bdev->bd_contains->bd_part_count++; - mutex_unlock(&bdev->bd_contains->bd_mutex); } } bdev->bd_openers++; + if (for_part) + bdev->bd_part_count++; mutex_unlock(&bdev->bd_mutex); unlock_kernel(); return 0; @@ -1067,7 +1185,7 @@ out_first: bdev->bd_disk = NULL; bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info; if (bdev != bdev->bd_contains) 
- __blkdev_put(bdev->bd_contains, BD_MUTEX_WHOLE); + __blkdev_put(bdev->bd_contains, 1); bdev->bd_contains = NULL; put_disk(disk); module_put(owner); @@ -1079,7 +1197,8 @@ out: return ret; } -int blkdev_get(struct block_device *bdev, mode_t mode, unsigned flags) +static int __blkdev_get(struct block_device *bdev, mode_t mode, unsigned flags, + int for_part) { /* * This crockload is due to bad choice of ->open() type. @@ -1091,51 +1210,17 @@ int blkdev_get(struct block_device *bdev, mode_t mode, unsigned flags) struct dentry fake_dentry = {}; fake_file.f_mode = mode; fake_file.f_flags = flags; - fake_file.f_dentry = &fake_dentry; + fake_file.f_path.dentry = &fake_dentry; fake_dentry.d_inode = bdev->bd_inode; - return do_open(bdev, &fake_file, BD_MUTEX_NORMAL); + return do_open(bdev, &fake_file, for_part); } -EXPORT_SYMBOL(blkdev_get); - -static int -blkdev_get_whole(struct block_device *bdev, mode_t mode, unsigned flags) +int blkdev_get(struct block_device *bdev, mode_t mode, unsigned flags) { - /* - * This crockload is due to bad choice of ->open() type. - * It will go away. - * For now, block device ->open() routine must _not_ - * examine anything in 'inode' argument except ->i_rdev. - */ - struct file fake_file = {}; - struct dentry fake_dentry = {}; - fake_file.f_mode = mode; - fake_file.f_flags = flags; - fake_file.f_dentry = &fake_dentry; - fake_dentry.d_inode = bdev->bd_inode; - - return do_open(bdev, &fake_file, BD_MUTEX_WHOLE); -} - -static int -blkdev_get_partition(struct block_device *bdev, mode_t mode, unsigned flags) -{ - /* - * This crockload is due to bad choice of ->open() type. - * It will go away. - * For now, block device ->open() routine must _not_ - * examine anything in 'inode' argument except ->i_rdev. 
- */ - struct file fake_file = {}; - struct dentry fake_dentry = {}; - fake_file.f_mode = mode; - fake_file.f_flags = flags; - fake_file.f_dentry = &fake_dentry; - fake_dentry.d_inode = bdev->bd_inode; - - return do_open(bdev, &fake_file, BD_MUTEX_PARTITION); + return __blkdev_get(bdev, mode, flags, 0); } +EXPORT_SYMBOL(blkdev_get); static int blkdev_open(struct inode * inode, struct file * filp) { @@ -1154,7 +1239,7 @@ static int blkdev_open(struct inode * inode, struct file * filp) if (bdev == NULL) return -ENOMEM; - res = do_open(bdev, filp, BD_MUTEX_NORMAL); + res = do_open(bdev, filp, 0); if (res) return res; @@ -1168,6 +1253,56 @@ static int blkdev_open(struct inode * inode, struct file * filp) return res; } +static int __blkdev_put(struct block_device *bdev, int for_part) +{ + int ret = 0; + struct inode *bd_inode = bdev->bd_inode; + struct gendisk *disk = bdev->bd_disk; + struct block_device *victim = NULL; + + mutex_lock_nested(&bdev->bd_mutex, for_part); + lock_kernel(); + if (for_part) + bdev->bd_part_count--; + + if (!--bdev->bd_openers) { + sync_blockdev(bdev); + kill_bdev(bdev); + } + if (bdev->bd_contains == bdev) { + if (disk->fops->release) + ret = disk->fops->release(bd_inode, NULL); + } + if (!bdev->bd_openers) { + struct module *owner = disk->fops->owner; + + put_disk(disk); + module_put(owner); + + if (bdev->bd_contains != bdev) { + kobject_put(&bdev->bd_part->kobj); + bdev->bd_part = NULL; + } + bdev->bd_disk = NULL; + bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info; + if (bdev != bdev->bd_contains) + victim = bdev->bd_contains; + bdev->bd_contains = NULL; + } + unlock_kernel(); + mutex_unlock(&bdev->bd_mutex); + bdput(bdev); + if (victim) + __blkdev_put(victim, 1); + return ret; +} + +int blkdev_put(struct block_device *bdev) +{ + return __blkdev_put(bdev, 0); +} +EXPORT_SYMBOL(blkdev_put); + static int blkdev_close(struct inode * inode, struct file * filp) { struct block_device *bdev = I_BDEV(filp->f_mapping->host); diff 
--git a/fs/buffer.c b/fs/buffer.c index 517860f2d75..1ad674fd348 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -35,6 +35,7 @@ #include <linux/hash.h> #include <linux/suspend.h> #include <linux/buffer_head.h> +#include <linux/task_io_accounting_ops.h> #include <linux/bio.h> #include <linux/notifier.h> #include <linux/cpu.h> @@ -179,7 +180,7 @@ int fsync_bdev(struct block_device *bdev) * freeze_bdev -- lock a filesystem and force it into a consistent state * @bdev: blockdevice to lock * - * This takes the block device bd_mount_mutex to make sure no new mounts + * This takes the block device bd_mount_sem to make sure no new mounts * happen on bdev until thaw_bdev() is called. * If a superblock is found on this device, we take the s_umount semaphore * on it to make sure nobody unmounts until the snapshot creation is done. @@ -188,7 +189,7 @@ struct super_block *freeze_bdev(struct block_device *bdev) { struct super_block *sb; - mutex_lock(&bdev->bd_mount_mutex); + down(&bdev->bd_mount_sem); sb = get_super(bdev); if (sb && !(sb->s_flags & MS_RDONLY)) { sb->s_frozen = SB_FREEZE_WRITE; @@ -230,7 +231,7 @@ void thaw_bdev(struct block_device *bdev, struct super_block *sb) drop_super(sb); } - mutex_unlock(&bdev->bd_mount_mutex); + up(&bdev->bd_mount_sem); } EXPORT_SYMBOL(thaw_bdev); @@ -724,20 +725,21 @@ int __set_page_dirty_buffers(struct page *page) } spin_unlock(&mapping->private_lock); - if (!TestSetPageDirty(page)) { - write_lock_irq(&mapping->tree_lock); - if (page->mapping) { /* Race with truncate? */ - if (mapping_cap_account_dirty(mapping)) - __inc_zone_page_state(page, NR_FILE_DIRTY); - radix_tree_tag_set(&mapping->page_tree, - page_index(page), - PAGECACHE_TAG_DIRTY); + if (TestSetPageDirty(page)) + return 0; + + write_lock_irq(&mapping->tree_lock); + if (page->mapping) { /* Race with truncate? 
*/ + if (mapping_cap_account_dirty(mapping)) { + __inc_zone_page_state(page, NR_FILE_DIRTY); + task_io_account_write(PAGE_CACHE_SIZE); } - write_unlock_irq(&mapping->tree_lock); - __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); - return 1; + radix_tree_tag_set(&mapping->page_tree, + page_index(page), PAGECACHE_TAG_DIRTY); } - return 0; + write_unlock_irq(&mapping->tree_lock); + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); + return 1; } EXPORT_SYMBOL(__set_page_dirty_buffers); @@ -2842,18 +2844,24 @@ int try_to_free_buffers(struct page *page) spin_lock(&mapping->private_lock); ret = drop_buffers(page, &buffers_to_free); + + /* + * If the filesystem writes its buffers by hand (eg ext3) + * then we can have clean buffers against a dirty page. We + * clean the page here; otherwise the VM will never notice + * that the filesystem did any IO at all. + * + * Also, during truncate, discard_buffer will have marked all + * the page's buffers clean. We discover that here and clean + * the page also. + * + * private_lock must be held over this entire operation in order + * to synchronise against __set_page_dirty_buffers and prevent the + * dirty bit from being lost. + */ + if (ret) + cancel_dirty_page(page, PAGE_CACHE_SIZE); spin_unlock(&mapping->private_lock); - if (ret) { - /* - * If the filesystem writes its buffers by hand (eg ext3) - * then we can have clean buffers against a dirty page. We - * clean the page here; otherwise later reattachment of buffers - * could encounter a non-uptodate page, which is unresolvable. - * This only applies in the rare case where try_to_free_buffers - * succeeds but the page is not freed. - */ - clear_page_dirty(page); - } out: if (buffers_to_free) { struct buffer_head *bh = buffers_to_free; diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES index 0b3c37ef52e..85e3850bf2c 100644 --- a/fs/cifs/CHANGES +++ b/fs/cifs/CHANGES @@ -1,3 +1,9 @@ +Version 1.47 +------------ +Fix oops in list_del during mount caused by unaligned string. 
+Seek to SEEK_END forces check for update of file size for non-cached +files. + Version 1.46 ------------ Support deep tree mounts. Better support OS/2, Win9x (DOS) time stamps. @@ -5,7 +11,8 @@ Allow null user to be specified on mount ("username="). Do not return EINVAL on readdir when filldir fails due to overwritten blocksize (fixes FC problem). Return error in rename 2nd attempt retry (ie report if rename by handle also fails, after rename by path fails, we were -not reporting whether the retry worked or not). +not reporting whether the retry worked or not). Fix NTLMv2 to +work to Windows servers (mount with option "sec=ntlmv2"). Version 1.45 ------------ diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 96abeb73897..6017c465440 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -143,8 +143,8 @@ cifs_debug_data_read(char *buf, char **beginBuffer, off_t offset, ses = list_entry(tmp, struct cifsSesInfo, cifsSessionList); if((ses->serverDomain == NULL) || (ses->serverOS == NULL) || (ses->serverNOS == NULL)) { - buf += sprintf("\nentry for %s not fully displayed\n\t", - ses->serverName); + buf += sprintf(buf, "\nentry for %s not fully " + "displayed\n\t", ses->serverName); } else { length = diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 4bc250b2d9f..fdeda519eac 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -372,8 +372,10 @@ void setup_ntlmv2_rsp(struct cifsSesInfo * ses, char * resp_buf, buf->time = cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME)); get_random_bytes(&buf->client_chal, sizeof(buf->client_chal)); buf->reserved2 = 0; - buf->names[0].type = 0; + buf->names[0].type = cpu_to_le16(NTLMSSP_DOMAIN_TYPE); buf->names[0].length = 0; + buf->names[1].type = 0; + buf->names[1].length = 0; /* calculate buf->ntlmv2_hash */ rc = calc_ntlmv2_hash(ses, nls_cp); diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 71bc87a37fc..93ef09971d2 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -498,7 +498,7 
@@ cifs_get_sb(struct file_system_type *fs_type, static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { - struct inode *inode = iocb->ki_filp->f_dentry->d_inode; + struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; ssize_t written; written = generic_file_aio_write(iocb, iov, nr_segs, pos); @@ -511,7 +511,15 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin) { /* origin == SEEK_END => we must revalidate the cached file length */ if (origin == SEEK_END) { - int retval = cifs_revalidate(file->f_dentry); + int retval; + + /* some applications poll for the file length in this strange + way so we must seek to end on non-oplocked files by + setting the revalidate time to zero */ + if(file->f_path.dentry->d_inode) + CIFS_I(file->f_path.dentry->d_inode)->time = 0; + + retval = cifs_revalidate(file->f_path.dentry); if (retval < 0) return (loff_t)retval; } diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index a243f779b36..8aa66dcf13b 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -100,5 +100,5 @@ extern ssize_t cifs_getxattr(struct dentry *, const char *, void *, size_t); extern ssize_t cifs_listxattr(struct dentry *, char *, size_t); extern int cifs_ioctl (struct inode * inode, struct file * filep, unsigned int command, unsigned long arg); -#define CIFS_VERSION "1.46" +#define CIFS_VERSION "1.47" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index 6df9dadba64..068ef51edbf 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -580,6 +580,12 @@ typedef union smb_com_session_setup_andx { /* format of NLTMv2 Response ie "case sensitive password" hash when NTLMv2 */ +#define NTLMSSP_SERVER_TYPE 1 +#define NTLMSSP_DOMAIN_TYPE 2 +#define NTLMSSP_FQ_DOMAIN_TYPE 3 +#define NTLMSSP_DNS_DOMAIN_TYPE 4 +#define NTLMSSP_DNS_PARENT_TYPE 5 + struct ntlmssp2_name { __le16 type; __le16 length; @@ -593,7 +599,7 @@ struct ntlmv2_resp { __le64 time; __u64 
client_chal; /* random */ __u32 reserved2; - struct ntlmssp2_name names[1]; + struct ntlmssp2_name names[2]; /* array of name entries could follow ending in minimum 4 byte struct */ } __attribute__((packed)); diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 098790eb2aa..472e33e0f3c 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -4876,7 +4876,7 @@ int CIFSSMBNotify(const int xid, struct cifsTconInfo *tcon, } else { /* Add file to outstanding requests */ /* BB change to kmem cache alloc */ - dnotify_req = (struct dir_notify_req *) kmalloc( + dnotify_req = kmalloc( sizeof(struct dir_notify_req), GFP_KERNEL); if(dnotify_req) { diff --git a/fs/cifs/fcntl.c b/fs/cifs/fcntl.c index d91a3d44e9e..da12b482ebe 100644 --- a/fs/cifs/fcntl.c +++ b/fs/cifs/fcntl.c @@ -83,10 +83,10 @@ int cifs_dir_notify(struct file * file, unsigned long arg) return 0; xid = GetXid(); - cifs_sb = CIFS_SB(file->f_dentry->d_sb); + cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); pTcon = cifs_sb->tcon; - full_path = build_path_from_dentry(file->f_dentry); + full_path = build_path_from_dentry(file->f_path.dentry); if(full_path == NULL) { rc = -ENOMEM; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 2436ed8fc84..e9dcf5ee29a 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -29,6 +29,7 @@ #include <linux/pagevec.h> #include <linux/smp_lock.h> #include <linux/writeback.h> +#include <linux/task_io_accounting_ops.h> #include <linux/delay.h> #include <asm/div64.h> #include "cifsfs.h" @@ -122,34 +123,34 @@ static inline int cifs_open_inode_helper(struct inode *inode, struct file *file, /* if not oplocked, invalidate inode pages if mtime or file size changed */ temp = cifs_NTtimeToUnix(le64_to_cpu(buf->LastWriteTime)); - if (timespec_equal(&file->f_dentry->d_inode->i_mtime, &temp) && - (file->f_dentry->d_inode->i_size == + if (timespec_equal(&file->f_path.dentry->d_inode->i_mtime, &temp) && + (file->f_path.dentry->d_inode->i_size == (loff_t)le64_to_cpu(buf->EndOfFile))) { cFYI(1, ("inode 
unchanged on server")); } else { - if (file->f_dentry->d_inode->i_mapping) { + if (file->f_path.dentry->d_inode->i_mapping) { /* BB no need to lock inode until after invalidate since namei code should already have it locked? */ - filemap_write_and_wait(file->f_dentry->d_inode->i_mapping); + filemap_write_and_wait(file->f_path.dentry->d_inode->i_mapping); } cFYI(1, ("invalidating remote inode since open detected it " "changed")); - invalidate_remote_inode(file->f_dentry->d_inode); + invalidate_remote_inode(file->f_path.dentry->d_inode); } client_can_cache: if (pTcon->ses->capabilities & CAP_UNIX) - rc = cifs_get_inode_info_unix(&file->f_dentry->d_inode, + rc = cifs_get_inode_info_unix(&file->f_path.dentry->d_inode, full_path, inode->i_sb, xid); else - rc = cifs_get_inode_info(&file->f_dentry->d_inode, + rc = cifs_get_inode_info(&file->f_path.dentry->d_inode, full_path, buf, inode->i_sb, xid); if ((*oplock & 0xF) == OPLOCK_EXCLUSIVE) { pCifsInode->clientCanCacheAll = TRUE; pCifsInode->clientCanCacheRead = TRUE; cFYI(1, ("Exclusive Oplock granted on inode %p", - file->f_dentry->d_inode)); + file->f_path.dentry->d_inode)); } else if ((*oplock & 0xF) == OPLOCK_READ) pCifsInode->clientCanCacheRead = TRUE; @@ -178,7 +179,7 @@ int cifs_open(struct inode *inode, struct file *file) if (file->f_flags & O_CREAT) { /* search inode for this file and fill in file->private_data */ - pCifsInode = CIFS_I(file->f_dentry->d_inode); + pCifsInode = CIFS_I(file->f_path.dentry->d_inode); read_lock(&GlobalSMBSeslock); list_for_each(tmp, &pCifsInode->openFileList) { pCifsFile = list_entry(tmp, struct cifsFileInfo, @@ -206,7 +207,7 @@ int cifs_open(struct inode *inode, struct file *file) } } - full_path = build_path_from_dentry(file->f_dentry); + full_path = build_path_from_dentry(file->f_path.dentry); if (full_path == NULL) { FreeXid(xid); return -ENOMEM; @@ -291,7 +292,7 @@ int cifs_open(struct inode *inode, struct file *file) write_lock(&GlobalSMBSeslock); list_add(&pCifsFile->tlist, 
&pTcon->openFileList); - pCifsInode = CIFS_I(file->f_dentry->d_inode); + pCifsInode = CIFS_I(file->f_path.dentry->d_inode); if (pCifsInode) { rc = cifs_open_inode_helper(inode, file, pCifsInode, pCifsFile, pTcon, @@ -366,7 +367,7 @@ static int cifs_reopen_file(struct inode *inode, struct file *file, return 0; } - if (file->f_dentry == NULL) { + if (file->f_path.dentry == NULL) { up(&pCifsFile->fh_sem); cFYI(1, ("failed file reopen, no valid name if dentry freed")); FreeXid(xid); @@ -378,7 +379,7 @@ static int cifs_reopen_file(struct inode *inode, struct file *file, those that already have the rename sem can end up causing writepage to get called and if the server was down that means we end up here, and we can never tell if the caller already has the rename_sem */ - full_path = build_path_from_dentry(file->f_dentry); + full_path = build_path_from_dentry(file->f_path.dentry); if (full_path == NULL) { up(&pCifsFile->fh_sem); FreeXid(xid); @@ -444,7 +445,7 @@ static int cifs_reopen_file(struct inode *inode, struct file *file, pCifsInode->clientCanCacheAll = TRUE; pCifsInode->clientCanCacheRead = TRUE; cFYI(1, ("Exclusive Oplock granted on inode %p", - file->f_dentry->d_inode)); + file->f_path.dentry->d_inode)); } else if ((oplock & 0xF) == OPLOCK_READ) { pCifsInode->clientCanCacheRead = TRUE; pCifsInode->clientCanCacheAll = FALSE; @@ -551,7 +552,7 @@ int cifs_closedir(struct inode *inode, struct file *file) if (pCFileStruct) { struct cifsTconInfo *pTcon; - struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_dentry->d_sb); + struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); pTcon = cifs_sb->tcon; @@ -664,7 +665,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock) } else cFYI(1, ("Unknown type of lock")); - cifs_sb = CIFS_SB(file->f_dentry->d_sb); + cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); pTcon = cifs_sb->tcon; if (file->private_data == NULL) { @@ -791,10 +792,10 @@ ssize_t cifs_user_write(struct file *file, const char __user 
*write_data, int xid, long_op; struct cifsFileInfo *open_file; - if (file->f_dentry == NULL) + if (file->f_path.dentry == NULL) return -EBADF; - cifs_sb = CIFS_SB(file->f_dentry->d_sb); + cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); if (cifs_sb == NULL) return -EBADF; @@ -802,7 +803,7 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data, /* cFYI(1, (" write %d bytes to offset %lld of %s", write_size, - *poffset, file->f_dentry->d_name.name)); */ + *poffset, file->f_path.dentry->d_name.name)); */ if (file->private_data == NULL) return -EBADF; @@ -810,12 +811,12 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data, open_file = (struct cifsFileInfo *) file->private_data; xid = GetXid(); - if (file->f_dentry->d_inode == NULL) { + if (file->f_path.dentry->d_inode == NULL) { FreeXid(xid); return -EBADF; } - if (*poffset > file->f_dentry->d_inode->i_size) + if (*poffset > file->f_path.dentry->d_inode->i_size) long_op = 2; /* writes past end of file can take a long time */ else long_op = 1; @@ -840,8 +841,8 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data, return -EBADF; } if (open_file->invalidHandle) { - if ((file->f_dentry == NULL) || - (file->f_dentry->d_inode == NULL)) { + if ((file->f_path.dentry == NULL) || + (file->f_path.dentry->d_inode == NULL)) { FreeXid(xid); return total_written; } @@ -849,7 +850,7 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data, filemap_fdatawait from here so tell reopen_file not to flush data to server now */ - rc = cifs_reopen_file(file->f_dentry->d_inode, + rc = cifs_reopen_file(file->f_path.dentry->d_inode, file, FALSE); if (rc != 0) break; @@ -878,17 +879,17 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data, cifs_stats_bytes_written(pTcon, total_written); /* since the write may have blocked check these pointers again */ - if (file->f_dentry) { - if (file->f_dentry->d_inode) { - struct inode *inode = 
file->f_dentry->d_inode; + if (file->f_path.dentry) { + if (file->f_path.dentry->d_inode) { + struct inode *inode = file->f_path.dentry->d_inode; inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb); if (total_written > 0) { - if (*poffset > file->f_dentry->d_inode->i_size) - i_size_write(file->f_dentry->d_inode, + if (*poffset > file->f_path.dentry->d_inode->i_size) + i_size_write(file->f_path.dentry->d_inode, *poffset); } - mark_inode_dirty_sync(file->f_dentry->d_inode); + mark_inode_dirty_sync(file->f_path.dentry->d_inode); } } FreeXid(xid); @@ -906,17 +907,17 @@ static ssize_t cifs_write(struct file *file, const char *write_data, int xid, long_op; struct cifsFileInfo *open_file; - if (file->f_dentry == NULL) + if (file->f_path.dentry == NULL) return -EBADF; - cifs_sb = CIFS_SB(file->f_dentry->d_sb); + cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); if (cifs_sb == NULL) return -EBADF; pTcon = cifs_sb->tcon; cFYI(1,("write %zd bytes to offset %lld of %s", write_size, - *poffset, file->f_dentry->d_name.name)); + *poffset, file->f_path.dentry->d_name.name)); if (file->private_data == NULL) return -EBADF; @@ -924,12 +925,12 @@ static ssize_t cifs_write(struct file *file, const char *write_data, open_file = (struct cifsFileInfo *)file->private_data; xid = GetXid(); - if (file->f_dentry->d_inode == NULL) { + if (file->f_path.dentry->d_inode == NULL) { FreeXid(xid); return -EBADF; } - if (*poffset > file->f_dentry->d_inode->i_size) + if (*poffset > file->f_path.dentry->d_inode->i_size) long_op = 2; /* writes past end of file can take a long time */ else long_op = 1; @@ -955,8 +956,8 @@ static ssize_t cifs_write(struct file *file, const char *write_data, return -EBADF; } if (open_file->invalidHandle) { - if ((file->f_dentry == NULL) || - (file->f_dentry->d_inode == NULL)) { + if ((file->f_path.dentry == NULL) || + (file->f_path.dentry->d_inode == NULL)) { FreeXid(xid); return total_written; } @@ -964,7 +965,7 @@ static ssize_t cifs_write(struct file *file, const 
char *write_data, filemap_fdatawait from here so tell reopen_file not to flush data to server now */ - rc = cifs_reopen_file(file->f_dentry->d_inode, + rc = cifs_reopen_file(file->f_path.dentry->d_inode, file, FALSE); if (rc != 0) break; @@ -1011,16 +1012,16 @@ static ssize_t cifs_write(struct file *file, const char *write_data, cifs_stats_bytes_written(pTcon, total_written); /* since the write may have blocked check these pointers again */ - if (file->f_dentry) { - if (file->f_dentry->d_inode) { - file->f_dentry->d_inode->i_ctime = - file->f_dentry->d_inode->i_mtime = CURRENT_TIME; + if (file->f_path.dentry) { + if (file->f_path.dentry->d_inode) { + file->f_path.dentry->d_inode->i_ctime = + file->f_path.dentry->d_inode->i_mtime = CURRENT_TIME; if (total_written > 0) { - if (*poffset > file->f_dentry->d_inode->i_size) - i_size_write(file->f_dentry->d_inode, + if (*poffset > file->f_path.dentry->d_inode->i_size) + i_size_write(file->f_path.dentry->d_inode, *poffset); } - mark_inode_dirty_sync(file->f_dentry->d_inode); + mark_inode_dirty_sync(file->f_path.dentry->d_inode); } } FreeXid(xid); @@ -1145,7 +1146,7 @@ static int cifs_writepages(struct address_space *mapping, pgoff_t end; pgoff_t index; int range_whole = 0; - struct kvec iov[32]; + struct kvec * iov; int len; int n_iov = 0; pgoff_t next; @@ -1170,15 +1171,21 @@ static int cifs_writepages(struct address_space *mapping, if((cifs_sb->tcon->ses) && (cifs_sb->tcon->ses->server)) if(cifs_sb->tcon->ses->server->secMode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) - if(!experimEnabled) + if(!experimEnabled) return generic_writepages(mapping, wbc); + iov = kmalloc(32 * sizeof(struct kvec), GFP_KERNEL); + if(iov == NULL) + return generic_writepages(mapping, wbc); + + /* * BB: Is this meaningful for a non-block-device file system? 
* If it is, we should test it again after we do I/O */ if (wbc->nonblocking && bdi_write_congested(bdi)) { wbc->encountered_congestion = 1; + kfree(iov); return 0; } @@ -1244,14 +1251,21 @@ retry: wait_on_page_writeback(page); if (PageWriteback(page) || - !test_clear_page_dirty(page)) { + !clear_page_dirty_for_io(page)) { unlock_page(page); break; } + /* + * This actually clears the dirty bit in the radix tree. + * See cifs_writepage() for more commentary. + */ + set_page_writeback(page); + if (page_offset(page) >= mapping->host->i_size) { done = 1; unlock_page(page); + end_page_writeback(page); break; } @@ -1315,6 +1329,7 @@ retry: SetPageError(page); kunmap(page); unlock_page(page); + end_page_writeback(page); page_cache_release(page); } if ((wbc->nr_to_write -= n_iov) <= 0) @@ -1336,7 +1351,7 @@ retry: mapping->writeback_index = index; FreeXid(xid); - + kfree(iov); return rc; } @@ -1351,11 +1366,23 @@ static int cifs_writepage(struct page* page, struct writeback_control *wbc) if (!PageUptodate(page)) { cFYI(1, ("ppw - page not up to date")); } - + + /* + * Set the "writeback" flag, and clear "dirty" in the radix tree. + * + * A writepage() implementation always needs to do either this, + * or re-dirty the page with "redirty_page_for_writepage()" in + * the case of a failure. + * + * Just unlocking the page will cause the radix tree tag-bits + * to fail to update with the state of the page correctly. + */ + set_page_writeback(page); rc = cifs_partialpagewrite(page, 0, PAGE_CACHE_SIZE); SetPageUptodate(page); /* BB add check for error and Clearuptodate? 
*/ unlock_page(page); - page_cache_release(page); + end_page_writeback(page); + page_cache_release(page); FreeXid(xid); return rc; } @@ -1384,7 +1411,7 @@ static int cifs_commit_write(struct file *file, struct page *page, if ((open_file->invalidHandle) && (!open_file->closePend)) { rc = cifs_reopen_file( - file->f_dentry->d_inode, file); + file->f_path.dentry->d_inode, file); if (rc != 0) break; } @@ -1434,7 +1461,7 @@ int cifs_fsync(struct file *file, struct dentry *dentry, int datasync) { int xid; int rc = 0; - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file->f_path.dentry->d_inode; xid = GetXid(); @@ -1482,7 +1509,7 @@ int cifs_fsync(struct file *file, struct dentry *dentry, int datasync) */ int cifs_flush(struct file *file, fl_owner_t id) { - struct inode * inode = file->f_dentry->d_inode; + struct inode * inode = file->f_path.dentry->d_inode; int rc = 0; /* Rather than do the steps manually: @@ -1519,7 +1546,7 @@ ssize_t cifs_user_read(struct file *file, char __user *read_data, struct smb_com_read_rsp *pSMBr; xid = GetXid(); - cifs_sb = CIFS_SB(file->f_dentry->d_sb); + cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); pTcon = cifs_sb->tcon; if (file->private_data == NULL) { @@ -1542,7 +1569,7 @@ ssize_t cifs_user_read(struct file *file, char __user *read_data, int buf_type = CIFS_NO_BUFFER; if ((open_file->invalidHandle) && (!open_file->closePend)) { - rc = cifs_reopen_file(file->f_dentry->d_inode, + rc = cifs_reopen_file(file->f_path.dentry->d_inode, file, TRUE); if (rc != 0) break; @@ -1601,7 +1628,7 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size, int buf_type = CIFS_NO_BUFFER; xid = GetXid(); - cifs_sb = CIFS_SB(file->f_dentry->d_sb); + cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); pTcon = cifs_sb->tcon; if (file->private_data == NULL) { @@ -1629,7 +1656,7 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size, while (rc == -EAGAIN) { if ((open_file->invalidHandle) && 
(!open_file->closePend)) { - rc = cifs_reopen_file(file->f_dentry->d_inode, + rc = cifs_reopen_file(file->f_path.dentry->d_inode, file, TRUE); if (rc != 0) break; @@ -1658,7 +1685,7 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size, int cifs_file_mmap(struct file *file, struct vm_area_struct *vma) { - struct dentry *dentry = file->f_dentry; + struct dentry *dentry = file->f_path.dentry; int rc, xid; xid = GetXid(); @@ -1744,7 +1771,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, return -EBADF; } open_file = (struct cifsFileInfo *)file->private_data; - cifs_sb = CIFS_SB(file->f_dentry->d_sb); + cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); pTcon = cifs_sb->tcon; pagevec_init(&lru_pvec, 0); @@ -1786,7 +1813,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, while (rc == -EAGAIN) { if ((open_file->invalidHandle) && (!open_file->closePend)) { - rc = cifs_reopen_file(file->f_dentry->d_inode, + rc = cifs_reopen_file(file->f_path.dentry->d_inode, file, TRUE); if (rc != 0) break; @@ -1812,6 +1839,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, cFYI(1, ("Read error in readpages: %d", rc)); break; } else if (bytes_read > 0) { + task_io_account_read(bytes_read); pSMBr = (struct smb_com_read_rsp *)smb_read_data; cifs_copy_cache_pages(mapping, page_list, bytes_read, smb_read_data + 4 /* RFC1001 hdr */ + @@ -1880,8 +1908,8 @@ static int cifs_readpage_worker(struct file *file, struct page *page, else cFYI(1, ("Bytes read %d",rc)); - file->f_dentry->d_inode->i_atime = - current_fs_time(file->f_dentry->d_inode->i_sb); + file->f_path.dentry->d_inode->i_atime = + current_fs_time(file->f_path.dentry->d_inode->i_sb); if (PAGE_CACHE_SIZE > rc) memset(read_data + rc, 0, PAGE_CACHE_SIZE - rc); diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index aedf683f011..19cc294c7c7 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -71,9 +71,7 @@ sesInfoAlloc(void) { struct 
cifsSesInfo *ret_buf; - ret_buf = - (struct cifsSesInfo *) kzalloc(sizeof (struct cifsSesInfo), - GFP_KERNEL); + ret_buf = kzalloc(sizeof (struct cifsSesInfo), GFP_KERNEL); if (ret_buf) { write_lock(&GlobalSMBSeslock); atomic_inc(&sesInfoAllocCount); @@ -109,9 +107,7 @@ struct cifsTconInfo * tconInfoAlloc(void) { struct cifsTconInfo *ret_buf; - ret_buf = - (struct cifsTconInfo *) kzalloc(sizeof (struct cifsTconInfo), - GFP_KERNEL); + ret_buf = kzalloc(sizeof (struct cifsTconInfo), GFP_KERNEL); if (ret_buf) { write_lock(&GlobalSMBSeslock); atomic_inc(&tconInfoAllocCount); diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index ed18c3965f7..782940be550 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -68,30 +68,30 @@ static int construct_dentry(struct qstr *qstring, struct file *file, int rc = 0; cFYI(1, ("For %s", qstring->name)); - cifs_sb = CIFS_SB(file->f_dentry->d_sb); + cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); pTcon = cifs_sb->tcon; qstring->hash = full_name_hash(qstring->name, qstring->len); - tmp_dentry = d_lookup(file->f_dentry, qstring); + tmp_dentry = d_lookup(file->f_path.dentry, qstring); if (tmp_dentry) { cFYI(0, ("existing dentry with inode 0x%p", tmp_dentry->d_inode)); *ptmp_inode = tmp_dentry->d_inode; /* BB overwrite old name? i.e. 
tmp_dentry->d_name and tmp_dentry->d_name.len??*/ if(*ptmp_inode == NULL) { - *ptmp_inode = new_inode(file->f_dentry->d_sb); + *ptmp_inode = new_inode(file->f_path.dentry->d_sb); if(*ptmp_inode == NULL) return rc; rc = 1; } } else { - tmp_dentry = d_alloc(file->f_dentry, qstring); + tmp_dentry = d_alloc(file->f_path.dentry, qstring); if(tmp_dentry == NULL) { cERROR(1,("Failed allocating dentry")); *ptmp_inode = NULL; return rc; } - *ptmp_inode = new_inode(file->f_dentry->d_sb); + *ptmp_inode = new_inode(file->f_path.dentry->d_sb); if (pTcon->nocase) tmp_dentry->d_op = &cifs_ci_dentry_ops; else @@ -156,9 +156,9 @@ static void fill_in_inode(struct inode *tmp_inode, int new_buf_type, tmp_inode->i_atime = cnvrtDosUnixTm( le16_to_cpu(pfindData->LastAccessDate), le16_to_cpu(pfindData->LastAccessTime)); - tmp_inode->i_ctime = cnvrtDosUnixTm( - le16_to_cpu(pfindData->LastWriteDate), - le16_to_cpu(pfindData->LastWriteTime)); + tmp_inode->i_ctime = cnvrtDosUnixTm( + le16_to_cpu(pfindData->LastWriteDate), + le16_to_cpu(pfindData->LastWriteTime)); AdjustForTZ(cifs_sb->tcon, tmp_inode); attr = le16_to_cpu(pfindData->Attributes); allocation_size = le32_to_cpu(pfindData->AllocationSize); @@ -432,10 +432,10 @@ static int initiate_cifs_search(const int xid, struct file *file) cifsFile->invalidHandle = TRUE; cifsFile->srch_inf.endOfSearch = FALSE; - if(file->f_dentry == NULL) + if(file->f_path.dentry == NULL) return -ENOENT; - cifs_sb = CIFS_SB(file->f_dentry->d_sb); + cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); if(cifs_sb == NULL) return -EINVAL; @@ -443,7 +443,7 @@ static int initiate_cifs_search(const int xid, struct file *file) if(pTcon == NULL) return -EINVAL; - full_path = build_path_from_dentry(file->f_dentry); + full_path = build_path_from_dentry(file->f_path.dentry); if(full_path == NULL) { return -ENOMEM; @@ -609,10 +609,10 @@ static int is_dir_changed(struct file * file) struct inode * inode; struct cifsInodeInfo *cifsInfo; - if(file->f_dentry == NULL) + 
if(file->f_path.dentry == NULL) return 0; - inode = file->f_dentry->d_inode; + inode = file->f_path.dentry->d_inode; if(inode == NULL) return 0; @@ -839,7 +839,7 @@ static int cifs_filldir(char *pfindEntry, struct file *file, if((scratch_buf == NULL) || (pfindEntry == NULL) || (pCifsF == NULL)) return -ENOENT; - if(file->f_dentry == NULL) + if(file->f_path.dentry == NULL) return -ENOENT; rc = cifs_entry_is_dot(pfindEntry,pCifsF); @@ -847,7 +847,7 @@ static int cifs_filldir(char *pfindEntry, struct file *file, if(rc != 0) return 0; - cifs_sb = CIFS_SB(file->f_dentry->d_sb); + cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); qstring.name = scratch_buf; rc = cifs_get_name_from_search_buf(&qstring,pfindEntry, @@ -985,12 +985,12 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir) xid = GetXid(); - if(file->f_dentry == NULL) { + if(file->f_path.dentry == NULL) { FreeXid(xid); return -EIO; } - cifs_sb = CIFS_SB(file->f_dentry->d_sb); + cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); pTcon = cifs_sb->tcon; if(pTcon == NULL) return -EINVAL; @@ -998,7 +998,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir) switch ((int) file->f_pos) { case 0: if (filldir(direntry, ".", 1, file->f_pos, - file->f_dentry->d_inode->i_ino, DT_DIR) < 0) { + file->f_path.dentry->d_inode->i_ino, DT_DIR) < 0) { cERROR(1, ("Filldir for current dir failed")); rc = -ENOMEM; break; @@ -1006,7 +1006,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir) file->f_pos++; case 1: if (filldir(direntry, "..", 2, file->f_pos, - file->f_dentry->d_parent->d_inode->i_ino, DT_DIR) < 0) { + file->f_path.dentry->d_parent->d_inode->i_ino, DT_DIR) < 0) { cERROR(1, ("Filldir for parent dir failed")); rc = -ENOMEM; break; diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index bbdda99dce6..75846463089 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -182,11 +182,14 @@ static int decode_unicode_ssetup(char ** pbcc_area, int bleft, struct cifsSesInf cFYI(1,("bleft 
%d",bleft)); - /* word align, if bytes remaining is not even */ - if(bleft % 2) { - bleft--; - data++; - } + /* SMB header is unaligned, so cifs servers word align start of + Unicode strings */ + data++; + bleft--; /* Windows servers do not always double null terminate + their final Unicode string - in which case we + now will not attempt to decode the byte of junk + which follows it */ + words_left = bleft / 2; /* save off server operating system */ diff --git a/fs/cifs/smbdes.c b/fs/cifs/smbdes.c index 7a1b2b961ec..1b1daf63f06 100644 --- a/fs/cifs/smbdes.c +++ b/fs/cifs/smbdes.c @@ -196,7 +196,7 @@ dohash(char *out, char *in, char *key, int forw) char c[28]; char d[28]; char *cd; - char ki[16][48]; + char (*ki)[48]; char *pd1; char l[32], r[32]; char *rl; @@ -206,6 +206,12 @@ dohash(char *out, char *in, char *key, int forw) if(pk1 == NULL) return; + ki = kmalloc(16*48, GFP_KERNEL); + if(ki == NULL) { + kfree(pk1); + return; + } + cd = pk1 + 56; pd1= cd + 56; rl = pd1 + 64; @@ -243,6 +249,7 @@ dohash(char *out, char *in, char *key, int forw) er = kmalloc(48+48+32+32+32, GFP_KERNEL); if(er == NULL) { kfree(pk1); + kfree(ki); return; } erk = er+48; @@ -290,6 +297,7 @@ dohash(char *out, char *in, char *key, int forw) permute(out, rl, perm6, 64); kfree(pk1); + kfree(ki); } static void diff --git a/fs/coda/dir.c b/fs/coda/dir.c index 0102b28a15f..0c6f7f3b3dd 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -441,7 +441,7 @@ static int coda_rename(struct inode *old_dir, struct dentry *old_dentry, /* file operations for directories */ int coda_readdir(struct file *coda_file, void *dirent, filldir_t filldir) { - struct dentry *coda_dentry = coda_file->f_dentry; + struct dentry *coda_dentry = coda_file->f_path.dentry; struct coda_file_info *cfi; struct file *host_file; struct inode *host_inode; @@ -453,7 +453,7 @@ int coda_readdir(struct file *coda_file, void *dirent, filldir_t filldir) coda_vfs_stat.readdir++; - host_inode = host_file->f_dentry->d_inode; + host_inode = 
host_file->f_path.dentry->d_inode; mutex_lock(&host_inode->i_mutex); host_file->f_pos = coda_file->f_pos; @@ -544,14 +544,14 @@ static int coda_venus_readdir(struct file *filp, filldir_t filldir, /* catch truncated reads */ if (ret < vdir_size || ret < vdir_size + vdir->d_namlen) { printk("coda_venus_readdir: short read: %ld\n", - filp->f_dentry->d_inode->i_ino); + filp->f_path.dentry->d_inode->i_ino); ret = -EBADF; break; } /* validate whether the directory file actually makes sense */ if (vdir->d_reclen < vdir_size + vdir->d_namlen) { printk("coda_venus_readdir: Invalid dir: %ld\n", - filp->f_dentry->d_inode->i_ino); + filp->f_path.dentry->d_inode->i_ino); ret = -EBADF; break; } diff --git a/fs/coda/file.c b/fs/coda/file.c index dbfbcfa5b3c..5ef2b609ec7 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -66,7 +66,7 @@ coda_file_sendfile(struct file *coda_file, loff_t *ppos, size_t count, static ssize_t coda_file_write(struct file *coda_file, const char __user *buf, size_t count, loff_t *ppos) { - struct inode *host_inode, *coda_inode = coda_file->f_dentry->d_inode; + struct inode *host_inode, *coda_inode = coda_file->f_path.dentry->d_inode; struct coda_file_info *cfi; struct file *host_file; ssize_t ret; @@ -78,7 +78,7 @@ coda_file_write(struct file *coda_file, const char __user *buf, size_t count, lo if (!host_file->f_op || !host_file->f_op->write) return -EINVAL; - host_inode = host_file->f_dentry->d_inode; + host_inode = host_file->f_path.dentry->d_inode; mutex_lock(&coda_inode->i_mutex); ret = host_file->f_op->write(host_file, buf, count, ppos); @@ -106,8 +106,8 @@ coda_file_mmap(struct file *coda_file, struct vm_area_struct *vma) if (!host_file->f_op || !host_file->f_op->mmap) return -ENODEV; - coda_inode = coda_file->f_dentry->d_inode; - host_inode = host_file->f_dentry->d_inode; + coda_inode = coda_file->f_path.dentry->d_inode; + host_inode = host_file->f_path.dentry->d_inode; coda_file->f_mapping = host_file->f_mapping; if (coda_inode->i_mapping == 
&coda_inode->i_data) coda_inode->i_mapping = host_inode->i_mapping; @@ -190,7 +190,7 @@ int coda_flush(struct file *coda_file, fl_owner_t id) cfi = CODA_FTOC(coda_file); BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); - coda_inode = coda_file->f_dentry->d_inode; + coda_inode = coda_file->f_path.dentry->d_inode; err = venus_store(coda_inode->i_sb, coda_i2f(coda_inode), coda_flags, coda_file->f_uid); @@ -233,7 +233,7 @@ int coda_release(struct inode *coda_inode, struct file *coda_file) err = venus_close(coda_inode->i_sb, coda_i2f(coda_inode), coda_flags, coda_file->f_uid); - host_inode = cfi->cfi_container->f_dentry->d_inode; + host_inode = cfi->cfi_container->f_path.dentry->d_inode; cii = ITOC(coda_inode); /* did we mmap this file? */ @@ -270,7 +270,7 @@ int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, int datasync) coda_vfs_stat.fsync++; if (host_file->f_op && host_file->f_op->fsync) { - host_dentry = host_file->f_dentry; + host_dentry = host_file->f_path.dentry; host_inode = host_dentry->d_inode; mutex_lock(&host_inode->i_mutex); err = host_file->f_op->fsync(host_file, host_dentry, datasync); diff --git a/fs/coda/inode.c b/fs/coda/inode.c index b64659fa82d..01395defed8 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -119,7 +119,7 @@ static int get_device_index(struct coda_mount_data *data) file = fget(data->fd); inode = NULL; if(file) - inode = file->f_dentry->d_inode; + inode = file->f_path.dentry->d_inode; if(!inode || !S_ISCHR(inode->i_mode) || imajor(inode) != CODA_PSDEV_MAJOR) { diff --git a/fs/compat.c b/fs/compat.c index a7e3f162fb1..0ec70e3cee0 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -232,7 +232,7 @@ asmlinkage long compat_sys_fstatfs(unsigned int fd, struct compat_statfs __user file = fget(fd); if (!file) goto out; - error = vfs_statfs(file->f_dentry, &tmp); + error = vfs_statfs(file->f_path.dentry, &tmp); if (!error) error = put_compat_statfs(buf, &tmp); fput(file); @@ -303,7 +303,7 @@ asmlinkage long 
compat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct c file = fget(fd); if (!file) goto out; - error = vfs_statfs(file->f_dentry, &tmp); + error = vfs_statfs(file->f_path.dentry, &tmp); if (!error) error = put_compat_statfs64(buf, &tmp); fput(file); @@ -365,7 +365,7 @@ static void compat_ioctl_error(struct file *filp, unsigned int fd, /* find the name of the device. */ path = (char *)__get_free_page(GFP_KERNEL); if (path) { - fn = d_path(filp->f_dentry, filp->f_vfsmnt, path, PAGE_SIZE); + fn = d_path(filp->f_path.dentry, filp->f_path.mnt, path, PAGE_SIZE); if (IS_ERR(fn)) fn = "?"; } @@ -416,7 +416,7 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd, case FIBMAP: case FIGETBSZ: case FIONREAD: - if (S_ISREG(filp->f_dentry->d_inode->i_mode)) + if (S_ISREG(filp->f_path.dentry->d_inode->i_mode)) break; /*FALL THROUGH*/ @@ -438,7 +438,7 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd, goto found_handler; } - if (S_ISSOCK(filp->f_dentry->d_inode->i_mode) && + if (S_ISSOCK(filp->f_path.dentry->d_inode->i_mode) && cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15)) { error = siocdevprivate_ioctl(fd, cmd, arg); } else { @@ -1259,7 +1259,7 @@ out: if (iov != iovstack) kfree(iov); if ((ret + (type == READ)) > 0) { - struct dentry *dentry = file->f_dentry; + struct dentry *dentry = file->f_path.dentry; if (type == READ) fsnotify_access(dentry); else @@ -1679,19 +1679,19 @@ int compat_core_sys_select(int n, compat_ulong_t __user *inp, { fd_set_bits fds; char *bits; - int size, max_fdset, ret = -EINVAL; + int size, max_fds, ret = -EINVAL; struct fdtable *fdt; if (n < 0) goto out_nofds; - /* max_fdset can increase, so grab it once to avoid race */ + /* max_fds can increase, so grab it once to avoid race */ rcu_read_lock(); fdt = files_fdtable(current->files); - max_fdset = fdt->max_fdset; + max_fds = fdt->max_fds; rcu_read_unlock(); - if (n > max_fdset) - n = max_fdset; + if (n > max_fds) + n = max_fds; /* * We need 6 bitmaps 
(in/out/ex for both incoming and outgoing), diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index bcc3caf5d82..c81c958b3e1 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -1177,7 +1177,7 @@ static int cdrom_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long ar static int vt_check(struct file *file) { struct tty_struct *tty; - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file->f_path.dentry->d_inode; if (file->f_op->ioctl != tty_ioctl) return -EINVAL; diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index c398861f78a..1814ba44680 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -980,7 +980,7 @@ int configfs_rename_dir(struct config_item * item, const char *new_name) static int configfs_dir_open(struct inode *inode, struct file *file) { - struct dentry * dentry = file->f_dentry; + struct dentry * dentry = file->f_path.dentry; struct configfs_dirent * parent_sd = dentry->d_fsdata; mutex_lock(&dentry->d_inode->i_mutex); @@ -993,7 +993,7 @@ static int configfs_dir_open(struct inode *inode, struct file *file) static int configfs_dir_close(struct inode *inode, struct file *file) { - struct dentry * dentry = file->f_dentry; + struct dentry * dentry = file->f_path.dentry; struct configfs_dirent * cursor = file->private_data; mutex_lock(&dentry->d_inode->i_mutex); @@ -1013,7 +1013,7 @@ static inline unsigned char dt_type(struct configfs_dirent *sd) static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir) { - struct dentry *dentry = filp->f_dentry; + struct dentry *dentry = filp->f_path.dentry; struct configfs_dirent * parent_sd = dentry->d_fsdata; struct configfs_dirent *cursor = filp->private_data; struct list_head *p, *q = &cursor->s_sibling; @@ -1070,7 +1070,7 @@ static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir static loff_t configfs_dir_lseek(struct file * file, loff_t offset, int origin) { - struct dentry * dentry = file->f_dentry; + struct dentry * 
dentry = file->f_path.dentry; mutex_lock(&dentry->d_inode->i_mutex); switch (origin) { @@ -1080,7 +1080,7 @@ static loff_t configfs_dir_lseek(struct file * file, loff_t offset, int origin) if (offset >= 0) break; default: - mutex_unlock(&file->f_dentry->d_inode->i_mutex); + mutex_unlock(&file->f_path.dentry->d_inode->i_mutex); return -EINVAL; } if (offset != file->f_pos) { diff --git a/fs/configfs/file.c b/fs/configfs/file.c index cf33fac68c8..d98be5e0132 100644 --- a/fs/configfs/file.c +++ b/fs/configfs/file.c @@ -134,7 +134,7 @@ configfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *pp down(&buffer->sem); if (buffer->needs_read_fill) { - if ((retval = fill_read_buffer(file->f_dentry,buffer))) + if ((retval = fill_read_buffer(file->f_path.dentry,buffer))) goto out; } pr_debug("%s: count = %zd, ppos = %lld, buf = %s\n", @@ -162,14 +162,17 @@ fill_write_buffer(struct configfs_buffer * buffer, const char __user * buf, size int error; if (!buffer->page) - buffer->page = (char *)get_zeroed_page(GFP_KERNEL); + buffer->page = (char *)__get_free_pages(GFP_KERNEL, 0); if (!buffer->page) return -ENOMEM; - if (count > PAGE_SIZE) - count = PAGE_SIZE; + if (count >= PAGE_SIZE) + count = PAGE_SIZE - 1; error = copy_from_user(buffer->page,buf,count); buffer->needs_read_fill = 1; + /* if buf is assumed to contain a string, terminate it by \0, + * so e.g. sscanf() can scan the string easily */ + buffer->page[count] = 0; return error ? 
-EFAULT : count; } @@ -222,7 +225,7 @@ configfs_write_file(struct file *file, const char __user *buf, size_t count, lof down(&buffer->sem); len = fill_write_buffer(buffer, buf, count); if (len > 0) - len = flush_write_buffer(file->f_dentry, buffer, count); + len = flush_write_buffer(file->f_path.dentry, buffer, count); if (len > 0) *ppos += len; up(&buffer->sem); @@ -231,8 +234,8 @@ configfs_write_file(struct file *file, const char __user *buf, size_t count, lof static int check_perm(struct inode * inode, struct file * file) { - struct config_item *item = configfs_get_config_item(file->f_dentry->d_parent); - struct configfs_attribute * attr = to_attr(file->f_dentry); + struct config_item *item = configfs_get_config_item(file->f_path.dentry->d_parent); + struct configfs_attribute * attr = to_attr(file->f_path.dentry); struct configfs_buffer * buffer; struct configfs_item_operations * ops = NULL; int error = 0; @@ -305,8 +308,8 @@ static int configfs_open_file(struct inode * inode, struct file * filp) static int configfs_release(struct inode * inode, struct file * filp) { - struct config_item * item = to_item(filp->f_dentry->d_parent); - struct configfs_attribute * attr = to_attr(filp->f_dentry); + struct config_item * item = to_item(filp->f_path.dentry->d_parent); + struct configfs_attribute * attr = to_attr(filp->f_path.dentry); struct module * owner = attr->ca_owner; struct configfs_buffer * buffer = filp->private_data; diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 0509cedd415..6db03fb089d 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -338,7 +338,7 @@ static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf) */ static int cramfs_readdir(struct file *filp, void *dirent, filldir_t filldir) { - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = filp->f_path.dentry->d_inode; struct super_block *sb = inode->i_sb; char *buf; unsigned int offset; diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 
137d76c3f90..c692487346e 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -24,6 +24,7 @@ #include <linux/kobject.h> #include <linux/namei.h> #include <linux/debugfs.h> +#include <linux/fsnotify.h> #define DEBUGFS_MAGIC 0x64626720 @@ -54,7 +55,8 @@ static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t d inode->i_op = &simple_dir_inode_operations; inode->i_fop = &simple_dir_operations; - /* directory inodes start off with i_nlink == 2 (for "." entry) */ + /* directory inodes start off with i_nlink == 2 + * (for "." entry) */ inc_nlink(inode); break; } @@ -87,15 +89,22 @@ static int debugfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) mode = (mode & (S_IRWXUGO | S_ISVTX)) | S_IFDIR; res = debugfs_mknod(dir, dentry, mode, 0); - if (!res) + if (!res) { inc_nlink(dir); + fsnotify_mkdir(dir, dentry); + } return res; } static int debugfs_create(struct inode *dir, struct dentry *dentry, int mode) { + int res; + mode = (mode & S_IALLUGO) | S_IFREG; - return debugfs_mknod(dir, dentry, mode, 0); + res = debugfs_mknod(dir, dentry, mode, 0); + if (!res) + fsnotify_create(dir, dentry); + return res; } static inline int debugfs_positive(struct dentry *dentry) @@ -135,7 +144,7 @@ static int debugfs_create_by_name(const char *name, mode_t mode, * block. A pointer to that is in the struct vfsmount that we * have around. 
*/ - if (!parent ) { + if (!parent) { if (debugfs_mount && debugfs_mount->mnt_sb) { parent = debugfs_mount->mnt_sb->s_root; } @@ -153,6 +162,7 @@ static int debugfs_create_by_name(const char *name, mode_t mode, error = debugfs_mkdir(parent->d_inode, *dentry, mode); else error = debugfs_create(parent->d_inode, *dentry, mode); + dput(*dentry); } else error = PTR_ERR(*dentry); mutex_unlock(&parent->d_inode->i_mutex); @@ -197,13 +207,15 @@ struct dentry *debugfs_create_file(const char *name, mode_t mode, pr_debug("debugfs: creating file '%s'\n",name); - error = simple_pin_fs(&debug_fs_type, &debugfs_mount, &debugfs_mount_count); + error = simple_pin_fs(&debug_fs_type, &debugfs_mount, + &debugfs_mount_count); if (error) goto exit; error = debugfs_create_by_name(name, mode, parent, &dentry); if (error) { dentry = NULL; + simple_release_fs(&debugfs_mount, &debugfs_mount_count); goto exit; } @@ -262,6 +274,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_dir); void debugfs_remove(struct dentry *dentry) { struct dentry *parent; + int ret = 0; if (!dentry) return; @@ -273,11 +286,19 @@ void debugfs_remove(struct dentry *dentry) mutex_lock(&parent->d_inode->i_mutex); if (debugfs_positive(dentry)) { if (dentry->d_inode) { - if (S_ISDIR(dentry->d_inode->i_mode)) - simple_rmdir(parent->d_inode, dentry); - else + dget(dentry); + if (S_ISDIR(dentry->d_inode->i_mode)) { + ret = simple_rmdir(parent->d_inode, dentry); + if (ret) + printk(KERN_ERR + "DebugFS rmdir on %s failed : " + "directory not empty.\n", + dentry->d_name.name); + } else simple_unlink(parent->d_inode, dentry); - dput(dentry); + if (!ret) + d_delete(dentry); + dput(dentry); } } mutex_unlock(&parent->d_inode->i_mutex); diff --git a/fs/direct-io.c b/fs/direct-io.c index 5981e17f46f..d9d0833444f 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -27,6 +27,7 @@ #include <linux/slab.h> #include <linux/highmem.h> #include <linux/pagemap.h> +#include <linux/task_io_accounting_ops.h> #include <linux/bio.h> #include <linux/wait.h> 
#include <linux/err.h> @@ -121,8 +122,7 @@ struct dio { /* BIO completion state */ spinlock_t bio_lock; /* protects BIO fields below */ - int bio_count; /* nr bios to be completed */ - int bios_in_flight; /* nr bios in flight */ + unsigned long refcount; /* direct_io_worker() and bios */ struct bio *bio_list; /* singly linked via bi_private */ struct task_struct *waiter; /* waiting task (NULL if none) */ @@ -209,76 +209,55 @@ static struct page *dio_get_page(struct dio *dio) return dio->pages[dio->head++]; } -/* - * Called when all DIO BIO I/O has been completed - let the filesystem - * know, if it registered an interest earlier via get_block. Pass the - * private field of the map buffer_head so that filesystems can use it - * to hold additional state between get_block calls and dio_complete. - */ -static void dio_complete(struct dio *dio, loff_t offset, ssize_t bytes) -{ - if (dio->end_io && dio->result) - dio->end_io(dio->iocb, offset, bytes, dio->map_bh.b_private); - if (dio->lock_type == DIO_LOCKING) - /* lockdep: non-owner release */ - up_read_non_owner(&dio->inode->i_alloc_sem); -} - -/* - * Called when a BIO has been processed. If the count goes to zero then IO is - * complete and we can signal this to the AIO layer. +/** + * dio_complete() - called when all DIO BIO I/O has been completed + * @offset: the byte offset in the file of the completed operation + * + * This releases locks as dictated by the locking type, lets interested parties + * know that a DIO operation has completed, and calculates the resulting return + * code for the operation. + * + * It lets the filesystem know if it registered an interest earlier via + * get_block. Pass the private field of the map buffer_head so that + * filesystems can use it to hold additional state between get_block calls and + * dio_complete. 
*/ -static void finished_one_bio(struct dio *dio) +static int dio_complete(struct dio *dio, loff_t offset, int ret) { - unsigned long flags; + ssize_t transferred = 0; - spin_lock_irqsave(&dio->bio_lock, flags); - if (dio->bio_count == 1) { - if (dio->is_async) { - ssize_t transferred; - loff_t offset; - - /* - * Last reference to the dio is going away. - * Drop spinlock and complete the DIO. - */ - spin_unlock_irqrestore(&dio->bio_lock, flags); + /* + * AIO submission can race with bio completion to get here while + * expecting to have the last io completed by bio completion. + * In that case -EIOCBQUEUED is in fact not an error we want + * to preserve through this call. + */ + if (ret == -EIOCBQUEUED) + ret = 0; - /* Check for short read case */ - transferred = dio->result; - offset = dio->iocb->ki_pos; + if (dio->result) { + transferred = dio->result; - if ((dio->rw == READ) && - ((offset + transferred) > dio->i_size)) - transferred = dio->i_size - offset; + /* Check for short read case */ + if ((dio->rw == READ) && ((offset + transferred) > dio->i_size)) + transferred = dio->i_size - offset; + } - /* check for error in completion path */ - if (dio->io_error) - transferred = dio->io_error; + if (dio->end_io && dio->result) + dio->end_io(dio->iocb, offset, transferred, + dio->map_bh.b_private); + if (dio->lock_type == DIO_LOCKING) + /* lockdep: non-owner release */ + up_read_non_owner(&dio->inode->i_alloc_sem); - dio_complete(dio, offset, transferred); + if (ret == 0) + ret = dio->page_errors; + if (ret == 0) + ret = dio->io_error; + if (ret == 0) + ret = transferred; - /* Complete AIO later if falling back to buffered i/o */ - if (dio->result == dio->size || - ((dio->rw == READ) && dio->result)) { - aio_complete(dio->iocb, transferred, 0); - kfree(dio); - return; - } else { - /* - * Falling back to buffered - */ - spin_lock_irqsave(&dio->bio_lock, flags); - dio->bio_count--; - if (dio->waiter) - wake_up_process(dio->waiter); - 
spin_unlock_irqrestore(&dio->bio_lock, flags); - return; - } - } - } - dio->bio_count--; - spin_unlock_irqrestore(&dio->bio_lock, flags); + return ret; } static int dio_bio_complete(struct dio *dio, struct bio *bio); @@ -288,12 +267,27 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio); static int dio_bio_end_aio(struct bio *bio, unsigned int bytes_done, int error) { struct dio *dio = bio->bi_private; + unsigned long remaining; + unsigned long flags; if (bio->bi_size) return 1; /* cleanup the bio */ dio_bio_complete(dio, bio); + + spin_lock_irqsave(&dio->bio_lock, flags); + remaining = --dio->refcount; + if (remaining == 1 && dio->waiter) + wake_up_process(dio->waiter); + spin_unlock_irqrestore(&dio->bio_lock, flags); + + if (remaining == 0) { + int ret = dio_complete(dio, dio->iocb->ki_pos, 0); + aio_complete(dio->iocb, ret, 0); + kfree(dio); + } + return 0; } @@ -315,8 +309,7 @@ static int dio_bio_end_io(struct bio *bio, unsigned int bytes_done, int error) spin_lock_irqsave(&dio->bio_lock, flags); bio->bi_private = dio->bio_list; dio->bio_list = bio; - dio->bios_in_flight--; - if (dio->waiter && dio->bios_in_flight == 0) + if (--dio->refcount == 1 && dio->waiter) wake_up_process(dio->waiter); spin_unlock_irqrestore(&dio->bio_lock, flags); return 0; @@ -347,6 +340,8 @@ dio_bio_alloc(struct dio *dio, struct block_device *bdev, * In the AIO read case we speculatively dirty the pages before starting IO. * During IO completion, any of these pages which happen to have been written * back will be redirtied by bio_check_pages_dirty(). + * + * bios hold a dio reference between submit_bio and ->end_io. 
*/ static void dio_bio_submit(struct dio *dio) { @@ -354,12 +349,14 @@ static void dio_bio_submit(struct dio *dio) unsigned long flags; bio->bi_private = dio; + spin_lock_irqsave(&dio->bio_lock, flags); - dio->bio_count++; - dio->bios_in_flight++; + dio->refcount++; spin_unlock_irqrestore(&dio->bio_lock, flags); + if (dio->is_async && dio->rw == READ) bio_set_pages_dirty(bio); + submit_bio(dio->rw, bio); dio->bio = NULL; @@ -376,28 +373,37 @@ static void dio_cleanup(struct dio *dio) } /* - * Wait for the next BIO to complete. Remove it and return it. + * Wait for the next BIO to complete. Remove it and return it. NULL is + * returned once all BIOs have been completed. This must only be called once + * all bios have been issued so that dio->refcount can only decrease. This + * requires that that the caller hold a reference on the dio. */ static struct bio *dio_await_one(struct dio *dio) { unsigned long flags; - struct bio *bio; + struct bio *bio = NULL; spin_lock_irqsave(&dio->bio_lock, flags); - while (dio->bio_list == NULL) { - set_current_state(TASK_UNINTERRUPTIBLE); - if (dio->bio_list == NULL) { - dio->waiter = current; - spin_unlock_irqrestore(&dio->bio_lock, flags); - blk_run_address_space(dio->inode->i_mapping); - io_schedule(); - spin_lock_irqsave(&dio->bio_lock, flags); - dio->waiter = NULL; - } - set_current_state(TASK_RUNNING); + + /* + * Wait as long as the list is empty and there are bios in flight. bio + * completion drops the count, maybe adds to the list, and wakes while + * holding the bio_lock so we don't need set_current_state()'s barrier + * and can call it after testing our condition. 
+ */ + while (dio->refcount > 1 && dio->bio_list == NULL) { + __set_current_state(TASK_UNINTERRUPTIBLE); + dio->waiter = current; + spin_unlock_irqrestore(&dio->bio_lock, flags); + io_schedule(); + /* wake up sets us TASK_RUNNING */ + spin_lock_irqsave(&dio->bio_lock, flags); + dio->waiter = NULL; + } + if (dio->bio_list) { + bio = dio->bio_list; + dio->bio_list = bio->bi_private; } - bio = dio->bio_list; - dio->bio_list = bio->bi_private; spin_unlock_irqrestore(&dio->bio_lock, flags); return bio; } @@ -426,34 +432,24 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio) } bio_put(bio); } - finished_one_bio(dio); return uptodate ? 0 : -EIO; } /* - * Wait on and process all in-flight BIOs. + * Wait on and process all in-flight BIOs. This must only be called once + * all bios have been issued so that the refcount can only decrease. + * This just waits for all bios to make it through dio_bio_complete. IO + * errors are propogated through dio->io_error and should be propogated via + * dio_complete(). */ -static int dio_await_completion(struct dio *dio) +static void dio_await_completion(struct dio *dio) { - int ret = 0; - - if (dio->bio) - dio_bio_submit(dio); - - /* - * The bio_lock is not held for the read of bio_count. - * This is ok since it is the dio_bio_complete() that changes - * bio_count. - */ - while (dio->bio_count) { - struct bio *bio = dio_await_one(dio); - int ret2; - - ret2 = dio_bio_complete(dio, bio); - if (ret == 0) - ret = ret2; - } - return ret; + struct bio *bio; + do { + bio = dio_await_one(dio); + if (bio) + dio_bio_complete(dio, bio); + } while (bio); } /* @@ -675,6 +671,13 @@ submit_page_section(struct dio *dio, struct page *page, { int ret = 0; + if (dio->rw & WRITE) { + /* + * Read accounting is performed in submit_bio() + */ + task_io_account_write(len); + } + /* * Can we just grow the current page's presence in the dio? 
*/ @@ -953,6 +956,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, struct dio *dio) { unsigned long user_addr; + unsigned long flags; int seg; ssize_t ret = 0; ssize_t ret2; @@ -983,17 +987,8 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, dio->iocb = iocb; dio->i_size = i_size_read(inode); - /* - * BIO completion state. - * - * ->bio_count starts out at one, and we decrement it to zero after all - * BIOs are submitted. This to avoid the situation where a really fast - * (or synchronous) device could take the count to zero while we're - * still submitting BIOs. - */ - dio->bio_count = 1; - dio->bios_in_flight = 0; spin_lock_init(&dio->bio_lock); + dio->refcount = 1; dio->bio_list = NULL; dio->waiter = NULL; @@ -1069,6 +1064,9 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, if (dio->bio) dio_bio_submit(dio); + /* All IO is now issued, send it on its way */ + blk_run_address_space(inode->i_mapping); + /* * It is possible that, we return short IO due to end of file. * In that case, we need to release all the pages we got hold on. @@ -1084,74 +1082,41 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, mutex_unlock(&dio->inode->i_mutex); /* - * OK, all BIOs are submitted, so we can decrement bio_count to truly - * reflect the number of to-be-processed BIOs. + * The only time we want to leave bios in flight is when a successful + * partial aio read or full aio write have been setup. In that case + * bio completion will call aio_complete. The only time it's safe to + * call aio_complete is when we return -EIOCBQUEUED, so we key on that. + * This had *better* be the only place that raises -EIOCBQUEUED. 
*/ - if (dio->is_async) { - int should_wait = 0; + BUG_ON(ret == -EIOCBQUEUED); + if (dio->is_async && ret == 0 && dio->result && + ((rw & READ) || (dio->result == dio->size))) + ret = -EIOCBQUEUED; - if (dio->result < dio->size && (rw & WRITE)) { - dio->waiter = current; - should_wait = 1; - } - if (ret == 0) - ret = dio->result; - finished_one_bio(dio); /* This can free the dio */ - blk_run_address_space(inode->i_mapping); - if (should_wait) { - unsigned long flags; - /* - * Wait for already issued I/O to drain out and - * release its references to user-space pages - * before returning to fallback on buffered I/O - */ - - spin_lock_irqsave(&dio->bio_lock, flags); - set_current_state(TASK_UNINTERRUPTIBLE); - while (dio->bio_count) { - spin_unlock_irqrestore(&dio->bio_lock, flags); - io_schedule(); - spin_lock_irqsave(&dio->bio_lock, flags); - set_current_state(TASK_UNINTERRUPTIBLE); - } - spin_unlock_irqrestore(&dio->bio_lock, flags); - set_current_state(TASK_RUNNING); - kfree(dio); - } - } else { - ssize_t transferred = 0; - - finished_one_bio(dio); - ret2 = dio_await_completion(dio); - if (ret == 0) - ret = ret2; - if (ret == 0) - ret = dio->page_errors; - if (dio->result) { - loff_t i_size = i_size_read(inode); - - transferred = dio->result; - /* - * Adjust the return value if the read crossed a - * non-block-aligned EOF. - */ - if (rw == READ && (offset + transferred > i_size)) - transferred = i_size - offset; - } - dio_complete(dio, offset, transferred); - if (ret == 0) - ret = transferred; + if (ret != -EIOCBQUEUED) + dio_await_completion(dio); - /* We could have also come here on an AIO file extend */ - if (!is_sync_kiocb(iocb) && (rw & WRITE) && - ret >= 0 && dio->result == dio->size) - /* - * For AIO writes where we have completed the - * i/o, we have to mark the the aio complete. - */ - aio_complete(iocb, ret, 0); + /* + * Sync will always be dropping the final ref and completing the + * operation. 
AIO can if it was a broken operation described above or + * in fact if all the bios race to complete before we get here. In + * that case dio_complete() translates the EIOCBQUEUED into the proper + * return code that the caller will hand to aio_complete(). + * + * This is managed by the bio_lock instead of being an atomic_t so that + * completion paths can drop their ref and use the remaining count to + * decide to wake the submission path atomically. + */ + spin_lock_irqsave(&dio->bio_lock, flags); + ret2 = --dio->refcount; + spin_unlock_irqrestore(&dio->bio_lock, flags); + BUG_ON(!dio->is_async && ret2 != 0); + if (ret2 == 0) { + ret = dio_complete(dio, offset, ret); kfree(dio); - } + } else + BUG_ON(ret != -EIOCBQUEUED); + return ret; } diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig index b5654a284fe..6fa7b0d5c04 100644 --- a/fs/dlm/Kconfig +++ b/fs/dlm/Kconfig @@ -3,21 +3,21 @@ menu "Distributed Lock Manager" config DLM tristate "Distributed Lock Manager (DLM)" - depends on IPV6 || IPV6=n + depends on SYSFS && (IPV6 || IPV6=n) select CONFIGFS_FS select IP_SCTP if DLM_SCTP help - A general purpose distributed lock manager for kernel or userspace - applications. + A general purpose distributed lock manager for kernel or userspace + applications. choice prompt "Select DLM communications protocol" depends on DLM default DLM_TCP help - The DLM Can use TCP or SCTP for it's network communications. - SCTP supports multi-homed operations whereas TCP doesn't. - However, SCTP seems to have stability problems at the moment. + The DLM Can use TCP or SCTP for it's network communications. + SCTP supports multi-homed operations whereas TCP doesn't. + However, SCTP seems to have stability problems at the moment. config DLM_TCP bool "TCP/IP" @@ -31,8 +31,8 @@ config DLM_DEBUG bool "DLM debugging" depends on DLM help - Under the debugfs mount point, the name of each lockspace will - appear as a file in the "dlm" directory. 
The output is the - list of resource and locks the local node knows about. + Under the debugfs mount point, the name of each lockspace will + appear as a file in the "dlm" directory. The output is the + list of resource and locks the local node knows about. endmenu diff --git a/fs/dlm/config.c b/fs/dlm/config.c index 88553054bbf..8665c88e5af 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -54,6 +54,11 @@ static struct config_item *make_node(struct config_group *, const char *); static void drop_node(struct config_group *, struct config_item *); static void release_node(struct config_item *); +static ssize_t show_cluster(struct config_item *i, struct configfs_attribute *a, + char *buf); +static ssize_t store_cluster(struct config_item *i, + struct configfs_attribute *a, + const char *buf, size_t len); static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a, char *buf); static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a, @@ -73,6 +78,101 @@ static ssize_t node_nodeid_write(struct node *nd, const char *buf, size_t len); static ssize_t node_weight_read(struct node *nd, char *buf); static ssize_t node_weight_write(struct node *nd, const char *buf, size_t len); +struct cluster { + struct config_group group; + unsigned int cl_tcp_port; + unsigned int cl_buffer_size; + unsigned int cl_rsbtbl_size; + unsigned int cl_lkbtbl_size; + unsigned int cl_dirtbl_size; + unsigned int cl_recover_timer; + unsigned int cl_toss_secs; + unsigned int cl_scan_secs; + unsigned int cl_log_debug; +}; + +enum { + CLUSTER_ATTR_TCP_PORT = 0, + CLUSTER_ATTR_BUFFER_SIZE, + CLUSTER_ATTR_RSBTBL_SIZE, + CLUSTER_ATTR_LKBTBL_SIZE, + CLUSTER_ATTR_DIRTBL_SIZE, + CLUSTER_ATTR_RECOVER_TIMER, + CLUSTER_ATTR_TOSS_SECS, + CLUSTER_ATTR_SCAN_SECS, + CLUSTER_ATTR_LOG_DEBUG, +}; + +struct cluster_attribute { + struct configfs_attribute attr; + ssize_t (*show)(struct cluster *, char *); + ssize_t (*store)(struct cluster *, const char *, size_t); +}; + +static 
ssize_t cluster_set(struct cluster *cl, unsigned int *cl_field, + unsigned int *info_field, int check_zero, + const char *buf, size_t len) +{ + unsigned int x; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + x = simple_strtoul(buf, NULL, 0); + + if (check_zero && !x) + return -EINVAL; + + *cl_field = x; + *info_field = x; + + return len; +} + +#define __CONFIGFS_ATTR(_name,_mode,_read,_write) { \ + .attr = { .ca_name = __stringify(_name), \ + .ca_mode = _mode, \ + .ca_owner = THIS_MODULE }, \ + .show = _read, \ + .store = _write, \ +} + +#define CLUSTER_ATTR(name, check_zero) \ +static ssize_t name##_write(struct cluster *cl, const char *buf, size_t len) \ +{ \ + return cluster_set(cl, &cl->cl_##name, &dlm_config.ci_##name, \ + check_zero, buf, len); \ +} \ +static ssize_t name##_read(struct cluster *cl, char *buf) \ +{ \ + return snprintf(buf, PAGE_SIZE, "%u\n", cl->cl_##name); \ +} \ +static struct cluster_attribute cluster_attr_##name = \ +__CONFIGFS_ATTR(name, 0644, name##_read, name##_write) + +CLUSTER_ATTR(tcp_port, 1); +CLUSTER_ATTR(buffer_size, 1); +CLUSTER_ATTR(rsbtbl_size, 1); +CLUSTER_ATTR(lkbtbl_size, 1); +CLUSTER_ATTR(dirtbl_size, 1); +CLUSTER_ATTR(recover_timer, 1); +CLUSTER_ATTR(toss_secs, 1); +CLUSTER_ATTR(scan_secs, 1); +CLUSTER_ATTR(log_debug, 0); + +static struct configfs_attribute *cluster_attrs[] = { + [CLUSTER_ATTR_TCP_PORT] = &cluster_attr_tcp_port.attr, + [CLUSTER_ATTR_BUFFER_SIZE] = &cluster_attr_buffer_size.attr, + [CLUSTER_ATTR_RSBTBL_SIZE] = &cluster_attr_rsbtbl_size.attr, + [CLUSTER_ATTR_LKBTBL_SIZE] = &cluster_attr_lkbtbl_size.attr, + [CLUSTER_ATTR_DIRTBL_SIZE] = &cluster_attr_dirtbl_size.attr, + [CLUSTER_ATTR_RECOVER_TIMER] = &cluster_attr_recover_timer.attr, + [CLUSTER_ATTR_TOSS_SECS] = &cluster_attr_toss_secs.attr, + [CLUSTER_ATTR_SCAN_SECS] = &cluster_attr_scan_secs.attr, + [CLUSTER_ATTR_LOG_DEBUG] = &cluster_attr_log_debug.attr, + NULL, +}; + enum { COMM_ATTR_NODEID = 0, COMM_ATTR_LOCAL, @@ -152,10 +252,6 @@ struct clusters 
{ struct configfs_subsystem subsys; }; -struct cluster { - struct config_group group; -}; - struct spaces { struct config_group ss_group; }; @@ -197,6 +293,8 @@ static struct configfs_group_operations clusters_ops = { static struct configfs_item_operations cluster_ops = { .release = release_cluster, + .show_attribute = show_cluster, + .store_attribute = store_cluster, }; static struct configfs_group_operations spaces_ops = { @@ -237,6 +335,7 @@ static struct config_item_type clusters_type = { static struct config_item_type cluster_type = { .ct_item_ops = &cluster_ops, + .ct_attrs = cluster_attrs, .ct_owner = THIS_MODULE, }; @@ -317,6 +416,16 @@ static struct config_group *make_cluster(struct config_group *g, cl->group.default_groups[1] = &cms->cs_group; cl->group.default_groups[2] = NULL; + cl->cl_tcp_port = dlm_config.ci_tcp_port; + cl->cl_buffer_size = dlm_config.ci_buffer_size; + cl->cl_rsbtbl_size = dlm_config.ci_rsbtbl_size; + cl->cl_lkbtbl_size = dlm_config.ci_lkbtbl_size; + cl->cl_dirtbl_size = dlm_config.ci_dirtbl_size; + cl->cl_recover_timer = dlm_config.ci_recover_timer; + cl->cl_toss_secs = dlm_config.ci_toss_secs; + cl->cl_scan_secs = dlm_config.ci_scan_secs; + cl->cl_log_debug = dlm_config.ci_log_debug; + space_list = &sps->ss_group; comm_list = &cms->cs_group; return &cl->group; @@ -509,6 +618,25 @@ void dlm_config_exit(void) * Functions for user space to read/write attributes */ +static ssize_t show_cluster(struct config_item *i, struct configfs_attribute *a, + char *buf) +{ + struct cluster *cl = to_cluster(i); + struct cluster_attribute *cla = + container_of(a, struct cluster_attribute, attr); + return cla->show ? cla->show(cl, buf) : 0; +} + +static ssize_t store_cluster(struct config_item *i, + struct configfs_attribute *a, + const char *buf, size_t len) +{ + struct cluster *cl = to_cluster(i); + struct cluster_attribute *cla = + container_of(a, struct cluster_attribute, attr); + return cla->store ? 
cla->store(cl, buf, len) : -EINVAL; +} + static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a, char *buf) { @@ -775,15 +903,17 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num) #define DEFAULT_RECOVER_TIMER 5 #define DEFAULT_TOSS_SECS 10 #define DEFAULT_SCAN_SECS 5 +#define DEFAULT_LOG_DEBUG 0 struct dlm_config_info dlm_config = { - .tcp_port = DEFAULT_TCP_PORT, - .buffer_size = DEFAULT_BUFFER_SIZE, - .rsbtbl_size = DEFAULT_RSBTBL_SIZE, - .lkbtbl_size = DEFAULT_LKBTBL_SIZE, - .dirtbl_size = DEFAULT_DIRTBL_SIZE, - .recover_timer = DEFAULT_RECOVER_TIMER, - .toss_secs = DEFAULT_TOSS_SECS, - .scan_secs = DEFAULT_SCAN_SECS + .ci_tcp_port = DEFAULT_TCP_PORT, + .ci_buffer_size = DEFAULT_BUFFER_SIZE, + .ci_rsbtbl_size = DEFAULT_RSBTBL_SIZE, + .ci_lkbtbl_size = DEFAULT_LKBTBL_SIZE, + .ci_dirtbl_size = DEFAULT_DIRTBL_SIZE, + .ci_recover_timer = DEFAULT_RECOVER_TIMER, + .ci_toss_secs = DEFAULT_TOSS_SECS, + .ci_scan_secs = DEFAULT_SCAN_SECS, + .ci_log_debug = DEFAULT_LOG_DEBUG }; diff --git a/fs/dlm/config.h b/fs/dlm/config.h index 9da7839958a..1e978611a96 100644 --- a/fs/dlm/config.h +++ b/fs/dlm/config.h @@ -17,14 +17,15 @@ #define DLM_MAX_ADDR_COUNT 3 struct dlm_config_info { - int tcp_port; - int buffer_size; - int rsbtbl_size; - int lkbtbl_size; - int dirtbl_size; - int recover_timer; - int toss_secs; - int scan_secs; + int ci_tcp_port; + int ci_buffer_size; + int ci_rsbtbl_size; + int ci_lkbtbl_size; + int ci_dirtbl_size; + int ci_recover_timer; + int ci_toss_secs; + int ci_scan_secs; + int ci_log_debug; }; extern struct dlm_config_info dlm_config; diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index 1ee8195e6fc..61d93201e1b 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -41,6 +41,7 @@ #include <asm/uaccess.h> #include <linux/dlm.h> +#include "config.h" #define DLM_LOCKSPACE_LEN 64 @@ -69,12 +70,12 @@ struct dlm_mhandle; #define log_error(ls, fmt, args...) 
\ printk(KERN_ERR "dlm: %s: " fmt "\n", (ls)->ls_name , ##args) -#define DLM_LOG_DEBUG -#ifdef DLM_LOG_DEBUG -#define log_debug(ls, fmt, args...) log_error(ls, fmt, ##args) -#else -#define log_debug(ls, fmt, args...) -#endif +#define log_debug(ls, fmt, args...) \ +do { \ + if (dlm_config.ci_log_debug) \ + printk(KERN_DEBUG "dlm: %s: " fmt "\n", \ + (ls)->ls_name , ##args); \ +} while (0) #define DLM_ASSERT(x, do) \ { \ @@ -309,8 +310,8 @@ static inline int rsb_flag(struct dlm_rsb *r, enum rsb_flags flag) /* dlm_header is first element of all structs sent between nodes */ -#define DLM_HEADER_MAJOR 0x00020000 -#define DLM_HEADER_MINOR 0x00000001 +#define DLM_HEADER_MAJOR 0x00030000 +#define DLM_HEADER_MINOR 0x00000000 #define DLM_MSG 1 #define DLM_RCOM 2 @@ -386,6 +387,8 @@ struct dlm_rcom { uint32_t rc_type; /* DLM_RCOM_ */ int rc_result; /* multi-purpose */ uint64_t rc_id; /* match reply with request */ + uint64_t rc_seq; /* sender's ls_recover_seq */ + uint64_t rc_seq_reply; /* remote ls_recover_seq */ char rc_buf[0]; }; @@ -523,6 +526,7 @@ struct dlm_user_proc { spinlock_t asts_spin; struct list_head locks; spinlock_t locks_spin; + struct list_head unlocking; wait_queue_head_t wait; }; diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 30878defaeb..e725005fafd 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -754,6 +754,11 @@ static void add_to_waiters(struct dlm_lkb *lkb, int mstype) mutex_unlock(&ls->ls_waiters_mutex); } +/* We clear the RESEND flag because we might be taking an lkb off the waiters + list as part of process_requestqueue (e.g. 
a lookup that has an optimized + request reply on the requestqueue) between dlm_recover_waiters_pre() which + set RESEND and dlm_recover_waiters_post() */ + static int _remove_from_waiters(struct dlm_lkb *lkb) { int error = 0; @@ -764,6 +769,7 @@ static int _remove_from_waiters(struct dlm_lkb *lkb) goto out; } lkb->lkb_wait_type = 0; + lkb->lkb_flags &= ~DLM_IFL_RESEND; list_del(&lkb->lkb_wait_reply); unhold_lkb(lkb); out: @@ -810,7 +816,7 @@ static int shrink_bucket(struct dlm_ls *ls, int b) list_for_each_entry_reverse(r, &ls->ls_rsbtbl[b].toss, res_hashchain) { if (!time_after_eq(jiffies, r->res_toss_time + - dlm_config.toss_secs * HZ)) + dlm_config.ci_toss_secs * HZ)) continue; found = 1; break; @@ -2144,12 +2150,24 @@ static void send_args(struct dlm_rsb *r, struct dlm_lkb *lkb, if (lkb->lkb_astaddr) ms->m_asts |= AST_COMP; - if (ms->m_type == DLM_MSG_REQUEST || ms->m_type == DLM_MSG_LOOKUP) - memcpy(ms->m_extra, r->res_name, r->res_length); + /* compare with switch in create_message; send_remove() doesn't + use send_args() */ - else if (lkb->lkb_lvbptr) + switch (ms->m_type) { + case DLM_MSG_REQUEST: + case DLM_MSG_LOOKUP: + memcpy(ms->m_extra, r->res_name, r->res_length); + break; + case DLM_MSG_CONVERT: + case DLM_MSG_UNLOCK: + case DLM_MSG_REQUEST_REPLY: + case DLM_MSG_CONVERT_REPLY: + case DLM_MSG_GRANT: + if (!lkb->lkb_lvbptr) + break; memcpy(ms->m_extra, lkb->lkb_lvbptr, r->res_ls->ls_lvblen); - + break; + } } static int send_common(struct dlm_rsb *r, struct dlm_lkb *lkb, int mstype) @@ -2418,8 +2436,12 @@ static int receive_request_args(struct dlm_ls *ls, struct dlm_lkb *lkb, DLM_ASSERT(is_master_copy(lkb), dlm_print_lkb(lkb);); - if (receive_lvb(ls, lkb, ms)) - return -ENOMEM; + if (lkb->lkb_exflags & DLM_LKF_VALBLK) { + /* lkb was just created so there won't be an lvb yet */ + lkb->lkb_lvbptr = allocate_lvb(ls); + if (!lkb->lkb_lvbptr) + return -ENOMEM; + } return 0; } @@ -3002,7 +3024,7 @@ int dlm_receive_message(struct dlm_header *hd, int nodeid, 
int recovery) { struct dlm_message *ms = (struct dlm_message *) hd; struct dlm_ls *ls; - int error; + int error = 0; if (!recovery) dlm_message_in(ms); @@ -3119,7 +3141,7 @@ int dlm_receive_message(struct dlm_header *hd, int nodeid, int recovery) out: dlm_put_lockspace(ls); dlm_astd_wake(); - return 0; + return error; } @@ -3132,6 +3154,7 @@ static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb) if (middle_conversion(lkb)) { hold_lkb(lkb); ls->ls_stub_ms.m_result = -EINPROGRESS; + ls->ls_stub_ms.m_flags = lkb->lkb_flags; _remove_from_waiters(lkb); _receive_convert_reply(lkb, &ls->ls_stub_ms); @@ -3205,6 +3228,7 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls) case DLM_MSG_UNLOCK: hold_lkb(lkb); ls->ls_stub_ms.m_result = -DLM_EUNLOCK; + ls->ls_stub_ms.m_flags = lkb->lkb_flags; _remove_from_waiters(lkb); _receive_unlock_reply(lkb, &ls->ls_stub_ms); dlm_put_lkb(lkb); @@ -3213,6 +3237,7 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls) case DLM_MSG_CANCEL: hold_lkb(lkb); ls->ls_stub_ms.m_result = -DLM_ECANCEL; + ls->ls_stub_ms.m_flags = lkb->lkb_flags; _remove_from_waiters(lkb); _receive_cancel_reply(lkb, &ls->ls_stub_ms); dlm_put_lkb(lkb); @@ -3571,6 +3596,14 @@ int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc) lock_rsb(r); switch (error) { + case -EBADR: + /* There's a chance the new master received our lock before + dlm_recover_master_reply(), this wouldn't happen if we did + a barrier between recover_masters and recover_locks. 
*/ + log_debug(ls, "master copy not ready %x r %lx %s", lkb->lkb_id, + (unsigned long)r, r->res_name); + dlm_send_rcom_lock(r, lkb); + goto out; case -EEXIST: log_debug(ls, "master copy exists %x", lkb->lkb_id); /* fall through */ @@ -3585,7 +3618,7 @@ int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc) /* an ack for dlm_recover_locks() which waits for replies from all the locks it sends to new masters */ dlm_recovered_lock(r); - + out: unlock_rsb(r); put_rsb(r); dlm_put_lkb(lkb); @@ -3610,7 +3643,7 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, } if (flags & DLM_LKF_VALBLK) { - ua->lksb.sb_lvbptr = kmalloc(DLM_USER_LVB_LEN, GFP_KERNEL); + ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_KERNEL); if (!ua->lksb.sb_lvbptr) { kfree(ua); __put_lkb(ls, lkb); @@ -3679,7 +3712,7 @@ int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, ua = (struct dlm_user_args *)lkb->lkb_astparam; if (flags & DLM_LKF_VALBLK && !ua->lksb.sb_lvbptr) { - ua->lksb.sb_lvbptr = kmalloc(DLM_USER_LVB_LEN, GFP_KERNEL); + ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_KERNEL); if (!ua->lksb.sb_lvbptr) { error = -ENOMEM; goto out_put; @@ -3745,12 +3778,10 @@ int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, goto out_put; spin_lock(&ua->proc->locks_spin); - list_del_init(&lkb->lkb_ownqueue); + /* dlm_user_add_ast() may have already taken lkb off the proc list */ + if (!list_empty(&lkb->lkb_ownqueue)) + list_move(&lkb->lkb_ownqueue, &ua->proc->unlocking); spin_unlock(&ua->proc->locks_spin); - - /* this removes the reference for the proc->locks list added by - dlm_user_request */ - unhold_lkb(lkb); out_put: dlm_put_lkb(lkb); out: @@ -3790,9 +3821,8 @@ int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, /* this lkb was removed from the WAITING queue */ if (lkb->lkb_grmode == DLM_LOCK_IV) { spin_lock(&ua->proc->locks_spin); - list_del_init(&lkb->lkb_ownqueue); + list_move(&lkb->lkb_ownqueue, 
&ua->proc->unlocking); spin_unlock(&ua->proc->locks_spin); - unhold_lkb(lkb); } out_put: dlm_put_lkb(lkb); @@ -3853,11 +3883,6 @@ void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc) mutex_lock(&ls->ls_clear_proc_locks); list_for_each_entry_safe(lkb, safe, &proc->locks, lkb_ownqueue) { - if (lkb->lkb_ast_type) { - list_del(&lkb->lkb_astqueue); - unhold_lkb(lkb); - } - list_del_init(&lkb->lkb_ownqueue); if (lkb->lkb_exflags & DLM_LKF_PERSISTENT) { @@ -3874,6 +3899,20 @@ void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc) dlm_put_lkb(lkb); } + + /* in-progress unlocks */ + list_for_each_entry_safe(lkb, safe, &proc->unlocking, lkb_ownqueue) { + list_del_init(&lkb->lkb_ownqueue); + lkb->lkb_flags |= DLM_IFL_DEAD; + dlm_put_lkb(lkb); + } + + list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_astqueue) { + list_del(&lkb->lkb_astqueue); + dlm_put_lkb(lkb); + } + mutex_unlock(&ls->ls_clear_proc_locks); unlock_recovery(ls); } + diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 59012b089e8..f40817b53c6 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -236,7 +236,7 @@ static int dlm_scand(void *data) while (!kthread_should_stop()) { list_for_each_entry(ls, &lslist, ls_list) dlm_scan_rsbs(ls); - schedule_timeout_interruptible(dlm_config.scan_secs * HZ); + schedule_timeout_interruptible(dlm_config.ci_scan_secs * HZ); } return 0; } @@ -422,7 +422,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace, ls->ls_count = 0; ls->ls_flags = 0; - size = dlm_config.rsbtbl_size; + size = dlm_config.ci_rsbtbl_size; ls->ls_rsbtbl_size = size; ls->ls_rsbtbl = kmalloc(sizeof(struct dlm_rsbtable) * size, GFP_KERNEL); @@ -434,7 +434,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace, rwlock_init(&ls->ls_rsbtbl[i].lock); } - size = dlm_config.lkbtbl_size; + size = dlm_config.ci_lkbtbl_size; ls->ls_lkbtbl_size = size; ls->ls_lkbtbl = kmalloc(sizeof(struct dlm_lkbtable) * size, GFP_KERNEL); @@ 
-446,7 +446,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace, ls->ls_lkbtbl[i].counter = 1; } - size = dlm_config.dirtbl_size; + size = dlm_config.ci_dirtbl_size; ls->ls_dirtbl_size = size; ls->ls_dirtbl = kmalloc(sizeof(struct dlm_dirtable) * size, GFP_KERNEL); @@ -489,7 +489,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace, mutex_init(&ls->ls_requestqueue_mutex); mutex_init(&ls->ls_clear_proc_locks); - ls->ls_recover_buf = kmalloc(dlm_config.buffer_size, GFP_KERNEL); + ls->ls_recover_buf = kmalloc(dlm_config.ci_buffer_size, GFP_KERNEL); if (!ls->ls_recover_buf) goto out_dirfree; diff --git a/fs/dlm/lowcomms-sctp.c b/fs/dlm/lowcomms-sctp.c index fe158d7a928..dc83a9d979b 100644 --- a/fs/dlm/lowcomms-sctp.c +++ b/fs/dlm/lowcomms-sctp.c @@ -72,6 +72,8 @@ struct nodeinfo { struct list_head writequeue; /* outgoing writequeue_entries */ spinlock_t writequeue_lock; int nodeid; + struct work_struct swork; /* Send workqueue */ + struct work_struct lwork; /* Locking workqueue */ }; static DEFINE_IDR(nodeinfo_idr); @@ -96,6 +98,7 @@ struct connection { atomic_t waiting_requests; struct cbuf cb; int eagain_flag; + struct work_struct work; /* Send workqueue */ }; /* An entry waiting to be sent */ @@ -137,19 +140,23 @@ static void cbuf_eat(struct cbuf *cb, int n) static LIST_HEAD(write_nodes); static DEFINE_SPINLOCK(write_nodes_lock); + /* Maximum number of incoming messages to process before * doing a schedule() */ #define MAX_RX_MSG_COUNT 25 -/* Manage daemons */ -static struct task_struct *recv_task; -static struct task_struct *send_task; -static DECLARE_WAIT_QUEUE_HEAD(lowcomms_recv_wait); +/* Work queues */ +static struct workqueue_struct *recv_workqueue; +static struct workqueue_struct *send_workqueue; +static struct workqueue_struct *lock_workqueue; /* The SCTP connection */ static struct connection sctp_con; +static void process_send_sockets(struct work_struct *work); +static void process_recv_sockets(struct work_struct 
*work); +static void process_lock_request(struct work_struct *work); static int nodeid_to_addr(int nodeid, struct sockaddr *retaddr) { @@ -222,6 +229,8 @@ static struct nodeinfo *nodeid2nodeinfo(int nodeid, gfp_t alloc) spin_lock_init(&ni->lock); INIT_LIST_HEAD(&ni->writequeue); spin_lock_init(&ni->writequeue_lock); + INIT_WORK(&ni->lwork, process_lock_request); + INIT_WORK(&ni->swork, process_send_sockets); ni->nodeid = nodeid; if (nodeid > max_nodeid) @@ -249,11 +258,8 @@ static struct nodeinfo *assoc2nodeinfo(sctp_assoc_t assoc) /* Data or notification available on socket */ static void lowcomms_data_ready(struct sock *sk, int count_unused) { - atomic_inc(&sctp_con.waiting_requests); if (test_and_set_bit(CF_READ_PENDING, &sctp_con.flags)) - return; - - wake_up_interruptible(&lowcomms_recv_wait); + queue_work(recv_workqueue, &sctp_con.work); } @@ -361,10 +367,10 @@ static void init_failed(void) spin_lock_bh(&write_nodes_lock); list_add_tail(&ni->write_list, &write_nodes); spin_unlock_bh(&write_nodes_lock); + queue_work(send_workqueue, &ni->swork); } } } - wake_up_process(send_task); } /* Something happened to an association */ @@ -446,8 +452,8 @@ static void process_sctp_notification(struct msghdr *msg, char *buf) spin_lock_bh(&write_nodes_lock); list_add_tail(&ni->write_list, &write_nodes); spin_unlock_bh(&write_nodes_lock); + queue_work(send_workqueue, &ni->swork); } - wake_up_process(send_task); } break; @@ -580,8 +586,8 @@ static int receive_from_sock(void) spin_lock_bh(&write_nodes_lock); list_add_tail(&ni->write_list, &write_nodes); spin_unlock_bh(&write_nodes_lock); + queue_work(send_workqueue, &ni->swork); } - wake_up_process(send_task); } } @@ -590,6 +596,7 @@ static int receive_from_sock(void) return 0; cbuf_add(&sctp_con.cb, ret); + // PJC: TODO: Add to node's workqueue....can we ?? 
ret = dlm_process_incoming_buffer(cpu_to_le32(sinfo->sinfo_ppid), page_address(sctp_con.rx_page), sctp_con.cb.base, sctp_con.cb.len, @@ -635,7 +642,7 @@ static int add_bind_addr(struct sockaddr_storage *addr, int addr_len, int num) if (result < 0) log_print("Can't bind to port %d addr number %d", - dlm_config.tcp_port, num); + dlm_config.ci_tcp_port, num); return result; } @@ -711,7 +718,7 @@ static int init_sock(void) /* Bind to all interfaces. */ for (i = 0; i < dlm_local_count; i++) { memcpy(&localaddr, dlm_local_addr[i], sizeof(localaddr)); - make_sockaddr(&localaddr, dlm_config.tcp_port, &addr_len); + make_sockaddr(&localaddr, dlm_config.ci_tcp_port, &addr_len); result = add_bind_addr(&localaddr, addr_len, num); if (result) @@ -820,7 +827,8 @@ void dlm_lowcomms_commit_buffer(void *arg) spin_lock_bh(&write_nodes_lock); list_add_tail(&ni->write_list, &write_nodes); spin_unlock_bh(&write_nodes_lock); - wake_up_process(send_task); + + queue_work(send_workqueue, &ni->swork); } return; @@ -863,7 +871,7 @@ static void initiate_association(int nodeid) return; } - make_sockaddr(&rem_addr, dlm_config.tcp_port, &addrlen); + make_sockaddr(&rem_addr, dlm_config.ci_tcp_port, &addrlen); outmessage.msg_name = &rem_addr; outmessage.msg_namelen = addrlen; @@ -1088,101 +1096,75 @@ int dlm_lowcomms_close(int nodeid) return 0; } -static int write_list_empty(void) +// PJC: The work queue function for receiving. 
+static void process_recv_sockets(struct work_struct *work) { - int status; - - spin_lock_bh(&write_nodes_lock); - status = list_empty(&write_nodes); - spin_unlock_bh(&write_nodes_lock); - - return status; -} - -static int dlm_recvd(void *data) -{ - DECLARE_WAITQUEUE(wait, current); - - while (!kthread_should_stop()) { + if (test_and_clear_bit(CF_READ_PENDING, &sctp_con.flags)) { + int ret; int count = 0; - set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&lowcomms_recv_wait, &wait); - if (!test_bit(CF_READ_PENDING, &sctp_con.flags)) - cond_resched(); - remove_wait_queue(&lowcomms_recv_wait, &wait); - set_current_state(TASK_RUNNING); - - if (test_and_clear_bit(CF_READ_PENDING, &sctp_con.flags)) { - int ret; - - do { - ret = receive_from_sock(); + do { + ret = receive_from_sock(); - /* Don't starve out everyone else */ - if (++count >= MAX_RX_MSG_COUNT) { - cond_resched(); - count = 0; - } - } while (!kthread_should_stop() && ret >=0); - } - cond_resched(); + /* Don't starve out everyone else */ + if (++count >= MAX_RX_MSG_COUNT) { + cond_resched(); + count = 0; + } + } while (!kthread_should_stop() && ret >=0); } - - return 0; + cond_resched(); } -static int dlm_sendd(void *data) +// PJC: the work queue function for sending +static void process_send_sockets(struct work_struct *work) { - DECLARE_WAITQUEUE(wait, current); - - add_wait_queue(sctp_con.sock->sk->sk_sleep, &wait); - - while (!kthread_should_stop()) { - set_current_state(TASK_INTERRUPTIBLE); - if (write_list_empty()) - cond_resched(); - set_current_state(TASK_RUNNING); - - if (sctp_con.eagain_flag) { - sctp_con.eagain_flag = 0; - refill_write_queue(); - } - process_output_queue(); + if (sctp_con.eagain_flag) { + sctp_con.eagain_flag = 0; + refill_write_queue(); } + process_output_queue(); +} - remove_wait_queue(sctp_con.sock->sk->sk_sleep, &wait); - - return 0; +// PJC: Process lock requests from a particular node. +// TODO: can we optimise this out on UP ?? 
+static void process_lock_request(struct work_struct *work) +{ } static void daemons_stop(void) { - kthread_stop(recv_task); - kthread_stop(send_task); + destroy_workqueue(recv_workqueue); + destroy_workqueue(send_workqueue); + destroy_workqueue(lock_workqueue); } static int daemons_start(void) { - struct task_struct *p; int error; + recv_workqueue = create_workqueue("dlm_recv"); + error = IS_ERR(recv_workqueue); + if (error) { + log_print("can't start dlm_recv %d", error); + return error; + } - p = kthread_run(dlm_recvd, NULL, "dlm_recvd"); - error = IS_ERR(p); + send_workqueue = create_singlethread_workqueue("dlm_send"); + error = IS_ERR(send_workqueue); if (error) { - log_print("can't start dlm_recvd %d", error); + log_print("can't start dlm_send %d", error); + destroy_workqueue(recv_workqueue); return error; } - recv_task = p; - p = kthread_run(dlm_sendd, NULL, "dlm_sendd"); - error = IS_ERR(p); + lock_workqueue = create_workqueue("dlm_rlock"); + error = IS_ERR(lock_workqueue); if (error) { - log_print("can't start dlm_sendd %d", error); - kthread_stop(recv_task); + log_print("can't start dlm_rlock %d", error); + destroy_workqueue(send_workqueue); + destroy_workqueue(recv_workqueue); return error; } - send_task = p; return 0; } @@ -1194,6 +1176,8 @@ int dlm_lowcomms_start(void) { int error; + INIT_WORK(&sctp_con.work, process_recv_sockets); + error = init_sock(); if (error) goto fail_sock; @@ -1224,4 +1208,3 @@ void dlm_lowcomms_stop(void) for (i = 0; i < dlm_local_count; i++) kfree(dlm_local_addr[i]); } - diff --git a/fs/dlm/lowcomms-tcp.c b/fs/dlm/lowcomms-tcp.c index 8f2791fc844..07e0a122c32 100644 --- a/fs/dlm/lowcomms-tcp.c +++ b/fs/dlm/lowcomms-tcp.c @@ -2,7 +2,7 @@ ******************************************************************************* ** ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. -** Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. +** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. 
** ** This copyrighted material is made available to anyone wishing to use, ** modify, copy, or redistribute it subject to the terms and conditions @@ -96,10 +96,7 @@ static bool cbuf_empty(struct cbuf *cb) struct connection { struct socket *sock; /* NULL if not connected */ uint32_t nodeid; /* So we know who we are in the list */ - struct rw_semaphore sock_sem; /* Stop connect races */ - struct list_head read_list; /* On this list when ready for reading */ - struct list_head write_list; /* On this list when ready for writing */ - struct list_head state_list; /* On this list when ready to connect */ + struct mutex sock_mutex; unsigned long flags; /* bit 1,2 = We are on the read/write lists */ #define CF_READ_PENDING 1 #define CF_WRITE_PENDING 2 @@ -112,9 +109,10 @@ struct connection { struct page *rx_page; struct cbuf cb; int retries; - atomic_t waiting_requests; #define MAX_CONNECT_RETRIES 3 struct connection *othercon; + struct work_struct rwork; /* Receive workqueue */ + struct work_struct swork; /* Send workqueue */ }; #define sock2con(x) ((struct connection *)(x)->sk_user_data) @@ -131,32 +129,18 @@ struct writequeue_entry { static struct sockaddr_storage dlm_local_addr; -/* Manage daemons */ -static struct task_struct *recv_task; -static struct task_struct *send_task; - -static wait_queue_t lowcomms_send_waitq_head; -static DECLARE_WAIT_QUEUE_HEAD(lowcomms_send_waitq); -static wait_queue_t lowcomms_recv_waitq_head; -static DECLARE_WAIT_QUEUE_HEAD(lowcomms_recv_waitq); +/* Work queues */ +static struct workqueue_struct *recv_workqueue; +static struct workqueue_struct *send_workqueue; /* An array of pointers to connections, indexed by NODEID */ static struct connection **connections; static DECLARE_MUTEX(connections_lock); -static kmem_cache_t *con_cache; +static struct kmem_cache *con_cache; static int conn_array_size; -/* List of sockets that have reads pending */ -static LIST_HEAD(read_sockets); -static DEFINE_SPINLOCK(read_sockets_lock); - -/* List of 
sockets which have writes pending */ -static LIST_HEAD(write_sockets); -static DEFINE_SPINLOCK(write_sockets_lock); - -/* List of sockets which have connects pending */ -static LIST_HEAD(state_sockets); -static DEFINE_SPINLOCK(state_sockets_lock); +static void process_recv_sockets(struct work_struct *work); +static void process_send_sockets(struct work_struct *work); static struct connection *nodeid2con(int nodeid, gfp_t allocation) { @@ -186,9 +170,11 @@ static struct connection *nodeid2con(int nodeid, gfp_t allocation) goto finish; con->nodeid = nodeid; - init_rwsem(&con->sock_sem); + mutex_init(&con->sock_mutex); INIT_LIST_HEAD(&con->writequeue); spin_lock_init(&con->writequeue_lock); + INIT_WORK(&con->swork, process_send_sockets); + INIT_WORK(&con->rwork, process_recv_sockets); connections[nodeid] = con; } @@ -203,41 +189,22 @@ static void lowcomms_data_ready(struct sock *sk, int count_unused) { struct connection *con = sock2con(sk); - atomic_inc(&con->waiting_requests); - if (test_and_set_bit(CF_READ_PENDING, &con->flags)) - return; - - spin_lock_bh(&read_sockets_lock); - list_add_tail(&con->read_list, &read_sockets); - spin_unlock_bh(&read_sockets_lock); - - wake_up_interruptible(&lowcomms_recv_waitq); + if (!test_and_set_bit(CF_READ_PENDING, &con->flags)) + queue_work(recv_workqueue, &con->rwork); } static void lowcomms_write_space(struct sock *sk) { struct connection *con = sock2con(sk); - if (test_and_set_bit(CF_WRITE_PENDING, &con->flags)) - return; - - spin_lock_bh(&write_sockets_lock); - list_add_tail(&con->write_list, &write_sockets); - spin_unlock_bh(&write_sockets_lock); - - wake_up_interruptible(&lowcomms_send_waitq); + if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags)) + queue_work(send_workqueue, &con->swork); } static inline void lowcomms_connect_sock(struct connection *con) { - if (test_and_set_bit(CF_CONNECT_PENDING, &con->flags)) - return; - - spin_lock_bh(&state_sockets_lock); - list_add_tail(&con->state_list, &state_sockets); - 
spin_unlock_bh(&state_sockets_lock); - - wake_up_interruptible(&lowcomms_send_waitq); + if (!test_and_set_bit(CF_CONNECT_PENDING, &con->flags)) + queue_work(send_workqueue, &con->swork); } static void lowcomms_state_change(struct sock *sk) @@ -279,7 +246,7 @@ static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port, /* Close a remote connection and tidy up */ static void close_connection(struct connection *con, bool and_other) { - down_write(&con->sock_sem); + mutex_lock(&con->sock_mutex); if (con->sock) { sock_release(con->sock); @@ -294,24 +261,27 @@ static void close_connection(struct connection *con, bool and_other) con->rx_page = NULL; } con->retries = 0; - up_write(&con->sock_sem); + mutex_unlock(&con->sock_mutex); } /* Data received from remote end */ static int receive_from_sock(struct connection *con) { int ret = 0; - struct msghdr msg; - struct iovec iov[2]; - mm_segment_t fs; + struct msghdr msg = {}; + struct kvec iov[2]; unsigned len; int r; int call_again_soon = 0; + int nvec; - down_read(&con->sock_sem); + mutex_lock(&con->sock_mutex); + + if (con->sock == NULL) { + ret = -EAGAIN; + goto out_close; + } - if (con->sock == NULL) - goto out; if (con->rx_page == NULL) { /* * This doesn't need to be atomic, but I think it should @@ -323,21 +293,13 @@ static int receive_from_sock(struct connection *con) cbuf_init(&con->cb, PAGE_CACHE_SIZE); } - msg.msg_control = NULL; - msg.msg_controllen = 0; - msg.msg_iovlen = 1; - msg.msg_iov = iov; - msg.msg_name = NULL; - msg.msg_namelen = 0; - msg.msg_flags = 0; - /* * iov[0] is the bit of the circular buffer between the current end * point (cb.base + cb.len) and the end of the buffer. 
*/ iov[0].iov_len = con->cb.base - cbuf_data(&con->cb); iov[0].iov_base = page_address(con->rx_page) + cbuf_data(&con->cb); - iov[1].iov_len = 0; + nvec = 1; /* * iov[1] is the bit of the circular buffer between the start of the @@ -347,18 +309,18 @@ static int receive_from_sock(struct connection *con) iov[0].iov_len = PAGE_CACHE_SIZE - cbuf_data(&con->cb); iov[1].iov_len = con->cb.base; iov[1].iov_base = page_address(con->rx_page); - msg.msg_iovlen = 2; + nvec = 2; } len = iov[0].iov_len + iov[1].iov_len; - fs = get_fs(); - set_fs(get_ds()); - r = ret = sock_recvmsg(con->sock, &msg, len, + r = ret = kernel_recvmsg(con->sock, &msg, iov, nvec, len, MSG_DONTWAIT | MSG_NOSIGNAL); - set_fs(fs); if (ret <= 0) goto out_close; + if (ret == -EAGAIN) + goto out_resched; + if (ret == len) call_again_soon = 1; cbuf_add(&con->cb, ret); @@ -381,24 +343,26 @@ static int receive_from_sock(struct connection *con) con->rx_page = NULL; } -out: if (call_again_soon) goto out_resched; - up_read(&con->sock_sem); + mutex_unlock(&con->sock_mutex); return 0; out_resched: - lowcomms_data_ready(con->sock->sk, 0); - up_read(&con->sock_sem); - cond_resched(); - return 0; + if (!test_and_set_bit(CF_READ_PENDING, &con->flags)) + queue_work(recv_workqueue, &con->rwork); + mutex_unlock(&con->sock_mutex); + return -EAGAIN; out_close: - up_read(&con->sock_sem); + mutex_unlock(&con->sock_mutex); if (ret != -EAGAIN && !test_bit(CF_IS_OTHERCON, &con->flags)) { close_connection(con, false); /* Reconnect when there is something to send */ } + /* Don't return success if we really got EOF */ + if (ret == 0) + ret = -EAGAIN; return ret; } @@ -412,6 +376,7 @@ static int accept_from_sock(struct connection *con) int len; int nodeid; struct connection *newcon; + struct connection *addcon; memset(&peeraddr, 0, sizeof(peeraddr)); result = sock_create_kern(dlm_local_addr.ss_family, SOCK_STREAM, @@ -419,7 +384,7 @@ static int accept_from_sock(struct connection *con) if (result < 0) return -ENOMEM; - 
down_read(&con->sock_sem); + mutex_lock_nested(&con->sock_mutex, 0); result = -ENOTCONN; if (con->sock == NULL) @@ -445,7 +410,7 @@ static int accept_from_sock(struct connection *con) if (dlm_addr_to_nodeid(&peeraddr, &nodeid)) { printk("dlm: connect from non cluster node\n"); sock_release(newsock); - up_read(&con->sock_sem); + mutex_unlock(&con->sock_mutex); return -1; } @@ -462,7 +427,7 @@ static int accept_from_sock(struct connection *con) result = -ENOMEM; goto accept_err; } - down_write(&newcon->sock_sem); + mutex_lock_nested(&newcon->sock_mutex, 1); if (newcon->sock) { struct connection *othercon = newcon->othercon; @@ -470,41 +435,45 @@ static int accept_from_sock(struct connection *con) othercon = kmem_cache_zalloc(con_cache, GFP_KERNEL); if (!othercon) { printk("dlm: failed to allocate incoming socket\n"); - up_write(&newcon->sock_sem); + mutex_unlock(&newcon->sock_mutex); result = -ENOMEM; goto accept_err; } othercon->nodeid = nodeid; othercon->rx_action = receive_from_sock; - init_rwsem(&othercon->sock_sem); + mutex_init(&othercon->sock_mutex); + INIT_WORK(&othercon->swork, process_send_sockets); + INIT_WORK(&othercon->rwork, process_recv_sockets); set_bit(CF_IS_OTHERCON, &othercon->flags); newcon->othercon = othercon; } othercon->sock = newsock; newsock->sk->sk_user_data = othercon; add_sock(newsock, othercon); + addcon = othercon; } else { newsock->sk->sk_user_data = newcon; newcon->rx_action = receive_from_sock; add_sock(newsock, newcon); - + addcon = newcon; } - up_write(&newcon->sock_sem); + mutex_unlock(&newcon->sock_mutex); /* * Add it to the active queue in case we got data * beween processing the accept adding the socket * to the read_sockets list */ - lowcomms_data_ready(newsock->sk, 0); - up_read(&con->sock_sem); + if (!test_and_set_bit(CF_READ_PENDING, &addcon->flags)) + queue_work(recv_workqueue, &addcon->rwork); + mutex_unlock(&con->sock_mutex); return 0; accept_err: - up_read(&con->sock_sem); + mutex_unlock(&con->sock_mutex); 
sock_release(newsock); if (result != -EAGAIN) @@ -525,7 +494,7 @@ static void connect_to_sock(struct connection *con) return; } - down_write(&con->sock_sem); + mutex_lock(&con->sock_mutex); if (con->retries++ > MAX_CONNECT_RETRIES) goto out; @@ -548,7 +517,7 @@ static void connect_to_sock(struct connection *con) sock->sk->sk_user_data = con; con->rx_action = receive_from_sock; - make_sockaddr(&saddr, dlm_config.tcp_port, &addr_len); + make_sockaddr(&saddr, dlm_config.ci_tcp_port, &addr_len); add_sock(sock, con); @@ -577,7 +546,7 @@ out_err: result = 0; } out: - up_write(&con->sock_sem); + mutex_unlock(&con->sock_mutex); return; } @@ -616,10 +585,10 @@ static struct socket *create_listen_sock(struct connection *con, con->sock = sock; /* Bind to our port */ - make_sockaddr(saddr, dlm_config.tcp_port, &addr_len); + make_sockaddr(saddr, dlm_config.ci_tcp_port, &addr_len); result = sock->ops->bind(sock, (struct sockaddr *) saddr, addr_len); if (result < 0) { - printk("dlm: Can't bind to port %d\n", dlm_config.tcp_port); + printk("dlm: Can't bind to port %d\n", dlm_config.ci_tcp_port); sock_release(sock); sock = NULL; con->sock = NULL; @@ -638,7 +607,7 @@ static struct socket *create_listen_sock(struct connection *con, result = sock->ops->listen(sock, 5); if (result < 0) { - printk("dlm: Can't listen on port %d\n", dlm_config.tcp_port); + printk("dlm: Can't listen on port %d\n", dlm_config.ci_tcp_port); sock_release(sock); sock = NULL; goto create_out; @@ -709,6 +678,7 @@ void *dlm_lowcomms_get_buffer(int nodeid, int len, if (!con) return NULL; + spin_lock(&con->writequeue_lock); e = list_entry(con->writequeue.prev, struct writequeue_entry, list); if ((&e->list == &con->writequeue) || (PAGE_CACHE_SIZE - e->end < len)) { @@ -747,6 +717,7 @@ void dlm_lowcomms_commit_buffer(void *mh) struct connection *con = e->con; int users; + spin_lock(&con->writequeue_lock); users = --e->users; if (users) goto out; @@ -754,12 +725,8 @@ void dlm_lowcomms_commit_buffer(void *mh) 
kunmap(e->page); spin_unlock(&con->writequeue_lock); - if (test_and_set_bit(CF_WRITE_PENDING, &con->flags) == 0) { - spin_lock_bh(&write_sockets_lock); - list_add_tail(&con->write_list, &write_sockets); - spin_unlock_bh(&write_sockets_lock); - - wake_up_interruptible(&lowcomms_send_waitq); + if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags)) { + queue_work(send_workqueue, &con->swork); } return; @@ -783,7 +750,7 @@ static void send_to_sock(struct connection *con) struct writequeue_entry *e; int len, offset; - down_read(&con->sock_sem); + mutex_lock(&con->sock_mutex); if (con->sock == NULL) goto out_connect; @@ -800,6 +767,7 @@ static void send_to_sock(struct connection *con) offset = e->offset; BUG_ON(len == 0 && e->users == 0); spin_unlock(&con->writequeue_lock); + kmap(e->page); ret = 0; if (len) { @@ -828,18 +796,18 @@ static void send_to_sock(struct connection *con) } spin_unlock(&con->writequeue_lock); out: - up_read(&con->sock_sem); + mutex_unlock(&con->sock_mutex); return; send_error: - up_read(&con->sock_sem); + mutex_unlock(&con->sock_mutex); close_connection(con, false); lowcomms_connect_sock(con); return; out_connect: - up_read(&con->sock_sem); - lowcomms_connect_sock(con); + mutex_unlock(&con->sock_mutex); + connect_to_sock(con); return; } @@ -872,7 +840,6 @@ int dlm_lowcomms_close(int nodeid) if (con) { clean_one_writequeue(con); close_connection(con, true); - atomic_set(&con->waiting_requests, 0); } return 0; @@ -880,102 +847,29 @@ out: return -1; } -/* API send message call, may queue the request */ -/* N.B. 
This is the old interface - use the new one for new calls */ -int lowcomms_send_message(int nodeid, char *buf, int len, gfp_t allocation) -{ - struct writequeue_entry *e; - char *b; - - e = dlm_lowcomms_get_buffer(nodeid, len, allocation, &b); - if (e) { - memcpy(b, buf, len); - dlm_lowcomms_commit_buffer(e); - return 0; - } - return -ENOBUFS; -} - /* Look for activity on active sockets */ -static void process_sockets(void) +static void process_recv_sockets(struct work_struct *work) { - struct list_head *list; - struct list_head *temp; - int count = 0; - - spin_lock_bh(&read_sockets_lock); - list_for_each_safe(list, temp, &read_sockets) { - - struct connection *con = - list_entry(list, struct connection, read_list); - list_del(&con->read_list); - clear_bit(CF_READ_PENDING, &con->flags); - - spin_unlock_bh(&read_sockets_lock); - - /* This can reach zero if we are processing requests - * as they come in. - */ - if (atomic_read(&con->waiting_requests) == 0) { - spin_lock_bh(&read_sockets_lock); - continue; - } - - do { - con->rx_action(con); - - /* Don't starve out everyone else */ - if (++count >= MAX_RX_MSG_COUNT) { - cond_resched(); - count = 0; - } + struct connection *con = container_of(work, struct connection, rwork); + int err; - } while (!atomic_dec_and_test(&con->waiting_requests) && - !kthread_should_stop()); - - spin_lock_bh(&read_sockets_lock); - } - spin_unlock_bh(&read_sockets_lock); + clear_bit(CF_READ_PENDING, &con->flags); + do { + err = con->rx_action(con); + } while (!err); } -/* Try to send any messages that are pending - */ -static void process_output_queue(void) -{ - struct list_head *list; - struct list_head *temp; - - spin_lock_bh(&write_sockets_lock); - list_for_each_safe(list, temp, &write_sockets) { - struct connection *con = - list_entry(list, struct connection, write_list); - clear_bit(CF_WRITE_PENDING, &con->flags); - list_del(&con->write_list); - spin_unlock_bh(&write_sockets_lock); - send_to_sock(con); - 
spin_lock_bh(&write_sockets_lock); - } - spin_unlock_bh(&write_sockets_lock); -} - -static void process_state_queue(void) +static void process_send_sockets(struct work_struct *work) { - struct list_head *list; - struct list_head *temp; - - spin_lock_bh(&state_sockets_lock); - list_for_each_safe(list, temp, &state_sockets) { - struct connection *con = - list_entry(list, struct connection, state_list); - list_del(&con->state_list); - clear_bit(CF_CONNECT_PENDING, &con->flags); - spin_unlock_bh(&state_sockets_lock); + struct connection *con = container_of(work, struct connection, swork); + if (test_and_clear_bit(CF_CONNECT_PENDING, &con->flags)) { connect_to_sock(con); - spin_lock_bh(&state_sockets_lock); } - spin_unlock_bh(&state_sockets_lock); + + clear_bit(CF_WRITE_PENDING, &con->flags); + send_to_sock(con); } @@ -992,109 +886,33 @@ static void clean_writequeues(void) } } -static int read_list_empty(void) -{ - int status; - - spin_lock_bh(&read_sockets_lock); - status = list_empty(&read_sockets); - spin_unlock_bh(&read_sockets_lock); - - return status; -} - -/* DLM Transport comms receive daemon */ -static int dlm_recvd(void *data) +static void work_stop(void) { - init_waitqueue_entry(&lowcomms_recv_waitq_head, current); - add_wait_queue(&lowcomms_recv_waitq, &lowcomms_recv_waitq_head); - - while (!kthread_should_stop()) { - set_current_state(TASK_INTERRUPTIBLE); - if (read_list_empty()) - cond_resched(); - set_current_state(TASK_RUNNING); - - process_sockets(); - } - - return 0; + destroy_workqueue(recv_workqueue); + destroy_workqueue(send_workqueue); } -static int write_and_state_lists_empty(void) +static int work_start(void) { - int status; - - spin_lock_bh(&write_sockets_lock); - status = list_empty(&write_sockets); - spin_unlock_bh(&write_sockets_lock); - - spin_lock_bh(&state_sockets_lock); - if (list_empty(&state_sockets) == 0) - status = 0; - spin_unlock_bh(&state_sockets_lock); - - return status; -} - -/* DLM Transport send daemon */ -static int 
dlm_sendd(void *data) -{ - init_waitqueue_entry(&lowcomms_send_waitq_head, current); - add_wait_queue(&lowcomms_send_waitq, &lowcomms_send_waitq_head); - - while (!kthread_should_stop()) { - set_current_state(TASK_INTERRUPTIBLE); - if (write_and_state_lists_empty()) - cond_resched(); - set_current_state(TASK_RUNNING); - - process_state_queue(); - process_output_queue(); - } - - return 0; -} - -static void daemons_stop(void) -{ - kthread_stop(recv_task); - kthread_stop(send_task); -} - -static int daemons_start(void) -{ - struct task_struct *p; int error; - - p = kthread_run(dlm_recvd, NULL, "dlm_recvd"); - error = IS_ERR(p); + recv_workqueue = create_workqueue("dlm_recv"); + error = IS_ERR(recv_workqueue); if (error) { - log_print("can't start dlm_recvd %d", error); + log_print("can't start dlm_recv %d", error); return error; } - recv_task = p; - p = kthread_run(dlm_sendd, NULL, "dlm_sendd"); - error = IS_ERR(p); + send_workqueue = create_singlethread_workqueue("dlm_send"); + error = IS_ERR(send_workqueue); if (error) { - log_print("can't start dlm_sendd %d", error); - kthread_stop(recv_task); + log_print("can't start dlm_send %d", error); + destroy_workqueue(recv_workqueue); return error; } - send_task = p; return 0; } -/* - * Return the largest buffer size we can cope with. 
- */ -int lowcomms_max_buffer_size(void) -{ - return PAGE_CACHE_SIZE; -} - void dlm_lowcomms_stop(void) { int i; @@ -1107,7 +925,7 @@ void dlm_lowcomms_stop(void) connections[i]->flags |= 0xFF; } - daemons_stop(); + work_stop(); clean_writequeues(); for (i = 0; i < conn_array_size; i++) { @@ -1159,7 +977,7 @@ int dlm_lowcomms_start(void) if (error) goto fail_unlisten; - error = daemons_start(); + error = work_start(); if (error) goto fail_unlisten; diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c index c9b1c3d535f..a5126e0c68a 100644 --- a/fs/dlm/midcomms.c +++ b/fs/dlm/midcomms.c @@ -82,7 +82,7 @@ int dlm_process_incoming_buffer(int nodeid, const void *base, if (msglen < sizeof(struct dlm_header)) break; err = -E2BIG; - if (msglen > dlm_config.buffer_size) { + if (msglen > dlm_config.ci_buffer_size) { log_print("message size %d from %d too big, buf len %d", msglen, nodeid, len); break; @@ -103,7 +103,7 @@ int dlm_process_incoming_buffer(int nodeid, const void *base, if (msglen > sizeof(__tmp) && msg == (struct dlm_header *) __tmp) { - msg = kmalloc(dlm_config.buffer_size, GFP_KERNEL); + msg = kmalloc(dlm_config.ci_buffer_size, GFP_KERNEL); if (msg == NULL) return ret; } diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c index 4cc31be9cd9..6bfbd615380 100644 --- a/fs/dlm/rcom.c +++ b/fs/dlm/rcom.c @@ -56,6 +56,10 @@ static int create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len, rc->rc_type = type; + spin_lock(&ls->ls_recover_lock); + rc->rc_seq = ls->ls_recover_seq; + spin_unlock(&ls->ls_recover_lock); + *mh_ret = mh; *rc_ret = rc; return 0; @@ -78,8 +82,17 @@ static void make_config(struct dlm_ls *ls, struct rcom_config *rf) rf->rf_lsflags = ls->ls_exflags; } -static int check_config(struct dlm_ls *ls, struct rcom_config *rf, int nodeid) +static int check_config(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid) { + struct rcom_config *rf = (struct rcom_config *) rc->rc_buf; + + if ((rc->rc_header.h_version & 0xFFFF0000) != DLM_HEADER_MAJOR) { + 
log_error(ls, "version mismatch: %x nodeid %d: %x", + DLM_HEADER_MAJOR | DLM_HEADER_MINOR, nodeid, + rc->rc_header.h_version); + return -EINVAL; + } + if (rf->rf_lvblen != ls->ls_lvblen || rf->rf_lsflags != ls->ls_exflags) { log_error(ls, "config mismatch: %d,%x nodeid %d: %d,%x", @@ -125,7 +138,7 @@ int dlm_rcom_status(struct dlm_ls *ls, int nodeid) goto out; allow_sync_reply(ls, &rc->rc_id); - memset(ls->ls_recover_buf, 0, dlm_config.buffer_size); + memset(ls->ls_recover_buf, 0, dlm_config.ci_buffer_size); send_rcom(ls, mh, rc); @@ -141,8 +154,7 @@ int dlm_rcom_status(struct dlm_ls *ls, int nodeid) log_debug(ls, "remote node %d not ready", nodeid); rc->rc_result = 0; } else - error = check_config(ls, (struct rcom_config *) rc->rc_buf, - nodeid); + error = check_config(ls, rc, nodeid); /* the caller looks at rc_result for the remote recovery status */ out: return error; @@ -159,6 +171,7 @@ static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in) if (error) return; rc->rc_id = rc_in->rc_id; + rc->rc_seq_reply = rc_in->rc_seq; rc->rc_result = dlm_recover_status(ls); make_config(ls, (struct rcom_config *) rc->rc_buf); @@ -200,7 +213,7 @@ int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len) if (nodeid == dlm_our_nodeid()) { dlm_copy_master_names(ls, last_name, last_len, ls->ls_recover_buf + len, - dlm_config.buffer_size - len, nodeid); + dlm_config.ci_buffer_size - len, nodeid); goto out; } @@ -210,7 +223,7 @@ int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len) memcpy(rc->rc_buf, last_name, last_len); allow_sync_reply(ls, &rc->rc_id); - memset(ls->ls_recover_buf, 0, dlm_config.buffer_size); + memset(ls->ls_recover_buf, 0, dlm_config.ci_buffer_size); send_rcom(ls, mh, rc); @@ -224,30 +237,17 @@ static void receive_rcom_names(struct dlm_ls *ls, struct dlm_rcom *rc_in) { struct dlm_rcom *rc; struct dlm_mhandle *mh; - int error, inlen, outlen; - int nodeid = rc_in->rc_header.h_nodeid; - uint32_t 
status = dlm_recover_status(ls); - - /* - * We can't run dlm_dir_rebuild_send (which uses ls_nodes) while - * dlm_recoverd is running ls_nodes_reconfig (which changes ls_nodes). - * It could only happen in rare cases where we get a late NAMES - * message from a previous instance of recovery. - */ - - if (!(status & DLM_RS_NODES)) { - log_debug(ls, "ignoring RCOM_NAMES from %u", nodeid); - return; - } + int error, inlen, outlen, nodeid; nodeid = rc_in->rc_header.h_nodeid; inlen = rc_in->rc_header.h_length - sizeof(struct dlm_rcom); - outlen = dlm_config.buffer_size - sizeof(struct dlm_rcom); + outlen = dlm_config.ci_buffer_size - sizeof(struct dlm_rcom); error = create_rcom(ls, nodeid, DLM_RCOM_NAMES_REPLY, outlen, &rc, &mh); if (error) return; rc->rc_id = rc_in->rc_id; + rc->rc_seq_reply = rc_in->rc_seq; dlm_copy_master_names(ls, rc_in->rc_buf, inlen, rc->rc_buf, outlen, nodeid); @@ -294,6 +294,7 @@ static void receive_rcom_lookup(struct dlm_ls *ls, struct dlm_rcom *rc_in) ret_nodeid = error; rc->rc_result = ret_nodeid; rc->rc_id = rc_in->rc_id; + rc->rc_seq_reply = rc_in->rc_seq; send_rcom(ls, mh, rc); } @@ -375,20 +376,13 @@ static void receive_rcom_lock(struct dlm_ls *ls, struct dlm_rcom *rc_in) memcpy(rc->rc_buf, rc_in->rc_buf, sizeof(struct rcom_lock)); rc->rc_id = rc_in->rc_id; + rc->rc_seq_reply = rc_in->rc_seq; send_rcom(ls, mh, rc); } static void receive_rcom_lock_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in) { - uint32_t status = dlm_recover_status(ls); - - if (!(status & DLM_RS_DIR)) { - log_debug(ls, "ignoring RCOM_LOCK_REPLY from %u", - rc_in->rc_header.h_nodeid); - return; - } - dlm_recover_process_copy(ls, rc_in); } @@ -415,6 +409,7 @@ static int send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in) rc->rc_type = DLM_RCOM_STATUS_REPLY; rc->rc_id = rc_in->rc_id; + rc->rc_seq_reply = rc_in->rc_seq; rc->rc_result = -ESRCH; rf = (struct rcom_config *) rc->rc_buf; @@ -426,6 +421,31 @@ static int send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in) 
return 0; } +static int is_old_reply(struct dlm_ls *ls, struct dlm_rcom *rc) +{ + uint64_t seq; + int rv = 0; + + switch (rc->rc_type) { + case DLM_RCOM_STATUS_REPLY: + case DLM_RCOM_NAMES_REPLY: + case DLM_RCOM_LOOKUP_REPLY: + case DLM_RCOM_LOCK_REPLY: + spin_lock(&ls->ls_recover_lock); + seq = ls->ls_recover_seq; + spin_unlock(&ls->ls_recover_lock); + if (rc->rc_seq_reply != seq) { + log_debug(ls, "ignoring old reply %x from %d " + "seq_reply %llx expect %llx", + rc->rc_type, rc->rc_header.h_nodeid, + (unsigned long long)rc->rc_seq_reply, + (unsigned long long)seq); + rv = 1; + } + } + return rv; +} + /* Called by dlm_recvd; corresponds to dlm_receive_message() but special recovery-only comms are sent through here. */ @@ -449,11 +469,14 @@ void dlm_receive_rcom(struct dlm_header *hd, int nodeid) } if (dlm_recovery_stopped(ls) && (rc->rc_type != DLM_RCOM_STATUS)) { - log_error(ls, "ignoring recovery message %x from %d", + log_debug(ls, "ignoring recovery message %x from %d", rc->rc_type, nodeid); goto out; } + if (is_old_reply(ls, rc)) + goto out; + if (nodeid != rc->rc_header.h_nodeid) { log_error(ls, "bad rcom nodeid %d from %d", rc->rc_header.h_nodeid, nodeid); diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c index cf9f6831bab..c2cc7694cd1 100644 --- a/fs/dlm/recover.c +++ b/fs/dlm/recover.c @@ -44,7 +44,7 @@ static void dlm_wait_timer_fn(unsigned long data) { struct dlm_ls *ls = (struct dlm_ls *) data; - mod_timer(&ls->ls_timer, jiffies + (dlm_config.recover_timer * HZ)); + mod_timer(&ls->ls_timer, jiffies + (dlm_config.ci_recover_timer * HZ)); wake_up(&ls->ls_wait_general); } @@ -55,7 +55,7 @@ int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls *ls)) init_timer(&ls->ls_timer); ls->ls_timer.function = dlm_wait_timer_fn; ls->ls_timer.data = (long) ls; - ls->ls_timer.expires = jiffies + (dlm_config.recover_timer * HZ); + ls->ls_timer.expires = jiffies + (dlm_config.ci_recover_timer * HZ); add_timer(&ls->ls_timer); 
wait_event(ls->ls_wait_general, testfn(ls) || dlm_recovery_stopped(ls)); @@ -397,7 +397,9 @@ int dlm_recover_masters(struct dlm_ls *ls) if (dlm_no_directory(ls)) count += recover_master_static(r); - else if (!is_master(r) && dlm_is_removed(ls, r->res_nodeid)) { + else if (!is_master(r) && + (dlm_is_removed(ls, r->res_nodeid) || + rsb_flag(r, RSB_NEW_MASTER))) { recover_master(r); count++; } diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c index 650536aa513..3cb636d6024 100644 --- a/fs/dlm/recoverd.c +++ b/fs/dlm/recoverd.c @@ -77,7 +77,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) error = dlm_recover_members(ls, rv, &neg); if (error) { - log_error(ls, "recover_members failed %d", error); + log_debug(ls, "recover_members failed %d", error); goto fail; } start = jiffies; @@ -89,7 +89,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) error = dlm_recover_directory(ls); if (error) { - log_error(ls, "recover_directory failed %d", error); + log_debug(ls, "recover_directory failed %d", error); goto fail; } @@ -99,7 +99,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) error = dlm_recover_directory_wait(ls); if (error) { - log_error(ls, "recover_directory_wait failed %d", error); + log_debug(ls, "recover_directory_wait failed %d", error); goto fail; } @@ -129,7 +129,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) error = dlm_recover_masters(ls); if (error) { - log_error(ls, "recover_masters failed %d", error); + log_debug(ls, "recover_masters failed %d", error); goto fail; } @@ -139,13 +139,13 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) error = dlm_recover_locks(ls); if (error) { - log_error(ls, "recover_locks failed %d", error); + log_debug(ls, "recover_locks failed %d", error); goto fail; } error = dlm_recover_locks_wait(ls); if (error) { - log_error(ls, "recover_locks_wait failed %d", error); + log_debug(ls, "recover_locks_wait failed %d", error); goto fail; } 
@@ -166,7 +166,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) error = dlm_recover_locks_wait(ls); if (error) { - log_error(ls, "recover_locks_wait failed %d", error); + log_debug(ls, "recover_locks_wait failed %d", error); goto fail; } } @@ -184,7 +184,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) dlm_set_recover_status(ls, DLM_RS_DONE); error = dlm_recover_done_wait(ls); if (error) { - log_error(ls, "recover_done_wait failed %d", error); + log_debug(ls, "recover_done_wait failed %d", error); goto fail; } @@ -192,19 +192,19 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) error = enable_locking(ls, rv->seq); if (error) { - log_error(ls, "enable_locking failed %d", error); + log_debug(ls, "enable_locking failed %d", error); goto fail; } error = dlm_process_requestqueue(ls); if (error) { - log_error(ls, "process_requestqueue failed %d", error); + log_debug(ls, "process_requestqueue failed %d", error); goto fail; } error = dlm_recover_waiters_post(ls); if (error) { - log_error(ls, "recover_waiters_post failed %d", error); + log_debug(ls, "recover_waiters_post failed %d", error); goto fail; } diff --git a/fs/dlm/user.c b/fs/dlm/user.c index c37e93e4f2d..d378b7fe2a1 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -180,6 +180,14 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type) ua->lksb.sb_status == -EAGAIN && !list_empty(&lkb->lkb_ownqueue)) remove_ownqueue = 1; + /* unlocks or cancels of waiting requests need to be removed from the + proc's unlocking list, again there must be a better way... */ + + if (ua->lksb.sb_status == -DLM_EUNLOCK || + (ua->lksb.sb_status == -DLM_ECANCEL && + lkb->lkb_grmode == DLM_LOCK_IV)) + remove_ownqueue = 1; + /* We want to copy the lvb to userspace when the completion ast is read if the status is 0, the lock has an lvb and lvb_ops says we should. 
We could probably have set_lvb_lock() @@ -523,6 +531,7 @@ static int device_open(struct inode *inode, struct file *file) proc->lockspace = ls->ls_local_handle; INIT_LIST_HEAD(&proc->asts); INIT_LIST_HEAD(&proc->locks); + INIT_LIST_HEAD(&proc->unlocking); spin_lock_init(&proc->asts_spin); spin_lock_init(&proc->locks_spin); init_waitqueue_head(&proc->wait); diff --git a/fs/dlm/util.c b/fs/dlm/util.c index 767197db994..963889cf674 100644 --- a/fs/dlm/util.c +++ b/fs/dlm/util.c @@ -134,6 +134,8 @@ void dlm_rcom_out(struct dlm_rcom *rc) rc->rc_type = cpu_to_le32(rc->rc_type); rc->rc_result = cpu_to_le32(rc->rc_result); rc->rc_id = cpu_to_le64(rc->rc_id); + rc->rc_seq = cpu_to_le64(rc->rc_seq); + rc->rc_seq_reply = cpu_to_le64(rc->rc_seq_reply); if (type == DLM_RCOM_LOCK) rcom_lock_out((struct rcom_lock *) rc->rc_buf); @@ -151,6 +153,8 @@ void dlm_rcom_in(struct dlm_rcom *rc) rc->rc_type = le32_to_cpu(rc->rc_type); rc->rc_result = le32_to_cpu(rc->rc_result); rc->rc_id = le64_to_cpu(rc->rc_id); + rc->rc_seq = le64_to_cpu(rc->rc_seq); + rc->rc_seq_reply = le64_to_cpu(rc->rc_seq_reply); if (rc->rc_type == DLM_RCOM_LOCK) rcom_lock_in((struct rcom_lock *) rc->rc_buf); diff --git a/fs/dnotify.c b/fs/dnotify.c index 1f26a2b9eee..936409fcd93 100644 --- a/fs/dnotify.c +++ b/fs/dnotify.c @@ -42,7 +42,7 @@ void dnotify_flush(struct file *filp, fl_owner_t id) struct dnotify_struct **prev; struct inode *inode; - inode = filp->f_dentry->d_inode; + inode = filp->f_path.dentry->d_inode; if (!S_ISDIR(inode->i_mode)) return; spin_lock(&inode->i_lock); @@ -74,7 +74,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg) } if (!dir_notify_enable) return -EINVAL; - inode = filp->f_dentry->d_inode; + inode = filp->f_path.dentry->d_inode; if (!S_ISDIR(inode->i_mode)) return -ENOTDIR; dn = kmem_cache_alloc(dn_cache, GFP_KERNEL); diff --git a/fs/dquot.c b/fs/dquot.c index f9cd5e23ebd..0952cc474d9 100644 --- a/fs/dquot.c +++ b/fs/dquot.c @@ -694,9 +694,9 @@ restart: 
file_list_lock(); list_for_each(p, &sb->s_files) { struct file *filp = list_entry(p, struct file, f_u.fu_list); - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = filp->f_path.dentry->d_inode; if (filp->f_mode & FMODE_WRITE && dqinit_needed(inode, type)) { - struct dentry *dentry = dget(filp->f_dentry); + struct dentry *dentry = dget(filp->f_path.dentry); file_list_unlock(); sb->dq_op->initialize(inode, type); dput(dentry); @@ -828,6 +828,7 @@ static inline int need_print_warning(struct dquot *dquot) static void print_warning(struct dquot *dquot, const char warntype) { char *msg = NULL; + struct tty_struct *tty; int flag = (warntype == BHARDWARN || warntype == BSOFTLONGWARN) ? DQ_BLKS_B : ((warntype == IHARDWARN || warntype == ISOFTLONGWARN) ? DQ_INODES_B : 0); @@ -835,14 +836,15 @@ static void print_warning(struct dquot *dquot, const char warntype) return; mutex_lock(&tty_mutex); - if (!current->signal->tty) + tty = get_current_tty(); + if (!tty) goto out_lock; - tty_write_message(current->signal->tty, dquot->dq_sb->s_id); + tty_write_message(tty, dquot->dq_sb->s_id); if (warntype == ISOFTWARN || warntype == BSOFTWARN) - tty_write_message(current->signal->tty, ": warning, "); + tty_write_message(tty, ": warning, "); else - tty_write_message(current->signal->tty, ": write failed, "); - tty_write_message(current->signal->tty, quotatypes[dquot->dq_type]); + tty_write_message(tty, ": write failed, "); + tty_write_message(tty, quotatypes[dquot->dq_type]); switch (warntype) { case IHARDWARN: msg = " file limit reached.\r\n"; @@ -863,7 +865,7 @@ static void print_warning(struct dquot *dquot, const char warntype) msg = " block quota exceeded.\r\n"; break; } - tty_write_message(current->signal->tty, msg); + tty_write_message(tty, msg); out_lock: mutex_unlock(&tty_mutex); } diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 7196f50fe15..a86a55ccf87 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -828,9 +828,7 @@ int 
ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat) mutex_unlock(&crypt_stat->cs_tfm_mutex); goto out; } - crypto_blkcipher_set_flags(crypt_stat->tfm, - (ECRYPTFS_DEFAULT_CHAINING_MODE - | CRYPTO_TFM_REQ_WEAK_KEY)); + crypto_blkcipher_set_flags(crypt_stat->tfm, CRYPTO_TFM_REQ_WEAK_KEY); mutex_unlock(&crypt_stat->cs_tfm_mutex); rc = 0; out: diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c index 52d1e36dc74..329efcd3d8c 100644 --- a/fs/ecryptfs/dentry.c +++ b/fs/ecryptfs/dentry.c @@ -25,6 +25,7 @@ #include <linux/dcache.h> #include <linux/namei.h> #include <linux/mount.h> +#include <linux/fs_stack.h> #include "ecryptfs_kernel.h" /** @@ -61,7 +62,7 @@ static int ecryptfs_d_revalidate(struct dentry *dentry, struct nameidata *nd) struct inode *lower_inode = ecryptfs_inode_to_lower(dentry->d_inode); - ecryptfs_copy_attr_all(dentry->d_inode, lower_inode); + fsstack_copy_attr_all(dentry->d_inode, lower_inode, NULL); } out: return rc; diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index f992533d169..0f897109759 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -28,6 +28,8 @@ #include <keys/user-type.h> #include <linux/fs.h> +#include <linux/fs_stack.h> +#include <linux/namei.h> #include <linux/scatterlist.h> /* Version verification for shared data structures w/ userspace */ @@ -174,7 +176,6 @@ ecryptfs_get_key_payload_data(struct key *key) #define ECRYPTFS_FILE_SIZE_BYTES 8 #define ECRYPTFS_DEFAULT_CIPHER "aes" #define ECRYPTFS_DEFAULT_KEY_BYTES 16 -#define ECRYPTFS_DEFAULT_CHAINING_MODE CRYPTO_TFM_MODE_CBC #define ECRYPTFS_DEFAULT_HASH "md5" #define ECRYPTFS_TAG_3_PACKET_TYPE 0x8C #define ECRYPTFS_TAG_11_PACKET_TYPE 0xED @@ -227,8 +228,7 @@ struct ecryptfs_inode_info { /* dentry private data. Each dentry must keep track of a lower * vfsmount too. 
*/ struct ecryptfs_dentry_info { - struct dentry *wdi_dentry; - struct vfsmount *lower_mnt; + struct path lower_path; struct ecryptfs_crypt_stat *crypt_stat; }; @@ -355,26 +355,26 @@ ecryptfs_set_dentry_private(struct dentry *dentry, static inline struct dentry * ecryptfs_dentry_to_lower(struct dentry *dentry) { - return ((struct ecryptfs_dentry_info *)dentry->d_fsdata)->wdi_dentry; + return ((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_path.dentry; } static inline void ecryptfs_set_dentry_lower(struct dentry *dentry, struct dentry *lower_dentry) { - ((struct ecryptfs_dentry_info *)dentry->d_fsdata)->wdi_dentry = + ((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_path.dentry = lower_dentry; } static inline struct vfsmount * ecryptfs_dentry_to_lower_mnt(struct dentry *dentry) { - return ((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_mnt; + return ((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_path.mnt; } static inline void ecryptfs_set_dentry_lower_mnt(struct dentry *dentry, struct vfsmount *lower_mnt) { - ((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_mnt = + ((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_path.mnt = lower_mnt; } @@ -413,9 +413,6 @@ int ecryptfs_encode_filename(struct ecryptfs_crypt_stat *crypt_stat, const char *name, int length, char **encoded_name); struct dentry *ecryptfs_lower_dentry(struct dentry *this_dentry); -void ecryptfs_copy_attr_atime(struct inode *dest, const struct inode *src); -void ecryptfs_copy_attr_all(struct inode *dest, const struct inode *src); -void ecryptfs_copy_inode_size(struct inode *dst, const struct inode *src); void ecryptfs_dump_hex(char *data, int bytes); int virt_to_scatterlist(const void *addr, int size, struct scatterlist *sg, int sg_size); diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 42099e779a5..c5a2e5298f1 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -30,6 +30,7 @@ #include <linux/security.h> #include <linux/smp_lock.h> 
#include <linux/compat.h> +#include <linux/fs_stack.h> #include "ecryptfs_kernel.h" /** @@ -75,7 +76,7 @@ static loff_t ecryptfs_llseek(struct file *file, loff_t offset, int origin) } ecryptfs_printk(KERN_DEBUG, "new_end_pos = [0x%.16x]\n", new_end_pos); if (expanding_file) { - rc = ecryptfs_truncate(file->f_dentry, new_end_pos); + rc = ecryptfs_truncate(file->f_path.dentry, new_end_pos); if (rc) { rv = rc; ecryptfs_printk(KERN_ERR, "Error on attempt to " @@ -116,8 +117,8 @@ static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb, if (-EIOCBQUEUED == rc) rc = wait_on_sync_kiocb(iocb); if (rc >= 0) { - lower_dentry = ecryptfs_dentry_to_lower(file->f_dentry); - lower_vfsmount = ecryptfs_dentry_to_lower_mnt(file->f_dentry); + lower_dentry = ecryptfs_dentry_to_lower(file->f_path.dentry); + lower_vfsmount = ecryptfs_dentry_to_lower_mnt(file->f_path.dentry); touch_atime(lower_vfsmount, lower_dentry); } return rc; @@ -176,10 +177,10 @@ static int ecryptfs_readdir(struct file *file, void *dirent, filldir_t filldir) lower_file = ecryptfs_file_to_lower(file); lower_file->f_pos = file->f_pos; - inode = file->f_dentry->d_inode; + inode = file->f_path.dentry->d_inode; memset(&buf, 0, sizeof(buf)); buf.dirent = dirent; - buf.dentry = file->f_dentry; + buf.dentry = file->f_path.dentry; buf.filldir = filldir; retry: buf.filldir_called = 0; @@ -192,7 +193,7 @@ retry: goto retry; file->f_pos = lower_file->f_pos; if (rc >= 0) - ecryptfs_copy_attr_atime(inode, lower_file->f_dentry->d_inode); + fsstack_copy_attr_atime(inode, lower_file->f_path.dentry->d_inode); return rc; } @@ -239,7 +240,7 @@ static int ecryptfs_open(struct inode *inode, struct file *file) int rc = 0; struct ecryptfs_crypt_stat *crypt_stat = NULL; struct ecryptfs_mount_crypt_stat *mount_crypt_stat; - struct dentry *ecryptfs_dentry = file->f_dentry; + struct dentry *ecryptfs_dentry = file->f_path.dentry; /* Private value of ecryptfs_dentry allocated in * ecryptfs_lookup() */ struct dentry *lower_dentry = 
ecryptfs_dentry_to_lower(ecryptfs_dentry); diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 8a1945a84c3..11f5e5076ae 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -30,6 +30,7 @@ #include <linux/namei.h> #include <linux/mount.h> #include <linux/crypto.h> +#include <linux/fs_stack.h> #include "ecryptfs_kernel.h" static struct dentry *lock_parent(struct dentry *dentry) @@ -53,48 +54,6 @@ static void unlock_dir(struct dentry *dir) dput(dir); } -void ecryptfs_copy_inode_size(struct inode *dst, const struct inode *src) -{ - i_size_write(dst, i_size_read((struct inode *)src)); - dst->i_blocks = src->i_blocks; -} - -void ecryptfs_copy_attr_atime(struct inode *dest, const struct inode *src) -{ - dest->i_atime = src->i_atime; -} - -static void ecryptfs_copy_attr_times(struct inode *dest, - const struct inode *src) -{ - dest->i_atime = src->i_atime; - dest->i_mtime = src->i_mtime; - dest->i_ctime = src->i_ctime; -} - -static void ecryptfs_copy_attr_timesizes(struct inode *dest, - const struct inode *src) -{ - dest->i_atime = src->i_atime; - dest->i_mtime = src->i_mtime; - dest->i_ctime = src->i_ctime; - ecryptfs_copy_inode_size(dest, src); -} - -void ecryptfs_copy_attr_all(struct inode *dest, const struct inode *src) -{ - dest->i_mode = src->i_mode; - dest->i_nlink = src->i_nlink; - dest->i_uid = src->i_uid; - dest->i_gid = src->i_gid; - dest->i_rdev = src->i_rdev; - dest->i_atime = src->i_atime; - dest->i_mtime = src->i_mtime; - dest->i_ctime = src->i_ctime; - dest->i_blkbits = src->i_blkbits; - dest->i_flags = src->i_flags; -} - /** * ecryptfs_create_underlying_file * @lower_dir_inode: inode of the parent in the lower fs of the new file @@ -171,8 +130,8 @@ ecryptfs_do_create(struct inode *directory_inode, ecryptfs_printk(KERN_ERR, "Failure in ecryptfs_interpose\n"); goto out_lock; } - ecryptfs_copy_attr_timesizes(directory_inode, - lower_dir_dentry->d_inode); + fsstack_copy_attr_times(directory_inode, lower_dir_dentry->d_inode); + 
fsstack_copy_inode_size(directory_inode, lower_dir_dentry->d_inode); out_lock: unlock_dir(lower_dir_dentry); out: @@ -196,7 +155,7 @@ static int grow_file(struct dentry *ecryptfs_dentry, struct file *lower_file, struct ecryptfs_file_info tmp_file_info; memset(&fake_file, 0, sizeof(fake_file)); - fake_file.f_dentry = ecryptfs_dentry; + fake_file.f_path.dentry = ecryptfs_dentry; memset(&tmp_file_info, 0, sizeof(tmp_file_info)); ecryptfs_set_file_private(&fake_file, &tmp_file_info); ecryptfs_set_file_lower(&fake_file, lower_file); @@ -365,7 +324,7 @@ static struct dentry *ecryptfs_lookup(struct inode *dir, struct dentry *dentry, "d_name.name = [%s]\n", lower_dentry, lower_dentry->d_name.name); lower_inode = lower_dentry->d_inode; - ecryptfs_copy_attr_atime(dir, lower_dir_dentry->d_inode); + fsstack_copy_attr_atime(dir, lower_dir_dentry->d_inode); BUG_ON(!atomic_read(&lower_dentry->d_count)); ecryptfs_set_dentry_private(dentry, kmem_cache_alloc(ecryptfs_dentry_info_cache, @@ -462,7 +421,8 @@ static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir, rc = ecryptfs_interpose(lower_new_dentry, new_dentry, dir->i_sb, 0); if (rc) goto out_lock; - ecryptfs_copy_attr_timesizes(dir, lower_new_dentry->d_inode); + fsstack_copy_attr_times(dir, lower_new_dentry->d_inode); + fsstack_copy_inode_size(dir, lower_new_dentry->d_inode); old_dentry->d_inode->i_nlink = ecryptfs_inode_to_lower(old_dentry->d_inode)->i_nlink; i_size_write(new_dentry->d_inode, file_size_save); @@ -488,7 +448,7 @@ static int ecryptfs_unlink(struct inode *dir, struct dentry *dentry) printk(KERN_ERR "Error in vfs_unlink; rc = [%d]\n", rc); goto out_unlock; } - ecryptfs_copy_attr_times(dir, lower_dir_inode); + fsstack_copy_attr_times(dir, lower_dir_inode); dentry->d_inode->i_nlink = ecryptfs_inode_to_lower(dentry->d_inode)->i_nlink; dentry->d_inode->i_ctime = dir->i_ctime; @@ -527,7 +487,8 @@ static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry, rc = ecryptfs_interpose(lower_dentry, 
dentry, dir->i_sb, 0); if (rc) goto out_lock; - ecryptfs_copy_attr_timesizes(dir, lower_dir_dentry->d_inode); + fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode); + fsstack_copy_inode_size(dir, lower_dir_dentry->d_inode); out_lock: unlock_dir(lower_dir_dentry); dput(lower_dentry); @@ -550,7 +511,8 @@ static int ecryptfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb, 0); if (rc) goto out; - ecryptfs_copy_attr_timesizes(dir, lower_dir_dentry->d_inode); + fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode); + fsstack_copy_inode_size(dir, lower_dir_dentry->d_inode); dir->i_nlink = lower_dir_dentry->d_inode->i_nlink; out: unlock_dir(lower_dir_dentry); @@ -573,7 +535,7 @@ static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry) dput(lower_dentry); if (!rc) d_delete(lower_dentry); - ecryptfs_copy_attr_times(dir, lower_dir_dentry->d_inode); + fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode); dir->i_nlink = lower_dir_dentry->d_inode->i_nlink; unlock_dir(lower_dir_dentry); if (!rc) @@ -597,7 +559,8 @@ ecryptfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb, 0); if (rc) goto out; - ecryptfs_copy_attr_timesizes(dir, lower_dir_dentry->d_inode); + fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode); + fsstack_copy_inode_size(dir, lower_dir_dentry->d_inode); out: unlock_dir(lower_dir_dentry); if (!dentry->d_inode) @@ -626,9 +589,9 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry, lower_new_dir_dentry->d_inode, lower_new_dentry); if (rc) goto out_lock; - ecryptfs_copy_attr_all(new_dir, lower_new_dir_dentry->d_inode); + fsstack_copy_attr_all(new_dir, lower_new_dir_dentry->d_inode, NULL); if (new_dir != old_dir) - ecryptfs_copy_attr_all(old_dir, lower_old_dir_dentry->d_inode); + fsstack_copy_attr_all(old_dir, lower_old_dir_dentry->d_inode, NULL); out_lock: 
unlock_rename(lower_old_dir_dentry, lower_new_dir_dentry); dput(lower_new_dentry->d_parent); @@ -684,8 +647,8 @@ ecryptfs_readlink(struct dentry *dentry, char __user * buf, int bufsiz) rc = -EFAULT; } kfree(decoded_name); - ecryptfs_copy_attr_atime(dentry->d_inode, - lower_dentry->d_inode); + fsstack_copy_attr_atime(dentry->d_inode, + lower_dentry->d_inode); } out_free_lower_buf: kfree(lower_buf); @@ -791,7 +754,7 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length) * the file in the underlying filesystem so that the * truncation has an effect there as well. */ memset(&fake_ecryptfs_file, 0, sizeof(fake_ecryptfs_file)); - fake_ecryptfs_file.f_dentry = dentry; + fake_ecryptfs_file.f_path.dentry = dentry; /* Released at out_free: label */ ecryptfs_set_file_private(&fake_ecryptfs_file, kmem_cache_alloc(ecryptfs_file_info_cache, @@ -915,7 +878,7 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia) } rc = notify_change(lower_dentry, ia); out: - ecryptfs_copy_attr_all(inode, lower_inode); + fsstack_copy_attr_all(inode, lower_inode, NULL); return rc; } diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 3ede12b2593..d0541ae8fab 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -35,6 +35,7 @@ #include <linux/pagemap.h> #include <linux/key.h> #include <linux/parser.h> +#include <linux/fs_stack.h> #include "ecryptfs_kernel.h" /** @@ -112,10 +113,10 @@ int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry, d_add(dentry, inode); else d_instantiate(dentry, inode); - ecryptfs_copy_attr_all(inode, lower_inode); + fsstack_copy_attr_all(inode, lower_inode, NULL); /* This size will be overwritten for real files w/ headers and * other metadata */ - ecryptfs_copy_inode_size(inode, lower_inode); + fsstack_copy_inode_size(inode, lower_inode); out: return rc; } diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index 924dd90a4cf..06843d24f23 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -51,7 +51,7 @@ 
static struct page *ecryptfs_get1page(struct file *file, int index) struct inode *inode; struct address_space *mapping; - dentry = file->f_dentry; + dentry = file->f_path.dentry; inode = dentry->d_inode; mapping = inode->i_mapping; page = read_cache_page(mapping, index, @@ -84,7 +84,7 @@ int write_zeros(struct file *file, pgoff_t index, int start, int num_zeros); int ecryptfs_fill_zeros(struct file *file, loff_t new_length) { int rc = 0; - struct dentry *dentry = file->f_dentry; + struct dentry *dentry = file->f_path.dentry; struct inode *inode = dentry->d_inode; pgoff_t old_end_page_index = 0; pgoff_t index = old_end_page_index; @@ -218,7 +218,7 @@ int ecryptfs_do_readpage(struct file *file, struct page *page, char *lower_page_data; const struct address_space_operations *lower_a_ops; - dentry = file->f_dentry; + dentry = file->f_path.dentry; lower_file = ecryptfs_file_to_lower(file); lower_dentry = ecryptfs_dentry_to_lower(dentry); inode = dentry->d_inode; @@ -275,9 +275,9 @@ static int ecryptfs_readpage(struct file *file, struct page *page) int rc = 0; struct ecryptfs_crypt_stat *crypt_stat; - BUG_ON(!(file && file->f_dentry && file->f_dentry->d_inode)); - crypt_stat = - &ecryptfs_inode_to_private(file->f_dentry->d_inode)->crypt_stat; + BUG_ON(!(file && file->f_path.dentry && file->f_path.dentry->d_inode)); + crypt_stat = &ecryptfs_inode_to_private(file->f_path.dentry->d_inode) + ->crypt_stat; if (!crypt_stat || !ECRYPTFS_CHECK_FLAG(crypt_stat->flags, ECRYPTFS_ENCRYPTED) || ECRYPTFS_CHECK_FLAG(crypt_stat->flags, ECRYPTFS_NEW_FILE)) { @@ -638,8 +638,8 @@ static int ecryptfs_commit_write(struct file *file, struct page *page, lower_inode = ecryptfs_inode_to_lower(inode); lower_file = ecryptfs_file_to_lower(file); mutex_lock(&lower_inode->i_mutex); - crypt_stat = - &ecryptfs_inode_to_private(file->f_dentry->d_inode)->crypt_stat; + crypt_stat = &ecryptfs_inode_to_private(file->f_path.dentry->d_inode) + ->crypt_stat; if (ECRYPTFS_CHECK_FLAG(crypt_stat->flags, 
ECRYPTFS_NEW_FILE)) { ecryptfs_printk(KERN_DEBUG, "ECRYPTFS_NEW_FILE flag set in " "crypt_stat at memory location [%p]\n", crypt_stat); diff --git a/fs/efs/dir.c b/fs/efs/dir.c index 17f5b2d3c16..b46c488eefc 100644 --- a/fs/efs/dir.c +++ b/fs/efs/dir.c @@ -20,7 +20,7 @@ struct inode_operations efs_dir_inode_operations = { }; static int efs_readdir(struct file *filp, void *dirent, filldir_t filldir) { - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = filp->f_path.dentry->d_inode; struct buffer_head *bh; struct efs_dir *dirblock; diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 88a6f8d0b88..3ae644e7e86 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -795,8 +795,8 @@ static int ep_getfd(int *efd, struct inode **einode, struct file **efile, goto eexit_4; dentry->d_op = &eventpollfs_dentry_operations; d_add(dentry, inode); - file->f_vfsmnt = mntget(eventpoll_mnt); - file->f_dentry = dentry; + file->f_path.mnt = mntget(eventpoll_mnt); + file->f_path.dentry = dentry; file->f_mapping = inode->i_mapping; file->f_pos = 0; diff --git a/fs/exec.c b/fs/exec.c index add0e03c3ea..11fe93f7363 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -38,6 +38,7 @@ #include <linux/binfmts.h> #include <linux/swap.h> #include <linux/utsname.h> +#include <linux/pid_namespace.h> #include <linux/module.h> #include <linux/namei.h> #include <linux/proc_fs.h> @@ -620,8 +621,8 @@ static int de_thread(struct task_struct *tsk) * Reparenting needs write_lock on tasklist_lock, * so it is safe to do it under read_lock. 
*/ - if (unlikely(tsk->group_leader == child_reaper)) - child_reaper = tsk; + if (unlikely(tsk->group_leader == child_reaper(tsk))) + tsk->nsproxy->pid_ns->child_reaper = tsk; zap_other_threads(tsk); read_unlock(&tasklist_lock); @@ -782,7 +783,7 @@ static void flush_old_files(struct files_struct * files) j++; i = j * __NFDBITS; fdt = files_fdtable(files); - if (i >= fdt->max_fds || i >= fdt->max_fdset) + if (i >= fdt->max_fds) break; set = fdt->close_on_exec->fds_bits[j]; if (!set) @@ -912,7 +913,7 @@ EXPORT_SYMBOL(flush_old_exec); int prepare_binprm(struct linux_binprm *bprm) { int mode; - struct inode * inode = bprm->file->f_dentry->d_inode; + struct inode * inode = bprm->file->f_path.dentry->d_inode; int retval; mode = inode->i_mode; @@ -922,7 +923,7 @@ int prepare_binprm(struct linux_binprm *bprm) bprm->e_uid = current->euid; bprm->e_gid = current->egid; - if(!(bprm->file->f_vfsmnt->mnt_flags & MNT_NOSUID)) { + if(!(bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID)) { /* Set-uid? */ if (mode & S_ISUID) { current->personality &= ~PER_CLEAR_ON_SETID; @@ -1519,10 +1520,10 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs) 0600); if (IS_ERR(file)) goto fail_unlock; - inode = file->f_dentry->d_inode; + inode = file->f_path.dentry->d_inode; if (inode->i_nlink > 1) goto close_fail; /* multiple links - don't dump */ - if (!ispipe && d_unhashed(file->f_dentry)) + if (!ispipe && d_unhashed(file->f_path.dentry)) goto close_fail; /* AK: actually i see no reason to not allow this for named pipes etc., @@ -1533,7 +1534,7 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs) goto close_fail; if (!file->f_op->write) goto close_fail; - if (!ispipe && do_truncate(file->f_dentry, 0, 0, file) != 0) + if (!ispipe && do_truncate(file->f_path.dentry, 0, 0, file) != 0) goto close_fail; retval = binfmt->core_dump(signr, regs, file); diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index 3e7a84a1e50..0b02ba9642d 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ 
-248,7 +248,7 @@ static int ext2_readdir (struct file * filp, void * dirent, filldir_t filldir) { loff_t pos = filp->f_pos; - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = filp->f_path.dentry->d_inode; struct super_block *sb = inode->i_sb; unsigned int offset = pos & ~PAGE_CACHE_MASK; unsigned long n = pos >> PAGE_CACHE_SHIFT; diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c index e3cf8c81507..4b099d31071 100644 --- a/fs/ext2/ioctl.c +++ b/fs/ext2/ioctl.c @@ -90,7 +90,7 @@ int ext2_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, #ifdef CONFIG_COMPAT long ext2_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file->f_path.dentry->d_inode; int ret; /* These are just misnamed, they actually get/put from/to user an int */ diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 255cef5f742..6347c2dbdd8 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -597,8 +597,6 @@ static int ext2_check_descriptors (struct super_block * sb) return 1; } -#define log2(n) ffz(~(n)) - /* * Maximal file size. There is a direct, and {,double-,triple-}indirect * block limit, and also a limit of (2^32 - 1) 512-byte sectors in i_blocks. 
@@ -834,9 +832,9 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) sbi->s_sbh = bh; sbi->s_mount_state = le16_to_cpu(es->s_state); sbi->s_addr_per_block_bits = - log2 (EXT2_ADDR_PER_BLOCK(sb)); + ilog2 (EXT2_ADDR_PER_BLOCK(sb)); sbi->s_desc_per_block_bits = - log2 (EXT2_DESC_PER_BLOCK(sb)); + ilog2 (EXT2_DESC_PER_BLOCK(sb)); if (sb->s_magic != EXT2_SUPER_MAGIC) goto cantfind_ext2; diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c index 5a9313ecd4e..665adee99b3 100644 --- a/fs/ext3/dir.c +++ b/fs/ext3/dir.c @@ -103,7 +103,7 @@ static int ext3_readdir(struct file * filp, struct ext3_dir_entry_2 *de; struct super_block *sb; int err; - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = filp->f_path.dentry->d_inode; int ret = 0; sb = inode->i_sb; @@ -122,7 +122,7 @@ static int ext3_readdir(struct file * filp, * We don't set the inode dirty flag since it's not * critical that it get flushed back to the disk. */ - EXT3_I(filp->f_dentry->d_inode)->i_flags &= ~EXT3_INDEX_FL; + EXT3_I(filp->f_path.dentry->d_inode)->i_flags &= ~EXT3_INDEX_FL; } #endif stored = 0; @@ -402,7 +402,7 @@ static int call_filldir(struct file * filp, void * dirent, { struct dir_private_info *info = filp->private_data; loff_t curr_pos; - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = filp->f_path.dentry->d_inode; struct super_block * sb; int error; @@ -432,7 +432,7 @@ static int ext3_dx_readdir(struct file * filp, void * dirent, filldir_t filldir) { struct dir_private_info *info = filp->private_data; - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = filp->f_path.dentry->d_inode; struct fname *fname; int ret; diff --git a/fs/ext3/file.c b/fs/ext3/file.c index e96c388047e..881f6365c41 100644 --- a/fs/ext3/file.c +++ b/fs/ext3/file.c @@ -52,7 +52,7 @@ ext3_file_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { struct file *file = iocb->ki_filp; - struct inode *inode = 
file->f_dentry->d_inode; + struct inode *inode = file->f_path.dentry->d_inode; ssize_t ret; int err; diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c index 12daa686957..9b8090d94e6 100644 --- a/fs/ext3/ioctl.c +++ b/fs/ext3/ioctl.c @@ -257,7 +257,7 @@ flags_err: #ifdef CONFIG_COMPAT long ext3_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file->f_path.dentry->d_inode; int ret; /* These are just misnamed, they actually get/put from/to user an int */ diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 60d2f9dbdb0..4df39c4315e 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -602,7 +602,7 @@ int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash, dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash, start_minor_hash)); - dir = dir_file->f_dentry->d_inode; + dir = dir_file->f_path.dentry->d_inode; if (!(EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) { hinfo.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version; hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed; @@ -613,7 +613,7 @@ int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash, } hinfo.hash = start_hash; hinfo.minor_hash = 0; - frame = dx_probe(NULL, dir_file->f_dentry->d_inode, &hinfo, frames, &err); + frame = dx_probe(NULL, dir_file->f_path.dentry->d_inode, &hinfo, frames, &err); if (!frame) return err; diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 580b8a6ca97..b34886734a4 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -1347,8 +1347,6 @@ static void ext3_orphan_cleanup (struct super_block * sb, sb->s_flags = s_flags; /* Restore MS_RDONLY status */ } -#define log2(n) ffz(~(n)) - /* * Maximal file size. There is a direct, and {,double-,triple-}indirect * block limit, and also a limit of (2^32 - 1) 512-byte sectors in i_blocks. 
@@ -1597,8 +1595,8 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) sbi->s_desc_per_block = blocksize / sizeof(struct ext3_group_desc); sbi->s_sbh = bh; sbi->s_mount_state = le16_to_cpu(es->s_state); - sbi->s_addr_per_block_bits = log2(EXT3_ADDR_PER_BLOCK(sb)); - sbi->s_desc_per_block_bits = log2(EXT3_DESC_PER_BLOCK(sb)); + sbi->s_addr_per_block_bits = ilog2(EXT3_ADDR_PER_BLOCK(sb)); + sbi->s_desc_per_block_bits = ilog2(EXT3_DESC_PER_BLOCK(sb)); for (i=0; i < 4; i++) sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); sbi->s_def_hash_version = es->s_def_hash_version; diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index f2ed3e7fb9f..da80368b66f 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -103,7 +103,7 @@ static int ext4_readdir(struct file * filp, struct ext4_dir_entry_2 *de; struct super_block *sb; int err; - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = filp->f_path.dentry->d_inode; int ret = 0; sb = inode->i_sb; @@ -122,7 +122,7 @@ static int ext4_readdir(struct file * filp, * We don't set the inode dirty flag since it's not * critical that it get flushed back to the disk. 
*/ - EXT4_I(filp->f_dentry->d_inode)->i_flags &= ~EXT4_INDEX_FL; + EXT4_I(filp->f_path.dentry->d_inode)->i_flags &= ~EXT4_INDEX_FL; } #endif stored = 0; @@ -402,7 +402,7 @@ static int call_filldir(struct file * filp, void * dirent, { struct dir_private_info *info = filp->private_data; loff_t curr_pos; - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = filp->f_path.dentry->d_inode; struct super_block * sb; int error; @@ -432,7 +432,7 @@ static int ext4_dx_readdir(struct file * filp, void * dirent, filldir_t filldir) { struct dir_private_info *info = filp->private_data; - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = filp->f_path.dentry->d_inode; struct fname *fname; int ret; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 0b622c0624b..3bbc24b5878 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -52,7 +52,7 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { struct file *file = iocb->ki_filp; - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file->f_path.dentry->d_inode; ssize_t ret; int err; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 1d85d4ec959..a127cc03c9f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1232,7 +1232,7 @@ retry: from, to, NULL, do_journal_get_write_access); if (ret) /* fatal error, just put the handle and return */ - journal_stop(handle); + ext4_journal_stop(handle); } return ret; diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 22a737c306c..500567dd53b 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -256,7 +256,7 @@ flags_err: #ifdef CONFIG_COMPAT long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file->f_path.dentry->d_inode; int ret; /* These are just misnamed, they actually get/put from/to user an int */ diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 859990eac50..e5a74a5ac26 100644 --- 
a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -602,7 +602,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash, start_minor_hash)); - dir = dir_file->f_dentry->d_inode; + dir = dir_file->f_path.dentry->d_inode; if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) { hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; @@ -613,7 +613,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, } hinfo.hash = start_hash; hinfo.minor_hash = 0; - frame = dx_probe(NULL, dir_file->f_dentry->d_inode, &hinfo, frames, &err); + frame = dx_probe(NULL, dir_file->f_path.dentry->d_inode, &hinfo, frames, &err); if (!frame) return err; diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 69c439f4438..c16af246d24 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -579,7 +579,7 @@ parse_record: if (!memcmp(de->name, MSDOS_DOT, MSDOS_NAME)) inum = inode->i_ino; else if (!memcmp(de->name, MSDOS_DOTDOT, MSDOS_NAME)) { - inum = parent_ino(filp->f_dentry); + inum = parent_ino(filp->f_path.dentry); } else { loff_t i_pos = fat_make_i_pos(sb, bh, de); struct inode *tmp = fat_iget(sb, i_pos); @@ -643,7 +643,7 @@ out: static int fat_readdir(struct file *filp, void *dirent, filldir_t filldir) { - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = filp->f_path.dentry->d_inode; return __fat_readdir(inode, filp, dirent, filldir, 0, 0); } @@ -782,7 +782,7 @@ static long fat_compat_dir_ioctl(struct file *file, unsigned cmd, set_fs(KERNEL_DS); lock_kernel(); - ret = fat_dir_ioctl(file->f_dentry->d_inode, file, + ret = fat_dir_ioctl(file->f_path.dentry->d_inode, file, cmd, (unsigned long) &d); unlock_kernel(); set_fs(oldfs); diff --git a/fs/fat/file.c b/fs/fat/file.c index 0aa813d944a..c1237b70c1f 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -92,7 +92,7 @@ int fat_generic_ioctl(struct inode *inode, struct file *filp, } /* This MUST be done before 
doing anything irreversible... */ - err = notify_change(filp->f_dentry, &ia); + err = notify_change(filp->f_path.dentry, &ia); if (err) goto up; diff --git a/fs/fcntl.c b/fs/fcntl.c index 4740d35e52c..8e382a5d51b 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -77,10 +77,9 @@ repeat: start = files->next_fd; newfd = start; - if (start < fdt->max_fdset) { + if (start < fdt->max_fds) newfd = find_next_zero_bit(fdt->open_fds->fds_bits, - fdt->max_fdset, start); - } + fdt->max_fds, start); error = -EMFILE; if (newfd >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur) @@ -204,7 +203,7 @@ asmlinkage long sys_dup(unsigned int fildes) static int setfl(int fd, struct file * filp, unsigned long arg) { - struct inode * inode = filp->f_dentry->d_inode; + struct inode * inode = filp->f_path.dentry->d_inode; int error = 0; /* diff --git a/fs/file.c b/fs/file.c index 51aef675470..c5575de0111 100644 --- a/fs/file.c +++ b/fs/file.c @@ -32,46 +32,28 @@ struct fdtable_defer { */ static DEFINE_PER_CPU(struct fdtable_defer, fdtable_defer_list); - -/* - * Allocate an fd array, using kmalloc or vmalloc. - * Note: the array isn't cleared at allocation time. - */ -struct file ** alloc_fd_array(int num) +static inline void * alloc_fdmem(unsigned int size) { - struct file **new_fds; - int size = num * sizeof(struct file *); - if (size <= PAGE_SIZE) - new_fds = (struct file **) kmalloc(size, GFP_KERNEL); - else - new_fds = (struct file **) vmalloc(size); - return new_fds; + return kmalloc(size, GFP_KERNEL); + else + return vmalloc(size); } -void free_fd_array(struct file **array, int num) +static inline void free_fdarr(struct fdtable *fdt) { - int size = num * sizeof(struct file *); - - if (!array) { - printk (KERN_ERR "free_fd_array: array = 0 (num = %d)\n", num); - return; - } - - if (num <= NR_OPEN_DEFAULT) /* Don't free the embedded fd array! 
*/ - return; - else if (size <= PAGE_SIZE) - kfree(array); + if (fdt->max_fds <= (PAGE_SIZE / sizeof(struct file *))) + kfree(fdt->fd); else - vfree(array); + vfree(fdt->fd); } -static void __free_fdtable(struct fdtable *fdt) +static inline void free_fdset(struct fdtable *fdt) { - free_fdset(fdt->open_fds, fdt->max_fdset); - free_fdset(fdt->close_on_exec, fdt->max_fdset); - free_fd_array(fdt->fd, fdt->max_fds); - kfree(fdt); + if (fdt->max_fds <= (PAGE_SIZE * BITS_PER_BYTE / 2)) + kfree(fdt->open_fds); + else + vfree(fdt->open_fds); } static void free_fdtable_work(struct work_struct *work) @@ -86,41 +68,32 @@ static void free_fdtable_work(struct work_struct *work) spin_unlock_bh(&f->lock); while(fdt) { struct fdtable *next = fdt->next; - __free_fdtable(fdt); + vfree(fdt->fd); + free_fdset(fdt); + kfree(fdt); fdt = next; } } -static void free_fdtable_rcu(struct rcu_head *rcu) +void free_fdtable_rcu(struct rcu_head *rcu) { struct fdtable *fdt = container_of(rcu, struct fdtable, rcu); - int fdset_size, fdarray_size; struct fdtable_defer *fddef; BUG_ON(!fdt); - fdset_size = fdt->max_fdset / 8; - fdarray_size = fdt->max_fds * sizeof(struct file *); - if (fdt->free_files) { + if (fdt->max_fds <= NR_OPEN_DEFAULT) { /* - * The this fdtable was embedded in the files structure - * and the files structure itself was getting destroyed. - * It is now safe to free the files structure. + * This fdtable is embedded in the files structure and that + * structure itself is getting destroyed. 
*/ - kmem_cache_free(files_cachep, fdt->free_files); + kmem_cache_free(files_cachep, + container_of(fdt, struct files_struct, fdtab)); return; } - if (fdt->max_fdset <= EMBEDDED_FD_SET_SIZE && - fdt->max_fds <= NR_OPEN_DEFAULT) { - /* - * The fdtable was embedded - */ - return; - } - if (fdset_size <= PAGE_SIZE && fdarray_size <= PAGE_SIZE) { - kfree(fdt->open_fds); - kfree(fdt->close_on_exec); + if (fdt->max_fds <= (PAGE_SIZE / sizeof(struct file *))) { kfree(fdt->fd); + kfree(fdt->open_fds); kfree(fdt); } else { fddef = &get_cpu_var(fdtable_defer_list); @@ -134,136 +107,74 @@ static void free_fdtable_rcu(struct rcu_head *rcu) } } -void free_fdtable(struct fdtable *fdt) -{ - if (fdt->free_files || - fdt->max_fdset > EMBEDDED_FD_SET_SIZE || - fdt->max_fds > NR_OPEN_DEFAULT) - call_rcu(&fdt->rcu, free_fdtable_rcu); -} - /* * Expand the fdset in the files_struct. Called with the files spinlock * held for write. */ -static void copy_fdtable(struct fdtable *nfdt, struct fdtable *fdt) +static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt) { - int i; - int count; - - BUG_ON(nfdt->max_fdset < fdt->max_fdset); - BUG_ON(nfdt->max_fds < fdt->max_fds); - /* Copy the existing tables and install the new pointers */ - - i = fdt->max_fdset / (sizeof(unsigned long) * 8); - count = (nfdt->max_fdset - fdt->max_fdset) / 8; + unsigned int cpy, set; - /* - * Don't copy the entire array if the current fdset is - * not yet initialised. 
- */ - if (i) { - memcpy (nfdt->open_fds, fdt->open_fds, - fdt->max_fdset/8); - memcpy (nfdt->close_on_exec, fdt->close_on_exec, - fdt->max_fdset/8); - memset (&nfdt->open_fds->fds_bits[i], 0, count); - memset (&nfdt->close_on_exec->fds_bits[i], 0, count); - } - - /* Don't copy/clear the array if we are creating a new - fd array for fork() */ - if (fdt->max_fds) { - memcpy(nfdt->fd, fdt->fd, - fdt->max_fds * sizeof(struct file *)); - /* clear the remainder of the array */ - memset(&nfdt->fd[fdt->max_fds], 0, - (nfdt->max_fds - fdt->max_fds) * - sizeof(struct file *)); - } -} - -/* - * Allocate an fdset array, using kmalloc or vmalloc. - * Note: the array isn't cleared at allocation time. - */ -fd_set * alloc_fdset(int num) -{ - fd_set *new_fdset; - int size = num / 8; + BUG_ON(nfdt->max_fds < ofdt->max_fds); + if (ofdt->max_fds == 0) + return; - if (size <= PAGE_SIZE) - new_fdset = (fd_set *) kmalloc(size, GFP_KERNEL); - else - new_fdset = (fd_set *) vmalloc(size); - return new_fdset; + cpy = ofdt->max_fds * sizeof(struct file *); + set = (nfdt->max_fds - ofdt->max_fds) * sizeof(struct file *); + memcpy(nfdt->fd, ofdt->fd, cpy); + memset((char *)(nfdt->fd) + cpy, 0, set); + + cpy = ofdt->max_fds / BITS_PER_BYTE; + set = (nfdt->max_fds - ofdt->max_fds) / BITS_PER_BYTE; + memcpy(nfdt->open_fds, ofdt->open_fds, cpy); + memset((char *)(nfdt->open_fds) + cpy, 0, set); + memcpy(nfdt->close_on_exec, ofdt->close_on_exec, cpy); + memset((char *)(nfdt->close_on_exec) + cpy, 0, set); } -void free_fdset(fd_set *array, int num) +static struct fdtable * alloc_fdtable(unsigned int nr) { - if (num <= EMBEDDED_FD_SET_SIZE) /* Don't free an embedded fdset */ - return; - else if (num <= 8 * PAGE_SIZE) - kfree(array); - else - vfree(array); -} + struct fdtable *fdt; + char *data; -static struct fdtable *alloc_fdtable(int nr) -{ - struct fdtable *fdt = NULL; - int nfds = 0; - fd_set *new_openset = NULL, *new_execset = NULL; - struct file **new_fds; + /* + * Figure out how many fds we 
actually want to support in this fdtable. + * Allocation steps are keyed to the size of the fdarray, since it + * grows far faster than any of the other dynamic data. We try to fit + * the fdarray into comfortable page-tuned chunks: starting at 1024B + * and growing in powers of two from there on. + */ + nr /= (1024 / sizeof(struct file *)); + nr = roundup_pow_of_two(nr + 1); + nr *= (1024 / sizeof(struct file *)); + if (nr > NR_OPEN) + nr = NR_OPEN; - fdt = kzalloc(sizeof(*fdt), GFP_KERNEL); + fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL); if (!fdt) - goto out; - - nfds = max_t(int, 8 * L1_CACHE_BYTES, roundup_pow_of_two(nr + 1)); - if (nfds > NR_OPEN) - nfds = NR_OPEN; + goto out; + fdt->max_fds = nr; + data = alloc_fdmem(nr * sizeof(struct file *)); + if (!data) + goto out_fdt; + fdt->fd = (struct file **)data; + data = alloc_fdmem(max_t(unsigned int, + 2 * nr / BITS_PER_BYTE, L1_CACHE_BYTES)); + if (!data) + goto out_arr; + fdt->open_fds = (fd_set *)data; + data += nr / BITS_PER_BYTE; + fdt->close_on_exec = (fd_set *)data; + INIT_RCU_HEAD(&fdt->rcu); + fdt->next = NULL; - new_openset = alloc_fdset(nfds); - new_execset = alloc_fdset(nfds); - if (!new_openset || !new_execset) - goto out; - fdt->open_fds = new_openset; - fdt->close_on_exec = new_execset; - fdt->max_fdset = nfds; - - nfds = NR_OPEN_DEFAULT; - /* - * Expand to the max in easy steps, and keep expanding it until - * we have enough for the requested fd array size. 
- */ - do { -#if NR_OPEN_DEFAULT < 256 - if (nfds < 256) - nfds = 256; - else -#endif - if (nfds < (PAGE_SIZE / sizeof(struct file *))) - nfds = PAGE_SIZE / sizeof(struct file *); - else { - nfds = nfds * 2; - if (nfds > NR_OPEN) - nfds = NR_OPEN; - } - } while (nfds <= nr); - new_fds = alloc_fd_array(nfds); - if (!new_fds) - goto out2; - fdt->fd = new_fds; - fdt->max_fds = nfds; - fdt->free_files = NULL; return fdt; -out2: - nfds = fdt->max_fdset; -out: - free_fdset(new_openset, nfds); - free_fdset(new_execset, nfds); + +out_arr: + free_fdarr(fdt); +out_fdt: kfree(fdt); +out: return NULL; } @@ -290,14 +201,17 @@ static int expand_fdtable(struct files_struct *files, int nr) * we dropped the lock */ cur_fdt = files_fdtable(files); - if (nr >= cur_fdt->max_fds || nr >= cur_fdt->max_fdset) { + if (nr >= cur_fdt->max_fds) { /* Continue as planned */ copy_fdtable(new_fdt, cur_fdt); rcu_assign_pointer(files->fdt, new_fdt); - free_fdtable(cur_fdt); + if (cur_fdt->max_fds > NR_OPEN_DEFAULT) + free_fdtable(cur_fdt); } else { /* Somebody else expanded, so undo our attempt */ - __free_fdtable(new_fdt); + free_fdarr(new_fdt); + free_fdset(new_fdt); + kfree(new_fdt); } return 1; } @@ -316,11 +230,10 @@ int expand_files(struct files_struct *files, int nr) fdt = files_fdtable(files); /* Do we need to expand? */ - if (nr < fdt->max_fdset && nr < fdt->max_fds) + if (nr < fdt->max_fds) return 0; /* Can we expand? 
*/ - if (fdt->max_fdset >= NR_OPEN || fdt->max_fds >= NR_OPEN || - nr >= NR_OPEN) + if (nr >= NR_OPEN) return -EMFILE; /* All good, so we try */ diff --git a/fs/file_table.c b/fs/file_table.c index 24f25a057d9..4c17a18d8c1 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -152,8 +152,8 @@ EXPORT_SYMBOL(fput); */ void fastcall __fput(struct file *file) { - struct dentry *dentry = file->f_dentry; - struct vfsmount *mnt = file->f_vfsmnt; + struct dentry *dentry = file->f_path.dentry; + struct vfsmount *mnt = file->f_path.mnt; struct inode *inode = dentry->d_inode; might_sleep(); @@ -176,8 +176,8 @@ void fastcall __fput(struct file *file) put_write_access(inode); put_pid(file->f_owner.pid); file_kill(file); - file->f_dentry = NULL; - file->f_vfsmnt = NULL; + file->f_path.dentry = NULL; + file->f_path.mnt = NULL; file_free(file); dput(dentry); mntput(mnt); @@ -271,7 +271,7 @@ int fs_may_remount_ro(struct super_block *sb) file_list_lock(); list_for_each(p, &sb->s_files) { struct file *file = list_entry(p, struct file, f_u.fu_list); - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file->f_path.dentry->d_inode; /* File with pending delete? 
*/ if (inode->i_nlink == 0) diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c index 43886fa00a2..3995d7fbeda 100644 --- a/fs/freevxfs/vxfs_lookup.c +++ b/fs/freevxfs/vxfs_lookup.c @@ -240,7 +240,7 @@ vxfs_lookup(struct inode *dip, struct dentry *dp, struct nameidata *nd) static int vxfs_readdir(struct file *fp, void *retp, filldir_t filler) { - struct inode *ip = fp->f_dentry->d_inode; + struct inode *ip = fp->f_path.dentry->d_inode; struct super_block *sbp = ip->i_sb; u_long bsize = sbp->s_blocksize; u_long page, npages, block, pblocks, nblocks, offset; diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index c403b66ec83..a4b142a6a2c 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -251,8 +251,19 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) WARN_ON(inode->i_state & I_WILL_FREE); if ((wbc->sync_mode != WB_SYNC_ALL) && (inode->i_state & I_LOCK)) { + struct address_space *mapping = inode->i_mapping; + int ret; + list_move(&inode->i_list, &inode->i_sb->s_dirty); - return 0; + + /* + * Even if we don't actually write the inode itself here, + * we can at least start some of the data writeout.. 
+ */ + spin_unlock(&inode_lock); + ret = do_writepages(mapping, wbc); + spin_lock(&inode_lock); + return ret; } /* diff --git a/fs/fuse/control.c b/fs/fuse/control.c index 16b39c053d4..1794305f9ed 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -23,7 +23,7 @@ static struct fuse_conn *fuse_ctl_file_conn_get(struct file *file) { struct fuse_conn *fc; mutex_lock(&fuse_mutex); - fc = file->f_dentry->d_inode->i_private; + fc = file->f_path.dentry->d_inode->i_private; if (fc) fc = fuse_conn_get(fc); mutex_unlock(&fuse_mutex); @@ -193,8 +193,12 @@ static int fuse_ctl_get_sb(struct file_system_type *fs_type, int flags, static void fuse_ctl_kill_sb(struct super_block *sb) { + struct fuse_conn *fc; + mutex_lock(&fuse_mutex); fuse_control_sb = NULL; + list_for_each_entry(fc, &fuse_conn_list, entry) + fc->ctl_ndents = 0; mutex_unlock(&fuse_mutex); kill_litter_super(sb); diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 1cabdb229ad..40080477ceb 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -856,7 +856,7 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir) int err; size_t nbytes; struct page *page; - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file->f_path.dentry->d_inode; struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_req *req; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 128f79c4080..f63efe1337e 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -141,8 +141,8 @@ int fuse_release_common(struct inode *inode, struct file *file, int isdir) isdir ? 
FUSE_RELEASEDIR : FUSE_RELEASE); /* Hold vfsmount and dentry until release is finished */ - req->vfsmount = mntget(file->f_vfsmnt); - req->dentry = dget(file->f_dentry); + req->vfsmount = mntget(file->f_path.mnt); + req->dentry = dget(file->f_path.dentry); request_send_background(fc, req); } @@ -184,7 +184,7 @@ static u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id) static int fuse_flush(struct file *file, fl_owner_t id) { - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file->f_path.dentry->d_inode; struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_file *ff = file->private_data; struct fuse_req *req; @@ -483,10 +483,8 @@ static int fuse_commit_write(struct file *file, struct page *page, i_size_write(inode, pos); spin_unlock(&fc->lock); - if (offset == 0 && to == PAGE_CACHE_SIZE) { - clear_page_dirty(page); + if (offset == 0 && to == PAGE_CACHE_SIZE) SetPageUptodate(page); - } } fuse_invalidate_attr(inode); return err; @@ -533,7 +531,7 @@ static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf, static ssize_t fuse_direct_io(struct file *file, const char __user *buf, size_t count, loff_t *ppos, int write) { - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file->f_path.dentry->d_inode; struct fuse_conn *fc = get_fuse_conn(inode); size_t nmax = write ? 
fc->max_write : fc->max_read; loff_t pos = *ppos; @@ -607,7 +605,7 @@ static ssize_t fuse_direct_read(struct file *file, char __user *buf, static ssize_t fuse_direct_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file->f_path.dentry->d_inode; ssize_t res; /* Don't allow parallel writes to the same file */ mutex_lock(&inode->i_mutex); @@ -662,7 +660,7 @@ static int convert_fuse_file_lock(const struct fuse_file_lock *ffl, static void fuse_lk_fill(struct fuse_req *req, struct file *file, const struct file_lock *fl, int opcode, pid_t pid) { - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file->f_path.dentry->d_inode; struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_file *ff = file->private_data; struct fuse_lk_in *arg = &req->misc.lk_in; @@ -682,7 +680,7 @@ static void fuse_lk_fill(struct fuse_req *req, struct file *file, static int fuse_getlk(struct file *file, struct file_lock *fl) { - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file->f_path.dentry->d_inode; struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_req *req; struct fuse_lk_out outarg; @@ -707,7 +705,7 @@ static int fuse_getlk(struct file *file, struct file_lock *fl) static int fuse_setlk(struct file *file, struct file_lock *fl) { - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file->f_path.dentry->d_inode; struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_req *req; int opcode = (fl->fl_flags & FL_SLEEP) ? 
FUSE_SETLKW : FUSE_SETLK; @@ -734,7 +732,7 @@ static int fuse_setlk(struct file *file, struct file_lock *fl) static int fuse_file_lock(struct file *file, int cmd, struct file_lock *fl) { - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file->f_path.dentry->d_inode; struct fuse_conn *fc = get_fuse_conn(inode); int err; diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig index c0791cbacad..de8e64c03f7 100644 --- a/fs/gfs2/Kconfig +++ b/fs/gfs2/Kconfig @@ -4,42 +4,43 @@ config GFS2_FS select FS_POSIX_ACL select CRC32 help - A cluster filesystem. + A cluster filesystem. - Allows a cluster of computers to simultaneously use a block device - that is shared between them (with FC, iSCSI, NBD, etc...). GFS reads - and writes to the block device like a local filesystem, but also uses - a lock module to allow the computers coordinate their I/O so - filesystem consistency is maintained. One of the nifty features of - GFS is perfect consistency -- changes made to the filesystem on one - machine show up immediately on all other machines in the cluster. + Allows a cluster of computers to simultaneously use a block device + that is shared between them (with FC, iSCSI, NBD, etc...). GFS reads + and writes to the block device like a local filesystem, but also uses + a lock module to allow the computers coordinate their I/O so + filesystem consistency is maintained. One of the nifty features of + GFS is perfect consistency -- changes made to the filesystem on one + machine show up immediately on all other machines in the cluster. - To use the GFS2 filesystem, you will need to enable one or more of - the below locking modules. Documentation and utilities for GFS2 can - be found here: http://sources.redhat.com/cluster + To use the GFS2 filesystem, you will need to enable one or more of + the below locking modules. 
Documentation and utilities for GFS2 can + be found here: http://sources.redhat.com/cluster config GFS2_FS_LOCKING_NOLOCK tristate "GFS2 \"nolock\" locking module" depends on GFS2_FS help - Single node locking module for GFS2. + Single node locking module for GFS2. - Use this module if you want to use GFS2 on a single node without - its clustering features. You can still take advantage of the - large file support, and upgrade to running a full cluster later on - if required. + Use this module if you want to use GFS2 on a single node without + its clustering features. You can still take advantage of the + large file support, and upgrade to running a full cluster later on + if required. - If you will only be using GFS2 in cluster mode, you do not need this - module. + If you will only be using GFS2 in cluster mode, you do not need this + module. config GFS2_FS_LOCKING_DLM tristate "GFS2 DLM locking module" - depends on GFS2_FS + depends on GFS2_FS && SYSFS && NET && INET && (IPV6 || IPV6=n) + select IP_SCTP if DLM_SCTP + select CONFIGFS_FS select DLM help - Multiple node locking module for GFS2 - - Most users of GFS2 will require this module. It provides the locking - interface between GFS2 and the DLM, which is required to use GFS2 - in a cluster environment. + Multiple node locking module for GFS2 + Most users of GFS2 will require this module. It provides the locking + interface between GFS2 and the DLM, which is required to use GFS2 + in a cluster environment. 
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 8240c1ff94f..113f6c9110c 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -773,7 +773,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, gfs2_free_data(ip, bstart, blen); } - ip->i_inode.i_mtime.tv_sec = ip->i_inode.i_ctime.tv_sec = get_seconds(); + ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME_SEC; gfs2_dinode_out(ip, dibh->b_data); @@ -848,7 +848,7 @@ static int do_grow(struct gfs2_inode *ip, u64 size) } ip->i_di.di_size = size; - ip->i_inode.i_mtime.tv_sec = ip->i_inode.i_ctime.tv_sec = get_seconds(); + ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME_SEC; error = gfs2_meta_inode_buffer(ip, &dibh); if (error) @@ -963,7 +963,7 @@ static int trunc_start(struct gfs2_inode *ip, u64 size) if (gfs2_is_stuffed(ip)) { ip->i_di.di_size = size; - ip->i_inode.i_mtime.tv_sec = ip->i_inode.i_ctime.tv_sec = get_seconds(); + ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME_SEC; gfs2_trans_add_bh(ip->i_gl, dibh, 1); gfs2_dinode_out(ip, dibh->b_data); gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + size); @@ -975,7 +975,7 @@ static int trunc_start(struct gfs2_inode *ip, u64 size) if (!error) { ip->i_di.di_size = size; - ip->i_inode.i_mtime.tv_sec = ip->i_inode.i_ctime.tv_sec = get_seconds(); + ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME_SEC; ip->i_di.di_flags |= GFS2_DIF_TRUNC_IN_PROG; gfs2_trans_add_bh(ip->i_gl, dibh, 1); gfs2_dinode_out(ip, dibh->b_data); @@ -1048,7 +1048,7 @@ static int trunc_end(struct gfs2_inode *ip) ip->i_num.no_addr; gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); } - ip->i_inode.i_mtime.tv_sec = ip->i_inode.i_ctime.tv_sec = get_seconds(); + ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME_SEC; ip->i_di.di_flags &= ~GFS2_DIF_TRUNC_IN_PROG; gfs2_trans_add_bh(ip->i_gl, dibh, 1); diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 0fdcb7713cd..c93ca8f361b 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -131,7 +131,7 
@@ static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf, memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size); if (ip->i_di.di_size < offset + size) ip->i_di.di_size = offset + size; - ip->i_inode.i_mtime.tv_sec = ip->i_inode.i_ctime.tv_sec = get_seconds(); + ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME_SEC; gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); @@ -229,7 +229,7 @@ out: if (ip->i_di.di_size < offset + copied) ip->i_di.di_size = offset + copied; - ip->i_inode.i_mtime.tv_sec = ip->i_inode.i_ctime.tv_sec = get_seconds(); + ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME_SEC; gfs2_trans_add_bh(ip->i_gl, dibh, 1); gfs2_dinode_out(ip, dibh->b_data); @@ -1198,12 +1198,11 @@ static int compare_dents(const void *a, const void *b) */ static int do_filldir_main(struct gfs2_inode *dip, u64 *offset, - void *opaque, gfs2_filldir_t filldir, + void *opaque, filldir_t filldir, const struct gfs2_dirent **darr, u32 entries, int *copied) { const struct gfs2_dirent *dent, *dent_next; - struct gfs2_inum_host inum; u64 off, off_next; unsigned int x, y; int run = 0; @@ -1240,11 +1239,9 @@ static int do_filldir_main(struct gfs2_inode *dip, u64 *offset, *offset = off; } - gfs2_inum_in(&inum, (char *)&dent->de_inum); - error = filldir(opaque, (const char *)(dent + 1), be16_to_cpu(dent->de_name_len), - off, &inum, + off, be64_to_cpu(dent->de_inum.no_addr), be16_to_cpu(dent->de_type)); if (error) return 1; @@ -1262,8 +1259,8 @@ static int do_filldir_main(struct gfs2_inode *dip, u64 *offset, } static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque, - gfs2_filldir_t filldir, int *copied, - unsigned *depth, u64 leaf_no) + filldir_t filldir, int *copied, unsigned *depth, + u64 leaf_no) { struct gfs2_inode *ip = GFS2_I(inode); struct buffer_head *bh; @@ -1343,7 +1340,7 @@ out: */ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque, - gfs2_filldir_t filldir) + filldir_t filldir) { struct 
gfs2_inode *dip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); @@ -1402,7 +1399,7 @@ out: } int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque, - gfs2_filldir_t filldir) + filldir_t filldir) { struct gfs2_inode *dip = GFS2_I(inode); struct dirent_gather g; @@ -1568,7 +1565,7 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name, break; gfs2_trans_add_bh(ip->i_gl, bh, 1); ip->i_di.di_entries++; - ip->i_inode.i_mtime.tv_sec = ip->i_inode.i_ctime.tv_sec = get_seconds(); + ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME_SEC; gfs2_dinode_out(ip, bh->b_data); brelse(bh); error = 0; @@ -1654,7 +1651,7 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *name) gfs2_consist_inode(dip); gfs2_trans_add_bh(dip->i_gl, bh, 1); dip->i_di.di_entries--; - dip->i_inode.i_mtime.tv_sec = dip->i_inode.i_ctime.tv_sec = get_seconds(); + dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME_SEC; gfs2_dinode_out(dip, bh->b_data); brelse(bh); mark_inode_dirty(&dip->i_inode); @@ -1702,7 +1699,7 @@ int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, gfs2_trans_add_bh(dip->i_gl, bh, 1); } - dip->i_inode.i_mtime.tv_sec = dip->i_inode.i_ctime.tv_sec = get_seconds(); + dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME_SEC; gfs2_dinode_out(dip, bh->b_data); brelse(bh); return 0; diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h index b21b33668a5..48fe89046bb 100644 --- a/fs/gfs2/dir.h +++ b/fs/gfs2/dir.h @@ -16,30 +16,13 @@ struct inode; struct gfs2_inode; struct gfs2_inum; -/** - * gfs2_filldir_t - Report a directory entry to the caller of gfs2_dir_read() - * @opaque: opaque data used by the function - * @name: the name of the directory entry - * @length: the length of the name - * @offset: the entry's offset in the directory - * @inum: the inode number the entry points to - * @type: the type of inode the entry points to - * - * Returns: 0 on success, 1 if buffer full - */ - -typedef int (*gfs2_filldir_t) (void 
*opaque, - const char *name, unsigned int length, - u64 offset, - struct gfs2_inum_host *inum, unsigned int type); - int gfs2_dir_search(struct inode *dir, const struct qstr *filename, struct gfs2_inum_host *inum, unsigned int *type); int gfs2_dir_add(struct inode *inode, const struct qstr *filename, const struct gfs2_inum_host *inum, unsigned int type); int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *filename); -int gfs2_dir_read(struct inode *inode, u64 * offset, void *opaque, - gfs2_filldir_t filldir); +int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque, + filldir_t filldir); int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, struct gfs2_inum_host *new_inum, unsigned int new_type); diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c index ebebbdcd705..0c83c7f4dda 100644 --- a/fs/gfs2/eattr.c +++ b/fs/gfs2/eattr.c @@ -301,7 +301,7 @@ static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh, error = gfs2_meta_inode_buffer(ip, &dibh); if (!error) { - ip->i_inode.i_ctime.tv_sec = get_seconds(); + ip->i_inode.i_ctime = CURRENT_TIME_SEC; gfs2_trans_add_bh(ip->i_gl, dibh, 1); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); @@ -718,7 +718,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er, (er->er_mode & S_IFMT)); ip->i_inode.i_mode = er->er_mode; } - ip->i_inode.i_ctime.tv_sec = get_seconds(); + ip->i_inode.i_ctime = CURRENT_TIME_SEC; gfs2_trans_add_bh(ip->i_gl, dibh, 1); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); @@ -853,7 +853,7 @@ static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh, (ip->i_inode.i_mode & S_IFMT) == (er->er_mode & S_IFMT)); ip->i_inode.i_mode = er->er_mode; } - ip->i_inode.i_ctime.tv_sec = get_seconds(); + ip->i_inode.i_ctime = CURRENT_TIME_SEC; gfs2_trans_add_bh(ip->i_gl, dibh, 1); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); @@ -1134,7 +1134,7 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct 
gfs2_ea_location *el) error = gfs2_meta_inode_buffer(ip, &dibh); if (!error) { - ip->i_inode.i_ctime.tv_sec = get_seconds(); + ip->i_inode.i_ctime = CURRENT_TIME_SEC; gfs2_trans_add_bh(ip->i_gl, dibh, 1); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 438146904b5..6618c119025 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -19,6 +19,8 @@ #include <linux/gfs2_ondisk.h> #include <linux/list.h> #include <linux/lm_interface.h> +#include <linux/wait.h> +#include <linux/rwsem.h> #include <asm/uaccess.h> #include "gfs2.h" @@ -33,11 +35,6 @@ #include "super.h" #include "util.h" -struct greedy { - struct gfs2_holder gr_gh; - struct delayed_work gr_work; -}; - struct gfs2_gl_hash_bucket { struct hlist_head hb_list; }; @@ -47,6 +44,9 @@ typedef void (*glock_examiner) (struct gfs2_glock * gl); static int gfs2_dump_lockstate(struct gfs2_sbd *sdp); static int dump_glock(struct gfs2_glock *gl); static int dump_inode(struct gfs2_inode *ip); +static void gfs2_glock_xmote_th(struct gfs2_holder *gh); +static void gfs2_glock_drop_th(struct gfs2_glock *gl); +static DECLARE_RWSEM(gfs2_umount_flush_sem); #define GFS2_GL_HASH_SHIFT 15 #define GFS2_GL_HASH_SIZE (1 << GFS2_GL_HASH_SHIFT) @@ -213,30 +213,6 @@ out: } /** - * queue_empty - check to see if a glock's queue is empty - * @gl: the glock - * @head: the head of the queue to check - * - * This function protects the list in the event that a process already - * has a holder on the list and is adding a second holder for itself. - * The glmutex lock is what generally prevents processes from working - * on the same glock at once, but the special case of adding a second - * holder for yourself ("recursive" locking) doesn't involve locking - * glmutex, making the spin lock necessary. 
- * - * Returns: 1 if the queue is empty - */ - -static inline int queue_empty(struct gfs2_glock *gl, struct list_head *head) -{ - int empty; - spin_lock(&gl->gl_spin); - empty = list_empty(head); - spin_unlock(&gl->gl_spin); - return empty; -} - -/** * search_bucket() - Find struct gfs2_glock by lock number * @bucket: the bucket to search * @name: The lock name @@ -395,11 +371,6 @@ void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags, gh->gh_flags = flags; gh->gh_error = 0; gh->gh_iflags = 0; - init_completion(&gh->gh_wait); - - if (gh->gh_state == LM_ST_EXCLUSIVE) - gh->gh_flags |= GL_LOCAL_EXCL; - gfs2_glock_hold(gl); } @@ -417,9 +388,6 @@ void gfs2_holder_reinit(unsigned int state, unsigned flags, struct gfs2_holder * { gh->gh_state = state; gh->gh_flags = flags; - if (gh->gh_state == LM_ST_EXCLUSIVE) - gh->gh_flags |= GL_LOCAL_EXCL; - gh->gh_iflags &= 1 << HIF_ALLOCED; gh->gh_ip = (unsigned long)__builtin_return_address(0); } @@ -479,6 +447,29 @@ static void gfs2_holder_put(struct gfs2_holder *gh) kfree(gh); } +static void gfs2_holder_dispose_or_wake(struct gfs2_holder *gh) +{ + if (test_bit(HIF_DEALLOC, &gh->gh_iflags)) { + gfs2_holder_put(gh); + return; + } + clear_bit(HIF_WAIT, &gh->gh_iflags); + smp_mb(); + wake_up_bit(&gh->gh_iflags, HIF_WAIT); +} + +static int holder_wait(void *word) +{ + schedule(); + return 0; +} + +static void wait_on_holder(struct gfs2_holder *gh) +{ + might_sleep(); + wait_on_bit(&gh->gh_iflags, HIF_WAIT, holder_wait, TASK_UNINTERRUPTIBLE); +} + /** * rq_mutex - process a mutex request in the queue * @gh: the glock holder @@ -493,7 +484,9 @@ static int rq_mutex(struct gfs2_holder *gh) list_del_init(&gh->gh_list); /* gh->gh_error never examined. 
*/ set_bit(GLF_LOCK, &gl->gl_flags); - complete(&gh->gh_wait); + clear_bit(HIF_WAIT, &gh->gh_iflags); + smp_mb(); + wake_up_bit(&gh->gh_iflags, HIF_WAIT); return 1; } @@ -511,7 +504,6 @@ static int rq_promote(struct gfs2_holder *gh) { struct gfs2_glock *gl = gh->gh_gl; struct gfs2_sbd *sdp = gl->gl_sbd; - const struct gfs2_glock_operations *glops = gl->gl_ops; if (!relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) { if (list_empty(&gl->gl_holders)) { @@ -526,7 +518,7 @@ static int rq_promote(struct gfs2_holder *gh) gfs2_reclaim_glock(sdp); } - glops->go_xmote_th(gl, gh->gh_state, gh->gh_flags); + gfs2_glock_xmote_th(gh); spin_lock(&gl->gl_spin); } return 1; @@ -537,11 +529,11 @@ static int rq_promote(struct gfs2_holder *gh) set_bit(GLF_LOCK, &gl->gl_flags); } else { struct gfs2_holder *next_gh; - if (gh->gh_flags & GL_LOCAL_EXCL) + if (gh->gh_state == LM_ST_EXCLUSIVE) return 1; next_gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list); - if (next_gh->gh_flags & GL_LOCAL_EXCL) + if (next_gh->gh_state == LM_ST_EXCLUSIVE) return 1; } @@ -549,7 +541,7 @@ static int rq_promote(struct gfs2_holder *gh) gh->gh_error = 0; set_bit(HIF_HOLDER, &gh->gh_iflags); - complete(&gh->gh_wait); + gfs2_holder_dispose_or_wake(gh); return 0; } @@ -564,7 +556,6 @@ static int rq_promote(struct gfs2_holder *gh) static int rq_demote(struct gfs2_holder *gh) { struct gfs2_glock *gl = gh->gh_gl; - const struct gfs2_glock_operations *glops = gl->gl_ops; if (!list_empty(&gl->gl_holders)) return 1; @@ -573,10 +564,7 @@ static int rq_demote(struct gfs2_holder *gh) list_del_init(&gh->gh_list); gh->gh_error = 0; spin_unlock(&gl->gl_spin); - if (test_bit(HIF_DEALLOC, &gh->gh_iflags)) - gfs2_holder_put(gh); - else - complete(&gh->gh_wait); + gfs2_holder_dispose_or_wake(gh); spin_lock(&gl->gl_spin); } else { gl->gl_req_gh = gh; @@ -585,9 +573,9 @@ static int rq_demote(struct gfs2_holder *gh) if (gh->gh_state == LM_ST_UNLOCKED || gl->gl_state != LM_ST_EXCLUSIVE) - 
glops->go_drop_th(gl); + gfs2_glock_drop_th(gl); else - glops->go_xmote_th(gl, gh->gh_state, gh->gh_flags); + gfs2_glock_xmote_th(gh); spin_lock(&gl->gl_spin); } @@ -596,30 +584,6 @@ static int rq_demote(struct gfs2_holder *gh) } /** - * rq_greedy - process a queued request to drop greedy status - * @gh: the glock holder - * - * Returns: 1 if the queue is blocked - */ - -static int rq_greedy(struct gfs2_holder *gh) -{ - struct gfs2_glock *gl = gh->gh_gl; - - list_del_init(&gh->gh_list); - /* gh->gh_error never examined. */ - clear_bit(GLF_GREEDY, &gl->gl_flags); - spin_unlock(&gl->gl_spin); - - gfs2_holder_uninit(gh); - kfree(container_of(gh, struct greedy, gr_gh)); - - spin_lock(&gl->gl_spin); - - return 0; -} - -/** * run_queue - process holder structures on a glock * @gl: the glock * @@ -649,8 +613,6 @@ static void run_queue(struct gfs2_glock *gl) if (test_bit(HIF_DEMOTE, &gh->gh_iflags)) blocked = rq_demote(gh); - else if (test_bit(HIF_GREEDY, &gh->gh_iflags)) - blocked = rq_greedy(gh); else gfs2_assert_warn(gl->gl_sbd, 0); @@ -684,6 +646,8 @@ static void gfs2_glmutex_lock(struct gfs2_glock *gl) gfs2_holder_init(gl, 0, 0, &gh); set_bit(HIF_MUTEX, &gh.gh_iflags); + if (test_and_set_bit(HIF_WAIT, &gh.gh_iflags)) + BUG(); spin_lock(&gl->gl_spin); if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) { @@ -691,11 +655,13 @@ static void gfs2_glmutex_lock(struct gfs2_glock *gl) } else { gl->gl_owner = current; gl->gl_ip = (unsigned long)__builtin_return_address(0); - complete(&gh.gh_wait); + clear_bit(HIF_WAIT, &gh.gh_iflags); + smp_mb(); + wake_up_bit(&gh.gh_iflags, HIF_WAIT); } spin_unlock(&gl->gl_spin); - wait_for_completion(&gh.gh_wait); + wait_on_holder(&gh); gfs2_holder_uninit(&gh); } @@ -774,6 +740,7 @@ restart: return; set_bit(HIF_DEMOTE, &new_gh->gh_iflags); set_bit(HIF_DEALLOC, &new_gh->gh_iflags); + set_bit(HIF_WAIT, &new_gh->gh_iflags); goto restart; } @@ -825,7 +792,7 @@ static void xmote_bh(struct gfs2_glock *gl, unsigned int ret) int op_done = 1; 
gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags)); - gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders)); + gfs2_assert_warn(sdp, list_empty(&gl->gl_holders)); gfs2_assert_warn(sdp, !(ret & LM_OUT_ASYNC)); state_change(gl, ret & LM_OUT_ST_MASK); @@ -908,12 +875,8 @@ static void xmote_bh(struct gfs2_glock *gl, unsigned int ret) gfs2_glock_put(gl); - if (gh) { - if (test_bit(HIF_DEALLOC, &gh->gh_iflags)) - gfs2_holder_put(gh); - else - complete(&gh->gh_wait); - } + if (gh) + gfs2_holder_dispose_or_wake(gh); } /** @@ -924,23 +887,26 @@ static void xmote_bh(struct gfs2_glock *gl, unsigned int ret) * */ -void gfs2_glock_xmote_th(struct gfs2_glock *gl, unsigned int state, int flags) +void gfs2_glock_xmote_th(struct gfs2_holder *gh) { + struct gfs2_glock *gl = gh->gh_gl; struct gfs2_sbd *sdp = gl->gl_sbd; + int flags = gh->gh_flags; + unsigned state = gh->gh_state; const struct gfs2_glock_operations *glops = gl->gl_ops; int lck_flags = flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP | LM_FLAG_ANY | LM_FLAG_PRIORITY); unsigned int lck_ret; + if (glops->go_xmote_th) + glops->go_xmote_th(gl); + gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags)); - gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders)); + gfs2_assert_warn(sdp, list_empty(&gl->gl_holders)); gfs2_assert_warn(sdp, state != LM_ST_UNLOCKED); gfs2_assert_warn(sdp, state != gl->gl_state); - if (gl->gl_state == LM_ST_EXCLUSIVE && glops->go_sync) - glops->go_sync(gl); - gfs2_glock_hold(gl); gl->gl_req_bh = xmote_bh; @@ -971,10 +937,8 @@ static void drop_bh(struct gfs2_glock *gl, unsigned int ret) const struct gfs2_glock_operations *glops = gl->gl_ops; struct gfs2_holder *gh = gl->gl_req_gh; - clear_bit(GLF_PREFETCH, &gl->gl_flags); - gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags)); - gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders)); + gfs2_assert_warn(sdp, list_empty(&gl->gl_holders)); gfs2_assert_warn(sdp, !ret); state_change(gl, LM_ST_UNLOCKED); @@ -1001,12 +965,8 @@ static 
void drop_bh(struct gfs2_glock *gl, unsigned int ret) gfs2_glock_put(gl); - if (gh) { - if (test_bit(HIF_DEALLOC, &gh->gh_iflags)) - gfs2_holder_put(gh); - else - complete(&gh->gh_wait); - } + if (gh) + gfs2_holder_dispose_or_wake(gh); } /** @@ -1015,19 +975,19 @@ static void drop_bh(struct gfs2_glock *gl, unsigned int ret) * */ -void gfs2_glock_drop_th(struct gfs2_glock *gl) +static void gfs2_glock_drop_th(struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_sbd; const struct gfs2_glock_operations *glops = gl->gl_ops; unsigned int ret; + if (glops->go_drop_th) + glops->go_drop_th(gl); + gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags)); - gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders)); + gfs2_assert_warn(sdp, list_empty(&gl->gl_holders)); gfs2_assert_warn(sdp, gl->gl_state != LM_ST_UNLOCKED); - if (gl->gl_state == LM_ST_EXCLUSIVE && glops->go_sync) - glops->go_sync(gl); - gfs2_glock_hold(gl); gl->gl_req_bh = drop_bh; @@ -1107,8 +1067,7 @@ static int glock_wait_internal(struct gfs2_holder *gh) if (gh->gh_flags & LM_FLAG_PRIORITY) do_cancels(gh); - wait_for_completion(&gh->gh_wait); - + wait_on_holder(gh); if (gh->gh_error) return gh->gh_error; @@ -1164,6 +1123,8 @@ static void add_to_queue(struct gfs2_holder *gh) struct gfs2_holder *existing; BUG_ON(!gh->gh_owner); + if (test_and_set_bit(HIF_WAIT, &gh->gh_iflags)) + BUG(); existing = find_holder_by_owner(&gl->gl_holders, gh->gh_owner); if (existing) { @@ -1227,8 +1188,6 @@ restart: } } - clear_bit(GLF_PREFETCH, &gl->gl_flags); - return error; } @@ -1321,98 +1280,6 @@ void gfs2_glock_dq(struct gfs2_holder *gh) } /** - * gfs2_glock_prefetch - Try to prefetch a glock - * @gl: the glock - * @state: the state to prefetch in - * @flags: flags passed to go_xmote_th() - * - */ - -static void gfs2_glock_prefetch(struct gfs2_glock *gl, unsigned int state, - int flags) -{ - const struct gfs2_glock_operations *glops = gl->gl_ops; - - spin_lock(&gl->gl_spin); - - if (test_bit(GLF_LOCK, &gl->gl_flags) || 
!list_empty(&gl->gl_holders) || - !list_empty(&gl->gl_waiters1) || !list_empty(&gl->gl_waiters2) || - !list_empty(&gl->gl_waiters3) || - relaxed_state_ok(gl->gl_state, state, flags)) { - spin_unlock(&gl->gl_spin); - return; - } - - set_bit(GLF_PREFETCH, &gl->gl_flags); - set_bit(GLF_LOCK, &gl->gl_flags); - spin_unlock(&gl->gl_spin); - - glops->go_xmote_th(gl, state, flags); -} - -static void greedy_work(struct work_struct *work) -{ - struct greedy *gr = container_of(work, struct greedy, gr_work.work); - struct gfs2_holder *gh = &gr->gr_gh; - struct gfs2_glock *gl = gh->gh_gl; - const struct gfs2_glock_operations *glops = gl->gl_ops; - - clear_bit(GLF_SKIP_WAITERS2, &gl->gl_flags); - - if (glops->go_greedy) - glops->go_greedy(gl); - - spin_lock(&gl->gl_spin); - - if (list_empty(&gl->gl_waiters2)) { - clear_bit(GLF_GREEDY, &gl->gl_flags); - spin_unlock(&gl->gl_spin); - gfs2_holder_uninit(gh); - kfree(gr); - } else { - gfs2_glock_hold(gl); - list_add_tail(&gh->gh_list, &gl->gl_waiters2); - run_queue(gl); - spin_unlock(&gl->gl_spin); - gfs2_glock_put(gl); - } -} - -/** - * gfs2_glock_be_greedy - - * @gl: - * @time: - * - * Returns: 0 if go_greedy will be called, 1 otherwise - */ - -int gfs2_glock_be_greedy(struct gfs2_glock *gl, unsigned int time) -{ - struct greedy *gr; - struct gfs2_holder *gh; - - if (!time || gl->gl_sbd->sd_args.ar_localcaching || - test_and_set_bit(GLF_GREEDY, &gl->gl_flags)) - return 1; - - gr = kmalloc(sizeof(struct greedy), GFP_KERNEL); - if (!gr) { - clear_bit(GLF_GREEDY, &gl->gl_flags); - return 1; - } - gh = &gr->gr_gh; - - gfs2_holder_init(gl, 0, 0, gh); - set_bit(HIF_GREEDY, &gh->gh_iflags); - INIT_DELAYED_WORK(&gr->gr_work, greedy_work); - - set_bit(GLF_SKIP_WAITERS2, &gl->gl_flags); - schedule_delayed_work(&gr->gr_work, time); - - return 0; -} - -/** * gfs2_glock_dq_uninit - dequeue a holder from a glock and initialize it * @gh: the holder structure * @@ -1470,10 +1337,7 @@ static int glock_compare(const void *arg_a, const void *arg_b) 
return 1; if (a->ln_number < b->ln_number) return -1; - if (gh_a->gh_state == LM_ST_SHARED && gh_b->gh_state == LM_ST_EXCLUSIVE) - return 1; - if (!(gh_a->gh_flags & GL_LOCAL_EXCL) && (gh_b->gh_flags & GL_LOCAL_EXCL)) - return 1; + BUG_ON(gh_a->gh_gl->gl_ops->go_type == gh_b->gh_gl->gl_ops->go_type); return 0; } @@ -1618,34 +1482,6 @@ void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs) } /** - * gfs2_glock_prefetch_num - prefetch a glock based on lock number - * @sdp: the filesystem - * @number: the lock number - * @glops: the glock operations for the type of glock - * @state: the state to acquire the glock in - * @flags: modifier flags for the aquisition - * - * Returns: errno - */ - -void gfs2_glock_prefetch_num(struct gfs2_sbd *sdp, u64 number, - const struct gfs2_glock_operations *glops, - unsigned int state, int flags) -{ - struct gfs2_glock *gl; - int error; - - if (atomic_read(&sdp->sd_reclaim_count) < - gfs2_tune_get(sdp, gt_reclaim_limit)) { - error = gfs2_glock_get(sdp, number, glops, CREATE, &gl); - if (!error) { - gfs2_glock_prefetch(gl, state, flags); - gfs2_glock_put(gl); - } - } -} - -/** * gfs2_lvb_hold - attach a LVB from a glock * @gl: The glock in question * @@ -1703,8 +1539,6 @@ static void blocking_cb(struct gfs2_sbd *sdp, struct lm_lockname *name, if (!gl) return; - if (gl->gl_ops->go_callback) - gl->gl_ops->go_callback(gl, state); handle_callback(gl, state); spin_lock(&gl->gl_spin); @@ -1746,12 +1580,14 @@ void gfs2_glock_cb(void *cb_data, unsigned int type, void *data) struct lm_async_cb *async = data; struct gfs2_glock *gl; + down_read(&gfs2_umount_flush_sem); gl = gfs2_glock_find(sdp, &async->lc_name); if (gfs2_assert_warn(sdp, gl)) return; if (!gfs2_assert_warn(sdp, gl->gl_req_bh)) gl->gl_req_bh(gl, async->lc_ret); gfs2_glock_put(gl); + up_read(&gfs2_umount_flush_sem); return; } @@ -1781,15 +1617,11 @@ void gfs2_glock_cb(void *cb_data, unsigned int type, void *data) static int demote_ok(struct gfs2_glock *gl) { - 
struct gfs2_sbd *sdp = gl->gl_sbd; const struct gfs2_glock_operations *glops = gl->gl_ops; int demote = 1; if (test_bit(GLF_STICKY, &gl->gl_flags)) demote = 0; - else if (test_bit(GLF_PREFETCH, &gl->gl_flags)) - demote = time_after_eq(jiffies, gl->gl_stamp + - gfs2_tune_get(sdp, gt_prefetch_secs) * HZ); else if (glops->go_demote_ok) demote = glops->go_demote_ok(gl); @@ -1845,7 +1677,7 @@ void gfs2_reclaim_glock(struct gfs2_sbd *sdp) atomic_inc(&sdp->sd_reclaimed); if (gfs2_glmutex_trylock(gl)) { - if (queue_empty(gl, &gl->gl_holders) && + if (list_empty(&gl->gl_holders) && gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl)) handle_callback(gl, LM_ST_UNLOCKED); gfs2_glmutex_unlock(gl); @@ -1909,7 +1741,7 @@ static void scan_glock(struct gfs2_glock *gl) return; if (gfs2_glmutex_trylock(gl)) { - if (queue_empty(gl, &gl->gl_holders) && + if (list_empty(&gl->gl_holders) && gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl)) goto out_schedule; gfs2_glmutex_unlock(gl); @@ -1958,7 +1790,7 @@ static void clear_glock(struct gfs2_glock *gl) } if (gfs2_glmutex_trylock(gl)) { - if (queue_empty(gl, &gl->gl_holders) && + if (list_empty(&gl->gl_holders) && gl->gl_state != LM_ST_UNLOCKED) handle_callback(gl, LM_ST_UNLOCKED); gfs2_glmutex_unlock(gl); @@ -2000,7 +1832,9 @@ void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait) t = jiffies; } + down_write(&gfs2_umount_flush_sem); invalidate_inodes(sdp->sd_vfs); + up_write(&gfs2_umount_flush_sem); msleep(10); } } diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index fb39108fc05..f50e40ceca4 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -20,7 +20,6 @@ #define LM_FLAG_ANY 0x00000008 #define LM_FLAG_PRIORITY 0x00000010 */ -#define GL_LOCAL_EXCL 0x00000020 #define GL_ASYNC 0x00000040 #define GL_EXACT 0x00000080 #define GL_SKIP 0x00000100 @@ -83,17 +82,11 @@ void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags, void gfs2_holder_reinit(unsigned int state, unsigned flags, struct gfs2_holder *gh); void 
gfs2_holder_uninit(struct gfs2_holder *gh); - -void gfs2_glock_xmote_th(struct gfs2_glock *gl, unsigned int state, int flags); -void gfs2_glock_drop_th(struct gfs2_glock *gl); - int gfs2_glock_nq(struct gfs2_holder *gh); int gfs2_glock_poll(struct gfs2_holder *gh); int gfs2_glock_wait(struct gfs2_holder *gh); void gfs2_glock_dq(struct gfs2_holder *gh); -int gfs2_glock_be_greedy(struct gfs2_glock *gl, unsigned int time); - void gfs2_glock_dq_uninit(struct gfs2_holder *gh); int gfs2_glock_nq_num(struct gfs2_sbd *sdp, u64 number, const struct gfs2_glock_operations *glops, @@ -103,10 +96,6 @@ int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs); void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs); void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs); -void gfs2_glock_prefetch_num(struct gfs2_sbd *sdp, u64 number, - const struct gfs2_glock_operations *glops, - unsigned int state, int flags); - /** * gfs2_glock_nq_init - intialize a holder and enqueue it on a glock * @gl: the glock diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index b068d10bcb6..c4b0391b7aa 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -117,12 +117,14 @@ static void gfs2_pte_inval(struct gfs2_glock *gl) static void meta_go_sync(struct gfs2_glock *gl) { + if (gl->gl_state != LM_ST_EXCLUSIVE) + return; + if (test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) { gfs2_log_flush(gl->gl_sbd, gl); gfs2_meta_sync(gl); gfs2_ail_empty_gl(gl); } - } /** @@ -142,6 +144,37 @@ static void meta_go_inval(struct gfs2_glock *gl, int flags) } /** + * inode_go_sync - Sync the dirty data and/or metadata for an inode glock + * @gl: the glock protecting the inode + * + */ + +static void inode_go_sync(struct gfs2_glock *gl) +{ + struct gfs2_inode *ip = gl->gl_object; + + if (ip && !S_ISREG(ip->i_inode.i_mode)) + ip = NULL; + + if (test_bit(GLF_DIRTY, &gl->gl_flags)) { + gfs2_log_flush(gl->gl_sbd, gl); + if (ip) + filemap_fdatawrite(ip->i_inode.i_mapping); + gfs2_meta_sync(gl); 
+ if (ip) { + struct address_space *mapping = ip->i_inode.i_mapping; + int error = filemap_fdatawait(mapping); + if (error == -ENOSPC) + set_bit(AS_ENOSPC, &mapping->flags); + else if (error) + set_bit(AS_EIO, &mapping->flags); + } + clear_bit(GLF_DIRTY, &gl->gl_flags); + gfs2_ail_empty_gl(gl); + } +} + +/** * inode_go_xmote_th - promote/demote a glock * @gl: the glock * @state: the requested state @@ -149,12 +182,12 @@ static void meta_go_inval(struct gfs2_glock *gl, int flags) * */ -static void inode_go_xmote_th(struct gfs2_glock *gl, unsigned int state, - int flags) +static void inode_go_xmote_th(struct gfs2_glock *gl) { if (gl->gl_state != LM_ST_UNLOCKED) gfs2_pte_inval(gl); - gfs2_glock_xmote_th(gl, state, flags); + if (gl->gl_state == LM_ST_EXCLUSIVE) + inode_go_sync(gl); } /** @@ -189,38 +222,8 @@ static void inode_go_xmote_bh(struct gfs2_glock *gl) static void inode_go_drop_th(struct gfs2_glock *gl) { gfs2_pte_inval(gl); - gfs2_glock_drop_th(gl); -} - -/** - * inode_go_sync - Sync the dirty data and/or metadata for an inode glock - * @gl: the glock protecting the inode - * - */ - -static void inode_go_sync(struct gfs2_glock *gl) -{ - struct gfs2_inode *ip = gl->gl_object; - - if (ip && !S_ISREG(ip->i_inode.i_mode)) - ip = NULL; - - if (test_bit(GLF_DIRTY, &gl->gl_flags)) { - gfs2_log_flush(gl->gl_sbd, gl); - if (ip) - filemap_fdatawrite(ip->i_inode.i_mapping); - gfs2_meta_sync(gl); - if (ip) { - struct address_space *mapping = ip->i_inode.i_mapping; - int error = filemap_fdatawait(mapping); - if (error == -ENOSPC) - set_bit(AS_ENOSPC, &mapping->flags); - else if (error) - set_bit(AS_EIO, &mapping->flags); - } - clear_bit(GLF_DIRTY, &gl->gl_flags); - gfs2_ail_empty_gl(gl); - } + if (gl->gl_state == LM_ST_EXCLUSIVE) + inode_go_sync(gl); } /** @@ -295,7 +298,7 @@ static int inode_go_lock(struct gfs2_holder *gh) if ((ip->i_di.di_flags & GFS2_DIF_TRUNC_IN_PROG) && (gl->gl_state == LM_ST_EXCLUSIVE) && - (gh->gh_flags & GL_LOCAL_EXCL)) + (gh->gh_state == 
LM_ST_EXCLUSIVE)) error = gfs2_truncatei_resume(ip); return error; @@ -319,39 +322,6 @@ static void inode_go_unlock(struct gfs2_holder *gh) } /** - * inode_greedy - - * @gl: the glock - * - */ - -static void inode_greedy(struct gfs2_glock *gl) -{ - struct gfs2_sbd *sdp = gl->gl_sbd; - struct gfs2_inode *ip = gl->gl_object; - unsigned int quantum = gfs2_tune_get(sdp, gt_greedy_quantum); - unsigned int max = gfs2_tune_get(sdp, gt_greedy_max); - unsigned int new_time; - - spin_lock(&ip->i_spin); - - if (time_after(ip->i_last_pfault + quantum, jiffies)) { - new_time = ip->i_greedy + quantum; - if (new_time > max) - new_time = max; - } else { - new_time = ip->i_greedy - quantum; - if (!new_time || new_time > max) - new_time = 1; - } - - ip->i_greedy = new_time; - - spin_unlock(&ip->i_spin); - - iput(&ip->i_inode); -} - -/** * rgrp_go_demote_ok - Check to see if it's ok to unlock a RG's glock * @gl: the glock * @@ -398,8 +368,7 @@ static void rgrp_go_unlock(struct gfs2_holder *gh) * */ -static void trans_go_xmote_th(struct gfs2_glock *gl, unsigned int state, - int flags) +static void trans_go_xmote_th(struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_sbd; @@ -408,8 +377,6 @@ static void trans_go_xmote_th(struct gfs2_glock *gl, unsigned int state, gfs2_meta_syncfs(sdp); gfs2_log_shutdown(sdp); } - - gfs2_glock_xmote_th(gl, state, flags); } /** @@ -461,8 +428,6 @@ static void trans_go_drop_th(struct gfs2_glock *gl) gfs2_meta_syncfs(sdp); gfs2_log_shutdown(sdp); } - - gfs2_glock_drop_th(gl); } /** @@ -478,8 +443,8 @@ static int quota_go_demote_ok(struct gfs2_glock *gl) } const struct gfs2_glock_operations gfs2_meta_glops = { - .go_xmote_th = gfs2_glock_xmote_th, - .go_drop_th = gfs2_glock_drop_th, + .go_xmote_th = meta_go_sync, + .go_drop_th = meta_go_sync, .go_type = LM_TYPE_META, }; @@ -487,19 +452,14 @@ const struct gfs2_glock_operations gfs2_inode_glops = { .go_xmote_th = inode_go_xmote_th, .go_xmote_bh = inode_go_xmote_bh, .go_drop_th = inode_go_drop_th, - 
.go_sync = inode_go_sync, .go_inval = inode_go_inval, .go_demote_ok = inode_go_demote_ok, .go_lock = inode_go_lock, .go_unlock = inode_go_unlock, - .go_greedy = inode_greedy, .go_type = LM_TYPE_INODE, }; const struct gfs2_glock_operations gfs2_rgrp_glops = { - .go_xmote_th = gfs2_glock_xmote_th, - .go_drop_th = gfs2_glock_drop_th, - .go_sync = meta_go_sync, .go_inval = meta_go_inval, .go_demote_ok = rgrp_go_demote_ok, .go_lock = rgrp_go_lock, @@ -515,33 +475,23 @@ const struct gfs2_glock_operations gfs2_trans_glops = { }; const struct gfs2_glock_operations gfs2_iopen_glops = { - .go_xmote_th = gfs2_glock_xmote_th, - .go_drop_th = gfs2_glock_drop_th, .go_type = LM_TYPE_IOPEN, }; const struct gfs2_glock_operations gfs2_flock_glops = { - .go_xmote_th = gfs2_glock_xmote_th, - .go_drop_th = gfs2_glock_drop_th, .go_type = LM_TYPE_FLOCK, }; const struct gfs2_glock_operations gfs2_nondisk_glops = { - .go_xmote_th = gfs2_glock_xmote_th, - .go_drop_th = gfs2_glock_drop_th, .go_type = LM_TYPE_NONDISK, }; const struct gfs2_glock_operations gfs2_quota_glops = { - .go_xmote_th = gfs2_glock_xmote_th, - .go_drop_th = gfs2_glock_drop_th, .go_demote_ok = quota_go_demote_ok, .go_type = LM_TYPE_QUOTA, }; const struct gfs2_glock_operations gfs2_journal_glops = { - .go_xmote_th = gfs2_glock_xmote_th, - .go_drop_th = gfs2_glock_drop_th, .go_type = LM_TYPE_JOURNAL, }; diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 734421edae8..12c80fd28db 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -101,17 +101,14 @@ struct gfs2_bufdata { }; struct gfs2_glock_operations { - void (*go_xmote_th) (struct gfs2_glock *gl, unsigned int state, int flags); + void (*go_xmote_th) (struct gfs2_glock *gl); void (*go_xmote_bh) (struct gfs2_glock *gl); void (*go_drop_th) (struct gfs2_glock *gl); void (*go_drop_bh) (struct gfs2_glock *gl); - void (*go_sync) (struct gfs2_glock *gl); void (*go_inval) (struct gfs2_glock *gl, int flags); int (*go_demote_ok) (struct gfs2_glock *gl); int (*go_lock) (struct 
gfs2_holder *gh); void (*go_unlock) (struct gfs2_holder *gh); - void (*go_callback) (struct gfs2_glock *gl, unsigned int state); - void (*go_greedy) (struct gfs2_glock *gl); const int go_type; }; @@ -120,7 +117,6 @@ enum { HIF_MUTEX = 0, HIF_PROMOTE = 1, HIF_DEMOTE = 2, - HIF_GREEDY = 3, /* States */ HIF_ALLOCED = 4, @@ -128,6 +124,7 @@ enum { HIF_HOLDER = 6, HIF_FIRST = 7, HIF_ABORTED = 9, + HIF_WAIT = 10, }; struct gfs2_holder { @@ -140,17 +137,14 @@ struct gfs2_holder { int gh_error; unsigned long gh_iflags; - struct completion gh_wait; unsigned long gh_ip; }; enum { GLF_LOCK = 1, GLF_STICKY = 2, - GLF_PREFETCH = 3, GLF_DIRTY = 5, GLF_SKIP_WAITERS2 = 6, - GLF_GREEDY = 7, }; struct gfs2_glock { @@ -167,7 +161,7 @@ struct gfs2_glock { unsigned long gl_ip; struct list_head gl_holders; struct list_head gl_waiters1; /* HIF_MUTEX */ - struct list_head gl_waiters2; /* HIF_DEMOTE, HIF_GREEDY */ + struct list_head gl_waiters2; /* HIF_DEMOTE */ struct list_head gl_waiters3; /* HIF_PROMOTE */ const struct gfs2_glock_operations *gl_ops; @@ -236,7 +230,6 @@ struct gfs2_inode { spinlock_t i_spin; struct rw_semaphore i_rw_mutex; - unsigned int i_greedy; unsigned long i_last_pfault; struct buffer_head *i_cache[GFS2_MAX_META_HEIGHT]; @@ -418,17 +411,12 @@ struct gfs2_tune { unsigned int gt_atime_quantum; /* Min secs between atime updates */ unsigned int gt_new_files_jdata; unsigned int gt_new_files_directio; - unsigned int gt_max_atomic_write; /* Split big writes into this size */ unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */ unsigned int gt_lockdump_size; unsigned int gt_stall_secs; /* Detects trouble! 
*/ unsigned int gt_complain_secs; unsigned int gt_reclaim_limit; /* Max num of glocks in reclaim list */ unsigned int gt_entries_per_readdir; - unsigned int gt_prefetch_secs; /* Usage window for prefetched glocks */ - unsigned int gt_greedy_default; - unsigned int gt_greedy_quantum; - unsigned int gt_greedy_max; unsigned int gt_statfs_quantum; unsigned int gt_statfs_slow; }; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index d122074c45e..0d6831a4056 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -287,10 +287,8 @@ out: * * Returns: errno */ - int gfs2_change_nlink(struct gfs2_inode *ip, int diff) { - struct gfs2_sbd *sdp = ip->i_inode.i_sb->s_fs_info; struct buffer_head *dibh; u32 nlink; int error; @@ -315,42 +313,34 @@ int gfs2_change_nlink(struct gfs2_inode *ip, int diff) else drop_nlink(&ip->i_inode); - ip->i_inode.i_ctime.tv_sec = get_seconds(); + ip->i_inode.i_ctime = CURRENT_TIME_SEC; gfs2_trans_add_bh(ip->i_gl, dibh, 1); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); mark_inode_dirty(&ip->i_inode); - if (ip->i_inode.i_nlink == 0) { - struct gfs2_rgrpd *rgd; - struct gfs2_holder ri_gh, rg_gh; - - error = gfs2_rindex_hold(sdp, &ri_gh); - if (error) - goto out; - error = -EIO; - rgd = gfs2_blk2rgrpd(sdp, ip->i_num.no_addr); - if (!rgd) - goto out_norgrp; - error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &rg_gh); - if (error) - goto out_norgrp; - + if (ip->i_inode.i_nlink == 0) gfs2_unlink_di(&ip->i_inode); /* mark inode unlinked */ - gfs2_glock_dq_uninit(&rg_gh); -out_norgrp: - gfs2_glock_dq_uninit(&ri_gh); - } -out: + return error; } struct inode *gfs2_lookup_simple(struct inode *dip, const char *name) { struct qstr qstr; + struct inode *inode; gfs2_str2qstr(&qstr, name); - return gfs2_lookupi(dip, &qstr, 1, NULL); + inode = gfs2_lookupi(dip, &qstr, 1, NULL); + /* gfs2_lookupi has inconsistent callers: vfs + * related routines expect NULL for no entry found, + * gfs2_lookup_simple callers expect ENOENT + * and do not check for NULL. 
+ */ + if (inode == NULL) + return ERR_PTR(-ENOENT); + else + return inode; } @@ -361,8 +351,10 @@ struct inode *gfs2_lookup_simple(struct inode *dip, const char *name) * @is_root: If 1, ignore the caller's permissions * @i_gh: An uninitialized holder for the new inode glock * - * There will always be a vnode (Linux VFS inode) for the d_gh inode unless - * @is_root is true. + * This can be called via the VFS filldir function when NFS is doing + * a readdirplus and the inode which its intending to stat isn't + * already in cache. In this case we must not take the directory glock + * again, since the readdir call will have already taken that lock. * * Returns: errno */ @@ -375,8 +367,9 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, struct gfs2_holder d_gh; struct gfs2_inum_host inum; unsigned int type; - int error = 0; + int error; struct inode *inode = NULL; + int unlock = 0; if (!name->len || name->len > GFS2_FNAMESIZE) return ERR_PTR(-ENAMETOOLONG); @@ -388,9 +381,12 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, return dir; } - error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh); - if (error) - return ERR_PTR(error); + if (gfs2_glock_is_locked_by_me(dip->i_gl) == 0) { + error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh); + if (error) + return ERR_PTR(error); + unlock = 1; + } if (!is_root) { error = permission(dir, MAY_EXEC, NULL); @@ -405,10 +401,11 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, inode = gfs2_inode_lookup(sb, &inum, type); out: - gfs2_glock_dq_uninit(&d_gh); + if (unlock) + gfs2_glock_dq_uninit(&d_gh); if (error == -ENOENT) return NULL; - return inode; + return inode ? 
inode : ERR_PTR(error); } static int pick_formal_ino_1(struct gfs2_sbd *sdp, u64 *formal_ino) diff --git a/fs/gfs2/lm.c b/fs/gfs2/lm.c index effe4a337c1..e30673dd37e 100644 --- a/fs/gfs2/lm.c +++ b/fs/gfs2/lm.c @@ -104,15 +104,9 @@ int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...) vprintk(fmt, args); va_end(args); - fs_err(sdp, "about to withdraw from the cluster\n"); + fs_err(sdp, "about to withdraw this file system\n"); BUG_ON(sdp->sd_args.ar_debug); - - fs_err(sdp, "waiting for outstanding I/O\n"); - - /* FIXME: suspend dm device so oustanding bio's complete - and all further io requests fail */ - fs_err(sdp, "telling LM to withdraw\n"); gfs2_withdraw_lockproto(&sdp->sd_lockstruct); fs_err(sdp, "withdrawn\n"); diff --git a/fs/gfs2/locking/dlm/lock_dlm.h b/fs/gfs2/locking/dlm/lock_dlm.h index 33af707a4d3..a87c7bf3c56 100644 --- a/fs/gfs2/locking/dlm/lock_dlm.h +++ b/fs/gfs2/locking/dlm/lock_dlm.h @@ -36,7 +36,7 @@ #define GDLM_STRNAME_BYTES 24 #define GDLM_LVB_SIZE 32 -#define GDLM_DROP_COUNT 50000 +#define GDLM_DROP_COUNT 200000 #define GDLM_DROP_PERIOD 60 #define GDLM_NAME_LEN 128 diff --git a/fs/gfs2/locking/dlm/main.c b/fs/gfs2/locking/dlm/main.c index 2194b1d5b5e..a0e7eda643e 100644 --- a/fs/gfs2/locking/dlm/main.c +++ b/fs/gfs2/locking/dlm/main.c @@ -11,9 +11,6 @@ #include "lock_dlm.h" -extern int gdlm_drop_count; -extern int gdlm_drop_period; - extern struct lm_lockops gdlm_ops; static int __init init_lock_dlm(void) @@ -40,9 +37,6 @@ static int __init init_lock_dlm(void) return error; } - gdlm_drop_count = GDLM_DROP_COUNT; - gdlm_drop_period = GDLM_DROP_PERIOD; - printk(KERN_INFO "Lock_DLM (built %s %s) installed\n", __DATE__, __TIME__); return 0; diff --git a/fs/gfs2/locking/dlm/mount.c b/fs/gfs2/locking/dlm/mount.c index cdd1694e889..1d8faa3da8a 100644 --- a/fs/gfs2/locking/dlm/mount.c +++ b/fs/gfs2/locking/dlm/mount.c @@ -9,8 +9,6 @@ #include "lock_dlm.h" -int gdlm_drop_count; -int gdlm_drop_period; const struct lm_lockops gdlm_ops; @@ -24,8 
+22,8 @@ static struct gdlm_ls *init_gdlm(lm_callback_t cb, struct gfs2_sbd *sdp, if (!ls) return NULL; - ls->drop_locks_count = gdlm_drop_count; - ls->drop_locks_period = gdlm_drop_period; + ls->drop_locks_count = GDLM_DROP_COUNT; + ls->drop_locks_period = GDLM_DROP_PERIOD; ls->fscb = cb; ls->sdp = sdp; ls->fsflags = flags; diff --git a/fs/gfs2/locking/dlm/sysfs.c b/fs/gfs2/locking/dlm/sysfs.c index 29ae06f9494..4746b884662 100644 --- a/fs/gfs2/locking/dlm/sysfs.c +++ b/fs/gfs2/locking/dlm/sysfs.c @@ -116,6 +116,17 @@ static ssize_t recover_status_show(struct gdlm_ls *ls, char *buf) return sprintf(buf, "%d\n", ls->recover_jid_status); } +static ssize_t drop_count_show(struct gdlm_ls *ls, char *buf) +{ + return sprintf(buf, "%d\n", ls->drop_locks_count); +} + +static ssize_t drop_count_store(struct gdlm_ls *ls, const char *buf, size_t len) +{ + ls->drop_locks_count = simple_strtol(buf, NULL, 0); + return len; +} + struct gdlm_attr { struct attribute attr; ssize_t (*show)(struct gdlm_ls *, char *); @@ -135,6 +146,7 @@ GDLM_ATTR(first_done, 0444, first_done_show, NULL); GDLM_ATTR(recover, 0644, recover_show, recover_store); GDLM_ATTR(recover_done, 0444, recover_done_show, NULL); GDLM_ATTR(recover_status, 0444, recover_status_show, NULL); +GDLM_ATTR(drop_count, 0644, drop_count_show, drop_count_store); static struct attribute *gdlm_attrs[] = { &gdlm_attr_proto_name.attr, @@ -147,6 +159,7 @@ static struct attribute *gdlm_attrs[] = { &gdlm_attr_recover.attr, &gdlm_attr_recover_done.attr, &gdlm_attr_recover_status.attr, + &gdlm_attr_drop_count.attr, NULL, }; diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 4d7f94d8c7b..16bb4b4561a 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -69,13 +69,16 @@ static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le) struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le); struct gfs2_trans *tr; - if (!list_empty(&bd->bd_list_tr)) + gfs2_log_lock(sdp); + if (!list_empty(&bd->bd_list_tr)) { + 
gfs2_log_unlock(sdp); return; - + } tr = current->journal_info; tr->tr_touched = 1; tr->tr_num_buf++; list_add(&bd->bd_list_tr, &tr->tr_list_buf); + gfs2_log_unlock(sdp); if (!list_empty(&le->le_list)) return; @@ -84,7 +87,6 @@ static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le) gfs2_meta_check(sdp, bd->bd_bh); gfs2_pin(sdp, bd->bd_bh); - gfs2_log_lock(sdp); sdp->sd_log_num_buf++; list_add(&le->le_list, &sdp->sd_log_le_buf); @@ -98,11 +100,13 @@ static void buf_lo_incore_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) struct list_head *head = &tr->tr_list_buf; struct gfs2_bufdata *bd; + gfs2_log_lock(sdp); while (!list_empty(head)) { bd = list_entry(head->next, struct gfs2_bufdata, bd_list_tr); list_del_init(&bd->bd_list_tr); tr->tr_num_buf--; } + gfs2_log_unlock(sdp); gfs2_assert_warn(sdp, !tr->tr_num_buf); } @@ -462,13 +466,17 @@ static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le) struct address_space *mapping = bd->bd_bh->b_page->mapping; struct gfs2_inode *ip = GFS2_I(mapping->host); + gfs2_log_lock(sdp); tr->tr_touched = 1; if (list_empty(&bd->bd_list_tr) && (ip->i_di.di_flags & GFS2_DIF_JDATA)) { tr->tr_num_buf++; list_add(&bd->bd_list_tr, &tr->tr_list_buf); + gfs2_log_unlock(sdp); gfs2_pin(sdp, bd->bd_bh); tr->tr_num_buf_new++; + } else { + gfs2_log_unlock(sdp); } gfs2_trans_add_gl(bd->bd_gl); gfs2_log_lock(sdp); diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c index d8d69a72a10..56e33590b65 100644 --- a/fs/gfs2/ops_address.c +++ b/fs/gfs2/ops_address.c @@ -16,6 +16,7 @@ #include <linux/pagevec.h> #include <linux/mpage.h> #include <linux/fs.h> +#include <linux/writeback.h> #include <linux/gfs2_ondisk.h> #include <linux/lm_interface.h> @@ -157,6 +158,32 @@ out_ignore: } /** + * gfs2_writepages - Write a bunch of dirty pages back to disk + * @mapping: The mapping to write + * @wbc: Write-back control + * + * For journaled files and/or ordered writes this just falls back to the + * kernel's default 
writepages path for now. We will probably want to change + * that eventually (i.e. when we look at allocate on flush). + * + * For the data=writeback case though we can already ignore buffer heads + * and write whole extents at once. This is a big reduction in the + * number of I/O requests we send and the bmap calls we make in this case. + */ +static int gfs2_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct inode *inode = mapping->host; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + + if (sdp->sd_args.ar_data == GFS2_DATA_WRITEBACK && !gfs2_is_jdata(ip)) + return mpage_writepages(mapping, wbc, gfs2_get_block_noalloc); + + return generic_writepages(mapping, wbc); +} + +/** * stuffed_readpage - Fill in a Linux page with stuffed file data * @ip: the inode * @page: the page @@ -256,7 +283,7 @@ out_unlock: * the page lock and the glock) and return having done no I/O. Its * obviously not something we'd want to do on too regular a basis. * Any I/O we ignore at this time will be done via readpage later. - * 2. We have to handle stuffed files here too. + * 2. We don't handle stuffed files here we let readpage do the honours. * 3. mpage_readpages() does most of the heavy lifting in the common case. * 4. gfs2_get_block() is relied upon to set BH_Boundary in the right places. * 5. 
We use LM_FLAG_TRY_1CB here, effectively we then have lock-ahead as @@ -269,8 +296,7 @@ static int gfs2_readpages(struct file *file, struct address_space *mapping, struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); struct gfs2_holder gh; - unsigned page_idx; - int ret; + int ret = 0; int do_unlock = 0; if (likely(file != &gfs2_internal_file_sentinel)) { @@ -289,29 +315,8 @@ static int gfs2_readpages(struct file *file, struct address_space *mapping, goto out_unlock; } skip_lock: - if (gfs2_is_stuffed(ip)) { - struct pagevec lru_pvec; - pagevec_init(&lru_pvec, 0); - for (page_idx = 0; page_idx < nr_pages; page_idx++) { - struct page *page = list_entry(pages->prev, struct page, lru); - prefetchw(&page->flags); - list_del(&page->lru); - if (!add_to_page_cache(page, mapping, - page->index, GFP_KERNEL)) { - ret = stuffed_readpage(ip, page); - unlock_page(page); - if (!pagevec_add(&lru_pvec, page)) - __pagevec_lru_add(&lru_pvec); - } else { - page_cache_release(page); - } - } - pagevec_lru_add(&lru_pvec); - ret = 0; - } else { - /* What we really want to do .... 
*/ + if (!gfs2_is_stuffed(ip)) ret = mpage_readpages(mapping, pages, nr_pages, gfs2_get_block); - } if (do_unlock) { gfs2_glock_dq_m(1, &gh); @@ -356,8 +361,10 @@ static int gfs2_prepare_write(struct file *file, struct page *page, gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_ATIME|LM_FLAG_TRY_1CB, &ip->i_gh); error = gfs2_glock_nq_atime(&ip->i_gh); if (unlikely(error)) { - if (error == GLR_TRYFAILED) + if (error == GLR_TRYFAILED) { + unlock_page(page); error = AOP_TRUNCATED_PAGE; + } goto out_uninit; } @@ -594,6 +601,36 @@ static void gfs2_invalidatepage(struct page *page, unsigned long offset) return; } +/** + * gfs2_ok_for_dio - check that dio is valid on this file + * @ip: The inode + * @rw: READ or WRITE + * @offset: The offset at which we are reading or writing + * + * Returns: 0 (to ignore the i/o request and thus fall back to buffered i/o) + * 1 (to accept the i/o request) + */ +static int gfs2_ok_for_dio(struct gfs2_inode *ip, int rw, loff_t offset) +{ + /* + * Should we return an error here? I can't see that O_DIRECT for + * a journaled file makes any sense. For now we'll silently fall + * back to buffered I/O, likewise we do the same for stuffed + * files since they are (a) small and (b) unaligned. + */ + if (gfs2_is_jdata(ip)) + return 0; + + if (gfs2_is_stuffed(ip)) + return 0; + + if (offset > i_size_read(&ip->i_inode)) + return 0; + return 1; +} + + + static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs) @@ -604,42 +641,28 @@ static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb, struct gfs2_holder gh; int rv; - if (rw == READ) - mutex_lock(&inode->i_mutex); /* - * Shared lock, even if its a write, since we do no allocation - * on this path. All we need change is atime. + * Deferred lock, even if its a write, since we do no allocation + * on this path. All we need change is atime, and this lock mode + * ensures that other nodes have flushed their buffered read caches + * (i.e. 
their page cache entries for this inode). We do not, + * unfortunately have the option of only flushing a range like + * the VFS does. */ - gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh); + gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, GL_ATIME, &gh); rv = gfs2_glock_nq_atime(&gh); if (rv) - goto out; - - if (offset > i_size_read(inode)) - goto out; - - /* - * Should we return an error here? I can't see that O_DIRECT for - * a journaled file makes any sense. For now we'll silently fall - * back to buffered I/O, likewise we do the same for stuffed - * files since they are (a) small and (b) unaligned. - */ - if (gfs2_is_jdata(ip)) - goto out; - - if (gfs2_is_stuffed(ip)) - goto out; - - rv = blockdev_direct_IO_own_locking(rw, iocb, inode, - inode->i_sb->s_bdev, - iov, offset, nr_segs, - gfs2_get_block_direct, NULL); + return rv; + rv = gfs2_ok_for_dio(ip, rw, offset); + if (rv != 1) + goto out; /* dio not valid, fall back to buffered i/o */ + + rv = blockdev_direct_IO_no_locking(rw, iocb, inode, inode->i_sb->s_bdev, + iov, offset, nr_segs, + gfs2_get_block_direct, NULL); out: gfs2_glock_dq_m(1, &gh); gfs2_holder_uninit(&gh); - if (rw == READ) - mutex_unlock(&inode->i_mutex); - return rv; } @@ -763,6 +786,7 @@ out: const struct address_space_operations gfs2_file_aops = { .writepage = gfs2_writepage, + .writepages = gfs2_writepages, .readpage = gfs2_readpage, .readpages = gfs2_readpages, .sync_page = block_sync_page, diff --git a/fs/gfs2/ops_dentry.c b/fs/gfs2/ops_dentry.c index d355899585d..9187eb174b4 100644 --- a/fs/gfs2/ops_dentry.c +++ b/fs/gfs2/ops_dentry.c @@ -46,6 +46,7 @@ static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd) struct gfs2_inum_host inum; unsigned int type; int error; + int had_lock=0; if (inode && is_bad_inode(inode)) goto invalid; @@ -53,9 +54,12 @@ static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd) if (sdp->sd_args.ar_localcaching) goto valid; - error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 
0, &d_gh); - if (error) - goto fail; + had_lock = gfs2_glock_is_locked_by_me(dip->i_gl); + if (!had_lock) { + error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh); + if (error) + goto fail; + } error = gfs2_dir_search(parent->d_inode, &dentry->d_name, &inum, &type); switch (error) { @@ -82,13 +86,15 @@ static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd) } valid_gunlock: - gfs2_glock_dq_uninit(&d_gh); + if (!had_lock) + gfs2_glock_dq_uninit(&d_gh); valid: dput(parent); return 1; invalid_gunlock: - gfs2_glock_dq_uninit(&d_gh); + if (!had_lock) + gfs2_glock_dq_uninit(&d_gh); invalid: if (inode && S_ISDIR(inode->i_mode)) { if (have_submounts(dentry)) diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/ops_export.c index b4e7b877531..4855e8cca62 100644 --- a/fs/gfs2/ops_export.c +++ b/fs/gfs2/ops_export.c @@ -22,6 +22,7 @@ #include "glock.h" #include "glops.h" #include "inode.h" +#include "ops_dentry.h" #include "ops_export.h" #include "rgrp.h" #include "util.h" @@ -112,13 +113,12 @@ struct get_name_filldir { char *name; }; -static int get_name_filldir(void *opaque, const char *name, unsigned int length, - u64 offset, struct gfs2_inum_host *inum, - unsigned int type) +static int get_name_filldir(void *opaque, const char *name, int length, + loff_t offset, u64 inum, unsigned int type) { - struct get_name_filldir *gnfd = (struct get_name_filldir *)opaque; + struct get_name_filldir *gnfd = opaque; - if (!gfs2_inum_equal(inum, &gnfd->inum)) + if (inum != gnfd->inum.no_addr) return 0; memcpy(gnfd->name, name, length); @@ -189,6 +189,7 @@ static struct dentry *gfs2_get_parent(struct dentry *child) return ERR_PTR(-ENOMEM); } + dentry->d_op = &gfs2_dops; return dentry; } @@ -215,8 +216,7 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb, void *inum_obj) } error = gfs2_glock_nq_num(sdp, inum->no_addr, &gfs2_inode_glops, - LM_ST_SHARED, LM_FLAG_ANY | GL_LOCAL_EXCL, - &i_gh); + LM_ST_SHARED, LM_FLAG_ANY, &i_gh); if (error) return ERR_PTR(error); 
@@ -269,6 +269,7 @@ out_inode: return ERR_PTR(-ENOMEM); } + dentry->d_op = &gfs2_dops; return dentry; fail_rgd: diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c index b3f1e0349ae..c996aa739a0 100644 --- a/fs/gfs2/ops_file.c +++ b/fs/gfs2/ops_file.c @@ -43,15 +43,6 @@ #include "util.h" #include "eaops.h" -/* For regular, non-NFS */ -struct filldir_reg { - struct gfs2_sbd *fdr_sbd; - int fdr_prefetch; - - filldir_t fdr_filldir; - void *fdr_opaque; -}; - /* * Most fields left uninitialised to catch anybody who tries to * use them. f_flags set to prevent file_accessed() from touching @@ -128,41 +119,6 @@ static loff_t gfs2_llseek(struct file *file, loff_t offset, int origin) } /** - * filldir_func - Report a directory entry to the caller of gfs2_dir_read() - * @opaque: opaque data used by the function - * @name: the name of the directory entry - * @length: the length of the name - * @offset: the entry's offset in the directory - * @inum: the inode number the entry points to - * @type: the type of inode the entry points to - * - * Returns: 0 on success, 1 if buffer full - */ - -static int filldir_func(void *opaque, const char *name, unsigned int length, - u64 offset, struct gfs2_inum_host *inum, - unsigned int type) -{ - struct filldir_reg *fdr = (struct filldir_reg *)opaque; - struct gfs2_sbd *sdp = fdr->fdr_sbd; - int error; - - error = fdr->fdr_filldir(fdr->fdr_opaque, name, length, offset, - inum->no_addr, type); - if (error) - return 1; - - if (fdr->fdr_prefetch && !(length == 1 && *name == '.')) { - gfs2_glock_prefetch_num(sdp, inum->no_addr, &gfs2_inode_glops, - LM_ST_SHARED, LM_FLAG_TRY | LM_FLAG_ANY); - gfs2_glock_prefetch_num(sdp, inum->no_addr, &gfs2_iopen_glops, - LM_ST_SHARED, LM_FLAG_TRY); - } - - return 0; -} - -/** * gfs2_readdir - Read directory entries from a directory * @file: The directory to read from * @dirent: Buffer for dirents @@ -175,16 +131,10 @@ static int gfs2_readdir(struct file *file, void *dirent, filldir_t filldir) { struct inode 
*dir = file->f_mapping->host; struct gfs2_inode *dip = GFS2_I(dir); - struct filldir_reg fdr; struct gfs2_holder d_gh; u64 offset = file->f_pos; int error; - fdr.fdr_sbd = GFS2_SB(dir); - fdr.fdr_prefetch = 1; - fdr.fdr_filldir = filldir; - fdr.fdr_opaque = dirent; - gfs2_holder_init(dip->i_gl, LM_ST_SHARED, GL_ATIME, &d_gh); error = gfs2_glock_nq_atime(&d_gh); if (error) { @@ -192,7 +142,7 @@ static int gfs2_readdir(struct file *file, void *dirent, filldir_t filldir) return error; } - error = gfs2_dir_read(dir, &offset, &fdr, filldir_func); + error = gfs2_dir_read(dir, &offset, dirent, filldir); gfs2_glock_dq_uninit(&d_gh); @@ -247,7 +197,7 @@ static const u32 gfs2_to_fsflags[32] = { static int gfs2_get_flags(struct file *filp, u32 __user *ptr) { - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = filp->f_path.dentry->d_inode; struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder gh; int error; @@ -305,7 +255,7 @@ void gfs2_set_inode_flags(struct inode *inode) */ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask) { - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = filp->f_path.dentry->d_inode; struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); struct buffer_head *bh; @@ -588,7 +538,7 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl) { struct gfs2_file *fp = file->private_data; struct gfs2_holder *fl_gh = &fp->f_fl_gh; - struct gfs2_inode *ip = GFS2_I(file->f_dentry->d_inode); + struct gfs2_inode *ip = GFS2_I(file->f_path.dentry->d_inode); struct gfs2_glock *gl; unsigned int state; int flags; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index d14e139d267..ee80b8a5e7b 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -867,9 +867,9 @@ static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags, error = -EBUSY; goto error; } - mutex_lock(&sb->s_bdev->bd_mount_mutex); + down(&sb->s_bdev->bd_mount_sem); new = 
sget(fs_type, test_bdev_super, set_bdev_super, sb->s_bdev); - mutex_unlock(&sb->s_bdev->bd_mount_mutex); + up(&sb->s_bdev->bd_mount_sem); if (IS_ERR(new)) { error = PTR_ERR(new); goto error; diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c index 636dda4c7d3..f40a84807d7 100644 --- a/fs/gfs2/ops_inode.c +++ b/fs/gfs2/ops_inode.c @@ -264,13 +264,23 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry) struct gfs2_inode *dip = GFS2_I(dir); struct gfs2_sbd *sdp = GFS2_SB(dir); struct gfs2_inode *ip = GFS2_I(dentry->d_inode); - struct gfs2_holder ghs[2]; + struct gfs2_holder ghs[3]; + struct gfs2_rgrpd *rgd; + struct gfs2_holder ri_gh; int error; + error = gfs2_rindex_hold(sdp, &ri_gh); + if (error) + return error; + gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); - gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1); + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1); - error = gfs2_glock_nq_m(2, ghs); + rgd = gfs2_blk2rgrpd(sdp, ip->i_num.no_addr); + gfs2_holder_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + 2); + + + error = gfs2_glock_nq_m(3, ghs); if (error) goto out; @@ -291,10 +301,12 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry) out_end_trans: gfs2_trans_end(sdp); out_gunlock: - gfs2_glock_dq_m(2, ghs); + gfs2_glock_dq_m(3, ghs); out: gfs2_holder_uninit(ghs); gfs2_holder_uninit(ghs + 1); + gfs2_holder_uninit(ghs + 2); + gfs2_glock_dq_uninit(&ri_gh); return error; } @@ -449,13 +461,22 @@ static int gfs2_rmdir(struct inode *dir, struct dentry *dentry) struct gfs2_inode *dip = GFS2_I(dir); struct gfs2_sbd *sdp = GFS2_SB(dir); struct gfs2_inode *ip = GFS2_I(dentry->d_inode); - struct gfs2_holder ghs[2]; + struct gfs2_holder ghs[3]; + struct gfs2_rgrpd *rgd; + struct gfs2_holder ri_gh; int error; + + error = gfs2_rindex_hold(sdp, &ri_gh); + if (error) + return error; gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1); - error = gfs2_glock_nq_m(2, ghs); + 
rgd = gfs2_blk2rgrpd(sdp, ip->i_num.no_addr); + gfs2_holder_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + 2); + + error = gfs2_glock_nq_m(3, ghs); if (error) goto out; @@ -483,10 +504,12 @@ static int gfs2_rmdir(struct inode *dir, struct dentry *dentry) gfs2_trans_end(sdp); out_gunlock: - gfs2_glock_dq_m(2, ghs); + gfs2_glock_dq_m(3, ghs); out: gfs2_holder_uninit(ghs); gfs2_holder_uninit(ghs + 1); + gfs2_holder_uninit(ghs + 2); + gfs2_glock_dq_uninit(&ri_gh); return error; } @@ -547,7 +570,8 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, struct gfs2_inode *ip = GFS2_I(odentry->d_inode); struct gfs2_inode *nip = NULL; struct gfs2_sbd *sdp = GFS2_SB(odir); - struct gfs2_holder ghs[4], r_gh; + struct gfs2_holder ghs[5], r_gh; + struct gfs2_rgrpd *nrgd; unsigned int num_gh; int dir_rename = 0; int alloc_required; @@ -587,6 +611,13 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, if (nip) { gfs2_holder_init(nip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh); num_gh++; + /* grab the resource lock for unlink flag twiddling + * this is the case of the target file already existing + * so we unlink before doing the rename + */ + nrgd = gfs2_blk2rgrpd(sdp, nip->i_num.no_addr); + if (nrgd) + gfs2_holder_init(nrgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh++); } error = gfs2_glock_nq_m(num_gh, ghs); @@ -684,12 +715,12 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + al->al_rgd->rd_ri.ri_length + 4 * RES_DINODE + 4 * RES_LEAF + - RES_STATFS + RES_QUOTA, 0); + RES_STATFS + RES_QUOTA + 4, 0); if (error) goto out_ipreserv; } else { error = gfs2_trans_begin(sdp, 4 * RES_DINODE + - 5 * RES_LEAF, 0); + 5 * RES_LEAF + 4, 0); if (error) goto out_gunlock; } @@ -728,7 +759,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, error = gfs2_meta_inode_buffer(ip, &dibh); if (error) goto out_end_trans; - ip->i_inode.i_ctime.tv_sec = get_seconds(); + ip->i_inode.i_ctime = 
CURRENT_TIME_SEC; gfs2_trans_add_bh(ip->i_gl, dibh, 1); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); @@ -1018,7 +1049,7 @@ static int gfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, } generic_fillattr(inode, stat); - if (unlock); + if (unlock) gfs2_glock_dq_uninit(&gh); return 0; diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c index 7685b46f934..47369d01121 100644 --- a/fs/gfs2/ops_super.c +++ b/fs/gfs2/ops_super.c @@ -173,6 +173,9 @@ static void gfs2_write_super_lockfs(struct super_block *sb) struct gfs2_sbd *sdp = sb->s_fs_info; int error; + if (test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) + return; + for (;;) { error = gfs2_freeze_fs(sdp); if (!error) @@ -426,6 +429,12 @@ static void gfs2_delete_inode(struct inode *inode) } error = gfs2_dinode_dealloc(ip); + /* + * Must do this before unlock to avoid trying to write back + * potentially dirty data now that inode no longer exists + * on disk. + */ + truncate_inode_pages(&inode->i_data, 0); out_unlock: gfs2_glock_dq(&ip->i_iopen_gh); @@ -443,14 +452,12 @@ out: static struct inode *gfs2_alloc_inode(struct super_block *sb) { - struct gfs2_sbd *sdp = sb->s_fs_info; struct gfs2_inode *ip; ip = kmem_cache_alloc(gfs2_inode_cachep, GFP_KERNEL); if (ip) { ip->i_flags = 0; ip->i_gl = NULL; - ip->i_greedy = gfs2_tune_get(sdp, gt_greedy_default); ip->i_last_pfault = jiffies; } return &ip->i_inode; diff --git a/fs/gfs2/ops_vm.c b/fs/gfs2/ops_vm.c index 45a5f11fc39..14b380fb060 100644 --- a/fs/gfs2/ops_vm.c +++ b/fs/gfs2/ops_vm.c @@ -28,34 +28,13 @@ #include "trans.h" #include "util.h" -static void pfault_be_greedy(struct gfs2_inode *ip) -{ - unsigned int time; - - spin_lock(&ip->i_spin); - time = ip->i_greedy; - ip->i_last_pfault = jiffies; - spin_unlock(&ip->i_spin); - - igrab(&ip->i_inode); - if (gfs2_glock_be_greedy(ip->i_gl, time)) - iput(&ip->i_inode); -} - static struct page *gfs2_private_nopage(struct vm_area_struct *area, unsigned long address, int *type) { struct gfs2_inode *ip = 
GFS2_I(area->vm_file->f_mapping->host); - struct page *result; set_bit(GIF_PAGED, &ip->i_flags); - - result = filemap_nopage(area, address, type); - - if (result && result != NOPAGE_OOM) - pfault_be_greedy(ip); - - return result; + return filemap_nopage(area, address, type); } static int alloc_page_backing(struct gfs2_inode *ip, struct page *page) @@ -167,7 +146,6 @@ static struct page *gfs2_sharewrite_nopage(struct vm_area_struct *area, set_page_dirty(result); } - pfault_be_greedy(ip); out: gfs2_glock_dq_uninit(&i_gh); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 43a24f2e590..70f424fcf1c 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -71,17 +71,12 @@ void gfs2_tune_init(struct gfs2_tune *gt) gt->gt_atime_quantum = 3600; gt->gt_new_files_jdata = 0; gt->gt_new_files_directio = 0; - gt->gt_max_atomic_write = 4 << 20; gt->gt_max_readahead = 1 << 18; gt->gt_lockdump_size = 131072; gt->gt_stall_secs = 600; gt->gt_complain_secs = 10; gt->gt_reclaim_limit = 5000; gt->gt_entries_per_readdir = 32; - gt->gt_prefetch_secs = 10; - gt->gt_greedy_default = HZ / 10; - gt->gt_greedy_quantum = HZ / 40; - gt->gt_greedy_max = HZ / 4; gt->gt_statfs_quantum = 30; gt->gt_statfs_slow = 0; } @@ -359,8 +354,7 @@ int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh) mutex_lock(&sdp->sd_jindex_mutex); for (;;) { - error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, - GL_LOCAL_EXCL, ji_gh); + error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, ji_gh); if (error) break; @@ -529,8 +523,7 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp) struct gfs2_log_header_host head; int error; - error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, - GL_LOCAL_EXCL, &t_gh); + error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, 0, &t_gh); if (error) return error; @@ -583,9 +576,8 @@ int gfs2_make_fs_ro(struct gfs2_sbd *sdp) gfs2_quota_sync(sdp); gfs2_statfs_sync(sdp); - error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, - GL_LOCAL_EXCL | GL_NOCACHE, - &t_gh); + 
error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, GL_NOCACHE, + &t_gh); if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) return error; diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index 983eaf1e06b..d01f9f0fda2 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -436,17 +436,12 @@ TUNE_ATTR(atime_quantum, 0); TUNE_ATTR(max_readahead, 0); TUNE_ATTR(complain_secs, 0); TUNE_ATTR(reclaim_limit, 0); -TUNE_ATTR(prefetch_secs, 0); TUNE_ATTR(statfs_slow, 0); TUNE_ATTR(new_files_jdata, 0); TUNE_ATTR(new_files_directio, 0); TUNE_ATTR(quota_simul_sync, 1); TUNE_ATTR(quota_cache_secs, 1); -TUNE_ATTR(max_atomic_write, 1); TUNE_ATTR(stall_secs, 1); -TUNE_ATTR(greedy_default, 1); -TUNE_ATTR(greedy_quantum, 1); -TUNE_ATTR(greedy_max, 1); TUNE_ATTR(statfs_quantum, 1); TUNE_ATTR_DAEMON(scand_secs, scand_process); TUNE_ATTR_DAEMON(recoverd_secs, recoverd_process); @@ -465,15 +460,10 @@ static struct attribute *tune_attrs[] = { &tune_attr_max_readahead.attr, &tune_attr_complain_secs.attr, &tune_attr_reclaim_limit.attr, - &tune_attr_prefetch_secs.attr, &tune_attr_statfs_slow.attr, &tune_attr_quota_simul_sync.attr, &tune_attr_quota_cache_secs.attr, - &tune_attr_max_atomic_write.attr, &tune_attr_stall_secs.attr, - &tune_attr_greedy_default.attr, - &tune_attr_greedy_quantum.attr, - &tune_attr_greedy_max.attr, &tune_attr_statfs_quantum.attr, &tune_attr_scand_secs.attr, &tune_attr_recoverd_secs.attr, diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c index 37d681b4f21..e2e0358da33 100644 --- a/fs/hfs/dir.c +++ b/fs/hfs/dir.c @@ -53,7 +53,7 @@ done: */ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir) { - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = filp->f_path.dentry->d_inode; struct super_block *sb = inode->i_sb; int len, err; char strbuf[HFS_MAX_NAMELEN]; diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 02f5573e034..5cb7f8fee8d 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -102,7 +102,7 @@ static ssize_t hfs_direct_IO(int rw, 
struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs) { struct file *file = iocb->ki_filp; - struct inode *inode = file->f_dentry->d_inode->i_mapping->host; + struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host; return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, offset, nr_segs, hfs_get_block, NULL); diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index 7e309751645..e886ac8460d 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -111,7 +111,7 @@ fail: static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir) { - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = filp->f_path.dentry->d_inode; struct super_block *sb = inode->i_sb; int len, err; char strbuf[HFSPLUS_MAX_STRLEN + 1]; diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 9e367524963..75e8c4d8aac 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -97,7 +97,7 @@ static ssize_t hfsplus_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs) { struct file *file = iocb->ki_filp; - struct inode *inode = file->f_dentry->d_inode->i_mapping->host; + struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host; return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, offset, nr_segs, hfsplus_get_block, NULL); diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h index cca3fb693f9..70543b17e4c 100644 --- a/fs/hostfs/hostfs.h +++ b/fs/hostfs/hostfs.h @@ -76,7 +76,7 @@ extern int make_symlink(const char *from, const char *to); extern int unlink_file(const char *file); extern int do_mkdir(const char *file, int mode); extern int do_rmdir(const char *file); -extern int do_mknod(const char *file, int mode, int dev); +extern int do_mknod(const char *file, int mode, unsigned int major, unsigned int minor); extern int link_file(const char *from, const char *to); extern int do_readlink(char *file, char *buf, int size); extern int 
rename_file(char *from, char *to); diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index b6bd33ca373..69a376f35a6 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -35,7 +35,7 @@ static inline struct hostfs_inode_info *HOSTFS_I(struct inode *inode) return(list_entry(inode, struct hostfs_inode_info, vfs_inode)); } -#define FILE_HOSTFS_I(file) HOSTFS_I((file)->f_dentry->d_inode) +#define FILE_HOSTFS_I(file) HOSTFS_I((file)->f_path.dentry->d_inode) int hostfs_d_delete(struct dentry *dentry) { @@ -325,7 +325,7 @@ int hostfs_readdir(struct file *file, void *ent, filldir_t filldir) unsigned long long next, ino; int error, len; - name = dentry_name(file->f_dentry, 0); + name = dentry_name(file->f_path.dentry, 0); if(name == NULL) return(-ENOMEM); dir = open_dir(name, &error); kfree(name); @@ -366,7 +366,7 @@ int hostfs_file_open(struct inode *ino, struct file *file) if(w) r = 1; - name = dentry_name(file->f_dentry, 0); + name = dentry_name(file->f_path.dentry, 0); if(name == NULL) return(-ENOMEM); @@ -755,7 +755,7 @@ int hostfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) goto out_put; init_special_inode(inode, mode, dev); - err = do_mknod(name, mode, dev); + err = do_mknod(name, mode, MAJOR(dev), MINOR(dev)); if(err) goto out_free; diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c index 23b7cee7212..1ed5ea389f1 100644 --- a/fs/hostfs/hostfs_user.c +++ b/fs/hostfs/hostfs_user.c @@ -295,11 +295,11 @@ int do_rmdir(const char *file) return(0); } -int do_mknod(const char *file, int mode, int dev) +int do_mknod(const char *file, int mode, unsigned int major, unsigned int minor) { int err; - err = mknod(file, mode, dev); + err = mknod(file, mode, makedev(major, minor)); if(err) return(-errno); return(0); } diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c index 594f9c428fc..6916c41d701 100644 --- a/fs/hpfs/dir.c +++ b/fs/hpfs/dir.c @@ -24,7 +24,7 @@ static loff_t hpfs_dir_lseek(struct file *filp, loff_t off, int 
whence) loff_t new_off = off + (whence == 1 ? filp->f_pos : 0); loff_t pos; struct quad_buffer_head qbh; - struct inode *i = filp->f_dentry->d_inode; + struct inode *i = filp->f_path.dentry->d_inode; struct hpfs_inode_info *hpfs_inode = hpfs_i(i); struct super_block *s = i->i_sb; @@ -52,7 +52,7 @@ fail: static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir) { - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = filp->f_path.dentry->d_inode; struct hpfs_inode_info *hpfs_inode = hpfs_i(inode); struct quad_buffer_head qbh; struct hpfs_dirent *de; diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index 8b94d24855f..fb4c8915010 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -115,7 +115,7 @@ static ssize_t hpfs_file_write(struct file *file, const char __user *buf, retval = do_sync_write(file, buf, count, ppos); if (retval > 0) - hpfs_i(file->f_dentry->d_inode)->i_dirty = 1; + hpfs_i(file->f_path.dentry->d_inode)->i_dirty = 1; return retval; } diff --git a/fs/hppfs/hppfs_kern.c b/fs/hppfs/hppfs_kern.c index 642675fc394..afd340a45da 100644 --- a/fs/hppfs/hppfs_kern.c +++ b/fs/hppfs/hppfs_kern.c @@ -221,7 +221,7 @@ static ssize_t read_proc(struct file *file, char __user *buf, ssize_t count, ssize_t (*read)(struct file *, char __user *, size_t, loff_t *); ssize_t n; - read = file->f_dentry->d_inode->i_fop->read; + read = file->f_path.dentry->d_inode->i_fop->read; if(!is_user) set_fs(KERNEL_DS); @@ -320,7 +320,7 @@ static ssize_t hppfs_write(struct file *file, const char __user *buf, size_t len ssize_t (*write)(struct file *, const char __user *, size_t, loff_t *); int err; - write = proc_file->f_dentry->d_inode->i_fop->write; + write = proc_file->f_path.dentry->d_inode->i_fop->write; proc_file->f_pos = file->f_pos; err = (*write)(proc_file, buf, len, &proc_file->f_pos); @@ -464,7 +464,7 @@ static int hppfs_open(struct inode *inode, struct file *file) if(data == NULL) goto out; - host_file = dentry_name(file->f_dentry, strlen("/rw")); 
+ host_file = dentry_name(file->f_path.dentry, strlen("/rw")); if(host_file == NULL) goto out_free2; @@ -547,7 +547,7 @@ static loff_t hppfs_llseek(struct file *file, loff_t off, int where) loff_t (*llseek)(struct file *, loff_t, int); loff_t ret; - llseek = proc_file->f_dentry->d_inode->i_fop->llseek; + llseek = proc_file->f_path.dentry->d_inode->i_fop->llseek; if(llseek != NULL){ ret = (*llseek)(proc_file, off, where); if(ret < 0) @@ -591,10 +591,10 @@ static int hppfs_readdir(struct file *file, void *ent, filldir_t filldir) struct hppfs_dirent dirent = ((struct hppfs_dirent) { .vfs_dirent = ent, .filldir = filldir, - .dentry = file->f_dentry } ); + .dentry = file->f_path.dentry } ); int err; - readdir = proc_file->f_dentry->d_inode->i_fop->readdir; + readdir = proc_file->f_path.dentry->d_inode->i_fop->readdir; proc_file->f_pos = file->f_pos; err = (*readdir)(proc_file, &dirent, hppfs_filldir); diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 0706f5aac6a..e6bd553fdc4 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -58,7 +58,7 @@ static void huge_pagevec_release(struct pagevec *pvec) static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) { - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file->f_path.dentry->d_inode; loff_t len, vma_len; int ret; @@ -176,7 +176,7 @@ static int hugetlbfs_commit_write(struct file *file, static void truncate_huge_page(struct page *page) { - clear_page_dirty(page); + cancel_dirty_page(page, /* No IO accounting for huge pages? 
*/0); ClearPageUptodate(page); remove_from_page_cache(page); put_page(page); @@ -449,10 +449,13 @@ static int hugetlbfs_symlink(struct inode *dir, } /* - * For direct-IO reads into hugetlb pages + * mark the head page dirty */ static int hugetlbfs_set_page_dirty(struct page *page) { + struct page *head = (struct page *)page_private(page); + + SetPageDirty(head); return 0; } @@ -774,8 +777,8 @@ struct file *hugetlb_zero_setup(size_t size) d_instantiate(dentry, inode); inode->i_size = size; inode->i_nlink = 0; - file->f_vfsmnt = mntget(hugetlbfs_vfsmount); - file->f_dentry = dentry; + file->f_path.mnt = mntget(hugetlbfs_vfsmount); + file->f_path.dentry = dentry; file->f_mapping = inode->i_mapping; file->f_op = &hugetlbfs_file_operations; file->f_mode = FMODE_WRITE | FMODE_READ; diff --git a/fs/inode.c b/fs/inode.c index 9ecccab7326..bf21dc6d0db 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1144,7 +1144,6 @@ sector_t bmap(struct inode * inode, sector_t block) res = inode->i_mapping->a_ops->bmap(inode->i_mapping, block); return res; } - EXPORT_SYMBOL(bmap); /** @@ -1163,27 +1162,43 @@ void touch_atime(struct vfsmount *mnt, struct dentry *dentry) if (IS_RDONLY(inode)) return; - - if ((inode->i_flags & S_NOATIME) || - (inode->i_sb->s_flags & MS_NOATIME) || - ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))) + if (inode->i_flags & S_NOATIME) + return; + if (inode->i_sb->s_flags & MS_NOATIME) + return; + if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode)) return; /* * We may have a NULL vfsmount when coming from NFSD */ - if (mnt && - ((mnt->mnt_flags & MNT_NOATIME) || - ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)))) - return; + if (mnt) { + if (mnt->mnt_flags & MNT_NOATIME) + return; + if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)) + return; - now = current_fs_time(inode->i_sb); - if (!timespec_equal(&inode->i_atime, &now)) { - inode->i_atime = now; - mark_inode_dirty_sync(inode); + if (mnt->mnt_flags & 
MNT_RELATIME) { + /* + * With relative atime, only update atime if the + * previous atime is earlier than either the ctime or + * mtime. + */ + if (timespec_compare(&inode->i_mtime, + &inode->i_atime) < 0 && + timespec_compare(&inode->i_ctime, + &inode->i_atime) < 0) + return; + } } -} + now = current_fs_time(inode->i_sb); + if (timespec_equal(&inode->i_atime, &now)) + return; + + inode->i_atime = now; + mark_inode_dirty_sync(inode); +} EXPORT_SYMBOL(touch_atime); /** @@ -1200,7 +1215,7 @@ EXPORT_SYMBOL(touch_atime); void file_update_time(struct file *file) { - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file->f_path.dentry->d_inode; struct timespec now; int sync_it = 0; diff --git a/fs/inotify_user.c b/fs/inotify_user.c index e1956e6f116..55f6da55b7c 100644 --- a/fs/inotify_user.c +++ b/fs/inotify_user.c @@ -570,9 +570,9 @@ asmlinkage long sys_inotify_init(void) dev->ih = ih; filp->f_op = &inotify_fops; - filp->f_vfsmnt = mntget(inotify_mnt); - filp->f_dentry = dget(inotify_mnt->mnt_root); - filp->f_mapping = filp->f_dentry->d_inode->i_mapping; + filp->f_path.mnt = mntget(inotify_mnt); + filp->f_path.dentry = dget(inotify_mnt->mnt_root); + filp->f_mapping = filp->f_path.dentry->d_inode->i_mapping; filp->f_mode = FMODE_READ; filp->f_flags = O_RDONLY; filp->private_data = dev; diff --git a/fs/ioctl.c b/fs/ioctl.c index 4b7660b09ac..ff61772ceed 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -31,7 +31,7 @@ static long do_ioctl(struct file *filp, unsigned int cmd, goto out; } else if (filp->f_op->ioctl) { lock_kernel(); - error = filp->f_op->ioctl(filp->f_dentry->d_inode, + error = filp->f_op->ioctl(filp->f_path.dentry->d_inode, filp, cmd, arg); unlock_kernel(); } @@ -45,7 +45,7 @@ static int file_ioctl(struct file *filp, unsigned int cmd, { int error; int block; - struct inode * inode = filp->f_dentry->d_inode; + struct inode * inode = filp->f_path.dentry->d_inode; int __user *p = (int __user *)arg; switch (cmd) { @@ -137,17 +137,17 @@ int 
vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd, unsigned lon break; case FIOQSIZE: - if (S_ISDIR(filp->f_dentry->d_inode->i_mode) || - S_ISREG(filp->f_dentry->d_inode->i_mode) || - S_ISLNK(filp->f_dentry->d_inode->i_mode)) { - loff_t res = inode_get_bytes(filp->f_dentry->d_inode); + if (S_ISDIR(filp->f_path.dentry->d_inode->i_mode) || + S_ISREG(filp->f_path.dentry->d_inode->i_mode) || + S_ISLNK(filp->f_path.dentry->d_inode->i_mode)) { + loff_t res = inode_get_bytes(filp->f_path.dentry->d_inode); error = copy_to_user((loff_t __user *)arg, &res, sizeof(res)) ? -EFAULT : 0; } else error = -ENOTTY; break; default: - if (S_ISREG(filp->f_dentry->d_inode->i_mode)) + if (S_ISREG(filp->f_path.dentry->d_inode->i_mode)) error = file_ioctl(filp, cmd, arg); else error = do_ioctl(filp, cmd, arg); diff --git a/fs/isofs/compress.c b/fs/isofs/compress.c index 731816332b1..6bbbdb53581 100644 --- a/fs/isofs/compress.c +++ b/fs/isofs/compress.c @@ -42,7 +42,7 @@ static struct semaphore zisofs_zlib_semaphore; */ static int zisofs_readpage(struct file *file, struct page *page) { - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file->f_path.dentry->d_inode; struct address_space *mapping = inode->i_mapping; unsigned int maxpage, xpage, fpage, blockindex; unsigned long offset; diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c index 27e276987fd..4af2548f97a 100644 --- a/fs/isofs/dir.c +++ b/fs/isofs/dir.c @@ -183,7 +183,7 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp, /* Handle the case of the '..' 
directory */ if (de->name_len[0] == 1 && de->name[0] == 1) { - inode_number = parent_ino(filp->f_dentry); + inode_number = parent_ino(filp->f_path.dentry); if (filldir(dirent, "..", 2, filp->f_pos, inode_number, DT_DIR) < 0) break; filp->f_pos += de_len; @@ -255,8 +255,7 @@ static int isofs_readdir(struct file *filp, int result; char * tmpname; struct iso_directory_record * tmpde; - struct inode *inode = filp->f_dentry->d_inode; - + struct inode *inode = filp->f_path.dentry->d_inode; tmpname = (char *)__get_free_page(GFP_KERNEL); if (tmpname == NULL) diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index 10be51290a2..be4648bc7a2 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -248,8 +248,12 @@ write_out_data: bufs = 0; goto write_out_data; } - } - else { + } else if (!locked && buffer_locked(bh)) { + __journal_file_buffer(jh, commit_transaction, + BJ_Locked); + jbd_unlock_bh_state(bh); + put_bh(bh); + } else { BUFFER_TRACE(bh, "writeout complete: unfile"); __journal_unfile_buffer(jh); jbd_unlock_bh_state(bh); diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index d38e0d575e4..cceaf57e377 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -55,7 +55,7 @@ get_transaction(journal_t *journal, transaction_t *transaction) spin_lock_init(&transaction->t_handle_lock); /* Set up the commit timer for the new transaction. 
*/ - journal->j_commit_timer.expires = transaction->t_expires; + journal->j_commit_timer.expires = round_jiffies(transaction->t_expires); add_timer(&journal->j_commit_timer); J_ASSERT(journal->j_running_transaction == NULL); diff --git a/fs/jffs/inode-v23.c b/fs/jffs/inode-v23.c index 9f15bce9202..43baa1afa02 100644 --- a/fs/jffs/inode-v23.c +++ b/fs/jffs/inode-v23.c @@ -566,7 +566,7 @@ static int jffs_readdir(struct file *filp, void *dirent, filldir_t filldir) { struct jffs_file *f; - struct dentry *dentry = filp->f_dentry; + struct dentry *dentry = filp->f_path.dentry; struct inode *inode = dentry->d_inode; struct jffs_control *c = (struct jffs_control *)inode->i_sb->s_fs_info; int j; @@ -818,7 +818,7 @@ jffs_mkdir(struct inode *dir, struct dentry *dentry, int mode) D1({ int len = dentry->d_name.len; - char *_name = (char *) kmalloc(len + 1, GFP_KERNEL); + char *_name = kmalloc(len + 1, GFP_KERNEL); memcpy(_name, dentry->d_name.name, len); _name[len] = '\0'; printk("***jffs_mkdir(): dir = 0x%p, name = \"%s\", " @@ -964,7 +964,7 @@ jffs_remove(struct inode *dir, struct dentry *dentry, int type) D1({ int len = dentry->d_name.len; const char *name = dentry->d_name.name; - char *_name = (char *) kmalloc(len + 1, GFP_KERNEL); + char *_name = kmalloc(len + 1, GFP_KERNEL); memcpy(_name, name, len); _name[len] = '\0'; printk("***jffs_remove(): file = \"%s\", ino = %ld\n", _name, dentry->d_inode->i_ino); @@ -1372,7 +1372,7 @@ jffs_file_write(struct file *filp, const char *buf, size_t count, struct jffs_control *c; struct jffs_file *f; struct jffs_node *node; - struct dentry *dentry = filp->f_dentry; + struct dentry *dentry = filp->f_path.dentry; struct inode *inode = dentry->d_inode; int recoverable = 0; size_t written = 0; @@ -1380,7 +1380,7 @@ jffs_file_write(struct file *filp, const char *buf, size_t count, loff_t pos = *ppos; int err; - inode = filp->f_dentry->d_inode; + inode = filp->f_path.dentry->d_inode; D2(printk("***jffs_file_write(): inode: 0x%p (ino: %lu), " 
"filp: 0x%p, buf: 0x%p, count: %d\n", diff --git a/fs/jffs/intrep.c b/fs/jffs/intrep.c index d0e783f199e..6dd18911b44 100644 --- a/fs/jffs/intrep.c +++ b/fs/jffs/intrep.c @@ -436,7 +436,7 @@ jffs_checksum_flash(struct mtd_info *mtd, loff_t start, int size, __u32 *result) int i, length; /* Allocate read buffer */ - read_buf = (__u8 *) kmalloc (sizeof(__u8) * 4096, GFP_KERNEL); + read_buf = kmalloc(sizeof(__u8) * 4096, GFP_KERNEL); if (!read_buf) { printk(KERN_NOTICE "kmalloc failed in jffs_checksum_flash()\n"); return -ENOMEM; @@ -744,11 +744,11 @@ static int check_partly_erased_sectors(struct jffs_fmcontrol *fmc){ /* Allocate read buffers */ - read_buf1 = (__u8 *) kmalloc (sizeof(__u8) * READ_AHEAD_BYTES, GFP_KERNEL); + read_buf1 = kmalloc(sizeof(__u8) * READ_AHEAD_BYTES, GFP_KERNEL); if (!read_buf1) return -ENOMEM; - read_buf2 = (__u8 *) kmalloc (sizeof(__u8) * READ_AHEAD_BYTES, GFP_KERNEL); + read_buf2 = kmalloc(sizeof(__u8) * READ_AHEAD_BYTES, GFP_KERNEL); if (!read_buf2) { kfree(read_buf1); return -ENOMEM; @@ -876,7 +876,7 @@ jffs_scan_flash(struct jffs_control *c) } /* Allocate read buffer */ - read_buf = (__u8 *) kmalloc (sizeof(__u8) * 4096, GFP_KERNEL); + read_buf = kmalloc(sizeof(__u8) * 4096, GFP_KERNEL); if (!read_buf) { flash_safe_release(fmc->mtd); return -ENOMEM; @@ -1463,7 +1463,7 @@ jffs_insert_node(struct jffs_control *c, struct jffs_file *f, kfree(f->name); DJM(no_name--); } - if (!(f->name = (char *) kmalloc(raw_inode->nsize + 1, + if (!(f->name = kmalloc(raw_inode->nsize + 1, GFP_KERNEL))) { return -ENOMEM; } @@ -1737,7 +1737,7 @@ jffs_find_child(struct jffs_file *dir, const char *name, int len) printk("jffs_find_child(): Found \"%s\".\n", f->name); } else { - char *copy = (char *) kmalloc(len + 1, GFP_KERNEL); + char *copy = kmalloc(len + 1, GFP_KERNEL); if (copy) { memcpy(copy, name, len); copy[len] = '\0'; @@ -2627,7 +2627,7 @@ jffs_print_tree(struct jffs_file *first_file, int indent) return; } - if (!(space = (char *) kmalloc(indent + 1, 
GFP_KERNEL))) { + if (!(space = kmalloc(indent + 1, GFP_KERNEL))) { printk("jffs_print_tree(): Out of memory!\n"); return; } diff --git a/fs/jffs/jffs_fm.c b/fs/jffs/jffs_fm.c index 077258b2103..5a95fbdd6fd 100644 --- a/fs/jffs/jffs_fm.c +++ b/fs/jffs/jffs_fm.c @@ -17,6 +17,7 @@ * */ #include <linux/slab.h> +#include <linux/err.h> #include <linux/blkdev.h> #include <linux/jffs.h> #include "jffs_fm.h" @@ -104,7 +105,7 @@ jffs_build_begin(struct jffs_control *c, int unit) mtd = get_mtd_device(NULL, unit); - if (!mtd) { + if (IS_ERR(mtd)) { kfree(fmc); DJM(no_jffs_fmcontrol--); return NULL; diff --git a/fs/jffs2/debug.c b/fs/jffs2/debug.c index 72b4fc13a10..4189e4a3605 100644 --- a/fs/jffs2/debug.c +++ b/fs/jffs2/debug.c @@ -178,8 +178,8 @@ __jffs2_dbg_acct_paranoia_check_nolock(struct jffs2_sb_info *c, while (ref2) { uint32_t totlen = ref_totlen(c, jeb, ref2); - if (ref2->flash_offset < jeb->offset || - ref2->flash_offset > jeb->offset + c->sector_size) { + if (ref_offset(ref2) < jeb->offset || + ref_offset(ref2) > jeb->offset + c->sector_size) { JFFS2_ERROR("node_ref %#08x shouldn't be in block at %#08x.\n", ref_offset(ref2), jeb->offset); goto error; diff --git a/fs/jffs2/debug.h b/fs/jffs2/debug.h index 3daf3bca037..f89c85d5a3f 100644 --- a/fs/jffs2/debug.h +++ b/fs/jffs2/debug.h @@ -13,6 +13,7 @@ #ifndef _JFFS2_DEBUG_H_ #define _JFFS2_DEBUG_H_ +#include <linux/sched.h> #ifndef CONFIG_JFFS2_FS_DEBUG #define CONFIG_JFFS2_FS_DEBUG 0 diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index 9def6adf4a5..da6034d5071 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -123,11 +123,11 @@ static int jffs2_readdir(struct file *filp, void *dirent, filldir_t filldir) { struct jffs2_inode_info *f; struct jffs2_sb_info *c; - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = filp->f_path.dentry->d_inode; struct jffs2_full_dirent *fd; unsigned long offset, curofs; - D1(printk(KERN_DEBUG "jffs2_readdir() for dir_i #%lu\n", filp->f_dentry->d_inode->i_ino)); + 
D1(printk(KERN_DEBUG "jffs2_readdir() for dir_i #%lu\n", filp->f_path.dentry->d_inode->i_ino)); f = JFFS2_INODE_INFO(inode); c = JFFS2_SB_INFO(inode->i_sb); @@ -141,7 +141,7 @@ static int jffs2_readdir(struct file *filp, void *dirent, filldir_t filldir) offset++; } if (offset == 1) { - unsigned long pino = parent_ino(filp->f_dentry); + unsigned long pino = parent_ino(filp->f_path.dentry); D1(printk(KERN_DEBUG "Dirent 1: \"..\", ino #%lu\n", pino)); if (filldir(dirent, "..", 2, 1, pino, DT_DIR) < 0) goto out; diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index 7bc1a4201c0..abb90c0c09c 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -502,12 +502,11 @@ int jffs2_do_fill_super(struct super_block *sb, void *data, int silent) if (ret) return ret; - c->inocache_list = kmalloc(INOCACHE_HASHSIZE * sizeof(struct jffs2_inode_cache *), GFP_KERNEL); + c->inocache_list = kcalloc(INOCACHE_HASHSIZE, sizeof(struct jffs2_inode_cache *), GFP_KERNEL); if (!c->inocache_list) { ret = -ENOMEM; goto out_wbuf; } - memset(c->inocache_list, 0, INOCACHE_HASHSIZE * sizeof(struct jffs2_inode_cache *)); jffs2_init_xattr_subsystem(c); diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c index daff3341ff9..3a3cf225981 100644 --- a/fs/jffs2/gc.c +++ b/fs/jffs2/gc.c @@ -838,6 +838,8 @@ static int jffs2_garbage_collect_deletion_dirent(struct jffs2_sb_info *c, struct for (raw = f->inocache->nodes; raw != (void *)f->inocache; raw = raw->next_in_ino) { + cond_resched(); + /* We only care about obsolete ones */ if (!(ref_obsolete(raw))) continue; diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h index 0ddfd70307f..4178b4b5594 100644 --- a/fs/jffs2/nodelist.h +++ b/fs/jffs2/nodelist.h @@ -294,23 +294,21 @@ static inline int jffs2_encode_dev(union jffs2_device_node *jdev, dev_t rdev) static inline struct jffs2_node_frag *frag_first(struct rb_root *root) { - struct rb_node *node = root->rb_node; + struct rb_node *node = rb_first(root); if (!node) return NULL; - while(node->rb_left) - node = node->rb_left; + 
return rb_entry(node, struct jffs2_node_frag, rb); } static inline struct jffs2_node_frag *frag_last(struct rb_root *root) { - struct rb_node *node = root->rb_node; + struct rb_node *node = rb_last(root); if (!node) return NULL; - while(node->rb_right) - node = node->rb_right; + return rb_entry(node, struct jffs2_node_frag, rb); } diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c index 266423b2709..58a0b912e9d 100644 --- a/fs/jffs2/readinode.c +++ b/fs/jffs2/readinode.c @@ -944,13 +944,12 @@ int jffs2_do_read_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, int jffs2_do_crccheck_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic) { struct jffs2_raw_inode n; - struct jffs2_inode_info *f = kmalloc(sizeof(*f), GFP_KERNEL); + struct jffs2_inode_info *f = kzalloc(sizeof(*f), GFP_KERNEL); int ret; if (!f) return -ENOMEM; - memset(f, 0, sizeof(*f)); init_MUTEX_LOCKED(&f->sem); f->inocache = ic; diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c index e2413466ddd..3af746eaff0 100644 --- a/fs/jffs2/scan.c +++ b/fs/jffs2/scan.c @@ -128,17 +128,19 @@ int jffs2_scan_medium(struct jffs2_sb_info *c) } if (jffs2_sum_active()) { - s = kmalloc(sizeof(struct jffs2_summary), GFP_KERNEL); + s = kzalloc(sizeof(struct jffs2_summary), GFP_KERNEL); if (!s) { + kfree(flashbuf); JFFS2_WARNING("Can't allocate memory for summary\n"); return -ENOMEM; } - memset(s, 0, sizeof(struct jffs2_summary)); } for (i=0; i<c->nr_blocks; i++) { struct jffs2_eraseblock *jeb = &c->blocks[i]; + cond_resched(); + /* reset summary info for next eraseblock scan */ jffs2_sum_reset_collected(s); diff --git a/fs/jffs2/summary.c b/fs/jffs2/summary.c index e52cef526d9..25265965bdc 100644 --- a/fs/jffs2/summary.c +++ b/fs/jffs2/summary.c @@ -26,15 +26,13 @@ int jffs2_sum_init(struct jffs2_sb_info *c) { - c->summary = kmalloc(sizeof(struct jffs2_summary), GFP_KERNEL); + c->summary = kzalloc(sizeof(struct jffs2_summary), GFP_KERNEL); if (!c->summary) { JFFS2_WARNING("Can't allocate memory for 
summary information!\n"); return -ENOMEM; } - memset(c->summary, 0, sizeof(struct jffs2_summary)); - c->summary->sum_buf = vmalloc(c->sector_size); if (!c->summary->sum_buf) { @@ -398,6 +396,8 @@ static int jffs2_sum_process_sum_data(struct jffs2_sb_info *c, struct jffs2_eras for (i=0; i<je32_to_cpu(summary->sum_num); i++) { dbg_summary("processing summary index %d\n", i); + cond_resched(); + /* Make sure there's a spare ref for dirty space */ err = jffs2_prealloc_raw_node_refs(c, jeb, 2); if (err) diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index 7deb7825402..08a0e6c49e6 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -17,6 +17,7 @@ #include <linux/init.h> #include <linux/list.h> #include <linux/fs.h> +#include <linux/err.h> #include <linux/mount.h> #include <linux/jffs2.h> #include <linux/pagemap.h> @@ -184,9 +185,9 @@ static int jffs2_get_sb_mtdnr(struct file_system_type *fs_type, struct mtd_info *mtd; mtd = get_mtd_device(NULL, mtdnr); - if (!mtd) { + if (IS_ERR(mtd)) { D1(printk(KERN_DEBUG "jffs2: MTD device #%u doesn't appear to exist\n", mtdnr)); - return -EINVAL; + return PTR_ERR(mtd); } return jffs2_get_sb_mtd(fs_type, flags, dev_name, data, mtd, mnt); @@ -221,7 +222,7 @@ static int jffs2_get_sb(struct file_system_type *fs_type, D1(printk(KERN_DEBUG "jffs2_get_sb(): mtd:%%s, name \"%s\"\n", dev_name+4)); for (mtdnr = 0; mtdnr < MAX_MTD_DEVICES; mtdnr++) { mtd = get_mtd_device(NULL, mtdnr); - if (mtd) { + if (!IS_ERR(mtd)) { if (!strcmp(mtd->name, dev_name+4)) return jffs2_get_sb_mtd(fs_type, flags, dev_name, data, mtd, mnt); put_mtd_device(mtd); diff --git a/fs/jffs2/symlink.c b/fs/jffs2/symlink.c index fc211b6e9b0..b90d5aa3d96 100644 --- a/fs/jffs2/symlink.c +++ b/fs/jffs2/symlink.c @@ -51,7 +51,7 @@ static void *jffs2_follow_link(struct dentry *dentry, struct nameidata *nd) */ if (!p) { - printk(KERN_ERR "jffs2_follow_link(): can't find symlink taerget\n"); + printk(KERN_ERR "jffs2_follow_link(): can't find symlink target\n"); p = 
ERR_PTR(-EIO); } D1(printk(KERN_DEBUG "jffs2_follow_link(): target path is '%s'\n", (char *) f->target)); diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c index 70707309dfa..9c99859f5ed 100644 --- a/fs/jffs2/wbuf.c +++ b/fs/jffs2/wbuf.c @@ -969,8 +969,7 @@ int jffs2_check_oob_empty(struct jffs2_sb_info *c, int oobsize = c->mtd->oobsize; struct mtd_oob_ops ops; - ops.len = NR_OOB_SCAN_PAGES * oobsize; - ops.ooblen = oobsize; + ops.ooblen = NR_OOB_SCAN_PAGES * oobsize; ops.oobbuf = c->oobbuf; ops.ooboffs = 0; ops.datbuf = NULL; @@ -983,10 +982,10 @@ int jffs2_check_oob_empty(struct jffs2_sb_info *c, return ret; } - if (ops.retlen < ops.len) { + if (ops.oobretlen < ops.ooblen) { D1(printk(KERN_WARNING "jffs2_check_oob_empty(): Read OOB " "returned short read (%zd bytes not %d) for block " - "at %08x\n", ops.retlen, ops.len, jeb->offset)); + "at %08x\n", ops.oobretlen, ops.ooblen, jeb->offset)); return -EIO; } @@ -1005,7 +1004,7 @@ int jffs2_check_oob_empty(struct jffs2_sb_info *c, } /* we know, we are aligned :) */ - for (page = oobsize; page < ops.len; page += sizeof(long)) { + for (page = oobsize; page < ops.ooblen; page += sizeof(long)) { long dat = *(long *)(&ops.oobbuf[page]); if(dat != -1) return 1; @@ -1033,7 +1032,6 @@ int jffs2_check_nand_cleanmarker (struct jffs2_sb_info *c, return 2; } - ops.len = oobsize; ops.ooblen = oobsize; ops.oobbuf = c->oobbuf; ops.ooboffs = 0; @@ -1048,10 +1046,10 @@ int jffs2_check_nand_cleanmarker (struct jffs2_sb_info *c, return ret; } - if (ops.retlen < ops.len) { + if (ops.oobretlen < ops.ooblen) { D1 (printk (KERN_WARNING "jffs2_check_nand_cleanmarker(): " "Read OOB return short read (%zd bytes not %d) " - "for block at %08x\n", ops.retlen, ops.len, + "for block at %08x\n", ops.oobretlen, ops.ooblen, jeb->offset)); return -EIO; } @@ -1090,8 +1088,7 @@ int jffs2_write_nand_cleanmarker(struct jffs2_sb_info *c, n.nodetype = cpu_to_je16(JFFS2_NODETYPE_CLEANMARKER); n.totlen = cpu_to_je32(8); - ops.len = c->fsdata_len; - ops.ooblen = 
c->fsdata_len;; + ops.ooblen = c->fsdata_len; ops.oobbuf = (uint8_t *)&n; ops.ooboffs = c->fsdata_pos; ops.datbuf = NULL; @@ -1105,10 +1102,10 @@ int jffs2_write_nand_cleanmarker(struct jffs2_sb_info *c, jeb->offset, ret)); return ret; } - if (ops.retlen != ops.len) { + if (ops.oobretlen != ops.ooblen) { D1(printk(KERN_WARNING "jffs2_write_nand_cleanmarker(): " "Short write for block at %08x: %zd not %d\n", - jeb->offset, ops.retlen, ops.len)); + jeb->offset, ops.oobretlen, ops.ooblen)); return -EIO; } return 0; diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c index 4da09ce1d1f..4bb3f189733 100644 --- a/fs/jffs2/xattr.c +++ b/fs/jffs2/xattr.c @@ -399,8 +399,6 @@ static void unrefer_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datu { /* must be called under down_write(xattr_sem) */ if (atomic_dec_and_lock(&xd->refcnt, &c->erase_completion_lock)) { - uint32_t xid = xd->xid, version = xd->version; - unload_xattr_datum(c, xd); xd->flags |= JFFS2_XFLAGS_DEAD; if (xd->node == (void *)xd) { @@ -411,7 +409,8 @@ static void unrefer_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datu } spin_unlock(&c->erase_completion_lock); - dbg_xattr("xdatum(xid=%u, version=%u) was removed.\n", xid, version); + dbg_xattr("xdatum(xid=%u, version=%u) was removed.\n", + xd->xid, xd->version); } } diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index f5719117edf..e285022f006 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -182,9 +182,9 @@ int jfs_get_block(struct inode *ip, sector_t lblock, * Take appropriate lock on inode */ if (create) - IWRITE_LOCK(ip); + IWRITE_LOCK(ip, RDWRLOCK_NORMAL); else - IREAD_LOCK(ip); + IREAD_LOCK(ip, RDWRLOCK_NORMAL); if (((lblock64 << ip->i_sb->s_blocksize_bits) < ip->i_size) && (!xtLookup(ip, lblock64, xlen, &xflag, &xaddr, &xlen, 0)) && @@ -359,7 +359,7 @@ void jfs_truncate(struct inode *ip) nobh_truncate_page(ip->i_mapping, ip->i_size); - IWRITE_LOCK(ip); + IWRITE_LOCK(ip, RDWRLOCK_NORMAL); jfs_truncate_nolock(ip, ip->i_size); 
IWRITE_UNLOCK(ip); } diff --git a/fs/jfs/jfs_debug.h b/fs/jfs/jfs_debug.h index ddffbbd4d95..7378798f0b2 100644 --- a/fs/jfs/jfs_debug.h +++ b/fs/jfs/jfs_debug.h @@ -39,10 +39,6 @@ extern void jfs_proc_clean(void); /* * assert with traditional printf/panic */ -#ifdef CONFIG_KERNEL_ASSERTS -/* kgdb stuff */ -#define assert(p) KERNEL_ASSERT(#p, p) -#else #define assert(p) do { \ if (!(p)) { \ printk(KERN_CRIT "BUG at %s:%d assert(%s)\n", \ @@ -50,7 +46,6 @@ extern void jfs_proc_clean(void); BUG(); \ } \ } while (0) -#endif /* * debug ON diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c index 23546c8fd48..82b0544bd76 100644 --- a/fs/jfs/jfs_dmap.c +++ b/fs/jfs/jfs_dmap.c @@ -337,7 +337,7 @@ int dbFree(struct inode *ip, s64 blkno, s64 nblocks) struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap; struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap; - IREAD_LOCK(ipbmap); + IREAD_LOCK(ipbmap, RDWRLOCK_DMAP); /* block to be freed better be within the mapsize. */ if (unlikely((blkno == 0) || (blkno + nblocks > bmp->db_mapsize))) { @@ -733,7 +733,7 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results) * allocation group size, try to allocate anywhere. */ if (l2nb > bmp->db_agl2size) { - IWRITE_LOCK(ipbmap); + IWRITE_LOCK(ipbmap, RDWRLOCK_DMAP); rc = dbAllocAny(bmp, nblocks, l2nb, results); @@ -774,7 +774,7 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results) * the hint using a tiered strategy. */ if (nblocks <= BPERDMAP) { - IREAD_LOCK(ipbmap); + IREAD_LOCK(ipbmap, RDWRLOCK_DMAP); /* get the buffer for the dmap containing the hint. */ @@ -844,7 +844,7 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results) /* try to satisfy the allocation request with blocks within * the same allocation group as the hint. 
*/ - IWRITE_LOCK(ipbmap); + IWRITE_LOCK(ipbmap, RDWRLOCK_DMAP); if ((rc = dbAllocAG(bmp, agno, nblocks, l2nb, results)) != -ENOSPC) goto write_unlock; @@ -856,7 +856,7 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results) * Let dbNextAG recommend a preferred allocation group */ agno = dbNextAG(ipbmap); - IWRITE_LOCK(ipbmap); + IWRITE_LOCK(ipbmap, RDWRLOCK_DMAP); /* Try to allocate within this allocation group. if that fails, try to * allocate anywhere in the map. @@ -900,7 +900,7 @@ int dbAllocExact(struct inode *ip, s64 blkno, int nblocks) s64 lblkno; struct metapage *mp; - IREAD_LOCK(ipbmap); + IREAD_LOCK(ipbmap, RDWRLOCK_DMAP); /* * validate extent request: @@ -1050,7 +1050,7 @@ static int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks) */ extblkno = lastblkno + 1; - IREAD_LOCK(ipbmap); + IREAD_LOCK(ipbmap, RDWRLOCK_DMAP); /* better be within the file system */ bmp = sbi->bmap; @@ -3116,7 +3116,7 @@ int dbAllocBottomUp(struct inode *ip, s64 blkno, s64 nblocks) struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap; struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap; - IREAD_LOCK(ipbmap); + IREAD_LOCK(ipbmap, RDWRLOCK_DMAP); /* block to be allocated better be within the mapsize. 
*/ ASSERT(nblocks <= bmp->db_mapsize - blkno); diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c index ecb2216d881..6d62f322289 100644 --- a/fs/jfs/jfs_dtree.c +++ b/fs/jfs/jfs_dtree.c @@ -3009,7 +3009,7 @@ static inline struct jfs_dirent *next_jfs_dirent(struct jfs_dirent *dirent) */ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir) { - struct inode *ip = filp->f_dentry->d_inode; + struct inode *ip = filp->f_path.dentry->d_inode; struct nls_table *codepage = JFS_SBI(ip->i_sb)->nls_tab; int rc = 0; loff_t dtpos; /* legacy OS/2 style position */ @@ -3777,12 +3777,12 @@ static int ciGetLeafPrefixKey(dtpage_t * lp, int li, dtpage_t * rp, struct component_name lkey; struct component_name rkey; - lkey.name = (wchar_t *) kmalloc((JFS_NAME_MAX + 1) * sizeof(wchar_t), + lkey.name = kmalloc((JFS_NAME_MAX + 1) * sizeof(wchar_t), GFP_KERNEL); if (lkey.name == NULL) return -ENOMEM; - rkey.name = (wchar_t *) kmalloc((JFS_NAME_MAX + 1) * sizeof(wchar_t), + rkey.name = kmalloc((JFS_NAME_MAX + 1) * sizeof(wchar_t), GFP_KERNEL); if (rkey.name == NULL) { kfree(lkey.name); diff --git a/fs/jfs/jfs_filsys.h b/fs/jfs/jfs_filsys.h index eb550b339bb..38f70ac03be 100644 --- a/fs/jfs/jfs_filsys.h +++ b/fs/jfs/jfs_filsys.h @@ -29,31 +29,21 @@ /* * file system option (superblock flag) */ -/* mount time flag to disable journaling to disk */ -#define JFS_NOINTEGRITY 0x00000010 + +/* directory option */ +#define JFS_UNICODE 0x00000001 /* unicode name */ /* mount time flags for error handling */ #define JFS_ERR_REMOUNT_RO 0x00000002 /* remount read-only */ #define JFS_ERR_CONTINUE 0x00000004 /* continue */ #define JFS_ERR_PANIC 0x00000008 /* panic */ +/* Quota support */ #define JFS_USRQUOTA 0x00000010 #define JFS_GRPQUOTA 0x00000020 -/* platform option (conditional compilation) */ -#define JFS_AIX 0x80000000 /* AIX support */ -/* POSIX name/directory support */ - -#define JFS_OS2 0x40000000 /* OS/2 support */ -/* case-insensitive name/directory support */ - -#define JFS_DFS 
0x20000000 /* DCE DFS LFS support */ - -#define JFS_LINUX 0x10000000 /* Linux support */ -/* case-sensitive name/directory support */ - -/* directory option */ -#define JFS_UNICODE 0x00000001 /* unicode name */ +/* mount time flag to disable journaling to disk */ +#define JFS_NOINTEGRITY 0x00000040 /* commit option */ #define JFS_COMMIT 0x00000f00 /* commit option mask */ @@ -61,6 +51,7 @@ #define JFS_LAZYCOMMIT 0x00000200 /* lazy commit */ #define JFS_TMPFS 0x00000400 /* temporary file system - * do not log/commit: + * Never implemented */ /* log logical volume option */ @@ -74,16 +65,25 @@ #define JFS_SPARSE 0x00020000 /* sparse regular file */ /* DASD Limits F226941 */ -#define JFS_DASD_ENABLED 0x00040000 /* DASD limits enabled */ -#define JFS_DASD_PRIME 0x00080000 /* Prime DASD usage on boot */ +#define JFS_DASD_ENABLED 0x00040000 /* DASD limits enabled */ +#define JFS_DASD_PRIME 0x00080000 /* Prime DASD usage on boot */ /* big endian flag */ -#define JFS_SWAP_BYTES 0x00100000 /* running on big endian computer */ +#define JFS_SWAP_BYTES 0x00100000 /* running on big endian computer */ /* Directory index */ -#define JFS_DIR_INDEX 0x00200000 /* Persistent index for */ - /* directory entries */ +#define JFS_DIR_INDEX 0x00200000 /* Persistent index for */ +/* platform options */ +#define JFS_LINUX 0x10000000 /* Linux support */ +#define JFS_DFS 0x20000000 /* DCE DFS LFS support */ +/* Never implemented */ + +#define JFS_OS2 0x40000000 /* OS/2 support */ +/* case-insensitive name/directory support */ + +#define JFS_AIX 0x80000000 /* AIX support */ +/* POSIX name/directory support - Never implemented*/ /* * buffer cache configuration diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c index ee9b473b7b8..aa5124b643b 100644 --- a/fs/jfs/jfs_imap.c +++ b/fs/jfs/jfs_imap.c @@ -120,7 +120,7 @@ int diMount(struct inode *ipimap) * allocate/initialize the in-memory inode map control structure */ /* allocate the in-memory inode map control structure. 
*/ - imap = (struct inomap *) kmalloc(sizeof(struct inomap), GFP_KERNEL); + imap = kmalloc(sizeof(struct inomap), GFP_KERNEL); if (imap == NULL) { jfs_err("diMount: kmalloc returned NULL!"); return -ENOMEM; @@ -331,7 +331,7 @@ int diRead(struct inode *ip) /* read the iag */ imap = JFS_IP(ipimap)->i_imap; - IREAD_LOCK(ipimap); + IREAD_LOCK(ipimap, RDWRLOCK_IMAP); rc = diIAGRead(imap, iagno, &mp); IREAD_UNLOCK(ipimap); if (rc) { @@ -920,7 +920,7 @@ int diFree(struct inode *ip) /* Obtain read lock in imap inode. Don't release it until we have * read all of the IAG's that we are going to. */ - IREAD_LOCK(ipimap); + IREAD_LOCK(ipimap, RDWRLOCK_IMAP); /* read the iag. */ @@ -1415,7 +1415,7 @@ int diAlloc(struct inode *pip, bool dir, struct inode *ip) AG_LOCK(imap, agno); /* Get read lock on imap inode */ - IREAD_LOCK(ipimap); + IREAD_LOCK(ipimap, RDWRLOCK_IMAP); /* get the iag number and read the iag */ iagno = INOTOIAG(inum); @@ -1808,7 +1808,7 @@ static int diAllocIno(struct inomap * imap, int agno, struct inode *ip) return -ENOSPC; /* obtain read lock on imap inode */ - IREAD_LOCK(imap->im_ipimap); + IREAD_LOCK(imap->im_ipimap, RDWRLOCK_IMAP); /* read the iag at the head of the list. */ @@ -1946,7 +1946,7 @@ static int diAllocExt(struct inomap * imap, int agno, struct inode *ip) } else { /* read the iag. 
*/ - IREAD_LOCK(imap->im_ipimap); + IREAD_LOCK(imap->im_ipimap, RDWRLOCK_IMAP); if ((rc = diIAGRead(imap, iagno, &mp))) { IREAD_UNLOCK(imap->im_ipimap); jfs_error(ip->i_sb, "diAllocExt: error reading iag"); @@ -2509,7 +2509,7 @@ diNewIAG(struct inomap * imap, int *iagnop, int agno, struct metapage ** mpp) */ /* acquire inode map lock */ - IWRITE_LOCK(ipimap); + IWRITE_LOCK(ipimap, RDWRLOCK_IMAP); if (ipimap->i_size >> L2PSIZE != imap->im_nextiag + 1) { IWRITE_UNLOCK(ipimap); @@ -2648,7 +2648,7 @@ diNewIAG(struct inomap * imap, int *iagnop, int agno, struct metapage ** mpp) } /* obtain read lock on map */ - IREAD_LOCK(ipimap); + IREAD_LOCK(ipimap, RDWRLOCK_IMAP); /* read the iag */ if ((rc = diIAGRead(imap, iagno, &mp))) { @@ -2779,7 +2779,7 @@ diUpdatePMap(struct inode *ipimap, return -EIO; } /* read the iag */ - IREAD_LOCK(ipimap); + IREAD_LOCK(ipimap, RDWRLOCK_IMAP); rc = diIAGRead(imap, iagno, &mp); IREAD_UNLOCK(ipimap); if (rc) diff --git a/fs/jfs/jfs_incore.h b/fs/jfs/jfs_incore.h index 94005584445..8f453eff3c8 100644 --- a/fs/jfs/jfs_incore.h +++ b/fs/jfs/jfs_incore.h @@ -109,9 +109,11 @@ struct jfs_inode_info { #define JFS_ACL_NOT_CACHED ((void *)-1) -#define IREAD_LOCK(ip) down_read(&JFS_IP(ip)->rdwrlock) +#define IREAD_LOCK(ip, subclass) \ + down_read_nested(&JFS_IP(ip)->rdwrlock, subclass) #define IREAD_UNLOCK(ip) up_read(&JFS_IP(ip)->rdwrlock) -#define IWRITE_LOCK(ip) down_write(&JFS_IP(ip)->rdwrlock) +#define IWRITE_LOCK(ip, subclass) \ + down_write_nested(&JFS_IP(ip)->rdwrlock, subclass) #define IWRITE_UNLOCK(ip) up_write(&JFS_IP(ip)->rdwrlock) /* @@ -127,6 +129,29 @@ enum cflags { COMMIT_Synclist, /* metadata pages on group commit synclist */ }; +/* + * commit_mutex nesting subclasses: + */ +enum commit_mutex_class +{ + COMMIT_MUTEX_PARENT, + COMMIT_MUTEX_CHILD, + COMMIT_MUTEX_SECOND_PARENT, /* Renaming */ + COMMIT_MUTEX_VICTIM /* Inode being unlinked due to rename */ +}; + +/* + * rdwrlock subclasses: + * The dmap inode may be locked while a normal 
inode or the imap inode are + * locked. + */ +enum rdwrlock_class +{ + RDWRLOCK_NORMAL, + RDWRLOCK_IMAP, + RDWRLOCK_DMAP +}; + #define set_cflag(flag, ip) set_bit(flag, &(JFS_IP(ip)->cflag)) #define clear_cflag(flag, ip) clear_bit(flag, &(JFS_IP(ip)->cflag)) #define test_cflag(flag, ip) test_bit(flag, &(JFS_IP(ip)->cflag)) diff --git a/fs/jfs/jfs_lock.h b/fs/jfs/jfs_lock.h index 7d78e83d7c4..df48ece4b7a 100644 --- a/fs/jfs/jfs_lock.h +++ b/fs/jfs/jfs_lock.h @@ -42,7 +42,7 @@ do { \ if (cond) \ break; \ unlock_cmd; \ - schedule(); \ + io_schedule(); \ lock_cmd; \ } \ current->state = TASK_RUNNING; \ diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index b1a1c729601..58deae00750 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -56,7 +56,7 @@ static inline void __lock_metapage(struct metapage *mp) set_current_state(TASK_UNINTERRUPTIBLE); if (metapage_locked(mp)) { unlock_page(mp->page); - schedule(); + io_schedule(); lock_page(mp->page); } } while (trylock_metapage(mp)); @@ -764,22 +764,9 @@ void release_metapage(struct metapage * mp) } else if (mp->lsn) /* discard_metapage doesn't remove it */ remove_from_logsync(mp); -#if MPS_PER_PAGE == 1 - /* - * If we know this is the only thing in the page, we can throw - * the page out of the page cache. If pages are larger, we - * don't want to do this. 
- */ - - /* Retest mp->count since we may have released page lock */ - if (test_bit(META_discard, &mp->flag) && !mp->count) { - clear_page_dirty(page); - ClearPageUptodate(page); - } -#else /* Try to keep metapages from using up too much memory */ drop_metapage(page, mp); -#endif + unlock_page(page); page_cache_release(page); } diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c index d558e51b0df..6988a1082f5 100644 --- a/fs/jfs/jfs_txnmgr.c +++ b/fs/jfs/jfs_txnmgr.c @@ -135,7 +135,7 @@ static inline void TXN_SLEEP_DROP_LOCK(wait_queue_head_t * event) add_wait_queue(event, &wait); set_current_state(TASK_UNINTERRUPTIBLE); TXN_UNLOCK(); - schedule(); + io_schedule(); current->state = TASK_RUNNING; remove_wait_queue(event, &wait); } diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c index e98eb03e531..acc97c46d8a 100644 --- a/fs/jfs/jfs_xtree.c +++ b/fs/jfs/jfs_xtree.c @@ -757,6 +757,11 @@ static int xtSearch(struct inode *ip, s64 xoff, s64 *nextp, nsplit = 0; /* push (bn, index) of the parent page/entry */ + if (BT_STACK_FULL(btstack)) { + jfs_error(ip->i_sb, "stack overrun in xtSearch!"); + XT_PUTPAGE(mp); + return -EIO; + } BT_PUSH(btstack, bn, index); /* get the child page block number */ @@ -3915,6 +3920,11 @@ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag) */ getChild: /* save current parent entry for the child page */ + if (BT_STACK_FULL(&btstack)) { + jfs_error(ip->i_sb, "stack overrun in xtTruncate!"); + XT_PUTPAGE(mp); + return -EIO; + } BT_PUSH(&btstack, bn, index); /* get child page */ @@ -4112,6 +4122,11 @@ s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size) */ getChild: /* save current parent entry for the child page */ + if (BT_STACK_FULL(&btstack)) { + jfs_error(ip->i_sb, "stack overrun in xtTruncate_pmap!"); + XT_PUTPAGE(mp); + return -EIO; + } BT_PUSH(&btstack, bn, index); /* get child page */ diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index a6a8c16c872..7ab47561b68 100644 --- a/fs/jfs/namei.c +++ 
b/fs/jfs/namei.c @@ -104,8 +104,8 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, int mode, tid = txBegin(dip->i_sb, 0); - mutex_lock(&JFS_IP(dip)->commit_mutex); - mutex_lock(&JFS_IP(ip)->commit_mutex); + mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT); + mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD); rc = jfs_init_acl(tid, ip, dip); if (rc) @@ -238,8 +238,8 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode) tid = txBegin(dip->i_sb, 0); - mutex_lock(&JFS_IP(dip)->commit_mutex); - mutex_lock(&JFS_IP(ip)->commit_mutex); + mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT); + mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD); rc = jfs_init_acl(tid, ip, dip); if (rc) @@ -365,8 +365,8 @@ static int jfs_rmdir(struct inode *dip, struct dentry *dentry) tid = txBegin(dip->i_sb, 0); - mutex_lock(&JFS_IP(dip)->commit_mutex); - mutex_lock(&JFS_IP(ip)->commit_mutex); + mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT); + mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD); iplist[0] = dip; iplist[1] = ip; @@ -483,12 +483,12 @@ static int jfs_unlink(struct inode *dip, struct dentry *dentry) if ((rc = get_UCSname(&dname, dentry))) goto out; - IWRITE_LOCK(ip); + IWRITE_LOCK(ip, RDWRLOCK_NORMAL); tid = txBegin(dip->i_sb, 0); - mutex_lock(&JFS_IP(dip)->commit_mutex); - mutex_lock(&JFS_IP(ip)->commit_mutex); + mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT); + mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD); iplist[0] = dip; iplist[1] = ip; @@ -802,8 +802,8 @@ static int jfs_link(struct dentry *old_dentry, tid = txBegin(ip->i_sb, 0); - mutex_lock(&JFS_IP(dir)->commit_mutex); - mutex_lock(&JFS_IP(ip)->commit_mutex); + mutex_lock_nested(&JFS_IP(dir)->commit_mutex, COMMIT_MUTEX_PARENT); + mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD); /* * scan parent directory for entry/freespace @@ -913,8 
+913,8 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry, tid = txBegin(dip->i_sb, 0); - mutex_lock(&JFS_IP(dip)->commit_mutex); - mutex_lock(&JFS_IP(ip)->commit_mutex); + mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT); + mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD); rc = jfs_init_security(tid, ip, dip); if (rc) @@ -1127,7 +1127,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out3; } } else if (new_ip) { - IWRITE_LOCK(new_ip); + IWRITE_LOCK(new_ip, RDWRLOCK_NORMAL); /* Init inode for quota operations. */ DQUOT_INIT(new_ip); } @@ -1137,13 +1137,21 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, */ tid = txBegin(new_dir->i_sb, 0); - mutex_lock(&JFS_IP(new_dir)->commit_mutex); - mutex_lock(&JFS_IP(old_ip)->commit_mutex); + /* + * How do we know the locking is safe from deadlocks? + * The vfs does the hard part for us. Any time we are taking nested + * commit_mutexes, the vfs already has i_mutex held on the parent. + * Here, the vfs has already taken i_mutex on both old_dir and new_dir. 
+ */ + mutex_lock_nested(&JFS_IP(new_dir)->commit_mutex, COMMIT_MUTEX_PARENT); + mutex_lock_nested(&JFS_IP(old_ip)->commit_mutex, COMMIT_MUTEX_CHILD); if (old_dir != new_dir) - mutex_lock(&JFS_IP(old_dir)->commit_mutex); + mutex_lock_nested(&JFS_IP(old_dir)->commit_mutex, + COMMIT_MUTEX_SECOND_PARENT); if (new_ip) { - mutex_lock(&JFS_IP(new_ip)->commit_mutex); + mutex_lock_nested(&JFS_IP(new_ip)->commit_mutex, + COMMIT_MUTEX_VICTIM); /* * Change existing directory entry to new inode number */ @@ -1357,8 +1365,8 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry, tid = txBegin(dir->i_sb, 0); - mutex_lock(&JFS_IP(dir)->commit_mutex); - mutex_lock(&JFS_IP(ip)->commit_mutex); + mutex_lock_nested(&JFS_IP(dir)->commit_mutex, COMMIT_MUTEX_PARENT); + mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD); rc = jfs_init_acl(tid, ip, dir); if (rc) diff --git a/fs/libfs.c b/fs/libfs.c index bd08e0e64a8..503898d5c4a 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -63,7 +63,7 @@ int dcache_dir_open(struct inode *inode, struct file *file) { static struct qstr cursor_name = {.len = 1, .name = "."}; - file->private_data = d_alloc(file->f_dentry, &cursor_name); + file->private_data = d_alloc(file->f_path.dentry, &cursor_name); return file->private_data ? 
0 : -ENOMEM; } @@ -76,7 +76,7 @@ int dcache_dir_close(struct inode *inode, struct file *file) loff_t dcache_dir_lseek(struct file *file, loff_t offset, int origin) { - mutex_lock(&file->f_dentry->d_inode->i_mutex); + mutex_lock(&file->f_path.dentry->d_inode->i_mutex); switch (origin) { case 1: offset += file->f_pos; @@ -84,7 +84,7 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int origin) if (offset >= 0) break; default: - mutex_unlock(&file->f_dentry->d_inode->i_mutex); + mutex_unlock(&file->f_path.dentry->d_inode->i_mutex); return -EINVAL; } if (offset != file->f_pos) { @@ -96,8 +96,8 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int origin) spin_lock(&dcache_lock); list_del(&cursor->d_u.d_child); - p = file->f_dentry->d_subdirs.next; - while (n && p != &file->f_dentry->d_subdirs) { + p = file->f_path.dentry->d_subdirs.next; + while (n && p != &file->f_path.dentry->d_subdirs) { struct dentry *next; next = list_entry(p, struct dentry, d_u.d_child); if (!d_unhashed(next) && next->d_inode) @@ -108,7 +108,7 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int origin) spin_unlock(&dcache_lock); } } - mutex_unlock(&file->f_dentry->d_inode->i_mutex); + mutex_unlock(&file->f_path.dentry->d_inode->i_mutex); return offset; } @@ -126,7 +126,7 @@ static inline unsigned char dt_type(struct inode *inode) int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir) { - struct dentry *dentry = filp->f_dentry; + struct dentry *dentry = filp->f_path.dentry; struct dentry *cursor = filp->private_data; struct list_head *p, *q = &cursor->d_u.d_child; ino_t ino; diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c index b85a0ad2cfb..f4d45d4d835 100644 --- a/fs/lockd/clntlock.c +++ b/fs/lockd/clntlock.c @@ -36,7 +36,7 @@ struct nlm_wait { struct nlm_host * b_host; struct file_lock * b_lock; /* local file lock */ unsigned short b_reclaim; /* got to reclaim lock */ - u32 b_status; /* grant callback status */ + __be32 b_status; /* 
grant callback status */ }; static LIST_HEAD(nlm_blocked); @@ -53,7 +53,7 @@ struct nlm_wait *nlmclnt_prepare_block(struct nlm_host *host, struct file_lock * block->b_host = host; block->b_lock = fl; init_waitqueue_head(&block->b_wait); - block->b_status = NLM_LCK_BLOCKED; + block->b_status = nlm_lck_blocked; list_add(&block->b_list, &nlm_blocked); } return block; @@ -89,7 +89,7 @@ int nlmclnt_block(struct nlm_wait *block, struct nlm_rqst *req, long timeout) * nlmclnt_lock for an explanation. */ ret = wait_event_interruptible_timeout(block->b_wait, - block->b_status != NLM_LCK_BLOCKED, + block->b_status != nlm_lck_blocked, timeout); if (ret < 0) return -ERESTARTSYS; @@ -126,12 +126,12 @@ __be32 nlmclnt_grant(const struct sockaddr_in *addr, const struct nlm_lock *lock continue; if (!nlm_cmp_addr(&block->b_host->h_addr, addr)) continue; - if (nfs_compare_fh(NFS_FH(fl_blocked->fl_file->f_dentry->d_inode) ,fh) != 0) + if (nfs_compare_fh(NFS_FH(fl_blocked->fl_file->f_path.dentry->d_inode) ,fh) != 0) continue; /* Alright, we found a lock. 
Set the return status * and wake up the caller */ - block->b_status = NLM_LCK_GRANTED; + block->b_status = nlm_granted; wake_up(&block->b_wait); res = nlm_granted; } @@ -176,7 +176,7 @@ reclaimer(void *ptr) lock_kernel(); lockd_up(0); /* note: this cannot fail as lockd is already running */ - dprintk("lockd: reclaiming locks for host %s", host->h_name); + dprintk("lockd: reclaiming locks for host %s\n", host->h_name); restart: nsmstate = host->h_nsmstate; @@ -206,12 +206,12 @@ restart: host->h_reclaiming = 0; up_write(&host->h_rwsem); - dprintk("NLM: done reclaiming locks for host %s", host->h_name); + dprintk("NLM: done reclaiming locks for host %s\n", host->h_name); /* Now, wake up all processes that sleep on a blocked lock */ list_for_each_entry(block, &nlm_blocked, b_list) { if (block->b_host == host) { - block->b_status = NLM_LCK_DENIED_GRACE_PERIOD; + block->b_status = nlm_lck_denied_grace_period; wake_up(&block->b_wait); } } diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 497c3cd59d5..0b4acc1c5e7 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -27,7 +27,7 @@ static int nlmclnt_test(struct nlm_rqst *, struct file_lock *); static int nlmclnt_lock(struct nlm_rqst *, struct file_lock *); static int nlmclnt_unlock(struct nlm_rqst *, struct file_lock *); -static int nlm_stat_to_errno(u32 stat); +static int nlm_stat_to_errno(__be32 stat); static void nlmclnt_locks_init_private(struct file_lock *fl, struct nlm_host *host); static int nlmclnt_cancel(struct nlm_host *, int , struct file_lock *); @@ -129,7 +129,7 @@ static void nlmclnt_setlockargs(struct nlm_rqst *req, struct file_lock *fl) nlmclnt_next_cookie(&argp->cookie); argp->state = nsm_local_state; - memcpy(&lock->fh, NFS_FH(fl->fl_file->f_dentry->d_inode), sizeof(struct nfs_fh)); + memcpy(&lock->fh, NFS_FH(fl->fl_file->f_path.dentry->d_inode), sizeof(struct nfs_fh)); lock->caller = utsname()->nodename; lock->oh.data = req->a_owner; lock->oh.len = snprintf(req->a_owner, 
sizeof(req->a_owner), "%u@%s", @@ -325,7 +325,7 @@ nlmclnt_call(struct nlm_rqst *req, u32 proc) } break; } else - if (resp->status == NLM_LCK_DENIED_GRACE_PERIOD) { + if (resp->status == nlm_lck_denied_grace_period) { dprintk("lockd: server in grace period\n"); if (argp->reclaim) { printk(KERN_WARNING @@ -411,10 +411,10 @@ nlmclnt_test(struct nlm_rqst *req, struct file_lock *fl) goto out; switch (req->a_res.status) { - case NLM_LCK_GRANTED: + case nlm_granted: fl->fl_type = F_UNLCK; break; - case NLM_LCK_DENIED: + case nlm_lck_denied: /* * Report the conflicting lock back to the application. */ @@ -524,9 +524,9 @@ again: if (!req->a_args.block) break; /* Did a reclaimer thread notify us of a server reboot? */ - if (resp->status == NLM_LCK_DENIED_GRACE_PERIOD) + if (resp->status == nlm_lck_denied_grace_period) continue; - if (resp->status != NLM_LCK_BLOCKED) + if (resp->status != nlm_lck_blocked) break; /* Wait on an NLM blocking lock */ status = nlmclnt_block(block, req, NLMCLNT_POLL_TIMEOUT); @@ -535,11 +535,11 @@ again: */ if (status < 0) goto out_unblock; - if (resp->status != NLM_LCK_BLOCKED) + if (resp->status != nlm_lck_blocked) break; } - if (resp->status == NLM_LCK_GRANTED) { + if (resp->status == nlm_granted) { down_read(&host->h_rwsem); /* Check whether or not the server has rebooted */ if (fl->fl_u.nfs_fl.state != host->h_state) { @@ -556,7 +556,7 @@ again: out_unblock: nlmclnt_finish_block(block); /* Cancel the blocked request if it is still pending */ - if (resp->status == NLM_LCK_BLOCKED) + if (resp->status == nlm_lck_blocked) nlmclnt_cancel(host, req->a_args.block, fl); out: nlm_release_call(req); @@ -585,12 +585,12 @@ nlmclnt_reclaim(struct nlm_host *host, struct file_lock *fl) req->a_args.reclaim = 1; if ((status = nlmclnt_call(req, NLMPROC_LOCK)) >= 0 - && req->a_res.status == NLM_LCK_GRANTED) + && req->a_res.status == nlm_granted) return 0; printk(KERN_WARNING "lockd: failed to reclaim lock for pid %d " "(errno %d, status %d)\n", fl->fl_pid, - 
status, req->a_res.status); + status, ntohl(req->a_res.status)); /* * FIXME: This is a serious failure. We can @@ -637,10 +637,10 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl) if (status < 0) goto out; - if (resp->status == NLM_LCK_GRANTED) + if (resp->status == nlm_granted) goto out; - if (resp->status != NLM_LCK_DENIED_NOLOCKS) + if (resp->status != nlm_lck_denied_nolocks) printk("lockd: unexpected unlock status: %d\n", resp->status); /* What to do now? I'm out of my depth... */ status = -ENOLCK; @@ -652,7 +652,7 @@ out: static void nlmclnt_unlock_callback(struct rpc_task *task, void *data) { struct nlm_rqst *req = data; - int status = req->a_res.status; + u32 status = ntohl(req->a_res.status); if (RPC_ASSASSINATED(task)) goto die; @@ -720,6 +720,7 @@ static int nlmclnt_cancel(struct nlm_host *host, int block, struct file_lock *fl static void nlmclnt_cancel_callback(struct rpc_task *task, void *data) { struct nlm_rqst *req = data; + u32 status = ntohl(req->a_res.status); if (RPC_ASSASSINATED(task)) goto die; @@ -731,9 +732,9 @@ static void nlmclnt_cancel_callback(struct rpc_task *task, void *data) } dprintk("lockd: cancel status %u (task %u)\n", - req->a_res.status, task->tk_pid); + status, task->tk_pid); - switch (req->a_res.status) { + switch (status) { case NLM_LCK_GRANTED: case NLM_LCK_DENIED_GRACE_PERIOD: case NLM_LCK_DENIED: @@ -744,7 +745,7 @@ static void nlmclnt_cancel_callback(struct rpc_task *task, void *data) goto retry_cancel; default: printk(KERN_NOTICE "lockd: weird return %d for CANCEL call\n", - req->a_res.status); + status); } die: @@ -768,9 +769,9 @@ static const struct rpc_call_ops nlmclnt_cancel_ops = { * Convert an NLM status code to a generic kernel errno */ static int -nlm_stat_to_errno(u32 status) +nlm_stat_to_errno(__be32 status) { - switch(status) { + switch(ntohl(status)) { case NLM_LCK_GRANTED: return 0; case NLM_LCK_DENIED: diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 7e219b93855..c7db0a5bccd 100644 --- 
a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -343,8 +343,8 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, __be32 ret; dprintk("lockd: nlmsvc_lock(%s/%ld, ty=%d, pi=%d, %Ld-%Ld, bl=%d)\n", - file->f_file->f_dentry->d_inode->i_sb->s_id, - file->f_file->f_dentry->d_inode->i_ino, + file->f_file->f_path.dentry->d_inode->i_sb->s_id, + file->f_file->f_path.dentry->d_inode->i_ino, lock->fl.fl_type, lock->fl.fl_pid, (long long)lock->fl.fl_start, (long long)lock->fl.fl_end, @@ -420,8 +420,8 @@ nlmsvc_testlock(struct nlm_file *file, struct nlm_lock *lock, struct nlm_lock *conflock) { dprintk("lockd: nlmsvc_testlock(%s/%ld, ty=%d, %Ld-%Ld)\n", - file->f_file->f_dentry->d_inode->i_sb->s_id, - file->f_file->f_dentry->d_inode->i_ino, + file->f_file->f_path.dentry->d_inode->i_sb->s_id, + file->f_file->f_path.dentry->d_inode->i_ino, lock->fl.fl_type, (long long)lock->fl.fl_start, (long long)lock->fl.fl_end); @@ -454,8 +454,8 @@ nlmsvc_unlock(struct nlm_file *file, struct nlm_lock *lock) int error; dprintk("lockd: nlmsvc_unlock(%s/%ld, pi=%d, %Ld-%Ld)\n", - file->f_file->f_dentry->d_inode->i_sb->s_id, - file->f_file->f_dentry->d_inode->i_ino, + file->f_file->f_path.dentry->d_inode->i_sb->s_id, + file->f_file->f_path.dentry->d_inode->i_ino, lock->fl.fl_pid, (long long)lock->fl.fl_start, (long long)lock->fl.fl_end); @@ -483,8 +483,8 @@ nlmsvc_cancel_blocked(struct nlm_file *file, struct nlm_lock *lock) int status = 0; dprintk("lockd: nlmsvc_cancel(%s/%ld, pi=%d, %Ld-%Ld)\n", - file->f_file->f_dentry->d_inode->i_sb->s_id, - file->f_file->f_dentry->d_inode->i_ino, + file->f_file->f_path.dentry->d_inode->i_sb->s_id, + file->f_file->f_path.dentry->d_inode->i_ino, lock->fl.fl_pid, (long long)lock->fl.fl_start, (long long)lock->fl.fl_end); @@ -645,7 +645,7 @@ static const struct rpc_call_ops nlmsvc_grant_ops = { * block. 
*/ void -nlmsvc_grant_reply(struct nlm_cookie *cookie, u32 status) +nlmsvc_grant_reply(struct nlm_cookie *cookie, __be32 status) { struct nlm_block *block; @@ -655,7 +655,7 @@ nlmsvc_grant_reply(struct nlm_cookie *cookie, u32 status) return; if (block) { - if (status == NLM_LCK_DENIED_GRACE_PERIOD) { + if (status == nlm_lck_denied_grace_period) { /* Try again in a couple of seconds */ nlmsvc_insert_block(block, 10 * HZ); } else { diff --git a/fs/lockd/svcshare.c b/fs/lockd/svcshare.c index 6220dc2a3f2..068886de4dd 100644 --- a/fs/lockd/svcshare.c +++ b/fs/lockd/svcshare.c @@ -39,7 +39,7 @@ nlmsvc_share_file(struct nlm_host *host, struct nlm_file *file, return nlm_lck_denied; } - share = (struct nlm_share *) kmalloc(sizeof(*share) + oh->len, + share = kmalloc(sizeof(*share) + oh->len, GFP_KERNEL); if (share == NULL) return nlm_lck_denied_nolocks; diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index e83024e1604..c0df00c74ce 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -43,7 +43,7 @@ static inline void nlm_debug_print_fh(char *msg, struct nfs_fh *f) static inline void nlm_debug_print_file(char *msg, struct nlm_file *file) { - struct inode *inode = file->f_file->f_dentry->d_inode; + struct inode *inode = file->f_file->f_path.dentry->d_inode; dprintk("lockd: %s %s/%ld\n", msg, inode->i_sb->s_id, inode->i_ino); diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c index b7c949256e5..34dae5d7073 100644 --- a/fs/lockd/xdr.c +++ b/fs/lockd/xdr.c @@ -361,7 +361,7 @@ nlmsvc_decode_res(struct svc_rqst *rqstp, __be32 *p, struct nlm_res *resp) { if (!(p = nlm_decode_cookie(p, &resp->cookie))) return 0; - resp->status = ntohl(*p++); + resp->status = *p++; return xdr_argsize_check(rqstp, p); } @@ -407,8 +407,8 @@ nlmclt_decode_testres(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp) { if (!(p = nlm_decode_cookie(p, &resp->cookie))) return -EIO; - resp->status = ntohl(*p++); - if (resp->status == NLM_LCK_DENIED) { + resp->status = *p++; + if (resp->status == 
nlm_lck_denied) { struct file_lock *fl = &resp->lock.fl; u32 excl; s32 start, len, end; @@ -506,7 +506,7 @@ nlmclt_decode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp) { if (!(p = nlm_decode_cookie(p, &resp->cookie))) return -EIO; - resp->status = ntohl(*p++); + resp->status = *p++; return 0; } diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c index f4c0b2b9f75..a7824055121 100644 --- a/fs/lockd/xdr4.c +++ b/fs/lockd/xdr4.c @@ -367,7 +367,7 @@ nlm4svc_decode_res(struct svc_rqst *rqstp, __be32 *p, struct nlm_res *resp) { if (!(p = nlm4_decode_cookie(p, &resp->cookie))) return 0; - resp->status = ntohl(*p++); + resp->status = *p++; return xdr_argsize_check(rqstp, p); } @@ -413,8 +413,8 @@ nlm4clt_decode_testres(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp) { if (!(p = nlm4_decode_cookie(p, &resp->cookie))) return -EIO; - resp->status = ntohl(*p++); - if (resp->status == NLM_LCK_DENIED) { + resp->status = *p++; + if (resp->status == nlm_lck_denied) { struct file_lock *fl = &resp->lock.fl; u32 excl; s64 start, end, len; @@ -512,7 +512,7 @@ nlm4clt_decode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp) { if (!(p = nlm4_decode_cookie(p, &resp->cookie))) return -EIO; - resp->status = ntohl(*p++); + resp->status = *p++; return 0; } diff --git a/fs/locks.c b/fs/locks.c index 1cb0c57fedb..52a81005dab 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -321,7 +321,7 @@ static int flock_to_posix_lock(struct file *filp, struct file_lock *fl, start = filp->f_pos; break; case SEEK_END: - start = i_size_read(filp->f_dentry->d_inode); + start = i_size_read(filp->f_path.dentry->d_inode); break; default: return -EINVAL; @@ -371,7 +371,7 @@ static int flock64_to_posix_lock(struct file *filp, struct file_lock *fl, start = filp->f_pos; break; case SEEK_END: - start = i_size_read(filp->f_dentry->d_inode); + start = i_size_read(filp->f_path.dentry->d_inode); break; default: return -EINVAL; @@ -672,7 +672,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl, 
struct file_lock *cfl; lock_kernel(); - for (cfl = filp->f_dentry->d_inode->i_flock; cfl; cfl = cfl->fl_next) { + for (cfl = filp->f_path.dentry->d_inode->i_flock; cfl; cfl = cfl->fl_next) { if (!IS_POSIX(cfl)) continue; if (posix_locks_conflict(cfl, fl)) @@ -734,7 +734,7 @@ static int flock_lock_file(struct file *filp, struct file_lock *request) { struct file_lock *new_fl = NULL; struct file_lock **before; - struct inode * inode = filp->f_dentry->d_inode; + struct inode * inode = filp->f_path.dentry->d_inode; int error = 0; int found = 0; @@ -1018,7 +1018,7 @@ static int __posix_lock_file_conf(struct inode *inode, struct file_lock *request */ int posix_lock_file(struct file *filp, struct file_lock *fl) { - return __posix_lock_file_conf(filp->f_dentry->d_inode, fl, NULL); + return __posix_lock_file_conf(filp->f_path.dentry->d_inode, fl, NULL); } EXPORT_SYMBOL(posix_lock_file); @@ -1033,7 +1033,7 @@ EXPORT_SYMBOL(posix_lock_file); int posix_lock_file_conf(struct file *filp, struct file_lock *fl, struct file_lock *conflock) { - return __posix_lock_file_conf(filp->f_dentry->d_inode, fl, conflock); + return __posix_lock_file_conf(filp->f_path.dentry->d_inode, fl, conflock); } EXPORT_SYMBOL(posix_lock_file_conf); @@ -1333,8 +1333,8 @@ int fcntl_getlease(struct file *filp) int type = F_UNLCK; lock_kernel(); - time_out_leases(filp->f_dentry->d_inode); - for (fl = filp->f_dentry->d_inode->i_flock; fl && IS_LEASE(fl); + time_out_leases(filp->f_path.dentry->d_inode); + for (fl = filp->f_path.dentry->d_inode->i_flock; fl && IS_LEASE(fl); fl = fl->fl_next) { if (fl->fl_file == filp) { type = fl->fl_type & ~F_INPROGRESS; @@ -1359,7 +1359,7 @@ int fcntl_getlease(struct file *filp) static int __setlease(struct file *filp, long arg, struct file_lock **flp) { struct file_lock *fl, **before, **my_before = NULL, *lease; - struct dentry *dentry = filp->f_dentry; + struct dentry *dentry = filp->f_path.dentry; struct inode *inode = dentry->d_inode; int error, rdlease_count = 0, 
wrlease_count = 0; @@ -1448,7 +1448,7 @@ out: int setlease(struct file *filp, long arg, struct file_lock **lease) { - struct dentry *dentry = filp->f_dentry; + struct dentry *dentry = filp->f_path.dentry; struct inode *inode = dentry->d_inode; int error; @@ -1482,7 +1482,7 @@ EXPORT_SYMBOL(setlease); int fcntl_setlease(unsigned int fd, struct file *filp, long arg) { struct file_lock fl, *flp = &fl; - struct dentry *dentry = filp->f_dentry; + struct dentry *dentry = filp->f_path.dentry; struct inode *inode = dentry->d_inode; int error; @@ -1692,7 +1692,7 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd, if (copy_from_user(&flock, l, sizeof(flock))) goto out; - inode = filp->f_dentry->d_inode; + inode = filp->f_path.dentry->d_inode; /* Don't allow mandatory locks on files that may be memory mapped * and shared. @@ -1835,7 +1835,7 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd, if (copy_from_user(&flock, l, sizeof(flock))) goto out; - inode = filp->f_dentry->d_inode; + inode = filp->f_path.dentry->d_inode; /* Don't allow mandatory locks on files that may be memory mapped * and shared. @@ -1922,7 +1922,7 @@ void locks_remove_posix(struct file *filp, fl_owner_t owner) * posix_lock_file(). Another process could be setting a lock on this * file at the same time, but we wouldn't remove that lock anyway. 
*/ - if (!filp->f_dentry->d_inode->i_flock) + if (!filp->f_path.dentry->d_inode->i_flock) return; lock.fl_type = F_UNLCK; @@ -1951,7 +1951,7 @@ EXPORT_SYMBOL(locks_remove_posix); */ void locks_remove_flock(struct file *filp) { - struct inode * inode = filp->f_dentry->d_inode; + struct inode * inode = filp->f_path.dentry->d_inode; struct file_lock *fl; struct file_lock **before; @@ -2020,7 +2020,7 @@ static void lock_get_status(char* out, struct file_lock *fl, int id, char *pfx) struct inode *inode = NULL; if (fl->fl_file != NULL) - inode = fl->fl_file->f_dentry->d_inode; + inode = fl->fl_file->f_path.dentry->d_inode; out += sprintf(out, "%d:%s ", id, pfx); if (IS_POSIX(fl)) { diff --git a/fs/minix/dir.c b/fs/minix/dir.c index 2b0a389d198..ab782c4086f 100644 --- a/fs/minix/dir.c +++ b/fs/minix/dir.c @@ -82,7 +82,7 @@ static inline void *minix_next_entry(void *de, struct minix_sb_info *sbi) static int minix_readdir(struct file * filp, void * dirent, filldir_t filldir) { unsigned long pos = filp->f_pos; - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = filp->f_path.dentry->d_inode; struct super_block *sb = inode->i_sb; unsigned offset = pos & ~PAGE_CACHE_MASK; unsigned long n = pos >> PAGE_CACHE_SHIFT; diff --git a/fs/namei.c b/fs/namei.c index db1bca26d88..e4f108f0823 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -297,7 +297,7 @@ int vfs_permission(struct nameidata *nd, int mask) */ int file_permission(struct file *file, int mask) { - return permission(file->f_dentry->d_inode, mask, NULL); + return permission(file->f_path.dentry->d_inode, mask, NULL); } /* @@ -333,7 +333,7 @@ int get_write_access(struct inode * inode) int deny_write_access(struct file * file) { - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file->f_path.dentry->d_inode; spin_lock(&inode->i_lock); if (atomic_read(&inode->i_writecount) > 0) { @@ -368,7 +368,7 @@ void path_release_on_umount(struct nameidata *nd) */ void release_open_intent(struct nameidata 
*nd) { - if (nd->intent.open.file->f_dentry == NULL) + if (nd->intent.open.file->f_path.dentry == NULL) put_filp(nd->intent.open.file); else fput(nd->intent.open.file); @@ -572,11 +572,6 @@ fail: return PTR_ERR(link); } -struct path { - struct vfsmount *mnt; - struct dentry *dentry; -}; - static inline void dput_path(struct path *path, struct nameidata *nd) { dput(path->dentry); @@ -1143,7 +1138,7 @@ static int fastcall do_path_lookup(int dfd, const char *name, if (!file) goto out_fail; - dentry = file->f_dentry; + dentry = file->f_path.dentry; retval = -ENOTDIR; if (!S_ISDIR(dentry->d_inode->i_mode)) @@ -1153,7 +1148,7 @@ static int fastcall do_path_lookup(int dfd, const char *name, if (retval) goto fput_fail; - nd->mnt = mntget(file->f_vfsmnt); + nd->mnt = mntget(file->f_path.mnt); nd->dentry = dget(dentry); fput_light(file, fput_needed); diff --git a/fs/namespace.c b/fs/namespace.c index b00ac84ebbd..5ef336c1103 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -20,7 +20,7 @@ #include <linux/module.h> #include <linux/sysfs.h> #include <linux/seq_file.h> -#include <linux/namespace.h> +#include <linux/mnt_namespace.h> #include <linux/namei.h> #include <linux/security.h> #include <linux/mount.h> @@ -133,10 +133,10 @@ struct vfsmount *lookup_mnt(struct vfsmount *mnt, struct dentry *dentry) static inline int check_mnt(struct vfsmount *mnt) { - return mnt->mnt_namespace == current->nsproxy->namespace; + return mnt->mnt_ns == current->nsproxy->mnt_ns; } -static void touch_namespace(struct namespace *ns) +static void touch_mnt_namespace(struct mnt_namespace *ns) { if (ns) { ns->event = ++event; @@ -144,7 +144,7 @@ static void touch_namespace(struct namespace *ns) } } -static void __touch_namespace(struct namespace *ns) +static void __touch_mnt_namespace(struct mnt_namespace *ns) { if (ns && ns->event != event) { ns->event = event; @@ -187,19 +187,19 @@ static void commit_tree(struct vfsmount *mnt) struct vfsmount *parent = mnt->mnt_parent; struct vfsmount *m; 
LIST_HEAD(head); - struct namespace *n = parent->mnt_namespace; + struct mnt_namespace *n = parent->mnt_ns; BUG_ON(parent == mnt); list_add_tail(&head, &mnt->mnt_list); list_for_each_entry(m, &head, mnt_list) - m->mnt_namespace = n; + m->mnt_ns = n; list_splice(&head, n->list.prev); list_add_tail(&mnt->mnt_hash, mount_hashtable + hash(parent, mnt->mnt_mountpoint)); list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); - touch_namespace(n); + touch_mnt_namespace(n); } static struct vfsmount *next_mnt(struct vfsmount *p, struct vfsmount *root) @@ -320,7 +320,7 @@ EXPORT_SYMBOL(mnt_unpin); /* iterator */ static void *m_start(struct seq_file *m, loff_t *pos) { - struct namespace *n = m->private; + struct mnt_namespace *n = m->private; struct list_head *p; loff_t l = *pos; @@ -333,7 +333,7 @@ static void *m_start(struct seq_file *m, loff_t *pos) static void *m_next(struct seq_file *m, void *v, loff_t *pos) { - struct namespace *n = m->private; + struct mnt_namespace *n = m->private; struct list_head *p = ((struct vfsmount *)v)->mnt_list.next; (*pos)++; return p == &n->list ? 
NULL : list_entry(p, struct vfsmount, mnt_list); @@ -368,6 +368,7 @@ static int show_vfsmnt(struct seq_file *m, void *v) { MNT_NOEXEC, ",noexec" }, { MNT_NOATIME, ",noatime" }, { MNT_NODIRATIME, ",nodiratime" }, + { MNT_RELATIME, ",relatime" }, { 0, NULL } }; struct proc_fs_info *fs_infop; @@ -526,8 +527,8 @@ void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill) list_for_each_entry(p, kill, mnt_hash) { list_del_init(&p->mnt_expire); list_del_init(&p->mnt_list); - __touch_namespace(p->mnt_namespace); - p->mnt_namespace = NULL; + __touch_mnt_namespace(p->mnt_ns); + p->mnt_ns = NULL; list_del_init(&p->mnt_child); if (p->mnt_parent != p) p->mnt_mountpoint->d_mounted--; @@ -830,7 +831,7 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt, if (parent_nd) { detach_mnt(source_mnt, parent_nd); attach_mnt(source_mnt, nd); - touch_namespace(current->nsproxy->namespace); + touch_mnt_namespace(current->nsproxy->mnt_ns); } else { mnt_set_mountpoint(dest_mnt, dest_dentry, source_mnt); commit_tree(source_mnt); @@ -1145,9 +1146,9 @@ static void expire_mount(struct vfsmount *mnt, struct list_head *mounts, */ if (!propagate_mount_busy(mnt, 2)) { /* delete from the namespace */ - touch_namespace(mnt->mnt_namespace); + touch_mnt_namespace(mnt->mnt_ns); list_del_init(&mnt->mnt_list); - mnt->mnt_namespace = NULL; + mnt->mnt_ns = NULL; umount_tree(mnt, 1, umounts); spin_unlock(&vfsmount_lock); } else { @@ -1168,7 +1169,7 @@ static void expire_mount(struct vfsmount *mnt, struct list_head *mounts, */ static void expire_mount_list(struct list_head *graveyard, struct list_head *mounts) { - struct namespace *namespace; + struct mnt_namespace *ns; struct vfsmount *mnt; while (!list_empty(graveyard)) { @@ -1178,10 +1179,10 @@ static void expire_mount_list(struct list_head *graveyard, struct list_head *mou /* don't do anything if the namespace is dead - all the * vfsmounts from it are going away anyway */ - namespace = mnt->mnt_namespace; - if (!namespace || 
!namespace->root) + ns = mnt->mnt_ns; + if (!ns || !ns->root) continue; - get_namespace(namespace); + get_mnt_ns(ns); spin_unlock(&vfsmount_lock); down_write(&namespace_sem); @@ -1189,7 +1190,7 @@ static void expire_mount_list(struct list_head *graveyard, struct list_head *mou up_write(&namespace_sem); release_mounts(&umounts); mntput(mnt); - put_namespace(namespace); + put_mnt_ns(ns); spin_lock(&vfsmount_lock); } } @@ -1405,9 +1406,11 @@ long do_mount(char *dev_name, char *dir_name, char *type_page, mnt_flags |= MNT_NOATIME; if (flags & MS_NODIRATIME) mnt_flags |= MNT_NODIRATIME; + if (flags & MS_RELATIME) + mnt_flags |= MNT_RELATIME; flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | - MS_NOATIME | MS_NODIRATIME); + MS_NOATIME | MS_NODIRATIME | MS_RELATIME); /* ... and get the mountpoint */ retval = path_lookup(dir_name, LOOKUP_FOLLOW, &nd); @@ -1439,14 +1442,15 @@ dput_out: * Allocate a new namespace structure and populate it with contents * copied from the namespace of the passed in task structure. 
*/ -struct namespace *dup_namespace(struct task_struct *tsk, struct fs_struct *fs) +struct mnt_namespace *dup_mnt_ns(struct task_struct *tsk, + struct fs_struct *fs) { - struct namespace *namespace = tsk->nsproxy->namespace; - struct namespace *new_ns; + struct mnt_namespace *mnt_ns = tsk->nsproxy->mnt_ns; + struct mnt_namespace *new_ns; struct vfsmount *rootmnt = NULL, *pwdmnt = NULL, *altrootmnt = NULL; struct vfsmount *p, *q; - new_ns = kmalloc(sizeof(struct namespace), GFP_KERNEL); + new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL); if (!new_ns) return NULL; @@ -1457,7 +1461,7 @@ struct namespace *dup_namespace(struct task_struct *tsk, struct fs_struct *fs) down_write(&namespace_sem); /* First pass: copy the tree topology */ - new_ns->root = copy_tree(namespace->root, namespace->root->mnt_root, + new_ns->root = copy_tree(mnt_ns->root, mnt_ns->root->mnt_root, CL_COPY_ALL | CL_EXPIRE); if (!new_ns->root) { up_write(&namespace_sem); @@ -1473,10 +1477,10 @@ struct namespace *dup_namespace(struct task_struct *tsk, struct fs_struct *fs) * as belonging to new namespace. We have already acquired a private * fs_struct, so tsk->fs->lock is not needed. 
*/ - p = namespace->root; + p = mnt_ns->root; q = new_ns->root; while (p) { - q->mnt_namespace = new_ns; + q->mnt_ns = new_ns; if (fs) { if (p == fs->rootmnt) { rootmnt = p; @@ -1491,7 +1495,7 @@ struct namespace *dup_namespace(struct task_struct *tsk, struct fs_struct *fs) fs->altrootmnt = mntget(q); } } - p = next_mnt(p, namespace->root); + p = next_mnt(p, mnt_ns->root); q = next_mnt(q, new_ns->root); } up_write(&namespace_sem); @@ -1506,16 +1510,16 @@ struct namespace *dup_namespace(struct task_struct *tsk, struct fs_struct *fs) return new_ns; } -int copy_namespace(int flags, struct task_struct *tsk) +int copy_mnt_ns(int flags, struct task_struct *tsk) { - struct namespace *namespace = tsk->nsproxy->namespace; - struct namespace *new_ns; + struct mnt_namespace *ns = tsk->nsproxy->mnt_ns; + struct mnt_namespace *new_ns; int err = 0; - if (!namespace) + if (!ns) return 0; - get_namespace(namespace); + get_mnt_ns(ns); if (!(flags & CLONE_NEWNS)) return 0; @@ -1525,16 +1529,16 @@ int copy_namespace(int flags, struct task_struct *tsk) goto out; } - new_ns = dup_namespace(tsk, tsk->fs); + new_ns = dup_mnt_ns(tsk, tsk->fs); if (!new_ns) { err = -ENOMEM; goto out; } - tsk->nsproxy->namespace = new_ns; + tsk->nsproxy->mnt_ns = new_ns; out: - put_namespace(namespace); + put_mnt_ns(ns); return err; } @@ -1754,7 +1758,7 @@ asmlinkage long sys_pivot_root(const char __user * new_root, detach_mnt(user_nd.mnt, &root_parent); attach_mnt(user_nd.mnt, &old_nd); /* mount old root on put_old */ attach_mnt(new_nd.mnt, &root_parent); /* mount new_root on / */ - touch_namespace(current->nsproxy->namespace); + touch_mnt_namespace(current->nsproxy->mnt_ns); spin_unlock(&vfsmount_lock); chroot_fs_refs(&user_nd, &new_nd); security_sb_post_pivotroot(&user_nd, &new_nd); @@ -1779,27 +1783,27 @@ out3: static void __init init_mount_tree(void) { struct vfsmount *mnt; - struct namespace *namespace; + struct mnt_namespace *ns; mnt = do_kern_mount("rootfs", 0, "rootfs", NULL); if (IS_ERR(mnt)) 
panic("Can't create rootfs"); - namespace = kmalloc(sizeof(*namespace), GFP_KERNEL); - if (!namespace) + ns = kmalloc(sizeof(*ns), GFP_KERNEL); + if (!ns) panic("Can't allocate initial namespace"); - atomic_set(&namespace->count, 1); - INIT_LIST_HEAD(&namespace->list); - init_waitqueue_head(&namespace->poll); - namespace->event = 0; - list_add(&mnt->mnt_list, &namespace->list); - namespace->root = mnt; - mnt->mnt_namespace = namespace; - - init_task.nsproxy->namespace = namespace; - get_namespace(namespace); - - set_fs_pwd(current->fs, namespace->root, namespace->root->mnt_root); - set_fs_root(current->fs, namespace->root, namespace->root->mnt_root); + atomic_set(&ns->count, 1); + INIT_LIST_HEAD(&ns->list); + init_waitqueue_head(&ns->poll); + ns->event = 0; + list_add(&mnt->mnt_list, &ns->list); + ns->root = mnt; + mnt->mnt_ns = ns; + + init_task.nsproxy->mnt_ns = ns; + get_mnt_ns(ns); + + set_fs_pwd(current->fs, ns->root, ns->root->mnt_root); + set_fs_root(current->fs, ns->root, ns->root->mnt_root); } void __init mnt_init(unsigned long mempages) @@ -1860,11 +1864,11 @@ void __init mnt_init(unsigned long mempages) init_mount_tree(); } -void __put_namespace(struct namespace *namespace) +void __put_mnt_ns(struct mnt_namespace *ns) { - struct vfsmount *root = namespace->root; + struct vfsmount *root = ns->root; LIST_HEAD(umount_list); - namespace->root = NULL; + ns->root = NULL; spin_unlock(&vfsmount_lock); down_write(&namespace_sem); spin_lock(&vfsmount_lock); @@ -1872,5 +1876,5 @@ void __put_namespace(struct namespace *namespace) spin_unlock(&vfsmount_lock); up_write(&namespace_sem); release_mounts(&umount_list); - kfree(namespace); + kfree(ns); } diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index 458b3b78519..73747772c3b 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -402,7 +402,7 @@ static time_t ncp_obtain_mtime(struct dentry *dentry) static int ncp_readdir(struct file *filp, void *dirent, filldir_t filldir) { - struct dentry *dentry = filp->f_dentry; + 
struct dentry *dentry = filp->f_path.dentry; struct inode *inode = dentry->d_inode; struct page *page = NULL; struct ncp_server *server = NCP_SERVER(inode); @@ -554,7 +554,7 @@ static int ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir, struct ncp_cache_control *ctrl, struct ncp_entry_info *entry) { - struct dentry *newdent, *dentry = filp->f_dentry; + struct dentry *newdent, *dentry = filp->f_path.dentry; struct inode *newino, *inode = dentry->d_inode; struct ncp_cache_control ctl = *ctrl; struct qstr qname; @@ -649,7 +649,7 @@ static void ncp_read_volume_list(struct file *filp, void *dirent, filldir_t filldir, struct ncp_cache_control *ctl) { - struct dentry *dentry = filp->f_dentry; + struct dentry *dentry = filp->f_path.dentry; struct inode *inode = dentry->d_inode; struct ncp_server *server = NCP_SERVER(inode); struct ncp_volume_info info; @@ -685,7 +685,7 @@ static void ncp_do_readdir(struct file *filp, void *dirent, filldir_t filldir, struct ncp_cache_control *ctl) { - struct dentry *dentry = filp->f_dentry; + struct dentry *dentry = filp->f_path.dentry; struct inode *dir = dentry->d_inode; struct ncp_server *server = NCP_SERVER(dir); struct nw_search_sequence seq; diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c index df37524b85d..b91fea03b1c 100644 --- a/fs/ncpfs/file.c +++ b/fs/ncpfs/file.c @@ -101,7 +101,7 @@ out: static ssize_t ncp_file_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { - struct dentry *dentry = file->f_dentry; + struct dentry *dentry = file->f_path.dentry; struct inode *inode = dentry->d_inode; size_t already_read = 0; off_t pos; @@ -182,7 +182,7 @@ outrel: static ssize_t ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - struct dentry *dentry = file->f_dentry; + struct dentry *dentry = file->f_path.dentry; struct inode *inode = dentry->d_inode; size_t already_written = 0; off_t pos; diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index 
fae53243bb9..67a90bf795d 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -327,11 +327,12 @@ static int ncp_parse_options(struct ncp_mount_data_kernel *data, char *options) char *optarg; unsigned long optint; int version = 0; + int ret; data->flags = 0; data->int_flags = 0; data->mounted_uid = 0; - data->wdog_pid = -1; + data->wdog_pid = NULL; data->ncp_fd = ~0; data->time_out = 10; data->retry_count = 20; @@ -343,8 +344,9 @@ static int ncp_parse_options(struct ncp_mount_data_kernel *data, char *options) data->mounted_vol[0] = 0; while ((optval = ncp_getopt("ncpfs", &options, ncp_opts, NULL, &optarg, &optint)) != 0) { - if (optval < 0) - return optval; + ret = optval; + if (ret < 0) + goto err; switch (optval) { case 'u': data->uid = optint; @@ -371,7 +373,7 @@ static int ncp_parse_options(struct ncp_mount_data_kernel *data, char *options) data->flags = optint; break; case 'w': - data->wdog_pid = optint; + data->wdog_pid = find_get_pid(optint); break; case 'n': data->ncp_fd = optint; @@ -380,18 +382,21 @@ static int ncp_parse_options(struct ncp_mount_data_kernel *data, char *options) data->info_fd = optint; break; case 'v': - if (optint < NCP_MOUNT_VERSION_V4) { - return -ECHRNG; - } - if (optint > NCP_MOUNT_VERSION_V5) { - return -ECHRNG; - } + ret = -ECHRNG; + if (optint < NCP_MOUNT_VERSION_V4) + goto err; + if (optint > NCP_MOUNT_VERSION_V5) + goto err; version = optint; break; } } return 0; +err: + put_pid(data->wdog_pid); + data->wdog_pid = NULL; + return ret; } static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) @@ -409,6 +414,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) #endif struct ncp_entry_info finfo; + data.wdog_pid = NULL; server = kzalloc(sizeof(struct ncp_server), GFP_KERNEL); if (!server) return -ENOMEM; @@ -425,7 +431,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) data.flags = md->flags; data.int_flags = NCP_IMOUNT_LOGGEDIN_POSSIBLE; 
data.mounted_uid = md->mounted_uid; - data.wdog_pid = md->wdog_pid; + data.wdog_pid = find_get_pid(md->wdog_pid); data.ncp_fd = md->ncp_fd; data.time_out = md->time_out; data.retry_count = md->retry_count; @@ -445,7 +451,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) data.flags = md->flags; data.int_flags = 0; data.mounted_uid = md->mounted_uid; - data.wdog_pid = md->wdog_pid; + data.wdog_pid = find_get_pid(md->wdog_pid); data.ncp_fd = md->ncp_fd; data.time_out = md->time_out; data.retry_count = md->retry_count; @@ -471,7 +477,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) if (!ncp_filp) goto out; error = -ENOTSOCK; - sock_inode = ncp_filp->f_dentry->d_inode; + sock_inode = ncp_filp->f_path.dentry->d_inode; if (!S_ISSOCK(sock_inode->i_mode)) goto out_fput; sock = SOCKET_I(sock_inode); @@ -504,7 +510,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) if (!server->info_filp) goto out_fput; error = -ENOTSOCK; - sock_inode = server->info_filp->f_dentry->d_inode; + sock_inode = server->info_filp->f_path.dentry->d_inode; if (!S_ISSOCK(sock_inode->i_mode)) goto out_fput2; info_sock = SOCKET_I(sock_inode); @@ -679,6 +685,7 @@ out_fput: */ fput(ncp_filp); out: + put_pid(data.wdog_pid); sb->s_fs_info = NULL; kfree(server); return error; @@ -711,7 +718,8 @@ static void ncp_put_super(struct super_block *sb) if (server->info_filp) fput(server->info_filp); fput(server->ncp_filp); - kill_proc(server->m.wdog_pid, SIGTERM, 1); + kill_pid(server->m.wdog_pid, SIGTERM, 1); + put_pid(server->m.wdog_pid); kfree(server->priv.data); kfree(server->auth.object_name); diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c index 589d1eac55c..8843a83d4ef 100644 --- a/fs/ncpfs/ioctl.c +++ b/fs/ncpfs/ioctl.c @@ -35,7 +35,7 @@ static int ncp_get_fs_info(struct ncp_server * server, struct file *file, struct ncp_fs_info __user *arg) { - struct inode *inode = file->f_dentry->d_inode; + struct inode 
*inode = file->f_path.dentry->d_inode; struct ncp_fs_info info; if ((file_permission(file, MAY_WRITE) != 0) @@ -65,7 +65,7 @@ static int ncp_get_fs_info_v2(struct ncp_server * server, struct file *file, struct ncp_fs_info_v2 __user * arg) { - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file->f_path.dentry->d_inode; struct ncp_fs_info_v2 info2; if ((file_permission(file, MAY_WRITE) != 0) @@ -136,7 +136,7 @@ static int ncp_get_compat_fs_info_v2(struct ncp_server * server, struct file *file, struct compat_ncp_fs_info_v2 __user * arg) { - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file->f_path.dentry->d_inode; struct compat_ncp_fs_info_v2 info2; if ((file_permission(file, MAY_WRITE) != 0) @@ -824,7 +824,7 @@ outrel: #ifdef CONFIG_COMPAT long ncp_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file->f_path.dentry->d_inode; int ret; lock_kernel(); diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c index e7d5a3097fe..70a69115500 100644 --- a/fs/ncpfs/mmap.c +++ b/fs/ncpfs/mmap.c @@ -29,7 +29,7 @@ static struct page* ncp_file_mmap_nopage(struct vm_area_struct *area, unsigned long address, int *type) { struct file *file = area->vm_file; - struct dentry *dentry = file->f_dentry; + struct dentry *dentry = file->f_path.dentry; struct inode *inode = dentry->d_inode; struct page* page; char *pg_addr; @@ -106,7 +106,7 @@ static struct vm_operations_struct ncp_file_mmap = /* This is used for a general mmap of a ncp file */ int ncp_mmap(struct file *file, struct vm_area_struct *vma) { - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file->f_path.dentry->d_inode; DPRINTK("ncp_mmap: called\n"); diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index b34cd16f472..d9ba8cb0ee7 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -172,7 +172,7 @@ static int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page) { 
struct file *file = desc->file; - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file->f_path.dentry->d_inode; struct rpc_cred *cred = nfs_file_cred(file); unsigned long timestamp; int error; @@ -183,7 +183,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page) again: timestamp = jiffies; - error = NFS_PROTO(inode)->readdir(file->f_dentry, cred, desc->entry->cookie, page, + error = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, desc->entry->cookie, page, NFS_SERVER(inode)->dtsize, desc->plus); if (error < 0) { /* We requested READDIRPLUS, but the server doesn't grok it */ @@ -308,7 +308,7 @@ int find_dirent_index(nfs_readdir_descriptor_t *desc) static inline int find_dirent_page(nfs_readdir_descriptor_t *desc) { - struct inode *inode = desc->file->f_dentry->d_inode; + struct inode *inode = desc->file->f_path.dentry->d_inode; struct page *page; int status; @@ -464,7 +464,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent, filldir_t filldir) { struct file *file = desc->file; - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file->f_path.dentry->d_inode; struct rpc_cred *cred = nfs_file_cred(file); struct page *page = NULL; int status; @@ -477,7 +477,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent, status = -ENOMEM; goto out; } - desc->error = NFS_PROTO(inode)->readdir(file->f_dentry, cred, *desc->dir_cookie, + desc->error = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, *desc->dir_cookie, page, NFS_SERVER(inode)->dtsize, desc->plus); @@ -516,7 +516,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent, */ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir) { - struct dentry *dentry = filp->f_dentry; + struct dentry *dentry = filp->f_path.dentry; struct inode *inode = dentry->d_inode; nfs_readdir_descriptor_t my_desc, *desc = &my_desc; @@ -532,7 +532,7 @@ static int nfs_readdir(struct file *filp, void 
*dirent, filldir_t filldir) lock_kernel(); - res = nfs_revalidate_mapping(inode, filp->f_mapping); + res = nfs_revalidate_mapping_nolock(inode, filp->f_mapping); if (res < 0) { unlock_kernel(); return res; @@ -599,7 +599,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir) loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int origin) { - mutex_lock(&filp->f_dentry->d_inode->i_mutex); + mutex_lock(&filp->f_path.dentry->d_inode->i_mutex); switch (origin) { case 1: offset += filp->f_pos; @@ -615,7 +615,7 @@ loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int origin) ((struct nfs_open_context *)filp->private_data)->dir_cookie = 0; } out: - mutex_unlock(&filp->f_dentry->d_inode->i_mutex); + mutex_unlock(&filp->f_path.dentry->d_inode->i_mutex); return offset; } @@ -1102,7 +1102,7 @@ no_open: static struct dentry *nfs_readdir_lookup(nfs_readdir_descriptor_t *desc) { - struct dentry *parent = desc->file->f_dentry; + struct dentry *parent = desc->file->f_path.dentry; struct inode *dir = parent->d_inode; struct nfs_entry *entry = desc->entry; struct dentry *dentry, *alias; diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index f9d678f4ae0..bd21d7fde65 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -116,7 +116,7 @@ static inline int put_dreq(struct nfs_direct_req *dreq) ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t pos, unsigned long nr_segs) { dprintk("NFS: nfs_direct_IO (%s) off/no(%Ld/%lu) EINVAL\n", - iocb->ki_filp->f_dentry->d_name.name, + iocb->ki_filp->f_path.dentry->d_name.name, (long long) pos, nr_segs); return -EINVAL; @@ -734,8 +734,8 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov, size_t count = iov[0].iov_len; dprintk("nfs: direct read(%s/%s, %lu@%Ld)\n", - file->f_dentry->d_parent->d_name.name, - file->f_dentry->d_name.name, + file->f_path.dentry->d_parent->d_name.name, + file->f_path.dentry->d_name.name, (unsigned long) count, (long long) pos); if 
(nr_segs != 1) @@ -798,8 +798,8 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov, size_t count = iov[0].iov_len; dfprintk(VFS, "nfs: direct write(%s/%s, %lu@%Ld)\n", - file->f_dentry->d_parent->d_name.name, - file->f_dentry->d_name.name, + file->f_path.dentry->d_parent->d_name.name, + file->f_path.dentry->d_name.name, (unsigned long) count, (long long) pos); if (nr_segs != 1) diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 8e28bffc35a..9e4a2b70995 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -176,7 +176,7 @@ static int nfs_file_flush(struct file *file, fl_owner_t id) { struct nfs_open_context *ctx = (struct nfs_open_context *)file->private_data; - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file->f_path.dentry->d_inode; int status; dfprintk(VFS, "nfs: flush(%s/%ld)\n", inode->i_sb->s_id, inode->i_ino); @@ -201,7 +201,7 @@ static ssize_t nfs_file_read(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { - struct dentry * dentry = iocb->ki_filp->f_dentry; + struct dentry * dentry = iocb->ki_filp->f_path.dentry; struct inode * inode = dentry->d_inode; ssize_t result; size_t count = iov_length(iov, nr_segs); @@ -226,7 +226,7 @@ static ssize_t nfs_file_sendfile(struct file *filp, loff_t *ppos, size_t count, read_actor_t actor, void *target) { - struct dentry *dentry = filp->f_dentry; + struct dentry *dentry = filp->f_path.dentry; struct inode *inode = dentry->d_inode; ssize_t res; @@ -243,7 +243,7 @@ nfs_file_sendfile(struct file *filp, loff_t *ppos, size_t count, static int nfs_file_mmap(struct file * file, struct vm_area_struct * vma) { - struct dentry *dentry = file->f_dentry; + struct dentry *dentry = file->f_path.dentry; struct inode *inode = dentry->d_inode; int status; @@ -315,14 +315,13 @@ static void nfs_invalidate_page(struct page *page, unsigned long offset) static int nfs_release_page(struct page *page, gfp_t gfp) { - /* - * Avoid deadlock on nfs_wait_on_request(). 
- */ - if (!(gfp & __GFP_FS)) - return 0; - /* Hack... Force nfs_wb_page() to write out the page */ - SetPageDirty(page); - return !nfs_wb_page(page->mapping->host, page); + /* If PagePrivate() is set, then the page is not freeable */ + return 0; +} + +static int nfs_launder_page(struct page *page) +{ + return nfs_wb_page(page->mapping->host, page); } const struct address_space_operations nfs_file_aops = { @@ -338,12 +337,13 @@ const struct address_space_operations nfs_file_aops = { #ifdef CONFIG_NFS_DIRECTIO .direct_IO = nfs_direct_IO, #endif + .launder_page = nfs_launder_page, }; static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { - struct dentry * dentry = iocb->ki_filp->f_dentry; + struct dentry * dentry = iocb->ki_filp->f_path.dentry; struct inode * inode = dentry->d_inode; ssize_t result; size_t count = iov_length(iov, nr_segs); @@ -434,8 +434,9 @@ static int do_vfs_lock(struct file *file, struct file_lock *fl) BUG(); } if (res < 0) - printk(KERN_WARNING "%s: VFS is out of sync with lock manager!\n", - __FUNCTION__); + dprintk(KERN_WARNING "%s: VFS is out of sync with lock manager" + " - error %d!\n", + __FUNCTION__, res); return res; } @@ -535,8 +536,8 @@ static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl) static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl) { dprintk("NFS: nfs_flock(f=%s/%ld, t=%x, fl=%x)\n", - filp->f_dentry->d_inode->i_sb->s_id, - filp->f_dentry->d_inode->i_ino, + filp->f_path.dentry->d_inode->i_sb->s_id, + filp->f_path.dentry->d_inode->i_ino, fl->fl_type, fl->fl_flags); /* diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c index 20c6f39ea38..8391bd7a83c 100644 --- a/fs/nfs/getroot.c +++ b/fs/nfs/getroot.c @@ -31,7 +31,7 @@ #include <linux/nfs_idmap.h> #include <linux/vfs.h> #include <linux/namei.h> -#include <linux/namespace.h> +#include <linux/mnt_namespace.h> #include <linux/security.h> #include <asm/system.h> diff --git a/fs/nfs/idmap.c 
b/fs/nfs/idmap.c index 82ad7110a1c..9d4a6b2d199 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c @@ -377,7 +377,7 @@ idmap_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg, static ssize_t idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) { - struct rpc_inode *rpci = RPC_I(filp->f_dentry->d_inode); + struct rpc_inode *rpci = RPC_I(filp->f_path.dentry->d_inode); struct idmap *idmap = (struct idmap *)rpci->private; struct idmap_msg im_in, *im = &idmap->idmap_im; struct idmap_hashtable *h; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 36680d1061b..d8349828283 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -496,7 +496,7 @@ void put_nfs_open_context(struct nfs_open_context *ctx) */ static void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx) { - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = filp->f_path.dentry->d_inode; struct nfs_inode *nfsi = NFS_I(inode); filp->private_data = get_nfs_open_context(ctx); @@ -528,7 +528,7 @@ struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_c static void nfs_file_clear_open_context(struct file *filp) { - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = filp->f_path.dentry->d_inode; struct nfs_open_context *ctx = (struct nfs_open_context *)filp->private_data; if (ctx) { @@ -551,7 +551,7 @@ int nfs_open(struct inode *inode, struct file *filp) cred = rpcauth_lookupcred(NFS_CLIENT(inode)->cl_auth, 0); if (IS_ERR(cred)) return PTR_ERR(cred); - ctx = alloc_nfs_open_context(filp->f_vfsmnt, filp->f_dentry, cred); + ctx = alloc_nfs_open_context(filp->f_path.mnt, filp->f_path.dentry, cred); put_rpccred(cred); if (ctx == NULL) return -ENOMEM; @@ -665,49 +665,86 @@ int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) return __nfs_revalidate_inode(server, inode); } +static int nfs_invalidate_mapping_nolock(struct inode *inode, struct address_space *mapping) +{ + struct nfs_inode *nfsi = 
NFS_I(inode); + + if (mapping->nrpages != 0) { + int ret = invalidate_inode_pages2(mapping); + if (ret < 0) + return ret; + } + spin_lock(&inode->i_lock); + nfsi->cache_validity &= ~NFS_INO_INVALID_DATA; + if (S_ISDIR(inode->i_mode)) { + memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf)); + /* This ensures we revalidate child dentries */ + nfsi->cache_change_attribute = jiffies; + } + spin_unlock(&inode->i_lock); + nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE); + dfprintk(PAGECACHE, "NFS: (%s/%Ld) data cache invalidated\n", + inode->i_sb->s_id, (long long)NFS_FILEID(inode)); + return 0; +} + +static int nfs_invalidate_mapping(struct inode *inode, struct address_space *mapping) +{ + int ret = 0; + + mutex_lock(&inode->i_mutex); + if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_DATA) { + ret = nfs_sync_mapping(mapping); + if (ret == 0) + ret = nfs_invalidate_mapping_nolock(inode, mapping); + } + mutex_unlock(&inode->i_mutex); + return ret; +} + /** - * nfs_revalidate_mapping - Revalidate the pagecache + * nfs_revalidate_mapping_nolock - Revalidate the pagecache * @inode - pointer to host inode * @mapping - pointer to mapping */ -int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) +int nfs_revalidate_mapping_nolock(struct inode *inode, struct address_space *mapping) { struct nfs_inode *nfsi = NFS_I(inode); int ret = 0; - if (NFS_STALE(inode)) - ret = -ESTALE; if ((nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE) - || nfs_attribute_timeout(inode)) + || nfs_attribute_timeout(inode) || NFS_STALE(inode)) { ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode); - if (ret < 0) - goto out; + if (ret < 0) + goto out; + } + if (nfsi->cache_validity & NFS_INO_INVALID_DATA) + ret = nfs_invalidate_mapping_nolock(inode, mapping); +out: + return ret; +} - if (nfsi->cache_validity & NFS_INO_INVALID_DATA) { - if (mapping->nrpages != 0) { - if (S_ISREG(inode->i_mode)) { - ret = nfs_sync_mapping(mapping); - if (ret < 0) - goto out; - } - ret = 
invalidate_inode_pages2(mapping); - if (ret < 0) - goto out; - } - spin_lock(&inode->i_lock); - nfsi->cache_validity &= ~NFS_INO_INVALID_DATA; - if (S_ISDIR(inode->i_mode)) { - memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf)); - /* This ensures we revalidate child dentries */ - nfsi->cache_change_attribute = jiffies; - } - spin_unlock(&inode->i_lock); +/** + * nfs_revalidate_mapping - Revalidate the pagecache + * @inode - pointer to host inode + * @mapping - pointer to mapping + * + * This version of the function will take the inode->i_mutex and attempt to + * flush out all dirty data if it needs to invalidate the page cache. + */ +int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) +{ + struct nfs_inode *nfsi = NFS_I(inode); + int ret = 0; - nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE); - dfprintk(PAGECACHE, "NFS: (%s/%Ld) data cache invalidated\n", - inode->i_sb->s_id, - (long long)NFS_FILEID(inode)); + if ((nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE) + || nfs_attribute_timeout(inode) || NFS_STALE(inode)) { + ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode); + if (ret < 0) + goto out; } + if (nfsi->cache_validity & NFS_INO_INVALID_DATA) + ret = nfs_invalidate_mapping(inode, mapping); out: return ret; } diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 510ae524f3f..acd8fe9762d 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -841,7 +841,7 @@ static void nfs3_proc_commit_setup(struct nfs_write_data *data, int how) static int nfs3_proc_lock(struct file *filp, int cmd, struct file_lock *fl) { - return nlmclnt_proc(filp->f_dentry->d_inode, cmd, fl); + return nlmclnt_proc(filp->f_path.dentry->d_inode, cmd, fl); } const struct nfs_rpc_ops nfs_v3_clientops = { diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index ee458aeab24..b3fd29baadc 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1877,7 +1877,7 @@ static int nfs4_proc_unlink_setup(struct rpc_message *msg, struct dentry *dir, struct nfs_server 
*server = NFS_SERVER(dir->d_inode); struct unlink_desc *up; - up = (struct unlink_desc *) kmalloc(sizeof(*up), GFP_KERNEL); + up = kmalloc(sizeof(*up), GFP_KERNEL); if (!up) return -ENOMEM; diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index 10f5e80ca15..560536ad74a 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -651,7 +651,7 @@ nfs_proc_commit_setup(struct nfs_write_data *data, int how) static int nfs_proc_lock(struct file *filp, int cmd, struct file_lock *fl) { - return nlmclnt_proc(filp->f_dentry->d_inode, cmd, fl); + return nlmclnt_proc(filp->f_path.dentry->d_inode, cmd, fl); } diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c index 6c686112cc0..525c136c7d8 100644 --- a/fs/nfs/symlink.c +++ b/fs/nfs/symlink.c @@ -50,7 +50,9 @@ static void *nfs_follow_link(struct dentry *dentry, struct nameidata *nd) { struct inode *inode = dentry->d_inode; struct page *page; - void *err = ERR_PTR(nfs_revalidate_mapping(inode, inode->i_mapping)); + void *err; + + err = ERR_PTR(nfs_revalidate_mapping_nolock(inode, inode->i_mapping)); if (err) goto read_failed; page = read_cache_page(&inode->i_data, 0, diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 594eb16879e..345492e7864 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -754,8 +754,8 @@ int nfs_updatepage(struct file *file, struct page *page, nfs_inc_stats(inode, NFSIOS_VFSUPDATEPAGE); dprintk("NFS: nfs_updatepage(%s/%s %d@%Ld)\n", - file->f_dentry->d_parent->d_name.name, - file->f_dentry->d_name.name, count, + file->f_path.dentry->d_parent->d_name.name, + file->f_path.dentry->d_name.name, count, (long long)(page_offset(page) +offset)); /* If we're not using byte range locks, and we know the page diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index f37df46d2ea..49c310b8492 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -35,7 +35,6 @@ #include <linux/lockd/bind.h> #define NFSDDBG_FACILITY NFSDDBG_EXPORT -#define NFSD_PARANOIA 1 typedef struct auth_domain svc_client; typedef struct svc_export svc_export; @@ -787,15 
+786,20 @@ exp_get_by_name(svc_client *clp, struct vfsmount *mnt, struct dentry *dentry, key.ex_dentry = dentry; exp = svc_export_lookup(&key); - if (exp != NULL) - switch (cache_check(&svc_export_cache, &exp->h, reqp)) { + if (exp != NULL) { + int err; + + err = cache_check(&svc_export_cache, &exp->h, reqp); + switch (err) { case 0: break; case -EAGAIN: - exp = ERR_PTR(-EAGAIN); + case -ETIMEDOUT: + exp = ERR_PTR(err); break; default: exp = NULL; } + } return exp; } @@ -950,6 +954,8 @@ exp_export(struct nfsctl_export *nxp) exp = exp_get_by_name(clp, nd.mnt, nd.dentry, NULL); + memset(&new, 0, sizeof(new)); + /* must make sure there won't be an ex_fsid clash */ if ((nxp->ex_flags & NFSEXP_FSID) && (fsid_key = exp_get_fsid_key(clp, nxp->ex_dev)) && @@ -980,6 +986,9 @@ exp_export(struct nfsctl_export *nxp) new.h.expiry_time = NEVER; new.h.flags = 0; + new.ex_path = kstrdup(nxp->ex_path, GFP_KERNEL); + if (!new.ex_path) + goto finish; new.ex_client = clp; new.ex_mnt = nd.mnt; new.ex_dentry = nd.dentry; @@ -1000,10 +1009,11 @@ exp_export(struct nfsctl_export *nxp) /* failed to create at least one index */ exp_do_unexport(exp); cache_flush(); - err = -ENOMEM; - } - + } else + err = 0; finish: + if (new.ex_path) + kfree(new.ex_path); if (exp) exp_put(exp); if (fsid_key && !IS_ERR(fsid_key)) @@ -1104,6 +1114,10 @@ exp_rootfh(svc_client *clp, char *path, struct knfsd_fh *f, int maxsize) path, nd.dentry, clp->name, inode->i_sb->s_id, inode->i_ino); exp = exp_parent(clp, nd.mnt, nd.dentry, NULL); + if (IS_ERR(exp)) { + err = PTR_ERR(exp); + goto out; + } if (!exp) { dprintk("nfsd: exp_rootfh export not found.\n"); goto out; @@ -1159,12 +1173,10 @@ exp_pseudoroot(struct auth_domain *clp, struct svc_fh *fhp, mk_fsid_v1(fsidv, 0); exp = exp_find(clp, 1, fsidv, creq); - if (IS_ERR(exp) && PTR_ERR(exp) == -EAGAIN) - return nfserr_dropit; + if (IS_ERR(exp)) + return nfserrno(PTR_ERR(exp)); if (exp == NULL) return nfserr_perm; - else if (IS_ERR(exp)) - return 
nfserrno(PTR_ERR(exp)); rv = fh_compose(fhp, exp, exp->ex_dentry, NULL); exp_put(exp); return rv; diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c index 11fdaf7721b..221acd1f11f 100644 --- a/fs/nfsd/lockd.c +++ b/fs/nfsd/lockd.c @@ -22,7 +22,7 @@ /* * Note: we hold the dentry use count while the file is open. */ -static u32 +static __be32 nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, struct file **filp) { __be32 nfserr; diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c index e3eca081698..edde5dc5f79 100644 --- a/fs/nfsd/nfs2acl.c +++ b/fs/nfsd/nfs2acl.c @@ -222,12 +222,10 @@ static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p, { struct dentry *dentry = resp->fh.fh_dentry; struct inode *inode = dentry->d_inode; - int w = nfsacl_size( - (resp->mask & NFS_ACL) ? resp->acl_access : NULL, - (resp->mask & NFS_DFACL) ? resp->acl_default : NULL); struct kvec *head = rqstp->rq_res.head; unsigned int base; int n; + int w; if (dentry == NULL || dentry->d_inode == NULL) return 0; @@ -239,7 +237,9 @@ static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p, return 0; base = (char *)p - (char *)head->iov_base; - rqstp->rq_res.page_len = w; + rqstp->rq_res.page_len = w = nfsacl_size( + (resp->mask & NFS_ACL) ? resp->acl_access : NULL, + (resp->mask & NFS_DFACL) ? resp->acl_default : NULL); while (w > 0) { if (!rqstp->rq_respages[rqstp->rq_resused++]) return 0; diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c index fcad2895ddb..3e3f2de82c3 100644 --- a/fs/nfsd/nfs3acl.c +++ b/fs/nfsd/nfs3acl.c @@ -171,19 +171,19 @@ static int nfs3svc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p, p = nfs3svc_encode_post_op_attr(rqstp, p, &resp->fh); if (resp->status == 0 && dentry && dentry->d_inode) { struct inode *inode = dentry->d_inode; - int w = nfsacl_size( - (resp->mask & NFS_ACL) ? resp->acl_access : NULL, - (resp->mask & NFS_DFACL) ? 
resp->acl_default : NULL); struct kvec *head = rqstp->rq_res.head; unsigned int base; int n; + int w; *p++ = htonl(resp->mask); if (!xdr_ressize_check(rqstp, p)) return 0; base = (char *)p - (char *)head->iov_base; - rqstp->rq_res.page_len = w; + rqstp->rq_res.page_len = w = nfsacl_size( + (resp->mask & NFS_ACL) ? resp->acl_access : NULL, + (resp->mask & NFS_DFACL) ? resp->acl_default : NULL); while (w > 0) { if (!rqstp->rq_respages[rqstp->rq_resused++]) return 0; diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 277df40f098..e695660921e 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -990,15 +990,16 @@ encode_entry(struct readdir_cd *ccd, const char *name, } int -nfs3svc_encode_entry(struct readdir_cd *cd, const char *name, - int namlen, loff_t offset, ino_t ino, unsigned int d_type) +nfs3svc_encode_entry(void *cd, const char *name, + int namlen, loff_t offset, u64 ino, unsigned int d_type) { return encode_entry(cd, name, namlen, offset, ino, d_type, 0); } int -nfs3svc_encode_entry_plus(struct readdir_cd *cd, const char *name, - int namlen, loff_t offset, ino_t ino, unsigned int d_type) +nfs3svc_encode_entry_plus(void *cd, const char *name, + int namlen, loff_t offset, u64 ino, + unsigned int d_type) { return encode_entry(cd, name, namlen, offset, ino, d_type, 1); } diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 50bc94243ca..8522729830d 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -33,13 +33,6 @@ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * Note: some routines in this file are just trivial wrappers - * (e.g. nfsd4_lookup()) defined solely for the sake of consistent - * naming. Since all such routines have been declared "inline", - * there shouldn't be any associated overhead. 
At some point in - the future, I might inline these "by hand" to clean up a - little. */ #include <linux/param.h> @@ -161,8 +154,9 @@ do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_ } -static inline __be32 -nfsd4_open(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open, struct nfs4_stateowner **replay_owner) +static __be32 +nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_open *open) { __be32 status; dprintk("NFSD: nfsd4_open filename %.*s op_stateowner %p\n", @@ -179,11 +173,11 @@ nfsd4_open(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open status = nfsd4_process_open1(open); if (status == nfserr_replay_me) { struct nfs4_replay *rp = &open->op_stateowner->so_replay; - fh_put(current_fh); - current_fh->fh_handle.fh_size = rp->rp_openfh_len; - memcpy(&current_fh->fh_handle.fh_base, rp->rp_openfh, + fh_put(&cstate->current_fh); + cstate->current_fh.fh_handle.fh_size = rp->rp_openfh_len; + memcpy(&cstate->current_fh.fh_handle.fh_base, rp->rp_openfh, rp->rp_openfh_len); - status = fh_verify(rqstp, current_fh, 0, MAY_NOP); + status = fh_verify(rqstp, &cstate->current_fh, 0, MAY_NOP); if (status) dprintk("nfsd4_open: replay failed" " restoring previous filehandle\n"); @@ -215,7 +209,8 @@ nfsd4_open(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open * (3) set open->op_truncate if the file is to be * truncated after opening, (4) do permission checking. */ - status = do_open_lookup(rqstp, current_fh, open); + status = do_open_lookup(rqstp, &cstate->current_fh, + open); if (status) goto out; break; @@ -227,7 +222,8 @@ nfsd4_open(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open * open->op_truncate if the file is to be truncated * after opening, (3) do permission checking.
*/ - status = do_open_fhandle(rqstp, current_fh, open); + status = do_open_fhandle(rqstp, &cstate->current_fh, + open); if (status) goto out; break; @@ -248,11 +244,11 @@ nfsd4_open(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open * successful, it (1) truncates the file if open->op_truncate was * set, (2) sets open->op_stateid, (3) sets open->op_delegation. */ - status = nfsd4_process_open2(rqstp, current_fh, open); + status = nfsd4_process_open2(rqstp, &cstate->current_fh, open); out: if (open->op_stateowner) { nfs4_get_stateowner(open->op_stateowner); - *replay_owner = open->op_stateowner; + cstate->replay_owner = open->op_stateowner; } nfs4_unlock_state(); return status; @@ -261,71 +257,80 @@ out: /* * filehandle-manipulating ops. */ -static inline __be32 -nfsd4_getfh(struct svc_fh *current_fh, struct svc_fh **getfh) +static __be32 +nfsd4_getfh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct svc_fh **getfh) { - if (!current_fh->fh_dentry) + if (!cstate->current_fh.fh_dentry) return nfserr_nofilehandle; - *getfh = current_fh; + *getfh = &cstate->current_fh; return nfs_ok; } -static inline __be32 -nfsd4_putfh(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_putfh *putfh) +static __be32 +nfsd4_putfh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_putfh *putfh) { - fh_put(current_fh); - current_fh->fh_handle.fh_size = putfh->pf_fhlen; - memcpy(&current_fh->fh_handle.fh_base, putfh->pf_fhval, putfh->pf_fhlen); - return fh_verify(rqstp, current_fh, 0, MAY_NOP); + fh_put(&cstate->current_fh); + cstate->current_fh.fh_handle.fh_size = putfh->pf_fhlen; + memcpy(&cstate->current_fh.fh_handle.fh_base, putfh->pf_fhval, + putfh->pf_fhlen); + return fh_verify(rqstp, &cstate->current_fh, 0, MAY_NOP); } -static inline __be32 -nfsd4_putrootfh(struct svc_rqst *rqstp, struct svc_fh *current_fh) +static __be32 +nfsd4_putrootfh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + void *arg) {
__be32 status; - fh_put(current_fh); - status = exp_pseudoroot(rqstp->rq_client, current_fh, + fh_put(&cstate->current_fh); + status = exp_pseudoroot(rqstp->rq_client, &cstate->current_fh, &rqstp->rq_chandle); return status; } -static inline __be32 -nfsd4_restorefh(struct svc_fh *current_fh, struct svc_fh *save_fh) +static __be32 +nfsd4_restorefh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + void *arg) { - if (!save_fh->fh_dentry) + if (!cstate->save_fh.fh_dentry) return nfserr_restorefh; - fh_dup2(current_fh, save_fh); + fh_dup2(&cstate->current_fh, &cstate->save_fh); return nfs_ok; } -static inline __be32 -nfsd4_savefh(struct svc_fh *current_fh, struct svc_fh *save_fh) +static __be32 +nfsd4_savefh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + void *arg) { - if (!current_fh->fh_dentry) + if (!cstate->current_fh.fh_dentry) return nfserr_nofilehandle; - fh_dup2(save_fh, current_fh); + fh_dup2(&cstate->save_fh, &cstate->current_fh); return nfs_ok; } /* * misc nfsv4 ops */ -static inline __be32 -nfsd4_access(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_access *access) +static __be32 +nfsd4_access(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_access *access) { if (access->ac_req_access & ~NFS3_ACCESS_FULL) return nfserr_inval; access->ac_resp_access = access->ac_req_access; - return nfsd_access(rqstp, current_fh, &access->ac_resp_access, &access->ac_supported); + return nfsd_access(rqstp, &cstate->current_fh, &access->ac_resp_access, + &access->ac_supported); } -static inline __be32 -nfsd4_commit(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_commit *commit) +static __be32 +nfsd4_commit(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_commit *commit) { __be32 status; @@ -333,14 +338,16 @@ nfsd4_commit(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_com *p++ = nfssvc_boot.tv_sec; *p++ = nfssvc_boot.tv_usec; - status = 
nfsd_commit(rqstp, current_fh, commit->co_offset, commit->co_count); + status = nfsd_commit(rqstp, &cstate->current_fh, commit->co_offset, + commit->co_count); if (status == nfserr_symlink) status = nfserr_inval; return status; } static __be32 -nfsd4_create(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_create *create) +nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_create *create) { struct svc_fh resfh; __be32 status; @@ -348,7 +355,7 @@ nfsd4_create(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_cre fh_init(&resfh, NFS4_FHSIZE); - status = fh_verify(rqstp, current_fh, S_IFDIR, MAY_CREATE); + status = fh_verify(rqstp, &cstate->current_fh, S_IFDIR, MAY_CREATE); if (status == nfserr_symlink) status = nfserr_notdir; if (status) @@ -365,9 +372,10 @@ nfsd4_create(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_cre */ create->cr_linkname[create->cr_linklen] = 0; - status = nfsd_symlink(rqstp, current_fh, create->cr_name, - create->cr_namelen, create->cr_linkname, - create->cr_linklen, &resfh, &create->cr_iattr); + status = nfsd_symlink(rqstp, &cstate->current_fh, + create->cr_name, create->cr_namelen, + create->cr_linkname, create->cr_linklen, + &resfh, &create->cr_iattr); break; case NF4BLK: @@ -375,9 +383,9 @@ nfsd4_create(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_cre if (MAJOR(rdev) != create->cr_specdata1 || MINOR(rdev) != create->cr_specdata2) return nfserr_inval; - status = nfsd_create(rqstp, current_fh, create->cr_name, - create->cr_namelen, &create->cr_iattr, - S_IFBLK, rdev, &resfh); + status = nfsd_create(rqstp, &cstate->current_fh, + create->cr_name, create->cr_namelen, + &create->cr_iattr, S_IFBLK, rdev, &resfh); break; case NF4CHR: @@ -385,28 +393,28 @@ nfsd4_create(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_cre if (MAJOR(rdev) != create->cr_specdata1 || MINOR(rdev) != create->cr_specdata2) return nfserr_inval; - status = 
nfsd_create(rqstp, current_fh, create->cr_name, - create->cr_namelen, &create->cr_iattr, - S_IFCHR, rdev, &resfh); + status = nfsd_create(rqstp, &cstate->current_fh, + create->cr_name, create->cr_namelen, + &create->cr_iattr,S_IFCHR, rdev, &resfh); break; case NF4SOCK: - status = nfsd_create(rqstp, current_fh, create->cr_name, - create->cr_namelen, &create->cr_iattr, - S_IFSOCK, 0, &resfh); + status = nfsd_create(rqstp, &cstate->current_fh, + create->cr_name, create->cr_namelen, + &create->cr_iattr, S_IFSOCK, 0, &resfh); break; case NF4FIFO: - status = nfsd_create(rqstp, current_fh, create->cr_name, - create->cr_namelen, &create->cr_iattr, - S_IFIFO, 0, &resfh); + status = nfsd_create(rqstp, &cstate->current_fh, + create->cr_name, create->cr_namelen, + &create->cr_iattr, S_IFIFO, 0, &resfh); break; case NF4DIR: create->cr_iattr.ia_valid &= ~ATTR_SIZE; - status = nfsd_create(rqstp, current_fh, create->cr_name, - create->cr_namelen, &create->cr_iattr, - S_IFDIR, 0, &resfh); + status = nfsd_create(rqstp, &cstate->current_fh, + create->cr_name, create->cr_namelen, + &create->cr_iattr, S_IFDIR, 0, &resfh); break; default: @@ -414,21 +422,22 @@ nfsd4_create(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_cre } if (!status) { - fh_unlock(current_fh); - set_change_info(&create->cr_cinfo, current_fh); - fh_dup2(current_fh, &resfh); + fh_unlock(&cstate->current_fh); + set_change_info(&create->cr_cinfo, &cstate->current_fh); + fh_dup2(&cstate->current_fh, &resfh); } fh_put(&resfh); return status; } -static inline __be32 -nfsd4_getattr(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_getattr *getattr) +static __be32 +nfsd4_getattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_getattr *getattr) { __be32 status; - status = fh_verify(rqstp, current_fh, 0, MAY_NOP); + status = fh_verify(rqstp, &cstate->current_fh, 0, MAY_NOP); if (status) return status; @@ -438,26 +447,28 @@ nfsd4_getattr(struct svc_rqst *rqstp, struct 
svc_fh *current_fh, struct nfsd4_ge getattr->ga_bmval[0] &= NFSD_SUPPORTED_ATTRS_WORD0; getattr->ga_bmval[1] &= NFSD_SUPPORTED_ATTRS_WORD1; - getattr->ga_fhp = current_fh; + getattr->ga_fhp = &cstate->current_fh; return nfs_ok; } -static inline __be32 -nfsd4_link(struct svc_rqst *rqstp, struct svc_fh *current_fh, - struct svc_fh *save_fh, struct nfsd4_link *link) +static __be32 +nfsd4_link(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_link *link) { __be32 status = nfserr_nofilehandle; - if (!save_fh->fh_dentry) + if (!cstate->save_fh.fh_dentry) return status; - status = nfsd_link(rqstp, current_fh, link->li_name, link->li_namelen, save_fh); + status = nfsd_link(rqstp, &cstate->current_fh, + link->li_name, link->li_namelen, &cstate->save_fh); if (!status) - set_change_info(&link->li_cinfo, current_fh); + set_change_info(&link->li_cinfo, &cstate->current_fh); return status; } static __be32 -nfsd4_lookupp(struct svc_rqst *rqstp, struct svc_fh *current_fh) +nfsd4_lookupp(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + void *arg) { struct svc_fh tmp_fh; __be32 ret; @@ -466,22 +477,27 @@ nfsd4_lookupp(struct svc_rqst *rqstp, struct svc_fh *current_fh) if((ret = exp_pseudoroot(rqstp->rq_client, &tmp_fh, &rqstp->rq_chandle)) != 0) return ret; - if (tmp_fh.fh_dentry == current_fh->fh_dentry) { + if (tmp_fh.fh_dentry == cstate->current_fh.fh_dentry) { fh_put(&tmp_fh); return nfserr_noent; } fh_put(&tmp_fh); - return nfsd_lookup(rqstp, current_fh, "..", 2, current_fh); + return nfsd_lookup(rqstp, &cstate->current_fh, + "..", 2, &cstate->current_fh); } -static inline __be32 -nfsd4_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lookup *lookup) +static __be32 +nfsd4_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_lookup *lookup) { - return nfsd_lookup(rqstp, current_fh, lookup->lo_name, lookup->lo_len, current_fh); + return nfsd_lookup(rqstp, &cstate->current_fh, + 
lookup->lo_name, lookup->lo_len, + &cstate->current_fh); } -static inline __be32 -nfsd4_read(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_read *read) +static __be32 +nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_read *read) { __be32 status; @@ -493,7 +509,8 @@ nfsd4_read(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_read nfs4_lock_state(); /* check stateid */ - if ((status = nfs4_preprocess_stateid_op(current_fh, &read->rd_stateid, + if ((status = nfs4_preprocess_stateid_op(&cstate->current_fh, + &read->rd_stateid, CHECK_FH | RD_STATE, &read->rd_filp))) { dprintk("NFSD: nfsd4_read: couldn't process stateid!\n"); goto out; @@ -504,12 +521,13 @@ nfsd4_read(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_read out: nfs4_unlock_state(); read->rd_rqstp = rqstp; - read->rd_fhp = current_fh; + read->rd_fhp = &cstate->current_fh; return status; } -static inline __be32 -nfsd4_readdir(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_readdir *readdir) +static __be32 +nfsd4_readdir(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_readdir *readdir) { u64 cookie = readdir->rd_cookie; static const nfs4_verifier zeroverf; @@ -527,48 +545,51 @@ nfsd4_readdir(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_re return nfserr_bad_cookie; readdir->rd_rqstp = rqstp; - readdir->rd_fhp = current_fh; + readdir->rd_fhp = &cstate->current_fh; return nfs_ok; } -static inline __be32 -nfsd4_readlink(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_readlink *readlink) +static __be32 +nfsd4_readlink(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_readlink *readlink) { readlink->rl_rqstp = rqstp; - readlink->rl_fhp = current_fh; + readlink->rl_fhp = &cstate->current_fh; return nfs_ok; } -static inline __be32 -nfsd4_remove(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_remove *remove) +static 
__be32 +nfsd4_remove(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_remove *remove) { __be32 status; if (nfs4_in_grace()) return nfserr_grace; - status = nfsd_unlink(rqstp, current_fh, 0, remove->rm_name, remove->rm_namelen); + status = nfsd_unlink(rqstp, &cstate->current_fh, 0, + remove->rm_name, remove->rm_namelen); if (status == nfserr_symlink) return nfserr_notdir; if (!status) { - fh_unlock(current_fh); - set_change_info(&remove->rm_cinfo, current_fh); + fh_unlock(&cstate->current_fh); + set_change_info(&remove->rm_cinfo, &cstate->current_fh); } return status; } -static inline __be32 -nfsd4_rename(struct svc_rqst *rqstp, struct svc_fh *current_fh, - struct svc_fh *save_fh, struct nfsd4_rename *rename) +static __be32 +nfsd4_rename(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_rename *rename) { __be32 status = nfserr_nofilehandle; - if (!save_fh->fh_dentry) + if (!cstate->save_fh.fh_dentry) return status; - if (nfs4_in_grace() && !(save_fh->fh_export->ex_flags + if (nfs4_in_grace() && !(cstate->save_fh.fh_export->ex_flags & NFSEXP_NOSUBTREECHECK)) return nfserr_grace; - status = nfsd_rename(rqstp, save_fh, rename->rn_sname, - rename->rn_snamelen, current_fh, + status = nfsd_rename(rqstp, &cstate->save_fh, rename->rn_sname, + rename->rn_snamelen, &cstate->current_fh, rename->rn_tname, rename->rn_tnamelen); /* the underlying filesystem returns different error's than required @@ -576,27 +597,28 @@ nfsd4_rename(struct svc_rqst *rqstp, struct svc_fh *current_fh, if (status == nfserr_isdir) status = nfserr_exist; else if ((status == nfserr_notdir) && - (S_ISDIR(save_fh->fh_dentry->d_inode->i_mode) && - S_ISDIR(current_fh->fh_dentry->d_inode->i_mode))) + (S_ISDIR(cstate->save_fh.fh_dentry->d_inode->i_mode) && + S_ISDIR(cstate->current_fh.fh_dentry->d_inode->i_mode))) status = nfserr_exist; else if (status == nfserr_symlink) status = nfserr_notdir; if (!status) { - set_change_info(&rename->rn_sinfo, current_fh); 
- set_change_info(&rename->rn_tinfo, save_fh); + set_change_info(&rename->rn_sinfo, &cstate->current_fh); + set_change_info(&rename->rn_tinfo, &cstate->save_fh); } return status; } -static inline __be32 -nfsd4_setattr(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_setattr *setattr) +static __be32 +nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_setattr *setattr) { __be32 status = nfs_ok; if (setattr->sa_iattr.ia_valid & ATTR_SIZE) { nfs4_lock_state(); - status = nfs4_preprocess_stateid_op(current_fh, + status = nfs4_preprocess_stateid_op(&cstate->current_fh, &setattr->sa_stateid, CHECK_FH | WR_STATE, NULL); nfs4_unlock_state(); if (status) { @@ -606,16 +628,18 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_se } status = nfs_ok; if (setattr->sa_acl != NULL) - status = nfsd4_set_nfs4_acl(rqstp, current_fh, setattr->sa_acl); + status = nfsd4_set_nfs4_acl(rqstp, &cstate->current_fh, + setattr->sa_acl); if (status) return status; - status = nfsd_setattr(rqstp, current_fh, &setattr->sa_iattr, + status = nfsd_setattr(rqstp, &cstate->current_fh, &setattr->sa_iattr, 0, (time_t)0); return status; } -static inline __be32 -nfsd4_write(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_write *write) +static __be32 +nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_write *write) { stateid_t *stateid = &write->wr_stateid; struct file *filp = NULL; @@ -628,7 +652,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_writ return nfserr_inval; nfs4_lock_state(); - status = nfs4_preprocess_stateid_op(current_fh, stateid, + status = nfs4_preprocess_stateid_op(&cstate->current_fh, stateid, CHECK_FH | WR_STATE, &filp); if (filp) get_file(filp); @@ -645,9 +669,9 @@ nfsd4_write(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_writ *p++ = nfssvc_boot.tv_sec; *p++ = nfssvc_boot.tv_usec; - status = 
nfsd_write(rqstp, current_fh, filp, write->wr_offset, - rqstp->rq_vec, write->wr_vlen, write->wr_buflen, - &write->wr_how_written); + status = nfsd_write(rqstp, &cstate->current_fh, filp, + write->wr_offset, rqstp->rq_vec, write->wr_vlen, + write->wr_buflen, &write->wr_how_written); if (filp) fput(filp); @@ -662,13 +686,14 @@ nfsd4_write(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_writ * to NFS_OK after the call; NVERIFY by mapping NFSERR_NOT_SAME to NFS_OK. */ static __be32 -nfsd4_verify(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_verify *verify) +_nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_verify *verify) { __be32 *buf, *p; int count; __be32 status; - status = fh_verify(rqstp, current_fh, 0, MAY_NOP); + status = fh_verify(rqstp, &cstate->current_fh, 0, MAY_NOP); if (status) return status; @@ -689,8 +714,9 @@ nfsd4_verify(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_ver if (!buf) return nfserr_resource; - status = nfsd4_encode_fattr(current_fh, current_fh->fh_export, - current_fh->fh_dentry, buf, + status = nfsd4_encode_fattr(&cstate->current_fh, + cstate->current_fh.fh_export, + cstate->current_fh.fh_dentry, buf, &count, verify->ve_bmval, rqstp); @@ -712,6 +738,26 @@ out_kfree: return status; } +static __be32 +nfsd4_nverify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_verify *verify) +{ + __be32 status; + + status = _nfsd4_verify(rqstp, cstate, verify); + return status == nfserr_not_same ? nfs_ok : status; +} + +static __be32 +nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_verify *verify) +{ + __be32 status; + + status = _nfsd4_verify(rqstp, cstate, verify); + return status == nfserr_same ? nfs_ok : status; +} + /* * NULL call. 
*/ @@ -727,6 +773,42 @@ static inline void nfsd4_increment_op_stats(u32 opnum) nfsdstats.nfs4_opcount[opnum]++; } +static void cstate_free(struct nfsd4_compound_state *cstate) +{ + if (cstate == NULL) + return; + fh_put(&cstate->current_fh); + fh_put(&cstate->save_fh); + BUG_ON(cstate->replay_owner); + kfree(cstate); +} + +static struct nfsd4_compound_state *cstate_alloc(void) +{ + struct nfsd4_compound_state *cstate; + + cstate = kmalloc(sizeof(struct nfsd4_compound_state), GFP_KERNEL); + if (cstate == NULL) + return NULL; + fh_init(&cstate->current_fh, NFS4_FHSIZE); + fh_init(&cstate->save_fh, NFS4_FHSIZE); + cstate->replay_owner = NULL; + return cstate; +} + +typedef __be32(*nfsd4op_func)(struct svc_rqst *, struct nfsd4_compound_state *, + void *); + +struct nfsd4_operation { + nfsd4op_func op_func; + u32 op_flags; +/* Most ops require a valid current filehandle; a few don't: */ +#define ALLOWED_WITHOUT_FH 1 +/* GETATTR and ops not listed as returning NFS4ERR_MOVED: */ +#define ALLOWED_ON_ABSENT_FS 2 +}; + +static struct nfsd4_operation nfsd4_ops[]; /* * COMPOUND call. @@ -737,21 +819,15 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, struct nfsd4_compoundres *resp) { struct nfsd4_op *op; - struct svc_fh *current_fh = NULL; - struct svc_fh *save_fh = NULL; - struct nfs4_stateowner *replay_owner = NULL; - int slack_space; /* in words, not bytes! 
*/ + struct nfsd4_operation *opdesc; + struct nfsd4_compound_state *cstate = NULL; + int slack_bytes; __be32 status; status = nfserr_resource; - current_fh = kmalloc(sizeof(*current_fh), GFP_KERNEL); - if (current_fh == NULL) - goto out; - fh_init(current_fh, NFS4_FHSIZE); - save_fh = kmalloc(sizeof(*save_fh), GFP_KERNEL); - if (save_fh == NULL) + cstate = cstate_alloc(); + if (cstate == NULL) goto out; - fh_init(save_fh, NFS4_FHSIZE); resp->xbuf = &rqstp->rq_res; resp->p = rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len; @@ -790,164 +866,44 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, * failed response to the next operation. If we don't * have enough room, fail with ERR_RESOURCE. */ -/* FIXME - is slack_space *really* words, or bytes??? - neilb */ - slack_space = (char *)resp->end - (char *)resp->p; - if (slack_space < COMPOUND_SLACK_SPACE + COMPOUND_ERR_SLACK_SPACE) { - BUG_ON(slack_space < COMPOUND_ERR_SLACK_SPACE); + slack_bytes = (char *)resp->end - (char *)resp->p; + if (slack_bytes < COMPOUND_SLACK_SPACE + + COMPOUND_ERR_SLACK_SPACE) { + BUG_ON(slack_bytes < COMPOUND_ERR_SLACK_SPACE); op->status = nfserr_resource; goto encode_op; } - /* All operations except RENEW, SETCLIENTID, RESTOREFH - * SETCLIENTID_CONFIRM, PUTFH and PUTROOTFH - * require a valid current filehandle - */ - if (!current_fh->fh_dentry) { - if (!((op->opnum == OP_PUTFH) || - (op->opnum == OP_PUTROOTFH) || - (op->opnum == OP_SETCLIENTID) || - (op->opnum == OP_SETCLIENTID_CONFIRM) || - (op->opnum == OP_RENEW) || - (op->opnum == OP_RESTOREFH) || - (op->opnum == OP_RELEASE_LOCKOWNER))) { + opdesc = &nfsd4_ops[op->opnum]; + + if (!cstate->current_fh.fh_dentry) { + if (!(opdesc->op_flags & ALLOWED_WITHOUT_FH)) { op->status = nfserr_nofilehandle; goto encode_op; } - } - /* Check must be done at start of each operation, except - * for GETATTR and ops not listed as returning NFS4ERR_MOVED - */ - else if (current_fh->fh_export->ex_fslocs.migrated && - !((op->opnum == OP_GETATTR) || - 
(op->opnum == OP_PUTROOTFH) || - (op->opnum == OP_PUTPUBFH) || - (op->opnum == OP_RENEW) || - (op->opnum == OP_SETCLIENTID) || - (op->opnum == OP_RELEASE_LOCKOWNER))) { + } else if (cstate->current_fh.fh_export->ex_fslocs.migrated && + !(opdesc->op_flags & ALLOWED_ON_ABSENT_FS)) { op->status = nfserr_moved; goto encode_op; } - switch (op->opnum) { - case OP_ACCESS: - op->status = nfsd4_access(rqstp, current_fh, &op->u.access); - break; - case OP_CLOSE: - op->status = nfsd4_close(rqstp, current_fh, &op->u.close, &replay_owner); - break; - case OP_COMMIT: - op->status = nfsd4_commit(rqstp, current_fh, &op->u.commit); - break; - case OP_CREATE: - op->status = nfsd4_create(rqstp, current_fh, &op->u.create); - break; - case OP_DELEGRETURN: - op->status = nfsd4_delegreturn(rqstp, current_fh, &op->u.delegreturn); - break; - case OP_GETATTR: - op->status = nfsd4_getattr(rqstp, current_fh, &op->u.getattr); - break; - case OP_GETFH: - op->status = nfsd4_getfh(current_fh, &op->u.getfh); - break; - case OP_LINK: - op->status = nfsd4_link(rqstp, current_fh, save_fh, &op->u.link); - break; - case OP_LOCK: - op->status = nfsd4_lock(rqstp, current_fh, &op->u.lock, &replay_owner); - break; - case OP_LOCKT: - op->status = nfsd4_lockt(rqstp, current_fh, &op->u.lockt); - break; - case OP_LOCKU: - op->status = nfsd4_locku(rqstp, current_fh, &op->u.locku, &replay_owner); - break; - case OP_LOOKUP: - op->status = nfsd4_lookup(rqstp, current_fh, &op->u.lookup); - break; - case OP_LOOKUPP: - op->status = nfsd4_lookupp(rqstp, current_fh); - break; - case OP_NVERIFY: - op->status = nfsd4_verify(rqstp, current_fh, &op->u.nverify); - if (op->status == nfserr_not_same) - op->status = nfs_ok; - break; - case OP_OPEN: - op->status = nfsd4_open(rqstp, current_fh, &op->u.open, &replay_owner); - break; - case OP_OPEN_CONFIRM: - op->status = nfsd4_open_confirm(rqstp, current_fh, &op->u.open_confirm, &replay_owner); - break; - case OP_OPEN_DOWNGRADE: - op->status = nfsd4_open_downgrade(rqstp, 
current_fh, &op->u.open_downgrade, &replay_owner); - break; - case OP_PUTFH: - op->status = nfsd4_putfh(rqstp, current_fh, &op->u.putfh); - break; - case OP_PUTROOTFH: - op->status = nfsd4_putrootfh(rqstp, current_fh); - break; - case OP_READ: - op->status = nfsd4_read(rqstp, current_fh, &op->u.read); - break; - case OP_READDIR: - op->status = nfsd4_readdir(rqstp, current_fh, &op->u.readdir); - break; - case OP_READLINK: - op->status = nfsd4_readlink(rqstp, current_fh, &op->u.readlink); - break; - case OP_REMOVE: - op->status = nfsd4_remove(rqstp, current_fh, &op->u.remove); - break; - case OP_RENAME: - op->status = nfsd4_rename(rqstp, current_fh, save_fh, &op->u.rename); - break; - case OP_RENEW: - op->status = nfsd4_renew(&op->u.renew); - break; - case OP_RESTOREFH: - op->status = nfsd4_restorefh(current_fh, save_fh); - break; - case OP_SAVEFH: - op->status = nfsd4_savefh(current_fh, save_fh); - break; - case OP_SETATTR: - op->status = nfsd4_setattr(rqstp, current_fh, &op->u.setattr); - break; - case OP_SETCLIENTID: - op->status = nfsd4_setclientid(rqstp, &op->u.setclientid); - break; - case OP_SETCLIENTID_CONFIRM: - op->status = nfsd4_setclientid_confirm(rqstp, &op->u.setclientid_confirm); - break; - case OP_VERIFY: - op->status = nfsd4_verify(rqstp, current_fh, &op->u.verify); - if (op->status == nfserr_same) - op->status = nfs_ok; - break; - case OP_WRITE: - op->status = nfsd4_write(rqstp, current_fh, &op->u.write); - break; - case OP_RELEASE_LOCKOWNER: - op->status = nfsd4_release_lockowner(rqstp, &op->u.release_lockowner); - break; - default: + + if (opdesc->op_func) + op->status = opdesc->op_func(rqstp, cstate, &op->u); + else BUG_ON(op->status == nfs_ok); - break; - } encode_op: if (op->status == nfserr_replay_me) { - op->replay = &replay_owner->so_replay; + op->replay = &cstate->replay_owner->so_replay; nfsd4_encode_replay(resp, op); status = op->status = op->replay->rp_status; } else { nfsd4_encode_operation(resp, op); status = op->status; } - if 
(replay_owner && (replay_owner != (void *)(-1))) { - nfs4_put_stateowner(replay_owner); - replay_owner = NULL; + if (cstate->replay_owner) { + nfs4_put_stateowner(cstate->replay_owner); + cstate->replay_owner = NULL; } /* XXX Ugh, we need to get rid of this kind of special case: */ if (op->opnum == OP_READ && op->u.read.rd_filp) @@ -958,15 +914,124 @@ encode_op: out: nfsd4_release_compoundargs(args); - if (current_fh) - fh_put(current_fh); - kfree(current_fh); - if (save_fh) - fh_put(save_fh); - kfree(save_fh); + cstate_free(cstate); return status; } +static struct nfsd4_operation nfsd4_ops[OP_RELEASE_LOCKOWNER+1] = { + [OP_ACCESS] = { + .op_func = (nfsd4op_func)nfsd4_access, + }, + [OP_CLOSE] = { + .op_func = (nfsd4op_func)nfsd4_close, + }, + [OP_COMMIT] = { + .op_func = (nfsd4op_func)nfsd4_commit, + }, + [OP_CREATE] = { + .op_func = (nfsd4op_func)nfsd4_create, + }, + [OP_DELEGRETURN] = { + .op_func = (nfsd4op_func)nfsd4_delegreturn, + }, + [OP_GETATTR] = { + .op_func = (nfsd4op_func)nfsd4_getattr, + .op_flags = ALLOWED_ON_ABSENT_FS, + }, + [OP_GETFH] = { + .op_func = (nfsd4op_func)nfsd4_getfh, + }, + [OP_LINK] = { + .op_func = (nfsd4op_func)nfsd4_link, + }, + [OP_LOCK] = { + .op_func = (nfsd4op_func)nfsd4_lock, + }, + [OP_LOCKT] = { + .op_func = (nfsd4op_func)nfsd4_lockt, + }, + [OP_LOCKU] = { + .op_func = (nfsd4op_func)nfsd4_locku, + }, + [OP_LOOKUP] = { + .op_func = (nfsd4op_func)nfsd4_lookup, + }, + [OP_LOOKUPP] = { + .op_func = (nfsd4op_func)nfsd4_lookupp, + }, + [OP_NVERIFY] = { + .op_func = (nfsd4op_func)nfsd4_nverify, + }, + [OP_OPEN] = { + .op_func = (nfsd4op_func)nfsd4_open, + }, + [OP_OPEN_CONFIRM] = { + .op_func = (nfsd4op_func)nfsd4_open_confirm, + }, + [OP_OPEN_DOWNGRADE] = { + .op_func = (nfsd4op_func)nfsd4_open_downgrade, + }, + [OP_PUTFH] = { + .op_func = (nfsd4op_func)nfsd4_putfh, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, + }, + [OP_PUTPUBFH] = { + /* unsupported; just for future reference: */ + .op_flags = ALLOWED_WITHOUT_FH | 
ALLOWED_ON_ABSENT_FS, + }, + [OP_PUTROOTFH] = { + .op_func = (nfsd4op_func)nfsd4_putrootfh, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, + }, + [OP_READ] = { + .op_func = (nfsd4op_func)nfsd4_read, + }, + [OP_READDIR] = { + .op_func = (nfsd4op_func)nfsd4_readdir, + }, + [OP_READLINK] = { + .op_func = (nfsd4op_func)nfsd4_readlink, + }, + [OP_REMOVE] = { + .op_func = (nfsd4op_func)nfsd4_remove, + }, + [OP_RENAME] = { + .op_func = (nfsd4op_func)nfsd4_rename, + }, + [OP_RENEW] = { + .op_func = (nfsd4op_func)nfsd4_renew, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, + }, + [OP_RESTOREFH] = { + .op_func = (nfsd4op_func)nfsd4_restorefh, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, + }, + [OP_SAVEFH] = { + .op_func = (nfsd4op_func)nfsd4_savefh, + }, + [OP_SETATTR] = { + .op_func = (nfsd4op_func)nfsd4_setattr, + }, + [OP_SETCLIENTID] = { + .op_func = (nfsd4op_func)nfsd4_setclientid, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, + }, + [OP_SETCLIENTID_CONFIRM] = { + .op_func = (nfsd4op_func)nfsd4_setclientid_confirm, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, + }, + [OP_VERIFY] = { + .op_func = (nfsd4op_func)nfsd4_verify, + }, + [OP_WRITE] = { + .op_func = (nfsd4op_func)nfsd4_write, + }, + [OP_RELEASE_LOCKOWNER] = { + .op_func = (nfsd4op_func)nfsd4_release_lockowner, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, + }, +}; + #define nfs4svc_decode_voidargs NULL #define nfs4svc_release_void NULL #define nfsd4_voidres nfsd4_voidargs diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 81b8565d383..c7774e3a946 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -259,7 +259,7 @@ nfsd4_remove_clid_file(struct dentry *dir, struct dentry *dentry) printk("nfsd4: non-file found in client recovery directory\n"); return -EINVAL; } - mutex_lock(&dir->d_inode->i_mutex); + mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); status = vfs_unlink(dir->d_inode, dentry); 
mutex_unlock(&dir->d_inode->i_mutex); return status; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 640c92b2a9f..9de89df961f 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -711,7 +711,8 @@ out_err: * */ __be32 -nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_setclientid *setclid) +nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_setclientid *setclid) { __be32 ip_addr = rqstp->rq_addr.sin_addr.s_addr; struct xdr_netobj clname = { @@ -876,7 +877,9 @@ out: * NOTE: callback information will be processed here in a future patch */ __be32 -nfsd4_setclientid_confirm(struct svc_rqst *rqstp, struct nfsd4_setclientid_confirm *setclientid_confirm) +nfsd4_setclientid_confirm(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_setclientid_confirm *setclientid_confirm) { __be32 ip_addr = rqstp->rq_addr.sin_addr.s_addr; struct nfs4_client *conf, *unconf; @@ -1310,7 +1313,7 @@ static inline void nfs4_file_downgrade(struct file *filp, unsigned int share_access) { if (share_access & NFS4_SHARE_ACCESS_WRITE) { - put_write_access(filp->f_dentry->d_inode); + put_write_access(filp->f_path.dentry->d_inode); filp->f_mode = (filp->f_mode | FMODE_READ) & ~FMODE_WRITE; } } @@ -1623,7 +1626,7 @@ static __be32 nfs4_upgrade_open(struct svc_rqst *rqstp, struct svc_fh *cur_fh, struct nfs4_stateid *stp, struct nfsd4_open *open) { struct file *filp = stp->st_vfs_file; - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = filp->f_path.dentry->d_inode; unsigned int share_access, new_writer; __be32 status; @@ -1833,7 +1836,8 @@ static void laundromat_main(struct work_struct *); static DECLARE_DELAYED_WORK(laundromat_work, laundromat_main); __be32 -nfsd4_renew(clientid_t *clid) +nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + clientid_t *clid) { struct nfs4_client *clp; __be32 status; @@ -1965,7 +1969,7 @@ search_close_lru(u32 st_id, int flags) static 
inline int nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stateid *stp) { - return fhp->fh_dentry->d_inode != stp->st_vfs_file->f_dentry->d_inode; + return fhp->fh_dentry->d_inode != stp->st_vfs_file->f_path.dentry->d_inode; } static int @@ -2241,24 +2245,25 @@ check_replay: } __be32 -nfsd4_open_confirm(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open_confirm *oc, struct nfs4_stateowner **replay_owner) +nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_open_confirm *oc) { __be32 status; struct nfs4_stateowner *sop; struct nfs4_stateid *stp; dprintk("NFSD: nfsd4_open_confirm on file %.*s\n", - (int)current_fh->fh_dentry->d_name.len, - current_fh->fh_dentry->d_name.name); + (int)cstate->current_fh.fh_dentry->d_name.len, + cstate->current_fh.fh_dentry->d_name.name); - status = fh_verify(rqstp, current_fh, S_IFREG, 0); + status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0); if (status) return status; nfs4_lock_state(); - if ((status = nfs4_preprocess_seqid_op(current_fh, oc->oc_seqid, - &oc->oc_req_stateid, + if ((status = nfs4_preprocess_seqid_op(&cstate->current_fh, + oc->oc_seqid, &oc->oc_req_stateid, CHECK_FH | CONFIRM | OPEN_STATE, &oc->oc_stateowner, &stp, NULL))) goto out; @@ -2278,7 +2283,7 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfs out: if (oc->oc_stateowner) { nfs4_get_stateowner(oc->oc_stateowner); - *replay_owner = oc->oc_stateowner; + cstate->replay_owner = oc->oc_stateowner; } nfs4_unlock_state(); return status; @@ -2310,22 +2315,25 @@ reset_union_bmap_deny(unsigned long deny, unsigned long *bmap) } __be32 -nfsd4_open_downgrade(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open_downgrade *od, struct nfs4_stateowner **replay_owner) +nfsd4_open_downgrade(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_open_downgrade *od) { __be32 status; struct nfs4_stateid *stp; unsigned int share_access; 
dprintk("NFSD: nfsd4_open_downgrade on file %.*s\n", - (int)current_fh->fh_dentry->d_name.len, - current_fh->fh_dentry->d_name.name); + (int)cstate->current_fh.fh_dentry->d_name.len, + cstate->current_fh.fh_dentry->d_name.name); if (!access_valid(od->od_share_access) || !deny_valid(od->od_share_deny)) return nfserr_inval; nfs4_lock_state(); - if ((status = nfs4_preprocess_seqid_op(current_fh, od->od_seqid, + if ((status = nfs4_preprocess_seqid_op(&cstate->current_fh, + od->od_seqid, &od->od_stateid, CHECK_FH | OPEN_STATE, &od->od_stateowner, &stp, NULL))) @@ -2355,7 +2363,7 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct n out: if (od->od_stateowner) { nfs4_get_stateowner(od->od_stateowner); - *replay_owner = od->od_stateowner; + cstate->replay_owner = od->od_stateowner; } nfs4_unlock_state(); return status; @@ -2365,18 +2373,20 @@ out: * nfs4_unlock_state() called after encode */ __be32 -nfsd4_close(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_close *close, struct nfs4_stateowner **replay_owner) +nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_close *close) { __be32 status; struct nfs4_stateid *stp; dprintk("NFSD: nfsd4_close on file %.*s\n", - (int)current_fh->fh_dentry->d_name.len, - current_fh->fh_dentry->d_name.name); + (int)cstate->current_fh.fh_dentry->d_name.len, + cstate->current_fh.fh_dentry->d_name.name); nfs4_lock_state(); /* check close_lru for replay */ - if ((status = nfs4_preprocess_seqid_op(current_fh, close->cl_seqid, + if ((status = nfs4_preprocess_seqid_op(&cstate->current_fh, + close->cl_seqid, &close->cl_stateid, CHECK_FH | OPEN_STATE | CLOSE_STATE, &close->cl_stateowner, &stp, NULL))) @@ -2397,22 +2407,24 @@ nfsd4_close(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_clos out: if (close->cl_stateowner) { nfs4_get_stateowner(close->cl_stateowner); - *replay_owner = close->cl_stateowner; + cstate->replay_owner = close->cl_stateowner; } 
nfs4_unlock_state(); return status; } __be32 -nfsd4_delegreturn(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_delegreturn *dr) +nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_delegreturn *dr) { __be32 status; - if ((status = fh_verify(rqstp, current_fh, S_IFREG, 0))) + if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) goto out; nfs4_lock_state(); - status = nfs4_preprocess_stateid_op(current_fh, &dr->dr_stateid, DELEG_RET, NULL); + status = nfs4_preprocess_stateid_op(&cstate->current_fh, + &dr->dr_stateid, DELEG_RET, NULL); nfs4_unlock_state(); out: return status; @@ -2635,7 +2647,8 @@ check_lock_length(u64 offset, u64 length) * LOCK operation */ __be32 -nfsd4_lock(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock *lock, struct nfs4_stateowner **replay_owner) +nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_lock *lock) { struct nfs4_stateowner *open_sop = NULL; struct nfs4_stateowner *lock_sop = NULL; @@ -2654,7 +2667,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock if (check_lock_length(lock->lk_offset, lock->lk_length)) return nfserr_inval; - if ((status = fh_verify(rqstp, current_fh, S_IFREG, MAY_LOCK))) { + if ((status = fh_verify(rqstp, &cstate->current_fh, + S_IFREG, MAY_LOCK))) { dprintk("NFSD: nfsd4_lock: permission denied!\n"); return status; } @@ -2675,7 +2689,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock goto out; /* validate and update open stateid and open seqid */ - status = nfs4_preprocess_seqid_op(current_fh, + status = nfs4_preprocess_seqid_op(&cstate->current_fh, lock->lk_new_open_seqid, &lock->lk_new_open_stateid, CHECK_FH | OPEN_STATE, @@ -2702,7 +2716,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock goto out; } else { /* lock (lock owner + lock stateid) already exists */ - status = 
nfs4_preprocess_seqid_op(current_fh, + status = nfs4_preprocess_seqid_op(&cstate->current_fh, lock->lk_old_lock_seqid, &lock->lk_old_lock_stateid, CHECK_FH | LOCK_STATE, @@ -2759,7 +2773,6 @@ nfsd4_lock(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock conflock.fl_ops = NULL; conflock.fl_lmops = NULL; err = posix_lock_file_conf(filp, &file_lock, &conflock); - dprintk("NFSD: nfsd4_lock: posix_lock_file_conf status %d\n",status); switch (-err) { case 0: /* success! */ update_stateid(&lock_stp->st_stateid); @@ -2785,7 +2798,7 @@ out: release_stateowner(lock_sop); if (lock->lk_replay_owner) { nfs4_get_stateowner(lock->lk_replay_owner); - *replay_owner = lock->lk_replay_owner; + cstate->replay_owner = lock->lk_replay_owner; } nfs4_unlock_state(); return status; @@ -2795,7 +2808,8 @@ out: * LOCKT operation */ __be32 -nfsd4_lockt(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lockt *lockt) +nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_lockt *lockt) { struct inode *inode; struct file file; @@ -2816,14 +2830,14 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock if (STALE_CLIENTID(&lockt->lt_clientid)) goto out; - if ((status = fh_verify(rqstp, current_fh, S_IFREG, 0))) { + if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) { dprintk("NFSD: nfsd4_lockt: fh_verify() failed!\n"); if (status == nfserr_symlink) status = nfserr_inval; goto out; } - inode = current_fh->fh_dentry->d_inode; + inode = cstate->current_fh.fh_dentry->d_inode; locks_init_lock(&file_lock); switch (lockt->lt_type) { case NFS4_READ_LT: @@ -2862,7 +2876,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock * only the dentry:inode set. 
*/ memset(&file, 0, sizeof (struct file)); - file.f_dentry = current_fh->fh_dentry; + file.f_path.dentry = cstate->current_fh.fh_dentry; status = nfs_ok; if (posix_test_lock(&file, &file_lock, &conflock)) { @@ -2875,7 +2889,8 @@ out: } __be32 -nfsd4_locku(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_locku *locku, struct nfs4_stateowner **replay_owner) +nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_locku *locku) { struct nfs4_stateid *stp; struct file *filp = NULL; @@ -2892,7 +2907,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock nfs4_lock_state(); - if ((status = nfs4_preprocess_seqid_op(current_fh, + if ((status = nfs4_preprocess_seqid_op(&cstate->current_fh, locku->lu_seqid, &locku->lu_stateid, CHECK_FH | LOCK_STATE, @@ -2933,7 +2948,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock out: if (locku->lu_stateowner) { nfs4_get_stateowner(locku->lu_stateowner); - *replay_owner = locku->lu_stateowner; + cstate->replay_owner = locku->lu_stateowner; } nfs4_unlock_state(); return status; @@ -2952,7 +2967,7 @@ static int check_for_locks(struct file *filp, struct nfs4_stateowner *lowner) { struct file_lock **flpp; - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = filp->f_path.dentry->d_inode; int status = 0; lock_kernel(); @@ -2968,7 +2983,9 @@ out: } __be32 -nfsd4_release_lockowner(struct svc_rqst *rqstp, struct nfsd4_release_lockowner *rlockowner) +nfsd4_release_lockowner(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_release_lockowner *rlockowner) { clientid_t *clid = &rlockowner->rl_clientid; struct nfs4_stateowner *sop; diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index f3f239db04b..18aa9440df1 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1845,15 +1845,11 @@ nfsd4_encode_dirent_fattr(struct nfsd4_readdir *cd, exp_get(exp); if (d_mountpoint(dentry)) { - if 
(nfsd_cross_mnt(cd->rd_rqstp, &dentry, &exp)) { - /* - * -EAGAIN is the only error returned from - * nfsd_cross_mnt() and it indicates that an - * up-call has been initiated to fill in the export - * options on exp. When the answer comes back, - * this call will be retried. - */ - nfserr = nfserr_dropit; + int err; + + err = nfsd_cross_mnt(cd->rd_rqstp, &dentry, &exp); + if (err) { + nfserr = nfserrno(err); goto out_put; } @@ -1884,9 +1880,10 @@ nfsd4_encode_rdattr_error(__be32 *p, int buflen, __be32 nfserr) } static int -nfsd4_encode_dirent(struct readdir_cd *ccd, const char *name, int namlen, - loff_t offset, ino_t ino, unsigned int d_type) +nfsd4_encode_dirent(void *ccdv, const char *name, int namlen, + loff_t offset, u64 ino, unsigned int d_type) { + struct readdir_cd *ccd = ccdv; struct nfsd4_readdir *cd = container_of(ccd, struct nfsd4_readdir, common); int buflen; __be32 *p = cd->buffer; diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 6100bbe2743..f90d7047585 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -66,14 +66,13 @@ nfsd_cache_init(void) printk (KERN_ERR "nfsd: cannot allocate all %d cache entries, only got %d\n", CACHESIZE, CACHESIZE-i); - hash_list = kmalloc (HASHSIZE * sizeof(struct hlist_head), GFP_KERNEL); + hash_list = kcalloc (HASHSIZE, sizeof(struct hlist_head), GFP_KERNEL); if (!hash_list) { nfsd_cache_shutdown(); printk (KERN_ERR "nfsd: cannot allocate %Zd bytes for hash list\n", HASHSIZE * sizeof(struct hlist_head)); return; } - memset(hash_list, 0, HASHSIZE * sizeof(struct hlist_head)); cache_disabled = 0; } diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 39aed901514..eedf2e3990a 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -111,7 +111,7 @@ static ssize_t (*write_op[])(struct file *, char *, size_t) = { static ssize_t nfsctl_transaction_write(struct file *file, const char __user *buf, size_t size, loff_t *pos) { - ino_t ino = file->f_dentry->d_inode->i_ino; + ino_t ino = 
file->f_path.dentry->d_inode->i_ino; char *data; ssize_t rv; diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 727ab3bd450..c59d6fbb7a6 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -24,8 +24,6 @@ #include <linux/nfsd/nfsd.h> #define NFSDDBG_FACILITY NFSDDBG_FH -#define NFSD_PARANOIA 1 -/* #define NFSD_DEBUG_VERBOSE 1 */ static int nfsd_nr_verified; @@ -169,9 +167,11 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access) exp = exp_find(rqstp->rq_client, 0, tfh, &rqstp->rq_chandle); } - error = nfserr_dropit; - if (IS_ERR(exp) && PTR_ERR(exp) == -EAGAIN) + if (IS_ERR(exp) && (PTR_ERR(exp) == -EAGAIN + || PTR_ERR(exp) == -ETIMEDOUT)) { + error = nfserrno(PTR_ERR(exp)); goto out; + } error = nfserr_stale; if (!exp || IS_ERR(exp)) @@ -228,13 +228,12 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access) error = nfserrno(PTR_ERR(dentry)); goto out; } -#ifdef NFSD_PARANOIA + if (S_ISDIR(dentry->d_inode->i_mode) && (dentry->d_flags & DCACHE_DISCONNECTED)) { printk("nfsd: find_fh_dentry returned a DISCONNECTED directory: %s/%s\n", dentry->d_parent->d_name.name, dentry->d_name.name); } -#endif fhp->fh_dentry = dentry; fhp->fh_export = exp; @@ -265,12 +264,13 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access) /* Finally, check access permissions. 
*/ error = nfsd_permission(exp, dentry, access); -#ifdef NFSD_PARANOIA_EXTREME if (error) { - printk("fh_verify: %s/%s permission failure, acc=%x, error=%d\n", - dentry->d_parent->d_name.name, dentry->d_name.name, access, (error >> 24)); + dprintk("fh_verify: %s/%s permission failure, " + "acc=%x, error=%d\n", + dentry->d_parent->d_name.name, + dentry->d_name.name, + access, ntohl(error)); } -#endif out: if (exp && !IS_ERR(exp)) exp_put(exp); diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 0aaccb03bf7..fbf5d51947e 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -72,7 +72,7 @@ static struct svc_program nfsd_acl_program = { .pg_prog = NFS_ACL_PROGRAM, .pg_nvers = NFSD_ACL_NRVERS, .pg_vers = nfsd_acl_versions, - .pg_name = "nfsd", + .pg_name = "nfsacl", .pg_class = "nfsd", .pg_stats = &nfsd_acl_svcstats, .pg_authenticate = &svc_set_client, @@ -118,16 +118,16 @@ int nfsd_vers(int vers, enum vers_op change) switch(change) { case NFSD_SET: nfsd_versions[vers] = nfsd_version[vers]; - break; #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) if (vers < NFSD_ACL_NRVERS) - nfsd_acl_version[vers] = nfsd_acl_version[vers]; + nfsd_acl_versions[vers] = nfsd_acl_version[vers]; #endif + break; case NFSD_CLEAR: nfsd_versions[vers] = NULL; #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) if (vers < NFSD_ACL_NRVERS) - nfsd_acl_version[vers] = NULL; + nfsd_acl_versions[vers] = NULL; #endif break; case NFSD_TEST: diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index f5243f94399..6555c50d900 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -462,9 +462,10 @@ nfssvc_encode_statfsres(struct svc_rqst *rqstp, __be32 *p, } int -nfssvc_encode_entry(struct readdir_cd *ccd, const char *name, - int namlen, loff_t offset, ino_t ino, unsigned int d_type) +nfssvc_encode_entry(void *ccdv, const char *name, + int namlen, loff_t offset, u64 ino, unsigned int d_type) { + struct readdir_cd *ccd = ccdv; struct nfsd_readdirres *cd = container_of(ccd, struct 
nfsd_readdirres, common); __be32 *p = cd->buffer; int buflen, slen; diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index bb4d926e448..8283236c6a0 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -59,7 +59,6 @@ #include <asm/uaccess.h> #define NFSDDBG_FACILITY NFSDDBG_FILEOP -#define NFSD_PARANOIA /* We must ignore files (but only files) which might have mandatory @@ -99,7 +98,7 @@ static struct raparm_hbucket raparm_hash[RAPARM_HASH_SIZE]; /* * Called from nfsd_lookup and encode_dirent. Check if we have crossed * a mount point. - * Returns -EAGAIN leaving *dpp and *expp unchanged, + * Returns -EAGAIN or -ETIMEDOUT leaving *dpp and *expp unchanged, * or nfs_ok having possibly changed *dpp and *expp */ int @@ -736,10 +735,10 @@ static int nfsd_sync(struct file *filp) { int err; - struct inode *inode = filp->f_dentry->d_inode; - dprintk("nfsd: sync file %s\n", filp->f_dentry->d_name.name); + struct inode *inode = filp->f_path.dentry->d_inode; + dprintk("nfsd: sync file %s\n", filp->f_path.dentry->d_name.name); mutex_lock(&inode->i_mutex); - err=nfsd_dosync(filp, filp->f_dentry, filp->f_op); + err=nfsd_dosync(filp, filp->f_path.dentry, filp->f_op); mutex_unlock(&inode->i_mutex); return err; @@ -822,7 +821,8 @@ nfsd_read_actor(read_descriptor_t *desc, struct page *page, unsigned long offset rqstp->rq_res.page_len = size; } else if (page != pp[-1]) { get_page(page); - put_page(*pp); + if (*pp) + put_page(*pp); *pp = page; rqstp->rq_resused++; rqstp->rq_res.page_len += size; @@ -845,7 +845,7 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, int host_err; err = nfserr_perm; - inode = file->f_dentry->d_inode; + inode = file->f_path.dentry->d_inode; #ifdef MSNFS if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) && (!lock_may_read(inode, offset, *count))) @@ -883,7 +883,7 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, nfsdstats.io_read += host_err; *count = host_err; err = 0; - fsnotify_access(file->f_dentry); + 
fsnotify_access(file->f_path.dentry); } else err = nfserrno(host_err); out: @@ -917,11 +917,11 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, err = nfserr_perm; if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) && - (!lock_may_write(file->f_dentry->d_inode, offset, cnt))) + (!lock_may_write(file->f_path.dentry->d_inode, offset, cnt))) goto out; #endif - dentry = file->f_dentry; + dentry = file->f_path.dentry; inode = dentry->d_inode; exp = fhp->fh_export; @@ -950,7 +950,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, set_fs(oldfs); if (host_err >= 0) { nfsdstats.io_write += cnt; - fsnotify_modify(file->f_dentry); + fsnotify_modify(file->f_path.dentry); } /* clear setuid/setgid flag after write */ @@ -1244,7 +1244,6 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp, __be32 err; int host_err; __u32 v_mtime=0, v_atime=0; - int v_mode=0; err = nfserr_perm; if (!flen) @@ -1281,16 +1280,11 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp, goto out; if (createmode == NFS3_CREATE_EXCLUSIVE) { - /* while the verifier would fit in mtime+atime, - * solaris7 gets confused (bugid 4218508) if these have - * the high bit set, so we use the mode as well + /* solaris7 gets confused (bugid 4218508) if these have + * the high bit set, so just clear the high bits. 
*/ v_mtime = verifier[0]&0x7fffffff; v_atime = verifier[1]&0x7fffffff; - v_mode = S_IFREG - | ((verifier[0]&0x80000000) >> (32-7)) /* u+x */ - | ((verifier[1]&0x80000000) >> (32-9)) /* u+r */ - ; } if (dchild->d_inode) { @@ -1318,7 +1312,6 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp, case NFS3_CREATE_EXCLUSIVE: if ( dchild->d_inode->i_mtime.tv_sec == v_mtime && dchild->d_inode->i_atime.tv_sec == v_atime - && dchild->d_inode->i_mode == v_mode && dchild->d_inode->i_size == 0 ) break; /* fallthru */ @@ -1340,26 +1333,22 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp, } if (createmode == NFS3_CREATE_EXCLUSIVE) { - /* Cram the verifier into atime/mtime/mode */ + /* Cram the verifier into atime/mtime */ iap->ia_valid = ATTR_MTIME|ATTR_ATIME - | ATTR_MTIME_SET|ATTR_ATIME_SET - | ATTR_MODE; + | ATTR_MTIME_SET|ATTR_ATIME_SET; /* XXX someone who knows this better please fix it for nsec */ iap->ia_mtime.tv_sec = v_mtime; iap->ia_atime.tv_sec = v_atime; iap->ia_mtime.tv_nsec = 0; iap->ia_atime.tv_nsec = 0; - iap->ia_mode = v_mode; } /* Set file attributes. - * Mode has already been set but we might need to reset it - * for CREATE_EXCLUSIVE * Irix appears to send along the gid when it tries to * implement setgid directories via NFS. Clear out all that cruft. 
*/ set_attr: - if ((iap->ia_valid &= ~(ATTR_UID|ATTR_GID)) != 0) { + if ((iap->ia_valid &= ~(ATTR_UID|ATTR_GID|ATTR_MODE)) != 0) { __be32 err2 = nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0); if (err2) err = err2; @@ -1726,7 +1715,7 @@ out: */ __be32 nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp, - struct readdir_cd *cdp, encode_dent_fn func) + struct readdir_cd *cdp, filldir_t func) { __be32 err; int host_err; @@ -1751,7 +1740,7 @@ nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp, do { cdp->err = nfserr_eof; /* will be cleared on successful read */ - host_err = vfs_readdir(file, (filldir_t) func, cdp); + host_err = vfs_readdir(file, func, cdp); } while (host_err >=0 && cdp->err == nfs_ok); if (host_err) err = nfserrno(host_err); @@ -1885,28 +1874,27 @@ nfsd_racache_init(int cache_size) return 0; if (cache_size < 2*RAPARM_HASH_SIZE) cache_size = 2*RAPARM_HASH_SIZE; - raparml = kmalloc(sizeof(struct raparms) * cache_size, GFP_KERNEL); - - if (raparml != NULL) { - dprintk("nfsd: allocating %d readahead buffers.\n", - cache_size); - for (i = 0 ; i < RAPARM_HASH_SIZE ; i++) { - raparm_hash[i].pb_head = NULL; - spin_lock_init(&raparm_hash[i].pb_lock); - } - nperbucket = cache_size >> RAPARM_HASH_BITS; - memset(raparml, 0, sizeof(struct raparms) * cache_size); - for (i = 0; i < cache_size - 1; i++) { - if (i % nperbucket == 0) - raparm_hash[j++].pb_head = raparml + i; - if (i % nperbucket < nperbucket-1) - raparml[i].p_next = raparml + i + 1; - } - } else { + raparml = kcalloc(cache_size, sizeof(struct raparms), GFP_KERNEL); + + if (!raparml) { printk(KERN_WARNING - "nfsd: Could not allocate memory read-ahead cache.\n"); + "nfsd: Could not allocate memory read-ahead cache.\n"); return -ENOMEM; } + + dprintk("nfsd: allocating %d readahead buffers.\n", cache_size); + for (i = 0 ; i < RAPARM_HASH_SIZE ; i++) { + raparm_hash[i].pb_head = NULL; + spin_lock_init(&raparm_hash[i].pb_lock); + } + nperbucket = cache_size >> 
RAPARM_HASH_BITS; + for (i = 0; i < cache_size - 1; i++) { + if (i % nperbucket == 0) + raparm_hash[j++].pb_head = raparml + i; + if (i % nperbucket < nperbucket-1) + raparml[i].p_next = raparml + i + 1; + } + nfsdstats.ra_size = cache_size; return 0; } diff --git a/fs/ntfs/ChangeLog b/fs/ntfs/ChangeLog index 35cc4b1d60f..af4ef808fa9 100644 --- a/fs/ntfs/ChangeLog +++ b/fs/ntfs/ChangeLog @@ -17,6 +17,13 @@ ToDo/Notes: happen is unclear however so it is worth waiting until someone hits the problem. +2.1.28 - Fix a deadlock. + + - Fix deadlock in fs/ntfs/inode.c::ntfs_put_inode(). Thanks to Sergey + Vlasov for the report and detailed analysis of the deadlock. The fix + involved getting rid of ntfs_put_inode() altogether and hence NTFS no + longer has a ->put_inode super operation. + 2.1.27 - Various bug fixes and cleanups. - Fix two compiler warnings on Alpha. Thanks to Andrew Morton for diff --git a/fs/ntfs/Makefile b/fs/ntfs/Makefile index e27b4eacffb..82550838556 100644 --- a/fs/ntfs/Makefile +++ b/fs/ntfs/Makefile @@ -6,7 +6,7 @@ ntfs-objs := aops.o attrib.o collate.o compress.o debug.o dir.o file.o \ index.o inode.o mft.o mst.o namei.o runlist.o super.o sysctl.o \ unistr.o upcase.o -EXTRA_CFLAGS = -DNTFS_VERSION=\"2.1.27\" +EXTRA_CFLAGS = -DNTFS_VERSION=\"2.1.28\" ifeq ($(CONFIG_NTFS_DEBUG),y) EXTRA_CFLAGS += -DDEBUG diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index 7b2c8f4f6a6..629e7abdd84 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -92,10 +92,12 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate) ofs = 0; if (file_ofs < init_size) ofs = init_size - file_ofs; + local_irq_save(flags); kaddr = kmap_atomic(page, KM_BIO_SRC_IRQ); memset(kaddr + bh_offset(bh) + ofs, 0, bh->b_size - ofs); kunmap_atomic(kaddr, KM_BIO_SRC_IRQ); + local_irq_restore(flags); flush_dcache_page(page); } } else { @@ -143,11 +145,13 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate) recs = PAGE_CACHE_SIZE / rec_size; /* Should 
have been verified before we got here... */ BUG_ON(!recs); + local_irq_save(flags); kaddr = kmap_atomic(page, KM_BIO_SRC_IRQ); for (i = 0; i < recs; i++) post_read_mst_fixup((NTFS_RECORD*)(kaddr + i * rec_size), rec_size); kunmap_atomic(kaddr, KM_BIO_SRC_IRQ); + local_irq_restore(flags); flush_dcache_page(page); if (likely(page_uptodate && !PageError(page))) SetPageUptodate(page); diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c index 85c36b8ca45..74f99a6a369 100644 --- a/fs/ntfs/dir.c +++ b/fs/ntfs/dir.c @@ -1,7 +1,7 @@ /** * dir.c - NTFS kernel directory operations. Part of the Linux-NTFS project. * - * Copyright (c) 2001-2005 Anton Altaparmakov + * Copyright (c) 2001-2007 Anton Altaparmakov * Copyright (c) 2002 Richard Russon * * This program/include file is free software; you can redistribute it and/or @@ -1101,7 +1101,7 @@ static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir) { s64 ia_pos, ia_start, prev_ia_pos, bmp_pos; loff_t fpos, i_size; - struct inode *bmp_vi, *vdir = filp->f_dentry->d_inode; + struct inode *bmp_vi, *vdir = filp->f_path.dentry->d_inode; struct super_block *sb = vdir->i_sb; ntfs_inode *ndir = NTFS_I(vdir); ntfs_volume *vol = NTFS_SB(sb); @@ -1136,9 +1136,9 @@ static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir) if (fpos == 1) { ntfs_debug("Calling filldir for .. with len 2, fpos 0x1, " "inode 0x%lx, DT_DIR.", - (unsigned long)parent_ino(filp->f_dentry)); + (unsigned long)parent_ino(filp->f_path.dentry)); rc = filldir(dirent, "..", 2, fpos, - parent_ino(filp->f_dentry), DT_DIR); + parent_ino(filp->f_path.dentry), DT_DIR); if (rc) goto done; fpos++; @@ -1249,16 +1249,12 @@ skip_index_root: /* Get the offset into the index allocation attribute. 
*/ ia_pos = (s64)fpos - vol->mft_record_size; ia_mapping = vdir->i_mapping; - bmp_vi = ndir->itype.index.bmp_ino; - if (unlikely(!bmp_vi)) { - ntfs_debug("Inode 0x%lx, regetting index bitmap.", vdir->i_ino); - bmp_vi = ntfs_attr_iget(vdir, AT_BITMAP, I30, 4); - if (IS_ERR(bmp_vi)) { - ntfs_error(sb, "Failed to get bitmap attribute."); - err = PTR_ERR(bmp_vi); - goto err_out; - } - ndir->itype.index.bmp_ino = bmp_vi; + ntfs_debug("Inode 0x%lx, getting index bitmap.", vdir->i_ino); + bmp_vi = ntfs_attr_iget(vdir, AT_BITMAP, I30, 4); + if (IS_ERR(bmp_vi)) { + ntfs_error(sb, "Failed to get bitmap attribute."); + err = PTR_ERR(bmp_vi); + goto err_out; } bmp_mapping = bmp_vi->i_mapping; /* Get the starting bitmap bit position and sanity check it. */ @@ -1266,7 +1262,7 @@ skip_index_root: if (unlikely(bmp_pos >> 3 >= i_size_read(bmp_vi))) { ntfs_error(sb, "Current index allocation position exceeds " "index bitmap size."); - goto err_out; + goto iput_err_out; } /* Get the starting bit position in the current bitmap page. */ cur_bmp_pos = bmp_pos & ((PAGE_CACHE_SIZE * 8) - 1); @@ -1282,7 +1278,7 @@ get_next_bmp_page: ntfs_error(sb, "Reading index bitmap failed."); err = PTR_ERR(bmp_page); bmp_page = NULL; - goto err_out; + goto iput_err_out; } bmp = (u8*)page_address(bmp_page); /* Find next index block in use. */ @@ -1429,6 +1425,7 @@ find_next_index_buffer: /* @ia_page is already unlocked in this case. */ ntfs_unmap_page(ia_page); ntfs_unmap_page(bmp_page); + iput(bmp_vi); goto abort; } } @@ -1439,6 +1436,7 @@ unm_EOD: ntfs_unmap_page(ia_page); } ntfs_unmap_page(bmp_page); + iput(bmp_vi); EOD: /* We are finished, set fpos to EOD. 
*/ fpos = i_size + vol->mft_record_size; @@ -1455,8 +1453,11 @@ done: filp->f_pos = fpos; return 0; err_out: - if (bmp_page) + if (bmp_page) { ntfs_unmap_page(bmp_page); +iput_err_out: + iput(bmp_vi); + } if (ia_page) { unlock_page(ia_page); ntfs_unmap_page(ia_page); @@ -1529,14 +1530,22 @@ static int ntfs_dir_open(struct inode *vi, struct file *filp) static int ntfs_dir_fsync(struct file *filp, struct dentry *dentry, int datasync) { - struct inode *vi = dentry->d_inode; - ntfs_inode *ni = NTFS_I(vi); + struct inode *bmp_vi, *vi = dentry->d_inode; int err, ret; + ntfs_attr na; ntfs_debug("Entering for inode 0x%lx.", vi->i_ino); BUG_ON(!S_ISDIR(vi->i_mode)); - if (NInoIndexAllocPresent(ni) && ni->itype.index.bmp_ino) - write_inode_now(ni->itype.index.bmp_ino, !datasync); + /* If the bitmap attribute inode is in memory sync it, too. */ + na.mft_no = vi->i_ino; + na.type = AT_BITMAP; + na.name = I30; + na.name_len = 4; + bmp_vi = ilookup5(vi->i_sb, vi->i_ino, (test_t)ntfs_test_inode, &na); + if (bmp_vi) { + write_inode_now(bmp_vi, !datasync); + iput(bmp_vi); + } ret = ntfs_write_inode(vi, 1); write_inode_now(vi, !datasync); err = sync_blockdev(vi->i_sb->s_bdev); diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index ae2fe0016d2..076c9420c25 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -2162,7 +2162,7 @@ static ssize_t ntfs_file_aio_write_nolock(struct kiocb *iocb, goto out; if (!count) goto out; - err = remove_suid(file->f_dentry); + err = remove_suid(file->f_path.dentry); if (err) goto out; file_update_time(file); diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index 247989891b4..f8bf8da67ee 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -1,7 +1,7 @@ /** * inode.c - NTFS kernel inode handling. Part of the Linux-NTFS project. 
* - * Copyright (c) 2001-2006 Anton Altaparmakov + * Copyright (c) 2001-2007 Anton Altaparmakov * * This program/include file is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as published @@ -95,7 +95,7 @@ int ntfs_test_inode(struct inode *vi, ntfs_attr *na) * If initializing the normal file/directory inode, set @na->type to AT_UNUSED. * In that case, @na->name and @na->name_len should be set to NULL and 0, * respectively. Although that is not strictly necessary as - * ntfs_read_inode_locked() will fill them in later. + * ntfs_read_locked_inode() will fill them in later. * * Return 0 on success and -errno on error. * @@ -171,8 +171,8 @@ static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *ntfs_iget(struct super_block *sb, unsigned long mft_no) { struct inode *vi; - ntfs_attr na; int err; + ntfs_attr na; na.mft_no = mft_no; na.type = AT_UNUSED; @@ -229,8 +229,8 @@ struct inode *ntfs_attr_iget(struct inode *base_vi, ATTR_TYPE type, ntfschar *name, u32 name_len) { struct inode *vi; - ntfs_attr na; int err; + ntfs_attr na; /* Make sure no one calls ntfs_attr_iget() for indices. 
*/ BUG_ON(type == AT_INDEX_ALLOCATION); @@ -287,8 +287,8 @@ struct inode *ntfs_index_iget(struct inode *base_vi, ntfschar *name, u32 name_len) { struct inode *vi; - ntfs_attr na; int err; + ntfs_attr na; na.mft_no = base_vi->i_ino; na.type = AT_INDEX_ALLOCATION; @@ -402,7 +402,6 @@ void __ntfs_init_inode(struct super_block *sb, ntfs_inode *ni) ntfs_init_runlist(&ni->attr_list_rl); lockdep_set_class(&ni->attr_list_rl.lock, &attr_list_rl_lock_class); - ni->itype.index.bmp_ino = NULL; ni->itype.index.block_size = 0; ni->itype.index.vcn_size = 0; ni->itype.index.collation_rule = 0; @@ -546,6 +545,7 @@ static int ntfs_read_locked_inode(struct inode *vi) { ntfs_volume *vol = NTFS_SB(vi->i_sb); ntfs_inode *ni; + struct inode *bvi; MFT_RECORD *m; ATTR_RECORD *a; STANDARD_INFORMATION *si; @@ -780,7 +780,6 @@ skip_attr_list_load: */ if (S_ISDIR(vi->i_mode)) { loff_t bvi_size; - struct inode *bvi; ntfs_inode *bni; INDEX_ROOT *ir; u8 *ir_end, *index_end; @@ -985,13 +984,12 @@ skip_attr_list_load: err = PTR_ERR(bvi); goto unm_err_out; } - ni->itype.index.bmp_ino = bvi; bni = NTFS_I(bvi); if (NInoCompressed(bni) || NInoEncrypted(bni) || NInoSparse(bni)) { ntfs_error(vi->i_sb, "$BITMAP attribute is compressed " "and/or encrypted and/or sparse."); - goto unm_err_out; + goto iput_unm_err_out; } /* Consistency check bitmap size vs. index allocation size. */ bvi_size = i_size_read(bvi); @@ -1000,8 +998,10 @@ skip_attr_list_load: ntfs_error(vi->i_sb, "Index bitmap too small (0x%llx) " "for index allocation (0x%llx).", bvi_size << 3, vi->i_size); - goto unm_err_out; + goto iput_unm_err_out; } + /* No longer need the bitmap attribute inode. */ + iput(bvi); skip_large_dir_stuff: /* Setup the operations for this inode. 
*/ vi->i_op = &ntfs_dir_inode_ops; @@ -1176,7 +1176,8 @@ no_data_attr_special_case: vi->i_blocks = ni->allocated_size >> 9; ntfs_debug("Done."); return 0; - +iput_unm_err_out: + iput(bvi); unm_err_out: if (!err) err = -EIO; @@ -1697,7 +1698,7 @@ static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi) vi->i_size); goto iput_unm_err_out; } - ni->itype.index.bmp_ino = bvi; + iput(bvi); skip_large_index_stuff: /* Setup the operations for this index inode. */ vi->i_op = NULL; @@ -1714,7 +1715,6 @@ skip_large_index_stuff: ntfs_debug("Done."); return 0; - iput_unm_err_out: iput(bvi); unm_err_out: @@ -2191,37 +2191,6 @@ err_out: return -1; } -/** - * ntfs_put_inode - handler for when the inode reference count is decremented - * @vi: vfs inode - * - * The VFS calls ntfs_put_inode() every time the inode reference count (i_count) - * is about to be decremented (but before the decrement itself. - * - * If the inode @vi is a directory with two references, one of which is being - * dropped, we need to put the attribute inode for the directory index bitmap, - * if it is present, otherwise the directory inode would remain pinned for - * ever. - */ -void ntfs_put_inode(struct inode *vi) -{ - if (S_ISDIR(vi->i_mode) && atomic_read(&vi->i_count) == 2) { - ntfs_inode *ni = NTFS_I(vi); - if (NInoIndexAllocPresent(ni)) { - struct inode *bvi = NULL; - mutex_lock(&vi->i_mutex); - if (atomic_read(&vi->i_count) == 2) { - bvi = ni->itype.index.bmp_ino; - if (bvi) - ni->itype.index.bmp_ino = NULL; - } - mutex_unlock(&vi->i_mutex); - if (bvi) - iput(bvi); - } - } -} - static void __ntfs_clear_inode(ntfs_inode *ni) { /* Free all alocated memory. 
*/ @@ -2287,18 +2256,6 @@ void ntfs_clear_big_inode(struct inode *vi) { ntfs_inode *ni = NTFS_I(vi); - /* - * If the inode @vi is an index inode we need to put the attribute - * inode for the index bitmap, if it is present, otherwise the index - * inode would disappear and the attribute inode for the index bitmap - * would no longer be referenced from anywhere and thus it would remain - * pinned for ever. - */ - if (NInoAttr(ni) && (ni->type == AT_INDEX_ALLOCATION) && - NInoIndexAllocPresent(ni) && ni->itype.index.bmp_ino) { - iput(ni->itype.index.bmp_ino); - ni->itype.index.bmp_ino = NULL; - } #ifdef NTFS_RW if (NInoDirty(ni)) { bool was_bad = (is_bad_inode(vi)); diff --git a/fs/ntfs/inode.h b/fs/ntfs/inode.h index f088291e017..117eaf8032a 100644 --- a/fs/ntfs/inode.h +++ b/fs/ntfs/inode.h @@ -2,7 +2,7 @@ * inode.h - Defines for inode structures NTFS Linux kernel driver. Part of * the Linux-NTFS project. * - * Copyright (c) 2001-2005 Anton Altaparmakov + * Copyright (c) 2001-2007 Anton Altaparmakov * Copyright (c) 2002 Richard Russon * * This program/include file is free software; you can redistribute it and/or @@ -101,8 +101,6 @@ struct _ntfs_inode { runlist attr_list_rl; /* Run list for the attribute list value. */ union { struct { /* It is a directory, $MFT, or an index inode. */ - struct inode *bmp_ino; /* Attribute inode for the - index $BITMAP. */ u32 block_size; /* Size of an index block. */ u32 vcn_size; /* Size of a vcn in this index. */ @@ -300,8 +298,6 @@ extern void ntfs_clear_extent_inode(ntfs_inode *ni); extern int ntfs_read_inode_mount(struct inode *vi); -extern void ntfs_put_inode(struct inode *vi); - extern int ntfs_show_options(struct seq_file *sf, struct vfsmount *mnt); #ifdef NTFS_RW diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 03a391ac714..babf94d90de 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -1,7 +1,7 @@ /* * super.c - NTFS kernel super block handling. Part of the Linux-NTFS project. 
* - * Copyright (c) 2001-2006 Anton Altaparmakov + * Copyright (c) 2001-2007 Anton Altaparmakov * Copyright (c) 2001,2002 Richard Russon * * This program/include file is free software; you can redistribute it and/or @@ -2702,9 +2702,6 @@ static int ntfs_statfs(struct dentry *dentry, struct kstatfs *sfs) static struct super_operations ntfs_sops = { .alloc_inode = ntfs_alloc_big_inode, /* VFS: Allocate new inode. */ .destroy_inode = ntfs_destroy_big_inode, /* VFS: Deallocate inode. */ - .put_inode = ntfs_put_inode, /* VFS: Called just before - the inode reference count - is decreased. */ #ifdef NTFS_RW //.dirty_inode = NULL, /* VFS: Called from // __mark_inode_dirty(). */ @@ -3261,7 +3258,7 @@ static void __exit exit_ntfs_fs(void) } MODULE_AUTHOR("Anton Altaparmakov <aia21@cantab.net>"); -MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2006 Anton Altaparmakov"); +MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2007 Anton Altaparmakov"); MODULE_VERSION(NTFS_VERSION); MODULE_LICENSE("GPL"); #ifdef DEBUG diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index edc91ca3792..f27e5378caf 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -1959,7 +1959,7 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb, goto bail; } - *tc = kcalloc(1, sizeof(struct ocfs2_truncate_context), GFP_KERNEL); + *tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL); if (!(*tc)) { status = -ENOMEM; mlog_errno(status); diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 2f7268e8152..93628b02ef5 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -540,8 +540,7 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { int ret; - u64 vbo_max; /* file offset, max_blocks from iblock */ - u64 p_blkno; + u64 p_blkno, inode_blocks; int contig_blocks; unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits; @@ -550,12 +549,23 
@@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, * nicely aligned and of the right size, so there's no need * for us to check any of that. */ - vbo_max = ((u64)iblock + max_blocks) << blocksize_bits; - spin_lock(&OCFS2_I(inode)->ip_lock); - if ((iblock + max_blocks) > - ocfs2_clusters_to_blocks(inode->i_sb, - OCFS2_I(inode)->ip_clusters)) { + inode_blocks = ocfs2_clusters_to_blocks(inode->i_sb, + OCFS2_I(inode)->ip_clusters); + + /* + * For a read which begins past the end of file, we return a hole. + */ + if (!create && (iblock >= inode_blocks)) { + spin_unlock(&OCFS2_I(inode)->ip_lock); + ret = 0; + goto bail; + } + + /* + * Any write past EOF is not allowed because we'd be extending. + */ + if (create && (iblock + max_blocks) > inode_blocks) { spin_unlock(&OCFS2_I(inode)->ip_lock); ret = -EIO; goto bail; @@ -595,7 +605,7 @@ static void ocfs2_dio_end_io(struct kiocb *iocb, ssize_t bytes, void *private) { - struct inode *inode = iocb->ki_filp->f_dentry->d_inode; + struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; /* this io's submitter should not have unlocked this before we could */ BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); @@ -611,7 +621,7 @@ static ssize_t ocfs2_direct_IO(int rw, unsigned long nr_segs) { struct file *file = iocb->ki_filp; - struct inode *inode = file->f_dentry->d_inode->i_mapping->host; + struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host; int ret; mlog_entry_void(); diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 4cd9a958045..5a9779bb923 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -184,10 +184,9 @@ static void o2hb_disarm_write_timeout(struct o2hb_region *reg) flush_scheduled_work(); } -static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc, - unsigned int num_ios) +static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc) { - atomic_set(&wc->wc_num_reqs, num_ios); + atomic_set(&wc->wc_num_reqs, 1); 
init_completion(&wc->wc_io_complete); wc->wc_error = 0; } @@ -212,6 +211,7 @@ static void o2hb_wait_on_io(struct o2hb_region *reg, struct address_space *mapping = reg->hr_bdev->bd_inode->i_mapping; blk_run_address_space(mapping); + o2hb_bio_wait_dec(wc, 1); wait_for_completion(&wc->wc_io_complete); } @@ -231,6 +231,7 @@ static int o2hb_bio_end_io(struct bio *bio, return 1; o2hb_bio_wait_dec(wc, 1); + bio_put(bio); return 0; } @@ -238,23 +239,22 @@ static int o2hb_bio_end_io(struct bio *bio, * start_slot. */ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg, struct o2hb_bio_wait_ctxt *wc, - unsigned int start_slot, - unsigned int num_slots) + unsigned int *current_slot, + unsigned int max_slots) { - int i, nr_vecs, len, first_page, last_page; + int len, current_page; unsigned int vec_len, vec_start; unsigned int bits = reg->hr_block_bits; unsigned int spp = reg->hr_slots_per_page; + unsigned int cs = *current_slot; struct bio *bio; struct page *page; - nr_vecs = (num_slots + spp - 1) / spp; - /* Testing has shown this allocation to take long enough under * GFP_KERNEL that the local node can get fenced. It would be * nicest if we could pre-allocate these bios and avoid this * all together. */ - bio = bio_alloc(GFP_ATOMIC, nr_vecs); + bio = bio_alloc(GFP_ATOMIC, 16); if (!bio) { mlog(ML_ERROR, "Could not alloc slots BIO!\n"); bio = ERR_PTR(-ENOMEM); @@ -262,137 +262,53 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg, } /* Must put everything in 512 byte sectors for the bio... 
*/ - bio->bi_sector = (reg->hr_start_block + start_slot) << (bits - 9); + bio->bi_sector = (reg->hr_start_block + cs) << (bits - 9); bio->bi_bdev = reg->hr_bdev; bio->bi_private = wc; bio->bi_end_io = o2hb_bio_end_io; - first_page = start_slot / spp; - last_page = first_page + nr_vecs; - vec_start = (start_slot << bits) % PAGE_CACHE_SIZE; - for(i = first_page; i < last_page; i++) { - page = reg->hr_slot_data[i]; + vec_start = (cs << bits) % PAGE_CACHE_SIZE; + while(cs < max_slots) { + current_page = cs / spp; + page = reg->hr_slot_data[current_page]; - vec_len = PAGE_CACHE_SIZE; - /* last page might be short */ - if (((i + 1) * spp) > (start_slot + num_slots)) - vec_len = ((num_slots + start_slot) % spp) << bits; - vec_len -= vec_start; + vec_len = min(PAGE_CACHE_SIZE, + (max_slots-cs) * (PAGE_CACHE_SIZE/spp) ); mlog(ML_HB_BIO, "page %d, vec_len = %u, vec_start = %u\n", - i, vec_len, vec_start); + current_page, vec_len, vec_start); len = bio_add_page(bio, page, vec_len, vec_start); - if (len != vec_len) { - bio_put(bio); - bio = ERR_PTR(-EIO); - - mlog(ML_ERROR, "Error adding page to bio i = %d, " - "vec_len = %u, len = %d\n, start = %u\n", - i, vec_len, len, vec_start); - goto bail; - } + if (len != vec_len) break; + cs += vec_len / (PAGE_CACHE_SIZE/spp); vec_start = 0; } bail: + *current_slot = cs; return bio; } -/* - * Compute the maximum number of sectors the bdev can handle in one bio, - * as a power of two. - * - * Stolen from oracleasm, thanks Joel! 
- */ -static int compute_max_sectors(struct block_device *bdev) -{ - int max_pages, max_sectors, pow_two_sectors; - - struct request_queue *q; - - q = bdev_get_queue(bdev); - max_pages = q->max_sectors >> (PAGE_SHIFT - 9); - if (max_pages > BIO_MAX_PAGES) - max_pages = BIO_MAX_PAGES; - if (max_pages > q->max_phys_segments) - max_pages = q->max_phys_segments; - if (max_pages > q->max_hw_segments) - max_pages = q->max_hw_segments; - max_pages--; /* Handle I/Os that straddle a page */ - - if (max_pages) { - max_sectors = max_pages << (PAGE_SHIFT - 9); - } else { - /* If BIO contains 1 or less than 1 page. */ - max_sectors = q->max_sectors; - } - /* Why is fls() 1-based???? */ - pow_two_sectors = 1 << (fls(max_sectors) - 1); - - return pow_two_sectors; -} - -static inline void o2hb_compute_request_limits(struct o2hb_region *reg, - unsigned int num_slots, - unsigned int *num_bios, - unsigned int *slots_per_bio) -{ - unsigned int max_sectors, io_sectors; - - max_sectors = compute_max_sectors(reg->hr_bdev); - - io_sectors = num_slots << (reg->hr_block_bits - 9); - - *num_bios = (io_sectors + max_sectors - 1) / max_sectors; - *slots_per_bio = max_sectors >> (reg->hr_block_bits - 9); - - mlog(ML_HB_BIO, "My io size is %u sectors for %u slots. 
This " - "device can handle %u sectors of I/O\n", io_sectors, num_slots, - max_sectors); - mlog(ML_HB_BIO, "Will need %u bios holding %u slots each\n", - *num_bios, *slots_per_bio); -} - static int o2hb_read_slots(struct o2hb_region *reg, unsigned int max_slots) { - unsigned int num_bios, slots_per_bio, start_slot, num_slots; - int i, status; + unsigned int current_slot=0; + int status; struct o2hb_bio_wait_ctxt wc; - struct bio **bios; struct bio *bio; - o2hb_compute_request_limits(reg, max_slots, &num_bios, &slots_per_bio); - - bios = kcalloc(num_bios, sizeof(struct bio *), GFP_KERNEL); - if (!bios) { - status = -ENOMEM; - mlog_errno(status); - return status; - } + o2hb_bio_wait_init(&wc); - o2hb_bio_wait_init(&wc, num_bios); - - num_slots = slots_per_bio; - for(i = 0; i < num_bios; i++) { - start_slot = i * slots_per_bio; - - /* adjust num_slots at last bio */ - if (max_slots < (start_slot + num_slots)) - num_slots = max_slots - start_slot; - - bio = o2hb_setup_one_bio(reg, &wc, start_slot, num_slots); + while(current_slot < max_slots) { + bio = o2hb_setup_one_bio(reg, &wc, &current_slot, max_slots); if (IS_ERR(bio)) { - o2hb_bio_wait_dec(&wc, num_bios - i); - status = PTR_ERR(bio); mlog_errno(status); goto bail_and_wait; } - bios[i] = bio; + atomic_inc(&wc.wc_num_reqs); submit_bio(READ, bio); } @@ -403,38 +319,30 @@ bail_and_wait: if (wc.wc_error && !status) status = wc.wc_error; - if (bios) { - for(i = 0; i < num_bios; i++) - if (bios[i]) - bio_put(bios[i]); - kfree(bios); - } - return status; } static int o2hb_issue_node_write(struct o2hb_region *reg, - struct bio **write_bio, struct o2hb_bio_wait_ctxt *write_wc) { int status; unsigned int slot; struct bio *bio; - o2hb_bio_wait_init(write_wc, 1); + o2hb_bio_wait_init(write_wc); slot = o2nm_this_node(); - bio = o2hb_setup_one_bio(reg, write_wc, slot, 1); + bio = o2hb_setup_one_bio(reg, write_wc, &slot, slot+1); if (IS_ERR(bio)) { status = PTR_ERR(bio); mlog_errno(status); goto bail; } + 
atomic_inc(&write_wc->wc_num_reqs); submit_bio(WRITE, bio); - *write_bio = bio; status = 0; bail: return status; @@ -826,7 +734,6 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) { int i, ret, highest_node, change = 0; unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)]; - struct bio *write_bio; struct o2hb_bio_wait_ctxt write_wc; ret = o2nm_configured_node_map(configured_nodes, @@ -864,7 +771,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) /* And fire off the write. Note that we don't wait on this I/O * until later. */ - ret = o2hb_issue_node_write(reg, &write_bio, &write_wc); + ret = o2hb_issue_node_write(reg, &write_wc); if (ret < 0) { mlog_errno(ret); return ret; @@ -882,7 +789,6 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) * people we find in our steady state have seen us. */ o2hb_wait_on_io(reg, &write_wc); - bio_put(write_bio); if (write_wc.wc_error) { /* Do not re-arm the write timeout on I/O error - we * can't be sure that the new block ever made it to @@ -943,7 +849,6 @@ static int o2hb_thread(void *data) { int i, ret; struct o2hb_region *reg = data; - struct bio *write_bio; struct o2hb_bio_wait_ctxt write_wc; struct timeval before_hb, after_hb; unsigned int elapsed_msec; @@ -993,10 +898,9 @@ static int o2hb_thread(void *data) * * XXX: Should we skip this on unclean_stop? 
*/ o2hb_prepare_block(reg, 0); - ret = o2hb_issue_node_write(reg, &write_bio, &write_wc); + ret = o2hb_issue_node_write(reg, &write_wc); if (ret == 0) { o2hb_wait_on_io(reg, &write_wc); - bio_put(write_bio); } else { mlog_errno(ret); } @@ -1447,6 +1351,15 @@ out: return ret; } +static ssize_t o2hb_region_pid_read(struct o2hb_region *reg, + char *page) +{ + if (!reg->hr_task) + return 0; + + return sprintf(page, "%u\n", reg->hr_task->pid); +} + struct o2hb_region_attribute { struct configfs_attribute attr; ssize_t (*show)(struct o2hb_region *, char *); @@ -1485,11 +1398,19 @@ static struct o2hb_region_attribute o2hb_region_attr_dev = { .store = o2hb_region_dev_write, }; +static struct o2hb_region_attribute o2hb_region_attr_pid = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "pid", + .ca_mode = S_IRUGO | S_IRUSR }, + .show = o2hb_region_pid_read, +}; + static struct configfs_attribute *o2hb_region_attrs[] = { &o2hb_region_attr_block_bytes.attr, &o2hb_region_attr_start_block.attr, &o2hb_region_attr_blocks.attr, &o2hb_region_attr_dev.attr, + &o2hb_region_attr_pid.attr, NULL, }; @@ -1553,7 +1474,7 @@ static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *g struct o2hb_region *reg = NULL; struct config_item *ret = NULL; - reg = kcalloc(1, sizeof(struct o2hb_region), GFP_KERNEL); + reg = kzalloc(sizeof(struct o2hb_region), GFP_KERNEL); if (reg == NULL) goto out; /* ENOMEM */ @@ -1679,7 +1600,7 @@ struct config_group *o2hb_alloc_hb_set(void) struct o2hb_heartbeat_group *hs = NULL; struct config_group *ret = NULL; - hs = kcalloc(1, sizeof(struct o2hb_heartbeat_group), GFP_KERNEL); + hs = kzalloc(sizeof(struct o2hb_heartbeat_group), GFP_KERNEL); if (hs == NULL) goto out; diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c index d11753c50bc..b17333a0606 100644 --- a/fs/ocfs2/cluster/nodemanager.c +++ b/fs/ocfs2/cluster/nodemanager.c @@ -35,7 +35,7 @@ /* for now we operate under the assertion that there can be only one * 
cluster active at a time. Changing this will require trickling * cluster references throughout where nodes are looked up */ -static struct o2nm_cluster *o2nm_single_cluster = NULL; +struct o2nm_cluster *o2nm_single_cluster = NULL; #define OCFS2_MAX_HB_CTL_PATH 256 static char ocfs2_hb_ctl_path[OCFS2_MAX_HB_CTL_PATH] = "/sbin/ocfs2_hb_ctl"; @@ -97,17 +97,6 @@ const char *o2nm_get_hb_ctl_path(void) } EXPORT_SYMBOL_GPL(o2nm_get_hb_ctl_path); -struct o2nm_cluster { - struct config_group cl_group; - unsigned cl_has_local:1; - u8 cl_local_node; - rwlock_t cl_nodes_lock; - struct o2nm_node *cl_nodes[O2NM_MAX_NODES]; - struct rb_root cl_node_ip_tree; - /* this bitmap is part of a hack for disk bitmap.. will go eventually. - zab */ - unsigned long cl_nodes_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; -}; - struct o2nm_node *o2nm_get_node_by_num(u8 node_num) { struct o2nm_node *node = NULL; @@ -543,6 +532,179 @@ static struct o2nm_node_group *to_o2nm_node_group(struct config_group *group) } #endif +struct o2nm_cluster_attribute { + struct configfs_attribute attr; + ssize_t (*show)(struct o2nm_cluster *, char *); + ssize_t (*store)(struct o2nm_cluster *, const char *, size_t); +}; + +static ssize_t o2nm_cluster_attr_write(const char *page, ssize_t count, + unsigned int *val) +{ + unsigned long tmp; + char *p = (char *)page; + + tmp = simple_strtoul(p, &p, 0); + if (!p || (*p && (*p != '\n'))) + return -EINVAL; + + if (tmp == 0) + return -EINVAL; + if (tmp >= (u32)-1) + return -ERANGE; + + *val = tmp; + + return count; +} + +static ssize_t o2nm_cluster_attr_idle_timeout_ms_read( + struct o2nm_cluster *cluster, char *page) +{ + return sprintf(page, "%u\n", cluster->cl_idle_timeout_ms); +} + +static ssize_t o2nm_cluster_attr_idle_timeout_ms_write( + struct o2nm_cluster *cluster, const char *page, size_t count) +{ + ssize_t ret; + unsigned int val; + + ret = o2nm_cluster_attr_write(page, count, &val); + + if (ret > 0) { + if (cluster->cl_idle_timeout_ms != val + && 
o2net_num_connected_peers()) { + mlog(ML_NOTICE, + "o2net: cannot change idle timeout after " + "the first peer has agreed to it." + " %d connected peers\n", + o2net_num_connected_peers()); + ret = -EINVAL; + } else if (val <= cluster->cl_keepalive_delay_ms) { + mlog(ML_NOTICE, "o2net: idle timeout must be larger " + "than keepalive delay\n"); + ret = -EINVAL; + } else { + cluster->cl_idle_timeout_ms = val; + } + } + + return ret; +} + +static ssize_t o2nm_cluster_attr_keepalive_delay_ms_read( + struct o2nm_cluster *cluster, char *page) +{ + return sprintf(page, "%u\n", cluster->cl_keepalive_delay_ms); +} + +static ssize_t o2nm_cluster_attr_keepalive_delay_ms_write( + struct o2nm_cluster *cluster, const char *page, size_t count) +{ + ssize_t ret; + unsigned int val; + + ret = o2nm_cluster_attr_write(page, count, &val); + + if (ret > 0) { + if (cluster->cl_keepalive_delay_ms != val + && o2net_num_connected_peers()) { + mlog(ML_NOTICE, + "o2net: cannot change keepalive delay after" + " the first peer has agreed to it." 
+ " %d connected peers\n", + o2net_num_connected_peers()); + ret = -EINVAL; + } else if (val >= cluster->cl_idle_timeout_ms) { + mlog(ML_NOTICE, "o2net: keepalive delay must be " + "smaller than idle timeout\n"); + ret = -EINVAL; + } else { + cluster->cl_keepalive_delay_ms = val; + } + } + + return ret; +} + +static ssize_t o2nm_cluster_attr_reconnect_delay_ms_read( + struct o2nm_cluster *cluster, char *page) +{ + return sprintf(page, "%u\n", cluster->cl_reconnect_delay_ms); +} + +static ssize_t o2nm_cluster_attr_reconnect_delay_ms_write( + struct o2nm_cluster *cluster, const char *page, size_t count) +{ + return o2nm_cluster_attr_write(page, count, + &cluster->cl_reconnect_delay_ms); +} +static struct o2nm_cluster_attribute o2nm_cluster_attr_idle_timeout_ms = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "idle_timeout_ms", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2nm_cluster_attr_idle_timeout_ms_read, + .store = o2nm_cluster_attr_idle_timeout_ms_write, +}; + +static struct o2nm_cluster_attribute o2nm_cluster_attr_keepalive_delay_ms = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "keepalive_delay_ms", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2nm_cluster_attr_keepalive_delay_ms_read, + .store = o2nm_cluster_attr_keepalive_delay_ms_write, +}; + +static struct o2nm_cluster_attribute o2nm_cluster_attr_reconnect_delay_ms = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "reconnect_delay_ms", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2nm_cluster_attr_reconnect_delay_ms_read, + .store = o2nm_cluster_attr_reconnect_delay_ms_write, +}; + +static struct configfs_attribute *o2nm_cluster_attrs[] = { + &o2nm_cluster_attr_idle_timeout_ms.attr, + &o2nm_cluster_attr_keepalive_delay_ms.attr, + &o2nm_cluster_attr_reconnect_delay_ms.attr, + NULL, +}; +static ssize_t o2nm_cluster_show(struct config_item *item, + struct configfs_attribute *attr, + char *page) +{ + struct o2nm_cluster *cluster = to_o2nm_cluster(item); + struct o2nm_cluster_attribute 
*o2nm_cluster_attr = + container_of(attr, struct o2nm_cluster_attribute, attr); + ssize_t ret = 0; + + if (o2nm_cluster_attr->show) + ret = o2nm_cluster_attr->show(cluster, page); + return ret; +} + +static ssize_t o2nm_cluster_store(struct config_item *item, + struct configfs_attribute *attr, + const char *page, size_t count) +{ + struct o2nm_cluster *cluster = to_o2nm_cluster(item); + struct o2nm_cluster_attribute *o2nm_cluster_attr = + container_of(attr, struct o2nm_cluster_attribute, attr); + ssize_t ret; + + if (o2nm_cluster_attr->store == NULL) { + ret = -EINVAL; + goto out; + } + + ret = o2nm_cluster_attr->store(cluster, page, count); + if (ret < count) + goto out; +out: + return ret; +} + static struct config_item *o2nm_node_group_make_item(struct config_group *group, const char *name) { @@ -552,7 +714,7 @@ static struct config_item *o2nm_node_group_make_item(struct config_group *group, if (strlen(name) > O2NM_MAX_NAME_LEN) goto out; /* ENAMETOOLONG */ - node = kcalloc(1, sizeof(struct o2nm_node), GFP_KERNEL); + node = kzalloc(sizeof(struct o2nm_node), GFP_KERNEL); if (node == NULL) goto out; /* ENOMEM */ @@ -624,10 +786,13 @@ static void o2nm_cluster_release(struct config_item *item) static struct configfs_item_operations o2nm_cluster_item_ops = { .release = o2nm_cluster_release, + .show_attribute = o2nm_cluster_show, + .store_attribute = o2nm_cluster_store, }; static struct config_item_type o2nm_cluster_type = { .ct_item_ops = &o2nm_cluster_item_ops, + .ct_attrs = o2nm_cluster_attrs, .ct_owner = THIS_MODULE, }; @@ -660,8 +825,8 @@ static struct config_group *o2nm_cluster_group_make_group(struct config_group *g if (o2nm_single_cluster) goto out; /* ENOSPC */ - cluster = kcalloc(1, sizeof(struct o2nm_cluster), GFP_KERNEL); - ns = kcalloc(1, sizeof(struct o2nm_node_group), GFP_KERNEL); + cluster = kzalloc(sizeof(struct o2nm_cluster), GFP_KERNEL); + ns = kzalloc(sizeof(struct o2nm_node_group), GFP_KERNEL); defs = kcalloc(3, sizeof(struct config_group *), 
GFP_KERNEL); o2hb_group = o2hb_alloc_hb_set(); if (cluster == NULL || ns == NULL || o2hb_group == NULL || defs == NULL) @@ -678,6 +843,9 @@ static struct config_group *o2nm_cluster_group_make_group(struct config_group *g cluster->cl_group.default_groups[2] = NULL; rwlock_init(&cluster->cl_nodes_lock); cluster->cl_node_ip_tree = RB_ROOT; + cluster->cl_reconnect_delay_ms = O2NET_RECONNECT_DELAY_MS_DEFAULT; + cluster->cl_idle_timeout_ms = O2NET_IDLE_TIMEOUT_MS_DEFAULT; + cluster->cl_keepalive_delay_ms = O2NET_KEEPALIVE_DELAY_MS_DEFAULT; ret = &cluster->cl_group; o2nm_single_cluster = cluster; diff --git a/fs/ocfs2/cluster/nodemanager.h b/fs/ocfs2/cluster/nodemanager.h index fce8033c310..8fb23cacc2f 100644 --- a/fs/ocfs2/cluster/nodemanager.h +++ b/fs/ocfs2/cluster/nodemanager.h @@ -53,6 +53,23 @@ struct o2nm_node { unsigned long nd_set_attributes; }; +struct o2nm_cluster { + struct config_group cl_group; + unsigned cl_has_local:1; + u8 cl_local_node; + rwlock_t cl_nodes_lock; + struct o2nm_node *cl_nodes[O2NM_MAX_NODES]; + struct rb_root cl_node_ip_tree; + unsigned int cl_idle_timeout_ms; + unsigned int cl_keepalive_delay_ms; + unsigned int cl_reconnect_delay_ms; + + /* this bitmap is part of a hack for disk bitmap.. will go eventually. 
- zab */ + unsigned long cl_nodes_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; +}; + +extern struct o2nm_cluster *o2nm_single_cluster; + u8 o2nm_this_node(void); int o2nm_configured_node_map(unsigned long *map, unsigned bytes); diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 9b3209dc0b1..1718215fc01 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -147,6 +147,28 @@ static void o2net_listen_data_ready(struct sock *sk, int bytes); static void o2net_sc_send_keep_req(struct work_struct *work); static void o2net_idle_timer(unsigned long data); static void o2net_sc_postpone_idle(struct o2net_sock_container *sc); +static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc); + +/* + * FIXME: These should use to_o2nm_cluster_from_node(), but we end up + * losing our parent link to the cluster during shutdown. This can be + * solved by adding a pre-removal callback to configfs, or passing + * around the cluster with the node. -jeffm + */ +static inline int o2net_reconnect_delay(struct o2nm_node *node) +{ + return o2nm_single_cluster->cl_reconnect_delay_ms; +} + +static inline int o2net_keepalive_delay(struct o2nm_node *node) +{ + return o2nm_single_cluster->cl_keepalive_delay_ms; +} + +static inline int o2net_idle_timeout(struct o2nm_node *node) +{ + return o2nm_single_cluster->cl_idle_timeout_ms; +} static inline int o2net_sys_err_to_errno(enum o2net_system_error err) { @@ -271,6 +293,8 @@ static void sc_kref_release(struct kref *kref) { struct o2net_sock_container *sc = container_of(kref, struct o2net_sock_container, sc_kref); + BUG_ON(timer_pending(&sc->sc_idle_timeout)); + sclog(sc, "releasing\n"); if (sc->sc_sock) { @@ -300,7 +324,7 @@ static struct o2net_sock_container *sc_alloc(struct o2nm_node *node) struct page *page = NULL; page = alloc_page(GFP_NOFS); - sc = kcalloc(1, sizeof(*sc), GFP_NOFS); + sc = kzalloc(sizeof(*sc), GFP_NOFS); if (sc == NULL || page == NULL) goto out; @@ -356,6 +380,13 @@ static void 
o2net_sc_cancel_delayed_work(struct o2net_sock_container *sc, sc_put(sc); } +static atomic_t o2net_connected_peers = ATOMIC_INIT(0); + +int o2net_num_connected_peers(void) +{ + return atomic_read(&o2net_connected_peers); +} + static void o2net_set_nn_state(struct o2net_node *nn, struct o2net_sock_container *sc, unsigned valid, int err) @@ -366,6 +397,11 @@ static void o2net_set_nn_state(struct o2net_node *nn, assert_spin_locked(&nn->nn_lock); + if (old_sc && !sc) + atomic_dec(&o2net_connected_peers); + else if (!old_sc && sc) + atomic_inc(&o2net_connected_peers); + /* the node num comparison and single connect/accept path should stop * an non-null sc from being overwritten with another */ BUG_ON(sc && nn->nn_sc && nn->nn_sc != sc); @@ -424,9 +460,9 @@ static void o2net_set_nn_state(struct o2net_node *nn, /* delay if we're withing a RECONNECT_DELAY of the * last attempt */ delay = (nn->nn_last_connect_attempt + - msecs_to_jiffies(O2NET_RECONNECT_DELAY_MS)) + msecs_to_jiffies(o2net_reconnect_delay(sc->sc_node))) - jiffies; - if (delay > msecs_to_jiffies(O2NET_RECONNECT_DELAY_MS)) + if (delay > msecs_to_jiffies(o2net_reconnect_delay(sc->sc_node))) delay = 0; mlog(ML_CONN, "queueing conn attempt in %lu jiffies\n", delay); queue_delayed_work(o2net_wq, &nn->nn_connect_work, delay); @@ -520,6 +556,8 @@ static void o2net_register_callbacks(struct sock *sk, sk->sk_data_ready = o2net_data_ready; sk->sk_state_change = o2net_state_change; + mutex_init(&sc->sc_send_lock); + write_unlock_bh(&sk->sk_callback_lock); } @@ -652,6 +690,7 @@ static void o2net_handler_put(struct o2net_msg_handler *nmh) * be given to the handler if their payload is longer than the max. 
*/ int o2net_register_handler(u32 msg_type, u32 key, u32 max_len, o2net_msg_handler_func *func, void *data, + o2net_post_msg_handler_func *post_func, struct list_head *unreg_list) { struct o2net_msg_handler *nmh = NULL; @@ -678,7 +717,7 @@ int o2net_register_handler(u32 msg_type, u32 key, u32 max_len, goto out; } - nmh = kcalloc(1, sizeof(struct o2net_msg_handler), GFP_NOFS); + nmh = kzalloc(sizeof(struct o2net_msg_handler), GFP_NOFS); if (nmh == NULL) { ret = -ENOMEM; goto out; @@ -686,6 +725,7 @@ int o2net_register_handler(u32 msg_type, u32 key, u32 max_len, nmh->nh_func = func; nmh->nh_func_data = data; + nmh->nh_post_func = post_func; nmh->nh_msg_type = msg_type; nmh->nh_max_len = max_len; nmh->nh_key = key; @@ -820,10 +860,12 @@ static void o2net_sendpage(struct o2net_sock_container *sc, ssize_t ret; + mutex_lock(&sc->sc_send_lock); ret = sc->sc_sock->ops->sendpage(sc->sc_sock, virt_to_page(kmalloced_virt), (long)kmalloced_virt & ~PAGE_MASK, size, MSG_DONTWAIT); + mutex_unlock(&sc->sc_send_lock); if (ret != size) { mlog(ML_ERROR, "sendpage of size %zu to " SC_NODEF_FMT " failed with %zd\n", size, SC_NODEF_ARGS(sc), ret); @@ -938,8 +980,10 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec, /* finally, convert the message header to network byte-order * and send */ + mutex_lock(&sc->sc_send_lock); ret = o2net_send_tcp_msg(sc->sc_sock, vec, veclen, sizeof(struct o2net_msg) + caller_bytes); + mutex_unlock(&sc->sc_send_lock); msglog(msg, "sending returned %d\n", ret); if (ret < 0) { mlog(0, "error returned from o2net_send_tcp_msg=%d\n", ret); @@ -1013,6 +1057,7 @@ static int o2net_process_message(struct o2net_sock_container *sc, int ret = 0, handler_status; enum o2net_system_error syserr; struct o2net_msg_handler *nmh = NULL; + void *ret_data = NULL; msglog(hdr, "processing message\n"); @@ -1065,17 +1110,26 @@ static int o2net_process_message(struct o2net_sock_container *sc, sc->sc_msg_type = be16_to_cpu(hdr->msg_type); handler_status = 
(nmh->nh_func)(hdr, sizeof(struct o2net_msg) + be16_to_cpu(hdr->data_len), - nmh->nh_func_data); + nmh->nh_func_data, &ret_data); do_gettimeofday(&sc->sc_tv_func_stop); out_respond: /* this destroys the hdr, so don't use it after this */ + mutex_lock(&sc->sc_send_lock); ret = o2net_send_status_magic(sc->sc_sock, hdr, syserr, handler_status); + mutex_unlock(&sc->sc_send_lock); hdr = NULL; mlog(0, "sending handler status %d, syserr %d returned %d\n", handler_status, syserr, ret); + if (nmh) { + BUG_ON(ret_data != NULL && nmh->nh_post_func == NULL); + if (nmh->nh_post_func) + (nmh->nh_post_func)(handler_status, nmh->nh_func_data, + ret_data); + } + out: if (nmh) o2net_handler_put(nmh); @@ -1099,13 +1153,51 @@ static int o2net_check_handshake(struct o2net_sock_container *sc) return -1; } + /* + * Ensure timeouts are consistent with other nodes, otherwise + * we can end up with one node thinking that the other must be down, + * but isn't. This can ultimately cause corruption. + */ + if (be32_to_cpu(hand->o2net_idle_timeout_ms) != + o2net_idle_timeout(sc->sc_node)) { + mlog(ML_NOTICE, SC_NODEF_FMT " uses a network idle timeout of " + "%u ms, but we use %u ms locally. disconnecting\n", + SC_NODEF_ARGS(sc), + be32_to_cpu(hand->o2net_idle_timeout_ms), + o2net_idle_timeout(sc->sc_node)); + o2net_ensure_shutdown(nn, sc, -ENOTCONN); + return -1; + } + + if (be32_to_cpu(hand->o2net_keepalive_delay_ms) != + o2net_keepalive_delay(sc->sc_node)) { + mlog(ML_NOTICE, SC_NODEF_FMT " uses a keepalive delay of " + "%u ms, but we use %u ms locally. disconnecting\n", + SC_NODEF_ARGS(sc), + be32_to_cpu(hand->o2net_keepalive_delay_ms), + o2net_keepalive_delay(sc->sc_node)); + o2net_ensure_shutdown(nn, sc, -ENOTCONN); + return -1; + } + + if (be32_to_cpu(hand->o2hb_heartbeat_timeout_ms) != + O2HB_MAX_WRITE_TIMEOUT_MS) { + mlog(ML_NOTICE, SC_NODEF_FMT " uses a heartbeat timeout of " + "%u ms, but we use %u ms locally. 
disconnecting\n", + SC_NODEF_ARGS(sc), + be32_to_cpu(hand->o2hb_heartbeat_timeout_ms), + O2HB_MAX_WRITE_TIMEOUT_MS); + o2net_ensure_shutdown(nn, sc, -ENOTCONN); + return -1; + } + sc->sc_handshake_ok = 1; spin_lock(&nn->nn_lock); /* set valid and queue the idle timers only if it hasn't been * shut down already */ if (nn->nn_sc == sc) { - o2net_sc_postpone_idle(sc); + o2net_sc_reset_idle_timer(sc); o2net_set_nn_state(nn, sc, 1, 0); } spin_unlock(&nn->nn_lock); @@ -1131,6 +1223,23 @@ static int o2net_advance_rx(struct o2net_sock_container *sc) sclog(sc, "receiving\n"); do_gettimeofday(&sc->sc_tv_advance_start); + if (unlikely(sc->sc_handshake_ok == 0)) { + if(sc->sc_page_off < sizeof(struct o2net_handshake)) { + data = page_address(sc->sc_page) + sc->sc_page_off; + datalen = sizeof(struct o2net_handshake) - sc->sc_page_off; + ret = o2net_recv_tcp_msg(sc->sc_sock, data, datalen); + if (ret > 0) + sc->sc_page_off += ret; + } + + if (sc->sc_page_off == sizeof(struct o2net_handshake)) { + o2net_check_handshake(sc); + if (unlikely(sc->sc_handshake_ok == 0)) + ret = -EPROTO; + } + goto out; + } + /* do we need more header? */ if (sc->sc_page_off < sizeof(struct o2net_msg)) { data = page_address(sc->sc_page) + sc->sc_page_off; @@ -1138,15 +1247,6 @@ static int o2net_advance_rx(struct o2net_sock_container *sc) ret = o2net_recv_tcp_msg(sc->sc_sock, data, datalen); if (ret > 0) { sc->sc_page_off += ret; - - /* this working relies on the handshake being - * smaller than the normal message header */ - if (sc->sc_page_off >= sizeof(struct o2net_handshake)&& - !sc->sc_handshake_ok && o2net_check_handshake(sc)) { - ret = -EPROTO; - goto out; - } - /* only swab incoming here.. 
we can * only get here once as we cross from * being under to over */ @@ -1248,6 +1348,18 @@ static int o2net_set_nodelay(struct socket *sock) return ret; } +static void o2net_initialize_handshake(void) +{ + o2net_hand->o2hb_heartbeat_timeout_ms = cpu_to_be32( + O2HB_MAX_WRITE_TIMEOUT_MS); + o2net_hand->o2net_idle_timeout_ms = cpu_to_be32( + o2net_idle_timeout(NULL)); + o2net_hand->o2net_keepalive_delay_ms = cpu_to_be32( + o2net_keepalive_delay(NULL)); + o2net_hand->o2net_reconnect_delay_ms = cpu_to_be32( + o2net_reconnect_delay(NULL)); +} + /* ------------------------------------------------------------ */ /* called when a connect completes and after a sock is accepted. the @@ -1262,6 +1374,7 @@ static void o2net_sc_connect_completed(struct work_struct *work) (unsigned long long)O2NET_PROTOCOL_VERSION, (unsigned long long)be64_to_cpu(o2net_hand->connector_id)); + o2net_initialize_handshake(); o2net_sendpage(sc, o2net_hand, sizeof(*o2net_hand)); sc_put(sc); } @@ -1287,8 +1400,10 @@ static void o2net_idle_timer(unsigned long data) do_gettimeofday(&now); - printk(KERN_INFO "o2net: connection to " SC_NODEF_FMT " has been idle for 10 " - "seconds, shutting it down.\n", SC_NODEF_ARGS(sc)); + printk(KERN_INFO "o2net: connection to " SC_NODEF_FMT " has been idle for %u.%u " + "seconds, shutting it down.\n", SC_NODEF_ARGS(sc), + o2net_idle_timeout(sc->sc_node) / 1000, + o2net_idle_timeout(sc->sc_node) % 1000); mlog(ML_NOTICE, "here are some times that might help debug the " "situation: (tmr %ld.%ld now %ld.%ld dr %ld.%ld adv " "%ld.%ld:%ld.%ld func (%08x:%u) %ld.%ld:%ld.%ld)\n", @@ -1306,14 +1421,21 @@ static void o2net_idle_timer(unsigned long data) o2net_sc_queue_work(sc, &sc->sc_shutdown_work); } -static void o2net_sc_postpone_idle(struct o2net_sock_container *sc) +static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc) { o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work); o2net_sc_queue_delayed_work(sc, &sc->sc_keepalive_work, - 
O2NET_KEEPALIVE_DELAY_SECS * HZ); + msecs_to_jiffies(o2net_keepalive_delay(sc->sc_node))); do_gettimeofday(&sc->sc_tv_timer); mod_timer(&sc->sc_idle_timeout, - jiffies + (O2NET_IDLE_TIMEOUT_SECS * HZ)); + jiffies + msecs_to_jiffies(o2net_idle_timeout(sc->sc_node))); +} + +static void o2net_sc_postpone_idle(struct o2net_sock_container *sc) +{ + /* Only push out an existing timer */ + if (timer_pending(&sc->sc_idle_timeout)) + o2net_sc_reset_idle_timer(sc); } /* this work func is kicked whenever a path sets the nn state which doesn't @@ -1435,9 +1557,12 @@ static void o2net_connect_expired(struct work_struct *work) spin_lock(&nn->nn_lock); if (!nn->nn_sc_valid) { + struct o2nm_node *node = nn->nn_sc->sc_node; mlog(ML_ERROR, "no connection established with node %u after " - "%u seconds, giving up and returning errors.\n", - o2net_num_from_nn(nn), O2NET_IDLE_TIMEOUT_SECS); + "%u.%u seconds, giving up and returning errors.\n", + o2net_num_from_nn(nn), + o2net_idle_timeout(node) / 1000, + o2net_idle_timeout(node) % 1000); o2net_set_nn_state(nn, NULL, 0, -ENOTCONN); } @@ -1478,6 +1603,8 @@ static void o2net_hb_node_down_cb(struct o2nm_node *node, int node_num, if (node_num != o2nm_this_node()) o2net_disconnect_node(node); + + BUG_ON(atomic_read(&o2net_connected_peers) < 0); } static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num, @@ -1489,14 +1616,14 @@ static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num, /* ensure an immediate connect attempt */ nn->nn_last_connect_attempt = jiffies - - (msecs_to_jiffies(O2NET_RECONNECT_DELAY_MS) + 1); + (msecs_to_jiffies(o2net_reconnect_delay(node)) + 1); if (node_num != o2nm_this_node()) { /* heartbeat doesn't work unless a local node number is * configured and doing so brings up the o2net_wq, so we can * use it.. 
*/ queue_delayed_work(o2net_wq, &nn->nn_connect_expired, - O2NET_IDLE_TIMEOUT_SECS * HZ); + msecs_to_jiffies(o2net_idle_timeout(node))); /* believe it or not, accept and node hearbeating testing * can succeed for this node before we got here.. so @@ -1641,6 +1768,7 @@ static int o2net_accept_one(struct socket *sock) o2net_register_callbacks(sc->sc_sock->sk, sc); o2net_sc_queue_work(sc, &sc->sc_rx_work); + o2net_initialize_handshake(); o2net_sendpage(sc, o2net_hand, sizeof(*o2net_hand)); out: @@ -1685,13 +1813,13 @@ out: ready(sk, bytes); } -static int o2net_open_listening_sock(__be16 port) +static int o2net_open_listening_sock(__be32 addr, __be16 port) { struct socket *sock = NULL; int ret; struct sockaddr_in sin = { .sin_family = PF_INET, - .sin_addr = { .s_addr = (__force u32)htonl(INADDR_ANY) }, + .sin_addr = { .s_addr = (__force u32)addr }, .sin_port = (__force u16)port, }; @@ -1714,15 +1842,15 @@ static int o2net_open_listening_sock(__be16 port) sock->sk->sk_reuse = 1; ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin)); if (ret < 0) { - mlog(ML_ERROR, "unable to bind socket to port %d, ret=%d\n", - ntohs(port), ret); + mlog(ML_ERROR, "unable to bind socket at %u.%u.%u.%u:%u, " + "ret=%d\n", NIPQUAD(addr), ntohs(port), ret); goto out; } ret = sock->ops->listen(sock, 64); if (ret < 0) { - mlog(ML_ERROR, "unable to listen on port %d, ret=%d\n", - ntohs(port), ret); + mlog(ML_ERROR, "unable to listen on %u.%u.%u.%u:%u, ret=%d\n", + NIPQUAD(addr), ntohs(port), ret); } out: @@ -1755,7 +1883,8 @@ int o2net_start_listening(struct o2nm_node *node) return -ENOMEM; /* ? 
*/ } - ret = o2net_open_listening_sock(node->nd_ipv4_port); + ret = o2net_open_listening_sock(node->nd_ipv4_address, + node->nd_ipv4_port); if (ret) { destroy_workqueue(o2net_wq); o2net_wq = NULL; @@ -1808,9 +1937,9 @@ int o2net_init(void) o2quo_init(); - o2net_hand = kcalloc(1, sizeof(struct o2net_handshake), GFP_KERNEL); - o2net_keep_req = kcalloc(1, sizeof(struct o2net_msg), GFP_KERNEL); - o2net_keep_resp = kcalloc(1, sizeof(struct o2net_msg), GFP_KERNEL); + o2net_hand = kzalloc(sizeof(struct o2net_handshake), GFP_KERNEL); + o2net_keep_req = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL); + o2net_keep_resp = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL); if (!o2net_hand || !o2net_keep_req || !o2net_keep_resp) { kfree(o2net_hand); kfree(o2net_keep_req); diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h index 616ff2b8434..da880fc215f 100644 --- a/fs/ocfs2/cluster/tcp.h +++ b/fs/ocfs2/cluster/tcp.h @@ -50,10 +50,20 @@ struct o2net_msg __u8 buf[0]; }; -typedef int (o2net_msg_handler_func)(struct o2net_msg *msg, u32 len, void *data); +typedef int (o2net_msg_handler_func)(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +typedef void (o2net_post_msg_handler_func)(int status, void *data, + void *ret_data); #define O2NET_MAX_PAYLOAD_BYTES (4096 - sizeof(struct o2net_msg)) +/* same as hb delay, we're waiting for another node to recognize our hb */ +#define O2NET_RECONNECT_DELAY_MS_DEFAULT 2000 + +#define O2NET_KEEPALIVE_DELAY_MS_DEFAULT 5000 +#define O2NET_IDLE_TIMEOUT_MS_DEFAULT 10000 + + /* TODO: figure this out.... 
*/ static inline int o2net_link_down(int err, struct socket *sock) { @@ -92,6 +102,7 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *vec, int o2net_register_handler(u32 msg_type, u32 key, u32 max_len, o2net_msg_handler_func *func, void *data, + o2net_post_msg_handler_func *post_func, struct list_head *unreg_list); void o2net_unregister_handler_list(struct list_head *list); @@ -101,6 +112,7 @@ void o2net_unregister_hb_callbacks(void); int o2net_start_listening(struct o2nm_node *node); void o2net_stop_listening(struct o2nm_node *node); void o2net_disconnect_node(struct o2nm_node *node); +int o2net_num_connected_peers(void); int o2net_init(void); void o2net_exit(void); diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h index daebbd3a2c8..4dae5df5e46 100644 --- a/fs/ocfs2/cluster/tcp_internal.h +++ b/fs/ocfs2/cluster/tcp_internal.h @@ -27,23 +27,26 @@ #define O2NET_MSG_KEEP_REQ_MAGIC ((u16)0xfa57) #define O2NET_MSG_KEEP_RESP_MAGIC ((u16)0xfa58) -/* same as hb delay, we're waiting for another node to recognize our hb */ -#define O2NET_RECONNECT_DELAY_MS O2HB_REGION_TIMEOUT_MS - /* we're delaying our quorum decision so that heartbeat will have timed * out truly dead nodes by the time we come around to making decisions * on their number */ #define O2NET_QUORUM_DELAY_MS ((o2hb_dead_threshold + 2) * O2HB_REGION_TIMEOUT_MS) -#define O2NET_KEEPALIVE_DELAY_SECS 5 -#define O2NET_IDLE_TIMEOUT_SECS 10 - /* * This version number represents quite a lot, unfortunately. It not * only represents the raw network message protocol on the wire but also * locking semantics of the file system using the protocol. It should * be somewhere else, I'm sure, but right now it isn't. * + * New in version 7: + * - DLM join domain includes the live nodemap + * + * New in version 6: + * - DLM lockres remote refcount fixes. 
+ * + * New in version 5: + * - Network timeout checking protocol + * * New in version 4: * - Remove i_generation from lock names for better stat performance. * @@ -54,10 +57,14 @@ * - full 64 bit i_size in the metadata lock lvbs * - introduction of "rw" lock and pushing meta/data locking down */ -#define O2NET_PROTOCOL_VERSION 4ULL +#define O2NET_PROTOCOL_VERSION 7ULL struct o2net_handshake { __be64 protocol_version; __be64 connector_id; + __be32 o2hb_heartbeat_timeout_ms; + __be32 o2net_idle_timeout_ms; + __be32 o2net_keepalive_delay_ms; + __be32 o2net_reconnect_delay_ms; }; struct o2net_node { @@ -148,6 +155,8 @@ struct o2net_sock_container { struct timeval sc_tv_func_stop; u32 sc_msg_key; u16 sc_msg_type; + + struct mutex sc_send_lock; }; struct o2net_msg_handler { @@ -157,6 +166,8 @@ struct o2net_msg_handler { u32 nh_key; o2net_msg_handler_func *nh_func; o2net_msg_handler_func *nh_func_data; + o2net_post_msg_handler_func + *nh_post_func; struct kref nh_kref; struct list_head nh_unregister_item; }; diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index baad2aa27c1..66821e17816 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -79,7 +79,7 @@ int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir) struct buffer_head * bh, * tmp; struct ocfs2_dir_entry * de; int err; - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = filp->f_path.dentry->d_inode; struct super_block * sb = inode->i_sb; unsigned int ra_sectors = 16; int lock_level = 0; diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c index 681046d5139..241cad342a4 100644 --- a/fs/ocfs2/dlm/dlmast.c +++ b/fs/ocfs2/dlm/dlmast.c @@ -263,7 +263,8 @@ void dlm_do_local_bast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, -int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data) +int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) { int ret; unsigned int locklen; @@ -311,8 +312,8 @@ int dlm_proxy_ast_handler(struct 
o2net_msg *msg, u32 len, void *data) past->type != DLM_BAST) { mlog(ML_ERROR, "Unknown ast type! %d, cookie=%u:%llu" "name=%.*s\n", past->type, - dlm_get_lock_cookie_node(cookie), - dlm_get_lock_cookie_seq(cookie), + dlm_get_lock_cookie_node(be64_to_cpu(cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(cookie)), locklen, name); ret = DLM_IVLOCKID; goto leave; @@ -323,8 +324,8 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data) mlog(0, "got %sast for unknown lockres! " "cookie=%u:%llu, name=%.*s, namelen=%u\n", past->type == DLM_AST ? "" : "b", - dlm_get_lock_cookie_node(cookie), - dlm_get_lock_cookie_seq(cookie), + dlm_get_lock_cookie_node(be64_to_cpu(cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(cookie)), locklen, name, locklen); ret = DLM_IVLOCKID; goto leave; @@ -369,7 +370,8 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data) mlog(0, "got %sast for unknown lock! cookie=%u:%llu, " "name=%.*s, namelen=%u\n", past->type == DLM_AST ? "" : "b", - dlm_get_lock_cookie_node(cookie), dlm_get_lock_cookie_seq(cookie), + dlm_get_lock_cookie_node(be64_to_cpu(cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(cookie)), locklen, name, locklen); ret = DLM_NORMAL; diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index 6b6ff76538c..e90b92f9ece 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -180,6 +180,11 @@ struct dlm_assert_master_priv unsigned ignore_higher:1; }; +struct dlm_deref_lockres_priv +{ + struct dlm_lock_resource *deref_res; + u8 deref_node; +}; struct dlm_work_item { @@ -191,6 +196,7 @@ struct dlm_work_item struct dlm_request_all_locks_priv ral; struct dlm_mig_lockres_priv ml; struct dlm_assert_master_priv am; + struct dlm_deref_lockres_priv dl; } u; }; @@ -222,6 +228,9 @@ static inline void __dlm_set_joining_node(struct dlm_ctxt *dlm, #define DLM_LOCK_RES_DIRTY 0x00000008 #define DLM_LOCK_RES_IN_PROGRESS 0x00000010 #define DLM_LOCK_RES_MIGRATING 0x00000020 +#define 
DLM_LOCK_RES_DROPPING_REF 0x00000040 +#define DLM_LOCK_RES_BLOCK_DIRTY 0x00001000 +#define DLM_LOCK_RES_SETREF_INPROG 0x00002000 /* max milliseconds to wait to sync up a network failure with a node death */ #define DLM_NODE_DEATH_WAIT_MAX (5 * 1000) @@ -265,6 +274,8 @@ struct dlm_lock_resource u8 owner; //node which owns the lock resource, or unknown u16 state; char lvb[DLM_LVB_LEN]; + unsigned int inflight_locks; + unsigned long refmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; }; struct dlm_migratable_lock @@ -367,7 +378,7 @@ enum { DLM_CONVERT_LOCK_MSG, /* 504 */ DLM_PROXY_AST_MSG, /* 505 */ DLM_UNLOCK_LOCK_MSG, /* 506 */ - DLM_UNUSED_MSG2, /* 507 */ + DLM_DEREF_LOCKRES_MSG, /* 507 */ DLM_MIGRATE_REQUEST_MSG, /* 508 */ DLM_MIG_LOCKRES_MSG, /* 509 */ DLM_QUERY_JOIN_MSG, /* 510 */ @@ -417,6 +428,9 @@ struct dlm_master_request u8 name[O2NM_MAX_NAME_LEN]; }; +#define DLM_ASSERT_RESPONSE_REASSERT 0x00000001 +#define DLM_ASSERT_RESPONSE_MASTERY_REF 0x00000002 + #define DLM_ASSERT_MASTER_MLE_CLEANUP 0x00000001 #define DLM_ASSERT_MASTER_REQUERY 0x00000002 #define DLM_ASSERT_MASTER_FINISH_MIGRATION 0x00000004 @@ -430,6 +444,8 @@ struct dlm_assert_master u8 name[O2NM_MAX_NAME_LEN]; }; +#define DLM_MIGRATE_RESPONSE_MASTERY_REF 0x00000001 + struct dlm_migrate_request { u8 master; @@ -609,12 +625,16 @@ struct dlm_begin_reco }; +#define BITS_PER_BYTE 8 +#define BITS_TO_BYTES(bits) (((bits)+BITS_PER_BYTE-1)/BITS_PER_BYTE) + struct dlm_query_join_request { u8 node_idx; u8 pad1[2]; u8 name_len; u8 domain[O2NM_MAX_NAME_LEN]; + u8 node_map[BITS_TO_BYTES(O2NM_MAX_NODES)]; }; struct dlm_assert_joined @@ -648,6 +668,16 @@ struct dlm_finalize_reco __be32 pad2; }; +struct dlm_deref_lockres +{ + u32 pad1; + u16 pad2; + u8 node_idx; + u8 namelen; + + u8 name[O2NM_MAX_NAME_LEN]; +}; + static inline enum dlm_status __dlm_lockres_state_to_status(struct dlm_lock_resource *res) { @@ -688,16 +718,20 @@ void dlm_lock_put(struct dlm_lock *lock); void dlm_lock_attach_lockres(struct dlm_lock *lock, struct 
dlm_lock_resource *res); -int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data); -int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data); -int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data); +int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); void dlm_revert_pending_convert(struct dlm_lock_resource *res, struct dlm_lock *lock); void dlm_revert_pending_lock(struct dlm_lock_resource *res, struct dlm_lock *lock); -int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data); +int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); void dlm_commit_pending_cancel(struct dlm_lock_resource *res, struct dlm_lock *lock); void dlm_commit_pending_unlock(struct dlm_lock_resource *res, @@ -721,8 +755,6 @@ void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); void dlm_lockres_calc_usage(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); -void dlm_purge_lockres(struct dlm_ctxt *dlm, - struct dlm_lock_resource *lockres); static inline void dlm_lockres_get(struct dlm_lock_resource *res) { /* This is called on every lookup, so it might be worth @@ -733,6 +765,10 @@ void dlm_lockres_put(struct dlm_lock_resource *res); void __dlm_unhash_lockres(struct dlm_lock_resource *res); void __dlm_insert_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); +struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm, + const char *name, + unsigned int len, + unsigned int hash); struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm, const char *name, unsigned int len, @@ -753,6 +789,47 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm, const char *name, unsigned int namelen); +#define 
dlm_lockres_set_refmap_bit(bit,res) \ + __dlm_lockres_set_refmap_bit(bit,res,__FILE__,__LINE__) +#define dlm_lockres_clear_refmap_bit(bit,res) \ + __dlm_lockres_clear_refmap_bit(bit,res,__FILE__,__LINE__) + +static inline void __dlm_lockres_set_refmap_bit(int bit, + struct dlm_lock_resource *res, + const char *file, + int line) +{ + //printk("%s:%d:%.*s: setting bit %d\n", file, line, + // res->lockname.len, res->lockname.name, bit); + set_bit(bit, res->refmap); +} + +static inline void __dlm_lockres_clear_refmap_bit(int bit, + struct dlm_lock_resource *res, + const char *file, + int line) +{ + //printk("%s:%d:%.*s: clearing bit %d\n", file, line, + // res->lockname.len, res->lockname.name, bit); + clear_bit(bit, res->refmap); +} + +void __dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + const char *file, + int line); +void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + int new_lockres, + const char *file, + int line); +#define dlm_lockres_drop_inflight_ref(d,r) \ + __dlm_lockres_drop_inflight_ref(d,r,__FILE__,__LINE__) +#define dlm_lockres_grab_inflight_ref(d,r) \ + __dlm_lockres_grab_inflight_ref(d,r,0,__FILE__,__LINE__) +#define dlm_lockres_grab_inflight_ref_new(d,r) \ + __dlm_lockres_grab_inflight_ref(d,r,1,__FILE__,__LINE__) + void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock); void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock); void dlm_do_local_ast(struct dlm_ctxt *dlm, @@ -801,10 +878,7 @@ int dlm_heartbeat_init(struct dlm_ctxt *dlm); void dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data); void dlm_hb_node_up_cb(struct o2nm_node *node, int idx, void *data); -int dlm_lockres_is_dirty(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); -int dlm_migrate_lockres(struct dlm_ctxt *dlm, - struct dlm_lock_resource *res, - u8 target); +int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); int dlm_finish_migration(struct 
dlm_ctxt *dlm, struct dlm_lock_resource *res, u8 old_master); @@ -812,15 +886,27 @@ void dlm_lockres_release_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); void __dlm_lockres_reserve_ast(struct dlm_lock_resource *res); -int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data); -int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data); -int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data); -int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data); -int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data); -int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data); -int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data); -int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data); -int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data); +int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +void dlm_assert_master_post_handler(int status, void *data, void *ret_data); +int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); int dlm_do_master_requery(struct dlm_ctxt *dlm, 
struct dlm_lock_resource *res, u8 nodenum, u8 *real_master); @@ -856,10 +942,12 @@ static inline void __dlm_wait_on_lockres(struct dlm_lock_resource *res) int dlm_init_mle_cache(void); void dlm_destroy_mle_cache(void); void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up); +int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res); void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node); int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock); - +int __dlm_lockres_has_locks(struct dlm_lock_resource *res); int __dlm_lockres_unused(struct dlm_lock_resource *res); static inline const char * dlm_lock_mode_name(int mode) diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c index c764dc8e40a..ecb4d997221 100644 --- a/fs/ocfs2/dlm/dlmconvert.c +++ b/fs/ocfs2/dlm/dlmconvert.c @@ -286,8 +286,8 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm, __dlm_print_one_lock_resource(res); mlog(ML_ERROR, "converting a remote lock that is already " "converting! 
(cookie=%u:%llu, conv=%d)\n", - dlm_get_lock_cookie_node(lock->ml.cookie), - dlm_get_lock_cookie_seq(lock->ml.cookie), + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), lock->ml.convert_type); status = DLM_DENIED; goto bail; @@ -418,7 +418,8 @@ static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm, * returns: DLM_NORMAL, DLM_IVLOCKID, DLM_BADARGS, * status from __dlmconvert_master */ -int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data) +int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) { struct dlm_ctxt *dlm = data; struct dlm_convert_lock *cnv = (struct dlm_convert_lock *)msg->buf; @@ -428,7 +429,7 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data) struct dlm_lockstatus *lksb; enum dlm_status status = DLM_NORMAL; u32 flags; - int call_ast = 0, kick_thread = 0, ast_reserved = 0; + int call_ast = 0, kick_thread = 0, ast_reserved = 0, wake = 0; if (!dlm_grab(dlm)) { dlm_error(DLM_REJECTED); @@ -479,25 +480,14 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data) } lock = NULL; } - if (!lock) { - __dlm_print_one_lock_resource(res); - list_for_each(iter, &res->granted) { - lock = list_entry(iter, struct dlm_lock, list); - if (lock->ml.node == cnv->node_idx) { - mlog(ML_ERROR, "There is something here " - "for node %u, lock->ml.cookie=%llu, " - "cnv->cookie=%llu\n", cnv->node_idx, - (unsigned long long)lock->ml.cookie, - (unsigned long long)cnv->cookie); - break; - } - } - lock = NULL; - } spin_unlock(&res->spinlock); if (!lock) { status = DLM_IVLOCKID; - dlm_error(status); + mlog(ML_ERROR, "did not find lock to convert on grant queue! 
" + "cookie=%u:%llu\n", + dlm_get_lock_cookie_node(be64_to_cpu(cnv->cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(cnv->cookie))); + __dlm_print_one_lock_resource(res); goto leave; } @@ -524,8 +514,11 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data) cnv->requested_type, &call_ast, &kick_thread); res->state &= ~DLM_LOCK_RES_IN_PROGRESS; + wake = 1; } spin_unlock(&res->spinlock); + if (wake) + wake_up(&res->wq); if (status != DLM_NORMAL) { if (status != DLM_NOTQUEUED) @@ -534,12 +527,7 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data) } leave: - if (!lock) - mlog(ML_ERROR, "did not find lock to convert on grant queue! " - "cookie=%u:%llu\n", - dlm_get_lock_cookie_node(cnv->cookie), - dlm_get_lock_cookie_seq(cnv->cookie)); - else + if (lock) dlm_lock_put(lock); /* either queue the ast or release it, if reserved */ diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index 3f6c8d88f7a..64239b37e5d 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -53,6 +53,23 @@ void dlm_print_one_lock_resource(struct dlm_lock_resource *res) spin_unlock(&res->spinlock); } +static void dlm_print_lockres_refmap(struct dlm_lock_resource *res) +{ + int bit; + assert_spin_locked(&res->spinlock); + + mlog(ML_NOTICE, " refmap nodes: [ "); + bit = 0; + while (1) { + bit = find_next_bit(res->refmap, O2NM_MAX_NODES, bit); + if (bit >= O2NM_MAX_NODES) + break; + printk("%u ", bit); + bit++; + } + printk("], inflight=%u\n", res->inflight_locks); +} + void __dlm_print_one_lock_resource(struct dlm_lock_resource *res) { struct list_head *iter2; @@ -65,6 +82,7 @@ void __dlm_print_one_lock_resource(struct dlm_lock_resource *res) res->owner, res->state); mlog(ML_NOTICE, " last used: %lu, on purge list: %s\n", res->last_used, list_empty(&res->purge) ? 
"no" : "yes"); + dlm_print_lockres_refmap(res); mlog(ML_NOTICE, " granted queue: \n"); list_for_each(iter2, &res->granted) { lock = list_entry(iter2, struct dlm_lock, list); @@ -72,8 +90,8 @@ void __dlm_print_one_lock_resource(struct dlm_lock_resource *res) mlog(ML_NOTICE, " type=%d, conv=%d, node=%u, " "cookie=%u:%llu, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n", lock->ml.type, lock->ml.convert_type, lock->ml.node, - dlm_get_lock_cookie_node(lock->ml.cookie), - dlm_get_lock_cookie_seq(lock->ml.cookie), + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), list_empty(&lock->ast_list) ? 'y' : 'n', lock->ast_pending ? 'y' : 'n', list_empty(&lock->bast_list) ? 'y' : 'n', @@ -87,8 +105,8 @@ void __dlm_print_one_lock_resource(struct dlm_lock_resource *res) mlog(ML_NOTICE, " type=%d, conv=%d, node=%u, " "cookie=%u:%llu, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n", lock->ml.type, lock->ml.convert_type, lock->ml.node, - dlm_get_lock_cookie_node(lock->ml.cookie), - dlm_get_lock_cookie_seq(lock->ml.cookie), + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), list_empty(&lock->ast_list) ? 'y' : 'n', lock->ast_pending ? 'y' : 'n', list_empty(&lock->bast_list) ? 'y' : 'n', @@ -102,8 +120,8 @@ void __dlm_print_one_lock_resource(struct dlm_lock_resource *res) mlog(ML_NOTICE, " type=%d, conv=%d, node=%u, " "cookie=%u:%llu, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n", lock->ml.type, lock->ml.convert_type, lock->ml.node, - dlm_get_lock_cookie_node(lock->ml.cookie), - dlm_get_lock_cookie_seq(lock->ml.cookie), + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), list_empty(&lock->ast_list) ? 'y' : 'n', lock->ast_pending ? 'y' : 'n', list_empty(&lock->bast_list) ? 
'y' : 'n', diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 420a375a394..6087c4749fe 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -48,6 +48,36 @@ #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_DOMAIN) #include "cluster/masklog.h" +/* + * ocfs2 node maps are array of long int, which limits to send them freely + * across the wire due to endianness issues. To workaround this, we convert + * long ints to byte arrays. Following 3 routines are helper functions to + * set/test/copy bits within those array of bytes + */ +static inline void byte_set_bit(u8 nr, u8 map[]) +{ + map[nr >> 3] |= (1UL << (nr & 7)); +} + +static inline int byte_test_bit(u8 nr, u8 map[]) +{ + return ((1UL << (nr & 7)) & (map[nr >> 3])) != 0; +} + +static inline void byte_copymap(u8 dmap[], unsigned long smap[], + unsigned int sz) +{ + unsigned int nn; + + if (!sz) + return; + + memset(dmap, 0, ((sz + 7) >> 3)); + for (nn = 0 ; nn < sz; nn++) + if (test_bit(nn, smap)) + byte_set_bit(nn, dmap); +} + static void dlm_free_pagevec(void **vec, int pages) { while (pages--) @@ -95,10 +125,14 @@ static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events); #define DLM_DOMAIN_BACKOFF_MS 200 -static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data); -static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data); -static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data); -static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data); +static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm); @@ -125,10 
+159,10 @@ void __dlm_insert_lockres(struct dlm_ctxt *dlm, hlist_add_head(&res->hash_node, bucket); } -struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm, - const char *name, - unsigned int len, - unsigned int hash) +struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm, + const char *name, + unsigned int len, + unsigned int hash) { struct hlist_head *bucket; struct hlist_node *list; @@ -154,6 +188,37 @@ struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm, return NULL; } +/* intended to be called by functions which do not care about lock + * resources which are being purged (most net _handler functions). + * this will return NULL for any lock resource which is found but + * currently in the process of dropping its mastery reference. + * use __dlm_lookup_lockres_full when you need the lock resource + * regardless (e.g. dlm_get_lock_resource) */ +struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm, + const char *name, + unsigned int len, + unsigned int hash) +{ + struct dlm_lock_resource *res = NULL; + + mlog_entry("%.*s\n", len, name); + + assert_spin_locked(&dlm->spinlock); + + res = __dlm_lookup_lockres_full(dlm, name, len, hash); + if (res) { + spin_lock(&res->spinlock); + if (res->state & DLM_LOCK_RES_DROPPING_REF) { + spin_unlock(&res->spinlock); + dlm_lockres_put(res); + return NULL; + } + spin_unlock(&res->spinlock); + } + + return res; +} + struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm, const char *name, unsigned int len) @@ -330,43 +395,60 @@ static void dlm_complete_dlm_shutdown(struct dlm_ctxt *dlm) wake_up(&dlm_domain_events); } -static void dlm_migrate_all_locks(struct dlm_ctxt *dlm) +static int dlm_migrate_all_locks(struct dlm_ctxt *dlm) { - int i; + int i, num, n, ret = 0; struct dlm_lock_resource *res; + struct hlist_node *iter; + struct hlist_head *bucket; + int dropped; mlog(0, "Migrating locks from domain %s\n", dlm->name); -restart: + + num = 0; 
spin_lock(&dlm->spinlock); for (i = 0; i < DLM_HASH_BUCKETS; i++) { - while (!hlist_empty(dlm_lockres_hash(dlm, i))) { - res = hlist_entry(dlm_lockres_hash(dlm, i)->first, - struct dlm_lock_resource, hash_node); - /* need reference when manually grabbing lockres */ +redo_bucket: + n = 0; + bucket = dlm_lockres_hash(dlm, i); + iter = bucket->first; + while (iter) { + n++; + res = hlist_entry(iter, struct dlm_lock_resource, + hash_node); dlm_lockres_get(res); - /* this should unhash the lockres - * and exit with dlm->spinlock */ - mlog(0, "purging res=%p\n", res); - if (dlm_lockres_is_dirty(dlm, res)) { - /* HACK! this should absolutely go. - * need to figure out why some empty - * lockreses are still marked dirty */ - mlog(ML_ERROR, "lockres %.*s dirty!\n", - res->lockname.len, res->lockname.name); - - spin_unlock(&dlm->spinlock); - dlm_kick_thread(dlm, res); - wait_event(dlm->ast_wq, !dlm_lockres_is_dirty(dlm, res)); - dlm_lockres_put(res); - goto restart; - } - dlm_purge_lockres(dlm, res); + /* migrate, if necessary. this will drop the dlm + * spinlock and retake it if it does migration. 
*/ + dropped = dlm_empty_lockres(dlm, res); + + spin_lock(&res->spinlock); + __dlm_lockres_calc_usage(dlm, res); + iter = res->hash_node.next; + spin_unlock(&res->spinlock); + dlm_lockres_put(res); + + cond_resched_lock(&dlm->spinlock); + + if (dropped) + goto redo_bucket; } + num += n; + mlog(0, "%s: touched %d lockreses in bucket %d " + "(tot=%d)\n", dlm->name, n, i, num); } spin_unlock(&dlm->spinlock); - + wake_up(&dlm->dlm_thread_wq); + + /* let the dlm thread take care of purging, keep scanning until + * nothing remains in the hash */ + if (num) { + mlog(0, "%s: %d lock resources in hash last pass\n", + dlm->name, num); + ret = -EAGAIN; + } mlog(0, "DONE Migrating locks from domain %s\n", dlm->name); + return ret; } static int dlm_no_joining_node(struct dlm_ctxt *dlm) @@ -418,7 +500,8 @@ static void __dlm_print_nodes(struct dlm_ctxt *dlm) printk("\n"); } -static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data) +static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) { struct dlm_ctxt *dlm = data; unsigned int node; @@ -571,7 +654,9 @@ void dlm_unregister_domain(struct dlm_ctxt *dlm) /* We changed dlm state, notify the thread */ dlm_kick_thread(dlm, NULL); - dlm_migrate_all_locks(dlm); + while (dlm_migrate_all_locks(dlm)) { + mlog(0, "%s: more migration to do\n", dlm->name); + } dlm_mark_domain_leaving(dlm); dlm_leave_domain(dlm); dlm_complete_dlm_shutdown(dlm); @@ -580,11 +665,13 @@ void dlm_unregister_domain(struct dlm_ctxt *dlm) } EXPORT_SYMBOL_GPL(dlm_unregister_domain); -static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data) +static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) { struct dlm_query_join_request *query; enum dlm_query_join_response response; struct dlm_ctxt *dlm = NULL; + u8 nodenum; query = (struct dlm_query_join_request *) msg->buf; @@ -608,6 +695,28 @@ static int dlm_query_join_handler(struct o2net_msg *msg, 
u32 len, void *data) spin_lock(&dlm_domain_lock); dlm = __dlm_lookup_domain_full(query->domain, query->name_len); + if (!dlm) + goto unlock_respond; + + /* + * There is a small window where the joining node may not see the + * node(s) that just left but still part of the cluster. DISALLOW + * join request if joining node has different node map. + */ + nodenum=0; + while (nodenum < O2NM_MAX_NODES) { + if (test_bit(nodenum, dlm->domain_map)) { + if (!byte_test_bit(nodenum, query->node_map)) { + mlog(0, "disallow join as node %u does not " + "have node %u in its nodemap\n", + query->node_idx, nodenum); + response = JOIN_DISALLOW; + goto unlock_respond; + } + } + nodenum++; + } + /* Once the dlm ctxt is marked as leaving then we don't want * to be put in someone's domain map. * Also, explicitly disallow joining at certain troublesome @@ -626,15 +735,15 @@ static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data) /* Disallow parallel joins. */ response = JOIN_DISALLOW; } else if (dlm->reco.state & DLM_RECO_STATE_ACTIVE) { - mlog(ML_NOTICE, "node %u trying to join, but recovery " + mlog(0, "node %u trying to join, but recovery " "is ongoing.\n", bit); response = JOIN_DISALLOW; } else if (test_bit(bit, dlm->recovery_map)) { - mlog(ML_NOTICE, "node %u trying to join, but it " + mlog(0, "node %u trying to join, but it " "still needs recovery.\n", bit); response = JOIN_DISALLOW; } else if (test_bit(bit, dlm->domain_map)) { - mlog(ML_NOTICE, "node %u trying to join, but it " + mlog(0, "node %u trying to join, but it " "is still in the domain! 
needs recovery?\n", bit); response = JOIN_DISALLOW; @@ -649,6 +758,7 @@ static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data) spin_unlock(&dlm->spinlock); } +unlock_respond: spin_unlock(&dlm_domain_lock); respond: @@ -657,7 +767,8 @@ respond: return response; } -static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data) +static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) { struct dlm_assert_joined *assert; struct dlm_ctxt *dlm = NULL; @@ -694,7 +805,8 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data) return 0; } -static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data) +static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) { struct dlm_cancel_join *cancel; struct dlm_ctxt *dlm = NULL; @@ -796,6 +908,9 @@ static int dlm_request_join(struct dlm_ctxt *dlm, join_msg.name_len = strlen(dlm->name); memcpy(join_msg.domain, dlm->name, join_msg.name_len); + /* copy live node map to join message */ + byte_copymap(join_msg.node_map, dlm->live_nodes_map, O2NM_MAX_NODES); + status = o2net_send_message(DLM_QUERY_JOIN_MSG, DLM_MOD_KEY, &join_msg, sizeof(join_msg), node, &retval); if (status < 0 && status != -ENOPROTOOPT) { @@ -920,7 +1035,7 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm) mlog_entry("%p", dlm); - ctxt = kcalloc(1, sizeof(*ctxt), GFP_KERNEL); + ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL); if (!ctxt) { status = -ENOMEM; mlog_errno(status); @@ -1036,98 +1151,106 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm) status = o2net_register_handler(DLM_MASTER_REQUEST_MSG, dlm->key, sizeof(struct dlm_master_request), dlm_master_request_handler, - dlm, &dlm->dlm_domain_handlers); + dlm, NULL, &dlm->dlm_domain_handlers); if (status) goto bail; status = o2net_register_handler(DLM_ASSERT_MASTER_MSG, dlm->key, sizeof(struct dlm_assert_master), 
dlm_assert_master_handler, - dlm, &dlm->dlm_domain_handlers); + dlm, dlm_assert_master_post_handler, + &dlm->dlm_domain_handlers); if (status) goto bail; status = o2net_register_handler(DLM_CREATE_LOCK_MSG, dlm->key, sizeof(struct dlm_create_lock), dlm_create_lock_handler, - dlm, &dlm->dlm_domain_handlers); + dlm, NULL, &dlm->dlm_domain_handlers); if (status) goto bail; status = o2net_register_handler(DLM_CONVERT_LOCK_MSG, dlm->key, DLM_CONVERT_LOCK_MAX_LEN, dlm_convert_lock_handler, - dlm, &dlm->dlm_domain_handlers); + dlm, NULL, &dlm->dlm_domain_handlers); if (status) goto bail; status = o2net_register_handler(DLM_UNLOCK_LOCK_MSG, dlm->key, DLM_UNLOCK_LOCK_MAX_LEN, dlm_unlock_lock_handler, - dlm, &dlm->dlm_domain_handlers); + dlm, NULL, &dlm->dlm_domain_handlers); if (status) goto bail; status = o2net_register_handler(DLM_PROXY_AST_MSG, dlm->key, DLM_PROXY_AST_MAX_LEN, dlm_proxy_ast_handler, - dlm, &dlm->dlm_domain_handlers); + dlm, NULL, &dlm->dlm_domain_handlers); if (status) goto bail; status = o2net_register_handler(DLM_EXIT_DOMAIN_MSG, dlm->key, sizeof(struct dlm_exit_domain), dlm_exit_domain_handler, - dlm, &dlm->dlm_domain_handlers); + dlm, NULL, &dlm->dlm_domain_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_DEREF_LOCKRES_MSG, dlm->key, + sizeof(struct dlm_deref_lockres), + dlm_deref_lockres_handler, + dlm, NULL, &dlm->dlm_domain_handlers); if (status) goto bail; status = o2net_register_handler(DLM_MIGRATE_REQUEST_MSG, dlm->key, sizeof(struct dlm_migrate_request), dlm_migrate_request_handler, - dlm, &dlm->dlm_domain_handlers); + dlm, NULL, &dlm->dlm_domain_handlers); if (status) goto bail; status = o2net_register_handler(DLM_MIG_LOCKRES_MSG, dlm->key, DLM_MIG_LOCKRES_MAX_LEN, dlm_mig_lockres_handler, - dlm, &dlm->dlm_domain_handlers); + dlm, NULL, &dlm->dlm_domain_handlers); if (status) goto bail; status = o2net_register_handler(DLM_MASTER_REQUERY_MSG, dlm->key, sizeof(struct dlm_master_requery), dlm_master_requery_handler, 
- dlm, &dlm->dlm_domain_handlers); + dlm, NULL, &dlm->dlm_domain_handlers); if (status) goto bail; status = o2net_register_handler(DLM_LOCK_REQUEST_MSG, dlm->key, sizeof(struct dlm_lock_request), dlm_request_all_locks_handler, - dlm, &dlm->dlm_domain_handlers); + dlm, NULL, &dlm->dlm_domain_handlers); if (status) goto bail; status = o2net_register_handler(DLM_RECO_DATA_DONE_MSG, dlm->key, sizeof(struct dlm_reco_data_done), dlm_reco_data_done_handler, - dlm, &dlm->dlm_domain_handlers); + dlm, NULL, &dlm->dlm_domain_handlers); if (status) goto bail; status = o2net_register_handler(DLM_BEGIN_RECO_MSG, dlm->key, sizeof(struct dlm_begin_reco), dlm_begin_reco_handler, - dlm, &dlm->dlm_domain_handlers); + dlm, NULL, &dlm->dlm_domain_handlers); if (status) goto bail; status = o2net_register_handler(DLM_FINALIZE_RECO_MSG, dlm->key, sizeof(struct dlm_finalize_reco), dlm_finalize_reco_handler, - dlm, &dlm->dlm_domain_handlers); + dlm, NULL, &dlm->dlm_domain_handlers); if (status) goto bail; @@ -1141,6 +1264,8 @@ bail: static int dlm_join_domain(struct dlm_ctxt *dlm) { int status; + unsigned int backoff; + unsigned int total_backoff = 0; BUG_ON(!dlm); @@ -1172,18 +1297,27 @@ static int dlm_join_domain(struct dlm_ctxt *dlm) } do { - unsigned int backoff; status = dlm_try_to_join_domain(dlm); /* If we're racing another node to the join, then we * need to back off temporarily and let them * complete. */ +#define DLM_JOIN_TIMEOUT_MSECS 90000 if (status == -EAGAIN) { if (signal_pending(current)) { status = -ERESTARTSYS; goto bail; } + if (total_backoff > + msecs_to_jiffies(DLM_JOIN_TIMEOUT_MSECS)) { + status = -ERESTARTSYS; + mlog(ML_NOTICE, "Timed out joining dlm domain " + "%s after %u msecs\n", dlm->name, + jiffies_to_msecs(total_backoff)); + goto bail; + } + /* * <chip> After you! * <dale> No, after you! 
@@ -1193,6 +1327,7 @@ static int dlm_join_domain(struct dlm_ctxt *dlm) */ backoff = (unsigned int)(jiffies & 0x3); backoff *= DLM_DOMAIN_BACKOFF_MS; + total_backoff += backoff; mlog(0, "backoff %d\n", backoff); msleep(backoff); } @@ -1223,7 +1358,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, int i; struct dlm_ctxt *dlm = NULL; - dlm = kcalloc(1, sizeof(*dlm), GFP_KERNEL); + dlm = kzalloc(sizeof(*dlm), GFP_KERNEL); if (!dlm) { mlog_errno(-ENOMEM); goto leave; @@ -1421,21 +1556,21 @@ static int dlm_register_net_handlers(void) status = o2net_register_handler(DLM_QUERY_JOIN_MSG, DLM_MOD_KEY, sizeof(struct dlm_query_join_request), dlm_query_join_handler, - NULL, &dlm_join_handlers); + NULL, NULL, &dlm_join_handlers); if (status) goto bail; status = o2net_register_handler(DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY, sizeof(struct dlm_assert_joined), dlm_assert_joined_handler, - NULL, &dlm_join_handlers); + NULL, NULL, &dlm_join_handlers); if (status) goto bail; status = o2net_register_handler(DLM_CANCEL_JOIN_MSG, DLM_MOD_KEY, sizeof(struct dlm_cancel_join), dlm_cancel_join_handler, - NULL, &dlm_join_handlers); + NULL, NULL, &dlm_join_handlers); bail: if (status < 0) diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c index 941acf14e61..b7f0ba97a1a 100644 --- a/fs/ocfs2/dlm/dlmfs.c +++ b/fs/ocfs2/dlm/dlmfs.c @@ -176,7 +176,7 @@ static ssize_t dlmfs_file_read(struct file *filp, int bytes_left; ssize_t readlen; char *lvb_buf; - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = filp->f_path.dentry->d_inode; mlog(0, "inode %lu, count = %zu, *ppos = %llu\n", inode->i_ino, count, *ppos); @@ -220,7 +220,7 @@ static ssize_t dlmfs_file_write(struct file *filp, int bytes_left; ssize_t writelen; char *lvb_buf; - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = filp->f_path.dentry->d_inode; mlog(0, "inode %lu, count = %zu, *ppos = %llu\n", inode->i_ino, count, *ppos); diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c 
index 42a1b91979b..52578d907d9 100644 --- a/fs/ocfs2/dlm/dlmlock.c +++ b/fs/ocfs2/dlm/dlmlock.c @@ -163,6 +163,10 @@ static enum dlm_status dlmlock_master(struct dlm_ctxt *dlm, kick_thread = 1; } } + /* reduce the inflight count, this may result in the lockres + * being purged below during calc_usage */ + if (lock->ml.node == dlm->node_num) + dlm_lockres_drop_inflight_ref(dlm, res); spin_unlock(&res->spinlock); wake_up(&res->wq); @@ -408,13 +412,13 @@ struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie, struct dlm_lock *lock; int kernel_allocated = 0; - lock = kcalloc(1, sizeof(*lock), GFP_NOFS); + lock = kzalloc(sizeof(*lock), GFP_NOFS); if (!lock) return NULL; if (!lksb) { /* zero memory only if kernel-allocated */ - lksb = kcalloc(1, sizeof(*lksb), GFP_NOFS); + lksb = kzalloc(sizeof(*lksb), GFP_NOFS); if (!lksb) { kfree(lock); return NULL; @@ -437,7 +441,8 @@ struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie, * held on exit: none * returns: DLM_NORMAL, DLM_SYSERR, DLM_IVLOCKID, DLM_NOTQUEUED */ -int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data) +int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) { struct dlm_ctxt *dlm = data; struct dlm_create_lock *create = (struct dlm_create_lock *)msg->buf; diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 856012b4fa4..77e4e6169a0 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -99,9 +99,10 @@ static void dlm_mle_node_up(struct dlm_ctxt *dlm, int idx); static void dlm_assert_master_worker(struct dlm_work_item *item, void *data); -static int dlm_do_assert_master(struct dlm_ctxt *dlm, const char *lockname, - unsigned int namelen, void *nodemap, - u32 flags); +static int dlm_do_assert_master(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + void *nodemap, u32 flags); +static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data); static inline int dlm_mle_equal(struct dlm_ctxt *dlm, 
struct dlm_master_list_entry *mle, @@ -237,7 +238,8 @@ static int dlm_find_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry **mle, char *name, unsigned int namelen); -static int dlm_do_master_request(struct dlm_master_list_entry *mle, int to); +static int dlm_do_master_request(struct dlm_lock_resource *res, + struct dlm_master_list_entry *mle, int to); static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm, @@ -687,6 +689,7 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm, INIT_LIST_HEAD(&res->purge); atomic_set(&res->asts_reserved, 0); res->migration_pending = 0; + res->inflight_locks = 0; kref_init(&res->refs); @@ -700,6 +703,7 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm, res->last_used = 0; memset(res->lvb, 0, DLM_LVB_LEN); + memset(res->refmap, 0, sizeof(res->refmap)); } struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm, @@ -722,6 +726,42 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm, return res; } +void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + int new_lockres, + const char *file, + int line) +{ + if (!new_lockres) + assert_spin_locked(&res->spinlock); + + if (!test_bit(dlm->node_num, res->refmap)) { + BUG_ON(res->inflight_locks != 0); + dlm_lockres_set_refmap_bit(dlm->node_num, res); + } + res->inflight_locks++; + mlog(0, "%s:%.*s: inflight++: now %u\n", + dlm->name, res->lockname.len, res->lockname.name, + res->inflight_locks); +} + +void __dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + const char *file, + int line) +{ + assert_spin_locked(&res->spinlock); + + BUG_ON(res->inflight_locks == 0); + res->inflight_locks--; + mlog(0, "%s:%.*s: inflight--: now %u\n", + dlm->name, res->lockname.len, res->lockname.name, + res->inflight_locks); + if (res->inflight_locks == 0) + dlm_lockres_clear_refmap_bit(dlm->node_num, res); + wake_up(&res->wq); +} + /* * lookup a lock resource by name. * may already exist in the hashtable. 
@@ -752,6 +792,7 @@ struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm, unsigned int hash; int tries = 0; int bit, wait_on_recovery = 0; + int drop_inflight_if_nonlocal = 0; BUG_ON(!lockid); @@ -761,9 +802,30 @@ struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm, lookup: spin_lock(&dlm->spinlock); - tmpres = __dlm_lookup_lockres(dlm, lockid, namelen, hash); + tmpres = __dlm_lookup_lockres_full(dlm, lockid, namelen, hash); if (tmpres) { + int dropping_ref = 0; + + spin_lock(&tmpres->spinlock); + if (tmpres->owner == dlm->node_num) { + BUG_ON(tmpres->state & DLM_LOCK_RES_DROPPING_REF); + dlm_lockres_grab_inflight_ref(dlm, tmpres); + } else if (tmpres->state & DLM_LOCK_RES_DROPPING_REF) + dropping_ref = 1; + spin_unlock(&tmpres->spinlock); spin_unlock(&dlm->spinlock); + + /* wait until done messaging the master, drop our ref to allow + * the lockres to be purged, start over. */ + if (dropping_ref) { + spin_lock(&tmpres->spinlock); + __dlm_wait_on_lockres_flags(tmpres, DLM_LOCK_RES_DROPPING_REF); + spin_unlock(&tmpres->spinlock); + dlm_lockres_put(tmpres); + tmpres = NULL; + goto lookup; + } + mlog(0, "found in hash!\n"); if (res) dlm_lockres_put(res); @@ -793,6 +855,7 @@ lookup: spin_lock(&res->spinlock); dlm_change_lockres_owner(dlm, res, dlm->node_num); __dlm_insert_lockres(dlm, res); + dlm_lockres_grab_inflight_ref(dlm, res); spin_unlock(&res->spinlock); spin_unlock(&dlm->spinlock); /* lockres still marked IN_PROGRESS */ @@ -805,29 +868,40 @@ lookup: /* if we found a block, wait for lock to be mastered by another node */ blocked = dlm_find_mle(dlm, &mle, (char *)lockid, namelen); if (blocked) { + int mig; if (mle->type == DLM_MLE_MASTER) { mlog(ML_ERROR, "master entry for nonexistent lock!\n"); BUG(); - } else if (mle->type == DLM_MLE_MIGRATION) { - /* migration is in progress! */ - /* the good news is that we now know the - * "current" master (mle->master). 
*/ - + } + mig = (mle->type == DLM_MLE_MIGRATION); + /* if there is a migration in progress, let the migration + * finish before continuing. we can wait for the absence + * of the MIGRATION mle: either the migrate finished or + * one of the nodes died and the mle was cleaned up. + * if there is a BLOCK here, but it already has a master + * set, we are too late. the master does not have a ref + * for us in the refmap. detach the mle and drop it. + * either way, go back to the top and start over. */ + if (mig || mle->master != O2NM_MAX_NODES) { + BUG_ON(mig && mle->master == dlm->node_num); + /* we arrived too late. the master does not + * have a ref for us. retry. */ + mlog(0, "%s:%.*s: late on %s\n", + dlm->name, namelen, lockid, + mig ? "MIGRATION" : "BLOCK"); spin_unlock(&dlm->master_lock); - assert_spin_locked(&dlm->spinlock); - - /* set the lockres owner and hash it */ - spin_lock(&res->spinlock); - dlm_set_lockres_owner(dlm, res, mle->master); - __dlm_insert_lockres(dlm, res); - spin_unlock(&res->spinlock); spin_unlock(&dlm->spinlock); /* master is known, detach */ - dlm_mle_detach_hb_events(dlm, mle); + if (!mig) + dlm_mle_detach_hb_events(dlm, mle); dlm_put_mle(mle); mle = NULL; - goto wake_waiters; + /* this is lame, but we cant wait on either + * the mle or lockres waitqueue here */ + if (mig) + msleep(100); + goto lookup; } } else { /* go ahead and try to master lock on this node */ @@ -858,6 +932,13 @@ lookup: /* finally add the lockres to its hash bucket */ __dlm_insert_lockres(dlm, res); + /* since this lockres is new it doesnt not require the spinlock */ + dlm_lockres_grab_inflight_ref_new(dlm, res); + + /* if this node does not become the master make sure to drop + * this inflight reference below */ + drop_inflight_if_nonlocal = 1; + /* get an extra ref on the mle in case this is a BLOCK * if so, the creator of the BLOCK may try to put the last * ref at this time in the assert master handler, so we @@ -910,7 +991,7 @@ redo_request: ret = -EINVAL; 
dlm_node_iter_init(mle->vote_map, &iter); while ((nodenum = dlm_node_iter_next(&iter)) >= 0) { - ret = dlm_do_master_request(mle, nodenum); + ret = dlm_do_master_request(res, mle, nodenum); if (ret < 0) mlog_errno(ret); if (mle->master != O2NM_MAX_NODES) { @@ -960,6 +1041,8 @@ wait: wake_waiters: spin_lock(&res->spinlock); + if (res->owner != dlm->node_num && drop_inflight_if_nonlocal) + dlm_lockres_drop_inflight_ref(dlm, res); res->state &= ~DLM_LOCK_RES_IN_PROGRESS; spin_unlock(&res->spinlock); wake_up(&res->wq); @@ -998,7 +1081,7 @@ recheck: /* this will cause the master to re-assert across * the whole cluster, freeing up mles */ if (res->owner != dlm->node_num) { - ret = dlm_do_master_request(mle, res->owner); + ret = dlm_do_master_request(res, mle, res->owner); if (ret < 0) { /* give recovery a chance to run */ mlog(ML_ERROR, "link to %u went down?: %d\n", res->owner, ret); @@ -1062,6 +1145,8 @@ recheck: * now tell other nodes that I am * mastering this. */ mle->master = dlm->node_num; + /* ref was grabbed in get_lock_resource + * will be dropped in dlmlock_master */ assert = 1; sleep = 0; } @@ -1087,7 +1172,8 @@ recheck: (atomic_read(&mle->woken) == 1), timeo); if (res->owner == O2NM_MAX_NODES) { - mlog(0, "waiting again\n"); + mlog(0, "%s:%.*s: waiting again\n", dlm->name, + res->lockname.len, res->lockname.name); goto recheck; } mlog(0, "done waiting, master is %u\n", res->owner); @@ -1100,8 +1186,7 @@ recheck: m = dlm->node_num; mlog(0, "about to master %.*s here, this=%u\n", res->lockname.len, res->lockname.name, m); - ret = dlm_do_assert_master(dlm, res->lockname.name, - res->lockname.len, mle->vote_map, 0); + ret = dlm_do_assert_master(dlm, res, mle->vote_map, 0); if (ret) { /* This is a failure in the network path, * not in the response to the assert_master @@ -1117,6 +1202,8 @@ recheck: /* set the lockres owner */ spin_lock(&res->spinlock); + /* mastery reference obtained either during + * assert_master_handler or in get_lock_resource */ 
dlm_change_lockres_owner(dlm, res, m); spin_unlock(&res->spinlock); @@ -1283,7 +1370,8 @@ static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm, * */ -static int dlm_do_master_request(struct dlm_master_list_entry *mle, int to) +static int dlm_do_master_request(struct dlm_lock_resource *res, + struct dlm_master_list_entry *mle, int to) { struct dlm_ctxt *dlm = mle->dlm; struct dlm_master_request request; @@ -1339,6 +1427,9 @@ again: case DLM_MASTER_RESP_YES: set_bit(to, mle->response_map); mlog(0, "node %u is the master, response=YES\n", to); + mlog(0, "%s:%.*s: master node %u now knows I have a " + "reference\n", dlm->name, res->lockname.len, + res->lockname.name, to); mle->master = to; break; case DLM_MASTER_RESP_NO: @@ -1379,7 +1470,8 @@ out: * * if possible, TRIM THIS DOWN!!! */ -int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data) +int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) { u8 response = DLM_MASTER_RESP_MAYBE; struct dlm_ctxt *dlm = data; @@ -1417,10 +1509,11 @@ way_up_top: /* take care of the easy cases up front */ spin_lock(&res->spinlock); - if (res->state & DLM_LOCK_RES_RECOVERING) { + if (res->state & (DLM_LOCK_RES_RECOVERING| + DLM_LOCK_RES_MIGRATING)) { spin_unlock(&res->spinlock); mlog(0, "returning DLM_MASTER_RESP_ERROR since res is " - "being recovered\n"); + "being recovered/migrated\n"); response = DLM_MASTER_RESP_ERROR; if (mle) kmem_cache_free(dlm_mle_cache, mle); @@ -1428,8 +1521,10 @@ way_up_top: } if (res->owner == dlm->node_num) { + mlog(0, "%s:%.*s: setting bit %u in refmap\n", + dlm->name, namelen, name, request->node_idx); + dlm_lockres_set_refmap_bit(request->node_idx, res); spin_unlock(&res->spinlock); - // mlog(0, "this node is the master\n"); response = DLM_MASTER_RESP_YES; if (mle) kmem_cache_free(dlm_mle_cache, mle); @@ -1477,7 +1572,6 @@ way_up_top: mlog(0, "node %u is master, but trying to migrate to " "node %u.\n", tmpmle->master, tmpmle->new_master); if 
(tmpmle->master == dlm->node_num) { - response = DLM_MASTER_RESP_YES; mlog(ML_ERROR, "no owner on lockres, but this " "node is trying to migrate it to %u?!\n", tmpmle->new_master); @@ -1494,6 +1588,10 @@ way_up_top: * go back and clean the mles on any * other nodes */ dispatch_assert = 1; + dlm_lockres_set_refmap_bit(request->node_idx, res); + mlog(0, "%s:%.*s: setting bit %u in refmap\n", + dlm->name, namelen, name, + request->node_idx); } else response = DLM_MASTER_RESP_NO; } else { @@ -1607,17 +1705,24 @@ send_response: * can periodically run all locks owned by this node * and re-assert across the cluster... */ -static int dlm_do_assert_master(struct dlm_ctxt *dlm, const char *lockname, - unsigned int namelen, void *nodemap, - u32 flags) +int dlm_do_assert_master(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + void *nodemap, u32 flags) { struct dlm_assert_master assert; int to, tmpret; struct dlm_node_iter iter; int ret = 0; int reassert; + const char *lockname = res->lockname.name; + unsigned int namelen = res->lockname.len; BUG_ON(namelen > O2NM_MAX_NAME_LEN); + + spin_lock(&res->spinlock); + res->state |= DLM_LOCK_RES_SETREF_INPROG; + spin_unlock(&res->spinlock); + again: reassert = 0; @@ -1647,6 +1752,7 @@ again: mlog(0, "link to %d went down!\n", to); /* any nonzero status return will do */ ret = tmpret; + r = 0; } else if (r < 0) { /* ok, something horribly messed. kill thyself. 
*/ mlog(ML_ERROR,"during assert master of %.*s to %u, " @@ -1661,17 +1767,39 @@ again: spin_unlock(&dlm->master_lock); spin_unlock(&dlm->spinlock); BUG(); - } else if (r == EAGAIN) { + } + + if (r & DLM_ASSERT_RESPONSE_REASSERT && + !(r & DLM_ASSERT_RESPONSE_MASTERY_REF)) { + mlog(ML_ERROR, "%.*s: very strange, " + "master MLE but no lockres on %u\n", + namelen, lockname, to); + } + + if (r & DLM_ASSERT_RESPONSE_REASSERT) { mlog(0, "%.*s: node %u create mles on other " "nodes and requests a re-assert\n", namelen, lockname, to); reassert = 1; } + if (r & DLM_ASSERT_RESPONSE_MASTERY_REF) { + mlog(0, "%.*s: node %u has a reference to this " + "lockres, set the bit in the refmap\n", + namelen, lockname, to); + spin_lock(&res->spinlock); + dlm_lockres_set_refmap_bit(to, res); + spin_unlock(&res->spinlock); + } } if (reassert) goto again; + spin_lock(&res->spinlock); + res->state &= ~DLM_LOCK_RES_SETREF_INPROG; + spin_unlock(&res->spinlock); + wake_up(&res->wq); + return ret; } @@ -1684,7 +1812,8 @@ again: * * if possible, TRIM THIS DOWN!!! 
*/ -int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data) +int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) { struct dlm_ctxt *dlm = data; struct dlm_master_list_entry *mle = NULL; @@ -1693,7 +1822,7 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data) char *name; unsigned int namelen, hash; u32 flags; - int master_request = 0; + int master_request = 0, have_lockres_ref = 0; int ret = 0; if (!dlm_grab(dlm)) @@ -1851,6 +1980,7 @@ ok: spin_unlock(&mle->spinlock); if (res) { + int wake = 0; spin_lock(&res->spinlock); if (mle->type == DLM_MLE_MIGRATION) { mlog(0, "finishing off migration of lockres %.*s, " @@ -1858,12 +1988,16 @@ ok: res->lockname.len, res->lockname.name, dlm->node_num, mle->new_master); res->state &= ~DLM_LOCK_RES_MIGRATING; + wake = 1; dlm_change_lockres_owner(dlm, res, mle->new_master); BUG_ON(res->state & DLM_LOCK_RES_DIRTY); } else { dlm_change_lockres_owner(dlm, res, mle->master); } spin_unlock(&res->spinlock); + have_lockres_ref = 1; + if (wake) + wake_up(&res->wq); } /* master is known, detach if not already detached. @@ -1913,12 +2047,28 @@ ok: done: ret = 0; - if (res) - dlm_lockres_put(res); + if (res) { + spin_lock(&res->spinlock); + res->state |= DLM_LOCK_RES_SETREF_INPROG; + spin_unlock(&res->spinlock); + *ret_data = (void *)res; + } dlm_put(dlm); if (master_request) { mlog(0, "need to tell master to reassert\n"); - ret = EAGAIN; // positive. negative would shoot down the node. + /* positive. negative would shoot down the node. 
*/ + ret |= DLM_ASSERT_RESPONSE_REASSERT; + if (!have_lockres_ref) { + mlog(ML_ERROR, "strange, got assert from %u, MASTER " + "mle present here for %s:%.*s, but no lockres!\n", + assert->node_idx, dlm->name, namelen, name); + } + } + if (have_lockres_ref) { + /* let the master know we have a reference to the lockres */ + ret |= DLM_ASSERT_RESPONSE_MASTERY_REF; + mlog(0, "%s:%.*s: got assert from %u, need a ref\n", + dlm->name, namelen, name, assert->node_idx); } return ret; @@ -1929,17 +2079,31 @@ kill: __dlm_print_one_lock_resource(res); spin_unlock(&res->spinlock); spin_unlock(&dlm->spinlock); - dlm_lockres_put(res); + *ret_data = (void *)res; dlm_put(dlm); return -EINVAL; } +void dlm_assert_master_post_handler(int status, void *data, void *ret_data) +{ + struct dlm_lock_resource *res = (struct dlm_lock_resource *)ret_data; + + if (ret_data) { + spin_lock(&res->spinlock); + res->state &= ~DLM_LOCK_RES_SETREF_INPROG; + spin_unlock(&res->spinlock); + wake_up(&res->wq); + dlm_lockres_put(res); + } + return; +} + int dlm_dispatch_assert_master(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, int ignore_higher, u8 request_from, u32 flags) { struct dlm_work_item *item; - item = kcalloc(1, sizeof(*item), GFP_NOFS); + item = kzalloc(sizeof(*item), GFP_NOFS); if (!item) return -ENOMEM; @@ -2023,9 +2187,7 @@ static void dlm_assert_master_worker(struct dlm_work_item *item, void *data) * even if one or more nodes die */ mlog(0, "worker about to master %.*s here, this=%u\n", res->lockname.len, res->lockname.name, dlm->node_num); - ret = dlm_do_assert_master(dlm, res->lockname.name, - res->lockname.len, - nodemap, flags); + ret = dlm_do_assert_master(dlm, res, nodemap, flags); if (ret < 0) { /* no need to restart, we are done */ if (!dlm_is_host_down(ret)) @@ -2097,14 +2259,180 @@ static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm, return ret; } +/* + * DLM_DEREF_LOCKRES_MSG + */ + +int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) +{ 
+ struct dlm_deref_lockres deref; + int ret = 0, r; + const char *lockname; + unsigned int namelen; + + lockname = res->lockname.name; + namelen = res->lockname.len; + BUG_ON(namelen > O2NM_MAX_NAME_LEN); + + mlog(0, "%s:%.*s: sending deref to %d\n", + dlm->name, namelen, lockname, res->owner); + memset(&deref, 0, sizeof(deref)); + deref.node_idx = dlm->node_num; + deref.namelen = namelen; + memcpy(deref.name, lockname, namelen); + + ret = o2net_send_message(DLM_DEREF_LOCKRES_MSG, dlm->key, + &deref, sizeof(deref), res->owner, &r); + if (ret < 0) + mlog_errno(ret); + else if (r < 0) { + /* BAD. other node says I did not have a ref. */ + mlog(ML_ERROR,"while dropping ref on %s:%.*s " + "(master=%u) got %d.\n", dlm->name, namelen, + lockname, res->owner, r); + dlm_print_one_lock_resource(res); + BUG(); + } + return ret; +} + +int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) +{ + struct dlm_ctxt *dlm = data; + struct dlm_deref_lockres *deref = (struct dlm_deref_lockres *)msg->buf; + struct dlm_lock_resource *res = NULL; + char *name; + unsigned int namelen; + int ret = -EINVAL; + u8 node; + unsigned int hash; + struct dlm_work_item *item; + int cleared = 0; + int dispatch = 0; + + if (!dlm_grab(dlm)) + return 0; + + name = deref->name; + namelen = deref->namelen; + node = deref->node_idx; + + if (namelen > DLM_LOCKID_NAME_MAX) { + mlog(ML_ERROR, "Invalid name length!"); + goto done; + } + if (deref->node_idx >= O2NM_MAX_NODES) { + mlog(ML_ERROR, "Invalid node number: %u\n", node); + goto done; + } + + hash = dlm_lockid_hash(name, namelen); + + spin_lock(&dlm->spinlock); + res = __dlm_lookup_lockres_full(dlm, name, namelen, hash); + if (!res) { + spin_unlock(&dlm->spinlock); + mlog(ML_ERROR, "%s:%.*s: bad lockres name\n", + dlm->name, namelen, name); + goto done; + } + spin_unlock(&dlm->spinlock); + + spin_lock(&res->spinlock); + if (res->state & DLM_LOCK_RES_SETREF_INPROG) + dispatch = 1; + else { + BUG_ON(res->state & 
DLM_LOCK_RES_DROPPING_REF); + if (test_bit(node, res->refmap)) { + dlm_lockres_clear_refmap_bit(node, res); + cleared = 1; + } + } + spin_unlock(&res->spinlock); + + if (!dispatch) { + if (cleared) + dlm_lockres_calc_usage(dlm, res); + else { + mlog(ML_ERROR, "%s:%.*s: node %u trying to drop ref " + "but it is already dropped!\n", dlm->name, + res->lockname.len, res->lockname.name, node); + __dlm_print_one_lock_resource(res); + } + ret = 0; + goto done; + } + + item = kzalloc(sizeof(*item), GFP_NOFS); + if (!item) { + ret = -ENOMEM; + mlog_errno(ret); + goto done; + } + + dlm_init_work_item(dlm, item, dlm_deref_lockres_worker, NULL); + item->u.dl.deref_res = res; + item->u.dl.deref_node = node; + + spin_lock(&dlm->work_lock); + list_add_tail(&item->list, &dlm->work_list); + spin_unlock(&dlm->work_lock); + + queue_work(dlm->dlm_worker, &dlm->dispatched_work); + return 0; + +done: + if (res) + dlm_lockres_put(res); + dlm_put(dlm); + + return ret; +} + +static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data) +{ + struct dlm_ctxt *dlm; + struct dlm_lock_resource *res; + u8 node; + u8 cleared = 0; + + dlm = item->dlm; + res = item->u.dl.deref_res; + node = item->u.dl.deref_node; + + spin_lock(&res->spinlock); + BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF); + if (test_bit(node, res->refmap)) { + __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG); + dlm_lockres_clear_refmap_bit(node, res); + cleared = 1; + } + spin_unlock(&res->spinlock); + + if (cleared) { + mlog(0, "%s:%.*s node %u ref dropped in dispatch\n", + dlm->name, res->lockname.len, res->lockname.name, node); + dlm_lockres_calc_usage(dlm, res); + } else { + mlog(ML_ERROR, "%s:%.*s: node %u trying to drop ref " + "but it is already dropped!\n", dlm->name, + res->lockname.len, res->lockname.name, node); + __dlm_print_one_lock_resource(res); + } + + dlm_lockres_put(res); +} + /* * DLM_MIGRATE_LOCKRES */ -int dlm_migrate_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, - 
u8 target) +static int dlm_migrate_lockres(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + u8 target) { struct dlm_master_list_entry *mle = NULL; struct dlm_master_list_entry *oldmle = NULL; @@ -2116,7 +2444,7 @@ int dlm_migrate_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, struct list_head *queue, *iter; int i; struct dlm_lock *lock; - int empty = 1; + int empty = 1, wake = 0; if (!dlm_grab(dlm)) return -EINVAL; @@ -2241,6 +2569,7 @@ int dlm_migrate_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, res->lockname.name, target); spin_lock(&res->spinlock); res->state &= ~DLM_LOCK_RES_MIGRATING; + wake = 1; spin_unlock(&res->spinlock); ret = -EINVAL; } @@ -2268,6 +2597,9 @@ fail: * the lockres */ + /* now that remote nodes are spinning on the MIGRATING flag, + * ensure that all assert_master work is flushed. */ + flush_workqueue(dlm->dlm_worker); /* get an extra reference on the mle. * otherwise the assert_master from the new @@ -2296,6 +2628,7 @@ fail: dlm_put_mle_inuse(mle); spin_lock(&res->spinlock); res->state &= ~DLM_LOCK_RES_MIGRATING; + wake = 1; spin_unlock(&res->spinlock); goto leave; } @@ -2322,7 +2655,8 @@ fail: res->owner == target) break; - mlog(0, "timed out during migration\n"); + mlog(0, "%s:%.*s: timed out during migration\n", + dlm->name, res->lockname.len, res->lockname.name); /* avoid hang during shutdown when migrating lockres * to a node which also goes down */ if (dlm_is_node_dead(dlm, target)) { @@ -2330,20 +2664,20 @@ fail: "target %u is no longer up, restarting\n", dlm->name, res->lockname.len, res->lockname.name, target); - ret = -ERESTARTSYS; + ret = -EINVAL; + /* migration failed, detach and clean up mle */ + dlm_mle_detach_hb_events(dlm, mle); + dlm_put_mle(mle); + dlm_put_mle_inuse(mle); + spin_lock(&res->spinlock); + res->state &= ~DLM_LOCK_RES_MIGRATING; + wake = 1; + spin_unlock(&res->spinlock); + goto leave; } - } - if (ret == -ERESTARTSYS) { - /* migration failed, detach and clean up mle */ - 
dlm_mle_detach_hb_events(dlm, mle); - dlm_put_mle(mle); - dlm_put_mle_inuse(mle); - spin_lock(&res->spinlock); - res->state &= ~DLM_LOCK_RES_MIGRATING; - spin_unlock(&res->spinlock); - goto leave; - } - /* TODO: if node died: stop, clean up, return error */ + } else + mlog(0, "%s:%.*s: caught signal during migration\n", + dlm->name, res->lockname.len, res->lockname.name); } /* all done, set the owner, clear the flag */ @@ -2366,6 +2700,11 @@ leave: if (ret < 0) dlm_kick_thread(dlm, res); + /* wake up waiters if the MIGRATING flag got set + * but migration failed */ + if (wake) + wake_up(&res->wq); + /* TODO: cleanup */ if (mres) free_page((unsigned long)mres); @@ -2376,6 +2715,53 @@ leave: return ret; } +#define DLM_MIGRATION_RETRY_MS 100 + +/* Should be called only after beginning the domain leave process. + * There should not be any remaining locks on nonlocal lock resources, + * and there should be no local locks left on locally mastered resources. + * + * Called with the dlm spinlock held, may drop it to do migration, but + * will re-acquire before exit. + * + * Returns: 1 if dlm->spinlock was dropped/retaken, 0 if never dropped */ +int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) +{ + int ret; + int lock_dropped = 0; + + if (res->owner != dlm->node_num) { + if (!__dlm_lockres_unused(res)) { + mlog(ML_ERROR, "%s:%.*s: this node is not master, " + "trying to free this but locks remain\n", + dlm->name, res->lockname.len, res->lockname.name); + } + goto leave; + } + + /* Wheee! Migrate lockres here! Will sleep so drop spinlock. 
*/ + spin_unlock(&dlm->spinlock); + lock_dropped = 1; + while (1) { + ret = dlm_migrate_lockres(dlm, res, O2NM_MAX_NODES); + if (ret >= 0) + break; + if (ret == -ENOTEMPTY) { + mlog(ML_ERROR, "lockres %.*s still has local locks!\n", + res->lockname.len, res->lockname.name); + BUG(); + } + + mlog(0, "lockres %.*s: migrate failed, " + "retrying\n", res->lockname.len, + res->lockname.name); + msleep(DLM_MIGRATION_RETRY_MS); + } + spin_lock(&dlm->spinlock); +leave: + return lock_dropped; +} + int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock) { int ret; @@ -2405,7 +2791,8 @@ static int dlm_migration_can_proceed(struct dlm_ctxt *dlm, return can_proceed; } -int dlm_lockres_is_dirty(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) +static int dlm_lockres_is_dirty(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) { int ret; spin_lock(&res->spinlock); @@ -2434,8 +2821,15 @@ static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm, __dlm_lockres_reserve_ast(res); spin_unlock(&res->spinlock); - /* now flush all the pending asts.. 
hang out for a bit */ + /* now flush all the pending asts */ dlm_kick_thread(dlm, res); + /* before waiting on DIRTY, block processes which may + * try to dirty the lockres before MIGRATING is set */ + spin_lock(&res->spinlock); + BUG_ON(res->state & DLM_LOCK_RES_BLOCK_DIRTY); + res->state |= DLM_LOCK_RES_BLOCK_DIRTY; + spin_unlock(&res->spinlock); + /* now wait on any pending asts and the DIRTY state */ wait_event(dlm->ast_wq, !dlm_lockres_is_dirty(dlm, res)); dlm_lockres_release_ast(dlm, res); @@ -2461,6 +2855,13 @@ again: mlog(0, "trying again...\n"); goto again; } + /* now that we are sure the MIGRATING state is there, drop + * the unneded state which blocked threads trying to DIRTY */ + spin_lock(&res->spinlock); + BUG_ON(!(res->state & DLM_LOCK_RES_BLOCK_DIRTY)); + BUG_ON(!(res->state & DLM_LOCK_RES_MIGRATING)); + res->state &= ~DLM_LOCK_RES_BLOCK_DIRTY; + spin_unlock(&res->spinlock); /* did the target go down or die? */ spin_lock(&dlm->spinlock); @@ -2490,7 +2891,7 @@ static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm, { struct list_head *iter, *iter2; struct list_head *queue = &res->granted; - int i; + int i, bit; struct dlm_lock *lock; assert_spin_locked(&res->spinlock); @@ -2508,12 +2909,28 @@ static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm, BUG_ON(!list_empty(&lock->bast_list)); BUG_ON(lock->ast_pending); BUG_ON(lock->bast_pending); + dlm_lockres_clear_refmap_bit(lock->ml.node, res); list_del_init(&lock->list); dlm_lock_put(lock); } } queue++; } + bit = 0; + while (1) { + bit = find_next_bit(res->refmap, O2NM_MAX_NODES, bit); + if (bit >= O2NM_MAX_NODES) + break; + /* do not clear the local node reference, if there is a + * process holding this, let it drop the ref itself */ + if (bit != dlm->node_num) { + mlog(0, "%s:%.*s: node %u had a ref to this " + "migrating lockres, clearing\n", dlm->name, + res->lockname.len, res->lockname.name, bit); + dlm_lockres_clear_refmap_bit(bit, res); + } + bit++; + } } /* for now this is not too 
intelligent. we will @@ -2601,6 +3018,16 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm, mlog(0, "migrate request (node %u) returned %d!\n", nodenum, status); ret = status; + } else if (status == DLM_MIGRATE_RESPONSE_MASTERY_REF) { + /* during the migration request we short-circuited + * the mastery of the lockres. make sure we have + * a mastery ref for nodenum */ + mlog(0, "%s:%.*s: need ref for node %u\n", + dlm->name, res->lockname.len, res->lockname.name, + nodenum); + spin_lock(&res->spinlock); + dlm_lockres_set_refmap_bit(nodenum, res); + spin_unlock(&res->spinlock); } } @@ -2619,7 +3046,8 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm, * we will have no mle in the list to start with. now we can add an mle for * the migration and this should be the only one found for those scanning the * list. */ -int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data) +int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) { struct dlm_ctxt *dlm = data; struct dlm_lock_resource *res = NULL; @@ -2745,7 +3173,13 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm, /* remove it from the list so that only one * mle will be found */ list_del_init(&tmp->list); - __dlm_mle_detach_hb_events(dlm, mle); + /* this was obviously WRONG. mle is uninited here. should be tmp. */ + __dlm_mle_detach_hb_events(dlm, tmp); + ret = DLM_MIGRATE_RESPONSE_MASTERY_REF; + mlog(0, "%s:%.*s: master=%u, newmaster=%u, " + "telling master to get ref for cleared out mle " + "during migration\n", dlm->name, namelen, name, + master, new_master); } spin_unlock(&tmp->spinlock); } @@ -2753,6 +3187,8 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm, /* now add a migration mle to the tail of the list */ dlm_init_mle(mle, DLM_MLE_MIGRATION, dlm, res, name, namelen); mle->new_master = new_master; + /* the new master will be sending an assert master for this. 
+ * at that point we will get the refmap reference */ mle->master = master; /* do this for consistency with other mle types */ set_bit(new_master, mle->maybe_map); @@ -2902,6 +3338,13 @@ int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, clear_bit(dlm->node_num, iter.node_map); spin_unlock(&dlm->spinlock); + /* ownership of the lockres is changing. account for the + * mastery reference here since old_master will briefly have + * a reference after the migration completes */ + spin_lock(&res->spinlock); + dlm_lockres_set_refmap_bit(old_master, res); + spin_unlock(&res->spinlock); + mlog(0, "now time to do a migrate request to other nodes\n"); ret = dlm_do_migrate_request(dlm, res, old_master, dlm->node_num, &iter); @@ -2914,8 +3357,7 @@ int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, res->lockname.len, res->lockname.name); /* this call now finishes out the nodemap * even if one or more nodes die */ - ret = dlm_do_assert_master(dlm, res->lockname.name, - res->lockname.len, iter.node_map, + ret = dlm_do_assert_master(dlm, res, iter.node_map, DLM_ASSERT_MASTER_FINISH_MIGRATION); if (ret < 0) { /* no longer need to retry. all living nodes contacted. 
*/ @@ -2927,8 +3369,7 @@ int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, set_bit(old_master, iter.node_map); mlog(0, "doing assert master of %.*s back to %u\n", res->lockname.len, res->lockname.name, old_master); - ret = dlm_do_assert_master(dlm, res->lockname.name, - res->lockname.len, iter.node_map, + ret = dlm_do_assert_master(dlm, res, iter.node_map, DLM_ASSERT_MASTER_FINISH_MIGRATION); if (ret < 0) { mlog(0, "assert master to original master failed " diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index fb3e2b0817f..6d4a83d5015 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -163,9 +163,6 @@ void dlm_dispatch_work(struct work_struct *work) dlm_workfunc_t *workfunc; int tot=0; - if (!dlm_joined(dlm)) - return; - spin_lock(&dlm->work_lock); list_splice_init(&dlm->work_list, &tmp_list); spin_unlock(&dlm->work_lock); @@ -757,7 +754,7 @@ static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node) } BUG_ON(num == dead_node); - ndata = kcalloc(1, sizeof(*ndata), GFP_NOFS); + ndata = kzalloc(sizeof(*ndata), GFP_NOFS); if (!ndata) { dlm_destroy_recovery_area(dlm, dead_node); return -ENOMEM; @@ -821,7 +818,8 @@ static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from, } -int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data) +int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) { struct dlm_ctxt *dlm = data; struct dlm_lock_request *lr = (struct dlm_lock_request *)msg->buf; @@ -842,7 +840,7 @@ int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data) } BUG_ON(lr->dead_node != dlm->reco.dead_node); - item = kcalloc(1, sizeof(*item), GFP_NOFS); + item = kzalloc(sizeof(*item), GFP_NOFS); if (!item) { dlm_put(dlm); return -ENOMEM; @@ -978,7 +976,8 @@ static int dlm_send_all_done_msg(struct dlm_ctxt *dlm, u8 dead_node, u8 send_to) } -int dlm_reco_data_done_handler(struct o2net_msg *msg, 
u32 len, void *data) +int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) { struct dlm_ctxt *dlm = data; struct dlm_reco_data_done *done = (struct dlm_reco_data_done *)msg->buf; @@ -1129,6 +1128,11 @@ static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm, if (total_locks == mres_total_locks) mres->flags |= DLM_MRES_ALL_DONE; + mlog(0, "%s:%.*s: sending mig lockres (%s) to %u\n", + dlm->name, res->lockname.len, res->lockname.name, + orig_flags & DLM_MRES_MIGRATION ? "migrate" : "recovery", + send_to); + /* send it */ ret = o2net_send_message(DLM_MIG_LOCKRES_MSG, dlm->key, mres, sz, send_to, &status); @@ -1213,6 +1217,34 @@ static int dlm_add_lock_to_array(struct dlm_lock *lock, return 0; } +static void dlm_add_dummy_lock(struct dlm_ctxt *dlm, + struct dlm_migratable_lockres *mres) +{ + struct dlm_lock dummy; + memset(&dummy, 0, sizeof(dummy)); + dummy.ml.cookie = 0; + dummy.ml.type = LKM_IVMODE; + dummy.ml.convert_type = LKM_IVMODE; + dummy.ml.highest_blocked = LKM_IVMODE; + dummy.lksb = NULL; + dummy.ml.node = dlm->node_num; + dlm_add_lock_to_array(&dummy, mres, DLM_BLOCKED_LIST); +} + +static inline int dlm_is_dummy_lock(struct dlm_ctxt *dlm, + struct dlm_migratable_lock *ml, + u8 *nodenum) +{ + if (unlikely(ml->cookie == 0 && + ml->type == LKM_IVMODE && + ml->convert_type == LKM_IVMODE && + ml->highest_blocked == LKM_IVMODE && + ml->list == DLM_BLOCKED_LIST)) { + *nodenum = ml->node; + return 1; + } + return 0; +} int dlm_send_one_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, struct dlm_migratable_lockres *mres, @@ -1260,6 +1292,14 @@ int dlm_send_one_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, goto error; } } + if (total_locks == 0) { + /* send a dummy lock to indicate a mastery reference only */ + mlog(0, "%s:%.*s: sending dummy lock to %u, %s\n", + dlm->name, res->lockname.len, res->lockname.name, + send_to, flags & DLM_MRES_RECOVERY ? 
"recovery" : + "migration"); + dlm_add_dummy_lock(dlm, mres); + } /* flush any remaining locks */ ret = dlm_send_mig_lockres_msg(dlm, mres, send_to, res, total_locks); if (ret < 0) @@ -1293,7 +1333,8 @@ error: * do we spin? returning an error only delays the problem really */ -int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data) +int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) { struct dlm_ctxt *dlm = data; struct dlm_migratable_lockres *mres = @@ -1323,7 +1364,7 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data) ret = -ENOMEM; buf = kmalloc(be16_to_cpu(msg->data_len), GFP_NOFS); - item = kcalloc(1, sizeof(*item), GFP_NOFS); + item = kzalloc(sizeof(*item), GFP_NOFS); if (!buf || !item) goto leave; @@ -1382,17 +1423,21 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data) spin_lock(&res->spinlock); res->state &= ~DLM_LOCK_RES_IN_PROGRESS; spin_unlock(&res->spinlock); + wake_up(&res->wq); /* add an extra ref for just-allocated lockres * otherwise the lockres will be purged immediately */ dlm_lockres_get(res); - } /* at this point we have allocated everything we need, * and we have a hashed lockres with an extra ref and * the proper res->state flags. */ ret = 0; + spin_lock(&res->spinlock); + /* drop this either when master requery finds a different master + * or when a lock is added by the recovery worker */ + dlm_lockres_grab_inflight_ref(dlm, res); if (mres->master == DLM_LOCK_RES_OWNER_UNKNOWN) { /* migration cannot have an unknown master */ BUG_ON(!(mres->flags & DLM_MRES_RECOVERY)); @@ -1400,10 +1445,11 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data) "unknown owner.. 
will need to requery: " "%.*s\n", mres->lockname_len, mres->lockname); } else { - spin_lock(&res->spinlock); + /* take a reference now to pin the lockres, drop it + * when locks are added in the worker */ dlm_change_lockres_owner(dlm, res, dlm->node_num); - spin_unlock(&res->spinlock); } + spin_unlock(&res->spinlock); /* queue up work for dlm_mig_lockres_worker */ dlm_grab(dlm); /* get an extra ref for the work item */ @@ -1459,6 +1505,9 @@ again: "this node will take it.\n", res->lockname.len, res->lockname.name); } else { + spin_lock(&res->spinlock); + dlm_lockres_drop_inflight_ref(dlm, res); + spin_unlock(&res->spinlock); mlog(0, "master needs to respond to sender " "that node %u still owns %.*s\n", real_master, res->lockname.len, @@ -1578,7 +1627,8 @@ int dlm_do_master_requery(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, /* this function cannot error, so unless the sending * or receiving of the message failed, the owner can * be trusted */ -int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data) +int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) { struct dlm_ctxt *dlm = data; struct dlm_master_requery *req = (struct dlm_master_requery *)msg->buf; @@ -1660,21 +1710,38 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm, { struct dlm_migratable_lock *ml; struct list_head *queue; + struct list_head *tmpq = NULL; struct dlm_lock *newlock = NULL; struct dlm_lockstatus *lksb = NULL; int ret = 0; - int i, bad; + int i, j, bad; struct list_head *iter; struct dlm_lock *lock = NULL; + u8 from = O2NM_MAX_NODES; + unsigned int added = 0; mlog(0, "running %d locks for this lockres\n", mres->num_locks); for (i=0; i<mres->num_locks; i++) { ml = &(mres->ml[i]); + + if (dlm_is_dummy_lock(dlm, ml, &from)) { + /* placeholder, just need to set the refmap bit */ + BUG_ON(mres->num_locks != 1); + mlog(0, "%s:%.*s: dummy lock for %u\n", + dlm->name, mres->lockname_len, mres->lockname, + from); + 
spin_lock(&res->spinlock); + dlm_lockres_set_refmap_bit(from, res); + spin_unlock(&res->spinlock); + added++; + break; + } BUG_ON(ml->highest_blocked != LKM_IVMODE); newlock = NULL; lksb = NULL; queue = dlm_list_num_to_pointer(res, ml->list); + tmpq = NULL; /* if the lock is for the local node it needs to * be moved to the proper location within the queue. @@ -1684,11 +1751,16 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm, BUG_ON(!(mres->flags & DLM_MRES_MIGRATION)); spin_lock(&res->spinlock); - list_for_each(iter, queue) { - lock = list_entry (iter, struct dlm_lock, list); - if (lock->ml.cookie != ml->cookie) - lock = NULL; - else + for (j = DLM_GRANTED_LIST; j <= DLM_BLOCKED_LIST; j++) { + tmpq = dlm_list_idx_to_ptr(res, j); + list_for_each(iter, tmpq) { + lock = list_entry (iter, struct dlm_lock, list); + if (lock->ml.cookie != ml->cookie) + lock = NULL; + else + break; + } + if (lock) break; } @@ -1698,12 +1770,20 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm, u64 c = ml->cookie; mlog(ML_ERROR, "could not find local lock " "with cookie %u:%llu!\n", - dlm_get_lock_cookie_node(c), - dlm_get_lock_cookie_seq(c)); + dlm_get_lock_cookie_node(be64_to_cpu(c)), + dlm_get_lock_cookie_seq(be64_to_cpu(c))); + __dlm_print_one_lock_resource(res); BUG(); } BUG_ON(lock->ml.node != ml->node); + if (tmpq != queue) { + mlog(0, "lock was on %u instead of %u for %.*s\n", + j, ml->list, res->lockname.len, res->lockname.name); + spin_unlock(&res->spinlock); + continue; + } + /* see NOTE above about why we do not update * to match the master here */ @@ -1711,6 +1791,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm, /* do not alter lock refcount. switching lists. 
*/ list_move_tail(&lock->list, queue); spin_unlock(&res->spinlock); + added++; mlog(0, "just reordered a local lock!\n"); continue; @@ -1799,14 +1880,14 @@ skip_lvb: mlog(ML_ERROR, "%s:%.*s: %u:%llu: lock already " "exists on this lockres!\n", dlm->name, res->lockname.len, res->lockname.name, - dlm_get_lock_cookie_node(c), - dlm_get_lock_cookie_seq(c)); + dlm_get_lock_cookie_node(be64_to_cpu(c)), + dlm_get_lock_cookie_seq(be64_to_cpu(c))); mlog(ML_NOTICE, "sent lock: type=%d, conv=%d, " "node=%u, cookie=%u:%llu, queue=%d\n", ml->type, ml->convert_type, ml->node, - dlm_get_lock_cookie_node(ml->cookie), - dlm_get_lock_cookie_seq(ml->cookie), + dlm_get_lock_cookie_node(be64_to_cpu(ml->cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(ml->cookie)), ml->list); __dlm_print_one_lock_resource(res); @@ -1817,12 +1898,22 @@ skip_lvb: if (!bad) { dlm_lock_get(newlock); list_add_tail(&newlock->list, queue); + mlog(0, "%s:%.*s: added lock for node %u, " + "setting refmap bit\n", dlm->name, + res->lockname.len, res->lockname.name, ml->node); + dlm_lockres_set_refmap_bit(ml->node, res); + added++; } spin_unlock(&res->spinlock); } mlog(0, "done running all the locks\n"); leave: + /* balance the ref taken when the work was queued */ + spin_lock(&res->spinlock); + dlm_lockres_drop_inflight_ref(dlm, res); + spin_unlock(&res->spinlock); + if (ret < 0) { mlog_errno(ret); if (newlock) @@ -1935,9 +2026,11 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm, if (res->owner == dead_node) { list_del_init(&res->recovering); spin_lock(&res->spinlock); + /* new_master has our reference from + * the lock state sent during recovery */ dlm_change_lockres_owner(dlm, res, new_master); res->state &= ~DLM_LOCK_RES_RECOVERING; - if (!__dlm_lockres_unused(res)) + if (__dlm_lockres_has_locks(res)) __dlm_dirty_lockres(dlm, res); spin_unlock(&res->spinlock); wake_up(&res->wq); @@ -1977,9 +2070,11 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm, dlm_lockres_put(res); } 
spin_lock(&res->spinlock); + /* new_master has our reference from + * the lock state sent during recovery */ dlm_change_lockres_owner(dlm, res, new_master); res->state &= ~DLM_LOCK_RES_RECOVERING; - if (!__dlm_lockres_unused(res)) + if (__dlm_lockres_has_locks(res)) __dlm_dirty_lockres(dlm, res); spin_unlock(&res->spinlock); wake_up(&res->wq); @@ -2048,6 +2143,7 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm, { struct list_head *iter, *tmpiter; struct dlm_lock *lock; + unsigned int freed = 0; /* this node is the lockres master: * 1) remove any stale locks for the dead node @@ -2062,6 +2158,7 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm, if (lock->ml.node == dead_node) { list_del_init(&lock->list); dlm_lock_put(lock); + freed++; } } list_for_each_safe(iter, tmpiter, &res->converting) { @@ -2069,6 +2166,7 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm, if (lock->ml.node == dead_node) { list_del_init(&lock->list); dlm_lock_put(lock); + freed++; } } list_for_each_safe(iter, tmpiter, &res->blocked) { @@ -2076,9 +2174,23 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm, if (lock->ml.node == dead_node) { list_del_init(&lock->list); dlm_lock_put(lock); + freed++; } } + if (freed) { + mlog(0, "%s:%.*s: freed %u locks for dead node %u, " + "dropping ref from lockres\n", dlm->name, + res->lockname.len, res->lockname.name, freed, dead_node); + BUG_ON(!test_bit(dead_node, res->refmap)); + dlm_lockres_clear_refmap_bit(dead_node, res); + } else if (test_bit(dead_node, res->refmap)) { + mlog(0, "%s:%.*s: dead node %u had a ref, but had " + "no locks and had not purged before dying\n", dlm->name, + res->lockname.len, res->lockname.name, dead_node); + dlm_lockres_clear_refmap_bit(dead_node, res); + } + /* do not kick thread yet */ __dlm_dirty_lockres(dlm, res); } @@ -2141,9 +2253,21 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node) spin_lock(&res->spinlock); /* zero the lvb if necessary */ dlm_revalidate_lvb(dlm, 
res, dead_node); - if (res->owner == dead_node) + if (res->owner == dead_node) { + if (res->state & DLM_LOCK_RES_DROPPING_REF) + mlog(0, "%s:%.*s: owned by " + "dead node %u, this node was " + "dropping its ref when it died. " + "continue, dropping the flag.\n", + dlm->name, res->lockname.len, + res->lockname.name, dead_node); + + /* the wake_up for this will happen when the + * RECOVERING flag is dropped later */ + res->state &= ~DLM_LOCK_RES_DROPPING_REF; + dlm_move_lockres_to_recovery_list(dlm, res); - else if (res->owner == dlm->node_num) { + } else if (res->owner == dlm->node_num) { dlm_free_dead_locks(dlm, res, dead_node); __dlm_lockres_calc_usage(dlm, res); } @@ -2480,7 +2604,8 @@ retry: return ret; } -int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data) +int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) { struct dlm_ctxt *dlm = data; struct dlm_begin_reco *br = (struct dlm_begin_reco *)msg->buf; @@ -2608,7 +2733,8 @@ stage2: return ret; } -int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data) +int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) { struct dlm_ctxt *dlm = data; struct dlm_finalize_reco *fr = (struct dlm_finalize_reco *)msg->buf; diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index 0c822f3ffb0..8ffa0916eb8 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -54,9 +54,6 @@ #include "cluster/masklog.h" static int dlm_thread(void *data); -static void dlm_purge_lockres_now(struct dlm_ctxt *dlm, - struct dlm_lock_resource *lockres); - static void dlm_flush_asts(struct dlm_ctxt *dlm); #define dlm_lock_is_remote(dlm, lock) ((lock)->ml.node != (dlm)->node_num) @@ -82,14 +79,33 @@ repeat: current->state = TASK_RUNNING; } - -int __dlm_lockres_unused(struct dlm_lock_resource *res) +int __dlm_lockres_has_locks(struct dlm_lock_resource *res) { if (list_empty(&res->granted) && 
list_empty(&res->converting) && - list_empty(&res->blocked) && - list_empty(&res->dirty)) - return 1; + list_empty(&res->blocked)) + return 0; + return 1; +} + +/* "unused": the lockres has no locks, is not on the dirty list, + * has no inflight locks (in the gap between mastery and acquiring + * the first lock), and has no bits in its refmap. + * truly ready to be freed. */ +int __dlm_lockres_unused(struct dlm_lock_resource *res) +{ + if (!__dlm_lockres_has_locks(res) && + (list_empty(&res->dirty) && !(res->state & DLM_LOCK_RES_DIRTY))) { + /* try not to scan the bitmap unless the first two + * conditions are already true */ + int bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0); + if (bit >= O2NM_MAX_NODES) { + /* since the bit for dlm->node_num is not + * set, inflight_locks better be zero */ + BUG_ON(res->inflight_locks != 0); + return 1; + } + } return 0; } @@ -106,46 +122,21 @@ void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm, assert_spin_locked(&res->spinlock); if (__dlm_lockres_unused(res)){ - /* For now, just keep any resource we master */ - if (res->owner == dlm->node_num) - { - if (!list_empty(&res->purge)) { - mlog(0, "we master %s:%.*s, but it is on " - "the purge list. Removing\n", - dlm->name, res->lockname.len, - res->lockname.name); - list_del_init(&res->purge); - dlm->purge_count--; - } - return; - } - if (list_empty(&res->purge)) { - mlog(0, "putting lockres %.*s from purge list\n", - res->lockname.len, res->lockname.name); + mlog(0, "putting lockres %.*s:%p onto purge list\n", + res->lockname.len, res->lockname.name, res); res->last_used = jiffies; + dlm_lockres_get(res); list_add_tail(&res->purge, &dlm->purge_list); dlm->purge_count++; - - /* if this node is not the owner, there is - * no way to keep track of who the owner could be. - * unhash it to avoid serious problems. 
*/ - if (res->owner != dlm->node_num) { - mlog(0, "%s:%.*s: doing immediate " - "purge of lockres owned by %u\n", - dlm->name, res->lockname.len, - res->lockname.name, res->owner); - - dlm_purge_lockres_now(dlm, res); - } } } else if (!list_empty(&res->purge)) { - mlog(0, "removing lockres %.*s from purge list, " - "owner=%u\n", res->lockname.len, res->lockname.name, - res->owner); + mlog(0, "removing lockres %.*s:%p from purge list, owner=%u\n", + res->lockname.len, res->lockname.name, res, res->owner); list_del_init(&res->purge); + dlm_lockres_put(res); dlm->purge_count--; } } @@ -163,68 +154,65 @@ void dlm_lockres_calc_usage(struct dlm_ctxt *dlm, spin_unlock(&dlm->spinlock); } -/* TODO: Eventual API: Called with the dlm spinlock held, may drop it - * to do migration, but will re-acquire before exit. */ -void dlm_purge_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *lockres) +static int dlm_purge_lockres(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) { int master; - int ret; - - spin_lock(&lockres->spinlock); - master = lockres->owner == dlm->node_num; - spin_unlock(&lockres->spinlock); + int ret = 0; - mlog(0, "purging lockres %.*s, master = %d\n", lockres->lockname.len, - lockres->lockname.name, master); - - /* Non master is the easy case -- no migration required, just - * quit. */ + spin_lock(&res->spinlock); + if (!__dlm_lockres_unused(res)) { + spin_unlock(&res->spinlock); + mlog(0, "%s:%.*s: tried to purge but not unused\n", + dlm->name, res->lockname.len, res->lockname.name); + return -ENOTEMPTY; + } + master = (res->owner == dlm->node_num); if (!master) - goto finish; - - /* Wheee! Migrate lockres here! 
*/ - spin_unlock(&dlm->spinlock); -again: + res->state |= DLM_LOCK_RES_DROPPING_REF; + spin_unlock(&res->spinlock); - ret = dlm_migrate_lockres(dlm, lockres, O2NM_MAX_NODES); - if (ret == -ENOTEMPTY) { - mlog(ML_ERROR, "lockres %.*s still has local locks!\n", - lockres->lockname.len, lockres->lockname.name); + mlog(0, "purging lockres %.*s, master = %d\n", res->lockname.len, + res->lockname.name, master); - BUG(); - } else if (ret < 0) { - mlog(ML_NOTICE, "lockres %.*s: migrate failed, retrying\n", - lockres->lockname.len, lockres->lockname.name); - msleep(100); - goto again; + if (!master) { + spin_lock(&res->spinlock); + /* This ensures that clear refmap is sent after the set */ + __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG); + spin_unlock(&res->spinlock); + /* drop spinlock to do messaging, retake below */ + spin_unlock(&dlm->spinlock); + /* clear our bit from the master's refmap, ignore errors */ + ret = dlm_drop_lockres_ref(dlm, res); + if (ret < 0) { + mlog_errno(ret); + if (!dlm_is_host_down(ret)) + BUG(); + } + mlog(0, "%s:%.*s: dlm_deref_lockres returned %d\n", + dlm->name, res->lockname.len, res->lockname.name, ret); + spin_lock(&dlm->spinlock); } - spin_lock(&dlm->spinlock); - -finish: - if (!list_empty(&lockres->purge)) { - list_del_init(&lockres->purge); + if (!list_empty(&res->purge)) { + mlog(0, "removing lockres %.*s:%p from purgelist, " + "master = %d\n", res->lockname.len, res->lockname.name, + res, master); + list_del_init(&res->purge); + dlm_lockres_put(res); dlm->purge_count--; } - __dlm_unhash_lockres(lockres); -} - -/* make an unused lockres go away immediately. - * as soon as the dlm spinlock is dropped, this lockres - * will not be found. kfree still happens on last put. 
*/ -static void dlm_purge_lockres_now(struct dlm_ctxt *dlm, - struct dlm_lock_resource *lockres) -{ - assert_spin_locked(&dlm->spinlock); - assert_spin_locked(&lockres->spinlock); + __dlm_unhash_lockres(res); - BUG_ON(!__dlm_lockres_unused(lockres)); - - if (!list_empty(&lockres->purge)) { - list_del_init(&lockres->purge); - dlm->purge_count--; + /* lockres is not in the hash now. drop the flag and wake up + * any processes waiting in dlm_get_lock_resource. */ + if (!master) { + spin_lock(&res->spinlock); + res->state &= ~DLM_LOCK_RES_DROPPING_REF; + spin_unlock(&res->spinlock); + wake_up(&res->wq); } - __dlm_unhash_lockres(lockres); + return 0; } static void dlm_run_purge_list(struct dlm_ctxt *dlm, @@ -268,13 +256,17 @@ static void dlm_run_purge_list(struct dlm_ctxt *dlm, break; } + mlog(0, "removing lockres %.*s:%p from purgelist\n", + lockres->lockname.len, lockres->lockname.name, lockres); list_del_init(&lockres->purge); + dlm_lockres_put(lockres); dlm->purge_count--; /* This may drop and reacquire the dlm spinlock if it * has to do migration. 
*/ mlog(0, "calling dlm_purge_lockres!\n"); - dlm_purge_lockres(dlm, lockres); + if (dlm_purge_lockres(dlm, lockres)) + BUG(); mlog(0, "DONE calling dlm_purge_lockres!\n"); /* Avoid adding any scheduling latencies */ @@ -467,12 +459,17 @@ void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) assert_spin_locked(&res->spinlock); /* don't shuffle secondary queues */ - if ((res->owner == dlm->node_num) && - !(res->state & DLM_LOCK_RES_DIRTY)) { - /* ref for dirty_list */ - dlm_lockres_get(res); - list_add_tail(&res->dirty, &dlm->dirty_list); - res->state |= DLM_LOCK_RES_DIRTY; + if ((res->owner == dlm->node_num)) { + if (res->state & (DLM_LOCK_RES_MIGRATING | + DLM_LOCK_RES_BLOCK_DIRTY)) + return; + + if (list_empty(&res->dirty)) { + /* ref for dirty_list */ + dlm_lockres_get(res); + list_add_tail(&res->dirty, &dlm->dirty_list); + res->state |= DLM_LOCK_RES_DIRTY; + } } } @@ -651,7 +648,7 @@ static int dlm_thread(void *data) dlm_lockres_get(res); spin_lock(&res->spinlock); - res->state &= ~DLM_LOCK_RES_DIRTY; + /* We clear the DLM_LOCK_RES_DIRTY state once we shuffle lists below */ list_del_init(&res->dirty); spin_unlock(&res->spinlock); spin_unlock(&dlm->spinlock); @@ -675,10 +672,11 @@ static int dlm_thread(void *data) /* it is now ok to move lockreses in these states * to the dirty list, assuming that they will only be * dirty for a short while. 
*/ + BUG_ON(res->state & DLM_LOCK_RES_MIGRATING); if (res->state & (DLM_LOCK_RES_IN_PROGRESS | - DLM_LOCK_RES_MIGRATING | DLM_LOCK_RES_RECOVERING)) { /* move it to the tail and keep going */ + res->state &= ~DLM_LOCK_RES_DIRTY; spin_unlock(&res->spinlock); mlog(0, "delaying list shuffling for in-" "progress lockres %.*s, state=%d\n", @@ -699,6 +697,7 @@ static int dlm_thread(void *data) /* called while holding lockres lock */ dlm_shuffle_lists(dlm, res); + res->state &= ~DLM_LOCK_RES_DIRTY; spin_unlock(&res->spinlock); dlm_lockres_calc_usage(dlm, res); @@ -709,11 +708,8 @@ in_progress: /* if the lock was in-progress, stick * it on the back of the list */ if (delay) { - /* ref for dirty_list */ - dlm_lockres_get(res); spin_lock(&res->spinlock); - list_add_tail(&res->dirty, &dlm->dirty_list); - res->state |= DLM_LOCK_RES_DIRTY; + __dlm_dirty_lockres(dlm, res); spin_unlock(&res->spinlock); } dlm_lockres_put(res); diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c index 37be4b2e0d4..86ca085ef32 100644 --- a/fs/ocfs2/dlm/dlmunlock.c +++ b/fs/ocfs2/dlm/dlmunlock.c @@ -147,6 +147,10 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm, goto leave; } + if (res->state & DLM_LOCK_RES_MIGRATING) { + status = DLM_MIGRATING; + goto leave; + } /* see above for what the spec says about * LKM_CANCEL and the lock queue state */ @@ -244,8 +248,8 @@ leave: /* this should always be coupled with list removal */ BUG_ON(!(actions & DLM_UNLOCK_REMOVE_LOCK)); mlog(0, "lock %u:%llu should be gone now! 
refs=%d\n", - dlm_get_lock_cookie_node(lock->ml.cookie), - dlm_get_lock_cookie_seq(lock->ml.cookie), + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), atomic_read(&lock->lock_refs.refcount)-1); dlm_lock_put(lock); } @@ -379,7 +383,8 @@ static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm, * returns: DLM_NORMAL, DLM_BADARGS, DLM_IVLOCKID, * return value from dlmunlock_master */ -int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data) +int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) { struct dlm_ctxt *dlm = data; struct dlm_unlock_lock *unlock = (struct dlm_unlock_lock *)msg->buf; @@ -502,8 +507,8 @@ not_found: if (!found) mlog(ML_ERROR, "failed to find lock to unlock! " "cookie=%u:%llu\n", - dlm_get_lock_cookie_node(unlock->cookie), - dlm_get_lock_cookie_seq(unlock->cookie)); + dlm_get_lock_cookie_node(be64_to_cpu(unlock->cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(unlock->cookie))); else dlm_lock_put(lock); diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 69fba16efbd..e335541727f 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -770,7 +770,7 @@ static int ocfs2_lock_create(struct ocfs2_super *osb, int dlm_flags) { int ret = 0; - enum dlm_status status; + enum dlm_status status = DLM_NORMAL; unsigned long flags; mlog_entry_void(); @@ -1138,6 +1138,7 @@ int ocfs2_rw_lock(struct inode *inode, int write) { int status, level; struct ocfs2_lock_res *lockres; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); BUG_ON(!inode); @@ -1147,6 +1148,9 @@ int ocfs2_rw_lock(struct inode *inode, int write) (unsigned long long)OCFS2_I(inode)->ip_blkno, write ? "EXMODE" : "PRMODE"); + if (ocfs2_mount_local(osb)) + return 0; + lockres = &OCFS2_I(inode)->ip_rw_lockres; level = write ? LKM_EXMODE : LKM_PRMODE; @@ -1164,6 +1168,7 @@ void ocfs2_rw_unlock(struct inode *inode, int write) { int level = write ? 
LKM_EXMODE : LKM_PRMODE; struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_rw_lockres; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); mlog_entry_void(); @@ -1171,7 +1176,8 @@ void ocfs2_rw_unlock(struct inode *inode, int write) (unsigned long long)OCFS2_I(inode)->ip_blkno, write ? "EXMODE" : "PRMODE"); - ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level); + if (!ocfs2_mount_local(osb)) + ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level); mlog_exit_void(); } @@ -1182,6 +1188,7 @@ int ocfs2_data_lock_full(struct inode *inode, { int status = 0, level; struct ocfs2_lock_res *lockres; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); BUG_ON(!inode); @@ -1201,6 +1208,9 @@ int ocfs2_data_lock_full(struct inode *inode, goto out; } + if (ocfs2_mount_local(osb)) + goto out; + lockres = &OCFS2_I(inode)->ip_data_lockres; level = write ? LKM_EXMODE : LKM_PRMODE; @@ -1269,6 +1279,7 @@ void ocfs2_data_unlock(struct inode *inode, { int level = write ? LKM_EXMODE : LKM_PRMODE; struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_data_lockres; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); mlog_entry_void(); @@ -1276,7 +1287,8 @@ void ocfs2_data_unlock(struct inode *inode, (unsigned long long)OCFS2_I(inode)->ip_blkno, write ? 
"EXMODE" : "PRMODE"); - if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb))) + if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)) && + !ocfs2_mount_local(osb)) ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level); mlog_exit_void(); @@ -1467,8 +1479,9 @@ static int ocfs2_meta_lock_update(struct inode *inode, { int status = 0; struct ocfs2_inode_info *oi = OCFS2_I(inode); - struct ocfs2_lock_res *lockres; + struct ocfs2_lock_res *lockres = NULL; struct ocfs2_dinode *fe; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); mlog_entry_void(); @@ -1483,10 +1496,12 @@ static int ocfs2_meta_lock_update(struct inode *inode, } spin_unlock(&oi->ip_lock); - lockres = &oi->ip_meta_lockres; + if (!ocfs2_mount_local(osb)) { + lockres = &oi->ip_meta_lockres; - if (!ocfs2_should_refresh_lock_res(lockres)) - goto bail; + if (!ocfs2_should_refresh_lock_res(lockres)) + goto bail; + } /* This will discard any caching information we might have had * for the inode metadata. */ @@ -1496,7 +1511,7 @@ static int ocfs2_meta_lock_update(struct inode *inode, * map (directories, bitmap files, etc) */ ocfs2_extent_map_trunc(inode, 0); - if (ocfs2_meta_lvb_is_trustable(inode, lockres)) { + if (lockres && ocfs2_meta_lvb_is_trustable(inode, lockres)) { mlog(0, "Trusting LVB on inode %llu\n", (unsigned long long)oi->ip_blkno); ocfs2_refresh_inode_from_lvb(inode); @@ -1543,7 +1558,8 @@ static int ocfs2_meta_lock_update(struct inode *inode, status = 0; bail_refresh: - ocfs2_complete_lock_res_refresh(lockres, status); + if (lockres) + ocfs2_complete_lock_res_refresh(lockres, status); bail: mlog_exit(status); return status; @@ -1585,7 +1601,7 @@ int ocfs2_meta_lock_full(struct inode *inode, int arg_flags) { int status, level, dlm_flags, acquired; - struct ocfs2_lock_res *lockres; + struct ocfs2_lock_res *lockres = NULL; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct buffer_head *local_bh = NULL; @@ -1607,6 +1623,9 @@ int ocfs2_meta_lock_full(struct inode *inode, goto bail; } + if 
(ocfs2_mount_local(osb)) + goto local; + if (!(arg_flags & OCFS2_META_LOCK_RECOVERY)) wait_event(osb->recovery_event, ocfs2_node_map_is_empty(osb, &osb->recovery_map)); @@ -1636,6 +1655,7 @@ int ocfs2_meta_lock_full(struct inode *inode, wait_event(osb->recovery_event, ocfs2_node_map_is_empty(osb, &osb->recovery_map)); +local: /* * We only see this flag if we're being called from * ocfs2_read_locked_inode(). It means we're locking an inode @@ -1644,7 +1664,8 @@ int ocfs2_meta_lock_full(struct inode *inode, */ if (inode->i_state & I_NEW) { status = 0; - ocfs2_complete_lock_res_refresh(lockres, 0); + if (lockres) + ocfs2_complete_lock_res_refresh(lockres, 0); goto bail; } @@ -1767,6 +1788,7 @@ void ocfs2_meta_unlock(struct inode *inode, { int level = ex ? LKM_EXMODE : LKM_PRMODE; struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_meta_lockres; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); mlog_entry_void(); @@ -1774,7 +1796,8 @@ void ocfs2_meta_unlock(struct inode *inode, (unsigned long long)OCFS2_I(inode)->ip_blkno, ex ? "EXMODE" : "PRMODE"); - if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb))) + if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)) && + !ocfs2_mount_local(osb)) ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level); mlog_exit_void(); @@ -1783,7 +1806,7 @@ void ocfs2_meta_unlock(struct inode *inode, int ocfs2_super_lock(struct ocfs2_super *osb, int ex) { - int status; + int status = 0; int level = ex ? LKM_EXMODE : LKM_PRMODE; struct ocfs2_lock_res *lockres = &osb->osb_super_lockres; struct buffer_head *bh; @@ -1794,6 +1817,9 @@ int ocfs2_super_lock(struct ocfs2_super *osb, if (ocfs2_is_hard_readonly(osb)) return -EROFS; + if (ocfs2_mount_local(osb)) + goto bail; + status = ocfs2_cluster_lock(osb, lockres, level, 0, 0); if (status < 0) { mlog_errno(status); @@ -1832,7 +1858,8 @@ void ocfs2_super_unlock(struct ocfs2_super *osb, int level = ex ? 
LKM_EXMODE : LKM_PRMODE; struct ocfs2_lock_res *lockres = &osb->osb_super_lockres; - ocfs2_cluster_unlock(osb, lockres, level); + if (!ocfs2_mount_local(osb)) + ocfs2_cluster_unlock(osb, lockres, level); } int ocfs2_rename_lock(struct ocfs2_super *osb) @@ -1843,6 +1870,9 @@ int ocfs2_rename_lock(struct ocfs2_super *osb) if (ocfs2_is_hard_readonly(osb)) return -EROFS; + if (ocfs2_mount_local(osb)) + return 0; + status = ocfs2_cluster_lock(osb, lockres, LKM_EXMODE, 0, 0); if (status < 0) mlog_errno(status); @@ -1854,7 +1884,8 @@ void ocfs2_rename_unlock(struct ocfs2_super *osb) { struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres; - ocfs2_cluster_unlock(osb, lockres, LKM_EXMODE); + if (!ocfs2_mount_local(osb)) + ocfs2_cluster_unlock(osb, lockres, LKM_EXMODE); } int ocfs2_dentry_lock(struct dentry *dentry, int ex) @@ -1869,6 +1900,9 @@ int ocfs2_dentry_lock(struct dentry *dentry, int ex) if (ocfs2_is_hard_readonly(osb)) return -EROFS; + if (ocfs2_mount_local(osb)) + return 0; + ret = ocfs2_cluster_lock(osb, &dl->dl_lockres, level, 0, 0); if (ret < 0) mlog_errno(ret); @@ -1882,7 +1916,8 @@ void ocfs2_dentry_unlock(struct dentry *dentry, int ex) struct ocfs2_dentry_lock *dl = dentry->d_fsdata; struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); - ocfs2_cluster_unlock(osb, &dl->dl_lockres, level); + if (!ocfs2_mount_local(osb)) + ocfs2_cluster_unlock(osb, &dl->dl_lockres, level); } /* Reference counting of the dlm debug structure. 
We want this because @@ -2145,12 +2180,15 @@ static void ocfs2_dlm_shutdown_debug(struct ocfs2_super *osb) int ocfs2_dlm_init(struct ocfs2_super *osb) { - int status; + int status = 0; u32 dlm_key; - struct dlm_ctxt *dlm; + struct dlm_ctxt *dlm = NULL; mlog_entry_void(); + if (ocfs2_mount_local(osb)) + goto local; + status = ocfs2_dlm_init_debug(osb); if (status < 0) { mlog_errno(status); @@ -2178,11 +2216,12 @@ int ocfs2_dlm_init(struct ocfs2_super *osb) goto bail; } + dlm_register_eviction_cb(dlm, &osb->osb_eviction_cb); + +local: ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb); ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb); - dlm_register_eviction_cb(dlm, &osb->osb_eviction_cb); - osb->dlm = dlm; status = 0; @@ -2679,6 +2718,15 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres, inode = ocfs2_lock_res_inode(lockres); mapping = inode->i_mapping; + /* + * We need this before the filemap_fdatawrite() so that it can + * transfer the dirty bit from the PTE to the + * page. Unfortunately this means that even for EX->PR + * downconverts, we'll lose our mappings and have to build + * them up again. 
+ */ + unmap_mapping_range(mapping, 0, 0, 0); + if (filemap_fdatawrite(mapping)) { mlog(ML_ERROR, "Could not sync inode %llu for downconvert!", (unsigned long long)OCFS2_I(inode)->ip_blkno); @@ -2686,7 +2734,6 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres, sync_mapping_buffers(mapping); if (blocking == LKM_EXMODE) { truncate_inode_pages(mapping, 0); - unmap_mapping_range(mapping, 0, 0, 0); } else { /* We only need to wait on the I/O if we're not also * truncating pages because truncate_inode_pages waits diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c index 06be6e774cf..56e1fefc120 100644 --- a/fs/ocfs2/export.c +++ b/fs/ocfs2/export.c @@ -60,14 +60,11 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb, void *vobjp) inode = ocfs2_iget(OCFS2_SB(sb), handle->ih_blkno, 0); - if (IS_ERR(inode)) { - mlog_errno(PTR_ERR(inode)); + if (IS_ERR(inode)) return (void *)inode; - } if (handle->ih_generation != inode->i_generation) { iput(inode); - mlog_errno(-ESTALE); return ERR_PTR(-ESTALE); } diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 8786b3c490a..10953a508f2 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -68,7 +68,7 @@ static int ocfs2_file_open(struct inode *inode, struct file *file) struct ocfs2_inode_info *oi = OCFS2_I(inode); mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file, - file->f_dentry->d_name.len, file->f_dentry->d_name.name); + file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name); spin_lock(&oi->ip_lock); @@ -98,8 +98,8 @@ static int ocfs2_file_release(struct inode *inode, struct file *file) struct ocfs2_inode_info *oi = OCFS2_I(inode); mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file, - file->f_dentry->d_name.len, - file->f_dentry->d_name.name); + file->f_path.dentry->d_name.len, + file->f_path.dentry->d_name.name); spin_lock(&oi->ip_lock); if (!--oi->ip_open_count) @@ -149,10 +149,29 @@ int ocfs2_should_update_atime(struct inode *inode, ((inode->i_sb->s_flags & MS_NODIRATIME) && 
S_ISDIR(inode->i_mode))) return 0; + /* + * We can be called with no vfsmnt structure - NFSD will + * sometimes do this. + * + * Note that our action here is different than touch_atime() - + * if we can't tell whether this is a noatime mount, then we + * don't know whether to trust the value of s_atime_quantum. + */ + if (vfsmnt == NULL) + return 0; + if ((vfsmnt->mnt_flags & MNT_NOATIME) || ((vfsmnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))) return 0; + if (vfsmnt->mnt_flags & MNT_RELATIME) { + if ((timespec_compare(&inode->i_atime, &inode->i_mtime) <= 0) || + (timespec_compare(&inode->i_atime, &inode->i_ctime) <= 0)) + return 1; + + return 0; + } + now = CURRENT_TIME; if ((now.tv_sec - inode->i_atime.tv_sec <= osb->s_atime_quantum)) return 0; @@ -958,8 +977,6 @@ int ocfs2_permission(struct inode *inode, int mask, struct nameidata *nd) } ret = generic_permission(inode, mask, NULL); - if (ret) - mlog_errno(ret); ocfs2_meta_unlock(inode, 0); out: @@ -1131,13 +1148,13 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, { int ret, rw_level, have_alloc_sem = 0; struct file *filp = iocb->ki_filp; - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = filp->f_path.dentry->d_inode; int appending = filp->f_flags & O_APPEND ? 
1 : 0; mlog_entry("(0x%p, %u, '%.*s')\n", filp, (unsigned int)nr_segs, - filp->f_dentry->d_name.len, - filp->f_dentry->d_name.name); + filp->f_path.dentry->d_name.len, + filp->f_path.dentry->d_name.name); /* happy write of zero bytes */ if (iocb->ki_left == 0) @@ -1159,7 +1176,7 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, goto out; } - ret = ocfs2_prepare_inode_for_write(filp->f_dentry, &iocb->ki_pos, + ret = ocfs2_prepare_inode_for_write(filp->f_path.dentry, &iocb->ki_pos, iocb->ki_left, appending); if (ret < 0) { mlog_errno(ret); @@ -1207,12 +1224,12 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, unsigned int flags) { int ret; - struct inode *inode = out->f_dentry->d_inode; + struct inode *inode = out->f_path.dentry->d_inode; mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", out, pipe, (unsigned int)len, - out->f_dentry->d_name.len, - out->f_dentry->d_name.name); + out->f_path.dentry->d_name.len, + out->f_path.dentry->d_name.name); inode_double_lock(inode, pipe->inode); @@ -1222,7 +1239,7 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, goto out; } - ret = ocfs2_prepare_inode_for_write(out->f_dentry, ppos, len, 0); + ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0); if (ret < 0) { mlog_errno(ret); goto out_unlock; @@ -1247,12 +1264,12 @@ static ssize_t ocfs2_file_splice_read(struct file *in, unsigned int flags) { int ret = 0; - struct inode *inode = in->f_dentry->d_inode; + struct inode *inode = in->f_path.dentry->d_inode; mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", in, pipe, (unsigned int)len, - in->f_dentry->d_name.len, - in->f_dentry->d_name.name); + in->f_path.dentry->d_name.len, + in->f_path.dentry->d_name.name); /* * See the comment in ocfs2_file_aio_read() @@ -1278,12 +1295,12 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, { int ret = 0, rw_level = -1, have_alloc_sem = 0, lock_level = 0; struct file *filp = iocb->ki_filp; - struct inode *inode = filp->f_dentry->d_inode; + 
struct inode *inode = filp->f_path.dentry->d_inode; mlog_entry("(0x%p, %u, '%.*s')\n", filp, (unsigned int)nr_segs, - filp->f_dentry->d_name.len, - filp->f_dentry->d_name.name); + filp->f_path.dentry->d_name.len, + filp->f_path.dentry->d_name.name); if (!inode) { ret = -EINVAL; diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c index cbfd45a97a6..8fc52d6d0ce 100644 --- a/fs/ocfs2/heartbeat.c +++ b/fs/ocfs2/heartbeat.c @@ -154,6 +154,9 @@ int ocfs2_register_hb_callbacks(struct ocfs2_super *osb) { int status; + if (ocfs2_mount_local(osb)) + return 0; + status = o2hb_register_callback(&osb->osb_hb_down); if (status < 0) { mlog_errno(status); @@ -172,6 +175,9 @@ void ocfs2_clear_hb_callbacks(struct ocfs2_super *osb) { int status; + if (ocfs2_mount_local(osb)) + return; + status = o2hb_unregister_callback(&osb->osb_hb_down); if (status < 0) mlog_errno(status); @@ -186,6 +192,9 @@ void ocfs2_stop_heartbeat(struct ocfs2_super *osb) int ret; char *argv[5], *envp[3]; + if (ocfs2_mount_local(osb)) + return; + if (!osb->uuid_str) { /* This can happen if we don't get far enough in mount... */ mlog(0, "No UUID with which to stop heartbeat!\n\n"); diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 42e361f3054..28ab56f2b98 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -146,7 +146,6 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, int flags) if (is_bad_inode(inode)) { iput(inode); inode = ERR_PTR(-ESTALE); - mlog_errno(PTR_ERR(inode)); goto bail; } @@ -155,8 +154,7 @@ bail: mlog(0, "returning inode with number %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno); mlog_exit_ptr(inode); - } else - mlog_errno(PTR_ERR(inode)); + } return inode; } @@ -247,7 +245,7 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, * today. change if needed. 
*/ if (!OCFS2_IS_VALID_DINODE(fe) || !(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL))) { - mlog(ML_ERROR, "Invalid dinode: i_ino=%lu, i_blkno=%llu, " + mlog(0, "Invalid dinode: i_ino=%lu, i_blkno=%llu, " "signature = %.*s, flags = 0x%x\n", inode->i_ino, (unsigned long long)le64_to_cpu(fe->i_blkno), 7, @@ -423,7 +421,8 @@ static int ocfs2_read_locked_inode(struct inode *inode, * cluster lock before trusting anything anyway. */ can_lock = !(args->fi_flags & OCFS2_FI_FLAG_SYSFILE) - && !(args->fi_flags & OCFS2_FI_FLAG_NOLOCK); + && !(args->fi_flags & OCFS2_FI_FLAG_NOLOCK) + && !ocfs2_mount_local(osb); /* * To maintain backwards compatibility with older versions of @@ -477,11 +476,8 @@ static int ocfs2_read_locked_inode(struct inode *inode, S_ISBLK(le16_to_cpu(fe->i_mode))) inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev)); - if (ocfs2_populate_inode(inode, fe, 0) < 0) { - mlog(ML_ERROR, "populate failed! i_blkno=%llu, i_ino=%lu\n", - (unsigned long long)fe->i_blkno, inode->i_ino); + if (ocfs2_populate_inode(inode, fe, 0) < 0) goto bail; - } BUG_ON(args->fi_blkno != le64_to_cpu(fe->i_blkno)); diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 1d7f4ab1e5e..825cb0ae1b4 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -144,8 +144,10 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs) ocfs2_abort(osb->sb, "Detected aborted journal"); handle = ERR_PTR(-EROFS); } - } else - atomic_inc(&(osb->journal->j_num_trans)); + } else { + if (!ocfs2_mount_local(osb)) + atomic_inc(&(osb->journal->j_num_trans)); + } return handle; } @@ -507,9 +509,23 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb) BUG_ON(atomic_read(&(osb->journal->j_num_trans)) != 0); - status = ocfs2_journal_toggle_dirty(osb, 0); - if (status < 0) - mlog_errno(status); + if (ocfs2_mount_local(osb)) { + journal_lock_updates(journal->j_journal); + status = journal_flush(journal->j_journal); + journal_unlock_updates(journal->j_journal); + if (status < 0) + 
mlog_errno(status); + } + + if (status == 0) { + /* + * Do not toggle if flush was unsuccessful otherwise + * will leave dirty metadata in a "clean" journal + */ + status = ocfs2_journal_toggle_dirty(osb, 0); + if (status < 0) + mlog_errno(status); + } /* Shutdown the kernel journal system */ journal_destroy(journal->j_journal); @@ -549,7 +565,7 @@ static void ocfs2_clear_journal_error(struct super_block *sb, } } -int ocfs2_journal_load(struct ocfs2_journal *journal) +int ocfs2_journal_load(struct ocfs2_journal *journal, int local) { int status = 0; struct ocfs2_super *osb; @@ -576,14 +592,18 @@ int ocfs2_journal_load(struct ocfs2_journal *journal) } /* Launch the commit thread */ - osb->commit_task = kthread_run(ocfs2_commit_thread, osb, "ocfs2cmt"); - if (IS_ERR(osb->commit_task)) { - status = PTR_ERR(osb->commit_task); + if (!local) { + osb->commit_task = kthread_run(ocfs2_commit_thread, osb, + "ocfs2cmt"); + if (IS_ERR(osb->commit_task)) { + status = PTR_ERR(osb->commit_task); + osb->commit_task = NULL; + mlog(ML_ERROR, "unable to launch ocfs2commit thread, " + "error=%d", status); + goto done; + } + } else osb->commit_task = NULL; - mlog(ML_ERROR, "unable to launch ocfs2commit thread, error=%d", - status); - goto done; - } done: mlog_exit(status); diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 899112ad813..d026b4f2775 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -157,7 +157,7 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, void ocfs2_journal_shutdown(struct ocfs2_super *osb); int ocfs2_journal_wipe(struct ocfs2_journal *journal, int full); -int ocfs2_journal_load(struct ocfs2_journal *journal); +int ocfs2_journal_load(struct ocfs2_journal *journal, int local); int ocfs2_check_journals_nolocks(struct ocfs2_super *osb); void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num); @@ -174,6 +174,9 @@ static inline void ocfs2_checkpoint_inode(struct inode *inode) { struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + if 
(ocfs2_mount_local(osb)) + return; + if (!ocfs2_inode_fully_checkpointed(inode)) { /* WARNING: This only kicks off a single * checkpoint. If someone races you and adds more @@ -303,8 +306,8 @@ int ocfs2_journal_dirty_data(handle_t *handle, * for the dinode, one for the new block. */ #define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2) -/* file update (nlink, etc) + dir entry block */ -#define OCFS2_LINK_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1) +/* file update (nlink, etc) + directory mtime/ctime + dir entry block */ +#define OCFS2_LINK_CREDITS (2*OCFS2_INODE_UPDATE_CREDITS + 1) /* inode + dir inode (if we unlink a dir), + dir entry block + orphan * dir inode link */ diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index 698d79a74ef..4dedd978910 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -776,7 +776,7 @@ static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb, { int status; - *ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL); + *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL); if (!(*ac)) { status = -ENOMEM; mlog_errno(status); diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index 69f85ae392d..51b02044768 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -83,10 +83,12 @@ static struct vm_operations_struct ocfs2_file_vm_ops = { int ocfs2_mmap(struct file *file, struct vm_area_struct *vma) { int ret = 0, lock_level = 0; + struct ocfs2_super *osb = OCFS2_SB(file->f_dentry->d_inode->i_sb); /* We don't want to support shared writable mappings yet. 
*/ - if (((vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_MAYSHARE)) - && ((vma->vm_flags & VM_WRITE) || (vma->vm_flags & VM_MAYWRITE))) { + if (!ocfs2_mount_local(osb) && + ((vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_MAYSHARE)) && + ((vma->vm_flags & VM_WRITE) || (vma->vm_flags & VM_MAYWRITE))) { mlog(0, "disallow shared writable mmaps %lx\n", vma->vm_flags); /* This is -EINVAL because generic_file_readonly_mmap * returns it in a similar situation. */ diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 21db45ddf14..f3d7803b4b4 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -587,9 +587,11 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, } ocfs2_inode_set_new(osb, inode); - status = ocfs2_create_new_inode_locks(inode); - if (status < 0) - mlog_errno(status); + if (!ocfs2_mount_local(osb)) { + status = ocfs2_create_new_inode_locks(inode); + if (status < 0) + mlog_errno(status); + } status = 0; /* error in ocfs2_create_new_inode_locks is not * critical */ @@ -930,14 +932,15 @@ static int ocfs2_unlink(struct inode *dir, goto leave; } - if (S_ISDIR(inode->i_mode)) { + dir->i_ctime = dir->i_mtime = CURRENT_TIME; + if (S_ISDIR(inode->i_mode)) drop_nlink(dir); - status = ocfs2_mark_inode_dirty(handle, dir, - parent_node_bh); - if (status < 0) { - mlog_errno(status); + + status = ocfs2_mark_inode_dirty(handle, dir, parent_node_bh); + if (status < 0) { + mlog_errno(status); + if (S_ISDIR(inode->i_mode)) inc_nlink(dir); - } } leave: @@ -1066,6 +1069,7 @@ static int ocfs2_rename(struct inode *old_dir, char orphan_name[OCFS2_ORPHAN_NAMELEN + 1]; struct buffer_head *orphan_entry_bh = NULL; struct buffer_head *newfe_bh = NULL; + struct buffer_head *old_inode_bh = NULL; struct buffer_head *insert_entry_bh = NULL; struct ocfs2_super *osb = NULL; u64 newfe_blkno; @@ -1077,7 +1081,7 @@ static int ocfs2_rename(struct inode *old_dir, struct buffer_head *new_de_bh = NULL, *old_de_bh = NULL; // bhs for above struct buffer_head *old_inode_de_bh = NULL; // 
if old_dentry is a dir, // this is the 1st dirent bh - nlink_t old_dir_nlink = old_dir->i_nlink, new_dir_nlink = new_dir->i_nlink; + nlink_t old_dir_nlink = old_dir->i_nlink; /* At some point it might be nice to break this function up a * bit. */ @@ -1137,12 +1141,11 @@ static int ocfs2_rename(struct inode *old_dir, } /* - * Though we don't require an inode meta data update if - * old_inode is not a directory, we lock anyway here to ensure - * the vote thread on other nodes won't have to concurrently - * downconvert the inode and the dentry locks. + * Aside from allowing a meta data update, the locking here + * also ensures that the vote thread on other nodes won't have + * to concurrently downconvert the inode and the dentry locks. */ - status = ocfs2_meta_lock(old_inode, NULL, 1); + status = ocfs2_meta_lock(old_inode, &old_inode_bh, 1); if (status < 0) { if (status != -ENOENT) mlog_errno(status); @@ -1353,6 +1356,7 @@ static int ocfs2_rename(struct inode *old_dir, old_inode->i_ctime = CURRENT_TIME; mark_inode_dirty(old_inode); + ocfs2_mark_inode_dirty(handle, old_inode, old_inode_bh); /* now that the name has been added to new_dir, remove the old name */ status = ocfs2_delete_entry(handle, old_dir, old_de, old_de_bh); @@ -1382,27 +1386,22 @@ static int ocfs2_rename(struct inode *old_dir, } } mark_inode_dirty(old_dir); - if (new_inode) + ocfs2_mark_inode_dirty(handle, old_dir, old_dir_bh); + if (new_inode) { mark_inode_dirty(new_inode); + ocfs2_mark_inode_dirty(handle, new_inode, newfe_bh); + } - if (old_dir != new_dir) - if (new_dir_nlink != new_dir->i_nlink) { - if (!new_dir_bh) { - mlog(ML_ERROR, "need to change nlink for new " - "dir %llu from %d to %d but bh is NULL\n", - (unsigned long long)OCFS2_I(new_dir)->ip_blkno, - (int)new_dir_nlink, new_dir->i_nlink); - } else { - struct ocfs2_dinode *fe; - status = ocfs2_journal_access(handle, - new_dir, - new_dir_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - fe = (struct ocfs2_dinode *) new_dir_bh->b_data; - 
fe->i_links_count = cpu_to_le16(new_dir->i_nlink); - status = ocfs2_journal_dirty(handle, new_dir_bh); - } - } + if (old_dir != new_dir) { + /* Keep the same times on both directories.*/ + new_dir->i_ctime = new_dir->i_mtime = old_dir->i_ctime; + + /* + * This will also pick up the i_nlink change from the + * block above. + */ + ocfs2_mark_inode_dirty(handle, new_dir, new_dir_bh); + } if (old_dir_nlink != old_dir->i_nlink) { if (!old_dir_bh) { @@ -1453,6 +1452,8 @@ bail: iput(new_inode); if (newfe_bh) brelse(newfe_bh); + if (old_inode_bh) + brelse(old_inode_bh); if (old_dir_bh) brelse(old_dir_bh); if (new_dir_bh) @@ -1824,6 +1825,13 @@ static int __ocfs2_add_entry(handle_t *handle, (le16_to_cpu(de->rec_len) >= rec_len)) || (le16_to_cpu(de->rec_len) >= (OCFS2_DIR_REC_LEN(de->name_len) + rec_len))) { + dir->i_mtime = dir->i_ctime = CURRENT_TIME; + retval = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh); + if (retval < 0) { + mlog_errno(retval); + goto bail; + } + status = ocfs2_journal_access(handle, dir, insert_bh, OCFS2_JOURNAL_ACCESS_WRITE); /* By now the buffer is marked for journaling */ @@ -1846,7 +1854,6 @@ static int __ocfs2_add_entry(handle_t *handle, de->name_len = namelen; memcpy(de->name, name, namelen); - dir->i_mtime = dir->i_ctime = CURRENT_TIME; dir->i_version++; status = ocfs2_journal_dirty(handle, insert_bh); retval = 0; diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index b767fd7da6e..db8e77cd35d 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -349,6 +349,11 @@ static inline int ocfs2_is_soft_readonly(struct ocfs2_super *osb) return ret; } +static inline int ocfs2_mount_local(struct ocfs2_super *osb) +{ + return (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT); +} + #define OCFS2_IS_VALID_DINODE(ptr) \ (!strcmp((ptr)->i_signature, OCFS2_INODE_SIGNATURE)) diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index 3330a5dc6be..e61e218f5e0 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -85,8 +85,8 @@ #define 
OCFS2_CLEAR_INCOMPAT_FEATURE(sb,mask) \ OCFS2_SB(sb)->s_feature_incompat &= ~(mask) -#define OCFS2_FEATURE_COMPAT_SUPP 0 -#define OCFS2_FEATURE_INCOMPAT_SUPP 0 +#define OCFS2_FEATURE_COMPAT_SUPP OCFS2_FEATURE_COMPAT_BACKUP_SB +#define OCFS2_FEATURE_INCOMPAT_SUPP OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT #define OCFS2_FEATURE_RO_COMPAT_SUPP 0 /* @@ -96,6 +96,32 @@ */ #define OCFS2_FEATURE_INCOMPAT_HEARTBEAT_DEV 0x0002 +/* + * tunefs sets this incompat flag before starting the resize and clears it + * at the end. This flag protects users from inadvertently mounting the fs + * after an aborted run without fsck-ing. + */ +#define OCFS2_FEATURE_INCOMPAT_RESIZE_INPROG 0x0004 + +/* Used to denote a non-clustered volume */ +#define OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT 0x0008 + +/* Support for sparse allocation in b-trees */ +#define OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC 0x0010 + +/* + * backup superblock flag is used to indicate that this volume + * has backup superblocks. + */ +#define OCFS2_FEATURE_COMPAT_BACKUP_SB 0x0001 + +/* The byte offset of the first backup block will be 1G. + * The following will be 4G, 16G, 64G, 256G and 1T. 
+ */ +#define OCFS2_BACKUP_SB_START 1 << 30 + +/* the max backup superblock nums */ +#define OCFS2_MAX_BACKUP_SUPERBLOCKS 6 /* * Flags on ocfs2_dinode.i_flags @@ -554,6 +580,20 @@ static inline int ocfs2_truncate_recs_per_inode(struct super_block *sb) return size / sizeof(struct ocfs2_truncate_rec); } + +static inline u64 ocfs2_backup_super_blkno(struct super_block *sb, int index) +{ + u64 offset = OCFS2_BACKUP_SB_START; + + if (index >= 0 && index < OCFS2_MAX_BACKUP_SUPERBLOCKS) { + offset <<= (2 * index); + offset >>= sb->s_blocksize_bits; + return offset; + } + + return 0; + +} #else static inline int ocfs2_fast_symlink_chars(int blocksize) { @@ -619,6 +659,19 @@ static inline int ocfs2_truncate_recs_per_inode(int blocksize) return size / sizeof(struct ocfs2_truncate_rec); } + +static inline uint64_t ocfs2_backup_super_blkno(int blocksize, int index) +{ + uint64_t offset = OCFS2_BACKUP_SB_START; + + if (index >= 0 && index < OCFS2_MAX_BACKUP_SUPERBLOCKS) { + offset <<= (2 * index); + offset /= blocksize; + return offset; + } + + return 0; +} #endif /* __KERNEL__ */ diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index aa6f5aadedc..2d3ac32cb74 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c @@ -175,7 +175,7 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb) struct buffer_head *bh = NULL; struct ocfs2_slot_info *si; - si = kcalloc(1, sizeof(struct ocfs2_slot_info), GFP_KERNEL); + si = kzalloc(sizeof(struct ocfs2_slot_info), GFP_KERNEL); if (!si) { status = -ENOMEM; mlog_errno(status); diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 000d71cca6c..6dbb1176275 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -488,7 +488,7 @@ int ocfs2_reserve_new_metadata(struct ocfs2_super *osb, int status; u32 slot; - *ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL); + *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL); if (!(*ac)) { status = -ENOMEM; mlog_errno(status); @@ -530,7 +530,7 @@ int 
ocfs2_reserve_new_inode(struct ocfs2_super *osb, { int status; - *ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL); + *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL); if (!(*ac)) { status = -ENOMEM; mlog_errno(status); @@ -595,7 +595,7 @@ int ocfs2_reserve_clusters(struct ocfs2_super *osb, mlog_entry_void(); - *ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL); + *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL); if (!(*ac)) { status = -ENOMEM; mlog_errno(status); diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 4bf39540e65..6e300a88a47 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -508,6 +508,27 @@ bail: return status; } +static int ocfs2_verify_heartbeat(struct ocfs2_super *osb) +{ + if (ocfs2_mount_local(osb)) { + if (osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) { + mlog(ML_ERROR, "Cannot heartbeat on a locally " + "mounted device.\n"); + return -EINVAL; + } + } + + if (!(osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL)) { + if (!ocfs2_mount_local(osb) && !ocfs2_is_hard_readonly(osb)) { + mlog(ML_ERROR, "Heartbeat has to be started to mount " + "a read-write clustered device.\n"); + return -EINVAL; + } + } + + return 0; +} + static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) { struct dentry *root; @@ -516,16 +537,24 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) struct inode *inode = NULL; struct ocfs2_super *osb = NULL; struct buffer_head *bh = NULL; + char nodestr[8]; mlog_entry("%p, %p, %i", sb, data, silent); - /* for now we only have one cluster/node, make sure we see it - * in the heartbeat universe */ - if (!o2hb_check_local_node_heartbeating()) { + if (!ocfs2_parse_options(sb, data, &parsed_opt, 0)) { status = -EINVAL; goto read_super_error; } + /* for now we only have one cluster/node, make sure we see it + * in the heartbeat universe */ + if (parsed_opt & OCFS2_MOUNT_HB_LOCAL) { + if (!o2hb_check_local_node_heartbeating()) { + status = 
-EINVAL; + goto read_super_error; + } + } + /* probe for superblock */ status = ocfs2_sb_probe(sb, &bh, &sector_size); if (status < 0) { @@ -541,11 +570,6 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) } brelse(bh); bh = NULL; - - if (!ocfs2_parse_options(sb, data, &parsed_opt, 0)) { - status = -EINVAL; - goto read_super_error; - } osb->s_mount_opt = parsed_opt; sb->s_magic = OCFS2_SUPER_MAGIC; @@ -588,21 +612,16 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) } if (!ocfs2_is_hard_readonly(osb)) { - /* If this isn't a hard readonly mount, then we need - * to make sure that heartbeat is in a valid state, - * and that we mark ourselves soft readonly is -oro - * was specified. */ - if (!(osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL)) { - mlog(ML_ERROR, "No heartbeat for device (%s)\n", - sb->s_id); - status = -EINVAL; - goto read_super_error; - } - if (sb->s_flags & MS_RDONLY) ocfs2_set_ro_flag(osb, 0); } + status = ocfs2_verify_heartbeat(osb); + if (status < 0) { + mlog_errno(status); + goto read_super_error; + } + osb->osb_debug_root = debugfs_create_dir(osb->uuid_str, ocfs2_debugfs_root); if (!osb->osb_debug_root) { @@ -635,9 +654,14 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) ocfs2_complete_mount_recovery(osb); - printk(KERN_INFO "ocfs2: Mounting device (%s) on (node %d, slot %d) " + if (ocfs2_mount_local(osb)) + snprintf(nodestr, sizeof(nodestr), "local"); + else + snprintf(nodestr, sizeof(nodestr), "%d", osb->node_num); + + printk(KERN_INFO "ocfs2: Mounting device (%s) on (node %s, slot %d) " "with %s data mode.\n", - osb->dev_str, osb->node_num, osb->slot_num, + osb->dev_str, nodestr, osb->slot_num, osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK ? "writeback" : "ordered"); @@ -999,7 +1023,11 @@ static int ocfs2_fill_local_node_info(struct ocfs2_super *osb) /* XXX hold a ref on the node while mounte? easy enough, if * desirable. 
*/ - osb->node_num = o2nm_this_node(); + if (ocfs2_mount_local(osb)) + osb->node_num = 0; + else + osb->node_num = o2nm_this_node(); + if (osb->node_num == O2NM_MAX_NODES) { mlog(ML_ERROR, "could not find this host's node number\n"); status = -ENOENT; @@ -1084,6 +1112,9 @@ static int ocfs2_mount_volume(struct super_block *sb) goto leave; } + if (ocfs2_mount_local(osb)) + goto leave; + /* This should be sent *after* we recovered our journal as it * will cause other nodes to unmark us as needing * recovery. However, we need to send it *before* dropping the @@ -1114,6 +1145,7 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) { int tmp; struct ocfs2_super *osb = NULL; + char nodestr[8]; mlog_entry("(0x%p)\n", sb); @@ -1177,8 +1209,13 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) atomic_set(&osb->vol_state, VOLUME_DISMOUNTED); - printk(KERN_INFO "ocfs2: Unmounting device (%s) on (node %d)\n", - osb->dev_str, osb->node_num); + if (ocfs2_mount_local(osb)) + snprintf(nodestr, sizeof(nodestr), "local"); + else + snprintf(nodestr, sizeof(nodestr), "%d", osb->node_num); + + printk(KERN_INFO "ocfs2: Unmounting device (%s) on (node %s)\n", + osb->dev_str, nodestr); ocfs2_delete_osb(osb); kfree(osb); @@ -1194,7 +1231,7 @@ static int ocfs2_setup_osb_uuid(struct ocfs2_super *osb, const unsigned char *uu BUG_ON(uuid_bytes != OCFS2_VOL_UUID_LEN); - osb->uuid_str = kcalloc(1, OCFS2_VOL_UUID_LEN * 2 + 1, GFP_KERNEL); + osb->uuid_str = kzalloc(OCFS2_VOL_UUID_LEN * 2 + 1, GFP_KERNEL); if (osb->uuid_str == NULL) return -ENOMEM; @@ -1225,7 +1262,7 @@ static int ocfs2_initialize_super(struct super_block *sb, mlog_entry_void(); - osb = kcalloc(1, sizeof(struct ocfs2_super), GFP_KERNEL); + osb = kzalloc(sizeof(struct ocfs2_super), GFP_KERNEL); if (!osb) { status = -ENOMEM; mlog_errno(status); @@ -1350,7 +1387,7 @@ static int ocfs2_initialize_super(struct super_block *sb, */ /* initialize our journal structure */ - journal = kcalloc(1, 
sizeof(struct ocfs2_journal), GFP_KERNEL); + journal = kzalloc(sizeof(struct ocfs2_journal), GFP_KERNEL); if (!journal) { mlog(ML_ERROR, "unable to alloc journal\n"); status = -ENOMEM; @@ -1536,6 +1573,7 @@ static int ocfs2_check_volume(struct ocfs2_super *osb) { int status = 0; int dirty; + int local; struct ocfs2_dinode *local_alloc = NULL; /* only used if we * recover * ourselves. */ @@ -1563,8 +1601,10 @@ static int ocfs2_check_volume(struct ocfs2_super *osb) "recovering volume.\n"); } + local = ocfs2_mount_local(osb); + /* will play back anything left in the journal. */ - ocfs2_journal_load(osb->journal); + ocfs2_journal_load(osb->journal, local); if (dirty) { /* recover my local alloc if we didn't unmount cleanly. */ diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c index 957d6878b03..03b0191534d 100644 --- a/fs/ocfs2/symlink.c +++ b/fs/ocfs2/symlink.c @@ -158,8 +158,7 @@ static void *ocfs2_follow_link(struct dentry *dentry, } status = vfs_follow_link(nd, link); - if (status && status != -ENOENT) - mlog_errno(status); + bail: if (page) { kunmap(page); diff --git a/fs/ocfs2/vote.c b/fs/ocfs2/vote.c index 5b4dca79990..f30e63b9910 100644 --- a/fs/ocfs2/vote.c +++ b/fs/ocfs2/vote.c @@ -479,7 +479,7 @@ static struct ocfs2_net_wait_ctxt *ocfs2_new_net_wait_ctxt(unsigned int response { struct ocfs2_net_wait_ctxt *w; - w = kcalloc(1, sizeof(*w), GFP_NOFS); + w = kzalloc(sizeof(*w), GFP_NOFS); if (!w) { mlog_errno(-ENOMEM); goto bail; @@ -642,7 +642,7 @@ static struct ocfs2_vote_msg * ocfs2_new_vote_request(struct ocfs2_super *osb, BUG_ON(!ocfs2_is_valid_vote_request(type)); - request = kcalloc(1, sizeof(*request), GFP_NOFS); + request = kzalloc(sizeof(*request), GFP_NOFS); if (!request) { mlog_errno(-ENOMEM); } else { @@ -887,7 +887,7 @@ static inline int ocfs2_translate_response(int response) static int ocfs2_handle_response_message(struct o2net_msg *msg, u32 len, - void *data) + void *data, void **ret_data) { unsigned int response_id, node_num; int 
response_status; @@ -943,7 +943,7 @@ bail: static int ocfs2_handle_vote_message(struct o2net_msg *msg, u32 len, - void *data) + void *data, void **ret_data) { int status; struct ocfs2_super *osb = data; @@ -1000,11 +1000,14 @@ int ocfs2_register_net_handlers(struct ocfs2_super *osb) { int status = 0; + if (ocfs2_mount_local(osb)) + return 0; + status = o2net_register_handler(OCFS2_MESSAGE_TYPE_RESPONSE, osb->net_key, sizeof(struct ocfs2_response_msg), ocfs2_handle_response_message, - osb, &osb->osb_net_handlers); + osb, NULL, &osb->osb_net_handlers); if (status) { mlog_errno(status); goto bail; @@ -1014,7 +1017,7 @@ int ocfs2_register_net_handlers(struct ocfs2_super *osb) osb->net_key, sizeof(struct ocfs2_vote_msg), ocfs2_handle_vote_message, - osb, &osb->osb_net_handlers); + osb, NULL, &osb->osb_net_handlers); if (status) { mlog_errno(status); goto bail; diff --git a/fs/open.c b/fs/open.c index 89e0c237a63..c989fb4cf7b 100644 --- a/fs/open.c +++ b/fs/open.c @@ -165,7 +165,7 @@ asmlinkage long sys_fstatfs(unsigned int fd, struct statfs __user * buf) file = fget(fd); if (!file) goto out; - error = vfs_statfs_native(file->f_dentry, &tmp); + error = vfs_statfs_native(file->f_path.dentry, &tmp); if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) error = -EFAULT; fput(file); @@ -186,7 +186,7 @@ asmlinkage long sys_fstatfs64(unsigned int fd, size_t sz, struct statfs64 __user file = fget(fd); if (!file) goto out; - error = vfs_statfs64(file->f_dentry, &tmp); + error = vfs_statfs64(file->f_path.dentry, &tmp); if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) error = -EFAULT; fput(file); @@ -302,7 +302,7 @@ static long do_sys_ftruncate(unsigned int fd, loff_t length, int small) if (file->f_flags & O_LARGEFILE) small = 0; - dentry = file->f_dentry; + dentry = file->f_path.dentry; inode = dentry->d_inode; error = -EINVAL; if (!S_ISREG(inode->i_mode) || !(file->f_mode & FMODE_WRITE)) @@ -448,8 +448,8 @@ asmlinkage long sys_fchdir(unsigned int fd) if (!file) goto out; - 
dentry = file->f_dentry; - mnt = file->f_vfsmnt; + dentry = file->f_path.dentry; + mnt = file->f_path.mnt; inode = dentry->d_inode; error = -ENOTDIR; @@ -503,7 +503,7 @@ asmlinkage long sys_fchmod(unsigned int fd, mode_t mode) if (!file) goto out; - dentry = file->f_dentry; + dentry = file->f_path.dentry; inode = dentry->d_inode; audit_inode(NULL, inode); @@ -662,7 +662,7 @@ asmlinkage long sys_fchown(unsigned int fd, uid_t user, gid_t group) if (!file) goto out; - dentry = file->f_dentry; + dentry = file->f_path.dentry; audit_inode(NULL, dentry->d_inode); error = chown_common(dentry, user, group); fput(file); @@ -688,8 +688,8 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt, } f->f_mapping = inode->i_mapping; - f->f_dentry = dentry; - f->f_vfsmnt = mnt; + f->f_path.dentry = dentry; + f->f_path.mnt = mnt; f->f_pos = 0; f->f_op = fops_get(inode->i_fop); file_move(f, &inode->i_sb->s_files); @@ -723,8 +723,8 @@ cleanup_all: if (f->f_mode & FMODE_WRITE) put_write_access(inode); file_kill(f); - f->f_dentry = NULL; - f->f_vfsmnt = NULL; + f->f_path.dentry = NULL; + f->f_path.mnt = NULL; cleanup_file: put_filp(f); dput(dentry); @@ -822,7 +822,7 @@ struct file *nameidata_to_filp(struct nameidata *nd, int flags) /* Pick up the filp from the open intent */ filp = nd->intent.open.file; /* Has the filesystem initialised the file for us? 
*/ - if (filp->f_dentry == NULL) + if (filp->f_path.dentry == NULL) filp = __dentry_open(nd->dentry, nd->mnt, flags, filp, NULL); else path_release(nd); @@ -864,8 +864,7 @@ int get_unused_fd(void) repeat: fdt = files_fdtable(files); - fd = find_next_zero_bit(fdt->open_fds->fds_bits, - fdt->max_fdset, + fd = find_next_zero_bit(fdt->open_fds->fds_bits, fdt->max_fds, files->next_fd); /* @@ -965,7 +964,7 @@ long do_sys_open(int dfd, const char __user *filename, int flags, int mode) put_unused_fd(fd); fd = PTR_ERR(f); } else { - fsnotify_open(f->f_dentry); + fsnotify_open(f->f_path.dentry); fd_install(fd, f); } } @@ -1087,6 +1086,7 @@ EXPORT_SYMBOL(sys_close); asmlinkage long sys_vhangup(void) { if (capable(CAP_SYS_TTY_CONFIG)) { + /* XXX: this needs locking */ tty_vhangup(current->signal->tty); return 0; } diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index 26f44e0074e..99c0bc37ba0 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -262,7 +262,7 @@ found: static int openpromfs_readdir(struct file * filp, void * dirent, filldir_t filldir) { - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = filp->f_path.dentry->d_inode; struct op_inode_info *oi = OP_I(inode); struct device_node *dp = oi->u.node; struct device_node *child; diff --git a/fs/partitions/Kconfig b/fs/partitions/Kconfig index e478f194183..74552c60b67 100644 --- a/fs/partitions/Kconfig +++ b/fs/partitions/Kconfig @@ -194,7 +194,7 @@ config LDM_DEBUG config SGI_PARTITION bool "SGI partition support" if PARTITION_ADVANCED - default y if (SGI_IP22 || SGI_IP27 || ((MACH_JAZZ || SNI_RM200_PCI) && !CPU_LITTLE_ENDIAN)) + default y if (SGI_IP22 || SGI_IP27 || ((MACH_JAZZ || SNI_RM) && !CPU_LITTLE_ENDIAN)) help Say Y here if you would like to be able to read the hard disk partition table format used by SGI machines. 
diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 1901137f4ec..3d73d94d93a 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -276,12 +276,39 @@ static struct part_attribute part_attr_stat = { .show = part_stat_read }; +#ifdef CONFIG_FAIL_MAKE_REQUEST + +static ssize_t part_fail_store(struct hd_struct * p, + const char *buf, size_t count) +{ + int i; + + if (count > 0 && sscanf(buf, "%d", &i) > 0) + p->make_it_fail = (i == 0) ? 0 : 1; + + return count; +} +static ssize_t part_fail_read(struct hd_struct * p, char *page) +{ + return sprintf(page, "%d\n", p->make_it_fail); +} +static struct part_attribute part_attr_fail = { + .attr = {.name = "make-it-fail", .mode = S_IRUGO | S_IWUSR }, + .store = part_fail_store, + .show = part_fail_read +}; + +#endif + static struct attribute * default_attrs[] = { &part_attr_uevent.attr, &part_attr_dev.attr, &part_attr_start.attr, &part_attr_size.attr, &part_attr_stat.attr, +#ifdef CONFIG_FAIL_MAKE_REQUEST + &part_attr_fail.attr, +#endif NULL, }; diff --git a/fs/pipe.c b/fs/pipe.c index ae36b89b1a3..68090e84f58 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -207,7 +207,7 @@ int generic_pipe_buf_pin(struct pipe_inode_info *info, struct pipe_buffer *buf) return 0; } -static struct pipe_buf_operations anon_pipe_buf_ops = { +static const struct pipe_buf_operations anon_pipe_buf_ops = { .can_merge = 1, .map = generic_pipe_buf_map, .unmap = generic_pipe_buf_unmap, @@ -222,7 +222,7 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov, unsigned long nr_segs, loff_t pos) { struct file *filp = iocb->ki_filp; - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = filp->f_path.dentry->d_inode; struct pipe_inode_info *pipe; int do_wakeup; ssize_t ret; @@ -243,7 +243,7 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov, if (bufs) { int curbuf = pipe->curbuf; struct pipe_buffer *buf = pipe->bufs + curbuf; - struct pipe_buf_operations *ops = buf->ops; + const struct pipe_buf_operations *ops = 
buf->ops; void *addr; size_t chars = buf->len; int error, atomic; @@ -335,7 +335,7 @@ pipe_write(struct kiocb *iocb, const struct iovec *_iov, unsigned long nr_segs, loff_t ppos) { struct file *filp = iocb->ki_filp; - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = filp->f_path.dentry->d_inode; struct pipe_inode_info *pipe; ssize_t ret; int do_wakeup; @@ -365,7 +365,7 @@ pipe_write(struct kiocb *iocb, const struct iovec *_iov, int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) & (PIPE_BUFFERS-1); struct pipe_buffer *buf = pipe->bufs + lastbuf; - struct pipe_buf_operations *ops = buf->ops; + const struct pipe_buf_operations *ops = buf->ops; int offset = buf->offset + buf->len; if (ops->can_merge && offset + chars <= PAGE_SIZE) { @@ -520,7 +520,7 @@ static int pipe_ioctl(struct inode *pino, struct file *filp, unsigned int cmd, unsigned long arg) { - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = filp->f_path.dentry->d_inode; struct pipe_inode_info *pipe; int count, buf, nrbufs; @@ -548,7 +548,7 @@ static unsigned int pipe_poll(struct file *filp, poll_table *wait) { unsigned int mask; - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = filp->f_path.dentry->d_inode; struct pipe_inode_info *pipe = inode->i_pipe; int nrbufs; @@ -601,7 +601,7 @@ pipe_release(struct inode *inode, int decr, int decw) static int pipe_read_fasync(int fd, struct file *filp, int on) { - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = filp->f_path.dentry->d_inode; int retval; mutex_lock(&inode->i_mutex); @@ -618,7 +618,7 @@ pipe_read_fasync(int fd, struct file *filp, int on) static int pipe_write_fasync(int fd, struct file *filp, int on) { - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = filp->f_path.dentry->d_inode; int retval; mutex_lock(&inode->i_mutex); @@ -635,7 +635,7 @@ pipe_write_fasync(int fd, struct file *filp, int on) static int pipe_rdwr_fasync(int fd, struct file *filp, 
int on) { - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = filp->f_path.dentry->d_inode; struct pipe_inode_info *pipe = inode->i_pipe; int retval; @@ -756,7 +756,7 @@ const struct file_operations rdwr_fifo_fops = { .fasync = pipe_rdwr_fasync, }; -static struct file_operations read_pipe_fops = { +static const struct file_operations read_pipe_fops = { .llseek = no_llseek, .read = do_sync_read, .aio_read = pipe_read, @@ -768,7 +768,7 @@ static struct file_operations read_pipe_fops = { .fasync = pipe_read_fasync, }; -static struct file_operations write_pipe_fops = { +static const struct file_operations write_pipe_fops = { .llseek = no_llseek, .read = bad_pipe_r, .write = do_sync_write, @@ -780,7 +780,7 @@ static struct file_operations write_pipe_fops = { .fasync = pipe_write_fasync, }; -static struct file_operations rdwr_pipe_fops = { +static const struct file_operations rdwr_pipe_fops = { .llseek = no_llseek, .read = do_sync_read, .aio_read = pipe_read, @@ -914,8 +914,8 @@ struct file *create_write_pipe(void) */ dentry->d_flags &= ~DCACHE_UNHASHED; d_instantiate(dentry, inode); - f->f_vfsmnt = mntget(pipe_mnt); - f->f_dentry = dentry; + f->f_path.mnt = mntget(pipe_mnt); + f->f_path.dentry = dentry; f->f_mapping = inode->i_mapping; f->f_flags = O_WRONLY; @@ -935,8 +935,9 @@ struct file *create_write_pipe(void) void free_write_pipe(struct file *f) { - mntput(f->f_vfsmnt); - dput(f->f_dentry); + free_pipe_info(f->f_dentry->d_inode); + dput(f->f_path.dentry); + mntput(f->f_path.mnt); put_filp(f); } @@ -947,9 +948,9 @@ struct file *create_read_pipe(struct file *wrf) return ERR_PTR(-ENFILE); /* Grab pipe from the writer */ - f->f_vfsmnt = mntget(wrf->f_vfsmnt); - f->f_dentry = dget(wrf->f_dentry); - f->f_mapping = wrf->f_dentry->d_inode->i_mapping; + f->f_path.mnt = mntget(wrf->f_path.mnt); + f->f_path.dentry = dget(wrf->f_path.dentry); + f->f_mapping = wrf->f_path.dentry->d_inode->i_mapping; f->f_pos = 0; f->f_flags = O_RDONLY; @@ -994,6 +995,8 @@ 
int do_pipe(int *fd) err_fdr: put_unused_fd(fdr); err_read_pipe: + dput(fr->f_dentry); + mntput(fr->f_vfsmnt); put_filp(fr); err_write_pipe: free_write_pipe(fw); diff --git a/fs/pnode.c b/fs/pnode.c index da42ee61c1d..56aacead836 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -6,7 +6,7 @@ * Author : Ram Pai (linuxram@us.ibm.com) * */ -#include <linux/namespace.h> +#include <linux/mnt_namespace.h> #include <linux/mount.h> #include <linux/fs.h> #include "pnode.h" diff --git a/fs/pnode.h b/fs/pnode.h index 020e1bb60fd..d45bd8ec36b 100644 --- a/fs/pnode.h +++ b/fs/pnode.h @@ -13,7 +13,7 @@ #define IS_MNT_SHARED(mnt) (mnt->mnt_flags & MNT_SHARED) #define IS_MNT_SLAVE(mnt) (mnt->mnt_master) -#define IS_MNT_NEW(mnt) (!mnt->mnt_namespace) +#define IS_MNT_NEW(mnt) (!mnt->mnt_ns) #define CLEAR_MNT_SHARED(mnt) (mnt->mnt_flags &= ~MNT_SHARED) #define IS_MNT_UNBINDABLE(mnt) (mnt->mnt_flags & MNT_UNBINDABLE) diff --git a/fs/proc/array.c b/fs/proc/array.c index 25e917fb473..70e4fab117b 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -346,20 +346,13 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole) sigemptyset(&sigcatch); cutime = cstime = utime = stime = cputime_zero; - mutex_lock(&tty_mutex); rcu_read_lock(); if (lock_task_sighand(task, &flags)) { struct signal_struct *sig = task->signal; - struct tty_struct *tty = sig->tty; - - if (tty) { - /* - * sig->tty is not stable, but tty_mutex - * protects us from release_dev(tty) - */ - barrier(); - tty_pgrp = tty->pgrp; - tty_nr = new_encode_dev(tty_devnum(tty)); + + if (sig->tty) { + tty_pgrp = sig->tty->pgrp; + tty_nr = new_encode_dev(tty_devnum(sig->tty)); } num_threads = atomic_read(&sig->count); @@ -388,14 +381,13 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole) stime = cputime_add(stime, sig->stime); } - sid = sig->session; + sid = signal_session(sig); pgid = process_group(task); ppid = rcu_dereference(task->real_parent)->tgid; unlock_task_sighand(task, &flags); } 
rcu_read_unlock(); - mutex_unlock(&tty_mutex); if (!whole || num_threads<2) wchan = get_wchan(task); diff --git a/fs/proc/base.c b/fs/proc/base.c index b859fc749c0..1a979ea3b37 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -59,7 +59,7 @@ #include <linux/string.h> #include <linux/seq_file.h> #include <linux/namei.h> -#include <linux/namespace.h> +#include <linux/mnt_namespace.h> #include <linux/mm.h> #include <linux/smp_lock.h> #include <linux/rcupdate.h> @@ -365,33 +365,35 @@ struct proc_mounts { static int mounts_open(struct inode *inode, struct file *file) { struct task_struct *task = get_proc_task(inode); - struct namespace *namespace = NULL; + struct mnt_namespace *ns = NULL; struct proc_mounts *p; int ret = -EINVAL; if (task) { task_lock(task); - namespace = task->nsproxy->namespace; - if (namespace) - get_namespace(namespace); + if (task->nsproxy) { + ns = task->nsproxy->mnt_ns; + if (ns) + get_mnt_ns(ns); + } task_unlock(task); put_task_struct(task); } - if (namespace) { + if (ns) { ret = -ENOMEM; p = kmalloc(sizeof(struct proc_mounts), GFP_KERNEL); if (p) { file->private_data = &p->m; ret = seq_open(file, &mounts_op); if (!ret) { - p->m.private = namespace; - p->event = namespace->event; + p->m.private = ns; + p->event = ns->event; return 0; } kfree(p); } - put_namespace(namespace); + put_mnt_ns(ns); } return ret; } @@ -399,15 +401,15 @@ static int mounts_open(struct inode *inode, struct file *file) static int mounts_release(struct inode *inode, struct file *file) { struct seq_file *m = file->private_data; - struct namespace *namespace = m->private; - put_namespace(namespace); + struct mnt_namespace *ns = m->private; + put_mnt_ns(ns); return seq_release(inode, file); } static unsigned mounts_poll(struct file *file, poll_table *wait) { struct proc_mounts *p = file->private_data; - struct namespace *ns = p->m.private; + struct mnt_namespace *ns = p->m.private; unsigned res = 0; poll_wait(file, &ns->poll, wait); @@ -437,21 +439,21 @@ static int 
mountstats_open(struct inode *inode, struct file *file) if (!ret) { struct seq_file *m = file->private_data; - struct namespace *namespace = NULL; + struct mnt_namespace *mnt_ns = NULL; struct task_struct *task = get_proc_task(inode); if (task) { task_lock(task); if (task->nsproxy) - namespace = task->nsproxy->namespace; - if (namespace) - get_namespace(namespace); + mnt_ns = task->nsproxy->mnt_ns; + if (mnt_ns) + get_mnt_ns(mnt_ns); task_unlock(task); put_task_struct(task); } - if (namespace) - m->private = namespace; + if (mnt_ns) + m->private = mnt_ns; else { seq_release(inode, file); ret = -EINVAL; @@ -472,7 +474,7 @@ static struct file_operations proc_mountstats_operations = { static ssize_t proc_info_read(struct file * file, char __user * buf, size_t count, loff_t *ppos) { - struct inode * inode = file->f_dentry->d_inode; + struct inode * inode = file->f_path.dentry->d_inode; unsigned long page; ssize_t length; struct task_struct *task = get_proc_task(inode); @@ -512,7 +514,7 @@ static int mem_open(struct inode* inode, struct file* file) static ssize_t mem_read(struct file * file, char __user * buf, size_t count, loff_t *ppos) { - struct task_struct *task = get_proc_task(file->f_dentry->d_inode); + struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); char *page; unsigned long src = *ppos; int ret = -ESRCH; @@ -584,7 +586,7 @@ static ssize_t mem_write(struct file * file, const char * buf, { int copied; char *page; - struct task_struct *task = get_proc_task(file->f_dentry->d_inode); + struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); unsigned long dst = *ppos; copied = -ESRCH; @@ -654,7 +656,7 @@ static struct file_operations proc_mem_operations = { static ssize_t oom_adjust_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { - struct task_struct *task = get_proc_task(file->f_dentry->d_inode); + struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); char buffer[PROC_NUMBUF]; size_t 
len; int oom_adjust; @@ -694,7 +696,7 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, return -EINVAL; if (*end == '\n') end++; - task = get_proc_task(file->f_dentry->d_inode); + task = get_proc_task(file->f_path.dentry->d_inode); if (!task) return -ESRCH; if (oom_adjust < task->oomkilladj && !capable(CAP_SYS_RESOURCE)) { @@ -718,7 +720,7 @@ static struct file_operations proc_oom_adjust_operations = { static ssize_t proc_loginuid_read(struct file * file, char __user * buf, size_t count, loff_t *ppos) { - struct inode * inode = file->f_dentry->d_inode; + struct inode * inode = file->f_path.dentry->d_inode; struct task_struct *task = get_proc_task(inode); ssize_t length; char tmpbuf[TMPBUFLEN]; @@ -734,7 +736,7 @@ static ssize_t proc_loginuid_read(struct file * file, char __user * buf, static ssize_t proc_loginuid_write(struct file * file, const char __user * buf, size_t count, loff_t *ppos) { - struct inode * inode = file->f_dentry->d_inode; + struct inode * inode = file->f_path.dentry->d_inode; char *page, *tmp; ssize_t length; uid_t loginuid; @@ -853,6 +855,65 @@ static struct file_operations proc_seccomp_operations = { }; #endif /* CONFIG_SECCOMP */ +#ifdef CONFIG_FAULT_INJECTION +static ssize_t proc_fault_inject_read(struct file * file, char __user * buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task = get_proc_task(file->f_dentry->d_inode); + char buffer[PROC_NUMBUF]; + size_t len; + int make_it_fail; + loff_t __ppos = *ppos; + + if (!task) + return -ESRCH; + make_it_fail = task->make_it_fail; + put_task_struct(task); + + len = snprintf(buffer, sizeof(buffer), "%i\n", make_it_fail); + if (__ppos >= len) + return 0; + if (count > len-__ppos) + count = len-__ppos; + if (copy_to_user(buf, buffer + __ppos, count)) + return -EFAULT; + *ppos = __ppos + count; + return count; +} + +static ssize_t proc_fault_inject_write(struct file * file, + const char __user * buf, size_t count, loff_t *ppos) +{ + struct task_struct *task; + 
char buffer[PROC_NUMBUF], *end; + int make_it_fail; + + if (!capable(CAP_SYS_RESOURCE)) + return -EPERM; + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + if (copy_from_user(buffer, buf, count)) + return -EFAULT; + make_it_fail = simple_strtol(buffer, &end, 0); + if (*end == '\n') + end++; + task = get_proc_task(file->f_dentry->d_inode); + if (!task) + return -ESRCH; + task->make_it_fail = make_it_fail; + put_task_struct(task); + if (end - buffer == 0) + return -EIO; + return end - buffer; +} + +static struct file_operations proc_fault_inject_operations = { + .read = proc_fault_inject_read, + .write = proc_fault_inject_write, +}; +#endif + static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd) { struct inode *inode = dentry->d_inode; @@ -1078,7 +1139,7 @@ static int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir, char *name, int len, instantiate_t instantiate, struct task_struct *task, void *ptr) { - struct dentry *child, *dir = filp->f_dentry; + struct dentry *child, *dir = filp->f_path.dentry; struct inode *inode; struct qstr qname; ino_t ino = 0; @@ -1157,8 +1218,8 @@ static int proc_fd_link(struct inode *inode, struct dentry **dentry, struct vfsm spin_lock(&files->file_lock); file = fcheck_files(files, fd); if (file) { - *mnt = mntget(file->f_vfsmnt); - *dentry = dget(file->f_dentry); + *mnt = mntget(file->f_path.mnt); + *dentry = dget(file->f_path.dentry); spin_unlock(&files->file_lock); put_files_struct(files); return 0; @@ -1293,7 +1354,7 @@ static int proc_fd_fill_cache(struct file *filp, void *dirent, filldir_t filldir static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir) { - struct dentry *dentry = filp->f_dentry; + struct dentry *dentry = filp->f_path.dentry; struct inode *inode = dentry->d_inode; struct task_struct *p = get_proc_task(inode); unsigned int fd, tid, ino; @@ -1440,7 +1501,7 @@ static int proc_pident_readdir(struct file 
*filp, { int i; int pid; - struct dentry *dentry = filp->f_dentry; + struct dentry *dentry = filp->f_path.dentry; struct inode *inode = dentry->d_inode; struct task_struct *task = get_proc_task(inode); struct pid_entry *p, *last; @@ -1496,7 +1557,7 @@ out_no_task: static ssize_t proc_pid_attr_read(struct file * file, char __user * buf, size_t count, loff_t *ppos) { - struct inode * inode = file->f_dentry->d_inode; + struct inode * inode = file->f_path.dentry->d_inode; unsigned long page; ssize_t length; struct task_struct *task = get_proc_task(inode); @@ -1512,7 +1573,7 @@ static ssize_t proc_pid_attr_read(struct file * file, char __user * buf, goto out; length = security_getprocattr(task, - (char*)file->f_dentry->d_name.name, + (char*)file->f_path.dentry->d_name.name, (void*)page, count); if (length >= 0) length = simple_read_from_buffer(buf, count, ppos, (char *)page, length); @@ -1526,7 +1587,7 @@ out_no_task: static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf, size_t count, loff_t *ppos) { - struct inode * inode = file->f_dentry->d_inode; + struct inode * inode = file->f_path.dentry->d_inode; char *page; ssize_t length; struct task_struct *task = get_proc_task(inode); @@ -1552,7 +1613,7 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf, goto out_free; length = security_setprocattr(task, - (char*)file->f_dentry->d_name.name, + (char*)file->f_path.dentry->d_name.name, (void*)page, count); out_free: free_page((unsigned long) page); @@ -1745,6 +1806,27 @@ static int proc_base_fill_cache(struct file *filp, void *dirent, filldir_t filld proc_base_instantiate, task, p); } +#ifdef CONFIG_TASK_IO_ACCOUNTING +static int proc_pid_io_accounting(struct task_struct *task, char *buffer) +{ + return sprintf(buffer, + "rchar: %llu\n" + "wchar: %llu\n" + "syscr: %llu\n" + "syscw: %llu\n" + "read_bytes: %llu\n" + "write_bytes: %llu\n" + "cancelled_write_bytes: %llu\n", + (unsigned long long)task->rchar, + (unsigned long 
long)task->wchar, + (unsigned long long)task->syscr, + (unsigned long long)task->syscw, + (unsigned long long)task->ioac.read_bytes, + (unsigned long long)task->ioac.write_bytes, + (unsigned long long)task->ioac.cancelled_write_bytes); +} +#endif + /* * Thread groups */ @@ -1793,6 +1875,12 @@ static struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_AUDITSYSCALL REG("loginuid", S_IWUSR|S_IRUGO, loginuid), #endif +#ifdef CONFIG_FAULT_INJECTION + REG("make-it-fail", S_IRUGO|S_IWUSR, fault_inject), +#endif +#ifdef CONFIG_TASK_IO_ACCOUNTING + INF("io", S_IRUGO, pid_io_accounting), +#endif }; static int proc_tgid_base_readdir(struct file * filp, @@ -1994,7 +2082,7 @@ static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldi int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) { unsigned int nr = filp->f_pos - FIRST_PROCESS_ENTRY; - struct task_struct *reaper = get_proc_task(filp->f_dentry->d_inode); + struct task_struct *reaper = get_proc_task(filp->f_path.dentry->d_inode); struct task_struct *task; int tgid; @@ -2068,6 +2156,9 @@ static struct pid_entry tid_base_stuff[] = { #ifdef CONFIG_AUDITSYSCALL REG("loginuid", S_IWUSR|S_IRUGO, loginuid), #endif +#ifdef CONFIG_FAULT_INJECTION + REG("make-it-fail", S_IRUGO|S_IWUSR, fault_inject), +#endif }; static int proc_tid_base_readdir(struct file * filp, @@ -2235,15 +2326,25 @@ static int proc_task_fill_cache(struct file *filp, void *dirent, filldir_t filld /* for the /proc/TGID/task/ directories */ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldir) { - struct dentry *dentry = filp->f_dentry; + struct dentry *dentry = filp->f_path.dentry; struct inode *inode = dentry->d_inode; - struct task_struct *leader = get_proc_task(inode); + struct task_struct *leader = NULL; struct task_struct *task; int retval = -ENOENT; ino_t ino; int tid; unsigned long pos = filp->f_pos; /* avoiding "long long" filp->f_pos */ + task = get_proc_task(inode); + if (!task) + 
goto out_no_task; + rcu_read_lock(); + if (pid_alive(task)) { + leader = task->group_leader; + get_task_struct(leader); + } + rcu_read_unlock(); + put_task_struct(task); if (!leader) goto out_no_task; retval = 0; diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 4ba03009cf7..853cb877d5f 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -52,7 +52,7 @@ static ssize_t proc_file_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) { - struct inode * inode = file->f_dentry->d_inode; + struct inode * inode = file->f_path.dentry->d_inode; char *page; ssize_t retval=0; int eof=0; @@ -203,7 +203,7 @@ static ssize_t proc_file_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file->f_path.dentry->d_inode; struct proc_dir_entry * dp; dp = PDE(inode); @@ -432,7 +432,7 @@ int proc_readdir(struct file * filp, struct proc_dir_entry * de; unsigned int ino; int i; - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = filp->f_path.dentry->d_inode; int ret = 0; lock_kernel(); @@ -453,7 +453,7 @@ int proc_readdir(struct file * filp, /* fall through */ case 1: if (filldir(dirent, "..", 2, i, - parent_ino(filp->f_dentry), + parent_ino(filp->f_path.dentry), DT_DIR) < 0) goto out; i++; @@ -558,7 +558,7 @@ static void proc_kill_inodes(struct proc_dir_entry *de) file_list_lock(); list_for_each(p, &sb->s_files) { struct file * filp = list_entry(p, struct file, f_u.fu_list); - struct dentry * dentry = filp->f_dentry; + struct dentry * dentry = filp->f_path.dentry; struct inode * inode; const struct file_operations *fops; diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c index d7dbdf9e0f4..5ec67257e5f 100644 --- a/fs/proc/nommu.c +++ b/fs/proc/nommu.c @@ -46,7 +46,7 @@ int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) file = vma->vm_file; if (file) { - struct inode *inode = vma->vm_file->f_dentry->d_inode; + struct inode 
*inode = vma->vm_file->f_path.dentry->d_inode; dev = inode->i_sb->s_dev; ino = inode->i_ino; } @@ -67,7 +67,7 @@ int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) if (len < 1) len = 1; seq_printf(m, "%*c", len, ' '); - seq_path(m, file->f_vfsmnt, file->f_dentry, ""); + seq_path(m, file->f_path.mnt, file->f_path.dentry, ""); } seq_putc(m, '\n'); diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index 51815cece6f..b37ce33f67e 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -39,13 +39,14 @@ #include <linux/seq_file.h> #include <linux/times.h> #include <linux/profile.h> +#include <linux/utsname.h> #include <linux/blkdev.h> #include <linux/hugetlb.h> #include <linux/jiffies.h> #include <linux/sysrq.h> #include <linux/vmalloc.h> #include <linux/crash_dump.h> -#include <linux/pspace.h> +#include <linux/pid_namespace.h> #include <asm/uaccess.h> #include <asm/pgtable.h> #include <asm/io.h> @@ -92,7 +93,7 @@ static int loadavg_read_proc(char *page, char **start, off_t off, LOAD_INT(a), LOAD_FRAC(a), LOAD_INT(b), LOAD_FRAC(b), LOAD_INT(c), LOAD_FRAC(c), - nr_running(), nr_threads, init_pspace.last_pid); + nr_running(), nr_threads, current->nsproxy->pid_ns->last_pid); return proc_calc_metrics(page, start, off, count, eof, len); } @@ -252,8 +253,10 @@ static int version_read_proc(char *page, char **start, off_t off, { int len; - strcpy(page, linux_banner); - len = strlen(page); + len = snprintf(page, PAGE_SIZE, linux_proc_banner, + utsname()->sysname, + utsname()->release, + utsname()->version); return proc_calc_metrics(page, start, off, count, eof, len); } diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 6b769afac55..55ade0d1562 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -94,8 +94,8 @@ int proc_exe_link(struct inode *inode, struct dentry **dentry, struct vfsmount * } if (vma) { - *mnt = mntget(vma->vm_file->f_vfsmnt); - *dentry = dget(vma->vm_file->f_dentry); + *mnt = mntget(vma->vm_file->f_path.mnt); + *dentry = 
dget(vma->vm_file->f_path.dentry); result = 0; } @@ -135,7 +135,7 @@ static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats int len; if (file) { - struct inode *inode = vma->vm_file->f_dentry->d_inode; + struct inode *inode = vma->vm_file->f_path.dentry->d_inode; dev = inode->i_sb->s_dev; ino = inode->i_ino; } @@ -156,7 +156,7 @@ static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats */ if (file) { pad_len_spaces(m, len); - seq_path(m, file->f_vfsmnt, file->f_dentry, "\n"); + seq_path(m, file->f_path.mnt, file->f_path.dentry, "\n"); } else { const char *name = arch_vma_name(vma); if (!name) { diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 091aa8e48e0..fcc5caf93f5 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -126,8 +126,8 @@ int proc_exe_link(struct inode *inode, struct dentry **dentry, struct vfsmount * } if (vma) { - *mnt = mntget(vma->vm_file->f_vfsmnt); - *dentry = dget(vma->vm_file->f_dentry); + *mnt = mntget(vma->vm_file->f_path.mnt); + *dentry = dget(vma->vm_file->f_path.dentry); result = 0; } diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c index 0d7103fa0df..c94db1db7a7 100644 --- a/fs/qnx4/dir.c +++ b/fs/qnx4/dir.c @@ -22,7 +22,7 @@ static int qnx4_readdir(struct file *filp, void *dirent, filldir_t filldir) { - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = filp->f_path.dentry->d_inode; unsigned int offset; struct buffer_head *bh; struct qnx4_inode_entry *de; diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c index 0947fb57dcf..54ebbc84207 100644 --- a/fs/ramfs/file-mmu.c +++ b/fs/ramfs/file-mmu.c @@ -25,11 +25,13 @@ */ #include <linux/fs.h> +#include <linux/mm.h> const struct address_space_operations ramfs_aops = { .readpage = simple_readpage, .prepare_write = simple_prepare_write, - .commit_write = simple_commit_write + .commit_write = simple_commit_write, + .set_page_dirty = __set_page_dirty_nobuffers, }; const struct file_operations 
ramfs_file_operations = { diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index bfe5dbf1002..e9d6c473328 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -11,6 +11,7 @@ #include <linux/module.h> #include <linux/fs.h> +#include <linux/mm.h> #include <linux/pagemap.h> #include <linux/highmem.h> #include <linux/init.h> @@ -30,7 +31,8 @@ static int ramfs_nommu_setattr(struct dentry *, struct iattr *); const struct address_space_operations ramfs_aops = { .readpage = simple_readpage, .prepare_write = simple_prepare_write, - .commit_write = simple_commit_write + .commit_write = simple_commit_write, + .set_page_dirty = __set_page_dirty_nobuffers, }; const struct file_operations ramfs_file_operations = { @@ -232,7 +234,7 @@ unsigned long ramfs_nommu_get_unmapped_area(struct file *file, unsigned long pgoff, unsigned long flags) { unsigned long maxpages, lpages, nr, loop, ret; - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file->f_path.dentry->d_inode; struct page **pages = NULL, **ptr, *page; loff_t isize; diff --git a/fs/read_write.c b/fs/read_write.c index f792000a28e..707ac21700d 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -64,13 +64,13 @@ loff_t remote_llseek(struct file *file, loff_t offset, int origin) lock_kernel(); switch (origin) { case 2: - offset += i_size_read(file->f_dentry->d_inode); + offset += i_size_read(file->f_path.dentry->d_inode); break; case 1: offset += file->f_pos; } retval = -EINVAL; - if (offset>=0 && offset<=file->f_dentry->d_inode->i_sb->s_maxbytes) { + if (offset>=0 && offset<=file->f_path.dentry->d_inode->i_sb->s_maxbytes) { if (offset != file->f_pos) { file->f_pos = offset; file->f_version = 0; @@ -95,7 +95,7 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin) lock_kernel(); switch (origin) { case 2: - offset += i_size_read(file->f_dentry->d_inode); + offset += i_size_read(file->f_path.dentry->d_inode); break; case 1: offset += file->f_pos; @@ -203,7 +203,7 @@ 
int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count if (unlikely((pos < 0) || (loff_t) (pos + count) < 0)) goto Einval; - inode = file->f_dentry->d_inode; + inode = file->f_path.dentry->d_inode; if (unlikely(inode->i_flock && MANDATORY_LOCK(inode))) { int retval = locks_mandatory_area( read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE, @@ -273,7 +273,7 @@ ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos) else ret = do_sync_read(file, buf, count, pos); if (ret > 0) { - fsnotify_access(file->f_dentry); + fsnotify_access(file->f_path.dentry); current->rchar += ret; } current->syscr++; @@ -331,7 +331,7 @@ ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_ else ret = do_sync_write(file, buf, count, pos); if (ret > 0) { - fsnotify_modify(file->f_dentry); + fsnotify_modify(file->f_path.dentry); current->wchar += ret; } current->syscw++; @@ -450,8 +450,6 @@ unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to) return seg; } -EXPORT_UNUSED_SYMBOL(iov_shorten); /* June 2006 */ - ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov, unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn) { @@ -628,9 +626,9 @@ out: kfree(iov); if ((ret + (type == READ)) > 0) { if (type == READ) - fsnotify_access(file->f_dentry); + fsnotify_access(file->f_path.dentry); else - fsnotify_modify(file->f_dentry); + fsnotify_modify(file->f_path.dentry); } return ret; } @@ -722,7 +720,7 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, if (!(in_file->f_mode & FMODE_READ)) goto fput_in; retval = -EINVAL; - in_inode = in_file->f_dentry->d_inode; + in_inode = in_file->f_path.dentry->d_inode; if (!in_inode) goto fput_in; if (!in_file->f_op || !in_file->f_op->sendfile) @@ -754,7 +752,7 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, retval = -EINVAL; if (!out_file->f_op || !out_file->f_op->sendpage) goto fput_out; - 
out_inode = out_file->f_dentry->d_inode; + out_inode = out_file->f_path.dentry->d_inode; retval = rw_verify_area(WRITE, out_file, &out_file->f_pos, count); if (retval < 0) goto fput_out; diff --git a/fs/readdir.c b/fs/readdir.c index bff3ee58e2f..f39f5b31325 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -21,7 +21,7 @@ int vfs_readdir(struct file *file, filldir_t filler, void *buf) { - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file->f_path.dentry->d_inode; int res = -ENOTDIR; if (!file->f_op || !file->f_op->readdir) goto out; diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c index e3d466a228d..b286ccb0858 100644 --- a/fs/reiserfs/bitmap.c +++ b/fs/reiserfs/bitmap.c @@ -708,7 +708,7 @@ static void oid_groups(reiserfs_blocknr_hint_t * hint) */ static int get_left_neighbor(reiserfs_blocknr_hint_t * hint) { - struct path *path; + struct treepath *path; struct buffer_head *bh; struct item_head *ih; int pos_in_item; diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c index 657050ad743..96a2f8889da 100644 --- a/fs/reiserfs/dir.c +++ b/fs/reiserfs/dir.c @@ -45,7 +45,7 @@ static int reiserfs_dir_fsync(struct file *filp, struct dentry *dentry, // static int reiserfs_readdir(struct file *filp, void *dirent, filldir_t filldir) { - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = filp->f_path.dentry->d_inode; struct cpu_key pos_key; /* key of current position in the directory (key of directory entry) */ INITIALIZE_PATH(path_to_entry); struct buffer_head *bh; @@ -135,7 +135,7 @@ static int reiserfs_readdir(struct file *filp, void *dirent, filldir_t filldir) /* Ignore the .reiserfs_priv entry */ if (reiserfs_xattrs(inode->i_sb) && !old_format_only(inode->i_sb) && - filp->f_dentry == inode->i_sb->s_root && + filp->f_path.dentry == inode->i_sb->s_root && REISERFS_SB(inode->i_sb)->priv_root && REISERFS_SB(inode->i_sb)->priv_root->d_inode && deh_objectid(deh) == diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index 
373d862c3f8..5109f1d5e7f 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -48,6 +48,11 @@ static int reiserfs_file_release(struct inode *inode, struct file *filp) } mutex_lock(&inode->i_mutex); + + mutex_lock(&(REISERFS_I(inode)->i_mmap)); + if (REISERFS_I(inode)->i_flags & i_ever_mapped) + REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask; + reiserfs_write_lock(inode->i_sb); /* freeing preallocation only involves relogging blocks that * are already in the current transaction. preallocation gets @@ -100,11 +105,24 @@ static int reiserfs_file_release(struct inode *inode, struct file *filp) err = reiserfs_truncate_file(inode, 0); } out: + mutex_unlock(&(REISERFS_I(inode)->i_mmap)); mutex_unlock(&inode->i_mutex); reiserfs_write_unlock(inode->i_sb); return err; } +static int reiserfs_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct inode *inode; + + inode = file->f_path.dentry->d_inode; + mutex_lock(&(REISERFS_I(inode)->i_mmap)); + REISERFS_I(inode)->i_flags |= i_ever_mapped; + mutex_unlock(&(REISERFS_I(inode)->i_mmap)); + + return generic_file_mmap(file, vma); +} + static void reiserfs_vfs_truncate_file(struct inode *inode) { reiserfs_truncate_file(inode, 1); @@ -1288,7 +1306,7 @@ static ssize_t reiserfs_file_write(struct file *file, /* the file we are going t loff_t pos; // Current position in the file. ssize_t res; // return value of various functions that we call. int err = 0; - struct inode *inode = file->f_dentry->d_inode; // Inode of the file that we are writing to. + struct inode *inode = file->f_path.dentry->d_inode; // Inode of the file that we are writing to. 
/* To simplify coding at this time, we store locked pages in array for now */ struct page *prepared_pages[REISERFS_WRITE_PAGES_AT_A_TIME]; @@ -1335,7 +1353,7 @@ static ssize_t reiserfs_file_write(struct file *file, /* the file we are going t if (count == 0) goto out; - res = remove_suid(file->f_dentry); + res = remove_suid(file->f_path.dentry); if (res) goto out; @@ -1527,7 +1545,7 @@ const struct file_operations reiserfs_file_operations = { #ifdef CONFIG_COMPAT .compat_ioctl = reiserfs_compat_ioctl, #endif - .mmap = generic_file_mmap, + .mmap = reiserfs_file_mmap, .open = generic_file_open, .release = reiserfs_file_release, .fsync = reiserfs_sync_file, diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c index 6d0e554daa9..0ee35c6c9b7 100644 --- a/fs/reiserfs/fix_node.c +++ b/fs/reiserfs/fix_node.c @@ -957,7 +957,7 @@ static int get_far_parent(struct tree_balance *p_s_tb, { struct buffer_head *p_s_parent; INITIALIZE_PATH(s_path_to_neighbor_father); - struct path *p_s_path = p_s_tb->tb_path; + struct treepath *p_s_path = p_s_tb->tb_path; struct cpu_key s_lr_father_key; int n_counter, n_position = INT_MAX, @@ -1074,7 +1074,7 @@ static int get_far_parent(struct tree_balance *p_s_tb, */ static int get_parents(struct tree_balance *p_s_tb, int n_h) { - struct path *p_s_path = p_s_tb->tb_path; + struct treepath *p_s_path = p_s_tb->tb_path; int n_position, n_ret_value, n_path_offset = PATH_H_PATH_OFFSET(p_s_tb->tb_path, n_h); @@ -1885,7 +1885,7 @@ static int check_balance(int mode, static int get_direct_parent(struct tree_balance *p_s_tb, int n_h) { struct buffer_head *p_s_bh; - struct path *p_s_path = p_s_tb->tb_path; + struct treepath *p_s_path = p_s_tb->tb_path; int n_position, n_path_offset = PATH_H_PATH_OFFSET(p_s_tb->tb_path, n_h); diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 254239e6f9e..9fcbfe31697 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -207,7 +207,7 @@ static int file_capable(struct inode *inode, long block) } 
/*static*/ int restart_transaction(struct reiserfs_transaction_handle *th, - struct inode *inode, struct path *path) + struct inode *inode, struct treepath *path) { struct super_block *s = th->t_super; int len = th->t_blocks_allocated; @@ -570,7 +570,7 @@ static inline int _allocate_block(struct reiserfs_transaction_handle *th, long block, struct inode *inode, b_blocknr_t * allocated_block_nr, - struct path *path, int flags) + struct treepath *path, int flags) { BUG_ON(!th->t_trans_id); @@ -1107,7 +1107,7 @@ static inline ulong to_fake_used_blocks(struct inode *inode, int sd_size) // // called by read_locked_inode -static void init_inode(struct inode *inode, struct path *path) +static void init_inode(struct inode *inode, struct treepath *path) { struct buffer_head *bh; struct item_head *ih; @@ -1125,6 +1125,7 @@ static void init_inode(struct inode *inode, struct path *path) REISERFS_I(inode)->i_prealloc_count = 0; REISERFS_I(inode)->i_trans_id = 0; REISERFS_I(inode)->i_jl = NULL; + mutex_init(&(REISERFS_I(inode)->i_mmap)); reiserfs_init_acl_access(inode); reiserfs_init_acl_default(inode); reiserfs_init_xattr_rwsem(inode); @@ -1284,7 +1285,7 @@ static void inode2sd_v1(void *sd, struct inode *inode, loff_t size) /* NOTE, you must prepare the buffer head before sending it here, ** and then log it after the call */ -static void update_stat_data(struct path *path, struct inode *inode, +static void update_stat_data(struct treepath *path, struct inode *inode, loff_t size) { struct buffer_head *bh; @@ -1653,7 +1654,7 @@ int reiserfs_write_inode(struct inode *inode, int do_sync) containing "." and ".." 
entries */ static int reiserfs_new_directory(struct reiserfs_transaction_handle *th, struct inode *inode, - struct item_head *ih, struct path *path, + struct item_head *ih, struct treepath *path, struct inode *dir) { struct super_block *sb = th->t_super; @@ -1712,7 +1713,7 @@ static int reiserfs_new_directory(struct reiserfs_transaction_handle *th, containing the body of symlink */ static int reiserfs_new_symlink(struct reiserfs_transaction_handle *th, struct inode *inode, /* Inode of symlink */ struct item_head *ih, - struct path *path, const char *symname, + struct treepath *path, const char *symname, int item_len) { struct super_block *sb = th->t_super; @@ -1832,6 +1833,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, REISERFS_I(inode)->i_attrs = REISERFS_I(dir)->i_attrs & REISERFS_INHERIT_MASK; sd_attrs_to_i_attrs(REISERFS_I(inode)->i_attrs, inode); + mutex_init(&(REISERFS_I(inode)->i_mmap)); reiserfs_init_acl_access(inode); reiserfs_init_acl_default(inode); reiserfs_init_xattr_rwsem(inode); diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c index 9c57578cb83..b484d2913c0 100644 --- a/fs/reiserfs/ioctl.c +++ b/fs/reiserfs/ioctl.c @@ -99,7 +99,7 @@ int reiserfs_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, long reiserfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file->f_path.dentry->d_inode; int ret; /* These are just misnamed, they actually get/put from/to user an int */ diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index abde1edc223..23f5cd5bbf5 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -54,7 +54,7 @@ static int bin_search_in_dir_item(struct reiserfs_dir_entry *de, loff_t off) // comment? maybe something like set de to point to what the path points to? 
static inline void set_de_item_location(struct reiserfs_dir_entry *de, - struct path *path) + struct treepath *path) { de->de_bh = get_last_bh(path); de->de_ih = get_ih(path); @@ -113,7 +113,7 @@ entry position in the item /* The function is NOT SCHEDULE-SAFE! */ int search_by_entry_key(struct super_block *sb, const struct cpu_key *key, - struct path *path, struct reiserfs_dir_entry *de) + struct treepath *path, struct reiserfs_dir_entry *de) { int retval; @@ -282,7 +282,7 @@ static int linear_search_in_dir_item(struct cpu_key *key, // may return NAME_FOUND, NAME_FOUND_INVISIBLE, NAME_NOT_FOUND // FIXME: should add something like IOERROR static int reiserfs_find_entry(struct inode *dir, const char *name, int namelen, - struct path *path_to_entry, + struct treepath *path_to_entry, struct reiserfs_dir_entry *de) { struct cpu_key key_to_search; diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c index c533ec1bcae..ecc9943202f 100644 --- a/fs/reiserfs/procfs.c +++ b/fs/reiserfs/procfs.c @@ -295,7 +295,7 @@ static int show_oidmap(struct seq_file *m, struct super_block *sb) } #if defined( REISERFS_USE_OIDMAPF ) if (sb_info->oidmap.use_file && (sb_info->oidmap.mapf != NULL)) { - loff_t size = sb_info->oidmap.mapf->f_dentry->d_inode->i_size; + loff_t size = sb_info->oidmap.mapf->f_path.dentry->d_inode->i_size; total_used += size / sizeof(reiserfs_oidinterval_d_t); } #endif diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c index 5240abe1a70..afb21ea4530 100644 --- a/fs/reiserfs/stree.c +++ b/fs/reiserfs/stree.c @@ -244,7 +244,7 @@ static const struct reiserfs_key MAX_KEY = { of the path, and going upwards. We must check the path's validity at each step. If the key is not in the path, there is no delimiting key in the tree (buffer is first or last buffer in tree), and in this case we return a special key, either MIN_KEY or MAX_KEY. 
*/ -static inline const struct reiserfs_key *get_lkey(const struct path +static inline const struct reiserfs_key *get_lkey(const struct treepath *p_s_chk_path, const struct super_block *p_s_sb) @@ -290,7 +290,7 @@ static inline const struct reiserfs_key *get_lkey(const struct path } /* Get delimiting key of the buffer at the path and its right neighbor. */ -inline const struct reiserfs_key *get_rkey(const struct path *p_s_chk_path, +inline const struct reiserfs_key *get_rkey(const struct treepath *p_s_chk_path, const struct super_block *p_s_sb) { int n_position, n_path_offset = p_s_chk_path->path_length; @@ -337,7 +337,7 @@ inline const struct reiserfs_key *get_rkey(const struct path *p_s_chk_path, the path. These delimiting keys are stored at least one level above that buffer in the tree. If the buffer is the first or last node in the tree order then one of the delimiting keys may be absent, and in this case get_lkey and get_rkey return a special key which is MIN_KEY or MAX_KEY. */ -static inline int key_in_buffer(struct path *p_s_chk_path, /* Path which should be checked. */ +static inline int key_in_buffer(struct treepath *p_s_chk_path, /* Path which should be checked. */ const struct cpu_key *p_s_key, /* Key which should be checked. */ struct super_block *p_s_sb /* Super block pointer. */ ) @@ -374,7 +374,7 @@ inline void decrement_bcount(struct buffer_head *p_s_bh) } /* Decrement b_count field of the all buffers in the path. 
*/ -void decrement_counters_in_path(struct path *p_s_search_path) +void decrement_counters_in_path(struct treepath *p_s_search_path) { int n_path_offset = p_s_search_path->path_length; @@ -391,7 +391,7 @@ void decrement_counters_in_path(struct path *p_s_search_path) p_s_search_path->path_length = ILLEGAL_PATH_ELEMENT_OFFSET; } -int reiserfs_check_path(struct path *p) +int reiserfs_check_path(struct treepath *p) { RFALSE(p->path_length != ILLEGAL_PATH_ELEMENT_OFFSET, "path not properly relsed"); @@ -403,7 +403,7 @@ int reiserfs_check_path(struct path *p) ** ** only called from fix_nodes() */ -void pathrelse_and_restore(struct super_block *s, struct path *p_s_search_path) +void pathrelse_and_restore(struct super_block *s, struct treepath *p_s_search_path) { int n_path_offset = p_s_search_path->path_length; @@ -421,7 +421,7 @@ void pathrelse_and_restore(struct super_block *s, struct path *p_s_search_path) } /* Release all buffers in the path. */ -void pathrelse(struct path *p_s_search_path) +void pathrelse(struct treepath *p_s_search_path) { int n_path_offset = p_s_search_path->path_length; @@ -602,7 +602,7 @@ static void search_by_key_reada(struct super_block *s, correctness of the bottom of the path */ /* The function is NOT SCHEDULE-SAFE! */ int search_by_key(struct super_block *p_s_sb, const struct cpu_key *p_s_key, /* Key to search. */ - struct path *p_s_search_path, /* This structure was + struct treepath *p_s_search_path,/* This structure was allocated and initialized by the calling function. It is filled up @@ -813,7 +813,7 @@ int search_by_key(struct super_block *p_s_sb, const struct cpu_key *p_s_key, /* /* The function is NOT SCHEDULE-SAFE! */ int search_for_position_by_key(struct super_block *p_s_sb, /* Pointer to the super block. */ const struct cpu_key *p_cpu_key, /* Key to search (cpu variable) */ - struct path *p_s_search_path /* Filled up by this function. */ + struct treepath *p_s_search_path /* Filled up by this function. 
*/ ) { struct item_head *p_le_ih; /* pointer to on-disk structure */ @@ -884,7 +884,7 @@ int search_for_position_by_key(struct super_block *p_s_sb, /* Pointer to the sup } /* Compare given item and item pointed to by the path. */ -int comp_items(const struct item_head *stored_ih, const struct path *p_s_path) +int comp_items(const struct item_head *stored_ih, const struct treepath *p_s_path) { struct buffer_head *p_s_bh; struct item_head *ih; @@ -911,7 +911,7 @@ int comp_items(const struct item_head *stored_ih, const struct path *p_s_path) #define block_in_use(bh) (buffer_locked(bh) || (held_by_others(bh))) // prepare for delete or cut of direct item -static inline int prepare_for_direct_item(struct path *path, +static inline int prepare_for_direct_item(struct treepath *path, struct item_head *le_ih, struct inode *inode, loff_t new_file_length, int *cut_size) @@ -952,7 +952,7 @@ static inline int prepare_for_direct_item(struct path *path, return M_CUT; /* Cut from this item. */ } -static inline int prepare_for_direntry_item(struct path *path, +static inline int prepare_for_direntry_item(struct treepath *path, struct item_head *le_ih, struct inode *inode, loff_t new_file_length, @@ -987,7 +987,7 @@ static inline int prepare_for_direntry_item(struct path *path, In case of file truncate calculate whether this item must be deleted/truncated or last unformatted node of this item will be converted to a direct item. This function returns a determination of what balance mode the calling function should employ. 
*/ -static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th, struct inode *inode, struct path *p_s_path, const struct cpu_key *p_s_item_key, int *p_n_removed, /* Number of unformatted nodes which were removed +static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th, struct inode *inode, struct treepath *p_s_path, const struct cpu_key *p_s_item_key, int *p_n_removed, /* Number of unformatted nodes which were removed from end of the file. */ int *p_n_cut_size, unsigned long long n_new_file_length /* MAX_KEY_OFFSET in case of delete. */ ) @@ -1125,7 +1125,7 @@ static int calc_deleted_bytes_number(struct tree_balance *p_s_tb, char c_mode) static void init_tb_struct(struct reiserfs_transaction_handle *th, struct tree_balance *p_s_tb, struct super_block *p_s_sb, - struct path *p_s_path, int n_size) + struct treepath *p_s_path, int n_size) { BUG_ON(!th->t_trans_id); @@ -1176,7 +1176,7 @@ char head2type(struct item_head *ih) #endif /* Delete object item. */ -int reiserfs_delete_item(struct reiserfs_transaction_handle *th, struct path *p_s_path, /* Path to the deleted item. */ +int reiserfs_delete_item(struct reiserfs_transaction_handle *th, struct treepath *p_s_path, /* Path to the deleted item. */ const struct cpu_key *p_s_item_key, /* Key to search for the deleted item. 
*/ struct inode *p_s_inode, /* inode is here just to update i_blocks and quotas */ struct buffer_head *p_s_un_bh) @@ -1459,7 +1459,7 @@ static void unmap_buffers(struct page *page, loff_t pos) bh = next; } while (bh != head); if (PAGE_SIZE == bh->b_size) { - clear_page_dirty(page); + cancel_dirty_page(page, PAGE_CACHE_SIZE); } } } @@ -1468,7 +1468,7 @@ static void unmap_buffers(struct page *page, loff_t pos) static int maybe_indirect_to_direct(struct reiserfs_transaction_handle *th, struct inode *p_s_inode, struct page *page, - struct path *p_s_path, + struct treepath *p_s_path, const struct cpu_key *p_s_item_key, loff_t n_new_file_size, char *p_c_mode) { @@ -1503,7 +1503,7 @@ static int maybe_indirect_to_direct(struct reiserfs_transaction_handle *th, pointer being converted. Therefore we have to delete inserted direct item(s) */ static void indirect_to_direct_roll_back(struct reiserfs_transaction_handle *th, - struct inode *inode, struct path *path) + struct inode *inode, struct treepath *path) { struct cpu_key tail_key; int tail_len; @@ -1545,7 +1545,7 @@ static void indirect_to_direct_roll_back(struct reiserfs_transaction_handle *th, /* (Truncate or cut entry) or delete object item. Returns < 0 on failure */ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th, - struct path *p_s_path, + struct treepath *p_s_path, struct cpu_key *p_s_item_key, struct inode *p_s_inode, struct page *page, loff_t n_new_file_size) @@ -1920,7 +1920,7 @@ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th, struct inode *p #ifdef CONFIG_REISERFS_CHECK // this makes sure, that we __append__, not overwrite or add holes -static void check_research_for_paste(struct path *path, +static void check_research_for_paste(struct treepath *path, const struct cpu_key *p_s_key) { struct item_head *found_ih = get_ih(path); @@ -1954,7 +1954,7 @@ static void check_research_for_paste(struct path *path, #endif /* config reiserfs check */ /* Paste bytes to the existing item. 
Returns bytes number pasted into the item. */ -int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct path *p_s_search_path, /* Path to the pasted item. */ +int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct treepath *p_s_search_path, /* Path to the pasted item. */ const struct cpu_key *p_s_key, /* Key to search for the needed item. */ struct inode *inode, /* Inode item belongs to */ const char *p_c_body, /* Pointer to the bytes to paste. */ @@ -2036,7 +2036,7 @@ int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct path } /* Insert new item into the buffer at the path. */ -int reiserfs_insert_item(struct reiserfs_transaction_handle *th, struct path *p_s_path, /* Path to the inserteded item. */ +int reiserfs_insert_item(struct reiserfs_transaction_handle *th, struct treepath *p_s_path, /* Path to the inserteded item. */ const struct cpu_key *key, struct item_head *p_s_ih, /* Pointer to the item header to insert. */ struct inode *inode, const char *p_c_body) { /* Pointer to the bytes to insert. 
*/ diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 7fb5fb036f9..58ad4551a7c 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -23,7 +23,7 @@ #include <linux/blkdev.h> #include <linux/buffer_head.h> #include <linux/vfs.h> -#include <linux/namespace.h> +#include <linux/mnt_namespace.h> #include <linux/mount.h> #include <linux/namei.h> #include <linux/quotaops.h> diff --git a/fs/reiserfs/tail_conversion.c b/fs/reiserfs/tail_conversion.c index 36f108fc1cf..f8121a1147e 100644 --- a/fs/reiserfs/tail_conversion.c +++ b/fs/reiserfs/tail_conversion.c @@ -15,7 +15,7 @@ /* path points to first direct item of the file regarless of how many of them are there */ int direct2indirect(struct reiserfs_transaction_handle *th, struct inode *inode, - struct path *path, struct buffer_head *unbh, + struct treepath *path, struct buffer_head *unbh, loff_t tail_offset) { struct super_block *sb = inode->i_sb; @@ -171,7 +171,7 @@ void reiserfs_unmap_buffer(struct buffer_head *bh) what we expect from it (number of cut bytes). But when tail remains in the unformatted node, we set mode to SKIP_BALANCING and unlock inode */ -int indirect2direct(struct reiserfs_transaction_handle *th, struct inode *p_s_inode, struct page *page, struct path *p_s_path, /* path to the indirect item. */ +int indirect2direct(struct reiserfs_transaction_handle *th, struct inode *p_s_inode, struct page *page, struct treepath *p_s_path, /* path to the indirect item. */ const struct cpu_key *p_s_item_key, /* Key to look for unformatted node pointer to be cut. */ loff_t n_new_file_size, /* New file size. 
*/ char *p_c_mode) diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 1e4d6859017..f01389fd162 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -274,7 +274,7 @@ static struct file *open_xa_file(const struct inode *inode, const char *name, */ static int __xattr_readdir(struct file *filp, void *dirent, filldir_t filldir) { - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = filp->f_path.dentry->d_inode; struct cpu_key pos_key; /* key of current position in the directory (key of directory entry) */ INITIALIZE_PATH(path_to_entry); struct buffer_head *bh; @@ -420,7 +420,7 @@ static int __xattr_readdir(struct file *filp, void *dirent, filldir_t filldir) static int xattr_readdir(struct file *file, filldir_t filler, void *buf) { - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file->f_path.dentry->d_inode; int res = -ENOTDIR; if (!file->f_op || !file->f_op->readdir) goto out; @@ -508,7 +508,7 @@ reiserfs_xattr_set(struct inode *inode, const char *name, const void *buffer, goto out; } - xinode = fp->f_dentry->d_inode; + xinode = fp->f_path.dentry->d_inode; REISERFS_I(inode)->i_flags |= i_has_xattr_dir; /* we need to copy it off.. 
*/ @@ -527,7 +527,7 @@ reiserfs_xattr_set(struct inode *inode, const char *name, const void *buffer, newattrs.ia_size = buffer_size; newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME; mutex_lock(&xinode->i_mutex); - err = notify_change(fp->f_dentry, &newattrs); + err = notify_change(fp->f_path.dentry, &newattrs); if (err) goto out_filp; @@ -626,7 +626,7 @@ reiserfs_xattr_get(const struct inode *inode, const char *name, void *buffer, goto out; } - xinode = fp->f_dentry->d_inode; + xinode = fp->f_path.dentry->d_inode; isize = xinode->i_size; REISERFS_I(inode)->i_flags |= i_has_xattr_dir; diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index 97ae1b92bc4..5296a29cc5e 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -135,7 +135,7 @@ static void *posix_acl_to_disk(const struct posix_acl *acl, size_t * size) int n; *size = reiserfs_acl_size(acl->a_count); - ext_acl = (reiserfs_acl_header *) kmalloc(sizeof(reiserfs_acl_header) + + ext_acl = kmalloc(sizeof(reiserfs_acl_header) + acl->a_count * sizeof(reiserfs_acl_entry), GFP_NOFS); diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c index c5af088d4a4..d3e243a6f60 100644 --- a/fs/romfs/inode.c +++ b/fs/romfs/inode.c @@ -276,7 +276,7 @@ static unsigned char romfs_dtype_table[] = { static int romfs_readdir(struct file *filp, void *dirent, filldir_t filldir) { - struct inode *i = filp->f_dentry->d_inode; + struct inode *i = filp->f_path.dentry->d_inode; struct romfs_inode ri; unsigned long offset, maxoff; int j, ino, nextfh; diff --git a/fs/select.c b/fs/select.c index dcbc1112b7e..fe0893afd93 100644 --- a/fs/select.c +++ b/fs/select.c @@ -311,7 +311,7 @@ static int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, { fd_set_bits fds; void *bits; - int ret, max_fdset; + int ret, max_fds; unsigned int size; struct fdtable *fdt; /* Allocate small arguments on the stack to save memory and be faster */ @@ -321,13 +321,13 @@ static int core_sys_select(int n, fd_set __user *inp, fd_set __user 
*outp, if (n < 0) goto out_nofds; - /* max_fdset can increase, so grab it once to avoid race */ + /* max_fds can increase, so grab it once to avoid race */ rcu_read_lock(); fdt = files_fdtable(current->files); - max_fdset = fdt->max_fdset; + max_fds = fdt->max_fds; rcu_read_unlock(); - if (n > max_fdset) - n = max_fdset; + if (n > max_fds) + n = max_fds; /* * We need 6 bitmaps (in/out/ex for both incoming and outgoing), diff --git a/fs/seq_file.c b/fs/seq_file.c index 10690aa401c..0ac22af7afe 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -269,7 +269,7 @@ EXPORT_SYMBOL(seq_lseek); /** * seq_release - free the structures associated with sequential file. * @file: file in question - * @inode: file->f_dentry->d_inode + * @inode: file->f_path.dentry->d_inode * * Frees the structures associated with sequential file; can be used * as ->f_op->release() if you don't have private data to destroy. diff --git a/fs/smbfs/cache.c b/fs/smbfs/cache.c index 74b86d9725a..8182f0542a2 100644 --- a/fs/smbfs/cache.c +++ b/fs/smbfs/cache.c @@ -125,7 +125,7 @@ smb_fill_cache(struct file *filp, void *dirent, filldir_t filldir, struct smb_cache_control *ctrl, struct qstr *qname, struct smb_fattr *entry) { - struct dentry *newdent, *dentry = filp->f_dentry; + struct dentry *newdent, *dentry = filp->f_path.dentry; struct inode *newino, *inode = dentry->d_inode; struct smb_cache_control ctl = *ctrl; int valid = 0; diff --git a/fs/smbfs/dir.c b/fs/smbfs/dir.c index 70d9c5a37f5..b1e58d1ac9c 100644 --- a/fs/smbfs/dir.c +++ b/fs/smbfs/dir.c @@ -78,7 +78,7 @@ struct inode_operations smb_dir_inode_operations_unix = static int smb_readdir(struct file *filp, void *dirent, filldir_t filldir) { - struct dentry *dentry = filp->f_dentry; + struct dentry *dentry = filp->f_path.dentry; struct inode *dir = dentry->d_inode; struct smb_sb_info *server = server_from_dentry(dentry); union smb_dir_cache *cache = NULL; @@ -238,12 +238,12 @@ out: static int smb_dir_open(struct inode *dir, struct file *file) { - 
struct dentry *dentry = file->f_dentry; + struct dentry *dentry = file->f_path.dentry; struct smb_sb_info *server; int error = 0; VERBOSE("(%s/%s)\n", dentry->d_parent->d_name.name, - file->f_dentry->d_name.name); + file->f_path.dentry->d_name.name); /* * Directory timestamps in the core protocol aren't updated diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c index 50784d13c87..e50533a7951 100644 --- a/fs/smbfs/file.c +++ b/fs/smbfs/file.c @@ -102,7 +102,7 @@ static int smb_readpage(struct file *file, struct page *page) { int error; - struct dentry *dentry = file->f_dentry; + struct dentry *dentry = file->f_path.dentry; page_cache_get(page); error = smb_readpage_sync(dentry, page); @@ -205,7 +205,7 @@ static int smb_updatepage(struct file *file, struct page *page, unsigned long offset, unsigned int count) { - struct dentry *dentry = file->f_dentry; + struct dentry *dentry = file->f_path.dentry; DEBUG1("(%s/%s %d@%lld)\n", DENTRY_PATH(dentry), count, ((unsigned long long)page->index << PAGE_CACHE_SHIFT) + offset); @@ -218,7 +218,7 @@ smb_file_aio_read(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { struct file * file = iocb->ki_filp; - struct dentry * dentry = file->f_dentry; + struct dentry * dentry = file->f_path.dentry; ssize_t status; VERBOSE("file %s/%s, count=%lu@%lu\n", DENTRY_PATH(dentry), @@ -243,7 +243,7 @@ out: static int smb_file_mmap(struct file * file, struct vm_area_struct * vma) { - struct dentry * dentry = file->f_dentry; + struct dentry * dentry = file->f_path.dentry; int status; VERBOSE("file %s/%s, address %lu - %lu\n", @@ -264,7 +264,7 @@ static ssize_t smb_file_sendfile(struct file *file, loff_t *ppos, size_t count, read_actor_t actor, void *target) { - struct dentry *dentry = file->f_dentry; + struct dentry *dentry = file->f_path.dentry; ssize_t status; VERBOSE("file %s/%s, pos=%Ld, count=%d\n", @@ -323,7 +323,7 @@ smb_file_aio_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) 
{ struct file * file = iocb->ki_filp; - struct dentry * dentry = file->f_dentry; + struct dentry * dentry = file->f_path.dentry; ssize_t result; VERBOSE("file %s/%s, count=%lu@%lu\n", @@ -355,7 +355,7 @@ static int smb_file_open(struct inode *inode, struct file * file) { int result; - struct dentry *dentry = file->f_dentry; + struct dentry *dentry = file->f_path.dentry; int smb_mode = (file->f_mode & O_ACCMODE) - 1; lock_kernel(); diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c index 4af4cd729a5..84dfe3f3482 100644 --- a/fs/smbfs/inode.c +++ b/fs/smbfs/inode.c @@ -482,12 +482,13 @@ smb_put_super(struct super_block *sb) smb_close_socket(server); if (server->conn_pid) - kill_proc(server->conn_pid, SIGTERM, 1); + kill_pid(server->conn_pid, SIGTERM, 1); kfree(server->ops); smb_unload_nls(server); sb->s_fs_info = NULL; smb_unlock_server(server); + put_pid(server->conn_pid); kfree(server); } @@ -530,7 +531,7 @@ static int smb_fill_super(struct super_block *sb, void *raw_data, int silent) INIT_LIST_HEAD(&server->xmitq); INIT_LIST_HEAD(&server->recvq); server->conn_error = 0; - server->conn_pid = 0; + server->conn_pid = NULL; server->state = CONN_INVALID; /* no connection yet */ server->generation = 0; diff --git a/fs/smbfs/proc.c b/fs/smbfs/proc.c index 40e174db987..feac4605061 100644 --- a/fs/smbfs/proc.c +++ b/fs/smbfs/proc.c @@ -873,11 +873,11 @@ smb_newconn(struct smb_sb_info *server, struct smb_conn_opt *opt) filp = fget(opt->fd); if (!filp) goto out; - if (!smb_valid_socket(filp->f_dentry->d_inode)) + if (!smb_valid_socket(filp->f_path.dentry->d_inode)) goto out_putf; server->sock_file = filp; - server->conn_pid = current->pid; + server->conn_pid = get_pid(task_pid(current)); server->opt = *opt; server->generation += 1; server->state = CONN_VALID; @@ -898,7 +898,7 @@ smb_newconn(struct smb_sb_info *server, struct smb_conn_opt *opt) /* * Store the server in sock user_data (Only used by sunrpc) */ - sk = SOCKET_I(filp->f_dentry->d_inode)->sk; + sk = 
SOCKET_I(filp->f_path.dentry->d_inode)->sk; sk->sk_user_data = server; /* chain into the data_ready callback */ @@ -971,8 +971,8 @@ smb_newconn(struct smb_sb_info *server, struct smb_conn_opt *opt) } VERBOSE("protocol=%d, max_xmit=%d, pid=%d capabilities=0x%x\n", - server->opt.protocol, server->opt.max_xmit, server->conn_pid, - server->opt.capabilities); + server->opt.protocol, server->opt.max_xmit, + pid_nr(server->conn_pid), server->opt.capabilities); /* FIXME: this really should be done by smbmount. */ if (server->opt.max_xmit > SMB_MAX_PACKET_SIZE) { @@ -1939,7 +1939,7 @@ static int smb_proc_readdir_short(struct file *filp, void *dirent, filldir_t filldir, struct smb_cache_control *ctl) { - struct dentry *dir = filp->f_dentry; + struct dentry *dir = filp->f_path.dentry; struct smb_sb_info *server = server_from_dentry(dir); struct qstr qname; struct smb_fattr fattr; @@ -2291,7 +2291,7 @@ static int smb_proc_readdir_long(struct file *filp, void *dirent, filldir_t filldir, struct smb_cache_control *ctl) { - struct dentry *dir = filp->f_dentry; + struct dentry *dir = filp->f_path.dentry; struct smb_sb_info *server = server_from_dentry(dir); struct qstr qname; struct smb_fattr fattr; @@ -2859,7 +2859,7 @@ static int smb_proc_readdir_null(struct file *filp, void *dirent, filldir_t filldir, struct smb_cache_control *ctl) { - struct smb_sb_info *server = server_from_dentry(filp->f_dentry); + struct smb_sb_info *server = server_from_dentry(filp->f_path.dentry); if (smb_proc_ops_wait(server) < 0) return -EIO; diff --git a/fs/smbfs/smbiod.c b/fs/smbfs/smbiod.c index e6754044128..89eaf31f1d4 100644 --- a/fs/smbfs/smbiod.c +++ b/fs/smbfs/smbiod.c @@ -152,7 +152,7 @@ int smbiod_retry(struct smb_sb_info *server) { struct list_head *head; struct smb_request *req; - pid_t pid = server->conn_pid; + struct pid *pid = get_pid(server->conn_pid); int result = 0; VERBOSE("state: %d\n", server->state); @@ -222,7 +222,7 @@ int smbiod_retry(struct smb_sb_info *server) /* * Note: use the 
"priv" flag, as a user process may need to reconnect. */ - result = kill_proc(pid, SIGUSR1, 1); + result = kill_pid(pid, SIGUSR1, 1); if (result) { /* FIXME: this is most likely fatal, umount? */ printk(KERN_ERR "smb_retry: signal failed [%d]\n", result); @@ -233,6 +233,7 @@ int smbiod_retry(struct smb_sb_info *server) /* FIXME: The retried requests should perhaps get a "time boost". */ out: + put_pid(pid); return result; } diff --git a/fs/smbfs/sock.c b/fs/smbfs/sock.c index 6815b1b12b6..92ea6b2367d 100644 --- a/fs/smbfs/sock.c +++ b/fs/smbfs/sock.c @@ -82,10 +82,10 @@ server_sock(struct smb_sb_info *server) if (server && (file = server->sock_file)) { #ifdef SMBFS_PARANOIA - if (!smb_valid_socket(file->f_dentry->d_inode)) + if (!smb_valid_socket(file->f_path.dentry->d_inode)) PARANOIA("bad socket!\n"); #endif - return SOCKET_I(file->f_dentry->d_inode); + return SOCKET_I(file->f_path.dentry->d_inode); } return NULL; } diff --git a/fs/splice.c b/fs/splice.c index da74583a00e..2fca6ebf4cc 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -42,7 +42,7 @@ struct splice_pipe_desc { struct partial_page *partial; /* pages[] may not be contig */ int nr_pages; /* number of pages in map */ unsigned int flags; /* splice flags */ - struct pipe_buf_operations *ops;/* ops associated with output pipe */ + const struct pipe_buf_operations *ops;/* ops associated with output pipe */ }; /* @@ -139,7 +139,7 @@ error: return err; } -static struct pipe_buf_operations page_cache_pipe_buf_ops = { +static const struct pipe_buf_operations page_cache_pipe_buf_ops = { .can_merge = 0, .map = generic_pipe_buf_map, .unmap = generic_pipe_buf_unmap, @@ -159,7 +159,7 @@ static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe, return generic_pipe_buf_steal(pipe, buf); } -static struct pipe_buf_operations user_page_pipe_buf_ops = { +static const struct pipe_buf_operations user_page_pipe_buf_ops = { .can_merge = 0, .map = generic_pipe_buf_map, .unmap = generic_pipe_buf_unmap, @@ -724,7 +724,7 @@ 
static ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, for (;;) { if (pipe->nrbufs) { struct pipe_buffer *buf = pipe->bufs + pipe->curbuf; - struct pipe_buf_operations *ops = buf->ops; + const struct pipe_buf_operations *ops = buf->ops; sd.len = buf->len; if (sd.len > sd.total_len) @@ -844,7 +844,7 @@ generic_file_splice_write_nolock(struct pipe_inode_info *pipe, struct file *out, ssize_t ret; int err; - err = remove_suid(out->f_dentry); + err = remove_suid(out->f_path.dentry); if (unlikely(err)) return err; @@ -890,10 +890,10 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, ssize_t ret; int err; - err = should_remove_suid(out->f_dentry); + err = should_remove_suid(out->f_path.dentry); if (unlikely(err)) { mutex_lock(&inode->i_mutex); - err = __remove_suid(out->f_dentry, err); + err = __remove_suid(out->f_path.dentry, err); mutex_unlock(&inode->i_mutex); if (err) return err; @@ -1008,7 +1008,7 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, * randomly drop data for eg socket -> socket splicing. Use the * piped splicing for that! 
*/ - i_mode = in->f_dentry->d_inode->i_mode; + i_mode = in->f_path.dentry->d_inode->i_mode; if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode))) return -EINVAL; @@ -1132,7 +1132,7 @@ static long do_splice(struct file *in, loff_t __user *off_in, loff_t offset, *off; long ret; - pipe = pipe_info(in->f_dentry->d_inode); + pipe = pipe_info(in->f_path.dentry->d_inode); if (pipe) { if (off_in) return -ESPIPE; @@ -1153,7 +1153,7 @@ static long do_splice(struct file *in, loff_t __user *off_in, return ret; } - pipe = pipe_info(out->f_dentry->d_inode); + pipe = pipe_info(out->f_path.dentry->d_inode); if (pipe) { if (off_out) return -ESPIPE; @@ -1321,7 +1321,7 @@ static long do_vmsplice(struct file *file, const struct iovec __user *iov, .ops = &user_page_pipe_buf_ops, }; - pipe = pipe_info(file->f_dentry->d_inode); + pipe = pipe_info(file->f_path.dentry->d_inode); if (!pipe) return -EBADF; if (unlikely(nr_segs > UIO_MAXIOV)) @@ -1549,8 +1549,8 @@ static int link_pipe(struct pipe_inode_info *ipipe, static long do_tee(struct file *in, struct file *out, size_t len, unsigned int flags) { - struct pipe_inode_info *ipipe = pipe_info(in->f_dentry->d_inode); - struct pipe_inode_info *opipe = pipe_info(out->f_dentry->d_inode); + struct pipe_inode_info *ipipe = pipe_info(in->f_path.dentry->d_inode); + struct pipe_inode_info *opipe = pipe_info(out->f_path.dentry->d_inode); int ret = -EINVAL; /* diff --git a/fs/stack.c b/fs/stack.c new file mode 100644 index 00000000000..8ffb880d2f4 --- /dev/null +++ b/fs/stack.c @@ -0,0 +1,38 @@ +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/fs_stack.h> + +/* does _NOT_ require i_mutex to be held. 
+ * + * This function cannot be inlined since i_size_{read,write} is rather + * heavy-weight on 32-bit systems + */ +void fsstack_copy_inode_size(struct inode *dst, const struct inode *src) +{ + i_size_write(dst, i_size_read((struct inode *)src)); + dst->i_blocks = src->i_blocks; +} +EXPORT_SYMBOL_GPL(fsstack_copy_inode_size); + +/* copy all attributes; get_nlinks is optional way to override the i_nlink + * copying + */ +void fsstack_copy_attr_all(struct inode *dest, const struct inode *src, + int (*get_nlinks)(struct inode *)) +{ + if (!get_nlinks) + dest->i_nlink = src->i_nlink; + else + dest->i_nlink = (*get_nlinks)(dest); + + dest->i_mode = src->i_mode; + dest->i_uid = src->i_uid; + dest->i_gid = src->i_gid; + dest->i_rdev = src->i_rdev; + dest->i_atime = src->i_atime; + dest->i_mtime = src->i_mtime; + dest->i_ctime = src->i_ctime; + dest->i_blkbits = src->i_blkbits; + dest->i_flags = src->i_flags; +} +EXPORT_SYMBOL_GPL(fsstack_copy_attr_all); diff --git a/fs/stat.c b/fs/stat.c index a0ebfc7f8a6..38a8cb2a28d 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -102,7 +102,7 @@ int vfs_fstat(unsigned int fd, struct kstat *stat) int error = -EBADF; if (f) { - error = vfs_getattr(f->f_vfsmnt, f->f_dentry, stat); + error = vfs_getattr(f->f_path.mnt, f->f_path.dentry, stat); fput(f); } return error; diff --git a/fs/super.c b/fs/super.c index 84c320f6ad7..3e7458c2bb7 100644 --- a/fs/super.c +++ b/fs/super.c @@ -570,7 +570,7 @@ static void mark_files_ro(struct super_block *sb) file_list_lock(); list_for_each_entry(f, &sb->s_files, f_u.fu_list) { - if (S_ISREG(f->f_dentry->d_inode->i_mode) && file_count(f)) + if (S_ISREG(f->f_path.dentry->d_inode->i_mode) && file_count(f)) f->f_mode &= ~FMODE_WRITE; } file_list_unlock(); @@ -753,9 +753,9 @@ int get_sb_bdev(struct file_system_type *fs_type, * will protect the lockfs code from trying to start a snapshot * while we are mounting */ - mutex_lock(&bdev->bd_mount_mutex); + down(&bdev->bd_mount_sem); s = sget(fs_type, test_bdev_super, 
set_bdev_super, bdev); - mutex_unlock(&bdev->bd_mount_mutex); + up(&bdev->bd_mount_sem); if (IS_ERR(s)) goto error_s; diff --git a/fs/sync.c b/fs/sync.c index 865f32be386..d0feff61e6a 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -94,7 +94,7 @@ long do_fsync(struct file *file, int datasync) * livelocks in fsync_buffers_list(). */ mutex_lock(&mapping->host->i_mutex); - err = file->f_op->fsync(file, file->f_dentry, datasync); + err = file->f_op->fsync(file, file->f_path.dentry, datasync); if (!ret) ret = err; mutex_unlock(&mapping->host->i_mutex); @@ -223,7 +223,7 @@ asmlinkage long sys_sync_file_range(int fd, loff_t offset, loff_t nbytes, if (!file) goto out; - i_mode = file->f_dentry->d_inode->i_mode; + i_mode = file->f_path.dentry->d_inode->i_mode; ret = -ESPIPE; if (!S_ISREG(i_mode) && !S_ISBLK(i_mode) && !S_ISDIR(i_mode) && !S_ISLNK(i_mode)) diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c index 98022e41cda..d3b9f5f07db 100644 --- a/fs/sysfs/bin.c +++ b/fs/sysfs/bin.c @@ -16,6 +16,7 @@ #include <linux/slab.h> #include <asm/uaccess.h> +#include <asm/semaphore.h> #include "sysfs.h" @@ -35,7 +36,7 @@ static ssize_t read(struct file * file, char __user * userbuf, size_t count, loff_t * off) { char *buffer = file->private_data; - struct dentry *dentry = file->f_dentry; + struct dentry *dentry = file->f_path.dentry; int size = dentry->d_inode->i_size; loff_t offs = *off; int ret; @@ -81,7 +82,7 @@ static ssize_t write(struct file * file, const char __user * userbuf, size_t count, loff_t * off) { char *buffer = file->private_data; - struct dentry *dentry = file->f_dentry; + struct dentry *dentry = file->f_path.dentry; int size = dentry->d_inode->i_size; loff_t offs = *off; @@ -105,7 +106,7 @@ static ssize_t write(struct file * file, const char __user * userbuf, static int mmap(struct file *file, struct vm_area_struct *vma) { - struct dentry *dentry = file->f_dentry; + struct dentry *dentry = file->f_path.dentry; struct bin_attribute *attr = to_bin_attr(dentry); struct kobject 
*kobj = to_kobj(dentry->d_parent); @@ -117,8 +118,8 @@ static int mmap(struct file *file, struct vm_area_struct *vma) static int open(struct inode * inode, struct file * file) { - struct kobject *kobj = sysfs_get_kobject(file->f_dentry->d_parent); - struct bin_attribute * attr = to_bin_attr(file->f_dentry); + struct kobject *kobj = sysfs_get_kobject(file->f_path.dentry->d_parent); + struct bin_attribute * attr = to_bin_attr(file->f_path.dentry); int error = -EINVAL; if (!kobj || !attr) @@ -146,19 +147,18 @@ static int open(struct inode * inode, struct file * file) Error: module_put(attr->attr.owner); Done: - if (error && kobj) + if (error) kobject_put(kobj); return error; } static int release(struct inode * inode, struct file * file) { - struct kobject * kobj = to_kobj(file->f_dentry->d_parent); - struct bin_attribute * attr = to_bin_attr(file->f_dentry); + struct kobject * kobj = to_kobj(file->f_path.dentry->d_parent); + struct bin_attribute * attr = to_bin_attr(file->f_path.dentry); u8 * buffer = file->private_data; - if (kobj) - kobject_put(kobj); + kobject_put(kobj); module_put(attr->attr.owner); kfree(buffer); return 0; diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index a5782e8c7f0..9dcdf556c99 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -9,6 +9,7 @@ #include <linux/module.h> #include <linux/kobject.h> #include <linux/namei.h> +#include <asm/semaphore.h> #include "sysfs.h" DECLARE_RWSEM(sysfs_rename_sem); @@ -32,8 +33,7 @@ static struct dentry_operations sysfs_dentry_ops = { /* * Allocates a new sysfs_dirent and links it to the parent sysfs_dirent */ -static struct sysfs_dirent * sysfs_new_dirent(struct sysfs_dirent * parent_sd, - void * element) +static struct sysfs_dirent * __sysfs_new_dirent(void * element) { struct sysfs_dirent * sd; @@ -45,12 +45,28 @@ static struct sysfs_dirent * sysfs_new_dirent(struct sysfs_dirent * parent_sd, atomic_set(&sd->s_count, 1); atomic_set(&sd->s_event, 1); INIT_LIST_HEAD(&sd->s_children); - list_add(&sd->s_sibling, 
&parent_sd->s_children); + INIT_LIST_HEAD(&sd->s_sibling); sd->s_element = element; return sd; } +static void __sysfs_list_dirent(struct sysfs_dirent *parent_sd, + struct sysfs_dirent *sd) +{ + if (sd) + list_add(&sd->s_sibling, &parent_sd->s_children); +} + +static struct sysfs_dirent * sysfs_new_dirent(struct sysfs_dirent *parent_sd, + void * element) +{ + struct sysfs_dirent *sd; + sd = __sysfs_new_dirent(element); + __sysfs_list_dirent(parent_sd, sd); + return sd; +} + /* * * Return -EEXIST if there is already a sysfs element with the same name for @@ -77,14 +93,14 @@ int sysfs_dirent_exist(struct sysfs_dirent *parent_sd, } -int sysfs_make_dirent(struct sysfs_dirent * parent_sd, struct dentry * dentry, - void * element, umode_t mode, int type) +static struct sysfs_dirent * +__sysfs_make_dirent(struct dentry *dentry, void *element, mode_t mode, int type) { struct sysfs_dirent * sd; - sd = sysfs_new_dirent(parent_sd, element); + sd = __sysfs_new_dirent(element); if (!sd) - return -ENOMEM; + goto out; sd->s_mode = mode; sd->s_type = type; @@ -94,7 +110,19 @@ int sysfs_make_dirent(struct sysfs_dirent * parent_sd, struct dentry * dentry, dentry->d_op = &sysfs_dentry_ops; } - return 0; +out: + return sd; +} + +int sysfs_make_dirent(struct sysfs_dirent * parent_sd, struct dentry * dentry, + void * element, umode_t mode, int type) +{ + struct sysfs_dirent *sd; + + sd = __sysfs_make_dirent(dentry, element, mode, type); + __sysfs_list_dirent(parent_sd, sd); + + return sd ? 0 : -ENOMEM; } static int init_dir(struct inode * inode) @@ -165,11 +193,11 @@ int sysfs_create_subdir(struct kobject * k, const char * n, struct dentry ** d) /** * sysfs_create_dir - create a directory for an object. - * @parent: parent parent object. * @kobj: object we're creating directory for. + * @shadow_parent: parent parent object. 
*/ -int sysfs_create_dir(struct kobject * kobj) +int sysfs_create_dir(struct kobject * kobj, struct dentry *shadow_parent) { struct dentry * dentry = NULL; struct dentry * parent; @@ -177,7 +205,9 @@ int sysfs_create_dir(struct kobject * kobj) BUG_ON(!kobj); - if (kobj->parent) + if (shadow_parent) + parent = shadow_parent; + else if (kobj->parent) parent = kobj->parent->dentry; else if (sysfs_mount && sysfs_mount->mnt_sb) parent = sysfs_mount->mnt_sb->s_root; @@ -298,21 +328,12 @@ void sysfs_remove_subdir(struct dentry * d) } -/** - * sysfs_remove_dir - remove an object's directory. - * @kobj: object. - * - * The only thing special about this is that we remove any files in - * the directory before we remove the directory, and we've inlined - * what used to be sysfs_rmdir() below, instead of calling separately. - */ - -void sysfs_remove_dir(struct kobject * kobj) +static void __sysfs_remove_dir(struct dentry *dentry) { - struct dentry * dentry = dget(kobj->dentry); struct sysfs_dirent * parent_sd; struct sysfs_dirent * sd, * tmp; + dget(dentry); if (!dentry) return; @@ -333,32 +354,60 @@ void sysfs_remove_dir(struct kobject * kobj) * Drop reference from dget() on entrance. */ dput(dentry); +} + +/** + * sysfs_remove_dir - remove an object's directory. + * @kobj: object. + * + * The only thing special about this is that we remove any files in + * the directory before we remove the directory, and we've inlined + * what used to be sysfs_rmdir() below, instead of calling separately. 
+ */ + +void sysfs_remove_dir(struct kobject * kobj) +{ + __sysfs_remove_dir(kobj->dentry); kobj->dentry = NULL; } -int sysfs_rename_dir(struct kobject * kobj, const char *new_name) +int sysfs_rename_dir(struct kobject * kobj, struct dentry *new_parent, + const char *new_name) { int error = 0; - struct dentry * new_dentry, * parent; - - if (!strcmp(kobject_name(kobj), new_name)) - return -EINVAL; + struct dentry * new_dentry; - if (!kobj->parent) - return -EINVAL; + if (!new_parent) + return -EFAULT; down_write(&sysfs_rename_sem); - parent = kobj->parent->dentry; - - mutex_lock(&parent->d_inode->i_mutex); + mutex_lock(&new_parent->d_inode->i_mutex); - new_dentry = lookup_one_len(new_name, parent, strlen(new_name)); + new_dentry = lookup_one_len(new_name, new_parent, strlen(new_name)); if (!IS_ERR(new_dentry)) { - if (!new_dentry->d_inode) { + /* By allowing two different directories with the + * same d_parent we allow this routine to move + * between different shadows of the same directory + */ + if (kobj->dentry->d_parent->d_inode != new_parent->d_inode) + return -EINVAL; + else if (new_dentry->d_parent->d_inode != new_parent->d_inode) + error = -EINVAL; + else if (new_dentry == kobj->dentry) + error = -EINVAL; + else if (!new_dentry->d_inode) { error = kobject_set_name(kobj, "%s", new_name); if (!error) { + struct sysfs_dirent *sd, *parent_sd; + d_add(new_dentry, NULL); d_move(kobj->dentry, new_dentry); + + sd = kobj->dentry->d_fsdata; + parent_sd = new_parent->d_fsdata; + + list_del_init(&sd->s_sibling); + list_add(&sd->s_sibling, &parent_sd->s_children); } else d_drop(new_dentry); @@ -366,7 +415,7 @@ int sysfs_rename_dir(struct kobject * kobj, const char *new_name) error = -EEXIST; dput(new_dentry); } - mutex_unlock(&parent->d_inode->i_mutex); + mutex_unlock(&new_parent->d_inode->i_mutex); up_write(&sysfs_rename_sem); return error; @@ -378,12 +427,10 @@ int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent) struct sysfs_dirent *new_parent_sd, *sd; 
int error; - if (!new_parent) - return -EINVAL; - old_parent_dentry = kobj->parent ? kobj->parent->dentry : sysfs_mount->mnt_sb->s_root; - new_parent_dentry = new_parent->dentry; + new_parent_dentry = new_parent ? + new_parent->dentry : sysfs_mount->mnt_sb->s_root; again: mutex_lock(&old_parent_dentry->d_inode->i_mutex); @@ -419,7 +466,7 @@ out: static int sysfs_dir_open(struct inode *inode, struct file *file) { - struct dentry * dentry = file->f_dentry; + struct dentry * dentry = file->f_path.dentry; struct sysfs_dirent * parent_sd = dentry->d_fsdata; mutex_lock(&dentry->d_inode->i_mutex); @@ -432,7 +479,7 @@ static int sysfs_dir_open(struct inode *inode, struct file *file) static int sysfs_dir_close(struct inode *inode, struct file *file) { - struct dentry * dentry = file->f_dentry; + struct dentry * dentry = file->f_path.dentry; struct sysfs_dirent * cursor = file->private_data; mutex_lock(&dentry->d_inode->i_mutex); @@ -452,7 +499,7 @@ static inline unsigned char dt_type(struct sysfs_dirent *sd) static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir) { - struct dentry *dentry = filp->f_dentry; + struct dentry *dentry = filp->f_path.dentry; struct sysfs_dirent * parent_sd = dentry->d_fsdata; struct sysfs_dirent *cursor = filp->private_data; struct list_head *p, *q = &cursor->s_sibling; @@ -509,7 +556,7 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir) static loff_t sysfs_dir_lseek(struct file * file, loff_t offset, int origin) { - struct dentry * dentry = file->f_dentry; + struct dentry * dentry = file->f_path.dentry; mutex_lock(&dentry->d_inode->i_mutex); switch (origin) { @@ -519,7 +566,7 @@ static loff_t sysfs_dir_lseek(struct file * file, loff_t offset, int origin) if (offset >= 0) break; default: - mutex_unlock(&file->f_dentry->d_inode->i_mutex); + mutex_unlock(&file->f_path.dentry->d_inode->i_mutex); return -EINVAL; } if (offset != file->f_pos) { @@ -547,6 +594,95 @@ static loff_t 
sysfs_dir_lseek(struct file * file, loff_t offset, int origin) return offset; } + +/** + * sysfs_make_shadowed_dir - Setup so a directory can be shadowed + * @kobj: object we're creating shadow of. + */ + +int sysfs_make_shadowed_dir(struct kobject *kobj, + void * (*follow_link)(struct dentry *, struct nameidata *)) +{ + struct inode *inode; + struct inode_operations *i_op; + + inode = kobj->dentry->d_inode; + if (inode->i_op != &sysfs_dir_inode_operations) + return -EINVAL; + + i_op = kmalloc(sizeof(*i_op), GFP_KERNEL); + if (!i_op) + return -ENOMEM; + + memcpy(i_op, &sysfs_dir_inode_operations, sizeof(*i_op)); + i_op->follow_link = follow_link; + + /* Locking of inode->i_op? + * Since setting i_op is a single word write and they + * are atomic we should be ok here. + */ + inode->i_op = i_op; + return 0; +} + +/** + * sysfs_create_shadow_dir - create a shadow directory for an object. + * @kobj: object we're creating directory for. + * + * sysfs_make_shadowed_dir must already have been called on this + * directory. + */ + +struct dentry *sysfs_create_shadow_dir(struct kobject *kobj) +{ + struct sysfs_dirent *sd; + struct dentry *parent, *dir, *shadow; + struct inode *inode; + + dir = kobj->dentry; + inode = dir->d_inode; + parent = dir->d_parent; + shadow = ERR_PTR(-EINVAL); + if (!sysfs_is_shadowed_inode(inode)) + goto out; + + shadow = d_alloc(parent, &dir->d_name); + if (!shadow) + goto nomem; + + sd = __sysfs_make_dirent(shadow, kobj, inode->i_mode, SYSFS_DIR); + if (!sd) + goto nomem; + + d_instantiate(shadow, igrab(inode)); + inc_nlink(inode); + inc_nlink(parent->d_inode); + shadow->d_op = &sysfs_dentry_ops; + + dget(shadow); /* Extra count - pin the dentry in core */ + +out: + return shadow; +nomem: + dput(shadow); + shadow = ERR_PTR(-ENOMEM); + goto out; +} + +/** + * sysfs_remove_shadow_dir - remove an object's directory. 
+ * @shadow: dentry of shadow directory + * + * The only thing special about this is that we remove any files in + * the directory before we remove the directory, and we've inlined + * what used to be sysfs_rmdir() below, instead of calling separately. + */ + +void sysfs_remove_shadow_dir(struct dentry *shadow) +{ + __sysfs_remove_dir(shadow); +} + const struct file_operations sysfs_dir_operations = { .open = sysfs_dir_open, .release = sysfs_dir_close, diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 95c165101c9..c0e117649a4 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -7,6 +7,7 @@ #include <linux/kobject.h> #include <linux/namei.h> #include <linux/poll.h> +#include <linux/list.h> #include <asm/uaccess.h> #include <asm/semaphore.h> @@ -50,17 +51,29 @@ static struct sysfs_ops subsys_sysfs_ops = { .store = subsys_attr_store, }; +/** + * add_to_collection - add buffer to a collection + * @buffer: buffer to be added + * @node inode of set to add to + */ -struct sysfs_buffer { - size_t count; - loff_t pos; - char * page; - struct sysfs_ops * ops; - struct semaphore sem; - int needs_read_fill; - int event; -}; +static inline void +add_to_collection(struct sysfs_buffer *buffer, struct inode *node) +{ + struct sysfs_buffer_collection *set = node->i_private; + mutex_lock(&node->i_mutex); + list_add(&buffer->associates, &set->associates); + mutex_unlock(&node->i_mutex); +} + +static inline void +remove_from_collection(struct sysfs_buffer *buffer, struct inode *node) +{ + mutex_lock(&node->i_mutex); + list_del(&buffer->associates); + mutex_unlock(&node->i_mutex); +} /** * fill_read_buffer - allocate and fill buffer from object. @@ -70,7 +83,8 @@ struct sysfs_buffer { * Allocate @buffer->page, if it hasn't been already, then call the * kobject's show() method to fill the buffer with this attribute's * data. - * This is called only once, on the file's first read. + * This is called only once, on the file's first read unless an error + * is returned. 
*/ static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer) { @@ -88,12 +102,13 @@ static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer buffer->event = atomic_read(&sd->s_event); count = ops->show(kobj,attr,buffer->page); - buffer->needs_read_fill = 0; BUG_ON(count > (ssize_t)PAGE_SIZE); - if (count >= 0) + if (count >= 0) { + buffer->needs_read_fill = 0; buffer->count = count; - else + } else { ret = count; + } return ret; } @@ -153,8 +168,12 @@ sysfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos) ssize_t retval = 0; down(&buffer->sem); + if (buffer->orphaned) { + retval = -ENODEV; + goto out; + } if (buffer->needs_read_fill) { - if ((retval = fill_read_buffer(file->f_dentry,buffer))) + if ((retval = fill_read_buffer(file->f_path.dentry,buffer))) goto out; } pr_debug("%s: count = %zd, ppos = %lld, buf = %s\n", @@ -165,7 +184,6 @@ out: return retval; } - /** * fill_write_buffer - copy buffer from userspace. * @buffer: data buffer for file. 
@@ -243,19 +261,25 @@ sysfs_write_file(struct file *file, const char __user *buf, size_t count, loff_t ssize_t len; down(&buffer->sem); + if (buffer->orphaned) { + len = -ENODEV; + goto out; + } len = fill_write_buffer(buffer, buf, count); if (len > 0) - len = flush_write_buffer(file->f_dentry, buffer, len); + len = flush_write_buffer(file->f_path.dentry, buffer, len); if (len > 0) *ppos += len; +out: up(&buffer->sem); return len; } -static int check_perm(struct inode * inode, struct file * file) +static int sysfs_open_file(struct inode *inode, struct file *file) { - struct kobject *kobj = sysfs_get_kobject(file->f_dentry->d_parent); - struct attribute * attr = to_attr(file->f_dentry); + struct kobject *kobj = sysfs_get_kobject(file->f_path.dentry->d_parent); + struct attribute * attr = to_attr(file->f_path.dentry); + struct sysfs_buffer_collection *set; struct sysfs_buffer * buffer; struct sysfs_ops * ops = NULL; int error = 0; @@ -285,6 +309,18 @@ static int check_perm(struct inode * inode, struct file * file) if (!ops) goto Eaccess; + /* make sure we have a collection to add our buffers to */ + mutex_lock(&inode->i_mutex); + if (!(set = inode->i_private)) { + if (!(set = inode->i_private = kmalloc(sizeof(struct sysfs_buffer_collection), GFP_KERNEL))) { + error = -ENOMEM; + goto Done; + } else { + INIT_LIST_HEAD(&set->associates); + } + } + mutex_unlock(&inode->i_mutex); + /* File needs write support. * The inode's perms must say it's ok, * and we must have a store method. 
@@ -310,9 +346,11 @@ static int check_perm(struct inode * inode, struct file * file) */ buffer = kzalloc(sizeof(struct sysfs_buffer), GFP_KERNEL); if (buffer) { + INIT_LIST_HEAD(&buffer->associates); init_MUTEX(&buffer->sem); buffer->needs_read_fill = 1; buffer->ops = ops; + add_to_collection(buffer, inode); file->private_data = buffer; } else error = -ENOMEM; @@ -325,25 +363,21 @@ static int check_perm(struct inode * inode, struct file * file) error = -EACCES; module_put(attr->owner); Done: - if (error && kobj) + if (error) kobject_put(kobj); return error; } -static int sysfs_open_file(struct inode * inode, struct file * filp) -{ - return check_perm(inode,filp); -} - static int sysfs_release(struct inode * inode, struct file * filp) { - struct kobject * kobj = to_kobj(filp->f_dentry->d_parent); - struct attribute * attr = to_attr(filp->f_dentry); + struct kobject * kobj = to_kobj(filp->f_path.dentry->d_parent); + struct attribute * attr = to_attr(filp->f_path.dentry); struct module * owner = attr->owner; struct sysfs_buffer * buffer = filp->private_data; - if (kobj) - kobject_put(kobj); + if (buffer) + remove_from_collection(buffer, inode); + kobject_put(kobj); /* After this point, attr should not be accessed. 
*/ module_put(owner); @@ -372,8 +406,8 @@ static int sysfs_release(struct inode * inode, struct file * filp) static unsigned int sysfs_poll(struct file *filp, poll_table *wait) { struct sysfs_buffer * buffer = filp->private_data; - struct kobject * kobj = to_kobj(filp->f_dentry->d_parent); - struct sysfs_dirent * sd = filp->f_dentry->d_fsdata; + struct kobject * kobj = to_kobj(filp->f_path.dentry->d_parent); + struct sysfs_dirent * sd = filp->f_path.dentry->d_fsdata; int res = 0; poll_wait(filp, &kobj->poll, wait); @@ -548,7 +582,7 @@ EXPORT_SYMBOL_GPL(sysfs_chmod_file); void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr) { - sysfs_hash_and_remove(kobj->dentry,attr->name); + sysfs_hash_and_remove(kobj->dentry, attr->name); } diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index 122145b0895..b20951c9376 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c @@ -13,6 +13,8 @@ #include <linux/dcache.h> #include <linux/namei.h> #include <linux/err.h> +#include <linux/fs.h> +#include <asm/semaphore.h> #include "sysfs.h" diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index e79e38d52c0..542d2bcc73d 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -13,6 +13,7 @@ #include <linux/backing-dev.h> #include <linux/capability.h> #include <linux/errno.h> +#include <asm/semaphore.h> #include "sysfs.h" extern struct super_block * sysfs_sb; @@ -32,6 +33,16 @@ static struct inode_operations sysfs_inode_operations ={ .setattr = sysfs_setattr, }; +void sysfs_delete_inode(struct inode *inode) +{ + /* Free the shadowed directory inode operations */ + if (sysfs_is_shadowed_inode(inode)) { + kfree(inode->i_op); + inode->i_op = NULL; + } + return generic_delete_inode(inode); +} + int sysfs_setattr(struct dentry * dentry, struct iattr * iattr) { struct inode * inode = dentry->d_inode; @@ -209,6 +220,22 @@ const unsigned char * sysfs_get_name(struct sysfs_dirent *sd) return NULL; } +static inline void orphan_all_buffers(struct inode *node) +{ + struct 
sysfs_buffer_collection *set = node->i_private; + struct sysfs_buffer *buf; + + mutex_lock_nested(&node->i_mutex, I_MUTEX_CHILD); + if (node->i_private) { + list_for_each_entry(buf, &set->associates, associates) { + down(&buf->sem); + buf->orphaned = 1; + up(&buf->sem); + } + } + mutex_unlock(&node->i_mutex); +} + /* * Unhashes the dentry corresponding to given sysfs_dirent @@ -217,16 +244,23 @@ const unsigned char * sysfs_get_name(struct sysfs_dirent *sd) void sysfs_drop_dentry(struct sysfs_dirent * sd, struct dentry * parent) { struct dentry * dentry = sd->s_dentry; + struct inode *inode; if (dentry) { spin_lock(&dcache_lock); spin_lock(&dentry->d_lock); if (!(d_unhashed(dentry) && dentry->d_inode)) { + inode = dentry->d_inode; + spin_lock(&inode->i_lock); + __iget(inode); + spin_unlock(&inode->i_lock); dget_locked(dentry); __d_drop(dentry); spin_unlock(&dentry->d_lock); spin_unlock(&dcache_lock); simple_unlink(parent->d_inode, dentry); + orphan_all_buffers(inode); + iput(inode); } else { spin_unlock(&dentry->d_lock); spin_unlock(&dcache_lock); @@ -248,7 +282,7 @@ int sysfs_hash_and_remove(struct dentry * dir, const char * name) return -ENOENT; parent_sd = dir->d_fsdata; - mutex_lock(&dir->d_inode->i_mutex); + mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { if (!sd->s_element) continue; diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index e503f858fba..f6a87a82488 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -8,6 +8,7 @@ #include <linux/mount.h> #include <linux/pagemap.h> #include <linux/init.h> +#include <asm/semaphore.h> #include "sysfs.h" @@ -18,9 +19,12 @@ struct vfsmount *sysfs_mount; struct super_block * sysfs_sb = NULL; struct kmem_cache *sysfs_dir_cachep; +static void sysfs_clear_inode(struct inode *inode); + static struct super_operations sysfs_ops = { .statfs = simple_statfs, - .drop_inode = generic_delete_inode, + .drop_inode = sysfs_delete_inode, + .clear_inode = 
sysfs_clear_inode, }; static struct sysfs_dirent sysfs_root = { @@ -31,6 +35,11 @@ static struct sysfs_dirent sysfs_root = { .s_iattr = NULL, }; +static void sysfs_clear_inode(struct inode *inode) +{ + kfree(inode->i_private); +} + static int sysfs_fill_super(struct super_block *sb, void *data, int silent) { struct inode *inode; diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index f50e3cc2ded..4869f611192 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -7,6 +7,7 @@ #include <linux/module.h> #include <linux/kobject.h> #include <linux/namei.h> +#include <asm/semaphore.h> #include "sysfs.h" diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index bd7cec295da..fe1cbfd208e 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -2,6 +2,7 @@ extern struct vfsmount * sysfs_mount; extern struct kmem_cache *sysfs_dir_cachep; +extern void sysfs_delete_inode(struct inode *inode); extern struct inode * sysfs_new_inode(mode_t mode, struct sysfs_dirent *); extern int sysfs_create(struct dentry *, int mode, int (*init)(struct inode *)); @@ -33,6 +34,22 @@ struct sysfs_symlink { struct kobject * target_kobj; }; +struct sysfs_buffer { + struct list_head associates; + size_t count; + loff_t pos; + char * page; + struct sysfs_ops * ops; + struct semaphore sem; + int orphaned; + int needs_read_fill; + int event; +}; + +struct sysfs_buffer_collection { + struct list_head associates; +}; + static inline struct kobject * to_kobj(struct dentry * dentry) { struct sysfs_dirent * sd = dentry->d_fsdata; @@ -96,3 +113,7 @@ static inline void sysfs_put(struct sysfs_dirent * sd) release_sysfs_dirent(sd); } +static inline int sysfs_is_shadowed_inode(struct inode *inode) +{ + return S_ISDIR(inode->i_mode) && inode->i_op->follow_link; +} diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c index f2bef962d30..ebf7007fa16 100644 --- a/fs/sysv/dir.c +++ b/fs/sysv/dir.c @@ -70,7 +70,7 @@ fail: static int sysv_readdir(struct file * filp, void * dirent, filldir_t filldir) { unsigned long pos = 
filp->f_pos; - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = filp->f_path.dentry->d_inode; struct super_block *sb = inode->i_sb; unsigned offset = pos & ~PAGE_CACHE_MASK; unsigned long n = pos >> PAGE_CACHE_SHIFT; diff --git a/fs/sysv/super.c b/fs/sysv/super.c index dc9e7dc07fb..6f9707a1b95 100644 --- a/fs/sysv/super.c +++ b/fs/sysv/super.c @@ -528,9 +528,6 @@ static struct file_system_type v7_fs_type = { .fs_flags = FS_REQUIRES_DEV, }; -extern int sysv_init_icache(void) __init; -extern void sysv_destroy_icache(void); - static int __init init_sysv_fs(void) { int error; diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h index 9dcc8212093..dcb18b2171f 100644 --- a/fs/sysv/sysv.h +++ b/fs/sysv/sysv.h @@ -143,6 +143,9 @@ extern int sysv_sync_inode(struct inode *); extern int sysv_sync_file(struct file *, struct dentry *, int); extern void sysv_set_inode(struct inode *, dev_t); extern int sysv_getattr(struct vfsmount *, struct dentry *, struct kstat *); +extern int sysv_init_icache(void); +extern void sysv_destroy_icache(void); + /* dir.c */ extern struct sysv_dir_entry *sysv_find_entry(struct dentry *, struct page **); diff --git a/fs/udf/dir.c b/fs/udf/dir.c index 8c28efa3b8f..2391c9150c4 100644 --- a/fs/udf/dir.c +++ b/fs/udf/dir.c @@ -77,7 +77,7 @@ const struct file_operations udf_dir_operations = { int udf_readdir(struct file *filp, void *dirent, filldir_t filldir) { - struct inode *dir = filp->f_dentry->d_inode; + struct inode *dir = filp->f_path.dentry->d_inode; int result; lock_kernel(); @@ -225,7 +225,7 @@ do_udf_readdir(struct inode * dir, struct file *filp, filldir_t filldir, void *d if ( cfi.fileCharacteristics & FID_FILE_CHAR_PARENT ) { - iblock = parent_ino(filp->f_dentry); + iblock = parent_ino(filp->f_path.dentry); flen = 2; memcpy(fname, "..", flen); dt_type = DT_DIR; diff --git a/fs/udf/file.c b/fs/udf/file.c index 7aedd552cba..d81f2db7b0e 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -108,7 +108,7 @@ static ssize_t 
udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov, { ssize_t retval; struct file *file = iocb->ki_filp; - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file->f_path.dentry->d_inode; int err, pos; size_t count = iocb->ki_left; diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c index b8238147577..638f4c585e8 100644 --- a/fs/ufs/balloc.c +++ b/fs/ufs/balloc.c @@ -227,24 +227,27 @@ failed: * We can come here from ufs_writepage or ufs_prepare_write, * locked_page is argument of these functions, so we already lock it. */ -static void ufs_change_blocknr(struct inode *inode, unsigned int baseblk, +static void ufs_change_blocknr(struct inode *inode, unsigned int beg, unsigned int count, unsigned int oldb, unsigned int newb, struct page *locked_page) { - unsigned int blk_per_page = 1 << (PAGE_CACHE_SHIFT - inode->i_blkbits); - struct address_space *mapping = inode->i_mapping; - pgoff_t index, cur_index = locked_page->index; - unsigned int i, j; + const unsigned mask = (1 << (PAGE_CACHE_SHIFT - inode->i_blkbits)) - 1; + struct address_space * const mapping = inode->i_mapping; + pgoff_t index, cur_index; + unsigned end, pos, j; struct page *page; struct buffer_head *head, *bh; UFSD("ENTER, ino %lu, count %u, oldb %u, newb %u\n", inode->i_ino, count, oldb, newb); + BUG_ON(!locked_page); BUG_ON(!PageLocked(locked_page)); - for (i = 0; i < count; i += blk_per_page) { - index = (baseblk+i) >> (PAGE_CACHE_SHIFT - inode->i_blkbits); + cur_index = locked_page->index; + + for (end = count + beg; beg < end; beg = (beg | mask) + 1) { + index = beg >> (PAGE_CACHE_SHIFT - inode->i_blkbits); if (likely(cur_index != index)) { page = ufs_get_locked_page(mapping, index); @@ -253,21 +256,32 @@ static void ufs_change_blocknr(struct inode *inode, unsigned int baseblk, } else page = locked_page; - j = i; head = page_buffers(page); bh = head; + pos = beg & mask; + for (j = 0; j < pos; ++j) + bh = bh->b_this_page; + j = 0; do { - if (likely(bh->b_blocknr == j + 
oldb && j < count)) { - unmap_underlying_metadata(bh->b_bdev, - bh->b_blocknr); - bh->b_blocknr = newb + j++; - mark_buffer_dirty(bh); + if (buffer_mapped(bh)) { + pos = bh->b_blocknr - oldb; + if (pos < count) { + UFSD(" change from %llu to %llu\n", + (unsigned long long)pos + oldb, + (unsigned long long)pos + newb); + bh->b_blocknr = newb + pos; + unmap_underlying_metadata(bh->b_bdev, + bh->b_blocknr); + mark_buffer_dirty(bh); + ++j; + } } bh = bh->b_this_page; } while (bh != head); - set_page_dirty(page); + if (j) + set_page_dirty(page); if (likely(cur_index != index)) ufs_put_locked_page(page); @@ -275,6 +289,25 @@ static void ufs_change_blocknr(struct inode *inode, unsigned int baseblk, UFSD("EXIT\n"); } +static void ufs_clear_frags(struct inode *inode, sector_t beg, unsigned int n, + int sync) +{ + struct buffer_head *bh; + sector_t end = beg + n; + + for (; beg < end; ++beg) { + bh = sb_getblk(inode->i_sb, beg); + lock_buffer(bh); + memset(bh->b_data, 0, inode->i_sb->s_blocksize); + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + unlock_buffer(bh); + if (IS_SYNC(inode) || sync) + sync_dirty_buffer(bh); + brelse(bh); + } +} + unsigned ufs_new_fragments(struct inode * inode, __fs32 * p, unsigned fragment, unsigned goal, unsigned count, int * err, struct page *locked_page) { @@ -350,6 +383,8 @@ unsigned ufs_new_fragments(struct inode * inode, __fs32 * p, unsigned fragment, *p = cpu_to_fs32(sb, result); *err = 0; UFS_I(inode)->i_lastfrag = max_t(u32, UFS_I(inode)->i_lastfrag, fragment + count); + ufs_clear_frags(inode, result + oldcount, newcount - oldcount, + locked_page != NULL); } unlock_super(sb); UFSD("EXIT, result %u\n", result); @@ -363,6 +398,8 @@ unsigned ufs_new_fragments(struct inode * inode, __fs32 * p, unsigned fragment, if (result) { *err = 0; UFS_I(inode)->i_lastfrag = max_t(u32, UFS_I(inode)->i_lastfrag, fragment + count); + ufs_clear_frags(inode, result + oldcount, newcount - oldcount, + locked_page != NULL); unlock_super(sb); UFSD("EXIT, 
result %u\n", result); return result; @@ -392,6 +429,8 @@ unsigned ufs_new_fragments(struct inode * inode, __fs32 * p, unsigned fragment, } result = ufs_alloc_fragments (inode, cgno, goal, request, err); if (result) { + ufs_clear_frags(inode, result + oldcount, newcount - oldcount, + locked_page != NULL); ufs_change_blocknr(inode, fragment - oldcount, oldcount, tmp, result, locked_page); diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c index 7f0a0aa6358..a6c0ca9f48b 100644 --- a/fs/ufs/dir.c +++ b/fs/ufs/dir.c @@ -106,12 +106,13 @@ static void ufs_check_page(struct page *page) char *kaddr = page_address(page); unsigned offs, rec_len; unsigned limit = PAGE_CACHE_SIZE; + const unsigned chunk_mask = UFS_SB(sb)->s_uspi->s_dirblksize - 1; struct ufs_dir_entry *p; char *error; if ((dir->i_size >> PAGE_CACHE_SHIFT) == page->index) { limit = dir->i_size & ~PAGE_CACHE_MASK; - if (limit & (UFS_SECTOR_SIZE - 1)) + if (limit & chunk_mask) goto Ebadsize; if (!limit) goto out; @@ -126,7 +127,7 @@ static void ufs_check_page(struct page *page) goto Ealign; if (rec_len < UFS_DIR_REC_LEN(ufs_get_de_namlen(sb, p))) goto Enamelen; - if (((offs + rec_len - 1) ^ offs) & ~(UFS_SECTOR_SIZE-1)) + if (((offs + rec_len - 1) ^ offs) & ~chunk_mask) goto Espan; if (fs32_to_cpu(sb, p->d_ino) > (UFS_SB(sb)->s_uspi->s_ipg * UFS_SB(sb)->s_uspi->s_ncg)) @@ -310,6 +311,7 @@ int ufs_add_link(struct dentry *dentry, struct inode *inode) int namelen = dentry->d_name.len; struct super_block *sb = dir->i_sb; unsigned reclen = UFS_DIR_REC_LEN(namelen); + const unsigned int chunk_size = UFS_SB(sb)->s_uspi->s_dirblksize; unsigned short rec_len, name_len; struct page *page = NULL; struct ufs_dir_entry *de; @@ -342,8 +344,8 @@ int ufs_add_link(struct dentry *dentry, struct inode *inode) if ((char *)de == dir_end) { /* We hit i_size */ name_len = 0; - rec_len = UFS_SECTOR_SIZE; - de->d_reclen = cpu_to_fs16(sb, UFS_SECTOR_SIZE); + rec_len = chunk_size; + de->d_reclen = cpu_to_fs16(sb, chunk_size); de->d_ino = 0; goto 
got_it; } @@ -426,12 +428,12 @@ static int ufs_readdir(struct file *filp, void *dirent, filldir_t filldir) { loff_t pos = filp->f_pos; - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = filp->f_path.dentry->d_inode; struct super_block *sb = inode->i_sb; unsigned int offset = pos & ~PAGE_CACHE_MASK; unsigned long n = pos >> PAGE_CACHE_SHIFT; unsigned long npages = ufs_dir_pages(inode); - unsigned chunk_mask = ~(UFS_SECTOR_SIZE - 1); + unsigned chunk_mask = ~(UFS_SB(sb)->s_uspi->s_dirblksize - 1); int need_revalidate = filp->f_version != inode->i_version; unsigned flags = UFS_SB(sb)->s_flags; @@ -511,7 +513,7 @@ int ufs_delete_entry(struct inode *inode, struct ufs_dir_entry *dir, struct super_block *sb = inode->i_sb; struct address_space *mapping = page->mapping; char *kaddr = page_address(page); - unsigned from = ((char*)dir - kaddr) & ~(UFS_SECTOR_SIZE - 1); + unsigned from = ((char*)dir - kaddr) & ~(UFS_SB(sb)->s_uspi->s_dirblksize - 1); unsigned to = ((char*)dir - kaddr) + fs16_to_cpu(sb, dir->d_reclen); struct ufs_dir_entry *pde = NULL; struct ufs_dir_entry *de = (struct ufs_dir_entry *) (kaddr + from); @@ -556,6 +558,7 @@ int ufs_make_empty(struct inode * inode, struct inode *dir) struct super_block * sb = dir->i_sb; struct address_space *mapping = inode->i_mapping; struct page *page = grab_cache_page(mapping, 0); + const unsigned int chunk_size = UFS_SB(sb)->s_uspi->s_dirblksize; struct ufs_dir_entry * de; char *base; int err; @@ -563,7 +566,7 @@ int ufs_make_empty(struct inode * inode, struct inode *dir) if (!page) return -ENOMEM; kmap(page); - err = mapping->a_ops->prepare_write(NULL, page, 0, UFS_SECTOR_SIZE); + err = mapping->a_ops->prepare_write(NULL, page, 0, chunk_size); if (err) { unlock_page(page); goto fail; @@ -584,11 +587,11 @@ int ufs_make_empty(struct inode * inode, struct inode *dir) ((char *)de + fs16_to_cpu(sb, de->d_reclen)); de->d_ino = cpu_to_fs32(sb, dir->i_ino); ufs_set_de_type(sb, de, dir->i_mode); - de->d_reclen = 
cpu_to_fs16(sb, UFS_SECTOR_SIZE - UFS_DIR_REC_LEN(1)); + de->d_reclen = cpu_to_fs16(sb, chunk_size - UFS_DIR_REC_LEN(1)); ufs_set_de_namlen(sb, de, 2); strcpy (de->d_name, ".."); - err = ufs_commit_chunk(page, 0, UFS_SECTOR_SIZE); + err = ufs_commit_chunk(page, 0, chunk_size); fail: kunmap(page); page_cache_release(page); diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index ee1eaa6f4ec..4295ca91cf8 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -156,36 +156,6 @@ out: return ret; } -static void ufs_clear_frag(struct inode *inode, struct buffer_head *bh) -{ - lock_buffer(bh); - memset(bh->b_data, 0, inode->i_sb->s_blocksize); - set_buffer_uptodate(bh); - mark_buffer_dirty(bh); - unlock_buffer(bh); - if (IS_SYNC(inode)) - sync_dirty_buffer(bh); -} - -static struct buffer_head * -ufs_clear_frags(struct inode *inode, sector_t beg, - unsigned int n, sector_t want) -{ - struct buffer_head *res = NULL, *bh; - sector_t end = beg + n; - - for (; beg < end; ++beg) { - bh = sb_getblk(inode->i_sb, beg); - ufs_clear_frag(inode, bh); - if (want != beg) - brelse(bh); - else - res = bh; - } - BUG_ON(!res); - return res; -} - /** * ufs_inode_getfrag() - allocate new fragment(s) * @inode - pointer to inode @@ -272,7 +242,8 @@ repeat: goal = tmp + uspi->s_fpb; tmp = ufs_new_fragments (inode, p, fragment - blockoff, goal, required + blockoff, - err, locked_page); + err, + phys != NULL ? locked_page : NULL); } /* * We will extend last allocated block @@ -280,7 +251,7 @@ repeat: else if (lastblock == block) { tmp = ufs_new_fragments(inode, p, fragment - (blockoff - lastblockoff), fs32_to_cpu(sb, *p), required + (blockoff - lastblockoff), - err, locked_page); + err, phys != NULL ? locked_page : NULL); } else /* (lastblock > block) */ { /* * We will allocate new block before last allocated block @@ -291,7 +262,8 @@ repeat: goal = tmp + uspi->s_fpb; } tmp = ufs_new_fragments(inode, p, fragment - blockoff, - goal, uspi->s_fpb, err, locked_page); + goal, uspi->s_fpb, err, + phys != NULL ? 
locked_page : NULL); } if (!tmp) { if ((!blockoff && *p) || @@ -302,7 +274,7 @@ repeat: } if (!phys) { - result = ufs_clear_frags(inode, tmp, required, tmp + blockoff); + result = sb_getblk(sb, tmp + blockoff); } else { *phys = tmp + blockoff; result = NULL; @@ -403,8 +375,7 @@ repeat: if (!phys) { - result = ufs_clear_frags(inode, tmp, uspi->s_fpb, - tmp + blockoff); + result = sb_getblk(sb, tmp + blockoff); } else { *phys = tmp + blockoff; *new = 1; @@ -469,15 +440,17 @@ int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buffer_head * it much more readable: */ #define GET_INODE_DATABLOCK(x) \ - ufs_inode_getfrag(inode, x, fragment, 1, &err, &phys, &new, bh_result->b_page) + ufs_inode_getfrag(inode, x, fragment, 1, &err, &phys, &new,\ + bh_result->b_page) #define GET_INODE_PTR(x) \ - ufs_inode_getfrag(inode, x, fragment, uspi->s_fpb, &err, NULL, NULL, bh_result->b_page) + ufs_inode_getfrag(inode, x, fragment, uspi->s_fpb, &err, NULL, NULL,\ + bh_result->b_page) #define GET_INDIRECT_DATABLOCK(x) \ ufs_inode_getblock(inode, bh, x, fragment, \ - &err, &phys, &new, bh_result->b_page); + &err, &phys, &new, bh_result->b_page) #define GET_INDIRECT_PTR(x) \ ufs_inode_getblock(inode, bh, x, fragment, \ - &err, NULL, NULL, bh_result->b_page); + &err, NULL, NULL, NULL) if (ptr < UFS_NDIR_FRAGMENT) { bh = GET_INODE_DATABLOCK(ptr); diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 8a8e9382ec0..209be95e9d1 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -649,7 +649,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) kmalloc (sizeof(struct ufs_sb_private_info), GFP_KERNEL); if (!uspi) goto failed; - + uspi->s_dirblksize = UFS_SECTOR_SIZE; super_block_offset=UFS_SBLOCK; /* Keep 2Gig file limit. 
Some UFS variants need to override @@ -718,6 +718,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) break; case UFS_MOUNT_UFSTYPE_NEXTSTEP: + /*TODO: check may be we need set special dir block size?*/ UFSD("ufstype=nextstep\n"); uspi->s_fsize = block_size = 1024; uspi->s_fmask = ~(1024 - 1); @@ -733,6 +734,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) break; case UFS_MOUNT_UFSTYPE_NEXTSTEP_CD: + /*TODO: check may be we need set special dir block size?*/ UFSD("ufstype=nextstep-cd\n"); uspi->s_fsize = block_size = 2048; uspi->s_fmask = ~(2048 - 1); @@ -754,6 +756,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) uspi->s_fshift = 10; uspi->s_sbsize = super_block_size = 2048; uspi->s_sbbase = 0; + uspi->s_dirblksize = 1024; flags |= UFS_DE_44BSD | UFS_UID_44BSD | UFS_ST_44BSD | UFS_CG_44BSD; if (!(sb->s_flags & MS_RDONLY)) { if (!silent) diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c index ea11d04c41a..0437b0a6fe9 100644 --- a/fs/ufs/truncate.c +++ b/fs/ufs/truncate.c @@ -109,10 +109,10 @@ static int ufs_trunc_direct (struct inode * inode) tmp = fs32_to_cpu(sb, *p); if (!tmp ) ufs_panic (sb, "ufs_trunc_direct", "internal error"); + frag2 -= frag1; frag1 = ufs_fragnum (frag1); - frag2 = ufs_fragnum (frag2); - ufs_free_fragments (inode, tmp + frag1, frag2 - frag1); + ufs_free_fragments(inode, tmp + frag1, frag2); mark_inode_dirty(inode); frag_to_free = tmp + frag1; diff --git a/fs/xattr.c b/fs/xattr.c index 0901bdc2ce2..38646132ab0 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -268,7 +268,7 @@ sys_fsetxattr(int fd, char __user *name, void __user *value, f = fget(fd); if (!f) return error; - dentry = f->f_dentry; + dentry = f->f_path.dentry; audit_inode(NULL, dentry->d_inode); error = setxattr(dentry, name, value, size, flags); fput(f); @@ -351,7 +351,7 @@ sys_fgetxattr(int fd, char __user *name, void __user *value, size_t size) f = fget(fd); if (!f) return error; - error = 
getxattr(f->f_dentry, name, value, size); + error = getxattr(f->f_path.dentry, name, value, size); fput(f); return error; } @@ -423,7 +423,7 @@ sys_flistxattr(int fd, char __user *list, size_t size) f = fget(fd); if (!f) return error; - error = listxattr(f->f_dentry, list, size); + error = listxattr(f->f_path.dentry, list, size); fput(f); return error; } @@ -484,7 +484,7 @@ sys_fremovexattr(int fd, char __user *name) f = fget(fd); if (!f) return error; - dentry = f->f_dentry; + dentry = f->f_path.dentry; audit_inode(NULL, dentry->d_inode); error = removexattr(dentry, name); fput(f); diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 8e6b56fc1ca..7b54461695e 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -341,9 +341,9 @@ xfs_start_page_writeback( { ASSERT(PageLocked(page)); ASSERT(!PageWriteback(page)); - set_page_writeback(page); if (clear_dirty) - clear_page_dirty(page); + clear_page_dirty_for_io(page); + set_page_writeback(page); unlock_page(page); if (!buffers) { end_page_writeback(page); @@ -1406,7 +1406,7 @@ xfs_vm_direct_IO( xfs_end_io_direct); } - if (unlikely(ret <= 0 && iocb->private)) + if (unlikely(ret != -EIOCBQUEUED && iocb->private)) xfs_destroy_ioend(iocb->private); return ret; } diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index d93d8dd1958..d26f5cd2ba7 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -55,7 +55,7 @@ __xfs_file_read( loff_t pos) { struct file *file = iocb->ki_filp; - bhv_vnode_t *vp = vn_from_inode(file->f_dentry->d_inode); + bhv_vnode_t *vp = vn_from_inode(file->f_path.dentry->d_inode); BUG_ON(iocb->ki_pos != pos); if (unlikely(file->f_flags & O_DIRECT)) @@ -131,7 +131,7 @@ xfs_file_sendfile( read_actor_t actor, void *target) { - return bhv_vop_sendfile(vn_from_inode(filp->f_dentry->d_inode), + return bhv_vop_sendfile(vn_from_inode(filp->f_path.dentry->d_inode), filp, pos, 0, count, actor, target, NULL); } @@ -143,7 +143,7 
@@ xfs_file_sendfile_invis( read_actor_t actor, void *target) { - return bhv_vop_sendfile(vn_from_inode(filp->f_dentry->d_inode), + return bhv_vop_sendfile(vn_from_inode(filp->f_path.dentry->d_inode), filp, pos, IO_INVIS, count, actor, target, NULL); } @@ -155,7 +155,7 @@ xfs_file_splice_read( size_t len, unsigned int flags) { - return bhv_vop_splice_read(vn_from_inode(infilp->f_dentry->d_inode), + return bhv_vop_splice_read(vn_from_inode(infilp->f_path.dentry->d_inode), infilp, ppos, pipe, len, flags, 0, NULL); } @@ -167,7 +167,7 @@ xfs_file_splice_read_invis( size_t len, unsigned int flags) { - return bhv_vop_splice_read(vn_from_inode(infilp->f_dentry->d_inode), + return bhv_vop_splice_read(vn_from_inode(infilp->f_path.dentry->d_inode), infilp, ppos, pipe, len, flags, IO_INVIS, NULL); } @@ -180,7 +180,7 @@ xfs_file_splice_write( size_t len, unsigned int flags) { - return bhv_vop_splice_write(vn_from_inode(outfilp->f_dentry->d_inode), + return bhv_vop_splice_write(vn_from_inode(outfilp->f_path.dentry->d_inode), pipe, outfilp, ppos, len, flags, 0, NULL); } @@ -192,7 +192,7 @@ xfs_file_splice_write_invis( size_t len, unsigned int flags) { - return bhv_vop_splice_write(vn_from_inode(outfilp->f_dentry->d_inode), + return bhv_vop_splice_write(vn_from_inode(outfilp->f_path.dentry->d_inode), pipe, outfilp, ppos, len, flags, IO_INVIS, NULL); } @@ -212,7 +212,7 @@ xfs_file_close( struct file *filp, fl_owner_t id) { - return -bhv_vop_close(vn_from_inode(filp->f_dentry->d_inode), 0, + return -bhv_vop_close(vn_from_inode(filp->f_path.dentry->d_inode), 0, file_count(filp) > 1 ? 
L_FALSE : L_TRUE, NULL); } @@ -251,7 +251,7 @@ xfs_vm_nopage( unsigned long address, int *type) { - struct inode *inode = area->vm_file->f_dentry->d_inode; + struct inode *inode = area->vm_file->f_path.dentry->d_inode; bhv_vnode_t *vp = vn_from_inode(inode); ASSERT_ALWAYS(vp->v_vfsp->vfs_flag & VFS_DMI); @@ -268,7 +268,7 @@ xfs_file_readdir( filldir_t filldir) { int error = 0; - bhv_vnode_t *vp = vn_from_inode(filp->f_dentry->d_inode); + bhv_vnode_t *vp = vn_from_inode(filp->f_path.dentry->d_inode); uio_t uio; iovec_t iov; int eof = 0; @@ -345,7 +345,7 @@ xfs_file_mmap( vma->vm_ops = &xfs_file_vm_ops; #ifdef CONFIG_XFS_DMAPI - if (vn_from_inode(filp->f_dentry->d_inode)->v_vfsp->vfs_flag & VFS_DMI) + if (vn_from_inode(filp->f_path.dentry->d_inode)->v_vfsp->vfs_flag & VFS_DMI) vma->vm_ops = &xfs_dmapi_file_vm_ops; #endif /* CONFIG_XFS_DMAPI */ @@ -360,7 +360,7 @@ xfs_file_ioctl( unsigned long p) { int error; - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = filp->f_path.dentry->d_inode; bhv_vnode_t *vp = vn_from_inode(inode); error = bhv_vop_ioctl(vp, inode, filp, 0, cmd, (void __user *)p); @@ -382,7 +382,7 @@ xfs_file_ioctl_invis( unsigned long p) { int error; - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = filp->f_path.dentry->d_inode; bhv_vnode_t *vp = vn_from_inode(inode); error = bhv_vop_ioctl(vp, inode, filp, IO_INVIS, cmd, (void __user *)p); @@ -404,7 +404,7 @@ xfs_vm_mprotect( struct vm_area_struct *vma, unsigned int newflags) { - bhv_vnode_t *vp = vn_from_inode(vma->vm_file->f_dentry->d_inode); + bhv_vnode_t *vp = vn_from_inode(vma->vm_file->f_path.dentry->d_inode); int error = 0; if (vp->v_vfsp->vfs_flag & VFS_DMI) { diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c index 74d094829a4..f011c9cd0d6 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/linux-2.6/xfs_ioctl.c @@ -107,9 +107,9 @@ xfs_find_handle( if (!file) return -EBADF; - ASSERT(file->f_dentry); - 
ASSERT(file->f_dentry->d_inode); - inode = igrab(file->f_dentry->d_inode); + ASSERT(file->f_path.dentry); + ASSERT(file->f_path.dentry->d_inode); + inode = igrab(file->f_path.dentry->d_inode); fput(file); break; } @@ -333,10 +333,10 @@ xfs_open_by_handle( } /* Ensure umount returns EBUSY on umounts while this file is open. */ - mntget(parfilp->f_vfsmnt); + mntget(parfilp->f_path.mnt); /* Create file pointer. */ - filp = dentry_open(dentry, parfilp->f_vfsmnt, hreq.oflags); + filp = dentry_open(dentry, parfilp->f_path.mnt, hreq.oflags); if (IS_ERR(filp)) { put_unused_fd(new_fd); return -XFS_ERROR(-PTR_ERR(filp)); diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c index 270db0f3861..b83cebc165f 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl32.c +++ b/fs/xfs/linux-2.6/xfs_ioctl32.c @@ -112,7 +112,7 @@ xfs_compat_ioctl( unsigned cmd, unsigned long arg) { - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file->f_path.dentry->d_inode; bhv_vnode_t *vp = vn_from_inode(inode); int error; diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c index fa842f1c9fa..65e79b471d4 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.c +++ b/fs/xfs/linux-2.6/xfs_lrw.c @@ -805,7 +805,7 @@ start: !capable(CAP_FSETID)) { error = xfs_write_clear_setuid(xip); if (likely(!error)) - error = -remove_suid(file->f_dentry); + error = -remove_suid(file->f_path.dentry); if (unlikely(error)) { xfs_iunlock(xip, iolock); goto out_unlock_mutex; diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index 80562b60fb9..50d0faea371 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -71,7 +71,7 @@ xfs_swapext( /* Pull information for the target fd */ if (((fp = fget((int)sxp->sx_fdtarget)) == NULL) || - ((vp = vn_from_inode(fp->f_dentry->d_inode)) == NULL)) { + ((vp = vn_from_inode(fp->f_path.dentry->d_inode)) == NULL)) { error = XFS_ERROR(EINVAL); goto error0; } @@ -83,7 +83,7 @@ xfs_swapext( } if (((tfp = fget((int)sxp->sx_fdtmp)) == NULL) || - ((tvp = 
vn_from_inode(tfp->f_dentry->d_inode)) == NULL)) { + ((tvp = vn_from_inode(tfp->f_path.dentry->d_inode)) == NULL)) { error = XFS_ERROR(EINVAL); goto error0; } |