diff options
Diffstat (limited to 'fs')
373 files changed, 10982 insertions, 7792 deletions
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h index ed835836e0d..32ef4009d03 100644 --- a/fs/9p/v9fs_vfs.h +++ b/fs/9p/v9fs_vfs.h @@ -40,7 +40,9 @@ extern struct file_system_type v9fs_fs_type; extern const struct address_space_operations v9fs_addr_operations; extern const struct file_operations v9fs_file_operations; +extern const struct file_operations v9fs_file_operations_dotl; extern const struct file_operations v9fs_dir_operations; +extern const struct file_operations v9fs_dir_operations_dotl; extern const struct dentry_operations v9fs_dentry_operations; extern const struct dentry_operations v9fs_cached_dentry_operations; diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index 0adfd64dfce..d61e3b28ce3 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -203,3 +203,11 @@ const struct file_operations v9fs_dir_operations = { .open = v9fs_file_open, .release = v9fs_dir_release, }; + +const struct file_operations v9fs_dir_operations_dotl = { + .read = generic_read_dir, + .llseek = generic_file_llseek, + .readdir = v9fs_dir_readdir, + .open = v9fs_file_open, + .release = v9fs_dir_release, +}; diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index df52d488d2a..25b300e1c9d 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -296,3 +296,14 @@ const struct file_operations v9fs_file_operations = { .mmap = generic_file_readonly_mmap, .fsync = v9fs_file_fsync, }; + +const struct file_operations v9fs_file_operations_dotl = { + .llseek = generic_file_llseek, + .read = v9fs_file_read, + .write = v9fs_file_write, + .open = v9fs_file_open, + .release = v9fs_dir_release, + .lock = v9fs_file_lock, + .mmap = generic_file_readonly_mmap, + .fsync = v9fs_file_fsync, +}; diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index f2434fc9d2c..4331b3b5ee1 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -44,9 +44,12 @@ #include "cache.h" static const struct inode_operations v9fs_dir_inode_operations; -static const struct inode_operations v9fs_dir_inode_operations_ext; +static const struct inode_operations v9fs_dir_inode_operations_dotu; +static const struct inode_operations v9fs_dir_inode_operations_dotl; static const struct inode_operations v9fs_file_inode_operations; +static const struct inode_operations v9fs_file_inode_operations_dotl; static const struct inode_operations v9fs_symlink_inode_operations; +static const struct inode_operations v9fs_symlink_inode_operations_dotl; /** * unixmode2p9mode - convert unix mode bits to plan 9 @@ -253,9 +256,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode) return ERR_PTR(-ENOMEM); } - inode->i_mode = mode; - inode->i_uid = current_fsuid(); - inode->i_gid = current_fsgid(); + inode_init_owner(inode, NULL, mode); inode->i_blocks = 0; inode->i_rdev = 0; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; @@ -275,25 +276,44 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode) init_special_inode(inode, inode->i_mode, inode->i_rdev); break; case S_IFREG: - inode->i_op = &v9fs_file_inode_operations; - inode->i_fop = &v9fs_file_operations; + if (v9fs_proto_dotl(v9ses)) { + inode->i_op = &v9fs_file_inode_operations_dotl; + inode->i_fop = &v9fs_file_operations_dotl; + } else { + inode->i_op = &v9fs_file_inode_operations; + inode->i_fop = &v9fs_file_operations; + } + break; + case S_IFLNK: - if (!v9fs_proto_dotu(v9ses)) { - P9_DPRINTK(P9_DEBUG_ERROR, - "extended modes used w/o 9P2000.u\n"); + if (!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses)) { + P9_DPRINTK(P9_DEBUG_ERROR, "extended modes used with " + "legacy protocol.\n"); err = -EINVAL; goto error; } - inode->i_op = &v9fs_symlink_inode_operations; + + if (v9fs_proto_dotl(v9ses)) + inode->i_op = &v9fs_symlink_inode_operations_dotl; + else + inode->i_op = &v9fs_symlink_inode_operations; + break; case S_IFDIR: inc_nlink(inode); - if (v9fs_proto_dotu(v9ses)) - inode->i_op = &v9fs_dir_inode_operations_ext; + if (v9fs_proto_dotl(v9ses)) + inode->i_op = &v9fs_dir_inode_operations_dotl; + else if (v9fs_proto_dotu(v9ses)) + inode->i_op = &v9fs_dir_inode_operations_dotu; else inode->i_op = &v9fs_dir_inode_operations; - inode->i_fop = &v9fs_dir_operations; + + if (v9fs_proto_dotl(v9ses)) + inode->i_fop = &v9fs_dir_operations_dotl; + else + inode->i_fop = &v9fs_dir_operations; + break; default: P9_DPRINTK(P9_DEBUG_ERROR, "BAD mode 0x%x S_IFMT 0x%x\n", @@ -434,14 +454,12 @@ static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir) { int retval; struct inode *file_inode; - struct v9fs_session_info *v9ses; struct p9_fid *v9fid; P9_DPRINTK(P9_DEBUG_VFS, "inode: %p dentry: %p rmdir: %d\n", dir, file, rmdir); file_inode = file->d_inode; - v9ses = v9fs_inode2v9ses(file_inode); v9fid = v9fs_fid_clone(file); if (IS_ERR(v9fid)) return PTR_ERR(v9fid); @@ -484,12 +502,11 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir, ofid = NULL; fid = NULL; name = (char *) dentry->d_name.name; - dfid = v9fs_fid_clone(dentry->d_parent); + dfid = v9fs_fid_lookup(dentry->d_parent); if (IS_ERR(dfid)) { err = PTR_ERR(dfid); - P9_DPRINTK(P9_DEBUG_VFS, "fid clone failed %d\n", err); - dfid = NULL; - goto error; + P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err); + return ERR_PTR(err); } /* clone a fid to use for creation */ @@ -497,8 +514,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir, if (IS_ERR(ofid)) { err = PTR_ERR(ofid); P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); - ofid = NULL; - goto error; + return ERR_PTR(err); } err = p9_client_fcreate(ofid, name, perm, mode, extension); @@ -508,14 +524,13 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir, } /* now walk from the parent so we can get unopened fid */ - fid = p9_client_walk(dfid, 1, &name, 0); + fid = p9_client_walk(dfid, 1, &name, 1); if (IS_ERR(fid)) { err = PTR_ERR(fid); P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); fid = NULL; goto error; - } else - dfid = NULL; + } /* instantiate inode and assign the unopened fid to the dentry */ inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); @@ -538,9 +553,6 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir, return ofid; error: - if (dfid) - p9_client_clunk(dfid); - if (ofid) p9_client_clunk(ofid); @@ -675,8 +687,8 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, if (IS_ERR(fid)) { result = PTR_ERR(fid); if (result == -ENOENT) { - d_add(dentry, NULL); - return NULL; + inode = NULL; + goto inst_out; } return ERR_PTR(result); @@ -693,7 +705,8 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, if (result < 0) goto error; - if ((fid->qid.version) && (v9ses->cache)) +inst_out: + if (v9ses->cache) dentry->d_op = &v9fs_cached_dentry_operations; else dentry->d_op = &v9fs_dentry_operations; @@ -772,6 +785,13 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, goto clunk_olddir; } + if (v9fs_proto_dotl(v9ses)) { + retval = p9_client_rename(oldfid, newdirfid, + (char *) new_dentry->d_name.name); + if (retval != -ENOSYS) + goto clunk_newdir; + } + /* 9P can only handle file rename in the same directory */ if (memcmp(&olddirfid->qid, &newdirfid->qid, sizeof(newdirfid->qid))) { P9_DPRINTK(P9_DEBUG_ERROR, @@ -1197,6 +1217,8 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) sprintf(name, "c %u %u", MAJOR(rdev), MINOR(rdev)); else if (S_ISFIFO(mode)) *name = 0; + else if (S_ISSOCK(mode)) + *name = 0; else { __putname(name); return -EINVAL; @@ -1208,7 +1230,21 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) return retval; } -static const struct inode_operations v9fs_dir_inode_operations_ext = { +static const struct inode_operations v9fs_dir_inode_operations_dotu = { + .create = v9fs_vfs_create, + .lookup = v9fs_vfs_lookup, + .symlink = v9fs_vfs_symlink, + .link = v9fs_vfs_link, + .unlink = v9fs_vfs_unlink, + .mkdir = v9fs_vfs_mkdir, + .rmdir = v9fs_vfs_rmdir, + .mknod = v9fs_vfs_mknod, + .rename = v9fs_vfs_rename, + .getattr = v9fs_vfs_getattr, + .setattr = v9fs_vfs_setattr, +}; + +static const struct inode_operations v9fs_dir_inode_operations_dotl = { .create = v9fs_vfs_create, .lookup = v9fs_vfs_lookup, .symlink = v9fs_vfs_symlink, @@ -1239,6 +1275,11 @@ static const struct inode_operations v9fs_file_inode_operations = { .setattr = v9fs_vfs_setattr, }; +static const struct inode_operations v9fs_file_inode_operations_dotl = { + .getattr = v9fs_vfs_getattr, + .setattr = v9fs_vfs_setattr, +}; + static const struct inode_operations v9fs_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = v9fs_vfs_follow_link, @@ -1246,3 +1287,11 @@ static const struct inode_operations v9fs_symlink_inode_operations = { .getattr = v9fs_vfs_getattr, .setattr = v9fs_vfs_setattr, }; + +static const struct inode_operations v9fs_symlink_inode_operations_dotl = { + .readlink = generic_readlink, + .follow_link = v9fs_vfs_follow_link, + .put_link = v9fs_vfs_put_link, + .getattr = v9fs_vfs_getattr, + .setattr = v9fs_vfs_setattr, +}; diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 806da5d3b3a..be74d020436 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -38,6 +38,7 @@ #include <linux/idr.h> #include <linux/sched.h> #include <linux/slab.h> +#include <linux/statfs.h> #include <net/9p/9p.h> #include <net/9p/client.h> @@ -45,7 +46,7 @@ #include "v9fs_vfs.h" #include "fid.h" -static const struct super_operations v9fs_super_ops; +static const struct super_operations v9fs_super_ops, v9fs_super_ops_dotl; /** * v9fs_set_super - set the superblock @@ -76,7 +77,10 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses, sb->s_blocksize_bits = fls(v9ses->maxdata - 1); sb->s_blocksize = 1 << sb->s_blocksize_bits; sb->s_magic = V9FS_MAGIC; - sb->s_op = &v9fs_super_ops; + if (v9fs_proto_dotl(v9ses)) + sb->s_op = &v9fs_super_ops_dotl; + else + sb->s_op = &v9fs_super_ops; sb->s_bdi = &v9ses->bdi; sb->s_flags = flags | MS_ACTIVE | MS_SYNCHRONOUS | MS_DIRSYNC | @@ -211,6 +215,42 @@ v9fs_umount_begin(struct super_block *sb) v9fs_session_begin_cancel(v9ses); } +static int v9fs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct v9fs_session_info *v9ses; + struct p9_fid *fid; + struct p9_rstatfs rs; + int res; + + fid = v9fs_fid_lookup(dentry); + if (IS_ERR(fid)) { + res = PTR_ERR(fid); + goto done; + } + + v9ses = v9fs_inode2v9ses(dentry->d_inode); + if (v9fs_proto_dotl(v9ses)) { + res = p9_client_statfs(fid, &rs); + if (res == 0) { + buf->f_type = rs.type; + buf->f_bsize = rs.bsize; + buf->f_blocks = rs.blocks; + buf->f_bfree = rs.bfree; + buf->f_bavail = rs.bavail; + buf->f_files = rs.files; + buf->f_ffree = rs.ffree; + buf->f_fsid.val[0] = rs.fsid & 0xFFFFFFFFUL; + buf->f_fsid.val[1] = (rs.fsid >> 32) & 0xFFFFFFFFUL; + buf->f_namelen = rs.namelen; + } + if (res != -ENOSYS) + goto done; + } + res = simple_statfs(dentry, buf); +done: + return res; +} + static const struct super_operations v9fs_super_ops = { #ifdef CONFIG_9P_FSCACHE .alloc_inode = v9fs_alloc_inode, @@ -222,6 +262,17 @@ static const struct super_operations v9fs_super_ops = { .umount_begin = v9fs_umount_begin, }; +static const struct super_operations v9fs_super_ops_dotl = { +#ifdef CONFIG_9P_FSCACHE + .alloc_inode = v9fs_alloc_inode, + .destroy_inode = v9fs_destroy_inode, +#endif + .statfs = v9fs_statfs, + .clear_inode = v9fs_clear_inode, + .show_options = generic_show_options, + .umount_begin = v9fs_umount_begin, +}; + struct file_system_type v9fs_fs_type = { .name = "9p", .get_sb = v9fs_get_sb, diff --git a/fs/Makefile b/fs/Makefile index 97f340f14ba..e6ec1d309b1 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -11,7 +11,7 @@ obj-y := open.o read_write.o file_table.o super.o \ attr.o bad_inode.o file.o filesystems.o namespace.o \ seq_file.o xattr.o libfs.o fs-writeback.o \ pnode.o drop_caches.o splice.o sync.o utimes.o \ - stack.o fs_struct.o + stack.o fs_struct.o statfs.o ifeq ($(CONFIG_BLOCK),y) obj-y += buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o diff --git a/fs/afs/dir.c b/fs/afs/dir.c index adc1cb771b5..b42d5cc1d6d 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -189,13 +189,9 @@ static struct page *afs_dir_get_page(struct inode *dir, unsigned long index, struct key *key) { struct page *page; - struct file file = { - .private_data = key, - }; - _enter("{%lu},%lu", dir->i_ino, index); - page = read_mapping_page(dir->i_mapping, index, &file); + page = read_cache_page(dir->i_mapping, index, afs_page_filler, key); if (!IS_ERR(page)) { kmap(page); if (!PageChecked(page)) diff --git a/fs/afs/file.c b/fs/afs/file.c index 0df9bc2b724..14d89fa58fe 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -121,34 +121,19 @@ static void afs_file_readpage_read_complete(struct page *page, #endif /* - * AFS read page from file, directory or symlink + * read page from file, directory or symlink, given a key to use */ -static int afs_readpage(struct file *file, struct page *page) +int afs_page_filler(void *data, struct page *page) { - struct afs_vnode *vnode; - struct inode *inode; - struct key *key; + struct inode *inode = page->mapping->host; + struct afs_vnode *vnode = AFS_FS_I(inode); + struct key *key = data; size_t len; off_t offset; int ret; - inode = page->mapping->host; - - if (file) { - key = file->private_data; - ASSERT(key != NULL); - } else { - key = afs_request_key(AFS_FS_S(inode->i_sb)->volume->cell); - if (IS_ERR(key)) { - ret = PTR_ERR(key); - goto error_nokey; - } - } - _enter("{%x},{%lu},{%lu}", key_serial(key), inode->i_ino, page->index); - vnode = AFS_FS_I(inode); - BUG_ON(!PageLocked(page)); ret = -ESTALE; @@ -214,31 +199,56 @@ static int afs_readpage(struct file *file, struct page *page) unlock_page(page); } - if (!file) - key_put(key); _leave(" = 0"); return 0; error: SetPageError(page); unlock_page(page); - if (!file) - key_put(key); -error_nokey: _leave(" = %d", ret); return ret; } /* + * read page from file, directory or symlink, given a file to nominate the key + * to be used + */ +static int afs_readpage(struct file *file, struct page *page) +{ + struct key *key; + int ret; + + if (file) { + key = file->private_data; + ASSERT(key != NULL); + ret = afs_page_filler(key, page); + } else { + struct inode *inode = page->mapping->host; + key = afs_request_key(AFS_FS_S(inode->i_sb)->volume->cell); + if (IS_ERR(key)) { + ret = PTR_ERR(key); + } else { + ret = afs_page_filler(key, page); + key_put(key); + } + } + return ret; +} + +/* * read a set of pages */ static int afs_readpages(struct file *file, struct address_space *mapping, struct list_head *pages, unsigned nr_pages) { + struct key *key = file->private_data; struct afs_vnode *vnode; int ret = 0; - _enter(",{%lu},,%d", mapping->host->i_ino, nr_pages); + _enter("{%d},{%lu},,%d", + key_serial(key), mapping->host->i_ino, nr_pages); + + ASSERT(key != NULL); vnode = AFS_FS_I(mapping->host); if (vnode->flags & AFS_VNODE_DELETED) { @@ -279,7 +289,7 @@ static int afs_readpages(struct file *file, struct address_space *mapping, } /* load the missing pages from the network */ - ret = read_cache_pages(mapping, pages, (void *) afs_readpage, file); + ret = read_cache_pages(mapping, pages, afs_page_filler, key); _leave(" = %d [netting]", ret); return ret; diff --git a/fs/afs/internal.h b/fs/afs/internal.h index a10f2582844..807f284cc75 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -494,6 +494,7 @@ extern const struct file_operations afs_file_operations; extern int afs_open(struct inode *, struct file *); extern int afs_release(struct inode *, struct file *); +extern int afs_page_filler(void *, struct page *); /* * flock.c diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index b3feddc4f7d..a9e23039ea3 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -49,9 +49,6 @@ static unsigned long afs_mntpt_expiry_timeout = 10 * 60; */ int afs_mntpt_check_symlink(struct afs_vnode *vnode, struct key *key) { - struct file file = { - .private_data = key, - }; struct page *page; size_t size; char *buf; @@ -61,7 +58,8 @@ int afs_mntpt_check_symlink(struct afs_vnode *vnode, struct key *key) vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique); /* read the contents of the symlink into the pagecache */ - page = read_mapping_page(AFS_VNODE_TO_I(vnode)->i_mapping, 0, &file); + page = read_cache_page(AFS_VNODE_TO_I(vnode)->i_mapping, 0, + afs_page_filler, key); if (IS_ERR(page)) { ret = PTR_ERR(page); goto out; diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index e4b75d6eda8..9bd4b3876c9 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -205,7 +205,7 @@ static struct inode *anon_inode_mkinode(void) * that it already _is_ on the dirty list. */ inode->i_state = I_DIRTY; - inode->i_mode = S_IRUSR | S_IWUSR; + inode->i_mode = S_IFREG | S_IRUSR | S_IWUSR; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); inode->i_flags |= S_PRIVATE; diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index d29b7f6df86..d832062869f 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -736,11 +736,14 @@ static const struct file_operations _dev_ioctl_fops = { }; static struct miscdevice _autofs_dev_ioctl_misc = { - .minor = MISC_DYNAMIC_MINOR, + .minor = AUTOFS_MINOR, .name = AUTOFS_DEVICE_NAME, .fops = &_dev_ioctl_fops }; +MODULE_ALIAS_MISCDEV(AUTOFS_MINOR); +MODULE_ALIAS("devname:autofs"); + /* Register/deregister misc character device */ int autofs_dev_ioctl_init(void) { diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index e8e5e63ac95..db4117ed780 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -18,13 +18,14 @@ #include <linux/slab.h> #include <linux/param.h> #include <linux/time.h> +#include <linux/smp_lock.h> #include "autofs_i.h" static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *); static int autofs4_dir_unlink(struct inode *,struct dentry *); static int autofs4_dir_rmdir(struct inode *,struct dentry *); static int autofs4_dir_mkdir(struct inode *,struct dentry *,int); -static int autofs4_root_ioctl(struct inode *, struct file *,unsigned int,unsigned long); +static long autofs4_root_ioctl(struct file *,unsigned int,unsigned long); static int autofs4_dir_open(struct inode *inode, struct file *file); static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *); static void *autofs4_follow_link(struct dentry *, struct nameidata *); @@ -38,7 +39,7 @@ const struct file_operations autofs4_root_operations = { .read = generic_read_dir, .readdir = dcache_readdir, .llseek = dcache_dir_lseek, - .ioctl = autofs4_root_ioctl, + .unlocked_ioctl = autofs4_root_ioctl, }; const struct file_operations autofs4_dir_operations = { @@ -902,8 +903,8 @@ int is_autofs4_dentry(struct dentry *dentry) * ioctl()'s on the root directory is the chief method for the daemon to * generate kernel reactions */ -static int autofs4_root_ioctl(struct inode *inode, struct file *filp, - unsigned int cmd, unsigned long arg) +static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp, + unsigned int cmd, unsigned long arg) { struct autofs_sb_info *sbi = autofs4_sbi(inode->i_sb); void __user *p = (void __user *)arg; @@ -947,3 +948,16 @@ static int autofs4_root_ioctl(struct inode *inode, struct file *filp, return -ENOSYS; } } + +static long autofs4_root_ioctl(struct file *filp, + unsigned int cmd, unsigned long arg) +{ + long ret; + struct inode *inode = filp->f_dentry->d_inode; + + lock_kernel(); + ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg); + unlock_kernel(); + + return ret; +} diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c index 1e41aadb106..8f73841fc97 100644 --- a/fs/bfs/dir.c +++ b/fs/bfs/dir.c @@ -105,14 +105,12 @@ static int bfs_create(struct inode *dir, struct dentry *dentry, int mode, } set_bit(ino, info->si_imap); info->si_freei--; - inode->i_uid = current_fsuid(); - inode->i_gid = (dir->i_mode & S_ISGID) ? dir->i_gid : current_fsgid(); + inode_init_owner(inode, dir, mode); inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; inode->i_blocks = 0; inode->i_op = &bfs_file_inops; inode->i_fop = &bfs_file_operations; inode->i_mapping->a_ops = &bfs_aops; - inode->i_mode = mode; inode->i_ino = ino; BFS_I(inode)->i_dsk_ino = ino; BFS_I(inode)->i_sblock = 0; diff --git a/fs/block_dev.c b/fs/block_dev.c index 6dcee88c2e5..26e5f502662 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -245,37 +245,14 @@ struct super_block *freeze_bdev(struct block_device *bdev) sb = get_active_super(bdev); if (!sb) goto out; - if (sb->s_flags & MS_RDONLY) { - sb->s_frozen = SB_FREEZE_TRANS; - up_write(&sb->s_umount); + error = freeze_super(sb); + if (error) { + deactivate_super(sb); + bdev->bd_fsfreeze_count--; mutex_unlock(&bdev->bd_fsfreeze_mutex); - return sb; - } - - sb->s_frozen = SB_FREEZE_WRITE; - smp_wmb(); - - sync_filesystem(sb); - - sb->s_frozen = SB_FREEZE_TRANS; - smp_wmb(); - - sync_blockdev(sb->s_bdev); - - if (sb->s_op->freeze_fs) { - error = sb->s_op->freeze_fs(sb); - if (error) { - printk(KERN_ERR - "VFS:Filesystem freeze failed\n"); - sb->s_frozen = SB_UNFROZEN; - deactivate_locked_super(sb); - bdev->bd_fsfreeze_count--; - mutex_unlock(&bdev->bd_fsfreeze_mutex); - return ERR_PTR(error); - } + return ERR_PTR(error); } - up_write(&sb->s_umount); - + deactivate_super(sb); out: sync_blockdev(bdev); mutex_unlock(&bdev->bd_fsfreeze_mutex); @@ -296,40 +273,22 @@ int thaw_bdev(struct block_device *bdev, struct super_block *sb) mutex_lock(&bdev->bd_fsfreeze_mutex); if (!bdev->bd_fsfreeze_count) - goto out_unlock; + goto out; error = 0; if (--bdev->bd_fsfreeze_count > 0) - goto out_unlock; + goto out; if (!sb) - goto out_unlock; - - BUG_ON(sb->s_bdev != bdev); - down_write(&sb->s_umount); - if (sb->s_flags & MS_RDONLY) - goto out_unfrozen; - - if (sb->s_op->unfreeze_fs) { - error = sb->s_op->unfreeze_fs(sb); - if (error) { - printk(KERN_ERR - "VFS:Filesystem thaw failed\n"); - sb->s_frozen = SB_FREEZE_TRANS; - bdev->bd_fsfreeze_count++; - mutex_unlock(&bdev->bd_fsfreeze_mutex); - return error; - } - } - -out_unfrozen: - sb->s_frozen = SB_UNFROZEN; - smp_wmb(); - wake_up(&sb->s_wait_unfrozen); + goto out; - if (sb) - deactivate_locked_super(sb); -out_unlock: + error = thaw_super(sb); + if (error) { + bdev->bd_fsfreeze_count++; + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return error; + } +out: mutex_unlock(&bdev->bd_fsfreeze_mutex); return 0; } @@ -417,7 +376,7 @@ int blkdev_fsync(struct file *filp, struct dentry *dentry, int datasync) */ mutex_unlock(&bd_inode->i_mutex); - error = blkdev_issue_flush(bdev, NULL); + error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL, BLKDEV_IFL_WAIT); if (error == -EOPNOTSUPP) error = 0; @@ -668,41 +627,209 @@ void bd_forget(struct inode *inode) iput(bdev->bd_inode); } -int bd_claim(struct block_device *bdev, void *holder) +/** + * bd_may_claim - test whether a block device can be claimed + * @bdev: block device of interest + * @whole: whole block device containing @bdev, may equal @bdev + * @holder: holder trying to claim @bdev + * + * Test whther @bdev can be claimed by @holder. + * + * CONTEXT: + * spin_lock(&bdev_lock). + * + * RETURNS: + * %true if @bdev can be claimed, %false otherwise. + */ +static bool bd_may_claim(struct block_device *bdev, struct block_device *whole, + void *holder) { - int res; - spin_lock(&bdev_lock); - - /* first decide result */ if (bdev->bd_holder == holder) - res = 0; /* already a holder */ + return true; /* already a holder */ else if (bdev->bd_holder != NULL) - res = -EBUSY; /* held by someone else */ + return false; /* held by someone else */ else if (bdev->bd_contains == bdev) - res = 0; /* is a whole device which isn't held */ + return true; /* is a whole device which isn't held */ - else if (bdev->bd_contains->bd_holder == bd_claim) - res = 0; /* is a partition of a device that is being partitioned */ - else if (bdev->bd_contains->bd_holder != NULL) - res = -EBUSY; /* is a partition of a held device */ + else if (whole->bd_holder == bd_claim) + return true; /* is a partition of a device that is being partitioned */ + else if (whole->bd_holder != NULL) + return false; /* is a partition of a held device */ else - res = 0; /* is a partition of an un-held device */ + return true; /* is a partition of an un-held device */ +} + +/** + * bd_prepare_to_claim - prepare to claim a block device + * @bdev: block device of interest + * @whole: the whole device containing @bdev, may equal @bdev + * @holder: holder trying to claim @bdev + * + * Prepare to claim @bdev. This function fails if @bdev is already + * claimed by another holder and waits if another claiming is in + * progress. This function doesn't actually claim. On successful + * return, the caller has ownership of bd_claiming and bd_holder[s]. + * + * CONTEXT: + * spin_lock(&bdev_lock). Might release bdev_lock, sleep and regrab + * it multiple times. + * + * RETURNS: + * 0 if @bdev can be claimed, -EBUSY otherwise. + */ +static int bd_prepare_to_claim(struct block_device *bdev, + struct block_device *whole, void *holder) +{ +retry: + /* if someone else claimed, fail */ + if (!bd_may_claim(bdev, whole, holder)) + return -EBUSY; + + /* if someone else is claiming, wait for it to finish */ + if (whole->bd_claiming && whole->bd_claiming != holder) { + wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0); + DEFINE_WAIT(wait); + + prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE); + spin_unlock(&bdev_lock); + schedule(); + finish_wait(wq, &wait); + spin_lock(&bdev_lock); + goto retry; + } + + /* yay, all mine */ + return 0; +} - /* now impose change */ - if (res==0) { +/** + * bd_start_claiming - start claiming a block device + * @bdev: block device of interest + * @holder: holder trying to claim @bdev + * + * @bdev is about to be opened exclusively. Check @bdev can be opened + * exclusively and mark that an exclusive open is in progress. Each + * successful call to this function must be matched with a call to + * either bd_claim() or bd_abort_claiming(). If this function + * succeeds, the matching bd_claim() is guaranteed to succeed. + * + * CONTEXT: + * Might sleep. + * + * RETURNS: + * Pointer to the block device containing @bdev on success, ERR_PTR() + * value on failure. + */ +static struct block_device *bd_start_claiming(struct block_device *bdev, + void *holder) +{ + struct gendisk *disk; + struct block_device *whole; + int partno, err; + + might_sleep(); + + /* + * @bdev might not have been initialized properly yet, look up + * and grab the outer block device the hard way. + */ + disk = get_gendisk(bdev->bd_dev, &partno); + if (!disk) + return ERR_PTR(-ENXIO); + + whole = bdget_disk(disk, 0); + put_disk(disk); + if (!whole) + return ERR_PTR(-ENOMEM); + + /* prepare to claim, if successful, mark claiming in progress */ + spin_lock(&bdev_lock); + + err = bd_prepare_to_claim(bdev, whole, holder); + if (err == 0) { + whole->bd_claiming = holder; + spin_unlock(&bdev_lock); + return whole; + } else { + spin_unlock(&bdev_lock); + bdput(whole); + return ERR_PTR(err); + } +} + +/* releases bdev_lock */ +static void __bd_abort_claiming(struct block_device *whole, void *holder) +{ + BUG_ON(whole->bd_claiming != holder); + whole->bd_claiming = NULL; + wake_up_bit(&whole->bd_claiming, 0); + + spin_unlock(&bdev_lock); + bdput(whole); +} + +/** + * bd_abort_claiming - abort claiming a block device + * @whole: whole block device returned by bd_start_claiming() + * @holder: holder trying to claim @bdev + * + * Abort a claiming block started by bd_start_claiming(). Note that + * @whole is not the block device to be claimed but the whole device + * returned by bd_start_claiming(). + * + * CONTEXT: + * Grabs and releases bdev_lock. + */ +static void bd_abort_claiming(struct block_device *whole, void *holder) +{ + spin_lock(&bdev_lock); + __bd_abort_claiming(whole, holder); /* releases bdev_lock */ +} + +/** + * bd_claim - claim a block device + * @bdev: block device to claim + * @holder: holder trying to claim @bdev + * + * Try to claim @bdev which must have been opened successfully. This + * function may be called with or without preceding + * blk_start_claiming(). In the former case, this function is always + * successful and terminates the claiming block. + * + * CONTEXT: + * Might sleep. + * + * RETURNS: + * 0 if successful, -EBUSY if @bdev is already claimed. + */ +int bd_claim(struct block_device *bdev, void *holder) +{ + struct block_device *whole = bdev->bd_contains; + int res; + + might_sleep(); + + spin_lock(&bdev_lock); + + res = bd_prepare_to_claim(bdev, whole, holder); + if (res == 0) { /* note that for a whole device bd_holders * will be incremented twice, and bd_holder will * be set to bd_claim before being set to holder */ - bdev->bd_contains->bd_holders ++; - bdev->bd_contains->bd_holder = bd_claim; + whole->bd_holders++; + whole->bd_holder = bd_claim; bdev->bd_holders++; bdev->bd_holder = holder; } - spin_unlock(&bdev_lock); + + if (whole->bd_claiming) + __bd_abort_claiming(whole, holder); /* releases bdev_lock */ + else + spin_unlock(&bdev_lock); + return res; } - EXPORT_SYMBOL(bd_claim); void bd_release(struct block_device *bdev) @@ -1316,6 +1443,7 @@ EXPORT_SYMBOL(blkdev_get); static int blkdev_open(struct inode * inode, struct file * filp) { + struct block_device *whole = NULL; struct block_device *bdev; int res; @@ -1338,22 +1466,25 @@ static int blkdev_open(struct inode * inode, struct file * filp) if (bdev == NULL) return -ENOMEM; + if (filp->f_mode & FMODE_EXCL) { + whole = bd_start_claiming(bdev, filp); + if (IS_ERR(whole)) { + bdput(bdev); + return PTR_ERR(whole); + } + } + filp->f_mapping = bdev->bd_inode->i_mapping; res = blkdev_get(bdev, filp->f_mode); - if (res) - return res; - if (filp->f_mode & FMODE_EXCL) { - res = bd_claim(bdev, filp); - if (res) - goto out_blkdev_put; + if (whole) { + if (res == 0) + BUG_ON(bd_claim(bdev, filp) != 0); + else + bd_abort_claiming(whole, filp); } - return 0; - - out_blkdev_put: - blkdev_put(bdev, filp->f_mode); return res; } @@ -1564,27 +1695,34 @@ EXPORT_SYMBOL(lookup_bdev); */ struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *holder) { - struct block_device *bdev; - int error = 0; + struct block_device *bdev, *whole; + int error; bdev = lookup_bdev(path); if (IS_ERR(bdev)) return bdev; + whole = bd_start_claiming(bdev, holder); + if (IS_ERR(whole)) { + bdput(bdev); + return whole; + } + error = blkdev_get(bdev, mode); if (error) - return ERR_PTR(error); + goto out_abort_claiming; + error = -EACCES; if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) - goto blkdev_put; - error = bd_claim(bdev, holder); - if (error) - goto blkdev_put; + goto out_blkdev_put; + BUG_ON(bd_claim(bdev, holder) != 0); return bdev; - -blkdev_put: + +out_blkdev_put: blkdev_put(bdev, mode); +out_abort_claiming: + bd_abort_claiming(whole, holder); return ERR_PTR(error); } diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 6ef7b26724e..8d432cd9d58 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -282,14 +282,14 @@ int btrfs_acl_chmod(struct inode *inode) return ret; } -struct xattr_handler btrfs_xattr_acl_default_handler = { +const struct xattr_handler btrfs_xattr_acl_default_handler = { .prefix = POSIX_ACL_XATTR_DEFAULT, .flags = ACL_TYPE_DEFAULT, .get = btrfs_xattr_acl_get, .set = btrfs_xattr_acl_set, }; -struct xattr_handler btrfs_xattr_acl_access_handler = { +const struct xattr_handler btrfs_xattr_acl_access_handler = { .prefix = POSIX_ACL_XATTR_ACCESS, .flags = ACL_TYPE_ACCESS, .get = btrfs_xattr_acl_get, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index b34d32fdaae..c6a4f459ad7 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1589,7 +1589,7 @@ static void btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len) { blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, - DISCARD_FL_BARRIER); + BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER); } static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 2bfdc641d4e..d601629b85d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4121,16 +4121,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, if (ret != 0) goto fail; - inode->i_uid = current_fsuid(); - - if (dir && (dir->i_mode & S_ISGID)) { - inode->i_gid = dir->i_gid; - if (S_ISDIR(mode)) - mode |= S_ISGID; - } else - inode->i_gid = current_fsgid(); - - inode->i_mode = mode; + inode_init_owner(inode, dir, mode); inode->i_ino = objectid; inode_set_bytes(inode, 0); inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 1866dff0538..2909a03e523 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -832,11 +832,14 @@ static const struct file_operations btrfs_ctl_fops = { }; static struct miscdevice btrfs_misc = { - .minor = MISC_DYNAMIC_MINOR, + .minor = BTRFS_MINOR, .name = "btrfs-control", .fops = &btrfs_ctl_fops }; +MODULE_ALIAS_MISCDEV(BTRFS_MINOR); +MODULE_ALIAS("devname:btrfs-control"); + static int btrfs_interface_init(void) { return misc_register(&btrfs_misc); diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 193b58f7d3f..59acd3eb288 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -282,7 +282,7 @@ err: * List of handlers for synthetic system.* attributes. All real ondisk * attributes are handled directly. */ -struct xattr_handler *btrfs_xattr_handlers[] = { +const struct xattr_handler *btrfs_xattr_handlers[] = { #ifdef CONFIG_BTRFS_FS_POSIX_ACL &btrfs_xattr_acl_access_handler, &btrfs_xattr_acl_default_handler, diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h index 721efa0346e..7a43fd640bb 100644 --- a/fs/btrfs/xattr.h +++ b/fs/btrfs/xattr.h @@ -21,9 +21,9 @@ #include <linux/xattr.h> -extern struct xattr_handler btrfs_xattr_acl_access_handler; -extern struct xattr_handler btrfs_xattr_acl_default_handler; -extern struct xattr_handler *btrfs_xattr_handlers[]; +extern const struct xattr_handler btrfs_xattr_acl_access_handler; +extern const struct xattr_handler btrfs_xattr_acl_default_handler; +extern const struct xattr_handler *btrfs_xattr_handlers[]; extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name, void *buffer, size_t size); diff --git a/fs/buffer.c b/fs/buffer.c index c9c266db062..e8aa7081d25 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -275,6 +275,7 @@ void invalidate_bdev(struct block_device *bdev) return; invalidate_bh_lrus(); + lru_add_drain_all(); /* make sure all lru add caches are flushed */ invalidate_mapping_pages(mapping, 0, -1); } EXPORT_SYMBOL(invalidate_bdev); @@ -560,26 +561,17 @@ repeat: return err; } -static void do_thaw_all(struct work_struct *work) +static void do_thaw_one(struct super_block *sb, void *unused) { - struct super_block *sb; char b[BDEVNAME_SIZE]; + while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb)) + printk(KERN_WARNING "Emergency Thaw on %s\n", + bdevname(sb->s_bdev, b)); +} - spin_lock(&sb_lock); -restart: - list_for_each_entry(sb, &super_blocks, s_list) { - sb->s_count++; - spin_unlock(&sb_lock); - down_read(&sb->s_umount); - while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb)) - printk(KERN_WARNING "Emergency Thaw on %s\n", - bdevname(sb->s_bdev, b)); - up_read(&sb->s_umount); - spin_lock(&sb_lock); - if (__put_super_and_need_restart(sb)) - goto restart; - } - spin_unlock(&sb_lock); +static void do_thaw_all(struct work_struct *work) +{ + iterate_supers(do_thaw_one, NULL); kfree(work); printk(KERN_WARNING "Emergency Thaw complete\n"); } diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index a9005d862ed..d9c60b84949 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -274,7 +274,6 @@ static int ceph_readpages(struct file *file, struct address_space *mapping, struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc; int rc = 0; struct page **pages; - struct pagevec pvec; loff_t offset; u64 len; @@ -297,8 +296,6 @@ static int ceph_readpages(struct file *file, struct address_space *mapping, if (rc < 0) goto out; - /* set uptodate and add to lru in pagevec-sized chunks */ - pagevec_init(&pvec, 0); for (; !list_empty(page_list) && len > 0; rc -= PAGE_CACHE_SIZE, len -= PAGE_CACHE_SIZE) { struct page *page = @@ -312,7 +309,7 @@ static int ceph_readpages(struct file *file, struct address_space *mapping, zero_user_segment(page, s, PAGE_CACHE_SIZE); } - if (add_to_page_cache(page, mapping, page->index, GFP_NOFS)) { + if (add_to_page_cache_lru(page, mapping, page->index, GFP_NOFS)) { page_cache_release(page); dout("readpages %p add_to_page_cache failed %p\n", inode, page); @@ -323,10 +320,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping, flush_dcache_page(page); SetPageUptodate(page); unlock_page(page); - if (pagevec_add(&pvec, page) == 0) - pagevec_lru_add_file(&pvec); /* add to lru */ + page_cache_release(page); } - pagevec_lru_add_file(&pvec); rc = 0; out: @@ -568,7 +563,7 @@ static void writepages_finish(struct ceph_osd_request *req, ceph_release_pages(req->r_pages, req->r_num_pages); if (req->r_pages_from_pool) mempool_free(req->r_pages, - ceph_client(inode->i_sb)->wb_pagevec_pool); + ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool); else kfree(req->r_pages); ceph_osdc_put_request(req); diff --git a/fs/ceph/auth.c b/fs/ceph/auth.c index 818afe72e6c..9f46de2ba7a 100644 --- a/fs/ceph/auth.c +++ b/fs/ceph/auth.c @@ -150,7 +150,8 @@ int ceph_build_auth_request(struct ceph_auth_client *ac, ret = ac->ops->build_request(ac, p + sizeof(u32), end); if (ret < 0) { - pr_err("error %d building request\n", ret); + pr_err("error %d building auth method %s request\n", ret, + ac->ops->name); return ret; } dout(" built request %d bytes\n", ret); @@ -216,8 +217,8 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac, if (ac->protocol != protocol) { ret = ceph_auth_init_protocol(ac, protocol); if (ret) { - pr_err("error %d on auth protocol %d init\n", - ret, protocol); + pr_err("error %d on auth method %s init\n", + ret, ac->ops->name); goto out; } } @@ -229,7 +230,7 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac, if (ret == -EAGAIN) { return ceph_build_auth_request(ac, reply_buf, reply_len); } else if (ret) { - pr_err("authentication error %d\n", ret); + pr_err("auth method '%s' error %d\n", ac->ops->name, ret); return ret; } return 0; diff --git a/fs/ceph/auth.h b/fs/ceph/auth.h index ca4f57cfb26..4429a707c02 100644 --- a/fs/ceph/auth.h +++ b/fs/ceph/auth.h @@ -15,6 +15,8 @@ struct ceph_auth_client; struct ceph_authorizer; struct ceph_auth_client_ops { + const char *name; + /* * true if we are authenticated and can connect to * services. diff --git a/fs/ceph/auth_none.c b/fs/ceph/auth_none.c index 8cd9e3af07f..24407c11929 100644 --- a/fs/ceph/auth_none.c +++ b/fs/ceph/auth_none.c @@ -94,6 +94,7 @@ static void ceph_auth_none_destroy_authorizer(struct ceph_auth_client *ac, } static const struct ceph_auth_client_ops ceph_auth_none_ops = { + .name = "none", .reset = reset, .destroy = destroy, .is_authenticated = is_authenticated, diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c index fee5a08da88..7b206231566 100644 --- a/fs/ceph/auth_x.c +++ b/fs/ceph/auth_x.c @@ -127,7 +127,7 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac, int ret; char *dbuf; char *ticket_buf; - u8 struct_v; + u8 reply_struct_v; dbuf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS); if (!dbuf) @@ -139,14 +139,14 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac, goto out_dbuf; ceph_decode_need(&p, end, 1 + sizeof(u32), bad); - struct_v = ceph_decode_8(&p); - if (struct_v != 1) + reply_struct_v = ceph_decode_8(&p); + if (reply_struct_v != 1) goto bad; num = ceph_decode_32(&p); dout("%d tickets\n", num); while (num--) { int type; - u8 struct_v; + u8 tkt_struct_v, blob_struct_v; struct ceph_x_ticket_handler *th; void *dp, *dend; int dlen; @@ -165,8 +165,8 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac, type = ceph_decode_32(&p); dout(" ticket type %d %s\n", type, ceph_entity_type_name(type)); - struct_v = ceph_decode_8(&p); - if (struct_v != 1) + tkt_struct_v = ceph_decode_8(&p); + if (tkt_struct_v != 1) goto bad; th = get_ticket_handler(ac, type); @@ -186,8 +186,8 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac, dend = dbuf + dlen; dp = dbuf; - struct_v = ceph_decode_8(&dp); - if (struct_v != 1) + tkt_struct_v = ceph_decode_8(&dp); + if (tkt_struct_v != 1) goto bad; memcpy(&old_key, &th->session_key, sizeof(old_key)); @@ -224,7 +224,7 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac, tpend = tp + dlen; dout(" ticket blob is %d bytes\n", dlen); ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad); - struct_v = ceph_decode_8(&tp); + blob_struct_v = ceph_decode_8(&tp); new_secret_id = ceph_decode_64(&tp); ret = ceph_decode_buffer(&new_ticket_blob, &tp, tpend); if (ret) @@ -618,6 +618,7 @@ static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac, static const struct ceph_auth_client_ops ceph_x_ops = { + .name = "x", .is_authenticated = ceph_x_is_authenticated, .build_request = ceph_x_build_request, .handle_reply = ceph_x_handle_reply, diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index d9400534b27..0dd0b81e64f 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -867,7 +867,8 @@ void __ceph_remove_cap(struct ceph_cap *cap) { struct ceph_mds_session *session = cap->session; struct ceph_inode_info *ci = cap->ci; - struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc; + struct ceph_mds_client *mdsc = + &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; int removed = 0; dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode); @@ -937,9 +938,9 @@ static int send_cap_msg(struct ceph_mds_session *session, seq, issue_seq, mseq, follows, size, max_size, xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0); - msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), 0, 0, NULL); - if (IS_ERR(msg)) - return PTR_ERR(msg); + msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), GFP_NOFS); + if (!msg) + return -ENOMEM; msg->hdr.tid = cpu_to_le64(flush_tid); @@ -1298,7 +1299,8 @@ static void ceph_flush_snaps(struct ceph_inode_info *ci) */ void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) { - struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc; + struct ceph_mds_client *mdsc = + &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; struct inode *inode = &ci->vfs_inode; int was = ci->i_dirty_caps; int dirty = 0; @@ -1336,7 +1338,7 @@ void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) static int __mark_caps_flushing(struct inode *inode, struct ceph_mds_session *session) { - struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc; + struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); int flushing; @@ -1663,7 +1665,7 @@ ack: static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session, unsigned *flush_tid) { - struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc; + struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); int unlock_session = session ? 0 : 1; int flushing = 0; @@ -1716,10 +1718,9 @@ out_unlocked: static int caps_are_flushed(struct inode *inode, unsigned tid) { struct ceph_inode_info *ci = ceph_inode(inode); - int dirty, i, ret = 1; + int i, ret = 1; spin_lock(&inode->i_lock); - dirty = __ceph_caps_dirty(ci); for (i = 0; i < CEPH_CAP_BITS; i++) if ((ci->i_flushing_caps & (1 << i)) && ci->i_cap_flush_tid[i] <= tid) { @@ -1829,7 +1830,8 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc) err = wait_event_interruptible(ci->i_cap_wq, caps_are_flushed(inode, flush_tid)); } else { - struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc; + struct ceph_mds_client *mdsc = + &ceph_sb_to_client(inode->i_sb)->mdsc; spin_lock(&inode->i_lock); if (__ceph_caps_dirty(ci)) @@ -2411,7 +2413,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, __releases(inode->i_lock) { struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc; + struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc; unsigned seq = le32_to_cpu(m->seq); int dirty = le32_to_cpu(m->dirty); int cleaned = 0; diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h index 0c2241ef365..3b9eeed097b 100644 --- a/fs/ceph/ceph_fs.h +++ b/fs/ceph/ceph_fs.h @@ -19,7 +19,7 @@ * Ceph release version */ #define CEPH_VERSION_MAJOR 0 -#define CEPH_VERSION_MINOR 19 +#define CEPH_VERSION_MINOR 20 #define CEPH_VERSION_PATCH 0 #define _CEPH_STRINGIFY(x) #x @@ -36,7 +36,7 @@ * client-facing protocol. */ #define CEPH_OSD_PROTOCOL 8 /* cluster internal */ -#define CEPH_MDS_PROTOCOL 9 /* cluster internal */ +#define CEPH_MDS_PROTOCOL 12 /* cluster internal */ #define CEPH_MON_PROTOCOL 5 /* cluster internal */ #define CEPH_OSDC_PROTOCOL 24 /* server/client */ #define CEPH_MDSC_PROTOCOL 32 /* server/client */ @@ -53,8 +53,18 @@ /* * feature bits */ -#define CEPH_FEATURE_SUPPORTED 0 -#define CEPH_FEATURE_REQUIRED 0 +#define CEPH_FEATURE_UID 1 +#define CEPH_FEATURE_NOSRCADDR 2 +#define CEPH_FEATURE_FLOCK 4 + +#define CEPH_FEATURE_SUPPORTED_MON CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR +#define CEPH_FEATURE_REQUIRED_MON CEPH_FEATURE_UID +#define CEPH_FEATURE_SUPPORTED_MDS CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR|CEPH_FEATURE_FLOCK +#define CEPH_FEATURE_REQUIRED_MDS CEPH_FEATURE_UID +#define CEPH_FEATURE_SUPPORTED_OSD CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR +#define CEPH_FEATURE_REQUIRED_OSD CEPH_FEATURE_UID +#define CEPH_FEATURE_SUPPORTED_CLIENT CEPH_FEATURE_NOSRCADDR +#define CEPH_FEATURE_REQUIRED_CLIENT CEPH_FEATURE_NOSRCADDR /* @@ -91,6 +101,8 @@ int ceph_file_layout_is_valid(const struct ceph_file_layout *layout); #define CEPH_AUTH_NONE 0x1 #define CEPH_AUTH_CEPHX 0x2 +#define CEPH_AUTH_UID_DEFAULT ((__u64) -1) + /********************************************* * message layer @@ -128,11 +140,27 @@ int ceph_file_layout_is_valid(const struct ceph_file_layout *layout); #define CEPH_MSG_CLIENT_SNAP 0x312 #define CEPH_MSG_CLIENT_CAPRELEASE 0x313 +/* pool ops */ +#define CEPH_MSG_POOLOP_REPLY 48 +#define CEPH_MSG_POOLOP 49 + + /* osd */ #define CEPH_MSG_OSD_MAP 41 #define CEPH_MSG_OSD_OP 42 #define CEPH_MSG_OSD_OPREPLY 43 +/* pool operations */ +enum { + POOL_OP_CREATE = 0x01, + POOL_OP_DELETE = 0x02, + POOL_OP_AUID_CHANGE = 0x03, + POOL_OP_CREATE_SNAP = 0x11, + POOL_OP_DELETE_SNAP = 0x12, + POOL_OP_CREATE_UNMANAGED_SNAP = 0x21, + POOL_OP_DELETE_UNMANAGED_SNAP = 0x22, +}; + struct ceph_mon_request_header { __le64 have_version; __le16 session_mon; @@ -155,6 +183,31 @@ struct ceph_mon_statfs_reply { struct ceph_statfs st; } __attribute__ ((packed)); +const char *ceph_pool_op_name(int op); + +struct ceph_mon_poolop { + struct ceph_mon_request_header monhdr; + struct ceph_fsid fsid; + __le32 pool; + __le32 op; + __le64 auid; + __le64 snapid; + __le32 name_len; +} __attribute__ ((packed)); + +struct ceph_mon_poolop_reply { + struct ceph_mon_request_header monhdr; + struct ceph_fsid fsid; + __le32 reply_code; + __le32 epoch; + char has_data; + char data[0]; +} __attribute__ ((packed)); + +struct ceph_mon_unmanaged_snap { + __le64 snapid; +} __attribute__ ((packed)); + struct ceph_osd_getmap { struct ceph_mon_request_header monhdr; struct ceph_fsid fsid; @@ -308,6 +361,7 @@ union ceph_mds_request_args { struct { __le32 frag; /* which dir fragment */ __le32 max_entries; /* how many dentries to grab */ + __le32 max_bytes; } __attribute__ ((packed)) readdir; struct { __le32 mode; diff --git a/fs/ceph/ceph_strings.c b/fs/ceph/ceph_strings.c index 8e4be6a80c6..7503aee828c 100644 --- a/fs/ceph/ceph_strings.c +++ b/fs/ceph/ceph_strings.c @@ -10,7 +10,6 @@ const char *ceph_entity_type_name(int type) case CEPH_ENTITY_TYPE_OSD: return "osd"; case CEPH_ENTITY_TYPE_MON: return "mon"; case CEPH_ENTITY_TYPE_CLIENT: return "client"; - case CEPH_ENTITY_TYPE_ADMIN: return "admin"; case CEPH_ENTITY_TYPE_AUTH: return "auth"; default: return "unknown"; } @@ -45,6 +44,7 @@ const char *ceph_osd_op_name(int op) case CEPH_OSD_OP_SETXATTRS: return "setxattrs"; case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs"; case CEPH_OSD_OP_RMXATTR: return "rmxattr"; + case CEPH_OSD_OP_CMPXATTR: return "cmpxattr"; case CEPH_OSD_OP_PULL: return "pull"; case CEPH_OSD_OP_PUSH: return "push"; @@ -174,3 +174,17 @@ const char *ceph_snap_op_name(int o) } return "???"; } + +const char *ceph_pool_op_name(int op) +{ + switch (op) { + case POOL_OP_CREATE: return "create"; + case POOL_OP_DELETE: return "delete"; + case POOL_OP_AUID_CHANGE: return "auid change"; + case POOL_OP_CREATE_SNAP: return "create snap"; + case POOL_OP_DELETE_SNAP: return "delete snap"; + case POOL_OP_CREATE_UNMANAGED_SNAP: return "create unmanaged snap"; + case POOL_OP_DELETE_UNMANAGED_SNAP: return "delete unmanaged snap"; + } + return "???"; +} diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index f7048da92ac..3be33fb066c 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -113,7 +113,7 @@ static int osdmap_show(struct seq_file *s, void *p) static int monc_show(struct seq_file *s, void *p) { struct ceph_client *client = s->private; - struct ceph_mon_statfs_request *req; + struct ceph_mon_generic_request *req; struct ceph_mon_client *monc = &client->monc; struct rb_node *rp; @@ -126,9 +126,14 @@ static int monc_show(struct seq_file *s, void *p) if (monc->want_next_osdmap) seq_printf(s, "want next osdmap\n"); - for (rp = rb_first(&monc->statfs_request_tree); rp; rp = rb_next(rp)) { - req = rb_entry(rp, struct ceph_mon_statfs_request, node); - seq_printf(s, "%lld statfs\n", req->tid); + for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) { + __u16 op; + req = rb_entry(rp, struct ceph_mon_generic_request, node); + op = le16_to_cpu(req->request->hdr.type); + if (op == CEPH_MSG_STATFS) + seq_printf(s, "%lld statfs\n", req->tid); + else + seq_printf(s, "%lld unknown\n", req->tid); } mutex_unlock(&monc->mutex); diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 650d2db5ed2..4fd30900eff 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -51,8 +51,11 @@ int ceph_init_dentry(struct dentry *dentry) return -ENOMEM; /* oh well */ spin_lock(&dentry->d_lock); - if (dentry->d_fsdata) /* lost a race */ + if (dentry->d_fsdata) { + /* lost a race */ + kmem_cache_free(ceph_dentry_cachep, di); goto out_unlock; + } di->dentry = dentry; di->lease_session = NULL; dentry->d_fsdata = di; @@ -125,7 +128,8 @@ more: dentry = list_entry(p, struct dentry, d_u.d_child); di = ceph_dentry(dentry); while (1) { - dout(" p %p/%p d_subdirs %p/%p\n", p->prev, p->next, + dout(" p %p/%p %s d_subdirs %p/%p\n", p->prev, p->next, + d_unhashed(dentry) ? "!hashed" : "hashed", parent->d_subdirs.prev, parent->d_subdirs.next); if (p == &parent->d_subdirs) { fi->at_end = 1; @@ -229,6 +233,7 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir) u32 ftype; struct ceph_mds_reply_info_parsed *rinfo; const int max_entries = client->mount_args->max_readdir; + const int max_bytes = client->mount_args->max_readdir_bytes; dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off); if (fi->at_end) @@ -312,6 +317,7 @@ more: req->r_readdir_offset = fi->next_offset; req->r_args.readdir.frag = cpu_to_le32(frag); req->r_args.readdir.max_entries = cpu_to_le32(max_entries); + req->r_args.readdir.max_bytes = cpu_to_le32(max_bytes); req->r_num_caps = max_entries + 1; err = ceph_mdsc_do_request(mdsc, NULL, req); if (err < 0) { @@ -335,7 +341,7 @@ more: if (req->r_reply_info.dir_end) { kfree(fi->last_name); fi->last_name = NULL; - fi->next_offset = 0; + fi->next_offset = 2; } else { rinfo = &req->r_reply_info; err = note_last_dentry(fi, @@ -478,7 +484,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin) struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, struct dentry *dentry, int err) { - struct ceph_client *client = ceph_client(dentry->d_sb); + struct ceph_client *client = ceph_sb_to_client(dentry->d_sb); struct inode *parent = dentry->d_parent->d_inode; /* .snap dir? */ @@ -568,7 +574,6 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, !is_root_ceph_dentry(dir, dentry) && (ci->i_ceph_flags & CEPH_I_COMPLETE) && (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) { - di->offset = ci->i_max_offset++; spin_unlock(&dir->i_lock); dout(" dir %p complete, -ENOENT\n", dir); d_add(dentry, NULL); @@ -888,13 +893,22 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, /* ensure target dentry is invalidated, despite rehashing bug in vfs_rename_dir */ - new_dentry->d_time = jiffies; - ceph_dentry(new_dentry)->lease_shared_gen = 0; + ceph_invalidate_dentry_lease(new_dentry); } ceph_mdsc_put_request(req); return err; } +/* + * Ensure a dentry lease will no longer revalidate. + */ +void ceph_invalidate_dentry_lease(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + dentry->d_time = jiffies; + ceph_dentry(dentry)->lease_shared_gen = 0; + spin_unlock(&dentry->d_lock); +} /* * Check if dentry lease is valid. If not, delete the lease. Try to @@ -972,8 +986,9 @@ static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd) { struct inode *dir = dentry->d_parent->d_inode; - dout("d_revalidate %p '%.*s' inode %p\n", dentry, - dentry->d_name.len, dentry->d_name.name, dentry->d_inode); + dout("d_revalidate %p '%.*s' inode %p offset %lld\n", dentry, + dentry->d_name.len, dentry->d_name.name, dentry->d_inode, + ceph_dentry(dentry)->offset); /* always trust cached snapped dentries, snapdir dentry */ if (ceph_snap(dir) != CEPH_NOSNAP) { @@ -1050,7 +1065,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size, struct ceph_inode_info *ci = ceph_inode(inode); int left; - if (!ceph_test_opt(ceph_client(inode->i_sb), DIRSTAT)) + if (!ceph_test_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT)) return -EISDIR; if (!cf->dir_info) { @@ -1152,7 +1167,7 @@ void ceph_dentry_lru_add(struct dentry *dn) dout("dentry_lru_add %p %p '%.*s'\n", di, dn, dn->d_name.len, dn->d_name.name); if (di) { - mdsc = &ceph_client(dn->d_sb)->mdsc; + mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc; spin_lock(&mdsc->dentry_lru_lock); list_add_tail(&di->lru, &mdsc->dentry_lru); mdsc->num_dentry++; @@ -1165,10 +1180,10 @@ void ceph_dentry_lru_touch(struct dentry *dn) struct ceph_dentry_info *di = ceph_dentry(dn); struct ceph_mds_client *mdsc; - dout("dentry_lru_touch %p %p '%.*s'\n", di, dn, - dn->d_name.len, dn->d_name.name); + dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn, + dn->d_name.len, dn->d_name.name, di->offset); if (di) { - mdsc = &ceph_client(dn->d_sb)->mdsc; + mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc; spin_lock(&mdsc->dentry_lru_lock); list_move_tail(&di->lru, &mdsc->dentry_lru); spin_unlock(&mdsc->dentry_lru_lock); @@ -1183,7 +1198,7 @@ void ceph_dentry_lru_del(struct dentry *dn) dout("dentry_lru_del %p %p '%.*s'\n", di, dn, dn->d_name.len, dn->d_name.name); if (di) { - mdsc = &ceph_client(dn->d_sb)->mdsc; + mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc; spin_lock(&mdsc->dentry_lru_lock); list_del_init(&di->lru); mdsc->num_dentry--; diff --git a/fs/ceph/export.c b/fs/ceph/export.c index 9d67572fb32..17447644d67 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -93,11 +93,11 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, return ERR_PTR(-ESTALE); dentry = d_obtain_alias(inode); - if (!dentry) { + if (IS_ERR(dentry)) { pr_err("fh_to_dentry %llx -- inode %p but ENOMEM\n", fh->ino, inode); iput(inode); - return ERR_PTR(-ENOMEM); + return dentry; } err = ceph_init_dentry(dentry); @@ -115,7 +115,7 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, static struct dentry *__cfh_to_dentry(struct super_block *sb, struct ceph_nfs_confh *cfh) { - struct ceph_mds_client *mdsc = &ceph_client(sb)->mdsc; + struct ceph_mds_client *mdsc = &ceph_sb_to_client(sb)->mdsc; struct inode *inode; struct dentry *dentry; struct ceph_vino vino; @@ -149,11 +149,11 @@ static struct dentry *__cfh_to_dentry(struct super_block *sb, } dentry = d_obtain_alias(inode); - if (!dentry) { + if (IS_ERR(dentry)) { pr_err("cfh_to_dentry %llx -- inode %p but ENOMEM\n", cfh->ino, inode); iput(inode); - return ERR_PTR(-ENOMEM); + return dentry; } err = ceph_init_dentry(dentry); if (err < 0) { @@ -202,11 +202,11 @@ static struct dentry *ceph_fh_to_parent(struct super_block *sb, return ERR_PTR(-ESTALE); dentry = d_obtain_alias(inode); - if (!dentry) { + if (IS_ERR(dentry)) { pr_err("fh_to_parent %llx -- inode %p but ENOMEM\n", cfh->ino, inode); iput(inode); - return ERR_PTR(-ENOMEM); + return dentry; } err = ceph_init_dentry(dentry); if (err < 0) { diff --git a/fs/ceph/file.c b/fs/ceph/file.c index ed6f19721d6..6512b6701b9 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -317,16 +317,16 @@ void ceph_release_page_vector(struct page **pages, int num_pages) /* * allocate a vector new pages */ -static struct page **alloc_page_vector(int num_pages) +struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags) { struct page **pages; int i; - pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS); + pages = kmalloc(sizeof(*pages) * num_pages, flags); if (!pages) return ERR_PTR(-ENOMEM); for (i = 0; i < num_pages; i++) { - pages[i] = alloc_page(GFP_NOFS); + pages[i] = __page_cache_alloc(flags); if (pages[i] == NULL) { ceph_release_page_vector(pages, i); return ERR_PTR(-ENOMEM); @@ -540,7 +540,7 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data, * in sequence. */ } else { - pages = alloc_page_vector(num_pages); + pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); } if (IS_ERR(pages)) return PTR_ERR(pages); @@ -649,8 +649,8 @@ more: do_sync, ci->i_truncate_seq, ci->i_truncate_size, &mtime, false, 2); - if (IS_ERR(req)) - return PTR_ERR(req); + if (!req) + return -ENOMEM; num_pages = calc_pages_for(pos, len); @@ -668,7 +668,7 @@ more: truncate_inode_pages_range(inode->i_mapping, pos, (pos+len) | (PAGE_CACHE_SIZE-1)); } else { - pages = alloc_page_vector(num_pages); + pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); if (IS_ERR(pages)) { ret = PTR_ERR(pages); goto out; @@ -809,7 +809,7 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov, struct file *file = iocb->ki_filp; struct inode *inode = file->f_dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_osd_client *osdc = &ceph_client(inode->i_sb)->osdc; + struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc; loff_t endoff = pos + iov->iov_len; int got = 0; int ret, err; @@ -844,8 +844,7 @@ retry_snap: if ((ret >= 0 || ret == -EIOCBQUEUED) && ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host) || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) { - err = vfs_fsync_range(file, file->f_path.dentry, - pos, pos + ret - 1, 1); + err = vfs_fsync_range(file, pos, pos + ret - 1, 1); if (err < 0) ret = err; } diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 85b4d2ffdeb..a81b8b662c7 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -384,7 +384,7 @@ void ceph_destroy_inode(struct inode *inode) */ if (ci->i_snap_realm) { struct ceph_mds_client *mdsc = - &ceph_client(ci->vfs_inode.i_sb)->mdsc; + &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; struct ceph_snap_realm *realm = ci->i_snap_realm; dout(" dropping residual ref to snap realm %p\n", realm); @@ -619,11 +619,12 @@ static int fill_inode(struct inode *inode, memcpy(ci->i_xattrs.blob->vec.iov_base, iinfo->xattr_data, iinfo->xattr_len); ci->i_xattrs.version = le64_to_cpu(info->xattr_version); + xattr_blob = NULL; } inode->i_mapping->a_ops = &ceph_aops; inode->i_mapping->backing_dev_info = - &ceph_client(inode->i_sb)->backing_dev_info; + &ceph_sb_to_client(inode->i_sb)->backing_dev_info; switch (inode->i_mode & S_IFMT) { case S_IFIFO: @@ -674,14 +675,15 @@ static int fill_inode(struct inode *inode, /* set dir completion flag? */ if (ci->i_files == 0 && ci->i_subdirs == 0 && ceph_snap(inode) == CEPH_NOSNAP && - (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED)) { + (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) && + (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) { dout(" marking %p complete (empty)\n", inode); ci->i_ceph_flags |= CEPH_I_COMPLETE; ci->i_max_offset = 2; } /* it may be better to set st_size in getattr instead? */ - if (ceph_test_opt(ceph_client(inode->i_sb), RBYTES)) + if (ceph_test_opt(ceph_sb_to_client(inode->i_sb), RBYTES)) inode->i_size = ci->i_rbytes; break; default: @@ -802,6 +804,37 @@ out_unlock: } /* + * Set dentry's directory position based on the current dir's max, and + * order it in d_subdirs, so that dcache_readdir behaves. + */ +static void ceph_set_dentry_offset(struct dentry *dn) +{ + struct dentry *dir = dn->d_parent; + struct inode *inode = dn->d_parent->d_inode; + struct ceph_dentry_info *di; + + BUG_ON(!inode); + + di = ceph_dentry(dn); + + spin_lock(&inode->i_lock); + if ((ceph_inode(inode)->i_ceph_flags & CEPH_I_COMPLETE) == 0) { + spin_unlock(&inode->i_lock); + return; + } + di->offset = ceph_inode(inode)->i_max_offset++; + spin_unlock(&inode->i_lock); + + spin_lock(&dcache_lock); + spin_lock(&dn->d_lock); + list_move_tail(&dir->d_subdirs, &dn->d_u.d_child); + dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset, + dn->d_u.d_child.prev, dn->d_u.d_child.next); + spin_unlock(&dn->d_lock); + spin_unlock(&dcache_lock); +} + +/* * splice a dentry to an inode. * caller must hold directory i_mutex for this to be safe. * @@ -814,6 +847,8 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in, { struct dentry *realdn; + BUG_ON(dn->d_inode); + /* dn must be unhashed */ if (!d_unhashed(dn)) d_drop(dn); @@ -835,44 +870,17 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in, dn = realdn; } else { BUG_ON(!ceph_dentry(dn)); - dout("dn %p attached to %p ino %llx.%llx\n", dn, dn->d_inode, ceph_vinop(dn->d_inode)); } if ((!prehash || *prehash) && d_unhashed(dn)) d_rehash(dn); + ceph_set_dentry_offset(dn); out: return dn; } /* - * Set dentry's directory position based on the current dir's max, and - * order it in d_subdirs, so that dcache_readdir behaves. - */ -static void ceph_set_dentry_offset(struct dentry *dn) -{ - struct dentry *dir = dn->d_parent; - struct inode *inode = dn->d_parent->d_inode; - struct ceph_dentry_info *di; - - BUG_ON(!inode); - - di = ceph_dentry(dn); - - spin_lock(&inode->i_lock); - di->offset = ceph_inode(inode)->i_max_offset++; - spin_unlock(&inode->i_lock); - - spin_lock(&dcache_lock); - spin_lock(&dn->d_lock); - list_move_tail(&dir->d_subdirs, &dn->d_u.d_child); - dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset, - dn->d_u.d_child.prev, dn->d_u.d_child.next); - spin_unlock(&dn->d_lock); - spin_unlock(&dcache_lock); -} - -/* * Incorporate results into the local cache. This is either just * one inode, or a directory, dentry, and possibly linked-to inode (e.g., * after a lookup). @@ -933,14 +941,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, if (!rinfo->head->is_target && !rinfo->head->is_dentry) { dout("fill_trace reply is empty!\n"); - if (rinfo->head->result == 0 && req->r_locked_dir) { - struct ceph_inode_info *ci = - ceph_inode(req->r_locked_dir); - dout(" clearing %p complete (empty trace)\n", - req->r_locked_dir); - ci->i_ceph_flags &= ~CEPH_I_COMPLETE; - ci->i_release_count++; - } + if (rinfo->head->result == 0 && req->r_locked_dir) + ceph_invalidate_dir_request(req); return 0; } @@ -1011,13 +1013,18 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, req->r_old_dentry->d_name.len, req->r_old_dentry->d_name.name, dn, dn->d_name.len, dn->d_name.name); + /* ensure target dentry is invalidated, despite rehashing bug in vfs_rename_dir */ - dn->d_time = jiffies; - ceph_dentry(dn)->lease_shared_gen = 0; + ceph_invalidate_dentry_lease(dn); + /* take overwritten dentry's readdir offset */ + dout("dn %p gets %p offset %lld (old offset %lld)\n", + req->r_old_dentry, dn, ceph_dentry(dn)->offset, + ceph_dentry(req->r_old_dentry)->offset); ceph_dentry(req->r_old_dentry)->offset = ceph_dentry(dn)->offset; + dn = req->r_old_dentry; /* use old_dentry */ in = dn->d_inode; } @@ -1059,7 +1066,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, goto done; } req->r_dentry = dn; /* may have spliced */ - ceph_set_dentry_offset(dn); igrab(in); } else if (ceph_ino(in) == vino.ino && ceph_snap(in) == vino.snap) { @@ -1102,7 +1108,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, err = PTR_ERR(dn); goto done; } - ceph_set_dentry_offset(dn); req->r_dentry = dn; /* may have spliced */ igrab(in); rinfo->head->is_dentry = 1; /* fool notrace handlers */ @@ -1429,7 +1434,7 @@ void ceph_queue_vmtruncate(struct inode *inode) { struct ceph_inode_info *ci = ceph_inode(inode); - if (queue_work(ceph_client(inode->i_sb)->trunc_wq, + if (queue_work(ceph_sb_to_client(inode->i_sb)->trunc_wq, &ci->i_vmtruncate_work)) { dout("ceph_queue_vmtruncate %p\n", inode); igrab(inode); @@ -1518,7 +1523,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) struct inode *parent_inode = dentry->d_parent->d_inode; const unsigned int ia_valid = attr->ia_valid; struct ceph_mds_request *req; - struct ceph_mds_client *mdsc = &ceph_client(dentry->d_sb)->mdsc; + struct ceph_mds_client *mdsc = &ceph_sb_to_client(dentry->d_sb)->mdsc; int issued; int release = 0, dirtied = 0; int mask = 0; diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index 8a5bcae6284..d085f07756b 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -98,7 +98,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) struct ceph_ioctl_dataloc dl; struct inode *inode = file->f_dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_osd_client *osdc = &ceph_client(inode->i_sb)->osdc; + struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc; u64 len = 1, olen; u64 tmp; struct ceph_object_layout ol; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 24561a557e0..885aa5710cf 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -40,7 +40,7 @@ static void __wake_requests(struct ceph_mds_client *mdsc, struct list_head *head); -const static struct ceph_connection_operations mds_con_ops; +static const struct ceph_connection_operations mds_con_ops; /* @@ -665,10 +665,10 @@ static struct ceph_msg *create_session_msg(u32 op, u64 seq) struct ceph_msg *msg; struct ceph_mds_session_head *h; - msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), 0, 0, NULL); - if (IS_ERR(msg)) { + msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS); + if (!msg) { pr_err("create_session_msg ENOMEM creating msg\n"); - return ERR_PTR(PTR_ERR(msg)); + return NULL; } h = msg->front.iov_base; h->op = cpu_to_le32(op); @@ -687,7 +687,6 @@ static int __open_session(struct ceph_mds_client *mdsc, struct ceph_msg *msg; int mstate; int mds = session->s_mds; - int err = 0; /* wait for mds to go active? */ mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds); @@ -698,13 +697,9 @@ static int __open_session(struct ceph_mds_client *mdsc, /* send connect message */ msg = create_session_msg(CEPH_SESSION_REQUEST_OPEN, session->s_seq); - if (IS_ERR(msg)) { - err = PTR_ERR(msg); - goto out; - } + if (!msg) + return -ENOMEM; ceph_con_send(&session->s_con, msg); - -out: return 0; } @@ -804,12 +799,49 @@ out: } static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, - void *arg) + void *arg) { struct ceph_inode_info *ci = ceph_inode(inode); + int drop = 0; + dout("removing cap %p, ci is %p, inode is %p\n", cap, ci, &ci->vfs_inode); - ceph_remove_cap(cap); + spin_lock(&inode->i_lock); + __ceph_remove_cap(cap); + if (!__ceph_is_any_real_caps(ci)) { + struct ceph_mds_client *mdsc = + &ceph_sb_to_client(inode->i_sb)->mdsc; + + spin_lock(&mdsc->cap_dirty_lock); + if (!list_empty(&ci->i_dirty_item)) { + pr_info(" dropping dirty %s state for %p %lld\n", + ceph_cap_string(ci->i_dirty_caps), + inode, ceph_ino(inode)); + ci->i_dirty_caps = 0; + list_del_init(&ci->i_dirty_item); + drop = 1; + } + if (!list_empty(&ci->i_flushing_item)) { + pr_info(" dropping dirty+flushing %s state for %p %lld\n", + ceph_cap_string(ci->i_flushing_caps), + inode, ceph_ino(inode)); + ci->i_flushing_caps = 0; + list_del_init(&ci->i_flushing_item); + mdsc->num_cap_flushing--; + drop = 1; + } + if (drop && ci->i_wrbuffer_ref) { + pr_info(" dropping dirty data for %p %lld\n", + inode, ceph_ino(inode)); + ci->i_wrbuffer_ref = 0; + ci->i_wrbuffer_ref_head = 0; + drop++; + } + spin_unlock(&mdsc->cap_dirty_lock); + } + spin_unlock(&inode->i_lock); + while (drop--) + iput(inode); return 0; } @@ -821,6 +853,7 @@ static void remove_session_caps(struct ceph_mds_session *session) dout("remove_session_caps on %p\n", session); iterate_session_caps(session, remove_session_caps_cb, NULL); BUG_ON(session->s_nr_caps > 0); + BUG_ON(!list_empty(&session->s_cap_flushing)); cleanup_cap_releases(session); } @@ -883,8 +916,8 @@ static int send_renew_caps(struct ceph_mds_client *mdsc, ceph_mds_state_name(state)); msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS, ++session->s_renew_seq); - if (IS_ERR(msg)) - return PTR_ERR(msg); + if (!msg) + return -ENOMEM; ceph_con_send(&session->s_con, msg); return 0; } @@ -931,17 +964,15 @@ static int request_close_session(struct ceph_mds_client *mdsc, struct ceph_mds_session *session) { struct ceph_msg *msg; - int err = 0; dout("request_close_session mds%d state %s seq %lld\n", session->s_mds, session_state_name(session->s_state), session->s_seq); msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq); - if (IS_ERR(msg)) - err = PTR_ERR(msg); - else - ceph_con_send(&session->s_con, msg); - return err; + if (!msg) + return -ENOMEM; + ceph_con_send(&session->s_con, msg); + return 0; } /* @@ -1059,7 +1090,7 @@ static int add_cap_releases(struct ceph_mds_client *mdsc, while (session->s_num_cap_releases < session->s_nr_caps + extra) { spin_unlock(&session->s_cap_lock); msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE, - 0, 0, NULL); + GFP_NOFS); if (!msg) goto out_unlocked; dout("add_cap_releases %p msg %p now %d\n", session, msg, @@ -1151,10 +1182,8 @@ static void send_cap_releases(struct ceph_mds_client *mdsc, struct ceph_msg *msg; dout("send_cap_releases mds%d\n", session->s_mds); - while (1) { - spin_lock(&session->s_cap_lock); - if (list_empty(&session->s_cap_releases_done)) - break; + spin_lock(&session->s_cap_lock); + while (!list_empty(&session->s_cap_releases_done)) { msg = list_first_entry(&session->s_cap_releases_done, struct ceph_msg, list_head); list_del_init(&msg->list_head); @@ -1162,10 +1191,49 @@ static void send_cap_releases(struct ceph_mds_client *mdsc, msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); dout("send_cap_releases mds%d %p\n", session->s_mds, msg); ceph_con_send(&session->s_con, msg); + spin_lock(&session->s_cap_lock); } spin_unlock(&session->s_cap_lock); } +static void discard_cap_releases(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + struct ceph_msg *msg; + struct ceph_mds_cap_release *head; + unsigned num; + + dout("discard_cap_releases mds%d\n", session->s_mds); + spin_lock(&session->s_cap_lock); + + /* zero out the in-progress message */ + msg = list_first_entry(&session->s_cap_releases, + struct ceph_msg, list_head); + head = msg->front.iov_base; + num = le32_to_cpu(head->num); + dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg, num); + head->num = cpu_to_le32(0); + session->s_num_cap_releases += num; + + /* requeue completed messages */ + while (!list_empty(&session->s_cap_releases_done)) { + msg = list_first_entry(&session->s_cap_releases_done, + struct ceph_msg, list_head); + list_del_init(&msg->list_head); + + head = msg->front.iov_base; + num = le32_to_cpu(head->num); + dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg, + num); + session->s_num_cap_releases += num; + head->num = cpu_to_le32(0); + msg->front.iov_len = sizeof(*head); + list_add(&msg->list_head, &session->s_cap_releases); + } + + spin_unlock(&session->s_cap_lock); +} + /* * requests */ @@ -1181,6 +1249,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode) if (!req) return ERR_PTR(-ENOMEM); + mutex_init(&req->r_fill_mutex); req->r_started = jiffies; req->r_resend_mds = -1; INIT_LIST_HEAD(&req->r_unsafe_dir_item); @@ -1251,7 +1320,7 @@ retry: len += 1 + temp->d_name.len; temp = temp->d_parent; if (temp == NULL) { - pr_err("build_path_dentry corrupt dentry %p\n", dentry); + pr_err("build_path corrupt dentry %p\n", dentry); return ERR_PTR(-EINVAL); } } @@ -1267,7 +1336,7 @@ retry: struct inode *inode = temp->d_inode; if (inode && ceph_snap(inode) == CEPH_SNAPDIR) { - dout("build_path_dentry path+%d: %p SNAPDIR\n", + dout("build_path path+%d: %p SNAPDIR\n", pos, temp); } else if (stop_on_nosnap && inode && ceph_snap(inode) == CEPH_NOSNAP) { @@ -1278,20 +1347,18 @@ retry: break; strncpy(path + pos, temp->d_name.name, temp->d_name.len); - dout("build_path_dentry path+%d: %p '%.*s'\n", - pos, temp, temp->d_name.len, path + pos); } if (pos) path[--pos] = '/'; temp = temp->d_parent; if (temp == NULL) { - pr_err("build_path_dentry corrupt dentry\n"); + pr_err("build_path corrupt dentry\n"); kfree(path); return ERR_PTR(-EINVAL); } } if (pos != 0) { - pr_err("build_path_dentry did not end path lookup where " + pr_err("build_path did not end path lookup where " "expected, namelen is %d, pos is %d\n", len, pos); /* presumably this is only possible if racing with a rename of one of the parent directories (we can not @@ -1303,7 +1370,7 @@ retry: *base = ceph_ino(temp->d_inode); *plen = len; - dout("build_path_dentry on %p %d built %llx '%.*s'\n", + dout("build_path on %p %d built %llx '%.*s'\n", dentry, atomic_read(&dentry->d_count), *base, len, path); return path; } @@ -1426,9 +1493,11 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, if (req->r_old_dentry_drop) len += req->r_old_dentry->d_name.len; - msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, 0, 0, NULL); - if (IS_ERR(msg)) + msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, GFP_NOFS); + if (!msg) { + msg = ERR_PTR(-ENOMEM); goto out_free2; + } msg->hdr.tid = cpu_to_le64(req->r_tid); @@ -1517,9 +1586,9 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc, } msg = create_request_message(mdsc, req, mds); if (IS_ERR(msg)) { - req->r_reply = ERR_PTR(PTR_ERR(msg)); + req->r_err = PTR_ERR(msg); complete_request(mdsc, req); - return -PTR_ERR(msg); + return PTR_ERR(msg); } req->r_request = msg; @@ -1552,7 +1621,7 @@ static int __do_request(struct ceph_mds_client *mdsc, int mds = -1; int err = -EAGAIN; - if (req->r_reply) + if (req->r_err || req->r_got_result) goto out; if (req->r_timeout && @@ -1609,7 +1678,7 @@ out: return err; finish: - req->r_reply = ERR_PTR(err); + req->r_err = err; complete_request(mdsc, req); goto out; } @@ -1630,10 +1699,9 @@ static void __wake_requests(struct ceph_mds_client *mdsc, /* * Wake up threads with requests pending for @mds, so that they can - * resubmit their requests to a possibly different mds. If @all is set, - * wake up if their requests has been forwarded to @mds, too. + * resubmit their requests to a possibly different mds. */ -static void kick_requests(struct ceph_mds_client *mdsc, int mds, int all) +static void kick_requests(struct ceph_mds_client *mdsc, int mds) { struct ceph_mds_request *req; struct rb_node *p; @@ -1689,64 +1757,78 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, __register_request(mdsc, req, dir); __do_request(mdsc, req); - /* wait */ - if (!req->r_reply) { - mutex_unlock(&mdsc->mutex); - if (req->r_timeout) { - err = (long)wait_for_completion_interruptible_timeout( - &req->r_completion, req->r_timeout); - if (err == 0) - req->r_reply = ERR_PTR(-EIO); - else if (err < 0) - req->r_reply = ERR_PTR(err); - } else { - err = wait_for_completion_interruptible( - &req->r_completion); - if (err) - req->r_reply = ERR_PTR(err); - } - mutex_lock(&mdsc->mutex); + if (req->r_err) { + err = req->r_err; + __unregister_request(mdsc, req); + dout("do_request early error %d\n", err); + goto out; } - if (IS_ERR(req->r_reply)) { - err = PTR_ERR(req->r_reply); - req->r_reply = NULL; + /* wait */ + mutex_unlock(&mdsc->mutex); + dout("do_request waiting\n"); + if (req->r_timeout) { + err = (long)wait_for_completion_interruptible_timeout( + &req->r_completion, req->r_timeout); + if (err == 0) + err = -EIO; + } else { + err = wait_for_completion_interruptible(&req->r_completion); + } + dout("do_request waited, got %d\n", err); + mutex_lock(&mdsc->mutex); - if (err == -ERESTARTSYS) { - /* aborted */ - req->r_aborted = true; + /* only abort if we didn't race with a real reply */ + if (req->r_got_result) { + err = le32_to_cpu(req->r_reply_info.head->result); + } else if (err < 0) { + dout("aborted request %lld with %d\n", req->r_tid, err); - if (req->r_locked_dir && - (req->r_op & CEPH_MDS_OP_WRITE)) { - struct ceph_inode_info *ci = - ceph_inode(req->r_locked_dir); + /* + * ensure we aren't running concurrently with + * ceph_fill_trace or ceph_readdir_prepopulate, which + * rely on locks (dir mutex) held by our caller. + */ + mutex_lock(&req->r_fill_mutex); + req->r_err = err; + req->r_aborted = true; + mutex_unlock(&req->r_fill_mutex); - dout("aborted, clearing I_COMPLETE on %p\n", - req->r_locked_dir); - spin_lock(&req->r_locked_dir->i_lock); - ci->i_ceph_flags &= ~CEPH_I_COMPLETE; - ci->i_release_count++; - spin_unlock(&req->r_locked_dir->i_lock); - } - } else { - /* clean up this request */ - __unregister_request(mdsc, req); - if (!list_empty(&req->r_unsafe_item)) - list_del_init(&req->r_unsafe_item); - complete(&req->r_safe_completion); - } - } else if (req->r_err) { - err = req->r_err; + if (req->r_locked_dir && + (req->r_op & CEPH_MDS_OP_WRITE)) + ceph_invalidate_dir_request(req); } else { - err = le32_to_cpu(req->r_reply_info.head->result); + err = req->r_err; } - mutex_unlock(&mdsc->mutex); +out: + mutex_unlock(&mdsc->mutex); dout("do_request %p done, result %d\n", req, err); return err; } /* + * Invalidate dir I_COMPLETE, dentry lease state on an aborted MDS + * namespace request. + */ +void ceph_invalidate_dir_request(struct ceph_mds_request *req) +{ + struct inode *inode = req->r_locked_dir; + struct ceph_inode_info *ci = ceph_inode(inode); + + dout("invalidate_dir_request %p (I_COMPLETE, lease(s))\n", inode); + spin_lock(&inode->i_lock); + ci->i_ceph_flags &= ~CEPH_I_COMPLETE; + ci->i_release_count++; + spin_unlock(&inode->i_lock); + + if (req->r_dentry) + ceph_invalidate_dentry_lease(req->r_dentry); + if (req->r_old_dentry) + ceph_invalidate_dentry_lease(req->r_old_dentry); +} + +/* * Handle mds reply. * * We take the session mutex and parse and process the reply immediately. @@ -1797,6 +1879,12 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) mutex_unlock(&mdsc->mutex); goto out; } + if (req->r_got_safe && !head->safe) { + pr_warning("got unsafe after safe on %llu from mds%d\n", + tid, mds); + mutex_unlock(&mdsc->mutex); + goto out; + } result = le32_to_cpu(head->result); @@ -1838,11 +1926,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) mutex_unlock(&mdsc->mutex); goto out; } - } - - BUG_ON(req->r_reply); - - if (!head->safe) { + } else { req->r_got_unsafe = true; list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe); } @@ -1871,21 +1955,30 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) } /* insert trace into our cache */ + mutex_lock(&req->r_fill_mutex); err = ceph_fill_trace(mdsc->client->sb, req, req->r_session); if (err == 0) { if (result == 0 && rinfo->dir_nr) ceph_readdir_prepopulate(req, req->r_session); ceph_unreserve_caps(&req->r_caps_reservation); } + mutex_unlock(&req->r_fill_mutex); up_read(&mdsc->snap_rwsem); out_err: - if (err) { - req->r_err = err; + mutex_lock(&mdsc->mutex); + if (!req->r_aborted) { + if (err) { + req->r_err = err; + } else { + req->r_reply = msg; + ceph_msg_get(msg); + req->r_got_result = true; + } } else { - req->r_reply = msg; - ceph_msg_get(msg); + dout("reply arrived after request %lld was aborted\n", tid); } + mutex_unlock(&mdsc->mutex); add_cap_releases(mdsc, req->r_session, -1); mutex_unlock(&session->s_mutex); @@ -1984,6 +2077,8 @@ static void handle_session(struct ceph_mds_session *session, switch (op) { case CEPH_SESSION_OPEN: + if (session->s_state == CEPH_MDS_SESSION_RECONNECTING) + pr_info("mds%d reconnect success\n", session->s_mds); session->s_state = CEPH_MDS_SESSION_OPEN; renewed_caps(mdsc, session, 0); wake = 1; @@ -1997,10 +2092,12 @@ static void handle_session(struct ceph_mds_session *session, break; case CEPH_SESSION_CLOSE: + if (session->s_state == CEPH_MDS_SESSION_RECONNECTING) + pr_info("mds%d reconnect denied\n", session->s_mds); remove_session_caps(session); wake = 1; /* for good measure */ complete(&mdsc->session_close_waiters); - kick_requests(mdsc, mds, 0); /* cur only */ + kick_requests(mdsc, mds); break; case CEPH_SESSION_STALE: @@ -2132,54 +2229,44 @@ out: * * called with mdsc->mutex held. */ -static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds) +static void send_mds_reconnect(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) { - struct ceph_mds_session *session = NULL; struct ceph_msg *reply; struct rb_node *p; + int mds = session->s_mds; int err = -ENOMEM; struct ceph_pagelist *pagelist; - pr_info("reconnect to recovering mds%d\n", mds); + pr_info("mds%d reconnect start\n", mds); pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS); if (!pagelist) goto fail_nopagelist; ceph_pagelist_init(pagelist); - reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, 0, 0, NULL); - if (IS_ERR(reply)) { - err = PTR_ERR(reply); + reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, GFP_NOFS); + if (!reply) goto fail_nomsg; - } - - /* find session */ - session = __ceph_lookup_mds_session(mdsc, mds); - mutex_unlock(&mdsc->mutex); /* drop lock for duration */ - if (session) { - mutex_lock(&session->s_mutex); + mutex_lock(&session->s_mutex); + session->s_state = CEPH_MDS_SESSION_RECONNECTING; + session->s_seq = 0; - session->s_state = CEPH_MDS_SESSION_RECONNECTING; - session->s_seq = 0; + ceph_con_open(&session->s_con, + ceph_mdsmap_get_addr(mdsc->mdsmap, mds)); - ceph_con_open(&session->s_con, - ceph_mdsmap_get_addr(mdsc->mdsmap, mds)); - - /* replay unsafe requests */ - replay_unsafe_requests(mdsc, session); - } else { - dout("no session for mds%d, will send short reconnect\n", - mds); - } + /* replay unsafe requests */ + replay_unsafe_requests(mdsc, session); down_read(&mdsc->snap_rwsem); - if (!session) - goto send; dout("session %p state %s\n", session, session_state_name(session->s_state)); + /* drop old cap expires; we're about to reestablish that state */ + discard_cap_releases(mdsc, session); + /* traverse this session's caps */ err = ceph_pagelist_encode_32(pagelist, session->s_nr_caps); if (err) @@ -2208,36 +2295,29 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds) goto fail; } -send: reply->pagelist = pagelist; reply->hdr.data_len = cpu_to_le32(pagelist->length); reply->nr_pages = calc_pages_for(0, pagelist->length); ceph_con_send(&session->s_con, reply); - session->s_state = CEPH_MDS_SESSION_OPEN; mutex_unlock(&session->s_mutex); mutex_lock(&mdsc->mutex); __wake_requests(mdsc, &session->s_waiting); mutex_unlock(&mdsc->mutex); - ceph_put_mds_session(session); - up_read(&mdsc->snap_rwsem); - mutex_lock(&mdsc->mutex); return; fail: ceph_msg_put(reply); up_read(&mdsc->snap_rwsem); mutex_unlock(&session->s_mutex); - ceph_put_mds_session(session); fail_nomsg: ceph_pagelist_release(pagelist); kfree(pagelist); fail_nopagelist: pr_err("error %d preparing reconnect for mds%d\n", err, mds); - mutex_lock(&mdsc->mutex); return; } @@ -2290,7 +2370,7 @@ static void check_new_map(struct ceph_mds_client *mdsc, } /* kick any requests waiting on the recovering mds */ - kick_requests(mdsc, i, 1); + kick_requests(mdsc, i); } else if (oldstate == newstate) { continue; /* nothing new with this mds */ } @@ -2299,22 +2379,21 @@ static void check_new_map(struct ceph_mds_client *mdsc, * send reconnect? */ if (s->s_state == CEPH_MDS_SESSION_RESTARTING && - newstate >= CEPH_MDS_STATE_RECONNECT) - send_mds_reconnect(mdsc, i); + newstate >= CEPH_MDS_STATE_RECONNECT) { + mutex_unlock(&mdsc->mutex); + send_mds_reconnect(mdsc, s); + mutex_lock(&mdsc->mutex); + } /* - * kick requests on any mds that has gone active. - * - * kick requests on cur or forwarder: we may have sent - * the request to mds1, mds1 told us it forwarded it - * to mds2, but then we learn mds1 failed and can't be - * sure it successfully forwarded our request before - * it died. + * kick request on any mds that has gone active. */ if (oldstate < CEPH_MDS_STATE_ACTIVE && newstate >= CEPH_MDS_STATE_ACTIVE) { - pr_info("mds%d reconnect completed\n", s->s_mds); - kick_requests(mdsc, i, 1); + if (oldstate != CEPH_MDS_STATE_CREATING && + oldstate != CEPH_MDS_STATE_STARTING) + pr_info("mds%d recovery completed\n", s->s_mds); + kick_requests(mdsc, i); ceph_kick_flushing_caps(mdsc, s); wake_up_session_caps(s, 1); } @@ -2457,8 +2536,8 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session, dnamelen = dentry->d_name.len; len += dnamelen; - msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, 0, 0, NULL); - if (IS_ERR(msg)) + msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS); + if (!msg) return; lease = msg->front.iov_base; lease->action = action; @@ -2603,7 +2682,9 @@ static void delayed_work(struct work_struct *work) else ceph_con_keepalive(&s->s_con); add_cap_releases(mdsc, s, -1); - send_cap_releases(mdsc, s); + if (s->s_state == CEPH_MDS_SESSION_OPEN || + s->s_state == CEPH_MDS_SESSION_HUNG) + send_cap_releases(mdsc, s); mutex_unlock(&s->s_mutex); ceph_put_mds_session(s); @@ -2620,6 +2701,9 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client) mdsc->client = client; mutex_init(&mdsc->mutex); mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS); + if (mdsc->mdsmap == NULL) + return -ENOMEM; + init_completion(&mdsc->safe_umount_waiters); init_completion(&mdsc->session_close_waiters); INIT_LIST_HEAD(&mdsc->waiting_for_map); @@ -2645,6 +2729,7 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client) init_waitqueue_head(&mdsc->cap_flushing_wq); spin_lock_init(&mdsc->dentry_lru_lock); INIT_LIST_HEAD(&mdsc->dentry_lru); + return 0; } @@ -2740,6 +2825,9 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc) { u64 want_tid, want_flush; + if (mdsc->client->mount_state == CEPH_MOUNT_SHUTDOWN) + return; + dout("sync\n"); mutex_lock(&mdsc->mutex); want_tid = mdsc->last_tid; @@ -2922,9 +3010,10 @@ static void con_put(struct ceph_connection *con) static void peer_reset(struct ceph_connection *con) { struct ceph_mds_session *s = con->private; + struct ceph_mds_client *mdsc = s->s_mdsc; - pr_err("mds%d gave us the boot. IMPLEMENT RECONNECT.\n", - s->s_mds); + pr_warning("mds%d closed our session\n", s->s_mds); + send_mds_reconnect(mdsc, s); } static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) @@ -3031,7 +3120,7 @@ static int invalidate_authorizer(struct ceph_connection *con) return ceph_monc_validate_auth(&mdsc->client->monc); } -const static struct ceph_connection_operations mds_con_ops = { +static const struct ceph_connection_operations mds_con_ops = { .get = con_get, .put = con_put, .dispatch = dispatch, diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 961cc6f6587..d9936c4f121 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -165,6 +165,8 @@ struct ceph_mds_request { struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */ struct inode *r_target_inode; /* resulting inode */ + struct mutex r_fill_mutex; + union ceph_mds_request_args r_args; int r_fmode; /* file mode, if expecting cap */ @@ -213,7 +215,7 @@ struct ceph_mds_request { struct completion r_safe_completion; ceph_mds_request_callback_t r_callback; struct list_head r_unsafe_item; /* per-session unsafe list item */ - bool r_got_unsafe, r_got_safe; + bool r_got_unsafe, r_got_safe, r_got_result; bool r_did_prepopulate; u32 r_readdir_offset; @@ -301,6 +303,8 @@ extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode, struct dentry *dn, int mask); +extern void ceph_invalidate_dir_request(struct ceph_mds_request *req); + extern struct ceph_mds_request * ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode); extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c index cd4fadb6491..60b74839ebe 100644 --- a/fs/ceph/messenger.c +++ b/fs/ceph/messenger.c @@ -39,18 +39,6 @@ static void queue_con(struct ceph_connection *con); static void con_work(struct work_struct *); static void ceph_fault(struct ceph_connection *con); -const char *ceph_name_type_str(int t) -{ - switch (t) { - case CEPH_ENTITY_TYPE_MON: return "mon"; - case CEPH_ENTITY_TYPE_MDS: return "mds"; - case CEPH_ENTITY_TYPE_OSD: return "osd"; - case CEPH_ENTITY_TYPE_CLIENT: return "client"; - case CEPH_ENTITY_TYPE_ADMIN: return "admin"; - default: return "???"; - } -} - /* * nicely render a sockaddr as a string. */ @@ -340,6 +328,7 @@ static void reset_connection(struct ceph_connection *con) ceph_msg_put(con->out_msg); con->out_msg = NULL; } + con->out_keepalive_pending = false; con->in_seq = 0; con->in_seq_acked = 0; } @@ -357,6 +346,7 @@ void ceph_con_close(struct ceph_connection *con) clear_bit(WRITE_PENDING, &con->state); mutex_lock(&con->mutex); reset_connection(con); + con->peer_global_seq = 0; cancel_delayed_work(&con->work); mutex_unlock(&con->mutex); queue_con(con); @@ -661,7 +651,7 @@ static void prepare_write_connect(struct ceph_messenger *msgr, dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con, con->connect_seq, global_seq, proto); - con->out_connect.features = CEPH_FEATURE_SUPPORTED; + con->out_connect.features = CEPH_FEATURE_SUPPORTED_CLIENT; con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT); con->out_connect.connect_seq = cpu_to_le32(con->connect_seq); con->out_connect.global_seq = cpu_to_le32(global_seq); @@ -1124,8 +1114,8 @@ static void fail_protocol(struct ceph_connection *con) static int process_connect(struct ceph_connection *con) { - u64 sup_feat = CEPH_FEATURE_SUPPORTED; - u64 req_feat = CEPH_FEATURE_REQUIRED; + u64 sup_feat = CEPH_FEATURE_SUPPORTED_CLIENT; + u64 req_feat = CEPH_FEATURE_REQUIRED_CLIENT; u64 server_feat = le64_to_cpu(con->in_reply.features); dout("process_connect on %p tag %d\n", con, (int)con->in_tag); @@ -1233,6 +1223,7 @@ static int process_connect(struct ceph_connection *con) clear_bit(CONNECTING, &con->state); con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq); con->connect_seq++; + con->peer_features = server_feat; dout("process_connect got READY gseq %d cseq %d (%d)\n", con->peer_global_seq, le32_to_cpu(con->in_reply.connect_seq), @@ -1402,19 +1393,17 @@ static int read_partial_message(struct ceph_connection *con) con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip); if (skip) { /* skip this message */ - dout("alloc_msg returned NULL, skipping message\n"); + dout("alloc_msg said skip message\n"); con->in_base_pos = -front_len - middle_len - data_len - sizeof(m->footer); con->in_tag = CEPH_MSGR_TAG_READY; con->in_seq++; return 0; } - if (IS_ERR(con->in_msg)) { - ret = PTR_ERR(con->in_msg); - con->in_msg = NULL; + if (!con->in_msg) { con->error_msg = "error allocating memory for incoming message"; - return ret; + return -ENOMEM; } m = con->in_msg; m->front.iov_len = 0; /* haven't read it yet */ @@ -1514,14 +1503,14 @@ static void process_message(struct ceph_connection *con) /* if first message, set peer_name */ if (con->peer_name.type == 0) - con->peer_name = msg->hdr.src.name; + con->peer_name = msg->hdr.src; con->in_seq++; mutex_unlock(&con->mutex); dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n", msg, le64_to_cpu(msg->hdr.seq), - ENTITY_NAME(msg->hdr.src.name), + ENTITY_NAME(msg->hdr.src), le16_to_cpu(msg->hdr.type), ceph_msg_type_name(le16_to_cpu(msg->hdr.type)), le32_to_cpu(msg->hdr.front_len), @@ -1546,7 +1535,6 @@ static int try_write(struct ceph_connection *con) dout("try_write start %p state %lu nref %d\n", con, con->state, atomic_read(&con->nref)); - mutex_lock(&con->mutex); more: dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes); @@ -1639,7 +1627,6 @@ do_next: done: ret = 0; out: - mutex_unlock(&con->mutex); dout("try_write done on %p\n", con); return ret; } @@ -1651,7 +1638,6 @@ out: */ static int try_read(struct ceph_connection *con) { - struct ceph_messenger *msgr; int ret = -1; if (!con->sock) @@ -1661,9 +1647,6 @@ static int try_read(struct ceph_connection *con) return 0; dout("try_read start on %p\n", con); - msgr = con->msgr; - - mutex_lock(&con->mutex); more: dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag, @@ -1758,7 +1741,6 @@ more: done: ret = 0; out: - mutex_unlock(&con->mutex); dout("try_read done on %p\n", con); return ret; @@ -1830,6 +1812,8 @@ more: dout("con_work %p start, clearing QUEUED\n", con); clear_bit(QUEUED, &con->state); + mutex_lock(&con->mutex); + if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */ dout("con_work CLOSED\n"); con_close_socket(con); @@ -1844,11 +1828,16 @@ more: if (test_and_clear_bit(SOCK_CLOSED, &con->state) || try_read(con) < 0 || try_write(con) < 0) { + mutex_unlock(&con->mutex); backoff = 1; ceph_fault(con); /* error/fault path */ + goto done_unlocked; } done: + mutex_unlock(&con->mutex); + +done_unlocked: clear_bit(BUSY, &con->state); dout("con->state=%lu\n", con->state); if (test_bit(QUEUED, &con->state)) { @@ -1947,7 +1936,7 @@ struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr) /* the zero page is needed if a request is "canceled" while the message * is being written over the socket */ - msgr->zero_page = alloc_page(GFP_KERNEL | __GFP_ZERO); + msgr->zero_page = __page_cache_alloc(GFP_KERNEL | __GFP_ZERO); if (!msgr->zero_page) { kfree(msgr); return ERR_PTR(-ENOMEM); @@ -1987,9 +1976,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg) } /* set src+dst */ - msg->hdr.src.name = con->msgr->inst.name; - msg->hdr.src.addr = con->msgr->my_enc_addr; - msg->hdr.orig_src = msg->hdr.src; + msg->hdr.src = con->msgr->inst.name; BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len)); @@ -2083,12 +2070,11 @@ void ceph_con_keepalive(struct ceph_connection *con) * construct a new message with given type, size * the new msg has a ref count of 1. */ -struct ceph_msg *ceph_msg_new(int type, int front_len, - int page_len, int page_off, struct page **pages) +struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags) { struct ceph_msg *m; - m = kmalloc(sizeof(*m), GFP_NOFS); + m = kmalloc(sizeof(*m), flags); if (m == NULL) goto out; kref_init(&m->kref); @@ -2100,8 +2086,8 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, m->hdr.version = 0; m->hdr.front_len = cpu_to_le32(front_len); m->hdr.middle_len = 0; - m->hdr.data_len = cpu_to_le32(page_len); - m->hdr.data_off = cpu_to_le16(page_off); + m->hdr.data_len = 0; + m->hdr.data_off = 0; m->hdr.reserved = 0; m->footer.front_crc = 0; m->footer.middle_crc = 0; @@ -2115,11 +2101,11 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, /* front */ if (front_len) { if (front_len > PAGE_CACHE_SIZE) { - m->front.iov_base = __vmalloc(front_len, GFP_NOFS, + m->front.iov_base = __vmalloc(front_len, flags, PAGE_KERNEL); m->front_is_vmalloc = true; } else { - m->front.iov_base = kmalloc(front_len, GFP_NOFS); + m->front.iov_base = kmalloc(front_len, flags); } if (m->front.iov_base == NULL) { pr_err("msg_new can't allocate %d bytes\n", @@ -2135,19 +2121,18 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, m->middle = NULL; /* data */ - m->nr_pages = calc_pages_for(page_off, page_len); - m->pages = pages; + m->nr_pages = 0; + m->pages = NULL; m->pagelist = NULL; - dout("ceph_msg_new %p page %d~%d -> %d\n", m, page_off, page_len, - m->nr_pages); + dout("ceph_msg_new %p front %d\n", m, front_len); return m; out2: ceph_msg_put(m); out: - pr_err("msg_new can't create type %d len %d\n", type, front_len); - return ERR_PTR(-ENOMEM); + pr_err("msg_new can't create type %d front %d\n", type, front_len); + return NULL; } /* @@ -2190,29 +2175,25 @@ static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con, mutex_unlock(&con->mutex); msg = con->ops->alloc_msg(con, hdr, skip); mutex_lock(&con->mutex); - if (IS_ERR(msg)) - return msg; - - if (*skip) + if (!msg || *skip) return NULL; } if (!msg) { *skip = 0; - msg = ceph_msg_new(type, front_len, 0, 0, NULL); + msg = ceph_msg_new(type, front_len, GFP_NOFS); if (!msg) { pr_err("unable to allocate msg type %d len %d\n", type, front_len); - return ERR_PTR(-ENOMEM); + return NULL; } } memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr)); - if (middle_len) { + if (middle_len && !msg->middle) { ret = ceph_alloc_middle(con, msg); - if (ret < 0) { ceph_msg_put(msg); - return msg; + return NULL; } } diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h index a5caf91cc97..00a9430b1ff 100644 --- a/fs/ceph/messenger.h +++ b/fs/ceph/messenger.h @@ -49,10 +49,8 @@ struct ceph_connection_operations { int *skip); }; -extern const char *ceph_name_type_str(int t); - /* use format string %s%d */ -#define ENTITY_NAME(n) ceph_name_type_str((n).type), le64_to_cpu((n).num) +#define ENTITY_NAME(n) ceph_entity_type_name((n).type), le64_to_cpu((n).num) struct ceph_messenger { struct ceph_entity_inst inst; /* my name+address */ @@ -144,6 +142,7 @@ struct ceph_connection { struct ceph_entity_addr peer_addr; /* peer address */ struct ceph_entity_name peer_name; /* peer name */ struct ceph_entity_addr peer_addr_for_me; + unsigned peer_features; u32 connect_seq; /* identify the most recent connection attempt for this connection, client */ u32 peer_global_seq; /* peer's global seq for this connection */ @@ -158,7 +157,6 @@ struct ceph_connection { struct list_head out_queue; struct list_head out_sent; /* sending or sent but unacked */ u64 out_seq; /* last message queued for send */ - u64 out_seq_sent; /* last message sent */ bool out_keepalive_pending; u64 in_seq, in_seq_acked; /* last message received, acked */ @@ -234,9 +232,7 @@ extern void ceph_con_keepalive(struct ceph_connection *con); extern struct ceph_connection *ceph_con_get(struct ceph_connection *con); extern void ceph_con_put(struct ceph_connection *con); -extern struct ceph_msg *ceph_msg_new(int type, int front_len, - int page_len, int page_off, - struct page **pages); +extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags); extern void ceph_msg_kfree(struct ceph_msg *m); diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c index 8fdc011ca95..f6510a476e7 100644 --- a/fs/ceph/mon_client.c +++ b/fs/ceph/mon_client.c @@ -28,7 +28,7 @@ * resend any outstanding requests. */ -const static struct ceph_connection_operations mon_con_ops; +static const struct ceph_connection_operations mon_con_ops; static int __validate_auth(struct ceph_mon_client *monc); @@ -104,6 +104,7 @@ static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len) monc->pending_auth = 1; monc->m_auth->front.iov_len = len; monc->m_auth->hdr.front_len = cpu_to_le32(len); + ceph_con_revoke(monc->con, monc->m_auth); ceph_msg_get(monc->m_auth); /* keep our ref */ ceph_con_send(monc->con, monc->m_auth); } @@ -187,16 +188,12 @@ static void __send_subscribe(struct ceph_mon_client *monc) monc->want_next_osdmap); if ((__sub_expired(monc) && !monc->sub_sent) || monc->want_next_osdmap == 1) { - struct ceph_msg *msg; + struct ceph_msg *msg = monc->m_subscribe; struct ceph_mon_subscribe_item *i; void *p, *end; - msg = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, 0, 0, NULL); - if (!msg) - return; - p = msg->front.iov_base; - end = p + msg->front.iov_len; + end = p + msg->front_max; dout("__send_subscribe to 'mdsmap' %u+\n", (unsigned)monc->have_mdsmap); @@ -226,7 +223,8 @@ static void __send_subscribe(struct ceph_mon_client *monc) msg->front.iov_len = p - msg->front.iov_base; msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); - ceph_con_send(monc->con, msg); + ceph_con_revoke(monc->con, msg); + ceph_con_send(monc->con, ceph_msg_get(msg)); monc->sub_sent = jiffies | 1; /* never 0 */ } @@ -353,14 +351,14 @@ out: /* * statfs */ -static struct ceph_mon_statfs_request *__lookup_statfs( +static struct ceph_mon_generic_request *__lookup_generic_req( struct ceph_mon_client *monc, u64 tid) { - struct ceph_mon_statfs_request *req; - struct rb_node *n = monc->statfs_request_tree.rb_node; + struct ceph_mon_generic_request *req; + struct rb_node *n = monc->generic_request_tree.rb_node; while (n) { - req = rb_entry(n, struct ceph_mon_statfs_request, node); + req = rb_entry(n, struct ceph_mon_generic_request, node); if (tid < req->tid) n = n->rb_left; else if (tid > req->tid) @@ -371,16 +369,16 @@ static struct ceph_mon_statfs_request *__lookup_statfs( return NULL; } -static void __insert_statfs(struct ceph_mon_client *monc, - struct ceph_mon_statfs_request *new) +static void __insert_generic_request(struct ceph_mon_client *monc, + struct ceph_mon_generic_request *new) { - struct rb_node **p = &monc->statfs_request_tree.rb_node; + struct rb_node **p = &monc->generic_request_tree.rb_node; struct rb_node *parent = NULL; - struct ceph_mon_statfs_request *req = NULL; + struct ceph_mon_generic_request *req = NULL; while (*p) { parent = *p; - req = rb_entry(parent, struct ceph_mon_statfs_request, node); + req = rb_entry(parent, struct ceph_mon_generic_request, node); if (new->tid < req->tid) p = &(*p)->rb_left; else if (new->tid > req->tid) @@ -390,113 +388,157 @@ static void __insert_statfs(struct ceph_mon_client *monc, } rb_link_node(&new->node, parent, p); - rb_insert_color(&new->node, &monc->statfs_request_tree); + rb_insert_color(&new->node, &monc->generic_request_tree); +} + +static void release_generic_request(struct kref *kref) +{ + struct ceph_mon_generic_request *req = + container_of(kref, struct ceph_mon_generic_request, kref); + + if (req->reply) + ceph_msg_put(req->reply); + if (req->request) + ceph_msg_put(req->request); +} + +static void put_generic_request(struct ceph_mon_generic_request *req) +{ + kref_put(&req->kref, release_generic_request); +} + +static void get_generic_request(struct ceph_mon_generic_request *req) +{ + kref_get(&req->kref); +} + +static struct ceph_msg *get_generic_reply(struct ceph_connection *con, + struct ceph_msg_header *hdr, + int *skip) +{ + struct ceph_mon_client *monc = con->private; + struct ceph_mon_generic_request *req; + u64 tid = le64_to_cpu(hdr->tid); + struct ceph_msg *m; + + mutex_lock(&monc->mutex); + req = __lookup_generic_req(monc, tid); + if (!req) { + dout("get_generic_reply %lld dne\n", tid); + *skip = 1; + m = NULL; + } else { + dout("get_generic_reply %lld got %p\n", tid, req->reply); + m = ceph_msg_get(req->reply); + /* + * we don't need to track the connection reading into + * this reply because we only have one open connection + * at a time, ever. + */ + } + mutex_unlock(&monc->mutex); + return m; } static void handle_statfs_reply(struct ceph_mon_client *monc, struct ceph_msg *msg) { - struct ceph_mon_statfs_request *req; + struct ceph_mon_generic_request *req; struct ceph_mon_statfs_reply *reply = msg->front.iov_base; - u64 tid; + u64 tid = le64_to_cpu(msg->hdr.tid); if (msg->front.iov_len != sizeof(*reply)) goto bad; - tid = le64_to_cpu(msg->hdr.tid); dout("handle_statfs_reply %p tid %llu\n", msg, tid); mutex_lock(&monc->mutex); - req = __lookup_statfs(monc, tid); + req = __lookup_generic_req(monc, tid); if (req) { - *req->buf = reply->st; + *(struct ceph_statfs *)req->buf = reply->st; req->result = 0; + get_generic_request(req); } mutex_unlock(&monc->mutex); - if (req) + if (req) { complete(&req->completion); + put_generic_request(req); + } return; bad: - pr_err("corrupt statfs reply, no tid\n"); + pr_err("corrupt generic reply, no tid\n"); ceph_msg_dump(msg); } /* - * (re)send a statfs request + * Do a synchronous statfs(). */ -static int send_statfs(struct ceph_mon_client *monc, - struct ceph_mon_statfs_request *req) +int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf) { - struct ceph_msg *msg; + struct ceph_mon_generic_request *req; struct ceph_mon_statfs *h; + int err; - dout("send_statfs tid %llu\n", req->tid); - msg = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), 0, 0, NULL); - if (IS_ERR(msg)) - return PTR_ERR(msg); - req->request = msg; - msg->hdr.tid = cpu_to_le64(req->tid); - h = msg->front.iov_base; + req = kzalloc(sizeof(*req), GFP_NOFS); + if (!req) + return -ENOMEM; + + kref_init(&req->kref); + req->buf = buf; + init_completion(&req->completion); + + err = -ENOMEM; + req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS); + if (!req->request) + goto out; + req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS); + if (!req->reply) + goto out; + + /* fill out request */ + h = req->request->front.iov_base; h->monhdr.have_version = 0; h->monhdr.session_mon = cpu_to_le16(-1); h->monhdr.session_mon_tid = 0; h->fsid = monc->monmap->fsid; - ceph_con_send(monc->con, msg); - return 0; -} - -/* - * Do a synchronous statfs(). - */ -int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf) -{ - struct ceph_mon_statfs_request req; - int err; - - req.buf = buf; - init_completion(&req.completion); - - /* allocate memory for reply */ - err = ceph_msgpool_resv(&monc->msgpool_statfs_reply, 1); - if (err) - return err; /* register request */ mutex_lock(&monc->mutex); - req.tid = ++monc->last_tid; - req.last_attempt = jiffies; - req.delay = BASE_DELAY_INTERVAL; - __insert_statfs(monc, &req); - monc->num_statfs_requests++; + req->tid = ++monc->last_tid; + req->request->hdr.tid = cpu_to_le64(req->tid); + __insert_generic_request(monc, req); + monc->num_generic_requests++; mutex_unlock(&monc->mutex); /* send request and wait */ - err = send_statfs(monc, &req); - if (!err) - err = wait_for_completion_interruptible(&req.completion); + ceph_con_send(monc->con, ceph_msg_get(req->request)); + err = wait_for_completion_interruptible(&req->completion); mutex_lock(&monc->mutex); - rb_erase(&req.node, &monc->statfs_request_tree); - monc->num_statfs_requests--; - ceph_msgpool_resv(&monc->msgpool_statfs_reply, -1); + rb_erase(&req->node, &monc->generic_request_tree); + monc->num_generic_requests--; mutex_unlock(&monc->mutex); if (!err) - err = req.result; + err = req->result; + +out: + kref_put(&req->kref, release_generic_request); return err; } /* * Resend pending statfs requests. */ -static void __resend_statfs(struct ceph_mon_client *monc) +static void __resend_generic_request(struct ceph_mon_client *monc) { - struct ceph_mon_statfs_request *req; + struct ceph_mon_generic_request *req; struct rb_node *p; - for (p = rb_first(&monc->statfs_request_tree); p; p = rb_next(p)) { - req = rb_entry(p, struct ceph_mon_statfs_request, node); - send_statfs(monc, req); + for (p = rb_first(&monc->generic_request_tree); p; p = rb_next(p)) { + req = rb_entry(p, struct ceph_mon_generic_request, node); + ceph_con_revoke(monc->con, req->request); + ceph_con_send(monc->con, ceph_msg_get(req->request)); } } @@ -586,26 +628,26 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl) CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS; - /* msg pools */ - err = ceph_msgpool_init(&monc->msgpool_subscribe_ack, - sizeof(struct ceph_mon_subscribe_ack), 1, false); - if (err < 0) + /* msgs */ + err = -ENOMEM; + monc->m_subscribe_ack = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE_ACK, + sizeof(struct ceph_mon_subscribe_ack), + GFP_NOFS); + if (!monc->m_subscribe_ack) goto out_monmap; - err = ceph_msgpool_init(&monc->msgpool_statfs_reply, - sizeof(struct ceph_mon_statfs_reply), 0, false); - if (err < 0) - goto out_pool1; - err = ceph_msgpool_init(&monc->msgpool_auth_reply, 4096, 1, false); - if (err < 0) - goto out_pool2; - - monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, 0, 0, NULL); + + monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS); + if (!monc->m_subscribe) + goto out_subscribe_ack; + + monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096, GFP_NOFS); + if (!monc->m_auth_reply) + goto out_subscribe; + + monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, GFP_NOFS); monc->pending_auth = 0; - if (IS_ERR(monc->m_auth)) { - err = PTR_ERR(monc->m_auth); - monc->m_auth = NULL; - goto out_pool3; - } + if (!monc->m_auth) + goto out_auth_reply; monc->cur_mon = -1; monc->hunting = true; @@ -613,8 +655,8 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl) monc->sub_sent = 0; INIT_DELAYED_WORK(&monc->delayed_work, delayed_work); - monc->statfs_request_tree = RB_ROOT; - monc->num_statfs_requests = 0; + monc->generic_request_tree = RB_ROOT; + monc->num_generic_requests = 0; monc->last_tid = 0; monc->have_mdsmap = 0; @@ -622,12 +664,12 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl) monc->want_next_osdmap = 1; return 0; -out_pool3: - ceph_msgpool_destroy(&monc->msgpool_auth_reply); -out_pool2: - ceph_msgpool_destroy(&monc->msgpool_subscribe_ack); -out_pool1: - ceph_msgpool_destroy(&monc->msgpool_statfs_reply); +out_auth_reply: + ceph_msg_put(monc->m_auth_reply); +out_subscribe: + ceph_msg_put(monc->m_subscribe); +out_subscribe_ack: + ceph_msg_put(monc->m_subscribe_ack); out_monmap: kfree(monc->monmap); out: @@ -651,9 +693,9 @@ void ceph_monc_stop(struct ceph_mon_client *monc) ceph_auth_destroy(monc->auth); ceph_msg_put(monc->m_auth); - ceph_msgpool_destroy(&monc->msgpool_subscribe_ack); - ceph_msgpool_destroy(&monc->msgpool_statfs_reply); - ceph_msgpool_destroy(&monc->msgpool_auth_reply); + ceph_msg_put(monc->m_auth_reply); + ceph_msg_put(monc->m_subscribe); + ceph_msg_put(monc->m_subscribe_ack); kfree(monc->monmap); } @@ -681,7 +723,7 @@ static void handle_auth_reply(struct ceph_mon_client *monc, monc->client->msgr->inst.name.num = monc->auth->global_id; __send_subscribe(monc); - __resend_statfs(monc); + __resend_generic_request(monc); } mutex_unlock(&monc->mutex); } @@ -770,18 +812,17 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con, switch (type) { case CEPH_MSG_MON_SUBSCRIBE_ACK: - m = ceph_msgpool_get(&monc->msgpool_subscribe_ack, front_len); + m = ceph_msg_get(monc->m_subscribe_ack); break; case CEPH_MSG_STATFS_REPLY: - m = ceph_msgpool_get(&monc->msgpool_statfs_reply, front_len); - break; + return get_generic_reply(con, hdr, skip); case CEPH_MSG_AUTH_REPLY: - m = ceph_msgpool_get(&monc->msgpool_auth_reply, front_len); + m = ceph_msg_get(monc->m_auth_reply); break; case CEPH_MSG_MON_MAP: case CEPH_MSG_MDS_MAP: case CEPH_MSG_OSD_MAP: - m = ceph_msg_new(type, front_len, 0, 0, NULL); + m = ceph_msg_new(type, front_len, GFP_NOFS); break; } @@ -826,7 +867,7 @@ out: mutex_unlock(&monc->mutex); } -const static struct ceph_connection_operations mon_con_ops = { +static const struct ceph_connection_operations mon_con_ops = { .get = ceph_con_get, .put = ceph_con_put, .dispatch = dispatch, diff --git a/fs/ceph/mon_client.h b/fs/ceph/mon_client.h index b958ad5afa0..174d794321d 100644 --- a/fs/ceph/mon_client.h +++ b/fs/ceph/mon_client.h @@ -2,10 +2,10 @@ #define _FS_CEPH_MON_CLIENT_H #include <linux/completion.h> +#include <linux/kref.h> #include <linux/rbtree.h> #include "messenger.h" -#include "msgpool.h" struct ceph_client; struct ceph_mount_args; @@ -22,7 +22,7 @@ struct ceph_monmap { }; struct ceph_mon_client; -struct ceph_mon_statfs_request; +struct ceph_mon_generic_request; /* @@ -40,17 +40,19 @@ struct ceph_mon_request { }; /* - * statfs() is done a bit differently because we need to get data back + * ceph_mon_generic_request is being used for the statfs and poolop requests + * which are bening done a bit differently because we need to get data back * to the caller */ -struct ceph_mon_statfs_request { +struct ceph_mon_generic_request { + struct kref kref; u64 tid; struct rb_node node; int result; - struct ceph_statfs *buf; + void *buf; struct completion completion; - unsigned long last_attempt, delay; /* jiffies */ struct ceph_msg *request; /* original request */ + struct ceph_msg *reply; /* and reply */ }; struct ceph_mon_client { @@ -61,7 +63,7 @@ struct ceph_mon_client { struct delayed_work delayed_work; struct ceph_auth_client *auth; - struct ceph_msg *m_auth; + struct ceph_msg *m_auth, *m_auth_reply, *m_subscribe, *m_subscribe_ack; int pending_auth; bool hunting; @@ -70,14 +72,9 @@ struct ceph_mon_client { struct ceph_connection *con; bool have_fsid; - /* msg pools */ - struct ceph_msgpool msgpool_subscribe_ack; - struct ceph_msgpool msgpool_statfs_reply; - struct ceph_msgpool msgpool_auth_reply; - - /* pending statfs requests */ - struct rb_root statfs_request_tree; - int num_statfs_requests; + /* pending generic requests */ + struct rb_root generic_request_tree; + int num_generic_requests; u64 last_tid; /* mds/osd map */ diff --git a/fs/ceph/msgpool.c b/fs/ceph/msgpool.c index ca3b44a89f2..dd65a643813 100644 --- a/fs/ceph/msgpool.c +++ b/fs/ceph/msgpool.c @@ -7,180 +7,58 @@ #include "msgpool.h" -/* - * We use msg pools to preallocate memory for messages we expect to - * receive over the wire, to avoid getting ourselves into OOM - * conditions at unexpected times. We take use a few different - * strategies: - * - * - for request/response type interactions, we preallocate the - * memory needed for the response when we generate the request. - * - * - for messages we can receive at any time from the MDS, we preallocate - * a pool of messages we can re-use. - * - * - for writeback, we preallocate some number of messages to use for - * requests and their replies, so that we always make forward - * progress. - * - * The msgpool behaves like a mempool_t, but keeps preallocated - * ceph_msgs strung together on a list_head instead of using a pointer - * vector. This avoids vector reallocation when we adjust the number - * of preallocated items (which happens frequently). - */ +static void *alloc_fn(gfp_t gfp_mask, void *arg) +{ + struct ceph_msgpool *pool = arg; + void *p; + p = ceph_msg_new(0, pool->front_len, gfp_mask); + if (!p) + pr_err("msgpool %s alloc failed\n", pool->name); + return p; +} -/* - * Allocate or release as necessary to meet our target pool size. - */ -static int __fill_msgpool(struct ceph_msgpool *pool) +static void free_fn(void *element, void *arg) { - struct ceph_msg *msg; - - while (pool->num < pool->min) { - dout("fill_msgpool %p %d/%d allocating\n", pool, pool->num, - pool->min); - spin_unlock(&pool->lock); - msg = ceph_msg_new(0, pool->front_len, 0, 0, NULL); - spin_lock(&pool->lock); - if (IS_ERR(msg)) - return PTR_ERR(msg); - msg->pool = pool; - list_add(&msg->list_head, &pool->msgs); - pool->num++; - } - while (pool->num > pool->min) { - msg = list_first_entry(&pool->msgs, struct ceph_msg, list_head); - dout("fill_msgpool %p %d/%d releasing %p\n", pool, pool->num, - pool->min, msg); - list_del_init(&msg->list_head); - pool->num--; - ceph_msg_kfree(msg); - } - return 0; + ceph_msg_put(element); } int ceph_msgpool_init(struct ceph_msgpool *pool, - int front_len, int min, bool blocking) + int front_len, int size, bool blocking, const char *name) { - int ret; - - dout("msgpool_init %p front_len %d min %d\n", pool, front_len, min); - spin_lock_init(&pool->lock); pool->front_len = front_len; - INIT_LIST_HEAD(&pool->msgs); - pool->num = 0; - pool->min = min; - pool->blocking = blocking; - init_waitqueue_head(&pool->wait); - - spin_lock(&pool->lock); - ret = __fill_msgpool(pool); - spin_unlock(&pool->lock); - return ret; + pool->pool = mempool_create(size, alloc_fn, free_fn, pool); + if (!pool->pool) + return -ENOMEM; + pool->name = name; + return 0; } void ceph_msgpool_destroy(struct ceph_msgpool *pool) { - dout("msgpool_destroy %p\n", pool); - spin_lock(&pool->lock); - pool->min = 0; - __fill_msgpool(pool); - spin_unlock(&pool->lock); + mempool_destroy(pool->pool); } -int ceph_msgpool_resv(struct ceph_msgpool *pool, int delta) +struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool, + int front_len) { - int ret; - - spin_lock(&pool->lock); - dout("msgpool_resv %p delta %d\n", pool, delta); - pool->min += delta; - ret = __fill_msgpool(pool); - spin_unlock(&pool->lock); - return ret; -} - -struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool, int front_len) -{ - wait_queue_t wait; - struct ceph_msg *msg; - - if (front_len && front_len > pool->front_len) { - pr_err("msgpool_get pool %p need front %d, pool size is %d\n", - pool, front_len, pool->front_len); + if (front_len > pool->front_len) { + pr_err("msgpool_get pool %s need front %d, pool size is %d\n", + pool->name, front_len, pool->front_len); WARN_ON(1); /* try to alloc a fresh message */ - msg = ceph_msg_new(0, front_len, 0, 0, NULL); - if (!IS_ERR(msg)) - return msg; - } - - if (!front_len) - front_len = pool->front_len; - - if (pool->blocking) { - /* mempool_t behavior; first try to alloc */ - msg = ceph_msg_new(0, front_len, 0, 0, NULL); - if (!IS_ERR(msg)) - return msg; + return ceph_msg_new(0, front_len, GFP_NOFS); } - while (1) { - spin_lock(&pool->lock); - if (likely(pool->num)) { - msg = list_entry(pool->msgs.next, struct ceph_msg, - list_head); - list_del_init(&msg->list_head); - pool->num--; - dout("msgpool_get %p got %p, now %d/%d\n", pool, msg, - pool->num, pool->min); - spin_unlock(&pool->lock); - return msg; - } - pr_err("msgpool_get %p now %d/%d, %s\n", pool, pool->num, - pool->min, pool->blocking ? "waiting" : "may fail"); - spin_unlock(&pool->lock); - - if (!pool->blocking) { - WARN_ON(1); - - /* maybe we can allocate it now? */ - msg = ceph_msg_new(0, front_len, 0, 0, NULL); - if (!IS_ERR(msg)) - return msg; - - pr_err("msgpool_get %p empty + alloc failed\n", pool); - return ERR_PTR(-ENOMEM); - } - - init_wait(&wait); - prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE); - schedule(); - finish_wait(&pool->wait, &wait); - } + return mempool_alloc(pool->pool, GFP_NOFS); } void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg) { - spin_lock(&pool->lock); - if (pool->num < pool->min) { - /* reset msg front_len; user may have changed it */ - msg->front.iov_len = pool->front_len; - msg->hdr.front_len = cpu_to_le32(pool->front_len); + /* reset msg front_len; user may have changed it */ + msg->front.iov_len = pool->front_len; + msg->hdr.front_len = cpu_to_le32(pool->front_len); - kref_set(&msg->kref, 1); /* retake a single ref */ - list_add(&msg->list_head, &pool->msgs); - pool->num++; - dout("msgpool_put %p reclaim %p, now %d/%d\n", pool, msg, - pool->num, pool->min); - spin_unlock(&pool->lock); - wake_up(&pool->wait); - } else { - dout("msgpool_put %p drop %p, at %d/%d\n", pool, msg, - pool->num, pool->min); - spin_unlock(&pool->lock); - ceph_msg_kfree(msg); - } + kref_init(&msg->kref); /* retake single ref */ } diff --git a/fs/ceph/msgpool.h b/fs/ceph/msgpool.h index bc834bfcd72..a362605f936 100644 --- a/fs/ceph/msgpool.h +++ b/fs/ceph/msgpool.h @@ -1,6 +1,7 @@ #ifndef _FS_CEPH_MSGPOOL #define _FS_CEPH_MSGPOOL +#include <linux/mempool.h> #include "messenger.h" /* @@ -8,18 +9,15 @@ * avoid unexpected OOM conditions. */ struct ceph_msgpool { - spinlock_t lock; + const char *name; + mempool_t *pool; int front_len; /* preallocated payload size */ - struct list_head msgs; /* msgs in the pool; each has 1 ref */ - int num, min; /* cur, min # msgs in the pool */ - bool blocking; - wait_queue_head_t wait; }; extern int ceph_msgpool_init(struct ceph_msgpool *pool, - int front_len, int size, bool blocking); + int front_len, int size, bool blocking, + const char *name); extern void ceph_msgpool_destroy(struct ceph_msgpool *pool); -extern int ceph_msgpool_resv(struct ceph_msgpool *, int delta); extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *, int front_len); extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *); diff --git a/fs/ceph/msgr.h b/fs/ceph/msgr.h index 8aaab414f3f..892a0298dfd 100644 --- a/fs/ceph/msgr.h +++ b/fs/ceph/msgr.h @@ -50,7 +50,6 @@ struct ceph_entity_name { #define CEPH_ENTITY_TYPE_MDS 0x02 #define CEPH_ENTITY_TYPE_OSD 0x04 #define CEPH_ENTITY_TYPE_CLIENT 0x08 -#define CEPH_ENTITY_TYPE_ADMIN 0x10 #define CEPH_ENTITY_TYPE_AUTH 0x20 #define CEPH_ENTITY_TYPE_ANY 0xFF @@ -120,7 +119,7 @@ struct ceph_msg_connect_reply { /* * message header */ -struct ceph_msg_header { +struct ceph_msg_header_old { __le64 seq; /* message seq# for this session */ __le64 tid; /* transaction id */ __le16 type; /* message type */ @@ -138,6 +137,24 @@ struct ceph_msg_header { __le32 crc; /* header crc32c */ } __attribute__ ((packed)); +struct ceph_msg_header { + __le64 seq; /* message seq# for this session */ + __le64 tid; /* transaction id */ + __le16 type; /* message type */ + __le16 priority; /* priority. higher value == higher priority */ + __le16 version; /* version of message encoding */ + + __le32 front_len; /* bytes in main payload */ + __le32 middle_len;/* bytes in middle payload */ + __le32 data_len; /* bytes of data payload */ + __le16 data_off; /* sender: include full offset; + receiver: mask against ~PAGE_MASK */ + + struct ceph_entity_name src; + __le32 reserved; + __le32 crc; /* header crc32c */ +} __attribute__ ((packed)); + #define CEPH_MSG_PRIO_LOW 64 #define CEPH_MSG_PRIO_DEFAULT 127 #define CEPH_MSG_PRIO_HIGH 196 diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c index 3514f71ff85..afa7bb3895c 100644 --- a/fs/ceph/osd_client.c +++ b/fs/ceph/osd_client.c @@ -16,7 +16,7 @@ #define OSD_OP_FRONT_LEN 4096 #define OSD_OPREPLY_FRONT_LEN 512 -const static struct ceph_connection_operations osd_con_ops; +static const struct ceph_connection_operations osd_con_ops; static int __kick_requests(struct ceph_osd_client *osdc, struct ceph_osd *kickosd); @@ -147,7 +147,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, req = kzalloc(sizeof(*req), GFP_NOFS); } if (req == NULL) - return ERR_PTR(-ENOMEM); + return NULL; req->r_osdc = osdc; req->r_mempool = use_mempool; @@ -164,10 +164,10 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0); else msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, - OSD_OPREPLY_FRONT_LEN, 0, 0, NULL); - if (IS_ERR(msg)) { + OSD_OPREPLY_FRONT_LEN, GFP_NOFS); + if (!msg) { ceph_osdc_put_request(req); - return ERR_PTR(PTR_ERR(msg)); + return NULL; } req->r_reply = msg; @@ -178,10 +178,10 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, if (use_mempool) msg = ceph_msgpool_get(&osdc->msgpool_op, 0); else - msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, 0, 0, NULL); - if (IS_ERR(msg)) { + msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, GFP_NOFS); + if (!msg) { ceph_osdc_put_request(req); - return ERR_PTR(PTR_ERR(msg)); + return NULL; } msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP); memset(msg->front.iov_base, 0, msg->front.iov_len); @@ -715,7 +715,7 @@ static void handle_timeout(struct work_struct *work) * should mark the osd as failed and we should find out about * it from an updated osd map. */ - while (!list_empty(&osdc->req_lru)) { + while (timeout && !list_empty(&osdc->req_lru)) { req = list_entry(osdc->req_lru.next, struct ceph_osd_request, r_req_lru_item); @@ -1078,6 +1078,7 @@ done: if (newmap) kick_requests(osdc, NULL); up_read(&osdc->map_sem); + wake_up(&osdc->client->auth_wq); return; bad: @@ -1087,45 +1088,6 @@ bad: return; } - -/* - * A read request prepares specific pages that data is to be read into. - * When a message is being read off the wire, we call prepare_pages to - * find those pages. - * 0 = success, -1 failure. - */ -static int __prepare_pages(struct ceph_connection *con, - struct ceph_msg_header *hdr, - struct ceph_osd_request *req, - u64 tid, - struct ceph_msg *m) -{ - struct ceph_osd *osd = con->private; - struct ceph_osd_client *osdc; - int ret = -1; - int data_len = le32_to_cpu(hdr->data_len); - unsigned data_off = le16_to_cpu(hdr->data_off); - - int want = calc_pages_for(data_off & ~PAGE_MASK, data_len); - - if (!osd) - return -1; - - osdc = osd->o_osdc; - - dout("__prepare_pages on msg %p tid %llu, has %d pages, want %d\n", m, - tid, req->r_num_pages, want); - if (unlikely(req->r_num_pages < want)) - goto out; - m->pages = req->r_pages; - m->nr_pages = req->r_num_pages; - ret = 0; /* success */ -out: - BUG_ON(ret < 0 || m->nr_pages < want); - - return ret; -} - /* * Register request, send initial attempt. */ @@ -1252,11 +1214,13 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) if (!osdc->req_mempool) goto out; - err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true); + err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true, + "osd_op"); if (err < 0) goto out_mempool; err = ceph_msgpool_init(&osdc->msgpool_op_reply, - OSD_OPREPLY_FRONT_LEN, 10, true); + OSD_OPREPLY_FRONT_LEN, 10, true, + "osd_op_reply"); if (err < 0) goto out_msgpool; return 0; @@ -1302,8 +1266,8 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc, CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, NULL, 0, truncate_seq, truncate_size, NULL, false, 1); - if (IS_ERR(req)) - return PTR_ERR(req); + if (!req) + return -ENOMEM; /* it may be a short read due to an object boundary */ req->r_pages = pages; @@ -1345,8 +1309,8 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, snapc, do_sync, truncate_seq, truncate_size, mtime, nofail, 1); - if (IS_ERR(req)) - return PTR_ERR(req); + if (!req) + return -ENOMEM; /* it may be a short write due to an object boundary */ req->r_pages = pages; @@ -1394,7 +1358,8 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) } /* - * lookup and return message for incoming reply + * lookup and return message for incoming reply. set up reply message + * pages. */ static struct ceph_msg *get_reply(struct ceph_connection *con, struct ceph_msg_header *hdr, @@ -1407,7 +1372,6 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, int front = le32_to_cpu(hdr->front_len); int data_len = le32_to_cpu(hdr->data_len); u64 tid; - int err; tid = le64_to_cpu(hdr->tid); mutex_lock(&osdc->request_mutex); @@ -1425,13 +1389,14 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, req->r_reply, req->r_con_filling_msg); ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply); ceph_con_put(req->r_con_filling_msg); + req->r_con_filling_msg = NULL; } if (front > req->r_reply->front.iov_len) { pr_warning("get_reply front %d > preallocated %d\n", front, (int)req->r_reply->front.iov_len); - m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, 0, 0, NULL); - if (IS_ERR(m)) + m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, GFP_NOFS); + if (!m) goto out; ceph_msg_put(req->r_reply); req->r_reply = m; @@ -1439,12 +1404,19 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, m = ceph_msg_get(req->r_reply); if (data_len > 0) { - err = __prepare_pages(con, hdr, req, tid, m); - if (err < 0) { + unsigned data_off = le16_to_cpu(hdr->data_off); + int want = calc_pages_for(data_off & ~PAGE_MASK, data_len); + + if (unlikely(req->r_num_pages < want)) { + pr_warning("tid %lld reply %d > expected %d pages\n", + tid, want, m->nr_pages); *skip = 1; ceph_msg_put(m); - m = ERR_PTR(err); + m = NULL; + goto out; } + m->pages = req->r_pages; + m->nr_pages = req->r_num_pages; } *skip = 0; req->r_con_filling_msg = ceph_con_get(con); @@ -1466,7 +1438,7 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con, switch (type) { case CEPH_MSG_OSD_MAP: - return ceph_msg_new(type, front, 0, 0, NULL); + return ceph_msg_new(type, front, GFP_NOFS); case CEPH_MSG_OSD_OPREPLY: return get_reply(con, hdr, skip); default: @@ -1552,7 +1524,7 @@ static int invalidate_authorizer(struct ceph_connection *con) return ceph_monc_validate_auth(&osdc->client->monc); } -const static struct ceph_connection_operations osd_con_ops = { +static const struct ceph_connection_operations osd_con_ops = { .get = get_osd_con, .put = put_osd_con, .dispatch = dispatch, diff --git a/fs/ceph/pagelist.c b/fs/ceph/pagelist.c index 5f8dbf7c745..b6859f47d36 100644 --- a/fs/ceph/pagelist.c +++ b/fs/ceph/pagelist.c @@ -20,7 +20,7 @@ int ceph_pagelist_release(struct ceph_pagelist *pl) static int ceph_pagelist_addpage(struct ceph_pagelist *pl) { - struct page *page = alloc_page(GFP_NOFS); + struct page *page = __page_cache_alloc(GFP_NOFS); if (!page) return -ENOMEM; pl->room += PAGE_SIZE; diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h index fd56451a871..8fcc023056c 100644 --- a/fs/ceph/rados.h +++ b/fs/ceph/rados.h @@ -101,8 +101,8 @@ struct ceph_pg_pool { __le64 snap_seq; /* seq for per-pool snapshot */ __le32 snap_epoch; /* epoch of last snap */ __le32 num_snaps; - __le32 num_removed_snap_intervals; - __le64 uid; + __le32 num_removed_snap_intervals; /* if non-empty, NO per-pool snaps */ + __le64 auid; /* who owns the pg */ } __attribute__ ((packed)); /* @@ -208,6 +208,7 @@ enum { /* read */ CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1, CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2, + CEPH_OSD_OP_CMPXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 3, /* write */ CEPH_OSD_OP_SETXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1, @@ -305,6 +306,22 @@ enum { #define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/ #define EBLACKLISTED ESHUTDOWN /* blacklisted */ +/* xattr comparison */ +enum { + CEPH_OSD_CMPXATTR_OP_NOP = 0, + CEPH_OSD_CMPXATTR_OP_EQ = 1, + CEPH_OSD_CMPXATTR_OP_NE = 2, + CEPH_OSD_CMPXATTR_OP_GT = 3, + CEPH_OSD_CMPXATTR_OP_GTE = 4, + CEPH_OSD_CMPXATTR_OP_LT = 5, + CEPH_OSD_CMPXATTR_OP_LTE = 6 +}; + +enum { + CEPH_OSD_CMPXATTR_MODE_STRING = 1, + CEPH_OSD_CMPXATTR_MODE_U64 = 2 +}; + /* * an individual object operation. each may be accompanied by some data * payload @@ -321,6 +338,8 @@ struct ceph_osd_op { struct { __le32 name_len; __le32 value_len; + __u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */ + __u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */ } __attribute__ ((packed)) xattr; struct { __u8 class_len; diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index d5114db7045..c0b26b6badb 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -512,7 +512,7 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci, struct ceph_cap_snap *capsnap) { struct inode *inode = &ci->vfs_inode; - struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc; + struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc; BUG_ON(capsnap->writing); capsnap->size = inode->i_size; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 110857ba926..7c663d9b9f8 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -8,14 +8,11 @@ #include <linux/module.h> #include <linux/mount.h> #include <linux/parser.h> -#include <linux/rwsem.h> #include <linux/sched.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/statfs.h> #include <linux/string.h> -#include <linux/version.h> -#include <linux/vmalloc.h> #include "decode.h" #include "super.h" @@ -107,12 +104,40 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) static int ceph_syncfs(struct super_block *sb, int wait) { dout("sync_fs %d\n", wait); - ceph_osdc_sync(&ceph_client(sb)->osdc); - ceph_mdsc_sync(&ceph_client(sb)->mdsc); + ceph_osdc_sync(&ceph_sb_to_client(sb)->osdc); + ceph_mdsc_sync(&ceph_sb_to_client(sb)->mdsc); dout("sync_fs %d done\n", wait); return 0; } +static int default_congestion_kb(void) +{ + int congestion_kb; + + /* + * Copied from NFS + * + * congestion size, scale with available memory. + * + * 64MB: 8192k + * 128MB: 11585k + * 256MB: 16384k + * 512MB: 23170k + * 1GB: 32768k + * 2GB: 46340k + * 4GB: 65536k + * 8GB: 92681k + * 16GB: 131072k + * + * This allows larger machines to have larger/more transfers. + * Limit the default to 256M + */ + congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10); + if (congestion_kb > 256*1024) + congestion_kb = 256*1024; + + return congestion_kb; +} /** * ceph_show_options - Show mount options in /proc/mounts @@ -138,6 +163,35 @@ static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt) seq_puts(m, ",nocrc"); if (args->flags & CEPH_OPT_NOASYNCREADDIR) seq_puts(m, ",noasyncreaddir"); + + if (args->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT) + seq_printf(m, ",mount_timeout=%d", args->mount_timeout); + if (args->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT) + seq_printf(m, ",osd_idle_ttl=%d", args->osd_idle_ttl); + if (args->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT) + seq_printf(m, ",osdtimeout=%d", args->osd_timeout); + if (args->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT) + seq_printf(m, ",osdkeepalivetimeout=%d", + args->osd_keepalive_timeout); + if (args->wsize) + seq_printf(m, ",wsize=%d", args->wsize); + if (args->rsize != CEPH_MOUNT_RSIZE_DEFAULT) + seq_printf(m, ",rsize=%d", args->rsize); + if (args->congestion_kb != default_congestion_kb()) + seq_printf(m, ",write_congestion_kb=%d", args->congestion_kb); + if (args->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT) + seq_printf(m, ",caps_wanted_delay_min=%d", + args->caps_wanted_delay_min); + if (args->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT) + seq_printf(m, ",caps_wanted_delay_max=%d", + args->caps_wanted_delay_max); + if (args->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT) + seq_printf(m, ",cap_release_safety=%d", + args->cap_release_safety); + if (args->max_readdir != CEPH_MAX_READDIR_DEFAULT) + seq_printf(m, ",readdir_max_entries=%d", args->max_readdir); + if (args->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT) + seq_printf(m, ",readdir_max_bytes=%d", args->max_readdir_bytes); if (strcmp(args->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT)) seq_printf(m, ",snapdirname=%s", args->snapdir_name); if (args->name) @@ -161,35 +215,6 @@ static void ceph_inode_init_once(void *foo) inode_init_once(&ci->vfs_inode); } -static int default_congestion_kb(void) -{ - int congestion_kb; - - /* - * Copied from NFS - * - * congestion size, scale with available memory. - * - * 64MB: 8192k - * 128MB: 11585k - * 256MB: 16384k - * 512MB: 23170k - * 1GB: 32768k - * 2GB: 46340k - * 4GB: 65536k - * 8GB: 92681k - * 16GB: 131072k - * - * This allows larger machines to have larger/more transfers. - * Limit the default to 256M - */ - congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10); - if (congestion_kb > 256*1024) - congestion_kb = 256*1024; - - return congestion_kb; -} - static int __init init_caches(void) { ceph_inode_cachep = kmem_cache_create("ceph_inode_info", @@ -308,7 +333,9 @@ enum { Opt_osd_idle_ttl, Opt_caps_wanted_delay_min, Opt_caps_wanted_delay_max, + Opt_cap_release_safety, Opt_readdir_max_entries, + Opt_readdir_max_bytes, Opt_congestion_kb, Opt_last_int, /* int args above */ @@ -339,7 +366,9 @@ static match_table_t arg_tokens = { {Opt_osd_idle_ttl, "osd_idle_ttl=%d"}, {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"}, {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"}, + {Opt_cap_release_safety, "cap_release_safety=%d"}, {Opt_readdir_max_entries, "readdir_max_entries=%d"}, + {Opt_readdir_max_bytes, "readdir_max_bytes=%d"}, {Opt_congestion_kb, "write_congestion_kb=%d"}, /* int args above */ {Opt_snapdirname, "snapdirname=%s"}, @@ -388,8 +417,9 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options, args->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT; args->rsize = CEPH_MOUNT_RSIZE_DEFAULT; args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); - args->cap_release_safety = CEPH_CAPS_PER_RELEASE * 4; - args->max_readdir = 1024; + args->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT; + args->max_readdir = CEPH_MAX_READDIR_DEFAULT; + args->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; args->congestion_kb = default_congestion_kb(); /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */ @@ -497,6 +527,9 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options, case Opt_readdir_max_entries: args->max_readdir = intval; break; + case Opt_readdir_max_bytes: + args->max_readdir_bytes = intval; + break; case Opt_congestion_kb: args->congestion_kb = intval; break; @@ -682,9 +715,10 @@ int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid) /* * true if we have the mon map (and have thus joined the cluster) */ -static int have_mon_map(struct ceph_client *client) +static int have_mon_and_osd_map(struct ceph_client *client) { - return client->monc.monmap && client->monc.monmap->epoch; + return client->monc.monmap && client->monc.monmap->epoch && + client->osdc.osdmap && client->osdc.osdmap->epoch; } /* @@ -762,7 +796,7 @@ static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt, if (err < 0) goto out; - while (!have_mon_map(client)) { + while (!have_mon_and_osd_map(client)) { err = -EIO; if (timeout && time_after_eq(jiffies, started + timeout)) goto out; @@ -770,8 +804,8 @@ static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt, /* wait */ dout("mount waiting for mon_map\n"); err = wait_event_interruptible_timeout(client->auth_wq, - have_mon_map(client) || (client->auth_err < 0), - timeout); + have_mon_and_osd_map(client) || (client->auth_err < 0), + timeout); if (err == -EINTR || err == -ERESTARTSYS) goto out; if (client->auth_err < 0) { @@ -884,6 +918,8 @@ static int ceph_compare_super(struct super_block *sb, void *data) /* * construct our own bdi so we can control readahead, etc. */ +static atomic_long_t bdi_seq = ATOMIC_INIT(0); + static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client) { int err; @@ -893,7 +929,8 @@ static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client) client->backing_dev_info.ra_pages = (client->mount_args->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_SHIFT; - err = bdi_register_dev(&client->backing_dev_info, sb->s_dev); + err = bdi_register(&client->backing_dev_info, NULL, "ceph-%d", + atomic_long_inc_return(&bdi_seq)); if (!err) sb->s_bdi = &client->backing_dev_info; return err; @@ -932,9 +969,9 @@ static int ceph_get_sb(struct file_system_type *fs_type, goto out; } - if (ceph_client(sb) != client) { + if (ceph_sb_to_client(sb) != client) { ceph_destroy_client(client); - client = ceph_client(sb); + client = ceph_sb_to_client(sb); dout("get_sb got existing client %p\n", client); } else { dout("get_sb using new client %p\n", client); @@ -952,8 +989,7 @@ static int ceph_get_sb(struct file_system_type *fs_type, out_splat: ceph_mdsc_close_sessions(&client->mdsc); - up_write(&sb->s_umount); - deactivate_super(sb); + deactivate_locked_super(sb); goto out_final; out: diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 13513b80d87..3725c9ee9d0 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -52,24 +52,25 @@ struct ceph_mount_args { int sb_flags; + int flags; + struct ceph_fsid fsid; + struct ceph_entity_addr my_addr; int num_mon; struct ceph_entity_addr *mon_addr; - int flags; int mount_timeout; int osd_idle_ttl; - int caps_wanted_delay_min, caps_wanted_delay_max; - struct ceph_fsid fsid; - struct ceph_entity_addr my_addr; - int wsize; - int rsize; /* max readahead */ - int max_readdir; /* max readdir size */ - int congestion_kb; /* max readdir size */ int osd_timeout; int osd_keepalive_timeout; + int wsize; + int rsize; /* max readahead */ + int congestion_kb; /* max writeback in flight */ + int caps_wanted_delay_min, caps_wanted_delay_max; + int cap_release_safety; + int max_readdir; /* max readdir result (entires) */ + int max_readdir_bytes; /* max readdir result (bytes) */ char *snapdir_name; /* default ".snap" */ char *name; char *secret; - int cap_release_safety; }; /* @@ -80,13 +81,14 @@ struct ceph_mount_args { #define CEPH_OSD_KEEPALIVE_DEFAULT 5 #define CEPH_OSD_IDLE_TTL_DEFAULT 60 #define CEPH_MOUNT_RSIZE_DEFAULT (512*1024) /* readahead */ +#define CEPH_MAX_READDIR_DEFAULT 1024 +#define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024) #define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024) #define CEPH_MSG_MAX_DATA_LEN (16*1024*1024) #define CEPH_SNAPDIRNAME_DEFAULT ".snap" #define CEPH_AUTH_NAME_DEFAULT "guest" - /* * Delay telling the MDS we no longer want caps, in case we reopen * the file. Delay a minimum amount of time, even if we send a cap @@ -96,6 +98,7 @@ struct ceph_mount_args { #define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */ #define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */ +#define CEPH_CAP_RELEASE_SAFETY_DEFAULT (CEPH_CAPS_PER_RELEASE * 4) /* mount state */ enum { @@ -160,12 +163,6 @@ struct ceph_client { #endif }; -static inline struct ceph_client *ceph_client(struct super_block *sb) -{ - return sb->s_fs_info; -} - - /* * File i/o capability. This tracks shared state with the metadata * server that allows us to cache or writeback attributes or to read @@ -871,6 +868,7 @@ extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, extern void ceph_dentry_lru_add(struct dentry *dn); extern void ceph_dentry_lru_touch(struct dentry *dn); extern void ceph_dentry_lru_del(struct dentry *dn); +extern void ceph_invalidate_dentry_lease(struct dentry *dentry); /* * our d_ops vary depending on whether the inode is live, diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 2845422907f..68aeebc6968 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -7,7 +7,8 @@ static bool ceph_is_valid_xattr(const char *name) { - return !strncmp(name, XATTR_SECURITY_PREFIX, + return !strncmp(name, "ceph.", 5) || + !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) || !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); @@ -76,14 +77,14 @@ static size_t ceph_vxattrcb_rctime(struct ceph_inode_info *ci, char *val, } static struct ceph_vxattr_cb ceph_dir_vxattrs[] = { - { true, "user.ceph.dir.entries", ceph_vxattrcb_entries}, - { true, "user.ceph.dir.files", ceph_vxattrcb_files}, - { true, "user.ceph.dir.subdirs", ceph_vxattrcb_subdirs}, - { true, "user.ceph.dir.rentries", ceph_vxattrcb_rentries}, - { true, "user.ceph.dir.rfiles", ceph_vxattrcb_rfiles}, - { true, "user.ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs}, - { true, "user.ceph.dir.rbytes", ceph_vxattrcb_rbytes}, - { true, "user.ceph.dir.rctime", ceph_vxattrcb_rctime}, + { true, "ceph.dir.entries", ceph_vxattrcb_entries}, + { true, "ceph.dir.files", ceph_vxattrcb_files}, + { true, "ceph.dir.subdirs", ceph_vxattrcb_subdirs}, + { true, "ceph.dir.rentries", ceph_vxattrcb_rentries}, + { true, "ceph.dir.rfiles", ceph_vxattrcb_rfiles}, + { true, "ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs}, + { true, "ceph.dir.rbytes", ceph_vxattrcb_rbytes}, + { true, "ceph.dir.rctime", ceph_vxattrcb_rctime}, { true, NULL, NULL } }; @@ -107,7 +108,7 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, } static struct ceph_vxattr_cb ceph_file_vxattrs[] = { - { true, "user.ceph.layout", ceph_vxattrcb_layout}, + { true, "ceph.layout", ceph_vxattrcb_layout}, { NULL, NULL } }; @@ -186,12 +187,6 @@ static int __set_xattr(struct ceph_inode_info *ci, ci->i_xattrs.names_size -= xattr->name_len; ci->i_xattrs.vals_size -= xattr->val_len; } - if (!xattr) { - pr_err("__set_xattr ENOMEM on %p %llx.%llx xattr %s=%s\n", - &ci->vfs_inode, ceph_vinop(&ci->vfs_inode), name, - xattr->val); - return -ENOMEM; - } ci->i_xattrs.names_size += name_len; ci->i_xattrs.vals_size += val_len; if (val) @@ -574,7 +569,7 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size) ci->i_xattrs.version, ci->i_xattrs.index_version); if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) && - (ci->i_xattrs.index_version > ci->i_xattrs.version)) { + (ci->i_xattrs.index_version >= ci->i_xattrs.version)) { goto list_xattr; } else { spin_unlock(&inode->i_lock); @@ -622,7 +617,7 @@ out: static int ceph_sync_setxattr(struct dentry *dentry, const char *name, const char *value, size_t size, int flags) { - struct ceph_client *client = ceph_client(dentry->d_sb); + struct ceph_client *client = ceph_sb_to_client(dentry->d_sb); struct inode *inode = dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); struct inode *parent_inode = dentry->d_parent->d_inode; @@ -641,7 +636,7 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name, return -ENOMEM; err = -ENOMEM; for (i = 0; i < nr_pages; i++) { - pages[i] = alloc_page(GFP_NOFS); + pages[i] = __page_cache_alloc(GFP_NOFS); if (!pages[i]) { nr_pages = i; goto out; @@ -779,7 +774,7 @@ out: static int ceph_send_removexattr(struct dentry *dentry, const char *name) { - struct ceph_client *client = ceph_client(dentry->d_sb); + struct ceph_client *client = ceph_sb_to_client(dentry->d_sb); struct ceph_mds_client *mdsc = &client->mdsc; struct inode *inode = dentry->d_inode; struct inode *parent_inode = dentry->d_parent->d_inode; diff --git a/fs/coda/file.c b/fs/coda/file.c index 4c813f2cdc5..7196077b168 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -217,7 +217,7 @@ int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, int datasync) BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); host_file = cfi->cfi_container; - err = vfs_fsync(host_file, host_file->f_path.dentry, datasync); + err = vfs_fsync(host_file, datasync); if ( !err && !datasync ) { lock_kernel(); err = venus_fsync(coda_inode->i_sb, coda_i2f(coda_inode)); diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c index 773f2ce9aa0..ca25d96d45c 100644 --- a/fs/coda/pioctl.c +++ b/fs/coda/pioctl.c @@ -1,6 +1,6 @@ /* * Pioctl operations for Coda. - * Original version: (C) 1996 Peter Braam + * Original version: (C) 1996 Peter Braam * Rewritten for Linux 2.1: (C) 1997 Carnegie Mellon University * * Carnegie Mellon encourages users of this code to contribute improvements @@ -23,21 +23,22 @@ #include <linux/coda_fs_i.h> #include <linux/coda_psdev.h> +#include <linux/smp_lock.h> + /* pioctl ops */ static int coda_ioctl_permission(struct inode *inode, int mask); -static int coda_pioctl(struct inode * inode, struct file * filp, - unsigned int cmd, unsigned long user_data); +static long coda_pioctl(struct file *filp, unsigned int cmd, + unsigned long user_data); /* exported from this file */ -const struct inode_operations coda_ioctl_inode_operations = -{ +const struct inode_operations coda_ioctl_inode_operations = { .permission = coda_ioctl_permission, .setattr = coda_setattr, }; const struct file_operations coda_ioctl_operations = { .owner = THIS_MODULE, - .ioctl = coda_pioctl, + .unlocked_ioctl = coda_pioctl, }; /* the coda pioctl inode ops */ @@ -46,48 +47,53 @@ static int coda_ioctl_permission(struct inode *inode, int mask) return (mask & MAY_EXEC) ? -EACCES : 0; } -static int coda_pioctl(struct inode * inode, struct file * filp, - unsigned int cmd, unsigned long user_data) +static long coda_pioctl(struct file *filp, unsigned int cmd, + unsigned long user_data) { struct path path; - int error; + int error; struct PioctlData data; - struct inode *target_inode = NULL; - struct coda_inode_info *cnp; + struct inode *inode = filp->f_dentry->d_inode; + struct inode *target_inode = NULL; + struct coda_inode_info *cnp; - /* get the Pioctl data arguments from user space */ - if (copy_from_user(&data, (void __user *)user_data, sizeof(data))) { - return -EINVAL; - } - - /* - * Look up the pathname. Note that the pathname is in - * user memory, and namei takes care of this - */ - if (data.follow) { - error = user_path(data.path, &path); - } else { - error = user_lpath(data.path, &path); + lock_kernel(); + + /* get the Pioctl data arguments from user space */ + if (copy_from_user(&data, (void __user *)user_data, sizeof(data))) { + error = -EINVAL; + goto out; } - - if ( error ) { - return error; - } else { + + /* + * Look up the pathname. Note that the pathname is in + * user memory, and namei takes care of this + */ + if (data.follow) + error = user_path(data.path, &path); + else + error = user_lpath(data.path, &path); + + if (error) + goto out; + else target_inode = path.dentry->d_inode; - } - + /* return if it is not a Coda inode */ - if ( target_inode->i_sb != inode->i_sb ) { + if (target_inode->i_sb != inode->i_sb) { path_put(&path); - return -EINVAL; + error = -EINVAL; + goto out; } /* now proceed to make the upcall */ - cnp = ITOC(target_inode); + cnp = ITOC(target_inode); error = venus_pioctl(inode->i_sb, &(cnp->c_fid), cmd, &data); path_put(&path); - return error; -} +out: + unlock_kernel(); + return error; +} diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c index be4392ca209..66b9cf79c5b 100644 --- a/fs/coda/psdev.c +++ b/fs/coda/psdev.c @@ -73,8 +73,7 @@ static unsigned int coda_psdev_poll(struct file *file, poll_table * wait) return mask; } -static int coda_psdev_ioctl(struct inode * inode, struct file * filp, - unsigned int cmd, unsigned long arg) +static long coda_psdev_ioctl(struct file * filp, unsigned int cmd, unsigned long arg) { unsigned int data; @@ -344,7 +343,7 @@ static const struct file_operations coda_psdev_fops = { .read = coda_psdev_read, .write = coda_psdev_write, .poll = coda_psdev_poll, - .ioctl = coda_psdev_ioctl, + .unlocked_ioctl = coda_psdev_ioctl, .open = coda_psdev_open, .release = coda_psdev_release, }; diff --git a/fs/dcache.c b/fs/dcache.c index f1358e5c3a5..d96047b4a63 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -536,7 +536,7 @@ restart: */ static void prune_dcache(int count) { - struct super_block *sb; + struct super_block *sb, *n; int w_count; int unused = dentry_stat.nr_unused; int prune_ratio; @@ -545,13 +545,14 @@ static void prune_dcache(int count) if (unused == 0 || count == 0) return; spin_lock(&dcache_lock); -restart: if (count >= unused) prune_ratio = 1; else prune_ratio = unused / count; spin_lock(&sb_lock); - list_for_each_entry(sb, &super_blocks, s_list) { + list_for_each_entry_safe(sb, n, &super_blocks, s_list) { + if (list_empty(&sb->s_instances)) + continue; if (sb->s_nr_dentry_unused == 0) continue; sb->s_count++; @@ -590,14 +591,10 @@ restart: } spin_lock(&sb_lock); count -= pruned; - /* - * restart only when sb is no longer on the list and - * we have more work to do. - */ - if (__put_super_and_need_restart(sb) && count > 0) { - spin_unlock(&sb_lock); - goto restart; - } + __put_super(sb); + /* more work left to do? */ + if (count <= 0) + break; } spin_unlock(&sb_lock); spin_unlock(&dcache_lock); @@ -1529,6 +1526,7 @@ void d_delete(struct dentry * dentry) spin_lock(&dentry->d_lock); isdir = S_ISDIR(dentry->d_inode->i_mode); if (atomic_read(&dentry->d_count) == 1) { + dentry->d_flags &= ~DCACHE_CANT_MOUNT; dentry_iput(dentry); fsnotify_nameremove(dentry, isdir); return; diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 0120247b41c..8b3ffd5b523 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -384,18 +384,15 @@ static int devpts_get_sb(struct file_system_type *fs_type, s->s_flags |= MS_ACTIVE; } - simple_set_mnt(mnt, s); - memcpy(&(DEVPTS_SB(s))->mount_opts, &opts, sizeof(opts)); error = mknod_ptmx(s); if (error) - goto out_dput; + goto out_undo_sget; - return 0; + simple_set_mnt(mnt, s); -out_dput: - dput(s->s_root); /* undo dget() in simple_set_mnt() */ + return 0; out_undo_sget: deactivate_locked_super(s); diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 17903b49129..031dbe3a15c 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -733,10 +733,7 @@ static void lkb_add_ordered(struct list_head *new, struct list_head *head, if (lkb->lkb_rqmode < mode) break; - if (!lkb) - list_add_tail(new, head); - else - __list_add(new, lkb->lkb_statequeue.prev, &lkb->lkb_statequeue); + __list_add(new, lkb->lkb_statequeue.prev, &lkb->lkb_statequeue); } /* add/remove lkb to rsb's grant/convert/wait queue */ diff --git a/fs/dlm/user.c b/fs/dlm/user.c index 8b6e73c4743..b6272853130 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -215,6 +215,7 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int mode) if (!ast_type) { kref_get(&lkb->lkb_ref); list_add_tail(&lkb->lkb_astqueue, &proc->asts); + lkb->lkb_ast_first = type; wake_up_interruptible(&proc->wait); } if (type == AST_COMP && (ast_type & AST_COMP)) @@ -223,7 +224,6 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int mode) eol = lkb_is_endoflife(lkb, ua->lksb.sb_status, type); if (eol) { - lkb->lkb_ast_type &= ~AST_BAST; lkb->lkb_flags |= DLM_IFL_ENDOFLIFE; } @@ -706,7 +706,7 @@ static int device_close(struct inode *inode, struct file *file) } static int copy_result_to_user(struct dlm_user_args *ua, int compat, int type, - int bmode, char __user *buf, size_t count) + int mode, char __user *buf, size_t count) { #ifdef CONFIG_COMPAT struct dlm_lock_result32 result32; @@ -733,7 +733,7 @@ static int copy_result_to_user(struct dlm_user_args *ua, int compat, int type, if (type == AST_BAST) { result.user_astaddr = ua->bastaddr; result.user_astparam = ua->bastparam; - result.bast_mode = bmode; + result.bast_mode = mode; } else { result.user_astaddr = ua->castaddr; result.user_astparam = ua->castparam; @@ -801,7 +801,9 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count, struct dlm_user_proc *proc = file->private_data; struct dlm_lkb *lkb; DECLARE_WAITQUEUE(wait, current); - int error, type=0, bmode=0, removed = 0; + int error = 0, removed; + int ret_type, ret_mode; + int bastmode, castmode, do_bast, do_cast; if (count == sizeof(struct dlm_device_version)) { error = copy_version_to_user(buf, count); @@ -820,6 +822,8 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count, #endif return -EINVAL; + try_another: + /* do we really need this? can a read happen after a close? */ if (test_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags)) return -EINVAL; @@ -855,13 +859,55 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count, lkb = list_entry(proc->asts.next, struct dlm_lkb, lkb_astqueue); - if (lkb->lkb_ast_type & AST_COMP) { - lkb->lkb_ast_type &= ~AST_COMP; - type = AST_COMP; - } else if (lkb->lkb_ast_type & AST_BAST) { - lkb->lkb_ast_type &= ~AST_BAST; - type = AST_BAST; - bmode = lkb->lkb_bastmode; + removed = 0; + ret_type = 0; + ret_mode = 0; + do_bast = lkb->lkb_ast_type & AST_BAST; + do_cast = lkb->lkb_ast_type & AST_COMP; + bastmode = lkb->lkb_bastmode; + castmode = lkb->lkb_castmode; + + /* when both are queued figure out which to do first and + switch first so the other goes in the next read */ + + if (do_cast && do_bast) { + if (lkb->lkb_ast_first == AST_COMP) { + ret_type = AST_COMP; + ret_mode = castmode; + lkb->lkb_ast_type &= ~AST_COMP; + lkb->lkb_ast_first = AST_BAST; + } else { + ret_type = AST_BAST; + ret_mode = bastmode; + lkb->lkb_ast_type &= ~AST_BAST; + lkb->lkb_ast_first = AST_COMP; + } + } else { + ret_type = lkb->lkb_ast_first; + ret_mode = (ret_type == AST_COMP) ? castmode : bastmode; + lkb->lkb_ast_type &= ~ret_type; + lkb->lkb_ast_first = 0; + } + + /* if we're doing a bast but the bast is unnecessary, then + switch to do nothing or do a cast if that was needed next */ + + if ((ret_type == AST_BAST) && + dlm_modes_compat(bastmode, lkb->lkb_castmode_done)) { + ret_type = 0; + ret_mode = 0; + + if (do_cast) { + ret_type = AST_COMP; + ret_mode = castmode; + lkb->lkb_ast_type &= ~AST_COMP; + lkb->lkb_ast_first = 0; + } + } + + if (lkb->lkb_ast_first != lkb->lkb_ast_type) { + log_print("device_read %x ast_first %x ast_type %x", + lkb->lkb_id, lkb->lkb_ast_first, lkb->lkb_ast_type); } if (!lkb->lkb_ast_type) { @@ -870,15 +916,29 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count, } spin_unlock(&proc->asts_spin); - error = copy_result_to_user(lkb->lkb_ua, - test_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags), - type, bmode, buf, count); + if (ret_type) { + error = copy_result_to_user(lkb->lkb_ua, + test_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags), + ret_type, ret_mode, buf, count); + + if (ret_type == AST_COMP) + lkb->lkb_castmode_done = castmode; + if (ret_type == AST_BAST) + lkb->lkb_bastmode_done = bastmode; + } /* removes reference for the proc->asts lists added by dlm_user_add_ast() and may result in the lkb being freed */ + if (removed) dlm_put_lkb(lkb); + /* the bast that was queued was eliminated (see unnecessary above), + leaving nothing to return */ + + if (!ret_type) + goto try_another; + return error; } diff --git a/fs/drop_caches.c b/fs/drop_caches.c index 31f4b0e6d72..83c4f600786 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -12,7 +12,7 @@ /* A global variable is a bit ugly, but it keeps the code simple */ int sysctl_drop_caches; -static void drop_pagecache_sb(struct super_block *sb) +static void drop_pagecache_sb(struct super_block *sb, void *unused) { struct inode *inode, *toput_inode = NULL; @@ -33,26 +33,6 @@ static void drop_pagecache_sb(struct super_block *sb) iput(toput_inode); } -static void drop_pagecache(void) -{ - struct super_block *sb; - - spin_lock(&sb_lock); -restart: - list_for_each_entry(sb, &super_blocks, s_list) { - sb->s_count++; - spin_unlock(&sb_lock); - down_read(&sb->s_umount); - if (sb->s_root) - drop_pagecache_sb(sb); - up_read(&sb->s_umount); - spin_lock(&sb_lock); - if (__put_super_and_need_restart(sb)) - goto restart; - } - spin_unlock(&sb_lock); -} - static void drop_slab(void) { int nr_objects; @@ -68,7 +48,7 @@ int drop_caches_sysctl_handler(ctl_table *table, int write, proc_dointvec_minmax(table, write, buffer, length, ppos); if (write) { if (sysctl_drop_caches & 1) - drop_pagecache(); + iterate_supers(drop_pagecache_sb, NULL); if (sysctl_drop_caches & 2) drop_slab(); } diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index bfc2e0f78f0..0032a9f5a3a 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -731,15 +731,14 @@ int ecryptfs_write_lower(struct inode *ecryptfs_inode, char *data, int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode, struct page *page_for_lower, size_t offset_in_page, size_t size); -int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset, - size_t size); +int ecryptfs_write(struct inode *inode, char *data, loff_t offset, size_t size); int ecryptfs_read_lower(char *data, loff_t offset, size_t size, struct inode *ecryptfs_inode); int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs, pgoff_t page_index, size_t offset_in_page, size_t size, struct inode *ecryptfs_inode); -struct page *ecryptfs_get_locked_page(struct file *file, loff_t index); +struct page *ecryptfs_get_locked_page(struct inode *inode, loff_t index); int ecryptfs_exorcise_daemon(struct ecryptfs_daemon *daemon); int ecryptfs_find_daemon_by_euid(struct ecryptfs_daemon **daemon, uid_t euid, struct user_namespace *user_ns); diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index e7440a6f5eb..3bdddbcc785 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -276,9 +276,7 @@ static int ecryptfs_release(struct inode *inode, struct file *file) static int ecryptfs_fsync(struct file *file, struct dentry *dentry, int datasync) { - return vfs_fsync(ecryptfs_file_to_lower(file), - ecryptfs_dentry_to_lower(dentry), - datasync); + return vfs_fsync(ecryptfs_file_to_lower(file), datasync); } static int ecryptfs_fasync(int fd, struct file *file, int flag) diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index e2d4418affa..65dee2f336a 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -142,19 +142,10 @@ out: static int grow_file(struct dentry *ecryptfs_dentry) { struct inode *ecryptfs_inode = ecryptfs_dentry->d_inode; - struct file fake_file; - struct ecryptfs_file_info tmp_file_info; char zero_virt[] = { 0x00 }; int rc = 0; - memset(&fake_file, 0, sizeof(fake_file)); - fake_file.f_path.dentry = ecryptfs_dentry; - memset(&tmp_file_info, 0, sizeof(tmp_file_info)); - ecryptfs_set_file_private(&fake_file, &tmp_file_info); - ecryptfs_set_file_lower( - &fake_file, - ecryptfs_inode_to_private(ecryptfs_inode)->lower_file); - rc = ecryptfs_write(&fake_file, zero_virt, 0, 1); + rc = ecryptfs_write(ecryptfs_inode, zero_virt, 0, 1); i_size_write(ecryptfs_inode, 0); rc = ecryptfs_write_inode_size_to_metadata(ecryptfs_inode); ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat.flags |= @@ -784,8 +775,6 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia, { int rc = 0; struct inode *inode = dentry->d_inode; - struct dentry *lower_dentry; - struct file fake_ecryptfs_file; struct ecryptfs_crypt_stat *crypt_stat; loff_t i_size = i_size_read(inode); loff_t lower_size_before_truncate; @@ -796,23 +785,6 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia, goto out; } crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat; - /* Set up a fake ecryptfs file, this is used to interface with - * the file in the underlying filesystem so that the - * truncation has an effect there as well. */ - memset(&fake_ecryptfs_file, 0, sizeof(fake_ecryptfs_file)); - fake_ecryptfs_file.f_path.dentry = dentry; - /* Released at out_free: label */ - ecryptfs_set_file_private(&fake_ecryptfs_file, - kmem_cache_alloc(ecryptfs_file_info_cache, - GFP_KERNEL)); - if (unlikely(!ecryptfs_file_to_private(&fake_ecryptfs_file))) { - rc = -ENOMEM; - goto out; - } - lower_dentry = ecryptfs_dentry_to_lower(dentry); - ecryptfs_set_file_lower( - &fake_ecryptfs_file, - ecryptfs_inode_to_private(dentry->d_inode)->lower_file); /* Switch on growing or shrinking file */ if (ia->ia_size > i_size) { char zero[] = { 0x00 }; @@ -822,7 +794,7 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia, * this triggers code that will fill in 0's throughout * the intermediate portion of the previous end of the * file and the new and of the file */ - rc = ecryptfs_write(&fake_ecryptfs_file, zero, + rc = ecryptfs_write(inode, zero, (ia->ia_size - 1), 1); } else { /* ia->ia_size < i_size_read(inode) */ /* We're chopping off all the pages down to the page @@ -835,10 +807,10 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia, if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { rc = vmtruncate(inode, ia->ia_size); if (rc) - goto out_free; + goto out; lower_ia->ia_size = ia->ia_size; lower_ia->ia_valid |= ATTR_SIZE; - goto out_free; + goto out; } if (num_zeros) { char *zeros_virt; @@ -846,16 +818,16 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia, zeros_virt = kzalloc(num_zeros, GFP_KERNEL); if (!zeros_virt) { rc = -ENOMEM; - goto out_free; + goto out; } - rc = ecryptfs_write(&fake_ecryptfs_file, zeros_virt, + rc = ecryptfs_write(inode, zeros_virt, ia->ia_size, num_zeros); kfree(zeros_virt); if (rc) { printk(KERN_ERR "Error attempting to zero out " "the remainder of the end page on " "reducing truncate; rc = [%d]\n", rc); - goto out_free; + goto out; } } vmtruncate(inode, ia->ia_size); @@ -864,7 +836,7 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia, printk(KERN_ERR "Problem with " "ecryptfs_write_inode_size_to_metadata; " "rc = [%d]\n", rc); - goto out_free; + goto out; } /* We are reducing the size of the ecryptfs file, and need to * know if we need to reduce the size of the lower file. */ @@ -878,10 +850,6 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia, } else lower_ia->ia_valid &= ~ATTR_SIZE; } -out_free: - if (ecryptfs_file_to_private(&fake_ecryptfs_file)) - kmem_cache_free(ecryptfs_file_info_cache, - ecryptfs_file_to_private(&fake_ecryptfs_file)); out: return rc; } diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 760983d0f25..cbd4e18adb2 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -281,7 +281,7 @@ static void ecryptfs_init_mount_crypt_stat( * * Returns zero on success; non-zero on error */ -static int ecryptfs_parse_options(struct super_block *sb, char *options) +static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options) { char *p; int rc = 0; @@ -293,7 +293,7 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options) int fn_cipher_key_bytes; int fn_cipher_key_bytes_set = 0; struct ecryptfs_mount_crypt_stat *mount_crypt_stat = - &ecryptfs_superblock_to_private(sb)->mount_crypt_stat; + &sbi->mount_crypt_stat; substring_t args[MAX_OPT_ARGS]; int token; char *sig_src; @@ -483,68 +483,7 @@ out: } struct kmem_cache *ecryptfs_sb_info_cache; - -/** - * ecryptfs_fill_super - * @sb: The ecryptfs super block - * @raw_data: The options passed to mount - * @silent: Not used but required by function prototype - * - * Sets up what we can of the sb, rest is done in ecryptfs_read_super - * - * Returns zero on success; non-zero otherwise - */ -static int -ecryptfs_fill_super(struct super_block *sb, void *raw_data, int silent) -{ - struct ecryptfs_sb_info *esi; - int rc = 0; - - /* Released in ecryptfs_put_super() */ - ecryptfs_set_superblock_private(sb, - kmem_cache_zalloc(ecryptfs_sb_info_cache, - GFP_KERNEL)); - esi = ecryptfs_superblock_to_private(sb); - if (!esi) { - ecryptfs_printk(KERN_WARNING, "Out of memory\n"); - rc = -ENOMEM; - goto out; - } - - rc = bdi_setup_and_register(&esi->bdi, "ecryptfs", BDI_CAP_MAP_COPY); - if (rc) - goto out; - - sb->s_bdi = &esi->bdi; - sb->s_op = &ecryptfs_sops; - /* Released through deactivate_super(sb) from get_sb_nodev */ - sb->s_root = d_alloc(NULL, &(const struct qstr) { - .hash = 0,.name = "/",.len = 1}); - if (!sb->s_root) { - ecryptfs_printk(KERN_ERR, "d_alloc failed\n"); - rc = -ENOMEM; - goto out; - } - sb->s_root->d_op = &ecryptfs_dops; - sb->s_root->d_sb = sb; - sb->s_root->d_parent = sb->s_root; - /* Released in d_release when dput(sb->s_root) is called */ - /* through deactivate_super(sb) from get_sb_nodev() */ - ecryptfs_set_dentry_private(sb->s_root, - kmem_cache_zalloc(ecryptfs_dentry_info_cache, - GFP_KERNEL)); - if (!ecryptfs_dentry_to_private(sb->s_root)) { - ecryptfs_printk(KERN_ERR, - "dentry_info_cache alloc failed\n"); - rc = -ENOMEM; - goto out; - } - rc = 0; -out: - /* Should be able to rely on deactivate_super called from - * get_sb_nodev */ - return rc; -} +static struct file_system_type ecryptfs_fs_type; /** * ecryptfs_read_super @@ -565,6 +504,13 @@ static int ecryptfs_read_super(struct super_block *sb, const char *dev_name) ecryptfs_printk(KERN_WARNING, "path_lookup() failed\n"); goto out; } + if (path.dentry->d_sb->s_type == &ecryptfs_fs_type) { + rc = -EINVAL; + printk(KERN_ERR "Mount on filesystem of type " + "eCryptfs explicitly disallowed due to " + "known incompatibilities\n"); + goto out_free; + } ecryptfs_set_superblock_lower(sb, path.dentry->d_sb); sb->s_maxbytes = path.dentry->d_sb->s_maxbytes; sb->s_blocksize = path.dentry->d_sb->s_blocksize; @@ -588,11 +534,8 @@ out: * @dev_name: The path to mount over * @raw_data: The options passed into the kernel * - * The whole ecryptfs_get_sb process is broken into 4 functions: + * The whole ecryptfs_get_sb process is broken into 3 functions: * ecryptfs_parse_options(): handle options passed to ecryptfs, if any - * ecryptfs_fill_super(): used by get_sb_nodev, fills out the super_block - * with as much information as it can before needing - * the lower filesystem. * ecryptfs_read_super(): this accesses the lower filesystem and uses * ecryptfs_interpose to perform most of the linking * ecryptfs_interpose(): links the lower filesystem into ecryptfs (inode.c) @@ -601,30 +544,78 @@ static int ecryptfs_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt) { + struct super_block *s; + struct ecryptfs_sb_info *sbi; + struct ecryptfs_dentry_info *root_info; + const char *err = "Getting sb failed"; int rc; - struct super_block *sb; - rc = get_sb_nodev(fs_type, flags, raw_data, ecryptfs_fill_super, mnt); - if (rc < 0) { - printk(KERN_ERR "Getting sb failed; rc = [%d]\n", rc); + sbi = kmem_cache_zalloc(ecryptfs_sb_info_cache, GFP_KERNEL); + if (!sbi) { + rc = -ENOMEM; goto out; } - sb = mnt->mnt_sb; - rc = ecryptfs_parse_options(sb, raw_data); + + rc = ecryptfs_parse_options(sbi, raw_data); if (rc) { - printk(KERN_ERR "Error parsing options; rc = [%d]\n", rc); - goto out_abort; + err = "Error parsing options"; + goto out; + } + + s = sget(fs_type, NULL, set_anon_super, NULL); + if (IS_ERR(s)) { + rc = PTR_ERR(s); + goto out; } - rc = ecryptfs_read_super(sb, dev_name); + + s->s_flags = flags; + rc = bdi_setup_and_register(&sbi->bdi, "ecryptfs", BDI_CAP_MAP_COPY); if (rc) { - printk(KERN_ERR "Reading sb failed; rc = [%d]\n", rc); - goto out_abort; + deactivate_locked_super(s); + goto out; } - goto out; -out_abort: - dput(sb->s_root); /* aka mnt->mnt_root, as set by get_sb_nodev() */ - deactivate_locked_super(sb); + + ecryptfs_set_superblock_private(s, sbi); + s->s_bdi = &sbi->bdi; + + /* ->kill_sb() will take care of sbi after that point */ + sbi = NULL; + s->s_op = &ecryptfs_sops; + + rc = -ENOMEM; + s->s_root = d_alloc(NULL, &(const struct qstr) { + .hash = 0,.name = "/",.len = 1}); + if (!s->s_root) { + deactivate_locked_super(s); + goto out; + } + s->s_root->d_op = &ecryptfs_dops; + s->s_root->d_sb = s; + s->s_root->d_parent = s->s_root; + + root_info = kmem_cache_zalloc(ecryptfs_dentry_info_cache, GFP_KERNEL); + if (!root_info) { + deactivate_locked_super(s); + goto out; + } + /* ->kill_sb() will take care of root_info */ + ecryptfs_set_dentry_private(s->s_root, root_info); + s->s_flags |= MS_ACTIVE; + rc = ecryptfs_read_super(s, dev_name); + if (rc) { + deactivate_locked_super(s); + err = "Reading sb failed"; + goto out; + } + simple_set_mnt(mnt, s); + return 0; + out: + if (sbi) { + ecryptfs_destroy_mount_crypt_stat(&sbi->mount_crypt_stat); + kmem_cache_free(ecryptfs_sb_info_cache, sbi); + } + printk(KERN_ERR "%s; rc = [%d]\n", err, rc); return rc; } @@ -633,11 +624,16 @@ out: * @sb: The ecryptfs super block * * Used to bring the superblock down and free the private data. - * Private data is free'd in ecryptfs_put_super() */ static void ecryptfs_kill_block_super(struct super_block *sb) { - generic_shutdown_super(sb); + struct ecryptfs_sb_info *sb_info = ecryptfs_superblock_to_private(sb); + kill_anon_super(sb); + if (!sb_info) + return; + ecryptfs_destroy_mount_crypt_stat(&sb_info->mount_crypt_stat); + bdi_destroy(&sb_info->bdi); + kmem_cache_free(ecryptfs_sb_info_cache, sb_info); } static struct file_system_type ecryptfs_fs_type = { diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index 2ee9a3a7b68..b1d82756544 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -44,17 +44,9 @@ * Returns locked and up-to-date page (if ok), with increased * refcnt. */ -struct page *ecryptfs_get_locked_page(struct file *file, loff_t index) +struct page *ecryptfs_get_locked_page(struct inode *inode, loff_t index) { - struct dentry *dentry; - struct inode *inode; - struct address_space *mapping; - struct page *page; - - dentry = file->f_path.dentry; - inode = dentry->d_inode; - mapping = inode->i_mapping; - page = read_mapping_page(mapping, index, (void *)file); + struct page *page = read_mapping_page(inode->i_mapping, index, NULL); if (!IS_ERR(page)) lock_page(page); return page; @@ -198,7 +190,7 @@ out: static int ecryptfs_readpage(struct file *file, struct page *page) { struct ecryptfs_crypt_stat *crypt_stat = - &ecryptfs_inode_to_private(file->f_path.dentry->d_inode)->crypt_stat; + &ecryptfs_inode_to_private(page->mapping->host)->crypt_stat; int rc = 0; if (!crypt_stat @@ -300,8 +292,7 @@ static int ecryptfs_write_begin(struct file *file, if (!PageUptodate(page)) { struct ecryptfs_crypt_stat *crypt_stat = - &ecryptfs_inode_to_private( - file->f_path.dentry->d_inode)->crypt_stat; + &ecryptfs_inode_to_private(mapping->host)->crypt_stat; if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED) || (crypt_stat->flags & ECRYPTFS_NEW_FILE)) { @@ -487,7 +478,7 @@ static int ecryptfs_write_end(struct file *file, unsigned to = from + copied; struct inode *ecryptfs_inode = mapping->host; struct ecryptfs_crypt_stat *crypt_stat = - &ecryptfs_inode_to_private(file->f_path.dentry->d_inode)->crypt_stat; + &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat; int rc; if (crypt_stat->flags & ECRYPTFS_NEW_FILE) { diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c index 0cc4fafd655..db184ef15d3 100644 --- a/fs/ecryptfs/read_write.c +++ b/fs/ecryptfs/read_write.c @@ -93,7 +93,7 @@ int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode, /** * ecryptfs_write - * @ecryptfs_file: The eCryptfs file into which to write + * @ecryptfs_inode: The eCryptfs file into which to write * @data: Virtual address where data to write is located * @offset: Offset in the eCryptfs file at which to begin writing the * data from @data @@ -109,12 +109,11 @@ int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode, * * Returns zero on success; non-zero otherwise */ -int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset, +int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset, size_t size) { struct page *ecryptfs_page; struct ecryptfs_crypt_stat *crypt_stat; - struct inode *ecryptfs_inode = ecryptfs_file->f_dentry->d_inode; char *ecryptfs_page_virt; loff_t ecryptfs_file_size = i_size_read(ecryptfs_inode); loff_t data_offset = 0; @@ -145,7 +144,7 @@ int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset, if (num_bytes > total_remaining_zeros) num_bytes = total_remaining_zeros; } - ecryptfs_page = ecryptfs_get_locked_page(ecryptfs_file, + ecryptfs_page = ecryptfs_get_locked_page(ecryptfs_inode, ecryptfs_page_idx); if (IS_ERR(ecryptfs_page)) { rc = PTR_ERR(ecryptfs_page); @@ -302,10 +301,10 @@ int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs, int ecryptfs_read(char *data, loff_t offset, size_t size, struct file *ecryptfs_file) { + struct inode *ecryptfs_inode = ecryptfs_file->f_dentry->d_inode; struct page *ecryptfs_page; char *ecryptfs_page_virt; - loff_t ecryptfs_file_size = - i_size_read(ecryptfs_file->f_dentry->d_inode); + loff_t ecryptfs_file_size = i_size_read(ecryptfs_inode); loff_t data_offset = 0; loff_t pos; int rc = 0; @@ -327,7 +326,7 @@ int ecryptfs_read(char *data, loff_t offset, size_t size, if (num_bytes > total_remaining_bytes) num_bytes = total_remaining_bytes; - ecryptfs_page = ecryptfs_get_locked_page(ecryptfs_file, + ecryptfs_page = ecryptfs_get_locked_page(ecryptfs_inode, ecryptfs_page_idx); if (IS_ERR(ecryptfs_page)) { rc = PTR_ERR(ecryptfs_page); diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c index 0c0ae491d23..0435886e4a9 100644 --- a/fs/ecryptfs/super.c +++ b/fs/ecryptfs/super.c @@ -109,27 +109,6 @@ void ecryptfs_init_inode(struct inode *inode, struct inode *lower_inode) } /** - * ecryptfs_put_super - * @sb: Pointer to the ecryptfs super block - * - * Final actions when unmounting a file system. - * This will handle deallocation and release of our private data. - */ -static void ecryptfs_put_super(struct super_block *sb) -{ - struct ecryptfs_sb_info *sb_info = ecryptfs_superblock_to_private(sb); - - lock_kernel(); - - ecryptfs_destroy_mount_crypt_stat(&sb_info->mount_crypt_stat); - bdi_destroy(&sb_info->bdi); - kmem_cache_free(ecryptfs_sb_info_cache, sb_info); - ecryptfs_set_superblock_private(sb, NULL); - - unlock_kernel(); -} - -/** * ecryptfs_statfs * @sb: The ecryptfs super block * @buf: The struct kstatfs to fill in with stats @@ -203,7 +182,6 @@ const struct super_operations ecryptfs_sops = { .alloc_inode = ecryptfs_alloc_inode, .destroy_inode = ecryptfs_destroy_inode, .drop_inode = generic_delete_inode, - .put_super = ecryptfs_put_super, .statfs = ecryptfs_statfs, .remount_fs = NULL, .clear_inode = ecryptfs_clear_inode, diff --git a/fs/exec.c b/fs/exec.c index e6e94c626c2..9badbc0bfb1 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -242,9 +242,10 @@ static int __bprm_mm_init(struct linux_binprm *bprm) * use STACK_TOP because that can depend on attributes which aren't * configured yet. */ + BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP); vma->vm_end = STACK_TOP_MAX; vma->vm_start = vma->vm_end - PAGE_SIZE; - vma->vm_flags = VM_STACK_FLAGS; + vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP; vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); INIT_LIST_HEAD(&vma->anon_vma_chain); err = insert_vm_struct(mm, vma); @@ -616,6 +617,7 @@ int setup_arg_pages(struct linux_binprm *bprm, else if (executable_stack == EXSTACK_DISABLE_X) vm_flags &= ~VM_EXEC; vm_flags |= mm->def_flags; + vm_flags |= VM_STACK_INCOMPLETE_SETUP; ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end, vm_flags); @@ -630,6 +632,9 @@ int setup_arg_pages(struct linux_binprm *bprm, goto out_unlock; } + /* mprotect_fixup is overkill to remove the temporary stack flags */ + vma->vm_flags &= ~VM_STACK_INCOMPLETE_SETUP; + stack_expand = 131072UL; /* randomly 32*4k (or 2*64k) pages */ stack_size = vma->vm_end - vma->vm_start; /* diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c index 4cfab1cc75c..d91e9d829bc 100644 --- a/fs/exofs/dir.c +++ b/fs/exofs/dir.c @@ -608,7 +608,7 @@ int exofs_make_empty(struct inode *inode, struct inode *parent) de->inode_no = cpu_to_le64(parent->i_ino); memcpy(de->name, PARENT_DIR, sizeof(PARENT_DIR)); exofs_set_de_type(de, inode); - kunmap_atomic(page, KM_USER0); + kunmap_atomic(kaddr, KM_USER0); err = exofs_commit_chunk(page, 0, chunk_size); fail: page_cache_release(page); diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 76d2a79ef93..4bb6ef822e4 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -755,6 +755,21 @@ static int exofs_write_end(struct file *file, struct address_space *mapping, return ret; } +static int exofs_releasepage(struct page *page, gfp_t gfp) +{ + EXOFS_DBGMSG("page 0x%lx\n", page->index); + WARN_ON(1); + return try_to_free_buffers(page); +} + +static void exofs_invalidatepage(struct page *page, unsigned long offset) +{ + EXOFS_DBGMSG("page_has_buffers=>%d\n", page_has_buffers(page)); + WARN_ON(1); + + block_invalidatepage(page, offset); +} + const struct address_space_operations exofs_aops = { .readpage = exofs_readpage, .readpages = exofs_readpages, @@ -762,6 +777,21 @@ const struct address_space_operations exofs_aops = { .writepages = exofs_writepages, .write_begin = exofs_write_begin_export, .write_end = exofs_write_end, + .releasepage = exofs_releasepage, + .set_page_dirty = __set_page_dirty_nobuffers, + .invalidatepage = exofs_invalidatepage, + + /* Not implemented Yet */ + .bmap = NULL, /* TODO: use osd's OSD_ACT_READ_MAP */ + .direct_IO = NULL, /* TODO: Should be trivial to do */ + + /* With these NULL has special meaning or default is not exported */ + .sync_page = NULL, + .get_xip_mem = NULL, + .migratepage = NULL, + .launder_page = NULL, + .is_partially_uptodate = NULL, + .error_remove_page = NULL, }; /****************************************************************************** @@ -1123,16 +1153,7 @@ struct inode *exofs_new_inode(struct inode *dir, int mode) sbi = sb->s_fs_info; sb->s_dirt = 1; - inode->i_uid = current->cred->fsuid; - if (dir->i_mode & S_ISGID) { - inode->i_gid = dir->i_gid; - if (S_ISDIR(mode)) - mode |= S_ISGID; - } else { - inode->i_gid = current->cred->fsgid; - } - inode->i_mode = mode; - + inode_init_owner(inode, dir, mode); inode->i_ino = sbi->s_nextid++; inode->i_blkbits = EXOFS_BLKSHIFT; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index a99e54318c3..ca7e2a0ed98 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -420,7 +420,7 @@ release_and_out: return error; } -struct xattr_handler ext2_xattr_acl_access_handler = { +const struct xattr_handler ext2_xattr_acl_access_handler = { .prefix = POSIX_ACL_XATTR_ACCESS, .flags = ACL_TYPE_ACCESS, .list = ext2_xattr_list_acl_access, @@ -428,7 +428,7 @@ struct xattr_handler ext2_xattr_acl_access_handler = { .set = ext2_xattr_set_acl, }; -struct xattr_handler ext2_xattr_acl_default_handler = { +const struct xattr_handler ext2_xattr_acl_default_handler = { .prefix = POSIX_ACL_XATTR_DEFAULT, .flags = ACL_TYPE_DEFAULT, .list = ext2_xattr_list_acl_default, diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c index 3cf038c055d..e8766a39677 100644 --- a/fs/ext2/balloc.c +++ b/fs/ext2/balloc.c @@ -1332,6 +1332,12 @@ retry_alloc: free_blocks = le16_to_cpu(gdp->bg_free_blocks_count); /* + * skip this group (and avoid loading bitmap) if there + * are no free blocks + */ + if (!free_blocks) + continue; + /* * skip this group if the number of * free blocks is less than half of the reservation * window size. diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index ad7d572ee8d..938dbc739d0 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -106,7 +106,7 @@ void ext2_free_inode (struct inode * inode) struct super_block * sb = inode->i_sb; int is_directory; unsigned long ino; - struct buffer_head *bitmap_bh = NULL; + struct buffer_head *bitmap_bh; unsigned long block_group; unsigned long bit; struct ext2_super_block * es; @@ -135,14 +135,13 @@ void ext2_free_inode (struct inode * inode) ino > le32_to_cpu(es->s_inodes_count)) { ext2_error (sb, "ext2_free_inode", "reserved or nonexistent inode %lu", ino); - goto error_return; + return; } block_group = (ino - 1) / EXT2_INODES_PER_GROUP(sb); bit = (ino - 1) % EXT2_INODES_PER_GROUP(sb); - brelse(bitmap_bh); bitmap_bh = read_inode_bitmap(sb, block_group); if (!bitmap_bh) - goto error_return; + return; /* Ok, now we can actually update the inode bitmaps.. */ if (!ext2_clear_bit_atomic(sb_bgl_lock(EXT2_SB(sb), block_group), @@ -154,7 +153,7 @@ void ext2_free_inode (struct inode * inode) mark_buffer_dirty(bitmap_bh); if (sb->s_flags & MS_SYNCHRONOUS) sync_dirty_buffer(bitmap_bh); -error_return: + brelse(bitmap_bh); } @@ -550,16 +549,12 @@ got: sb->s_dirt = 1; mark_buffer_dirty(bh2); - inode->i_uid = current_fsuid(); - if (test_opt (sb, GRPID)) - inode->i_gid = dir->i_gid; - else if (dir->i_mode & S_ISGID) { + if (test_opt(sb, GRPID)) { + inode->i_mode = mode; + inode->i_uid = current_fsuid(); inode->i_gid = dir->i_gid; - if (S_ISDIR(mode)) - mode |= S_ISGID; } else - inode->i_gid = current_fsgid(); - inode->i_mode = mode; + inode_init_owner(inode, dir, mode); inode->i_ino = ino; inode->i_blocks = 0; diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index fc13cc119aa..527c46d9bc1 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -22,7 +22,6 @@ * Assorted race fixes, rewrite of ext2_get_block() by Al Viro, 2000 */ -#include <linux/smp_lock.h> #include <linux/time.h> #include <linux/highuid.h> #include <linux/pagemap.h> @@ -1406,11 +1405,11 @@ static int __ext2_write_inode(struct inode *inode, int do_sync) /* If this is the first large file * created, add a flag to the superblock. */ - lock_kernel(); + spin_lock(&EXT2_SB(sb)->s_lock); ext2_update_dynamic_rev(sb); EXT2_SET_RO_COMPAT_FEATURE(sb, EXT2_FEATURE_RO_COMPAT_LARGE_FILE); - unlock_kernel(); + spin_unlock(&EXT2_SB(sb)->s_lock); ext2_write_super(sb); } } @@ -1467,7 +1466,7 @@ int ext2_setattr(struct dentry *dentry, struct iattr *iattr) if (error) return error; - if (iattr->ia_valid & ATTR_SIZE) + if (is_quota_modification(inode, iattr)) dquot_initialize(inode); if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) || (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) { diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 42e4a303b67..71e9eb1fa69 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -26,7 +26,6 @@ #include <linux/random.h> #include <linux/buffer_head.h> #include <linux/exportfs.h> -#include <linux/smp_lock.h> #include <linux/vfs.h> #include <linux/seq_file.h> #include <linux/mount.h> @@ -39,7 +38,7 @@ #include "xip.h" static void ext2_sync_super(struct super_block *sb, - struct ext2_super_block *es); + struct ext2_super_block *es, int wait); static int ext2_remount (struct super_block * sb, int * flags, char * data); static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf); static int ext2_sync_fs(struct super_block *sb, int wait); @@ -52,9 +51,11 @@ void ext2_error (struct super_block * sb, const char * function, struct ext2_super_block *es = sbi->s_es; if (!(sb->s_flags & MS_RDONLY)) { + spin_lock(&sbi->s_lock); sbi->s_mount_state |= EXT2_ERROR_FS; es->s_state |= cpu_to_le16(EXT2_ERROR_FS); - ext2_sync_super(sb, es); + spin_unlock(&sbi->s_lock); + ext2_sync_super(sb, es, 1); } va_start(args, fmt); @@ -84,6 +85,9 @@ void ext2_msg(struct super_block *sb, const char *prefix, va_end(args); } +/* + * This must be called with sbi->s_lock held. + */ void ext2_update_dynamic_rev(struct super_block *sb) { struct ext2_super_block *es = EXT2_SB(sb)->s_es; @@ -115,8 +119,6 @@ static void ext2_put_super (struct super_block * sb) int i; struct ext2_sb_info *sbi = EXT2_SB(sb); - lock_kernel(); - if (sb->s_dirt) ext2_write_super(sb); @@ -124,8 +126,10 @@ static void ext2_put_super (struct super_block * sb) if (!(sb->s_flags & MS_RDONLY)) { struct ext2_super_block *es = sbi->s_es; + spin_lock(&sbi->s_lock); es->s_state = cpu_to_le16(sbi->s_mount_state); - ext2_sync_super(sb, es); + spin_unlock(&sbi->s_lock); + ext2_sync_super(sb, es, 1); } db_count = sbi->s_gdb_count; for (i = 0; i < db_count; i++) @@ -140,8 +144,6 @@ static void ext2_put_super (struct super_block * sb) sb->s_fs_info = NULL; kfree(sbi->s_blockgroup_lock); kfree(sbi); - - unlock_kernel(); } static struct kmem_cache * ext2_inode_cachep; @@ -209,6 +211,7 @@ static int ext2_show_options(struct seq_file *seq, struct vfsmount *vfs) struct ext2_super_block *es = sbi->s_es; unsigned long def_mount_opts; + spin_lock(&sbi->s_lock); def_mount_opts = le32_to_cpu(es->s_default_mount_opts); if (sbi->s_sb_block != 1) @@ -281,6 +284,7 @@ static int ext2_show_options(struct seq_file *seq, struct vfsmount *vfs) if (!test_opt(sb, RESERVATION)) seq_puts(seq, ",noreservation"); + spin_unlock(&sbi->s_lock); return 0; } @@ -606,7 +610,6 @@ static int ext2_setup_super (struct super_block * sb, if (!le16_to_cpu(es->s_max_mnt_count)) es->s_max_mnt_count = cpu_to_le16(EXT2_DFL_MAX_MNT_COUNT); le16_add_cpu(&es->s_mnt_count, 1); - ext2_write_super(sb); if (test_opt (sb, DEBUG)) ext2_msg(sb, KERN_INFO, "%s, %s, bs=%lu, fs=%lu, gc=%lu, " "bpg=%lu, ipg=%lu, mo=%04lx]", @@ -767,6 +770,8 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) sb->s_fs_info = sbi; sbi->s_sb_block = sb_block; + spin_lock_init(&sbi->s_lock); + /* * See what the current blocksize for the device is, and * use that as the blocksize. Otherwise (or if the blocksize @@ -1079,7 +1084,9 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) if (EXT2_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) ext2_msg(sb, KERN_WARNING, "warning: mounting ext3 filesystem as ext2"); - ext2_setup_super (sb, es, sb->s_flags & MS_RDONLY); + if (ext2_setup_super (sb, es, sb->s_flags & MS_RDONLY)) + sb->s_flags |= MS_RDONLY; + ext2_write_super(sb); return 0; cantfind_ext2: @@ -1120,30 +1127,26 @@ static void ext2_clear_super_error(struct super_block *sb) * be remapped. Nothing we can do but to retry the * write and hope for the best. */ - printk(KERN_ERR "EXT2-fs: %s previous I/O error to " - "superblock detected", sb->s_id); + ext2_msg(sb, KERN_ERR, + "previous I/O error to superblock detected\n"); clear_buffer_write_io_error(sbh); set_buffer_uptodate(sbh); } } -static void ext2_commit_super (struct super_block * sb, - struct ext2_super_block * es) -{ - ext2_clear_super_error(sb); - es->s_wtime = cpu_to_le32(get_seconds()); - mark_buffer_dirty(EXT2_SB(sb)->s_sbh); - sb->s_dirt = 0; -} - -static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es) +static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es, + int wait) { ext2_clear_super_error(sb); + spin_lock(&EXT2_SB(sb)->s_lock); es->s_free_blocks_count = cpu_to_le32(ext2_count_free_blocks(sb)); es->s_free_inodes_count = cpu_to_le32(ext2_count_free_inodes(sb)); es->s_wtime = cpu_to_le32(get_seconds()); + /* unlock before we do IO */ + spin_unlock(&EXT2_SB(sb)->s_lock); mark_buffer_dirty(EXT2_SB(sb)->s_sbh); - sync_dirty_buffer(EXT2_SB(sb)->s_sbh); + if (wait) + sync_dirty_buffer(EXT2_SB(sb)->s_sbh); sb->s_dirt = 0; } @@ -1157,43 +1160,18 @@ static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es) * may have been checked while mounted and e2fsck may have * set s_state to EXT2_VALID_FS after some corrections. */ - static int ext2_sync_fs(struct super_block *sb, int wait) { + struct ext2_sb_info *sbi = EXT2_SB(sb); struct ext2_super_block *es = EXT2_SB(sb)->s_es; - struct buffer_head *sbh = EXT2_SB(sb)->s_sbh; - - lock_kernel(); - if (buffer_write_io_error(sbh)) { - /* - * Oh, dear. A previous attempt to write the - * superblock failed. This could happen because the - * USB device was yanked out. Or it could happen to - * be a transient write error and maybe the block will - * be remapped. Nothing we can do but to retry the - * write and hope for the best. - */ - ext2_msg(sb, KERN_ERR, - "previous I/O error to superblock detected\n"); - clear_buffer_write_io_error(sbh); - set_buffer_uptodate(sbh); - } + spin_lock(&sbi->s_lock); if (es->s_state & cpu_to_le16(EXT2_VALID_FS)) { ext2_debug("setting valid to 0\n"); es->s_state &= cpu_to_le16(~EXT2_VALID_FS); - es->s_free_blocks_count = - cpu_to_le32(ext2_count_free_blocks(sb)); - es->s_free_inodes_count = - cpu_to_le32(ext2_count_free_inodes(sb)); - es->s_mtime = cpu_to_le32(get_seconds()); - ext2_sync_super(sb, es); - } else { - ext2_commit_super(sb, es); } - sb->s_dirt = 0; - unlock_kernel(); - + spin_unlock(&sbi->s_lock); + ext2_sync_super(sb, es, wait); return 0; } @@ -1215,7 +1193,7 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data) unsigned long old_sb_flags; int err; - lock_kernel(); + spin_lock(&sbi->s_lock); /* Store the old options */ old_sb_flags = sb->s_flags; @@ -1254,13 +1232,13 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data) sbi->s_mount_opt |= old_mount_opt & EXT2_MOUNT_XIP; } if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { - unlock_kernel(); + spin_unlock(&sbi->s_lock); return 0; } if (*flags & MS_RDONLY) { if (le16_to_cpu(es->s_state) & EXT2_VALID_FS || !(sbi->s_mount_state & EXT2_VALID_FS)) { - unlock_kernel(); + spin_unlock(&sbi->s_lock); return 0; } /* @@ -1269,6 +1247,8 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data) */ es->s_state = cpu_to_le16(sbi->s_mount_state); es->s_mtime = cpu_to_le32(get_seconds()); + spin_unlock(&sbi->s_lock); + ext2_sync_super(sb, es, 1); } else { __le32 ret = EXT2_HAS_RO_COMPAT_FEATURE(sb, ~EXT2_FEATURE_RO_COMPAT_SUPP); @@ -1288,16 +1268,16 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data) sbi->s_mount_state = le16_to_cpu(es->s_state); if (!ext2_setup_super (sb, es, 0)) sb->s_flags &= ~MS_RDONLY; + spin_unlock(&sbi->s_lock); + ext2_write_super(sb); } - ext2_sync_super(sb, es); - unlock_kernel(); return 0; restore_opts: sbi->s_mount_opt = old_opts.s_mount_opt; sbi->s_resuid = old_opts.s_resuid; sbi->s_resgid = old_opts.s_resgid; sb->s_flags = old_sb_flags; - unlock_kernel(); + spin_unlock(&sbi->s_lock); return err; } @@ -1308,6 +1288,8 @@ static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf) struct ext2_super_block *es = sbi->s_es; u64 fsid; + spin_lock(&sbi->s_lock); + if (test_opt (sb, MINIX_DF)) sbi->s_overhead_last = 0; else if (sbi->s_blocks_last != le32_to_cpu(es->s_blocks_count)) { @@ -1362,6 +1344,7 @@ static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf) le64_to_cpup((void *)es->s_uuid + sizeof(u64)); buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL; buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL; + spin_unlock(&sbi->s_lock); return 0; } diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index e44dc92609b..7c3915780b1 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -101,7 +101,7 @@ static void ext2_xattr_rehash(struct ext2_xattr_header *, static struct mb_cache *ext2_xattr_cache; -static struct xattr_handler *ext2_xattr_handler_map[] = { +static const struct xattr_handler *ext2_xattr_handler_map[] = { [EXT2_XATTR_INDEX_USER] = &ext2_xattr_user_handler, #ifdef CONFIG_EXT2_FS_POSIX_ACL [EXT2_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext2_xattr_acl_access_handler, @@ -113,7 +113,7 @@ static struct xattr_handler *ext2_xattr_handler_map[] = { #endif }; -struct xattr_handler *ext2_xattr_handlers[] = { +const struct xattr_handler *ext2_xattr_handlers[] = { &ext2_xattr_user_handler, &ext2_xattr_trusted_handler, #ifdef CONFIG_EXT2_FS_POSIX_ACL @@ -126,10 +126,10 @@ struct xattr_handler *ext2_xattr_handlers[] = { NULL }; -static inline struct xattr_handler * +static inline const struct xattr_handler * ext2_xattr_handler(int name_index) { - struct xattr_handler *handler = NULL; + const struct xattr_handler *handler = NULL; if (name_index > 0 && name_index < ARRAY_SIZE(ext2_xattr_handler_map)) handler = ext2_xattr_handler_map[name_index]; @@ -298,7 +298,7 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_list", /* list the attribute names */ for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry); entry = EXT2_XATTR_NEXT(entry)) { - struct xattr_handler *handler = + const struct xattr_handler *handler = ext2_xattr_handler(entry->e_name_index); if (handler) { @@ -345,7 +345,9 @@ static void ext2_xattr_update_super_block(struct super_block *sb) if (EXT2_HAS_COMPAT_FEATURE(sb, EXT2_FEATURE_COMPAT_EXT_ATTR)) return; + spin_lock(&EXT2_SB(sb)->s_lock); EXT2_SET_COMPAT_FEATURE(sb, EXT2_FEATURE_COMPAT_EXT_ATTR); + spin_unlock(&EXT2_SB(sb)->s_lock); sb->s_dirt = 1; mark_buffer_dirty(EXT2_SB(sb)->s_sbh); } diff --git a/fs/ext2/xattr.h b/fs/ext2/xattr.h index bf8175b2ced..a1a1c218461 100644 --- a/fs/ext2/xattr.h +++ b/fs/ext2/xattr.h @@ -55,11 +55,11 @@ struct ext2_xattr_entry { # ifdef CONFIG_EXT2_FS_XATTR -extern struct xattr_handler ext2_xattr_user_handler; -extern struct xattr_handler ext2_xattr_trusted_handler; -extern struct xattr_handler ext2_xattr_acl_access_handler; -extern struct xattr_handler ext2_xattr_acl_default_handler; -extern struct xattr_handler ext2_xattr_security_handler; +extern const struct xattr_handler ext2_xattr_user_handler; +extern const struct xattr_handler ext2_xattr_trusted_handler; +extern const struct xattr_handler ext2_xattr_acl_access_handler; +extern const struct xattr_handler ext2_xattr_acl_default_handler; +extern const struct xattr_handler ext2_xattr_security_handler; extern ssize_t ext2_listxattr(struct dentry *, char *, size_t); @@ -72,7 +72,7 @@ extern void ext2_xattr_put_super(struct super_block *); extern int init_ext2_xattr(void); extern void exit_ext2_xattr(void); -extern struct xattr_handler *ext2_xattr_handlers[]; +extern const struct xattr_handler *ext2_xattr_handlers[]; # else /* CONFIG_EXT2_FS_XATTR */ diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c index b118c6383c6..3004e15d5da 100644 --- a/fs/ext2/xattr_security.c +++ b/fs/ext2/xattr_security.c @@ -67,7 +67,7 @@ ext2_init_security(struct inode *inode, struct inode *dir) return err; } -struct xattr_handler ext2_xattr_security_handler = { +const struct xattr_handler ext2_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, .list = ext2_xattr_security_list, .get = ext2_xattr_security_get, diff --git a/fs/ext2/xattr_trusted.c b/fs/ext2/xattr_trusted.c index 2a26d71f477..667e46a8d62 100644 --- a/fs/ext2/xattr_trusted.c +++ b/fs/ext2/xattr_trusted.c @@ -50,7 +50,7 @@ ext2_xattr_trusted_set(struct dentry *dentry, const char *name, value, size, flags); } -struct xattr_handler ext2_xattr_trusted_handler = { +const struct xattr_handler ext2_xattr_trusted_handler = { .prefix = XATTR_TRUSTED_PREFIX, .list = ext2_xattr_trusted_list, .get = ext2_xattr_trusted_get, diff --git a/fs/ext2/xattr_user.c b/fs/ext2/xattr_user.c index 3f6caf3684b..099d20f4716 100644 --- a/fs/ext2/xattr_user.c +++ b/fs/ext2/xattr_user.c @@ -54,7 +54,7 @@ ext2_xattr_user_set(struct dentry *dentry, const char *name, name, value, size, flags); } -struct xattr_handler ext2_xattr_user_handler = { +const struct xattr_handler ext2_xattr_user_handler = { .prefix = XATTR_USER_PREFIX, .list = ext2_xattr_user_list, .get = ext2_xattr_user_get, diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c index 82ba3415866..01552abbca3 100644 --- a/fs/ext3/acl.c +++ b/fs/ext3/acl.c @@ -456,7 +456,7 @@ release_and_out: return error; } -struct xattr_handler ext3_xattr_acl_access_handler = { +const struct xattr_handler ext3_xattr_acl_access_handler = { .prefix = POSIX_ACL_XATTR_ACCESS, .flags = ACL_TYPE_ACCESS, .list = ext3_xattr_list_acl_access, @@ -464,7 +464,7 @@ struct xattr_handler ext3_xattr_acl_access_handler = { .set = ext3_xattr_set_acl, }; -struct xattr_handler ext3_xattr_acl_default_handler = { +const struct xattr_handler ext3_xattr_acl_default_handler = { .prefix = POSIX_ACL_XATTR_DEFAULT, .flags = ACL_TYPE_DEFAULT, .list = ext3_xattr_list_acl_default, diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c index a177122a1b2..4a32511f4de 100644 --- a/fs/ext3/balloc.c +++ b/fs/ext3/balloc.c @@ -1584,6 +1584,12 @@ retry_alloc: goto io_error; free_blocks = le16_to_cpu(gdp->bg_free_blocks_count); /* + * skip this group (and avoid loading bitmap) if there + * are no free blocks + */ + if (!free_blocks) + continue; + /* * skip this group if the number of * free blocks is less than half of the reservation * window size. diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c index 8209f266e9a..fcf7487734b 100644 --- a/fs/ext3/fsync.c +++ b/fs/ext3/fsync.c @@ -48,7 +48,7 @@ int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync) struct inode *inode = dentry->d_inode; struct ext3_inode_info *ei = EXT3_I(inode); journal_t *journal = EXT3_SB(inode->i_sb)->s_journal; - int ret = 0; + int ret, needs_barrier = 0; tid_t commit_tid; if (inode->i_sb->s_flags & MS_RDONLY) @@ -70,28 +70,27 @@ int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync) * (they were dirtied by commit). But that's OK - the blocks are * safe in-journal, which is all fsync() needs to ensure. */ - if (ext3_should_journal_data(inode)) { - ret = ext3_force_commit(inode->i_sb); - goto out; - } + if (ext3_should_journal_data(inode)) + return ext3_force_commit(inode->i_sb); if (datasync) commit_tid = atomic_read(&ei->i_datasync_tid); else commit_tid = atomic_read(&ei->i_sync_tid); - if (log_start_commit(journal, commit_tid)) { - log_wait_commit(journal, commit_tid); - goto out; - } + if (test_opt(inode->i_sb, BARRIER) && + !journal_trans_will_send_data_barrier(journal, commit_tid)) + needs_barrier = 1; + log_start_commit(journal, commit_tid); + ret = log_wait_commit(journal, commit_tid); /* * In case we didn't commit a transaction, we have to flush * disk caches manually so that data really is on persistent * storage */ - if (test_opt(inode->i_sb, BARRIER)) - blkdev_issue_flush(inode->i_sb->s_bdev, NULL); -out: + if (needs_barrier) + blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL, + BLKDEV_IFL_WAIT); return ret; } diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c index 0d0e97ed3ff..498021eb88f 100644 --- a/fs/ext3/ialloc.c +++ b/fs/ext3/ialloc.c @@ -538,16 +538,13 @@ got: if (S_ISDIR(mode)) percpu_counter_inc(&sbi->s_dirs_counter); - inode->i_uid = current_fsuid(); - if (test_opt (sb, GRPID)) - inode->i_gid = dir->i_gid; - else if (dir->i_mode & S_ISGID) { + + if (test_opt(sb, GRPID)) { + inode->i_mode = mode; + inode->i_uid = current_fsuid(); inode->i_gid = dir->i_gid; - if (S_ISDIR(mode)) - mode |= S_ISGID; } else - inode->i_gid = current_fsgid(); - inode->i_mode = mode; + inode_init_owner(inode, dir, mode); inode->i_ino = ino; /* This is the optimal IO size (for stat), not the fs block size */ diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index ea33bdf0a30..735f0190ec2 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -3151,7 +3151,7 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr) if (error) return error; - if (ia_valid & ATTR_SIZE) + if (is_quota_modification(inode, attr)) dquot_initialize(inode); if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 1bee604cc6c..0fc1293d0e9 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -653,8 +653,12 @@ static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_printf(seq, ",commit=%u", (unsigned) (sbi->s_commit_interval / HZ)); } - if (test_opt(sb, BARRIER)) - seq_puts(seq, ",barrier=1"); + + /* + * Always display barrier state so it's clear what the status is. + */ + seq_puts(seq, ",barrier="); + seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0"); if (test_opt(sb, NOBH)) seq_puts(seq, ",nobh"); @@ -810,8 +814,8 @@ enum { Opt_data_err_abort, Opt_data_err_ignore, Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, - Opt_noquota, Opt_ignore, Opt_barrier, Opt_err, Opt_resize, - Opt_usrquota, Opt_grpquota + Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, + Opt_resize, Opt_usrquota, Opt_grpquota }; static const match_table_t tokens = { @@ -865,6 +869,8 @@ static const match_table_t tokens = { {Opt_quota, "quota"}, {Opt_usrquota, "usrquota"}, {Opt_barrier, "barrier=%u"}, + {Opt_barrier, "barrier"}, + {Opt_nobarrier, "nobarrier"}, {Opt_resize, "resize"}, {Opt_err, NULL}, }; @@ -967,7 +973,11 @@ static int parse_options (char *options, struct super_block *sb, int token; if (!*p) continue; - + /* + * Initialize args struct so we know whether arg was + * found; some options take optional arguments. + */ + args[0].to = args[0].from = 0; token = match_token(p, tokens, args); switch (token) { case Opt_bsd_df: @@ -1215,9 +1225,15 @@ set_qf_format: case Opt_abort: set_opt(sbi->s_mount_opt, ABORT); break; + case Opt_nobarrier: + clear_opt(sbi->s_mount_opt, BARRIER); + break; case Opt_barrier: - if (match_int(&args[0], &option)) - return 0; + if (args[0].from) { + if (match_int(&args[0], &option)) + return 0; + } else + option = 1; /* No argument, default to 1 */ if (option) set_opt(sbi->s_mount_opt, BARRIER); else @@ -1890,21 +1906,6 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) get_random_bytes(&sbi->s_next_generation, sizeof(u32)); spin_lock_init(&sbi->s_next_gen_lock); - err = percpu_counter_init(&sbi->s_freeblocks_counter, - ext3_count_free_blocks(sb)); - if (!err) { - err = percpu_counter_init(&sbi->s_freeinodes_counter, - ext3_count_free_inodes(sb)); - } - if (!err) { - err = percpu_counter_init(&sbi->s_dirs_counter, - ext3_count_dirs(sb)); - } - if (err) { - ext3_msg(sb, KERN_ERR, "error: insufficient memory"); - goto failed_mount3; - } - /* per fileystem reservation list head & lock */ spin_lock_init(&sbi->s_rsv_window_lock); sbi->s_rsv_window_root = RB_ROOT; @@ -1945,15 +1946,29 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) if (!test_opt(sb, NOLOAD) && EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) { if (ext3_load_journal(sb, es, journal_devnum)) - goto failed_mount3; + goto failed_mount2; } else if (journal_inum) { if (ext3_create_journal(sb, es, journal_inum)) - goto failed_mount3; + goto failed_mount2; } else { if (!silent) ext3_msg(sb, KERN_ERR, "error: no journal found. " "mounting ext3 over ext2?"); + goto failed_mount2; + } + err = percpu_counter_init(&sbi->s_freeblocks_counter, + ext3_count_free_blocks(sb)); + if (!err) { + err = percpu_counter_init(&sbi->s_freeinodes_counter, + ext3_count_free_inodes(sb)); + } + if (!err) { + err = percpu_counter_init(&sbi->s_dirs_counter, + ext3_count_dirs(sb)); + } + if (err) { + ext3_msg(sb, KERN_ERR, "error: insufficient memory"); goto failed_mount3; } @@ -1978,7 +1993,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) ext3_msg(sb, KERN_ERR, "error: journal does not support " "requested data journaling mode"); - goto failed_mount4; + goto failed_mount3; } default: break; @@ -2001,19 +2016,19 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) if (IS_ERR(root)) { ext3_msg(sb, KERN_ERR, "error: get root inode failed"); ret = PTR_ERR(root); - goto failed_mount4; + goto failed_mount3; } if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { iput(root); ext3_msg(sb, KERN_ERR, "error: corrupt root inode, run e2fsck"); - goto failed_mount4; + goto failed_mount3; } sb->s_root = d_alloc_root(root); if (!sb->s_root) { ext3_msg(sb, KERN_ERR, "error: get root dentry failed"); iput(root); ret = -ENOMEM; - goto failed_mount4; + goto failed_mount3; } ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY); @@ -2039,12 +2054,11 @@ cantfind_ext3: sb->s_id); goto failed_mount; -failed_mount4: - journal_destroy(sbi->s_journal); failed_mount3: percpu_counter_destroy(&sbi->s_freeblocks_counter); percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); + journal_destroy(sbi->s_journal); failed_mount2: for (i = 0; i < db_count; i++) brelse(sbi->s_group_desc[i]); @@ -2278,6 +2292,9 @@ static int ext3_load_journal(struct super_block *sb, return -EINVAL; } + if (!(journal->j_flags & JFS_BARRIER)) + printk(KERN_INFO "EXT3-fs: barriers not enabled\n"); + if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) { err = journal_update_format(journal); if (err) { diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c index 534a94c3a93..71fb8d65e54 100644 --- a/fs/ext3/xattr.c +++ b/fs/ext3/xattr.c @@ -104,7 +104,7 @@ static int ext3_xattr_list(struct dentry *dentry, char *buffer, static struct mb_cache *ext3_xattr_cache; -static struct xattr_handler *ext3_xattr_handler_map[] = { +static const struct xattr_handler *ext3_xattr_handler_map[] = { [EXT3_XATTR_INDEX_USER] = &ext3_xattr_user_handler, #ifdef CONFIG_EXT3_FS_POSIX_ACL [EXT3_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext3_xattr_acl_access_handler, @@ -116,7 +116,7 @@ static struct xattr_handler *ext3_xattr_handler_map[] = { #endif }; -struct xattr_handler *ext3_xattr_handlers[] = { +const struct xattr_handler *ext3_xattr_handlers[] = { &ext3_xattr_user_handler, &ext3_xattr_trusted_handler, #ifdef CONFIG_EXT3_FS_POSIX_ACL @@ -129,10 +129,10 @@ struct xattr_handler *ext3_xattr_handlers[] = { NULL }; -static inline struct xattr_handler * +static inline const struct xattr_handler * ext3_xattr_handler(int name_index) { - struct xattr_handler *handler = NULL; + const struct xattr_handler *handler = NULL; if (name_index > 0 && name_index < ARRAY_SIZE(ext3_xattr_handler_map)) handler = ext3_xattr_handler_map[name_index]; @@ -338,7 +338,7 @@ ext3_xattr_list_entries(struct dentry *dentry, struct ext3_xattr_entry *entry, size_t rest = buffer_size; for (; !IS_LAST_ENTRY(entry); entry = EXT3_XATTR_NEXT(entry)) { - struct xattr_handler *handler = + const struct xattr_handler *handler = ext3_xattr_handler(entry->e_name_index); if (handler) { diff --git a/fs/ext3/xattr.h b/fs/ext3/xattr.h index 148a4dfc82a..377fe720116 100644 --- a/fs/ext3/xattr.h +++ b/fs/ext3/xattr.h @@ -58,11 +58,11 @@ struct ext3_xattr_entry { # ifdef CONFIG_EXT3_FS_XATTR -extern struct xattr_handler ext3_xattr_user_handler; -extern struct xattr_handler ext3_xattr_trusted_handler; -extern struct xattr_handler ext3_xattr_acl_access_handler; -extern struct xattr_handler ext3_xattr_acl_default_handler; -extern struct xattr_handler ext3_xattr_security_handler; +extern const struct xattr_handler ext3_xattr_user_handler; +extern const struct xattr_handler ext3_xattr_trusted_handler; +extern const struct xattr_handler ext3_xattr_acl_access_handler; +extern const struct xattr_handler ext3_xattr_acl_default_handler; +extern const struct xattr_handler ext3_xattr_security_handler; extern ssize_t ext3_listxattr(struct dentry *, char *, size_t); @@ -76,7 +76,7 @@ extern void ext3_xattr_put_super(struct super_block *); extern int init_ext3_xattr(void); extern void exit_ext3_xattr(void); -extern struct xattr_handler *ext3_xattr_handlers[]; +extern const struct xattr_handler *ext3_xattr_handlers[]; # else /* CONFIG_EXT3_FS_XATTR */ diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c index 3af91f476df..03a99bfc59f 100644 --- a/fs/ext3/xattr_security.c +++ b/fs/ext3/xattr_security.c @@ -69,7 +69,7 @@ ext3_init_security(handle_t *handle, struct inode *inode, struct inode *dir) return err; } -struct xattr_handler ext3_xattr_security_handler = { +const struct xattr_handler ext3_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, .list = ext3_xattr_security_list, .get = ext3_xattr_security_get, diff --git a/fs/ext3/xattr_trusted.c b/fs/ext3/xattr_trusted.c index e5562845ed9..dc8edda9ffe 100644 --- a/fs/ext3/xattr_trusted.c +++ b/fs/ext3/xattr_trusted.c @@ -51,7 +51,7 @@ ext3_xattr_trusted_set(struct dentry *dentry, const char *name, value, size, flags); } -struct xattr_handler ext3_xattr_trusted_handler = { +const struct xattr_handler ext3_xattr_trusted_handler = { .prefix = XATTR_TRUSTED_PREFIX, .list = ext3_xattr_trusted_list, .get = ext3_xattr_trusted_get, diff --git a/fs/ext3/xattr_user.c b/fs/ext3/xattr_user.c index 3bcfe9ee0a6..7a321974d58 100644 --- a/fs/ext3/xattr_user.c +++ b/fs/ext3/xattr_user.c @@ -54,7 +54,7 @@ ext3_xattr_user_set(struct dentry *dentry, const char *name, name, value, size, flags); } -struct xattr_handler ext3_xattr_user_handler = { +const struct xattr_handler ext3_xattr_user_handler = { .prefix = XATTR_USER_PREFIX, .list = ext3_xattr_user_list, .get = ext3_xattr_user_get, diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index 8a2a29d35a6..feaf498feaa 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -454,7 +454,7 @@ release_and_out: return error; } -struct xattr_handler ext4_xattr_acl_access_handler = { +const struct xattr_handler ext4_xattr_acl_access_handler = { .prefix = POSIX_ACL_XATTR_ACCESS, .flags = ACL_TYPE_ACCESS, .list = ext4_xattr_list_acl_access, @@ -462,7 +462,7 @@ struct xattr_handler ext4_xattr_acl_access_handler = { .set = ext4_xattr_set_acl, }; -struct xattr_handler ext4_xattr_acl_default_handler = { +const struct xattr_handler ext4_xattr_acl_default_handler = { .prefix = POSIX_ACL_XATTR_DEFAULT, .flags = ACL_TYPE_DEFAULT, .list = ext4_xattr_list_acl_default, diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 0d0c3239c1c..ef3d980e67c 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -100,9 +100,11 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) if (ext4_should_writeback_data(inode) && (journal->j_fs_dev != journal->j_dev) && (journal->j_flags & JBD2_BARRIER)) - blkdev_issue_flush(inode->i_sb->s_bdev, NULL); + blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, + NULL, BLKDEV_IFL_WAIT); jbd2_log_wait_commit(journal, commit_tid); } else if (journal->j_flags & JBD2_BARRIER) - blkdev_issue_flush(inode->i_sb->s_bdev, NULL); + blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL, + BLKDEV_IFL_WAIT); return ret; } diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 57f6eef6ccd..1a0e183a2f0 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -979,16 +979,12 @@ got: atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes); } - inode->i_uid = current_fsuid(); - if (test_opt(sb, GRPID)) + if (test_opt(sb, GRPID)) { + inode->i_mode = mode; + inode->i_uid = current_fsuid(); inode->i_gid = dir->i_gid; - else if (dir->i_mode & S_ISGID) { - inode->i_gid = dir->i_gid; - if (S_ISDIR(mode)) - mode |= S_ISGID; } else - inode->i_gid = current_fsgid(); - inode->i_mode = mode; + inode_init_owner(inode, dir, mode); inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb); /* This is the optimal IO size (for stat), not the fs block size */ diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 81d60541284..3e0f6af9d08 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5425,7 +5425,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) if (error) return error; - if (ia_valid & ATTR_SIZE) + if (is_quota_modification(inode, attr)) dquot_initialize(inode); if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index b4c5aa8489d..2de0e951508 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -97,7 +97,7 @@ static int ext4_xattr_list(struct dentry *dentry, char *buffer, static struct mb_cache *ext4_xattr_cache; -static struct xattr_handler *ext4_xattr_handler_map[] = { +static const struct xattr_handler *ext4_xattr_handler_map[] = { [EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler, #ifdef CONFIG_EXT4_FS_POSIX_ACL [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext4_xattr_acl_access_handler, @@ -109,7 +109,7 @@ static struct xattr_handler *ext4_xattr_handler_map[] = { #endif }; -struct xattr_handler *ext4_xattr_handlers[] = { +const struct xattr_handler *ext4_xattr_handlers[] = { &ext4_xattr_user_handler, &ext4_xattr_trusted_handler, #ifdef CONFIG_EXT4_FS_POSIX_ACL @@ -122,10 +122,10 @@ struct xattr_handler *ext4_xattr_handlers[] = { NULL }; -static inline struct xattr_handler * +static inline const struct xattr_handler * ext4_xattr_handler(int name_index) { - struct xattr_handler *handler = NULL; + const struct xattr_handler *handler = NULL; if (name_index > 0 && name_index < ARRAY_SIZE(ext4_xattr_handler_map)) handler = ext4_xattr_handler_map[name_index]; @@ -332,7 +332,7 @@ ext4_xattr_list_entries(struct dentry *dentry, struct ext4_xattr_entry *entry, size_t rest = buffer_size; for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { - struct xattr_handler *handler = + const struct xattr_handler *handler = ext4_xattr_handler(entry->e_name_index); if (handler) { diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 8ede88b18c2..518e96e4390 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -65,11 +65,11 @@ struct ext4_xattr_entry { # ifdef CONFIG_EXT4_FS_XATTR -extern struct xattr_handler ext4_xattr_user_handler; -extern struct xattr_handler ext4_xattr_trusted_handler; -extern struct xattr_handler ext4_xattr_acl_access_handler; -extern struct xattr_handler ext4_xattr_acl_default_handler; -extern struct xattr_handler ext4_xattr_security_handler; +extern const struct xattr_handler ext4_xattr_user_handler; +extern const struct xattr_handler ext4_xattr_trusted_handler; +extern const struct xattr_handler ext4_xattr_acl_access_handler; +extern const struct xattr_handler ext4_xattr_acl_default_handler; +extern const struct xattr_handler ext4_xattr_security_handler; extern ssize_t ext4_listxattr(struct dentry *, char *, size_t); @@ -86,7 +86,7 @@ extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, extern int init_ext4_xattr(void); extern void exit_ext4_xattr(void); -extern struct xattr_handler *ext4_xattr_handlers[]; +extern const struct xattr_handler *ext4_xattr_handlers[]; # else /* CONFIG_EXT4_FS_XATTR */ diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c index 8b145e98df0..9b21268e121 100644 --- a/fs/ext4/xattr_security.c +++ b/fs/ext4/xattr_security.c @@ -69,7 +69,7 @@ ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir) return err; } -struct xattr_handler ext4_xattr_security_handler = { +const struct xattr_handler ext4_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, .list = ext4_xattr_security_list, .get = ext4_xattr_security_get, diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c index 15b50edc658..37e6ebca2cc 100644 --- a/fs/ext4/xattr_trusted.c +++ b/fs/ext4/xattr_trusted.c @@ -51,7 +51,7 @@ ext4_xattr_trusted_set(struct dentry *dentry, const char *name, name, value, size, flags); } -struct xattr_handler ext4_xattr_trusted_handler = { +const struct xattr_handler ext4_xattr_trusted_handler = { .prefix = XATTR_TRUSTED_PREFIX, .list = ext4_xattr_trusted_list, .get = ext4_xattr_trusted_get, diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c index c4ce05746ce..98c375352d0 100644 --- a/fs/ext4/xattr_user.c +++ b/fs/ext4/xattr_user.c @@ -54,7 +54,7 @@ ext4_xattr_user_set(struct dentry *dentry, const char *name, name, value, size, flags); } -struct xattr_handler ext4_xattr_user_handler = { +const struct xattr_handler ext4_xattr_user_handler = { .prefix = XATTR_USER_PREFIX, .list = ext4_xattr_user_list, .get = ext4_xattr_user_get, diff --git a/fs/fat/cache.c b/fs/fat/cache.c index 113f0a1e565..ae8200f84e3 100644 --- a/fs/fat/cache.c +++ b/fs/fat/cache.c @@ -242,9 +242,10 @@ int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus) while (*fclus < cluster) { /* prevent the infinite loop of cluster chain */ if (*fclus > limit) { - fat_fs_error(sb, "%s: detected the cluster chain loop" - " (i_pos %lld)", __func__, - MSDOS_I(inode)->i_pos); + fat_fs_error_ratelimit(sb, + "%s: detected the cluster chain loop" + " (i_pos %lld)", __func__, + MSDOS_I(inode)->i_pos); nr = -EIO; goto out; } @@ -253,9 +254,9 @@ int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus) if (nr < 0) goto out; else if (nr == FAT_ENT_FREE) { - fat_fs_error(sb, "%s: invalid cluster chain" - " (i_pos %lld)", __func__, - MSDOS_I(inode)->i_pos); + fat_fs_error_ratelimit(sb, "%s: invalid cluster chain" + " (i_pos %lld)", __func__, + MSDOS_I(inode)->i_pos); nr = -EIO; goto out; } else if (nr == FAT_ENT_EOF) { diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 530b4ca0151..ee42b9e0b16 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -19,6 +19,7 @@ #include <linux/buffer_head.h> #include <linux/compat.h> #include <asm/uaccess.h> +#include <linux/kernel.h> #include "fat.h" /* @@ -140,28 +141,22 @@ static int uni16_to_x8(unsigned char *ascii, const wchar_t *uni, int len, { const wchar_t *ip; wchar_t ec; - unsigned char *op, nc; + unsigned char *op; int charlen; - int k; ip = uni; op = ascii; while (*ip && ((len - NLS_MAX_CHARSET_SIZE) > 0)) { ec = *ip++; - if ( (charlen = nls->uni2char(ec, op, NLS_MAX_CHARSET_SIZE)) > 0) { + if ((charlen = nls->uni2char(ec, op, NLS_MAX_CHARSET_SIZE)) > 0) { op += charlen; len -= charlen; } else { if (uni_xlate == 1) { - *op = ':'; - for (k = 4; k > 0; k--) { - nc = ec & 0xF; - op[k] = nc > 9 ? nc + ('a' - 10) - : nc + '0'; - ec >>= 4; - } - op += 5; + *op++ = ':'; + op = pack_hex_byte(op, ec >> 8); + op = pack_hex_byte(op, ec); len -= 5; } else { *op++ = '?'; @@ -758,9 +753,10 @@ static int fat_ioctl_readdir(struct inode *inode, struct file *filp, return ret; } -static int fat_dir_ioctl(struct inode *inode, struct file *filp, - unsigned int cmd, unsigned long arg) +static long fat_dir_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) { + struct inode *inode = filp->f_path.dentry->d_inode; struct __fat_dirent __user *d1 = (struct __fat_dirent __user *)arg; int short_only, both; @@ -774,7 +770,7 @@ static int fat_dir_ioctl(struct inode *inode, struct file *filp, both = 1; break; default: - return fat_generic_ioctl(inode, filp, cmd, arg); + return fat_generic_ioctl(filp, cmd, arg); } if (!access_ok(VERIFY_WRITE, d1, sizeof(struct __fat_dirent[2]))) @@ -814,7 +810,7 @@ static long fat_compat_dir_ioctl(struct file *filp, unsigned cmd, both = 1; break; default: - return -ENOIOCTLCMD; + return fat_generic_ioctl(filp, cmd, (unsigned long)arg); } if (!access_ok(VERIFY_WRITE, d1, sizeof(struct compat_dirent[2]))) @@ -836,7 +832,7 @@ const struct file_operations fat_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, .readdir = fat_readdir, - .ioctl = fat_dir_ioctl, + .unlocked_ioctl = fat_dir_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = fat_compat_dir_ioctl, #endif diff --git a/fs/fat/fat.h b/fs/fat/fat.h index e6efdfa0f6d..53dba57b49a 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -6,6 +6,7 @@ #include <linux/nls.h> #include <linux/fs.h> #include <linux/mutex.h> +#include <linux/ratelimit.h> #include <linux/msdos_fs.h> /* @@ -82,6 +83,8 @@ struct msdos_sb_info { struct fatent_operations *fatent_ops; struct inode *fat_inode; + struct ratelimit_state ratelimit; + spinlock_t inode_hash_lock; struct hlist_head inode_hashtable[FAT_HASH_SIZE]; }; @@ -298,8 +301,8 @@ extern int fat_free_clusters(struct inode *inode, int cluster); extern int fat_count_free_clusters(struct super_block *sb); /* fat/file.c */ -extern int fat_generic_ioctl(struct inode *inode, struct file *filp, - unsigned int cmd, unsigned long arg); +extern long fat_generic_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg); extern const struct file_operations fat_file_operations; extern const struct inode_operations fat_file_inode_operations; extern int fat_setattr(struct dentry * dentry, struct iattr * attr); @@ -322,8 +325,13 @@ extern int fat_fill_super(struct super_block *sb, void *data, int silent, extern int fat_flush_inodes(struct super_block *sb, struct inode *i1, struct inode *i2); /* fat/misc.c */ -extern void fat_fs_error(struct super_block *s, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))) __cold; +extern void +__fat_fs_error(struct super_block *s, int report, const char *fmt, ...) + __attribute__ ((format (printf, 3, 4))) __cold; +#define fat_fs_error(s, fmt, args...) \ + __fat_fs_error(s, 1, fmt , ## args) +#define fat_fs_error_ratelimit(s, fmt, args...) \ + __fat_fs_error(s, __ratelimit(&MSDOS_SB(s)->ratelimit), fmt , ## args) extern int fat_clusters_flush(struct super_block *sb); extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster); extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts, diff --git a/fs/fat/file.c b/fs/fat/file.c index e8c159de236..a14c2f6a489 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -8,6 +8,7 @@ #include <linux/capability.h> #include <linux/module.h> +#include <linux/compat.h> #include <linux/mount.h> #include <linux/time.h> #include <linux/buffer_head.h> @@ -114,9 +115,9 @@ out: return err; } -int fat_generic_ioctl(struct inode *inode, struct file *filp, - unsigned int cmd, unsigned long arg) +long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { + struct inode *inode = filp->f_path.dentry->d_inode; u32 __user *user_attr = (u32 __user *)arg; switch (cmd) { @@ -129,6 +130,15 @@ int fat_generic_ioctl(struct inode *inode, struct file *filp, } } +#ifdef CONFIG_COMPAT +static long fat_generic_compat_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) + +{ + return fat_generic_ioctl(filp, cmd, (unsigned long)compat_ptr(arg)); +} +#endif + static int fat_file_release(struct inode *inode, struct file *filp) { if ((filp->f_mode & FMODE_WRITE) && @@ -159,7 +169,10 @@ const struct file_operations fat_file_operations = { .aio_write = generic_file_aio_write, .mmap = generic_file_mmap, .release = fat_file_release, - .ioctl = fat_generic_ioctl, + .unlocked_ioctl = fat_generic_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = fat_generic_compat_ioctl, +#endif .fsync = fat_file_fsync, .splice_read = generic_file_splice_read, }; diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 0ce143bd7d5..ed33904926e 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -1250,6 +1250,8 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, sb->s_op = &fat_sops; sb->s_export_op = &fat_export_ops; sbi->dir_ops = fs_dir_inode_ops; + ratelimit_state_init(&sbi->ratelimit, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); error = parse_options(data, isvfat, silent, &debug, &sbi->options); if (error) @@ -1497,10 +1499,8 @@ out_fail: iput(fat_inode); if (root_inode) iput(root_inode); - if (sbi->nls_io) - unload_nls(sbi->nls_io); - if (sbi->nls_disk) - unload_nls(sbi->nls_disk); + unload_nls(sbi->nls_io); + unload_nls(sbi->nls_disk); if (sbi->options.iocharset != fat_default_iocharset) kfree(sbi->options.iocharset); sb->s_fs_info = NULL; diff --git a/fs/fat/misc.c b/fs/fat/misc.c index d3da05f2646..1fa23f6ffba 100644 --- a/fs/fat/misc.c +++ b/fs/fat/misc.c @@ -20,27 +20,29 @@ * In case the file system is remounted read-only, it can be made writable * again by remounting it. */ -void fat_fs_error(struct super_block *s, const char *fmt, ...) +void __fat_fs_error(struct super_block *s, int report, const char *fmt, ...) { struct fat_mount_options *opts = &MSDOS_SB(s)->options; va_list args; - printk(KERN_ERR "FAT: Filesystem error (dev %s)\n", s->s_id); + if (report) { + printk(KERN_ERR "FAT: Filesystem error (dev %s)\n", s->s_id); - printk(KERN_ERR " "); - va_start(args, fmt); - vprintk(fmt, args); - va_end(args); - printk("\n"); + printk(KERN_ERR " "); + va_start(args, fmt); + vprintk(fmt, args); + va_end(args); + printk("\n"); + } if (opts->errors == FAT_ERRORS_PANIC) - panic(" FAT fs panic from previous error\n"); + panic("FAT: fs panic from previous error\n"); else if (opts->errors == FAT_ERRORS_RO && !(s->s_flags & MS_RDONLY)) { s->s_flags |= MS_RDONLY; - printk(KERN_ERR " File system has been set read-only\n"); + printk(KERN_ERR "FAT: Filesystem has been set read-only\n"); } } -EXPORT_SYMBOL_GPL(fat_fs_error); +EXPORT_SYMBOL_GPL(__fat_fs_error); /* Flushes the number of free clusters on FAT32 */ /* XXX: Need to write one per FSINFO block. Currently only writes 1 */ diff --git a/fs/fcntl.c b/fs/fcntl.c index 452d02f9075..f74d270ba15 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -14,6 +14,7 @@ #include <linux/dnotify.h> #include <linux/slab.h> #include <linux/module.h> +#include <linux/pipe_fs_i.h> #include <linux/security.h> #include <linux/ptrace.h> #include <linux/signal.h> @@ -412,6 +413,10 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, case F_NOTIFY: err = fcntl_dirnotify(fd, filp, arg); break; + case F_SETPIPE_SZ: + case F_GETPIPE_SZ: + err = pipe_fcntl(filp, cmd, arg); + break; default: break; } @@ -614,9 +619,15 @@ int send_sigurg(struct fown_struct *fown) return ret; } -static DEFINE_RWLOCK(fasync_lock); +static DEFINE_SPINLOCK(fasync_lock); static struct kmem_cache *fasync_cache __read_mostly; +static void fasync_free_rcu(struct rcu_head *head) +{ + kmem_cache_free(fasync_cache, + container_of(head, struct fasync_struct, fa_rcu)); +} + /* * Remove a fasync entry. If successfully removed, return * positive and clear the FASYNC flag. If no entry exists, @@ -625,8 +636,6 @@ static struct kmem_cache *fasync_cache __read_mostly; * NOTE! It is very important that the FASYNC flag always * match the state "is the filp on a fasync list". * - * We always take the 'filp->f_lock', in since fasync_lock - * needs to be irq-safe. */ static int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp) { @@ -634,17 +643,22 @@ static int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp) int result = 0; spin_lock(&filp->f_lock); - write_lock_irq(&fasync_lock); + spin_lock(&fasync_lock); for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) { if (fa->fa_file != filp) continue; + + spin_lock_irq(&fa->fa_lock); + fa->fa_file = NULL; + spin_unlock_irq(&fa->fa_lock); + *fp = fa->fa_next; - kmem_cache_free(fasync_cache, fa); + call_rcu(&fa->fa_rcu, fasync_free_rcu); filp->f_flags &= ~FASYNC; result = 1; break; } - write_unlock_irq(&fasync_lock); + spin_unlock(&fasync_lock); spin_unlock(&filp->f_lock); return result; } @@ -666,25 +680,30 @@ static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fa return -ENOMEM; spin_lock(&filp->f_lock); - write_lock_irq(&fasync_lock); + spin_lock(&fasync_lock); for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) { if (fa->fa_file != filp) continue; + + spin_lock_irq(&fa->fa_lock); fa->fa_fd = fd; + spin_unlock_irq(&fa->fa_lock); + kmem_cache_free(fasync_cache, new); goto out; } + spin_lock_init(&new->fa_lock); new->magic = FASYNC_MAGIC; new->fa_file = filp; new->fa_fd = fd; new->fa_next = *fapp; - *fapp = new; + rcu_assign_pointer(*fapp, new); result = 1; filp->f_flags |= FASYNC; out: - write_unlock_irq(&fasync_lock); + spin_unlock(&fasync_lock); spin_unlock(&filp->f_lock); return result; } @@ -704,37 +723,41 @@ int fasync_helper(int fd, struct file * filp, int on, struct fasync_struct **fap EXPORT_SYMBOL(fasync_helper); -void __kill_fasync(struct fasync_struct *fa, int sig, int band) +/* + * rcu_read_lock() is held + */ +static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band) { while (fa) { - struct fown_struct * fown; + struct fown_struct *fown; if (fa->magic != FASYNC_MAGIC) { printk(KERN_ERR "kill_fasync: bad magic number in " "fasync_struct!\n"); return; } - fown = &fa->fa_file->f_owner; - /* Don't send SIGURG to processes which have not set a - queued signum: SIGURG has its own default signalling - mechanism. */ - if (!(sig == SIGURG && fown->signum == 0)) - send_sigio(fown, fa->fa_fd, band); - fa = fa->fa_next; + spin_lock(&fa->fa_lock); + if (fa->fa_file) { + fown = &fa->fa_file->f_owner; + /* Don't send SIGURG to processes which have not set a + queued signum: SIGURG has its own default signalling + mechanism. */ + if (!(sig == SIGURG && fown->signum == 0)) + send_sigio(fown, fa->fa_fd, band); + } + spin_unlock(&fa->fa_lock); + fa = rcu_dereference(fa->fa_next); } } -EXPORT_SYMBOL(__kill_fasync); - void kill_fasync(struct fasync_struct **fp, int sig, int band) { /* First a quick test without locking: usually * the list is empty. */ if (*fp) { - read_lock(&fasync_lock); - /* reread *fp after obtaining the lock */ - __kill_fasync(*fp, sig, band); - read_unlock(&fasync_lock); + rcu_read_lock(); + kill_fasync_rcu(rcu_dereference(*fp), sig, band); + rcu_read_unlock(); } } EXPORT_SYMBOL(kill_fasync); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 4b37f7cea4d..ea8592b9069 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -42,9 +42,10 @@ struct wb_writeback_args { long nr_pages; struct super_block *sb; enum writeback_sync_modes sync_mode; - int for_kupdate:1; - int range_cyclic:1; - int for_background:1; + unsigned int for_kupdate:1; + unsigned int range_cyclic:1; + unsigned int for_background:1; + unsigned int sb_pinned:1; }; /* @@ -192,7 +193,8 @@ static void bdi_wait_on_work_clear(struct bdi_work *work) } static void bdi_alloc_queue_work(struct backing_dev_info *bdi, - struct wb_writeback_args *args) + struct wb_writeback_args *args, + int wait) { struct bdi_work *work; @@ -204,6 +206,8 @@ static void bdi_alloc_queue_work(struct backing_dev_info *bdi, if (work) { bdi_work_init(work, args); bdi_queue_work(bdi, work); + if (wait) + bdi_wait_on_work_clear(work); } else { struct bdi_writeback *wb = &bdi->wb; @@ -230,6 +234,11 @@ static void bdi_sync_writeback(struct backing_dev_info *bdi, .sync_mode = WB_SYNC_ALL, .nr_pages = LONG_MAX, .range_cyclic = 0, + /* + * Setting sb_pinned is not necessary for WB_SYNC_ALL, but + * lets make it explicitly clear. + */ + .sb_pinned = 1, }; struct bdi_work work; @@ -245,21 +254,23 @@ static void bdi_sync_writeback(struct backing_dev_info *bdi, * @bdi: the backing device to write from * @sb: write inodes from this super_block * @nr_pages: the number of pages to write + * @sb_locked: caller already holds sb umount sem. * * Description: * This does WB_SYNC_NONE opportunistic writeback. The IO is only * started when this function returns, we make no guarentees on - * completion. Caller need not hold sb s_umount semaphore. + * completion. Caller specifies whether sb umount sem is held already or not. * */ void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb, - long nr_pages) + long nr_pages, int sb_locked) { struct wb_writeback_args args = { .sb = sb, .sync_mode = WB_SYNC_NONE, .nr_pages = nr_pages, .range_cyclic = 1, + .sb_pinned = sb_locked, }; /* @@ -271,7 +282,7 @@ void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb, args.for_background = 1; } - bdi_alloc_queue_work(bdi, &args); + bdi_alloc_queue_work(bdi, &args, sb_locked); } /* @@ -398,11 +409,11 @@ static void inode_wait_for_writeback(struct inode *inode) wait_queue_head_t *wqh; wqh = bit_waitqueue(&inode->i_state, __I_SYNC); - do { + while (inode->i_state & I_SYNC) { spin_unlock(&inode_lock); __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE); spin_lock(&inode_lock); - } while (inode->i_state & I_SYNC); + } } /* @@ -452,11 +463,9 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) BUG_ON(inode->i_state & I_SYNC); - /* Set I_SYNC, reset I_DIRTY */ - dirty = inode->i_state & I_DIRTY; + /* Set I_SYNC, reset I_DIRTY_PAGES */ inode->i_state |= I_SYNC; - inode->i_state &= ~I_DIRTY; - + inode->i_state &= ~I_DIRTY_PAGES; spin_unlock(&inode_lock); ret = do_writepages(mapping, wbc); @@ -472,6 +481,15 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) ret = err; } + /* + * Some filesystems may redirty the inode during the writeback + * due to delalloc, clear dirty metadata flags right before + * write_inode() + */ + spin_lock(&inode_lock); + dirty = inode->i_state & I_DIRTY; + inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC); + spin_unlock(&inode_lock); /* Don't write the inode if only I_DIRTY_PAGES was set */ if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { int err = write_inode(inode, wbc); @@ -577,7 +595,7 @@ static enum sb_pin_state pin_sb_for_writeback(struct writeback_control *wbc, /* * Caller must already hold the ref for this */ - if (wbc->sync_mode == WB_SYNC_ALL) { + if (wbc->sync_mode == WB_SYNC_ALL || wbc->sb_pinned) { WARN_ON(!rwsem_is_locked(&sb->s_umount)); return SB_NOT_PINNED; } @@ -751,6 +769,7 @@ static long wb_writeback(struct bdi_writeback *wb, .for_kupdate = args->for_kupdate, .for_background = args->for_background, .range_cyclic = args->range_cyclic, + .sb_pinned = args->sb_pinned, }; unsigned long oldest_jif; long wrote = 0; @@ -852,6 +871,12 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb) unsigned long expired; long nr_pages; + /* + * When set to zero, disable periodic writeback + */ + if (!dirty_writeback_interval) + return 0; + expired = wb->last_old_flush + msecs_to_jiffies(dirty_writeback_interval * 10); if (time_before(jiffies, expired)) @@ -887,6 +912,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait) while ((work = get_next_work_item(bdi, wb)) != NULL) { struct wb_writeback_args args = work->args; + int post_clear; /* * Override sync mode, in case we must wait for completion @@ -894,11 +920,13 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait) if (force_wait) work->args.sync_mode = args.sync_mode = WB_SYNC_ALL; + post_clear = WB_SYNC_ALL || args.sb_pinned; + /* * If this isn't a data integrity operation, just notify * that we have seen this work and we are now starting it. */ - if (args.sync_mode == WB_SYNC_NONE) + if (!post_clear) wb_clear_pending(wb, work); wrote += wb_writeback(wb, &args); @@ -907,7 +935,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait) * This is a data integrity writeback, so only do the * notification when we have completed the work. */ - if (args.sync_mode == WB_SYNC_ALL) + if (post_clear) wb_clear_pending(wb, work); } @@ -947,8 +975,17 @@ int bdi_writeback_task(struct bdi_writeback *wb) break; } - wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10); - schedule_timeout_interruptible(wait_jiffies); + if (dirty_writeback_interval) { + wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10); + schedule_timeout_interruptible(wait_jiffies); + } else { + set_current_state(TASK_INTERRUPTIBLE); + if (list_empty_careful(&wb->bdi->work_list) && + !kthread_should_stop()) + schedule(); + __set_current_state(TASK_RUNNING); + } + try_to_freeze(); } @@ -974,7 +1011,7 @@ static void bdi_writeback_all(struct super_block *sb, long nr_pages) if (!bdi_has_dirty_io(bdi)) continue; - bdi_alloc_queue_work(bdi, &args); + bdi_alloc_queue_work(bdi, &args, 0); } rcu_read_unlock(); @@ -1183,6 +1220,18 @@ static void wait_sb_inodes(struct super_block *sb) iput(old_inode); } +static void __writeback_inodes_sb(struct super_block *sb, int sb_locked) +{ + unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY); + unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS); + long nr_to_write; + + nr_to_write = nr_dirty + nr_unstable + + (inodes_stat.nr_inodes - inodes_stat.nr_unused); + + bdi_start_writeback(sb->s_bdi, sb, nr_to_write, sb_locked); +} + /** * writeback_inodes_sb - writeback dirty inodes from given super_block * @sb: the superblock @@ -1194,18 +1243,23 @@ static void wait_sb_inodes(struct super_block *sb) */ void writeback_inodes_sb(struct super_block *sb) { - unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY); - unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS); - long nr_to_write; - - nr_to_write = nr_dirty + nr_unstable + - (inodes_stat.nr_inodes - inodes_stat.nr_unused); - - bdi_start_writeback(sb->s_bdi, sb, nr_to_write); + __writeback_inodes_sb(sb, 0); } EXPORT_SYMBOL(writeback_inodes_sb); /** + * writeback_inodes_sb_locked - writeback dirty inodes from given super_block + * @sb: the superblock + * + * Like writeback_inodes_sb(), except the caller already holds the + * sb umount sem. + */ +void writeback_inodes_sb_locked(struct super_block *sb) +{ + __writeback_inodes_sb(sb, 1); +} + +/** * writeback_inodes_sb_if_idle - start writeback if none underway * @sb: the superblock * diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c index 1e1f286dd70..4a8eb31c533 100644 --- a/fs/fscache/object-list.c +++ b/fs/fscache/object-list.c @@ -103,7 +103,7 @@ static struct fscache_object *fscache_objlist_lookup(loff_t *_pos) /* banners (can't represent line 0 by pos 0 as that would involve * returning a NULL pointer) */ if (pos == 0) - return (struct fscache_object *) ++(*_pos); + return (struct fscache_object *)(long)++(*_pos); if (pos < 3) return (struct fscache_object *)pos; diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index eb7e9423691..e53df5ebb2b 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -18,6 +18,7 @@ #include <linux/slab.h> MODULE_ALIAS_MISCDEV(FUSE_MINOR); +MODULE_ALIAS("devname:fuse"); static struct kmem_cache *fuse_req_cachep; diff --git a/fs/generic_acl.c b/fs/generic_acl.c index fe5df545765..99800e56415 100644 --- a/fs/generic_acl.c +++ b/fs/generic_acl.c @@ -201,7 +201,7 @@ generic_check_acl(struct inode *inode, int mask) return -EAGAIN; } -struct xattr_handler generic_acl_access_handler = { +const struct xattr_handler generic_acl_access_handler = { .prefix = POSIX_ACL_XATTR_ACCESS, .flags = ACL_TYPE_ACCESS, .list = generic_acl_list, @@ -209,7 +209,7 @@ struct xattr_handler generic_acl_access_handler = { .set = generic_acl_set, }; -struct xattr_handler generic_acl_default_handler = { +const struct xattr_handler generic_acl_default_handler = { .prefix = POSIX_ACL_XATTR_DEFAULT, .flags = ACL_TYPE_DEFAULT, .list = generic_acl_list, diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index 87ee309d4c2..48171f4c943 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -236,10 +236,14 @@ static int gfs2_xattr_system_get(struct dentry *dentry, const char *name, void *buffer, size_t size, int xtype) { struct inode *inode = dentry->d_inode; + struct gfs2_sbd *sdp = GFS2_SB(inode); struct posix_acl *acl; int type; int error; + if (!sdp->sd_args.ar_posix_acl) + return -EOPNOTSUPP; + type = gfs2_acl_type(name); if (type < 0) return type; @@ -335,7 +339,7 @@ out: return error; } -struct xattr_handler gfs2_xattr_system_handler = { +const struct xattr_handler gfs2_xattr_system_handler = { .prefix = XATTR_SYSTEM_PREFIX, .flags = GFS2_EATYPE_SYS, .get = gfs2_xattr_system_get, diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h index 9306a2e6620..b522b0cb39e 100644 --- a/fs/gfs2/acl.h +++ b/fs/gfs2/acl.h @@ -19,6 +19,6 @@ extern int gfs2_check_acl(struct inode *inode, int mask); extern int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode); extern int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr); -extern struct xattr_handler gfs2_xattr_system_handler; +extern const struct xattr_handler gfs2_xattr_system_handler; #endif /* __ACL_DOT_H__ */ diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 0c1d0b82dcf..a739a0a4806 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -418,6 +418,7 @@ static int gfs2_jdata_writepages(struct address_space *mapping, static int stuffed_readpage(struct gfs2_inode *ip, struct page *page) { struct buffer_head *dibh; + u64 dsize = i_size_read(&ip->i_inode); void *kaddr; int error; @@ -437,9 +438,10 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page) return error; kaddr = kmap_atomic(page, KM_USER0); - memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), - ip->i_disksize); - memset(kaddr + ip->i_disksize, 0, PAGE_CACHE_SIZE - ip->i_disksize); + if (dsize > (dibh->b_size - sizeof(struct gfs2_dinode))) + dsize = (dibh->b_size - sizeof(struct gfs2_dinode)); + memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize); + memset(kaddr + dsize, 0, PAGE_CACHE_SIZE - dsize); kunmap_atomic(kaddr, KM_USER0); flush_dcache_page(page); brelse(dibh); diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 5e411d5f469..4a48c0f4b40 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -71,11 +71,13 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh, if (!PageUptodate(page)) { void *kaddr = kmap(page); + u64 dsize = i_size_read(inode); + + if (dsize > (dibh->b_size - sizeof(struct gfs2_dinode))) + dsize = dibh->b_size - sizeof(struct gfs2_dinode); - memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), - ip->i_disksize); - memset(kaddr + ip->i_disksize, 0, - PAGE_CACHE_SIZE - ip->i_disksize); + memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize); + memset(kaddr + dsize, 0, PAGE_CACHE_SIZE - dsize); kunmap(page); SetPageUptodate(page); @@ -1038,13 +1040,14 @@ static int trunc_start(struct gfs2_inode *ip, u64 size) goto out; if (gfs2_is_stuffed(ip)) { - ip->i_disksize = size; + u64 dsize = size + sizeof(struct gfs2_inode); ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; gfs2_trans_add_bh(ip->i_gl, dibh, 1); gfs2_dinode_out(ip, dibh->b_data); - gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + size); + if (dsize > dibh->b_size) + dsize = dibh->b_size; + gfs2_buffer_clear_tail(dibh, dsize); error = 1; - } else { if (size & (u64)(sdp->sd_sb.sb_bsize - 1)) error = gfs2_block_truncate_page(ip->i_inode.i_mapping); diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 25fddc100f1..8295c5b5d4a 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -1475,7 +1475,7 @@ struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name) inode = gfs2_inode_lookup(dir->i_sb, be16_to_cpu(dent->de_type), be64_to_cpu(dent->de_inum.no_addr), - be64_to_cpu(dent->de_inum.no_formal_ino), 0); + be64_to_cpu(dent->de_inum.no_formal_ino)); brelse(bh); return inode; } diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c index c22c2117483..dfe237a3f8a 100644 --- a/fs/gfs2/export.c +++ b/fs/gfs2/export.c @@ -168,7 +168,7 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb, if (error) goto fail; - inode = gfs2_inode_lookup(sb, DT_UNKNOWN, inum->no_addr, 0, 0); + inode = gfs2_inode_lookup(sb, DT_UNKNOWN, inum->no_addr, 0); if (IS_ERR(inode)) { error = PTR_ERR(inode); goto fail; diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index e6dd2aec6f8..b20bfcc9fa2 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -218,6 +218,11 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask) if (error) goto out_drop_write; + error = -EACCES; + if (!is_owner_or_cap(inode)) + goto out; + + error = 0; flags = ip->i_diskflags; new_flags = (flags & ~mask) | (reqflags & mask); if ((new_flags ^ flags) == 0) @@ -275,8 +280,10 @@ static int gfs2_set_flags(struct file *filp, u32 __user *ptr) { struct inode *inode = filp->f_path.dentry->d_inode; u32 fsflags, gfsflags; + if (get_user(fsflags, ptr)) return -EFAULT; + gfsflags = fsflags_cvt(fsflags_to_gfs2, fsflags); if (!S_ISDIR(inode->i_mode)) { if (gfsflags & GFS2_DIF_INHERIT_JDATA) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 454d4b4eb36..ddcdbf49353 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -855,6 +855,9 @@ void gfs2_holder_reinit(unsigned int state, unsigned flags, struct gfs2_holder * gh->gh_flags = flags; gh->gh_iflags = 0; gh->gh_ip = (unsigned long)__builtin_return_address(0); + if (gh->gh_owner_pid) + put_pid(gh->gh_owner_pid); + gh->gh_owner_pid = get_pid(task_pid(current)); } /** diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 3aac46f6853..b5d7363b22d 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -439,9 +439,6 @@ struct gfs2_args { struct gfs2_tune { spinlock_t gt_spin; - unsigned int gt_incore_log_blocks; - unsigned int gt_log_flush_secs; - unsigned int gt_logd_secs; unsigned int gt_quota_simul_sync; /* Max quotavals to sync at once */ @@ -462,6 +459,7 @@ enum { SDF_SHUTDOWN = 2, SDF_NOBARRIERS = 3, SDF_NORECOVERY = 4, + SDF_DEMOTE = 5, }; #define GFS2_FSNAME_LEN 256 @@ -618,6 +616,7 @@ struct gfs2_sbd { unsigned int sd_log_commited_databuf; int sd_log_commited_revoke; + atomic_t sd_log_pinned; unsigned int sd_log_num_buf; unsigned int sd_log_num_revoke; unsigned int sd_log_num_rg; @@ -629,15 +628,17 @@ struct gfs2_sbd { struct list_head sd_log_le_databuf; struct list_head sd_log_le_ordered; + atomic_t sd_log_thresh1; + atomic_t sd_log_thresh2; atomic_t sd_log_blks_free; - struct mutex sd_log_reserve_mutex; + wait_queue_head_t sd_log_waitq; + wait_queue_head_t sd_logd_waitq; u64 sd_log_sequence; unsigned int sd_log_head; unsigned int sd_log_tail; int sd_log_idle; - unsigned long sd_log_flush_time; struct rw_semaphore sd_log_flush_lock; atomic_t sd_log_in_flight; wait_queue_head_t sd_log_flush_wait; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index b1bf2694fb2..b5612cbb62a 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -158,7 +158,6 @@ void gfs2_set_iop(struct inode *inode) * @sb: The super block * @no_addr: The inode number * @type: The type of the inode - * @skip_freeing: set this not return an inode if it is currently being freed. * * Returns: A VFS inode, or an error */ @@ -166,17 +165,14 @@ void gfs2_set_iop(struct inode *inode) struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, u64 no_addr, - u64 no_formal_ino, int skip_freeing) + u64 no_formal_ino) { struct inode *inode; struct gfs2_inode *ip; struct gfs2_glock *io_gl; int error; - if (skip_freeing) - inode = gfs2_iget_skip(sb, no_addr); - else - inode = gfs2_iget(sb, no_addr); + inode = gfs2_iget(sb, no_addr); ip = GFS2_I(inode); if (!inode) @@ -234,11 +230,102 @@ fail_glock: fail_iopen: gfs2_glock_put(io_gl); fail_put: + if (inode->i_state & I_NEW) + ip->i_gl->gl_object = NULL; + gfs2_glock_put(ip->i_gl); +fail: + if (inode->i_state & I_NEW) + iget_failed(inode); + else + iput(inode); + return ERR_PTR(error); +} + +/** + * gfs2_process_unlinked_inode - Lookup an unlinked inode for reclamation + * and try to reclaim it by doing iput. + * + * This function assumes no rgrp locks are currently held. + * + * @sb: The super block + * no_addr: The inode number + * + */ + +void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr) +{ + struct gfs2_sbd *sdp; + struct gfs2_inode *ip; + struct gfs2_glock *io_gl; + int error; + struct gfs2_holder gh; + struct inode *inode; + + inode = gfs2_iget_skip(sb, no_addr); + + if (!inode) + return; + + /* If it's not a new inode, someone's using it, so leave it alone. */ + if (!(inode->i_state & I_NEW)) { + iput(inode); + return; + } + + ip = GFS2_I(inode); + sdp = GFS2_SB(inode); + ip->i_no_formal_ino = -1; + + error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl); + if (unlikely(error)) + goto fail; + ip->i_gl->gl_object = ip; + + error = gfs2_glock_get(sdp, no_addr, &gfs2_iopen_glops, CREATE, &io_gl); + if (unlikely(error)) + goto fail_put; + + set_bit(GIF_INVALID, &ip->i_flags); + error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, LM_FLAG_TRY | GL_EXACT, + &ip->i_iopen_gh); + if (unlikely(error)) + goto fail_iopen; + + ip->i_iopen_gh.gh_gl->gl_object = ip; + gfs2_glock_put(io_gl); + + inode->i_mode = DT2IF(DT_UNKNOWN); + + /* + * We must read the inode in order to work out its type in + * this case. Note that this doesn't happen often as we normally + * know the type beforehand. This code path only occurs during + * unlinked inode recovery (where it is safe to do this glock, + * which is not true in the general case). + */ + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, LM_FLAG_TRY, + &gh); + if (unlikely(error)) + goto fail_glock; + + /* Inode is now uptodate */ + gfs2_glock_dq_uninit(&gh); + gfs2_set_iop(inode); + + /* The iput will cause it to be deleted. */ + iput(inode); + return; + +fail_glock: + gfs2_glock_dq(&ip->i_iopen_gh); +fail_iopen: + gfs2_glock_put(io_gl); +fail_put: ip->i_gl->gl_object = NULL; gfs2_glock_put(ip->i_gl); fail: iget_failed(inode); - return ERR_PTR(error); + return; } static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) @@ -862,7 +949,7 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name, goto fail_gunlock2; inode = gfs2_inode_lookup(dir->i_sb, IF2DT(mode), inum.no_addr, - inum.no_formal_ino, 0); + inum.no_formal_ino); if (IS_ERR(inode)) goto fail_gunlock2; diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index c341aaf67ad..300ada3f21d 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -83,8 +83,8 @@ static inline void gfs2_inum_out(const struct gfs2_inode *ip, extern void gfs2_set_iop(struct inode *inode); extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, - u64 no_addr, u64 no_formal_ino, - int skip_freeing); + u64 no_addr, u64 no_formal_ino); +extern void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr); extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr); extern int gfs2_inode_refresh(struct gfs2_inode *ip); diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index e5bf4b59d46..6a857e24f94 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -168,12 +168,11 @@ static int gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai, int fl return list_empty(&ai->ai_ail1_list); } -static void gfs2_ail1_start(struct gfs2_sbd *sdp, int flags) +static void gfs2_ail1_start(struct gfs2_sbd *sdp) { struct list_head *head; u64 sync_gen; - struct list_head *first; - struct gfs2_ail *first_ai, *ai, *tmp; + struct gfs2_ail *ai; int done = 0; gfs2_log_lock(sdp); @@ -184,21 +183,9 @@ static void gfs2_ail1_start(struct gfs2_sbd *sdp, int flags) } sync_gen = sdp->sd_ail_sync_gen++; - first = head->prev; - first_ai = list_entry(first, struct gfs2_ail, ai_list); - first_ai->ai_sync_gen = sync_gen; - gfs2_ail1_start_one(sdp, first_ai); /* This may drop log lock */ - - if (flags & DIO_ALL) - first = NULL; - while(!done) { - if (first && (head->prev != first || - gfs2_ail1_empty_one(sdp, first_ai, 0))) - break; - done = 1; - list_for_each_entry_safe_reverse(ai, tmp, head, ai_list) { + list_for_each_entry_reverse(ai, head, ai_list) { if (ai->ai_sync_gen >= sync_gen) continue; ai->ai_sync_gen = sync_gen; @@ -290,58 +277,57 @@ static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail) * flush time, so we ensure that we have just enough free blocks at all * times to avoid running out during a log flush. * + * We no longer flush the log here, instead we wake up logd to do that + * for us. To avoid the thundering herd and to ensure that we deal fairly + * with queued waiters, we use an exclusive wait. This means that when we + * get woken with enough journal space to get our reservation, we need to + * wake the next waiter on the list. + * * Returns: errno */ int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks) { - unsigned int try = 0; unsigned reserved_blks = 6 * (4096 / sdp->sd_vfs->s_blocksize); + unsigned wanted = blks + reserved_blks; + DEFINE_WAIT(wait); + int did_wait = 0; + unsigned int free_blocks; if (gfs2_assert_warn(sdp, blks) || gfs2_assert_warn(sdp, blks <= sdp->sd_jdesc->jd_blocks)) return -EINVAL; - - mutex_lock(&sdp->sd_log_reserve_mutex); - gfs2_log_lock(sdp); - while(atomic_read(&sdp->sd_log_blks_free) <= (blks + reserved_blks)) { - gfs2_log_unlock(sdp); - gfs2_ail1_empty(sdp, 0); - gfs2_log_flush(sdp, NULL); - - if (try++) - gfs2_ail1_start(sdp, 0); - gfs2_log_lock(sdp); +retry: + free_blocks = atomic_read(&sdp->sd_log_blks_free); + if (unlikely(free_blocks <= wanted)) { + do { + prepare_to_wait_exclusive(&sdp->sd_log_waitq, &wait, + TASK_UNINTERRUPTIBLE); + wake_up(&sdp->sd_logd_waitq); + did_wait = 1; + if (atomic_read(&sdp->sd_log_blks_free) <= wanted) + io_schedule(); + free_blocks = atomic_read(&sdp->sd_log_blks_free); + } while(free_blocks <= wanted); + finish_wait(&sdp->sd_log_waitq, &wait); } - atomic_sub(blks, &sdp->sd_log_blks_free); + if (atomic_cmpxchg(&sdp->sd_log_blks_free, free_blocks, + free_blocks - blks) != free_blocks) + goto retry; trace_gfs2_log_blocks(sdp, -blks); - gfs2_log_unlock(sdp); - mutex_unlock(&sdp->sd_log_reserve_mutex); + + /* + * If we waited, then so might others, wake them up _after_ we get + * our share of the log. + */ + if (unlikely(did_wait)) + wake_up(&sdp->sd_log_waitq); down_read(&sdp->sd_log_flush_lock); return 0; } -/** - * gfs2_log_release - Release a given number of log blocks - * @sdp: The GFS2 superblock - * @blks: The number of blocks - * - */ - -void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks) -{ - - gfs2_log_lock(sdp); - atomic_add(blks, &sdp->sd_log_blks_free); - trace_gfs2_log_blocks(sdp, blks); - gfs2_assert_withdraw(sdp, - atomic_read(&sdp->sd_log_blks_free) <= sdp->sd_jdesc->jd_blocks); - gfs2_log_unlock(sdp); - up_read(&sdp->sd_log_flush_lock); -} - static u64 log_bmap(struct gfs2_sbd *sdp, unsigned int lbn) { struct gfs2_journal_extent *je; @@ -559,11 +545,10 @@ static void log_pull_tail(struct gfs2_sbd *sdp, unsigned int new_tail) ail2_empty(sdp, new_tail); - gfs2_log_lock(sdp); atomic_add(dist, &sdp->sd_log_blks_free); trace_gfs2_log_blocks(sdp, dist); - gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <= sdp->sd_jdesc->jd_blocks); - gfs2_log_unlock(sdp); + gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <= + sdp->sd_jdesc->jd_blocks); sdp->sd_log_tail = new_tail; } @@ -615,6 +600,7 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull) if (buffer_eopnotsupp(bh)) { clear_buffer_eopnotsupp(bh); set_buffer_uptodate(bh); + fs_info(sdp, "barrier sync failed - disabling barriers\n"); set_bit(SDF_NOBARRIERS, &sdp->sd_flags); lock_buffer(bh); skip_barrier: @@ -710,7 +696,7 @@ static void gfs2_ordered_wait(struct gfs2_sbd *sdp) * */ -void __gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl) +void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl) { struct gfs2_ail *ai; @@ -822,6 +808,13 @@ static void buf_lo_incore_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) * @sdp: the filesystem * @tr: the transaction * + * We wake up gfs2_logd if the number of pinned blocks exceed thresh1 + * or the total number of used blocks (pinned blocks plus AIL blocks) + * is greater than thresh2. + * + * At mount time thresh1 is 1/3rd of journal size, thresh2 is 2/3rd of + * journal size. + * * Returns: errno */ @@ -832,10 +825,10 @@ void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) up_read(&sdp->sd_log_flush_lock); - gfs2_log_lock(sdp); - if (sdp->sd_log_num_buf > gfs2_tune_get(sdp, gt_incore_log_blocks)) - wake_up_process(sdp->sd_logd_process); - gfs2_log_unlock(sdp); + if (atomic_read(&sdp->sd_log_pinned) > atomic_read(&sdp->sd_log_thresh1) || + ((sdp->sd_jdesc->jd_blocks - atomic_read(&sdp->sd_log_blks_free)) > + atomic_read(&sdp->sd_log_thresh2))) + wake_up(&sdp->sd_logd_waitq); } /** @@ -882,13 +875,23 @@ void gfs2_meta_syncfs(struct gfs2_sbd *sdp) { gfs2_log_flush(sdp, NULL); for (;;) { - gfs2_ail1_start(sdp, DIO_ALL); + gfs2_ail1_start(sdp); if (gfs2_ail1_empty(sdp, DIO_ALL)) break; msleep(10); } } +static inline int gfs2_jrnl_flush_reqd(struct gfs2_sbd *sdp) +{ + return (atomic_read(&sdp->sd_log_pinned) >= atomic_read(&sdp->sd_log_thresh1)); +} + +static inline int gfs2_ail_flush_reqd(struct gfs2_sbd *sdp) +{ + unsigned int used_blocks = sdp->sd_jdesc->jd_blocks - atomic_read(&sdp->sd_log_blks_free); + return used_blocks >= atomic_read(&sdp->sd_log_thresh2); +} /** * gfs2_logd - Update log tail as Active Items get flushed to in-place blocks @@ -901,28 +904,43 @@ void gfs2_meta_syncfs(struct gfs2_sbd *sdp) int gfs2_logd(void *data) { struct gfs2_sbd *sdp = data; - unsigned long t; - int need_flush; + unsigned long t = 1; + DEFINE_WAIT(wait); + unsigned preflush; while (!kthread_should_stop()) { - /* Advance the log tail */ - t = sdp->sd_log_flush_time + - gfs2_tune_get(sdp, gt_log_flush_secs) * HZ; + preflush = atomic_read(&sdp->sd_log_pinned); + if (gfs2_jrnl_flush_reqd(sdp) || t == 0) { + gfs2_ail1_empty(sdp, DIO_ALL); + gfs2_log_flush(sdp, NULL); + gfs2_ail1_empty(sdp, DIO_ALL); + } - gfs2_ail1_empty(sdp, DIO_ALL); - gfs2_log_lock(sdp); - need_flush = sdp->sd_log_num_buf > gfs2_tune_get(sdp, gt_incore_log_blocks); - gfs2_log_unlock(sdp); - if (need_flush || time_after_eq(jiffies, t)) { + if (gfs2_ail_flush_reqd(sdp)) { + gfs2_ail1_start(sdp); + io_schedule(); + gfs2_ail1_empty(sdp, 0); gfs2_log_flush(sdp, NULL); - sdp->sd_log_flush_time = jiffies; + gfs2_ail1_empty(sdp, DIO_ALL); } + wake_up(&sdp->sd_log_waitq); t = gfs2_tune_get(sdp, gt_logd_secs) * HZ; if (freezing(current)) refrigerator(); - schedule_timeout_interruptible(t); + + do { + prepare_to_wait(&sdp->sd_logd_waitq, &wait, + TASK_UNINTERRUPTIBLE); + if (!gfs2_ail_flush_reqd(sdp) && + !gfs2_jrnl_flush_reqd(sdp) && + !kthread_should_stop()) + t = schedule_timeout(t); + } while(t && !gfs2_ail_flush_reqd(sdp) && + !gfs2_jrnl_flush_reqd(sdp) && + !kthread_should_stop()); + finish_wait(&sdp->sd_logd_waitq, &wait); } return 0; diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h index 7c64510ccfd..0d007f92023 100644 --- a/fs/gfs2/log.h +++ b/fs/gfs2/log.h @@ -47,29 +47,21 @@ static inline void gfs2_log_pointers_init(struct gfs2_sbd *sdp, sdp->sd_log_head = sdp->sd_log_tail = value; } -unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct, +extern unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct, unsigned int ssize); -int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks); -void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks); -void gfs2_log_incr_head(struct gfs2_sbd *sdp); +extern int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks); +extern void gfs2_log_incr_head(struct gfs2_sbd *sdp); -struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp); -struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp, +extern struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp); +extern struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp, struct buffer_head *real); -void __gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl); +extern void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl); +extern void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans); +extern void gfs2_remove_from_ail(struct gfs2_bufdata *bd); -static inline void gfs2_log_flush(struct gfs2_sbd *sbd, struct gfs2_glock *gl) -{ - if (!gl || test_bit(GLF_LFLUSH, &gl->gl_flags)) - __gfs2_log_flush(sbd, gl); -} - -void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans); -void gfs2_remove_from_ail(struct gfs2_bufdata *bd); - -void gfs2_log_shutdown(struct gfs2_sbd *sdp); -void gfs2_meta_syncfs(struct gfs2_sbd *sdp); -int gfs2_logd(void *data); +extern void gfs2_log_shutdown(struct gfs2_sbd *sdp); +extern void gfs2_meta_syncfs(struct gfs2_sbd *sdp); +extern int gfs2_logd(void *data); #endif /* __LOG_DOT_H__ */ diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index adc260fbea9..bf33f822058 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -54,6 +54,7 @@ static void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh) if (bd->bd_ail) list_move(&bd->bd_ail_st_list, &bd->bd_ail->ai_ail2_list); get_bh(bh); + atomic_inc(&sdp->sd_log_pinned); trace_gfs2_pin(bd, 1); } @@ -94,6 +95,7 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh, trace_gfs2_pin(bd, 0); gfs2_log_unlock(sdp); unlock_buffer(bh); + atomic_dec(&sdp->sd_log_pinned); } diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index a88fadc704b..fb2a5f93b7c 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -94,7 +94,7 @@ static int __init init_gfs2_fs(void) if (!gfs2_glock_cachep) goto fail; - gfs2_glock_aspace_cachep = kmem_cache_create("gfs2_glock (aspace)", + gfs2_glock_aspace_cachep = kmem_cache_create("gfs2_glock(aspace)", sizeof(struct gfs2_glock) + sizeof(struct address_space), 0, 0, gfs2_init_gl_aspace_once); diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 0bb12c80937..18176d0b75d 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -34,7 +34,6 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wbc) { - int err; struct buffer_head *bh, *head; int nr_underway = 0; int write_op = (1 << BIO_RW_META) | ((wbc->sync_mode == WB_SYNC_ALL ? @@ -86,11 +85,10 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb } while (bh != head); unlock_page(page); - err = 0; if (nr_underway == 0) end_page_writeback(page); - return err; + return 0; } const struct address_space_operations gfs2_meta_aops = { @@ -313,6 +311,7 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int struct gfs2_bufdata *bd = bh->b_private; if (test_clear_buffer_pinned(bh)) { + atomic_dec(&sdp->sd_log_pinned); list_del_init(&bd->bd_le.le_list); if (meta) { gfs2_assert_warn(sdp, sdp->sd_log_num_buf); diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index c1309ed1c49..3593b3a7290 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -57,8 +57,6 @@ static void gfs2_tune_init(struct gfs2_tune *gt) { spin_lock_init(>->gt_spin); - gt->gt_incore_log_blocks = 1024; - gt->gt_logd_secs = 1; gt->gt_quota_simul_sync = 64; gt->gt_quota_warn_period = 10; gt->gt_quota_scale_num = 1; @@ -101,14 +99,15 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) spin_lock_init(&sdp->sd_trunc_lock); spin_lock_init(&sdp->sd_log_lock); - + atomic_set(&sdp->sd_log_pinned, 0); INIT_LIST_HEAD(&sdp->sd_log_le_buf); INIT_LIST_HEAD(&sdp->sd_log_le_revoke); INIT_LIST_HEAD(&sdp->sd_log_le_rg); INIT_LIST_HEAD(&sdp->sd_log_le_databuf); INIT_LIST_HEAD(&sdp->sd_log_le_ordered); - mutex_init(&sdp->sd_log_reserve_mutex); + init_waitqueue_head(&sdp->sd_log_waitq); + init_waitqueue_head(&sdp->sd_logd_waitq); INIT_LIST_HEAD(&sdp->sd_ail1_list); INIT_LIST_HEAD(&sdp->sd_ail2_list); @@ -487,7 +486,7 @@ static int gfs2_lookup_root(struct super_block *sb, struct dentry **dptr, struct dentry *dentry; struct inode *inode; - inode = gfs2_inode_lookup(sb, DT_DIR, no_addr, 0, 0); + inode = gfs2_inode_lookup(sb, DT_DIR, no_addr, 0); if (IS_ERR(inode)) { fs_err(sdp, "can't read in %s inode: %ld\n", name, PTR_ERR(inode)); return PTR_ERR(inode); @@ -733,6 +732,8 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) if (sdp->sd_args.ar_spectator) { sdp->sd_jdesc = gfs2_jdesc_find(sdp, 0); atomic_set(&sdp->sd_log_blks_free, sdp->sd_jdesc->jd_blocks); + atomic_set(&sdp->sd_log_thresh1, 2*sdp->sd_jdesc->jd_blocks/5); + atomic_set(&sdp->sd_log_thresh2, 4*sdp->sd_jdesc->jd_blocks/5); } else { if (sdp->sd_lockstruct.ls_jid >= gfs2_jindex_size(sdp)) { fs_err(sdp, "can't mount journal #%u\n", @@ -770,6 +771,8 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) goto fail_jinode_gh; } atomic_set(&sdp->sd_log_blks_free, sdp->sd_jdesc->jd_blocks); + atomic_set(&sdp->sd_log_thresh1, 2*sdp->sd_jdesc->jd_blocks/5); + atomic_set(&sdp->sd_log_thresh2, 4*sdp->sd_jdesc->jd_blocks/5); /* Map the extents for this journal's blocks */ map_journal_extents(sdp); @@ -951,8 +954,6 @@ static int init_threads(struct gfs2_sbd *sdp, int undo) if (undo) goto fail_quotad; - sdp->sd_log_flush_time = jiffies; - p = kthread_run(gfs2_logd, sdp, "gfs2_logd"); error = IS_ERR(p); if (error) { @@ -1160,7 +1161,7 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent GFS2_BASIC_BLOCK_SHIFT; sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift; - sdp->sd_tune.gt_log_flush_secs = sdp->sd_args.ar_commit; + sdp->sd_tune.gt_logd_secs = sdp->sd_args.ar_commit; sdp->sd_tune.gt_quota_quantum = sdp->sd_args.ar_quota_quantum; if (sdp->sd_args.ar_statfs_quantum) { sdp->sd_tune.gt_statfs_slow = 0; @@ -1323,7 +1324,7 @@ static int gfs2_get_sb(struct file_system_type *fs_type, int flags, memset(&args, 0, sizeof(args)); args.ar_quota = GFS2_QUOTA_DEFAULT; args.ar_data = GFS2_DATA_DEFAULT; - args.ar_commit = 60; + args.ar_commit = 30; args.ar_statfs_quantum = 30; args.ar_quota_quantum = 60; args.ar_errors = GFS2_ERRORS_DEFAULT; diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 6dbcbad6ab1..49667d68769 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -637,15 +637,40 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, unsigned blocksize, iblock, pos; struct buffer_head *bh, *dibh; struct page *page; - void *kaddr; - struct gfs2_quota *qp; - s64 value; - int err = -EIO; + void *kaddr, *ptr; + struct gfs2_quota q, *qp; + int err, nbytes; u64 size; if (gfs2_is_stuffed(ip)) gfs2_unstuff_dinode(ip, NULL); - + + memset(&q, 0, sizeof(struct gfs2_quota)); + err = gfs2_internal_read(ip, NULL, (char *)&q, &loc, sizeof(q)); + if (err < 0) + return err; + + err = -EIO; + qp = &q; + qp->qu_value = be64_to_cpu(qp->qu_value); + qp->qu_value += change; + qp->qu_value = cpu_to_be64(qp->qu_value); + qd->qd_qb.qb_value = qp->qu_value; + if (fdq) { + if (fdq->d_fieldmask & FS_DQ_BSOFT) { + qp->qu_warn = cpu_to_be64(fdq->d_blk_softlimit); + qd->qd_qb.qb_warn = qp->qu_warn; + } + if (fdq->d_fieldmask & FS_DQ_BHARD) { + qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit); + qd->qd_qb.qb_limit = qp->qu_limit; + } + } + + /* Write the quota into the quota file on disk */ + ptr = qp; + nbytes = sizeof(struct gfs2_quota); +get_a_page: page = grab_cache_page(mapping, index); if (!page) return -ENOMEM; @@ -667,7 +692,12 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, if (!buffer_mapped(bh)) { gfs2_block_map(inode, iblock, bh, 1); if (!buffer_mapped(bh)) - goto unlock; + goto unlock_out; + /* If it's a newly allocated disk block for quota, zero it */ + if (buffer_new(bh)) { + memset(bh->b_data, 0, bh->b_size); + set_buffer_uptodate(bh); + } } if (PageUptodate(page)) @@ -677,32 +707,34 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, ll_rw_block(READ_META, 1, &bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) - goto unlock; + goto unlock_out; } gfs2_trans_add_bh(ip->i_gl, bh, 0); kaddr = kmap_atomic(page, KM_USER0); - qp = kaddr + offset; - value = (s64)be64_to_cpu(qp->qu_value) + change; - qp->qu_value = cpu_to_be64(value); - qd->qd_qb.qb_value = qp->qu_value; - if (fdq) { - if (fdq->d_fieldmask & FS_DQ_BSOFT) { - qp->qu_warn = cpu_to_be64(fdq->d_blk_softlimit); - qd->qd_qb.qb_warn = qp->qu_warn; - } - if (fdq->d_fieldmask & FS_DQ_BHARD) { - qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit); - qd->qd_qb.qb_limit = qp->qu_limit; - } - } + if (offset + sizeof(struct gfs2_quota) > PAGE_CACHE_SIZE) + nbytes = PAGE_CACHE_SIZE - offset; + memcpy(kaddr + offset, ptr, nbytes); flush_dcache_page(page); kunmap_atomic(kaddr, KM_USER0); + unlock_page(page); + page_cache_release(page); + /* If quota straddles page boundary, we need to update the rest of the + * quota at the beginning of the next page */ + if (offset != 0) { /* first page, offset is closer to PAGE_CACHE_SIZE */ + ptr = ptr + nbytes; + nbytes = sizeof(struct gfs2_quota) - nbytes; + offset = 0; + index++; + goto get_a_page; + } + + /* Update the disk inode timestamp and size (if extended) */ err = gfs2_meta_inode_buffer(ip, &dibh); if (err) - goto unlock; + goto out; size = loc + sizeof(struct gfs2_quota); if (size > inode->i_size) { @@ -715,7 +747,9 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, brelse(dibh); mark_inode_dirty(inode); -unlock: +out: + return err; +unlock_out: unlock_page(page); page_cache_release(page); return err; @@ -779,8 +813,10 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) * rgrp since it won't be allocated during the transaction */ al->al_requested = 1; - /* +1 in the end for block requested above for unstuffing */ - blocks = num_qd * data_blocks + RES_DINODE + num_qd + 1; + /* +3 in the end for unstuffing block, inode size update block + * and another block in case quota straddles page boundary and + * two blocks need to be updated instead of 1 */ + blocks = num_qd * data_blocks + RES_DINODE + num_qd + 3; if (nalloc) al->al_requested += nalloc * (data_blocks + ind_blocks); @@ -1418,10 +1454,18 @@ static int gfs2_quota_get_xstate(struct super_block *sb, memset(fqs, 0, sizeof(struct fs_quota_stat)); fqs->qs_version = FS_QSTAT_VERSION; - if (sdp->sd_args.ar_quota == GFS2_QUOTA_ON) - fqs->qs_flags = (XFS_QUOTA_UDQ_ENFD | XFS_QUOTA_GDQ_ENFD); - else if (sdp->sd_args.ar_quota == GFS2_QUOTA_ACCOUNT) - fqs->qs_flags = (XFS_QUOTA_UDQ_ACCT | XFS_QUOTA_GDQ_ACCT); + + switch (sdp->sd_args.ar_quota) { + case GFS2_QUOTA_ON: + fqs->qs_flags |= (XFS_QUOTA_UDQ_ENFD | XFS_QUOTA_GDQ_ENFD); + /*FALLTHRU*/ + case GFS2_QUOTA_ACCOUNT: + fqs->qs_flags |= (XFS_QUOTA_UDQ_ACCT | XFS_QUOTA_GDQ_ACCT); + break; + case GFS2_QUOTA_OFF: + break; + } + if (sdp->sd_quota_inode) { fqs->qs_uquota.qfs_ino = GFS2_I(sdp->sd_quota_inode)->i_no_addr; fqs->qs_uquota.qfs_nblks = sdp->sd_quota_inode->i_blocks; @@ -1432,8 +1476,8 @@ static int gfs2_quota_get_xstate(struct super_block *sb, return 0; } -static int gfs2_xquota_get(struct super_block *sb, int type, qid_t id, - struct fs_disk_quota *fdq) +static int gfs2_get_dqblk(struct super_block *sb, int type, qid_t id, + struct fs_disk_quota *fdq) { struct gfs2_sbd *sdp = sb->s_fs_info; struct gfs2_quota_lvb *qlvb; @@ -1477,8 +1521,8 @@ out: /* GFS2 only supports a subset of the XFS fields */ #define GFS2_FIELDMASK (FS_DQ_BSOFT|FS_DQ_BHARD) -static int gfs2_xquota_set(struct super_block *sb, int type, qid_t id, - struct fs_disk_quota *fdq) +static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id, + struct fs_disk_quota *fdq) { struct gfs2_sbd *sdp = sb->s_fs_info; struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode); @@ -1585,7 +1629,7 @@ out_put: const struct quotactl_ops gfs2_quotactl_ops = { .quota_sync = gfs2_quota_sync, .get_xstate = gfs2_quota_get_xstate, - .get_xquota = gfs2_xquota_get, - .set_xquota = gfs2_xquota_set, + .get_dqblk = gfs2_get_dqblk, + .set_dqblk = gfs2_set_dqblk, }; diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 503b842f3ba..171a744f8e4 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -854,7 +854,8 @@ static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, if ((start + nr_sects) != blk) { rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS, - DISCARD_FL_BARRIER); + BLKDEV_IFL_WAIT | + BLKDEV_IFL_BARRIER); if (rv) goto fail; nr_sects = 0; @@ -869,7 +870,7 @@ start_new_extent: } if (nr_sects) { rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS, - DISCARD_FL_BARRIER); + BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER); if (rv) goto fail; } @@ -948,13 +949,13 @@ static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al) * try_rgrp_unlink - Look for any unlinked, allocated, but unused inodes * @rgd: The rgrp * - * Returns: The inode, if one has been found + * Returns: 0 if no error + * The inode, if one has been found, in inode. */ -static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, - u64 skip) +static u64 try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, + u64 skip) { - struct inode *inode; u32 goal = 0, block; u64 no_addr; struct gfs2_sbd *sdp = rgd->rd_sbd; @@ -979,14 +980,11 @@ static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, if (no_addr == skip) continue; *last_unlinked = no_addr; - inode = gfs2_inode_lookup(rgd->rd_sbd->sd_vfs, DT_UNKNOWN, - no_addr, -1, 1); - if (!IS_ERR(inode)) - return inode; + return no_addr; } rgd->rd_flags &= ~GFS2_RDF_CHECK; - return NULL; + return 0; } /** @@ -1067,11 +1065,12 @@ static void forward_rgrp_set(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd) * Try to acquire rgrp in way which avoids contending with others. * * Returns: errno + * unlinked: the block address of an unlinked block to be reclaimed */ -static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) +static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked, + u64 *last_unlinked) { - struct inode *inode = NULL; struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_rgrpd *rgd, *begin = NULL; struct gfs2_alloc *al = ip->i_alloc; @@ -1080,6 +1079,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) int loops = 0; int error, rg_locked; + *unlinked = 0; rgd = gfs2_blk2rgrpd(sdp, ip->i_goal); while (rgd) { @@ -1096,19 +1096,24 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) case 0: if (try_rgrp_fit(rgd, al)) goto out; - if (rgd->rd_flags & GFS2_RDF_CHECK) - inode = try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr); + /* If the rg came in already locked, there's no + way we can recover from a failed try_rgrp_unlink + because that would require an iput which can only + happen after the rgrp is unlocked. */ + if (!rg_locked && rgd->rd_flags & GFS2_RDF_CHECK) + *unlinked = try_rgrp_unlink(rgd, last_unlinked, + ip->i_no_addr); if (!rg_locked) gfs2_glock_dq_uninit(&al->al_rgd_gh); - if (inode) - return inode; + if (*unlinked) + return -EAGAIN; /* fall through */ case GLR_TRYFAILED: rgd = recent_rgrp_next(rgd); break; default: - return ERR_PTR(error); + return error; } } @@ -1130,12 +1135,13 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) case 0: if (try_rgrp_fit(rgd, al)) goto out; - if (rgd->rd_flags & GFS2_RDF_CHECK) - inode = try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr); + if (!rg_locked && rgd->rd_flags & GFS2_RDF_CHECK) + *unlinked = try_rgrp_unlink(rgd, last_unlinked, + ip->i_no_addr); if (!rg_locked) gfs2_glock_dq_uninit(&al->al_rgd_gh); - if (inode) - return inode; + if (*unlinked) + return -EAGAIN; break; case GLR_TRYFAILED: @@ -1143,7 +1149,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) break; default: - return ERR_PTR(error); + return error; } rgd = gfs2_rgrpd_get_next(rgd); @@ -1152,7 +1158,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) if (rgd == begin) { if (++loops >= 3) - return ERR_PTR(-ENOSPC); + return -ENOSPC; if (!skipped) loops++; flags = 0; @@ -1172,7 +1178,7 @@ out: forward_rgrp_set(sdp, rgd); } - return NULL; + return 0; } /** @@ -1186,9 +1192,8 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_alloc *al = ip->i_alloc; - struct inode *inode; int error = 0; - u64 last_unlinked = NO_BLOCK; + u64 last_unlinked = NO_BLOCK, unlinked; if (gfs2_assert_warn(sdp, al->al_requested)) return -EINVAL; @@ -1204,17 +1209,27 @@ try_again: if (error) return error; - inode = get_local_rgrp(ip, &last_unlinked); - if (inode) { + /* Find an rgrp suitable for allocation. If it encounters any unlinked + dinodes along the way, error will equal -EAGAIN and unlinked will + contains it block address. We then need to look up that inode and + try to free it, and try the allocation again. */ + error = get_local_rgrp(ip, &unlinked, &last_unlinked); + if (error) { if (ip != GFS2_I(sdp->sd_rindex)) gfs2_glock_dq_uninit(&al->al_ri_gh); - if (IS_ERR(inode)) - return PTR_ERR(inode); - iput(inode); + if (error != -EAGAIN) + return error; + + gfs2_process_unlinked_inode(ip->i_inode.i_sb, unlinked); + /* regardless of whether or not gfs2_process_unlinked_inode + was successful, we don't want to repeat it again. */ + last_unlinked = unlinked; gfs2_log_flush(sdp, NULL); + error = 0; + goto try_again; } - + /* no error, so we have the rgrp set in the inode's allocation. */ al->al_file = file; al->al_line = line; diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 50aac606b99..4d1aad38f1b 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1113,7 +1113,7 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data) int error; spin_lock(>->gt_spin); - args.ar_commit = gt->gt_log_flush_secs; + args.ar_commit = gt->gt_logd_secs; args.ar_quota_quantum = gt->gt_quota_quantum; if (gt->gt_statfs_slow) args.ar_statfs_quantum = 0; @@ -1160,7 +1160,7 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data) else clear_bit(SDF_NOBARRIERS, &sdp->sd_flags); spin_lock(>->gt_spin); - gt->gt_log_flush_secs = args.ar_commit; + gt->gt_logd_secs = args.ar_commit; gt->gt_quota_quantum = args.ar_quota_quantum; if (args.ar_statfs_quantum) { gt->gt_statfs_slow = 0; @@ -1305,8 +1305,8 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt) } if (args->ar_discard) seq_printf(s, ",discard"); - val = sdp->sd_tune.gt_log_flush_secs; - if (val != 60) + val = sdp->sd_tune.gt_logd_secs; + if (val != 30) seq_printf(s, ",commit=%d", val); val = sdp->sd_tune.gt_statfs_quantum; if (val != 30) @@ -1334,7 +1334,8 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt) } if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) seq_printf(s, ",nobarrier"); - + if (test_bit(SDF_DEMOTE, &sdp->sd_flags)) + seq_printf(s, ",demote_interface_used"); return 0; } diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h index 3df60f2d84e..a0464680af0 100644 --- a/fs/gfs2/super.h +++ b/fs/gfs2/super.h @@ -54,7 +54,7 @@ extern struct file_system_type gfs2meta_fs_type; extern const struct export_operations gfs2_export_ops; extern const struct super_operations gfs2_super_ops; extern const struct dentry_operations gfs2_dops; -extern struct xattr_handler *gfs2_xattr_handlers[]; +extern const struct xattr_handler *gfs2_xattr_handlers[]; #endif /* __SUPER_DOT_H__ */ diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index 54fd9842599..37f5393e68e 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -232,6 +232,8 @@ static ssize_t demote_rq_store(struct gfs2_sbd *sdp, const char *buf, size_t len glops = gfs2_glops_list[gltype]; if (glops == NULL) return -EINVAL; + if (!test_and_set_bit(SDF_DEMOTE, &sdp->sd_flags)) + fs_info(sdp, "demote interface used\n"); rv = gfs2_glock_get(sdp, glnum, glops, 0, &gl); if (rv) return rv; @@ -468,8 +470,6 @@ static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\ } \ TUNE_ATTR_2(name, name##_store) -TUNE_ATTR(incore_log_blocks, 0); -TUNE_ATTR(log_flush_secs, 0); TUNE_ATTR(quota_warn_period, 0); TUNE_ATTR(quota_quantum, 0); TUNE_ATTR(max_readahead, 0); @@ -481,8 +481,6 @@ TUNE_ATTR(statfs_quantum, 1); TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store); static struct attribute *tune_attrs[] = { - &tune_attr_incore_log_blocks.attr, - &tune_attr_log_flush_secs.attr, &tune_attr_quota_warn_period.attr, &tune_attr_quota_quantum.attr, &tune_attr_max_readahead.attr, diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index 4ef0e9fa354..9ec73a85411 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -23,6 +23,7 @@ #include "meta_io.h" #include "trans.h" #include "util.h" +#include "trace_gfs2.h" int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, unsigned int revokes) @@ -75,6 +76,23 @@ fail_holder_uninit: return error; } +/** + * gfs2_log_release - Release a given number of log blocks + * @sdp: The GFS2 superblock + * @blks: The number of blocks + * + */ + +static void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks) +{ + + atomic_add(blks, &sdp->sd_log_blks_free); + trace_gfs2_log_blocks(sdp, blks); + gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <= + sdp->sd_jdesc->jd_blocks); + up_read(&sdp->sd_log_flush_lock); +} + void gfs2_trans_end(struct gfs2_sbd *sdp) { struct gfs2_trans *tr = current->journal_info; diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index c2ebdf2c01d..82f93da00d1 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -1535,21 +1535,21 @@ out_alloc: return error; } -static struct xattr_handler gfs2_xattr_user_handler = { +static const struct xattr_handler gfs2_xattr_user_handler = { .prefix = XATTR_USER_PREFIX, .flags = GFS2_EATYPE_USR, .get = gfs2_xattr_get, .set = gfs2_xattr_set, }; -static struct xattr_handler gfs2_xattr_security_handler = { +static const struct xattr_handler gfs2_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, .flags = GFS2_EATYPE_SECURITY, .get = gfs2_xattr_get, .set = gfs2_xattr_set, }; -struct xattr_handler *gfs2_xattr_handlers[] = { +const struct xattr_handler *gfs2_xattr_handlers[] = { &gfs2_xattr_user_handler, &gfs2_xattr_security_handler, &gfs2_xattr_system_handler, diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index 5f402367825..764fd1bdca8 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -494,7 +494,7 @@ const struct inode_operations hfsplus_dir_inode_operations = { const struct file_operations hfsplus_dir_operations = { .read = generic_read_dir, .readdir = hfsplus_readdir, - .ioctl = hfsplus_ioctl, + .unlocked_ioctl = hfsplus_ioctl, .llseek = generic_file_llseek, .release = hfsplus_dir_release, }; diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index 5c10d803d9d..6505c30ad96 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -337,8 +337,7 @@ struct inode *hfsplus_new_inode(struct super_block *, int); void hfsplus_delete_inode(struct inode *); /* ioctl.c */ -int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, - unsigned long arg); +long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); int hfsplus_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags); ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name, diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 1bcf597c056..9bbb82924a2 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -285,7 +285,7 @@ static const struct file_operations hfsplus_file_operations = { .fsync = file_fsync, .open = hfsplus_file_open, .release = hfsplus_file_release, - .ioctl = hfsplus_ioctl, + .unlocked_ioctl = hfsplus_ioctl, }; struct inode *hfsplus_new_inode(struct super_block *sb, int mode) diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c index f457d2ca51a..ac405f09902 100644 --- a/fs/hfsplus/ioctl.c +++ b/fs/hfsplus/ioctl.c @@ -17,14 +17,16 @@ #include <linux/mount.h> #include <linux/sched.h> #include <linux/xattr.h> +#include <linux/smp_lock.h> #include <asm/uaccess.h> #include "hfsplus_fs.h" -int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, - unsigned long arg) +long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { + struct inode *inode = filp->f_path.dentry->d_inode; unsigned int flags; + lock_kernel(); switch (cmd) { case HFSPLUS_IOC_EXT2_GETFLAGS: flags = 0; @@ -38,8 +40,10 @@ int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, case HFSPLUS_IOC_EXT2_SETFLAGS: { int err = 0; err = mnt_want_write(filp->f_path.mnt); - if (err) + if (err) { + unlock_kernel(); return err; + } if (!is_owner_or_cap(inode)) { err = -EACCES; @@ -85,9 +89,11 @@ int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, mark_inode_dirty(inode); setflags_out: mnt_drop_write(filp->f_path.mnt); + unlock_kernel(); return err; } default: + unlock_kernel(); return -ENOTTY; } } diff --git a/fs/inode.c b/fs/inode.c index 258ec22bb29..2bee20ae3d6 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -286,11 +286,9 @@ static void init_once(void *foo) */ void __iget(struct inode *inode) { - if (atomic_read(&inode->i_count)) { - atomic_inc(&inode->i_count); + if (atomic_inc_return(&inode->i_count) != 1) return; - } - atomic_inc(&inode->i_count); + if (!(inode->i_state & (I_DIRTY|I_SYNC))) list_move(&inode->i_list, &inode_in_use); inodes_stat.nr_unused--; @@ -1608,3 +1606,23 @@ void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev) inode->i_ino); } EXPORT_SYMBOL(init_special_inode); + +/** + * Init uid,gid,mode for new inode according to posix standards + * @inode: New inode + * @dir: Directory inode + * @mode: mode of the new inode + */ +void inode_init_owner(struct inode *inode, const struct inode *dir, + mode_t mode) +{ + inode->i_uid = current_fsuid(); + if (dir && dir->i_mode & S_ISGID) { + inode->i_gid = dir->i_gid; + if (S_ISDIR(mode)) + mode |= S_ISGID; + } else + inode->i_gid = current_fsgid(); + inode->i_mode = mode; +} +EXPORT_SYMBOL(inode_init_owner); diff --git a/fs/internal.h b/fs/internal.h index 8a03a5447bd..6b706bc60a6 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -87,6 +87,8 @@ extern struct file *get_empty_filp(void); * super.c */ extern int do_remount_sb(struct super_block *, int, void *, int); +extern void __put_super(struct super_block *sb); +extern void put_super(struct super_block *sb); /* * open.c diff --git a/fs/ioctl.c b/fs/ioctl.c index 7faefb4da93..2d140a71386 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -525,15 +525,8 @@ static int ioctl_fsfreeze(struct file *filp) if (sb->s_op->freeze_fs == NULL) return -EOPNOTSUPP; - /* If a blockdevice-backed filesystem isn't specified, return. */ - if (sb->s_bdev == NULL) - return -EINVAL; - /* Freeze */ - sb = freeze_bdev(sb->s_bdev); - if (IS_ERR(sb)) - return PTR_ERR(sb); - return 0; + return freeze_super(sb); } static int ioctl_fsthaw(struct file *filp) @@ -543,12 +536,8 @@ static int ioctl_fsthaw(struct file *filp) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - /* If a blockdevice-backed filesystem isn't specified, return EINVAL. */ - if (sb->s_bdev == NULL) - return -EINVAL; - /* Thaw */ - return thaw_bdev(sb->s_bdev, sb); + return thaw_super(sb); } /* diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index ecb44c94ba8..28a9ddaa0c4 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -786,6 +786,12 @@ wait_for_iobuf: jbd_debug(3, "JBD: commit phase 6\n"); + /* All metadata is written, now write commit record and do cleanup */ + spin_lock(&journal->j_state_lock); + J_ASSERT(commit_transaction->t_state == T_COMMIT); + commit_transaction->t_state = T_COMMIT_RECORD; + spin_unlock(&journal->j_state_lock); + if (journal_write_commit_record(journal, commit_transaction)) err = -EIO; @@ -923,7 +929,7 @@ restart_loop: jbd_debug(3, "JBD: commit phase 8\n"); - J_ASSERT(commit_transaction->t_state == T_COMMIT); + J_ASSERT(commit_transaction->t_state == T_COMMIT_RECORD); commit_transaction->t_state = T_FINISHED; J_ASSERT(commit_transaction == journal->j_committing_transaction); diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index bd224eec9b0..93d1e47647b 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -565,6 +565,38 @@ int log_wait_commit(journal_t *journal, tid_t tid) } /* + * Return 1 if a given transaction has not yet sent barrier request + * connected with a transaction commit. If 0 is returned, transaction + * may or may not have sent the barrier. Used to avoid sending barrier + * twice in common cases. + */ +int journal_trans_will_send_data_barrier(journal_t *journal, tid_t tid) +{ + int ret = 0; + transaction_t *commit_trans; + + if (!(journal->j_flags & JFS_BARRIER)) + return 0; + spin_lock(&journal->j_state_lock); + /* Transaction already committed? */ + if (tid_geq(journal->j_commit_sequence, tid)) + goto out; + /* + * Transaction is being committed and we already proceeded to + * writing commit record? + */ + commit_trans = journal->j_committing_transaction; + if (commit_trans && commit_trans->t_tid == tid && + commit_trans->t_state >= T_COMMIT_RECORD) + goto out; + ret = 1; +out: + spin_unlock(&journal->j_state_lock); + return ret; +} +EXPORT_SYMBOL(journal_trans_will_send_data_barrier); + +/* * Log buffer allocation routines: */ @@ -1157,6 +1189,7 @@ int journal_destroy(journal_t *journal) { int err = 0; + /* Wait for the commit thread to wake up and die. */ journal_kill_thread(journal); diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 30beb11ef92..076d1cc44f9 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -530,7 +530,8 @@ int jbd2_cleanup_journal_tail(journal_t *journal) */ if ((journal->j_fs_dev != journal->j_dev) && (journal->j_flags & JBD2_BARRIER)) - blkdev_issue_flush(journal->j_fs_dev, NULL); + blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL, + BLKDEV_IFL_WAIT); if (!(journal->j_flags & JBD2_ABORT)) jbd2_journal_update_superblock(journal, 1); return 0; diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 671da7fb7ff..75716d3d2be 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -717,7 +717,8 @@ start_journal_io: if (commit_transaction->t_flushed_data_blocks && (journal->j_fs_dev != journal->j_dev) && (journal->j_flags & JBD2_BARRIER)) - blkdev_issue_flush(journal->j_fs_dev, NULL); + blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL, + BLKDEV_IFL_WAIT); /* Done it all: now write the commit record asynchronously. */ if (JBD2_HAS_INCOMPAT_FEATURE(journal, @@ -727,7 +728,8 @@ start_journal_io: if (err) __jbd2_journal_abort_hard(journal); if (journal->j_flags & JBD2_BARRIER) - blkdev_issue_flush(journal->j_dev, NULL); + blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL, + BLKDEV_IFL_WAIT); } err = journal_finish_inode_data_buffers(journal, commit_transaction); diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c index 7cdc3196476..a33aab6b5e6 100644 --- a/fs/jffs2/acl.c +++ b/fs/jffs2/acl.c @@ -419,7 +419,7 @@ static int jffs2_acl_setxattr(struct dentry *dentry, const char *name, return rc; } -struct xattr_handler jffs2_acl_access_xattr_handler = { +const struct xattr_handler jffs2_acl_access_xattr_handler = { .prefix = POSIX_ACL_XATTR_ACCESS, .flags = ACL_TYPE_DEFAULT, .list = jffs2_acl_access_listxattr, @@ -427,7 +427,7 @@ struct xattr_handler jffs2_acl_access_xattr_handler = { .set = jffs2_acl_setxattr, }; -struct xattr_handler jffs2_acl_default_xattr_handler = { +const struct xattr_handler jffs2_acl_default_xattr_handler = { .prefix = POSIX_ACL_XATTR_DEFAULT, .flags = ACL_TYPE_DEFAULT, .list = jffs2_acl_default_listxattr, diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h index f0ba63e3c36..5e42de8d954 100644 --- a/fs/jffs2/acl.h +++ b/fs/jffs2/acl.h @@ -31,8 +31,8 @@ extern int jffs2_acl_chmod(struct inode *); extern int jffs2_init_acl_pre(struct inode *, struct inode *, int *); extern int jffs2_init_acl_post(struct inode *); -extern struct xattr_handler jffs2_acl_access_xattr_handler; -extern struct xattr_handler jffs2_acl_default_xattr_handler; +extern const struct xattr_handler jffs2_acl_access_xattr_handler; +extern const struct xattr_handler jffs2_acl_default_xattr_handler; #else diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c index 3ff50da9478..55f1dde2fa8 100644 --- a/fs/jffs2/background.c +++ b/fs/jffs2/background.c @@ -23,10 +23,9 @@ static int jffs2_garbage_collect_thread(void *); void jffs2_garbage_collect_trigger(struct jffs2_sb_info *c) { - spin_lock(&c->erase_completion_lock); + assert_spin_locked(&c->erase_completion_lock); if (c->gc_task && jffs2_thread_should_wake(c)) send_sig(SIGHUP, c->gc_task, 1); - spin_unlock(&c->erase_completion_lock); } /* This must only ever be called when no GC thread is currently running */ diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c index b47679be118..6286ad9b00f 100644 --- a/fs/jffs2/erase.c +++ b/fs/jffs2/erase.c @@ -103,9 +103,10 @@ static void jffs2_erase_block(struct jffs2_sb_info *c, jffs2_erase_failed(c, jeb, bad_offset); } -void jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count) +int jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count) { struct jffs2_eraseblock *jeb; + int work_done = 0; mutex_lock(&c->erase_free_sem); @@ -121,6 +122,7 @@ void jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count) mutex_unlock(&c->erase_free_sem); jffs2_mark_erased_block(c, jeb); + work_done++; if (!--count) { D1(printk(KERN_DEBUG "Count reached. jffs2_erase_pending_blocks leaving\n")); goto done; @@ -157,6 +159,7 @@ void jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count) mutex_unlock(&c->erase_free_sem); done: D1(printk(KERN_DEBUG "jffs2_erase_pending_blocks completed\n")); + return work_done; } static void jffs2_erase_succeeded(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb) @@ -165,10 +168,11 @@ static void jffs2_erase_succeeded(struct jffs2_sb_info *c, struct jffs2_eraseblo mutex_lock(&c->erase_free_sem); spin_lock(&c->erase_completion_lock); list_move_tail(&jeb->list, &c->erase_complete_list); + /* Wake the GC thread to mark them clean */ + jffs2_garbage_collect_trigger(c); spin_unlock(&c->erase_completion_lock); mutex_unlock(&c->erase_free_sem); - /* Ensure that kupdated calls us again to mark them clean */ - jffs2_erase_pending_trigger(c); + wake_up(&c->erase_wait); } static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, uint32_t bad_offset) @@ -487,9 +491,9 @@ filebad: refile: /* Stick it back on the list from whence it came and come back later */ - jffs2_erase_pending_trigger(c); mutex_lock(&c->erase_free_sem); spin_lock(&c->erase_completion_lock); + jffs2_garbage_collect_trigger(c); list_move(&jeb->list, &c->erase_complete_list); spin_unlock(&c->erase_completion_lock); mutex_unlock(&c->erase_free_sem); diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index 3451a81b214..86e0821fc98 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -313,8 +313,8 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino) case S_IFBLK: case S_IFCHR: /* Read the device numbers from the media */ - if (f->metadata->size != sizeof(jdev.old) && - f->metadata->size != sizeof(jdev.new)) { + if (f->metadata->size != sizeof(jdev.old_id) && + f->metadata->size != sizeof(jdev.new_id)) { printk(KERN_NOTICE "Device node has strange size %d\n", f->metadata->size); goto error_io; } @@ -325,10 +325,10 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino) printk(KERN_NOTICE "Read device numbers for inode %lu failed\n", (unsigned long)inode->i_ino); goto error; } - if (f->metadata->size == sizeof(jdev.old)) - rdev = old_decode_dev(je16_to_cpu(jdev.old)); + if (f->metadata->size == sizeof(jdev.old_id)) + rdev = old_decode_dev(je16_to_cpu(jdev.old_id)); else - rdev = new_decode_dev(je32_to_cpu(jdev.new)); + rdev = new_decode_dev(je32_to_cpu(jdev.new_id)); case S_IFSOCK: case S_IFIFO: diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c index 3b6f2fa12cf..f5e96bd656e 100644 --- a/fs/jffs2/gc.c +++ b/fs/jffs2/gc.c @@ -214,6 +214,19 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c) return ret; } + /* If there are any blocks which need erasing, erase them now */ + if (!list_empty(&c->erase_complete_list) || + !list_empty(&c->erase_pending_list)) { + spin_unlock(&c->erase_completion_lock); + D1(printk(KERN_DEBUG "jffs2_garbage_collect_pass() erasing pending blocks\n")); + if (jffs2_erase_pending_blocks(c, 1)) { + mutex_unlock(&c->alloc_sem); + return 0; + } + D1(printk(KERN_DEBUG "No progress from erasing blocks; doing GC anyway\n")); + spin_lock(&c->erase_completion_lock); + } + /* First, work out which block we're garbage-collecting */ jeb = c->gcblock; @@ -222,7 +235,7 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c) if (!jeb) { /* Couldn't find a free block. But maybe we can just erase one and make 'progress'? */ - if (!list_empty(&c->erase_pending_list)) { + if (c->nr_erasing_blocks) { spin_unlock(&c->erase_completion_lock); mutex_unlock(&c->alloc_sem); return -EAGAIN; @@ -435,7 +448,7 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c) list_add_tail(&c->gcblock->list, &c->erase_pending_list); c->gcblock = NULL; c->nr_erasing_blocks++; - jffs2_erase_pending_trigger(c); + jffs2_garbage_collect_trigger(c); } spin_unlock(&c->erase_completion_lock); diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h index 507ed6ec184..a881a42f19e 100644 --- a/fs/jffs2/nodelist.h +++ b/fs/jffs2/nodelist.h @@ -312,11 +312,11 @@ static inline int jffs2_blocks_use_vmalloc(struct jffs2_sb_info *c) static inline int jffs2_encode_dev(union jffs2_device_node *jdev, dev_t rdev) { if (old_valid_dev(rdev)) { - jdev->old = cpu_to_je16(old_encode_dev(rdev)); - return sizeof(jdev->old); + jdev->old_id = cpu_to_je16(old_encode_dev(rdev)); + return sizeof(jdev->old_id); } else { - jdev->new = cpu_to_je32(new_encode_dev(rdev)); - return sizeof(jdev->new); + jdev->new_id = cpu_to_je32(new_encode_dev(rdev)); + return sizeof(jdev->new_id); } } @@ -464,7 +464,7 @@ int jffs2_scan_dirty_space(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb int jffs2_do_mount_fs(struct jffs2_sb_info *c); /* erase.c */ -void jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count); +int jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count); void jffs2_free_jeb_node_refs(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb); #ifdef CONFIG_JFFS2_FS_WRITEBUFFER diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c index 191359dde4e..694aa5b0350 100644 --- a/fs/jffs2/nodemgmt.c +++ b/fs/jffs2/nodemgmt.c @@ -116,9 +116,21 @@ int jffs2_reserve_space(struct jffs2_sb_info *c, uint32_t minsize, ret = jffs2_garbage_collect_pass(c); - if (ret == -EAGAIN) - jffs2_erase_pending_blocks(c, 1); - else if (ret) + if (ret == -EAGAIN) { + spin_lock(&c->erase_completion_lock); + if (c->nr_erasing_blocks && + list_empty(&c->erase_pending_list) && + list_empty(&c->erase_complete_list)) { + DECLARE_WAITQUEUE(wait, current); + set_current_state(TASK_UNINTERRUPTIBLE); + add_wait_queue(&c->erase_wait, &wait); + D1(printk(KERN_DEBUG "%s waiting for erase to complete\n", __func__)); + spin_unlock(&c->erase_completion_lock); + + schedule(); + } else + spin_unlock(&c->erase_completion_lock); + } else if (ret) return ret; cond_resched(); @@ -217,7 +229,7 @@ static int jffs2_find_nextblock(struct jffs2_sb_info *c) ejeb = list_entry(c->erasable_list.next, struct jffs2_eraseblock, list); list_move_tail(&ejeb->list, &c->erase_pending_list); c->nr_erasing_blocks++; - jffs2_erase_pending_trigger(c); + jffs2_garbage_collect_trigger(c); D1(printk(KERN_DEBUG "jffs2_find_nextblock: Triggering erase of erasable block at 0x%08x\n", ejeb->offset)); } @@ -469,7 +481,9 @@ struct jffs2_raw_node_ref *jffs2_add_physical_node_ref(struct jffs2_sb_info *c, void jffs2_complete_reservation(struct jffs2_sb_info *c) { D1(printk(KERN_DEBUG "jffs2_complete_reservation()\n")); + spin_lock(&c->erase_completion_lock); jffs2_garbage_collect_trigger(c); + spin_unlock(&c->erase_completion_lock); mutex_unlock(&c->alloc_sem); } @@ -611,7 +625,7 @@ void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref D1(printk(KERN_DEBUG "...and adding to erase_pending_list\n")); list_add_tail(&jeb->list, &c->erase_pending_list); c->nr_erasing_blocks++; - jffs2_erase_pending_trigger(c); + jffs2_garbage_collect_trigger(c); } else { /* Sometimes, however, we leave it elsewhere so it doesn't get immediately reused, and we spread the load a bit. */ @@ -732,6 +746,10 @@ int jffs2_thread_should_wake(struct jffs2_sb_info *c) int nr_very_dirty = 0; struct jffs2_eraseblock *jeb; + if (!list_empty(&c->erase_complete_list) || + !list_empty(&c->erase_pending_list)) + return 1; + if (c->unchecked_size) { D1(printk(KERN_DEBUG "jffs2_thread_should_wake(): unchecked_size %d, checked_ino #%d\n", c->unchecked_size, c->checked_ino)); diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h index a7f03b7ebcb..035a767f958 100644 --- a/fs/jffs2/os-linux.h +++ b/fs/jffs2/os-linux.h @@ -140,8 +140,7 @@ void jffs2_nor_wbuf_flash_cleanup(struct jffs2_sb_info *c); #endif /* WRITEBUFFER */ -/* erase.c */ -static inline void jffs2_erase_pending_trigger(struct jffs2_sb_info *c) +static inline void jffs2_dirty_trigger(struct jffs2_sb_info *c) { OFNI_BS_2SFFJ(c)->s_dirt = 1; } diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c index 696686cc206..46f870d1cc3 100644 --- a/fs/jffs2/scan.c +++ b/fs/jffs2/scan.c @@ -260,7 +260,9 @@ int jffs2_scan_medium(struct jffs2_sb_info *c) ret = -EIO; goto out; } - jffs2_erase_pending_trigger(c); + spin_lock(&c->erase_completion_lock); + jffs2_garbage_collect_trigger(c); + spin_unlock(&c->erase_completion_lock); } ret = 0; out: diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c index eaccee05858..239f51216a6 100644 --- a/fs/jffs2/security.c +++ b/fs/jffs2/security.c @@ -77,7 +77,7 @@ static size_t jffs2_security_listxattr(struct dentry *dentry, char *list, return retlen; } -struct xattr_handler jffs2_security_xattr_handler = { +const struct xattr_handler jffs2_security_xattr_handler = { .prefix = XATTR_SECURITY_PREFIX, .list = jffs2_security_listxattr, .set = jffs2_security_setxattr, diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index 9a80e8e595d..511e2d609d1 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -63,8 +63,6 @@ static void jffs2_write_super(struct super_block *sb) if (!(sb->s_flags & MS_RDONLY)) { D1(printk(KERN_DEBUG "jffs2_write_super()\n")); - jffs2_garbage_collect_trigger(c); - jffs2_erase_pending_blocks(c, 0); jffs2_flush_wbuf_gc(c, 0); } diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c index 5ef7bac265e..07ee1546b2f 100644 --- a/fs/jffs2/wbuf.c +++ b/fs/jffs2/wbuf.c @@ -84,7 +84,7 @@ static void jffs2_wbuf_dirties_inode(struct jffs2_sb_info *c, uint32_t ino) struct jffs2_inodirty *new; /* Mark the superblock dirty so that kupdated will flush... */ - jffs2_erase_pending_trigger(c); + jffs2_dirty_trigger(c); if (jffs2_wbuf_pending_for_ino(c, ino)) return; @@ -121,7 +121,7 @@ static inline void jffs2_refile_wbuf_blocks(struct jffs2_sb_info *c) D1(printk(KERN_DEBUG "...and adding to erase_pending_list\n")); list_add_tail(&jeb->list, &c->erase_pending_list); c->nr_erasing_blocks++; - jffs2_erase_pending_trigger(c); + jffs2_garbage_collect_trigger(c); } else { /* Sometimes, however, we leave it elsewhere so it doesn't get immediately reused, and we spread the load a bit. */ @@ -152,7 +152,7 @@ static void jffs2_block_refile(struct jffs2_sb_info *c, struct jffs2_eraseblock D1(printk("Refiling block at %08x to erase_pending_list\n", jeb->offset)); list_add(&jeb->list, &c->erase_pending_list); c->nr_erasing_blocks++; - jffs2_erase_pending_trigger(c); + jffs2_garbage_collect_trigger(c); } if (!jffs2_prealloc_raw_node_refs(c, jeb, 1)) { @@ -543,7 +543,7 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c) D1(printk(KERN_DEBUG "Failing block at %08x is now empty. Moving to erase_pending_list\n", jeb->offset)); list_move(&jeb->list, &c->erase_pending_list); c->nr_erasing_blocks++; - jffs2_erase_pending_trigger(c); + jffs2_garbage_collect_trigger(c); } jffs2_dbg_acct_sanity_check_nolock(c, jeb); diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c index 9e75c62c85d..a2d58c96f1b 100644 --- a/fs/jffs2/xattr.c +++ b/fs/jffs2/xattr.c @@ -904,7 +904,7 @@ struct jffs2_xattr_datum *jffs2_setup_xattr_datum(struct jffs2_sb_info *c, * do_jffs2_setxattr(inode, xprefix, xname, buffer, size, flags) * is an implementation of setxattr handler on jffs2. * -------------------------------------------------- */ -struct xattr_handler *jffs2_xattr_handlers[] = { +const struct xattr_handler *jffs2_xattr_handlers[] = { &jffs2_user_xattr_handler, #ifdef CONFIG_JFFS2_FS_SECURITY &jffs2_security_xattr_handler, @@ -917,8 +917,8 @@ struct xattr_handler *jffs2_xattr_handlers[] = { NULL }; -static struct xattr_handler *xprefix_to_handler(int xprefix) { - struct xattr_handler *ret; +static const struct xattr_handler *xprefix_to_handler(int xprefix) { + const struct xattr_handler *ret; switch (xprefix) { case JFFS2_XPREFIX_USER: @@ -955,7 +955,7 @@ ssize_t jffs2_listxattr(struct dentry *dentry, char *buffer, size_t size) struct jffs2_inode_cache *ic = f->inocache; struct jffs2_xattr_ref *ref, **pref; struct jffs2_xattr_datum *xd; - struct xattr_handler *xhandle; + const struct xattr_handler *xhandle; ssize_t len, rc; int retry = 0; diff --git a/fs/jffs2/xattr.h b/fs/jffs2/xattr.h index 6e3b5ddfb7a..cf4f5759b42 100644 --- a/fs/jffs2/xattr.h +++ b/fs/jffs2/xattr.h @@ -93,9 +93,9 @@ extern int do_jffs2_getxattr(struct inode *inode, int xprefix, const char *xname extern int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname, const char *buffer, size_t size, int flags); -extern struct xattr_handler *jffs2_xattr_handlers[]; -extern struct xattr_handler jffs2_user_xattr_handler; -extern struct xattr_handler jffs2_trusted_xattr_handler; +extern const struct xattr_handler *jffs2_xattr_handlers[]; +extern const struct xattr_handler jffs2_user_xattr_handler; +extern const struct xattr_handler jffs2_trusted_xattr_handler; extern ssize_t jffs2_listxattr(struct dentry *, char *, size_t); #define jffs2_getxattr generic_getxattr @@ -122,7 +122,7 @@ extern ssize_t jffs2_listxattr(struct dentry *, char *, size_t); #ifdef CONFIG_JFFS2_FS_SECURITY extern int jffs2_init_security(struct inode *inode, struct inode *dir); -extern struct xattr_handler jffs2_security_xattr_handler; +extern const struct xattr_handler jffs2_security_xattr_handler; #else #define jffs2_init_security(inode,dir) (0) #endif /* CONFIG_JFFS2_FS_SECURITY */ diff --git a/fs/jffs2/xattr_trusted.c b/fs/jffs2/xattr_trusted.c index 3e5a5e356e0..1c868194c50 100644 --- a/fs/jffs2/xattr_trusted.c +++ b/fs/jffs2/xattr_trusted.c @@ -47,7 +47,7 @@ static size_t jffs2_trusted_listxattr(struct dentry *dentry, char *list, return retlen; } -struct xattr_handler jffs2_trusted_xattr_handler = { +const struct xattr_handler jffs2_trusted_xattr_handler = { .prefix = XATTR_TRUSTED_PREFIX, .list = jffs2_trusted_listxattr, .set = jffs2_trusted_setxattr, diff --git a/fs/jffs2/xattr_user.c b/fs/jffs2/xattr_user.c index 8544af67dff..916b5c96603 100644 --- a/fs/jffs2/xattr_user.c +++ b/fs/jffs2/xattr_user.c @@ -47,7 +47,7 @@ static size_t jffs2_user_listxattr(struct dentry *dentry, char *list, return retlen; } -struct xattr_handler jffs2_user_xattr_handler = { +const struct xattr_handler jffs2_user_xattr_handler = { .prefix = XATTR_USER_PREFIX, .list = jffs2_user_listxattr, .set = jffs2_user_setxattr, diff --git a/fs/jfs/file.c b/fs/jfs/file.c index 14ba982b3f2..85d9ec65922 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -98,7 +98,7 @@ int jfs_setattr(struct dentry *dentry, struct iattr *iattr) if (rc) return rc; - if (iattr->ia_valid & ATTR_SIZE) + if (is_quota_modification(inode, iattr)) dquot_initialize(inode); if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) || (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) { diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c index 829921b6776..2686531e235 100644 --- a/fs/jfs/jfs_inode.c +++ b/fs/jfs/jfs_inode.c @@ -98,14 +98,7 @@ struct inode *ialloc(struct inode *parent, umode_t mode) goto fail_unlock; } - inode->i_uid = current_fsuid(); - if (parent->i_mode & S_ISGID) { - inode->i_gid = parent->i_gid; - if (S_ISDIR(mode)) - mode |= S_ISGID; - } else - inode->i_gid = current_fsgid(); - + inode_init_owner(inode, parent, mode); /* * New inodes need to save sane values on disk when * uid & gid mount options are used @@ -121,7 +114,6 @@ struct inode *ialloc(struct inode *parent, umode_t mode) if (rc) goto fail_drop; - inode->i_mode = mode; /* inherit flags from parent */ jfs_inode->mode2 = JFS_IP(parent)->mode2 & JFS_FL_INHERIT; @@ -134,7 +126,7 @@ struct inode *ialloc(struct inode *parent, umode_t mode) if (S_ISLNK(mode)) jfs_inode->mode2 &= ~(JFS_IMMUTABLE_FL|JFS_APPEND_FL); } - jfs_inode->mode2 |= mode; + jfs_inode->mode2 |= inode->i_mode; inode->i_blocks = 0; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c index 755a92e8daa..f602e230e16 100644 --- a/fs/logfs/inode.c +++ b/fs/logfs/inode.c @@ -358,14 +358,7 @@ struct inode *logfs_new_inode(struct inode *dir, int mode) inode->i_mode = mode; logfs_set_ino_generation(sb, inode); - inode->i_uid = current_fsuid(); - inode->i_gid = current_fsgid(); - if (dir->i_mode & S_ISGID) { - inode->i_gid = dir->i_gid; - if (S_ISDIR(mode)) - inode->i_mode |= S_ISGID; - } - + inode_init_owner(inode, dir, mode); logfs_inode_setops(inode); insert_inode_hash(inode); diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c index 6ac693faae4..482779fe4e7 100644 --- a/fs/minix/bitmap.c +++ b/fs/minix/bitmap.c @@ -221,7 +221,7 @@ void minix_free_inode(struct inode * inode) clear_inode(inode); /* clear in-memory copy */ } -struct inode * minix_new_inode(const struct inode * dir, int * error) +struct inode *minix_new_inode(const struct inode *dir, int mode, int *error) { struct super_block *sb = dir->i_sb; struct minix_sb_info *sbi = minix_sb(sb); @@ -263,8 +263,7 @@ struct inode * minix_new_inode(const struct inode * dir, int * error) iput(inode); return NULL; } - inode->i_uid = current_fsuid(); - inode->i_gid = (dir->i_mode & S_ISGID) ? dir->i_gid : current_fsgid(); + inode_init_owner(inode, dir, mode); inode->i_ino = j; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; inode->i_blocks = 0; diff --git a/fs/minix/minix.h b/fs/minix/minix.h index 9dcf95b4211..111f34ee9e3 100644 --- a/fs/minix/minix.h +++ b/fs/minix/minix.h @@ -46,7 +46,7 @@ struct minix_sb_info { extern struct inode *minix_iget(struct super_block *, unsigned long); extern struct minix_inode * minix_V1_raw_inode(struct super_block *, ino_t, struct buffer_head **); extern struct minix2_inode * minix_V2_raw_inode(struct super_block *, ino_t, struct buffer_head **); -extern struct inode * minix_new_inode(const struct inode * dir, int * error); +extern struct inode * minix_new_inode(const struct inode *, int, int *); extern void minix_free_inode(struct inode * inode); extern unsigned long minix_count_free_inodes(struct minix_sb_info *sbi); extern int minix_new_block(struct inode * inode); diff --git a/fs/minix/namei.c b/fs/minix/namei.c index 32b131cd612..e20ee85955d 100644 --- a/fs/minix/namei.c +++ b/fs/minix/namei.c @@ -46,10 +46,9 @@ static int minix_mknod(struct inode * dir, struct dentry *dentry, int mode, dev_ if (!old_valid_dev(rdev)) return -EINVAL; - inode = minix_new_inode(dir, &error); + inode = minix_new_inode(dir, mode, &error); if (inode) { - inode->i_mode = mode; minix_set_inode(inode, rdev); mark_inode_dirty(inode); error = add_nondir(dentry, inode); @@ -73,11 +72,10 @@ static int minix_symlink(struct inode * dir, struct dentry *dentry, if (i > dir->i_sb->s_blocksize) goto out; - inode = minix_new_inode(dir, &err); + inode = minix_new_inode(dir, S_IFLNK | 0777, &err); if (!inode) goto out; - inode->i_mode = S_IFLNK | 0777; minix_set_inode(inode, 0); err = page_symlink(inode, symname, i); if (err) @@ -117,13 +115,10 @@ static int minix_mkdir(struct inode * dir, struct dentry *dentry, int mode) inode_inc_link_count(dir); - inode = minix_new_inode(dir, &err); + inode = minix_new_inode(dir, mode, &err); if (!inode) goto out_dir; - inode->i_mode = S_IFDIR | mode; - if (dir->i_mode & S_ISGID) - inode->i_mode |= S_ISGID; minix_set_inode(inode, 0); inode_inc_link_count(inode); diff --git a/fs/namei.c b/fs/namei.c index b86b96fe1dc..48e1f60520e 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -523,9 +523,10 @@ static void path_put_conditional(struct path *path, struct nameidata *nd) static inline void path_to_nameidata(struct path *path, struct nameidata *nd) { dput(nd->path.dentry); - if (nd->path.mnt != path->mnt) + if (nd->path.mnt != path->mnt) { mntput(nd->path.mnt); - nd->path.mnt = path->mnt; + nd->path.mnt = path->mnt; + } nd->path.dentry = path->dentry; } diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index 7edfcd4d5e5..92dde6f8d89 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -51,7 +51,7 @@ const struct file_operations ncp_dir_operations = { .read = generic_read_dir, .readdir = ncp_readdir, - .ioctl = ncp_ioctl, + .unlocked_ioctl = ncp_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ncp_compat_ioctl, #endif diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c index 1daabb90e0a..b9387089289 100644 --- a/fs/ncpfs/file.c +++ b/fs/ncpfs/file.c @@ -295,7 +295,7 @@ const struct file_operations ncp_file_operations = .llseek = ncp_remote_llseek, .read = ncp_file_read, .write = ncp_file_write, - .ioctl = ncp_ioctl, + .unlocked_ioctl = ncp_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ncp_compat_ioctl, #endif diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c index 60a5e2864ea..023c03d0207 100644 --- a/fs/ncpfs/ioctl.c +++ b/fs/ncpfs/ioctl.c @@ -20,6 +20,7 @@ #include <linux/smp_lock.h> #include <linux/vmalloc.h> #include <linux/sched.h> +#include <linux/smp_lock.h> #include <linux/ncp_fs.h> @@ -261,9 +262,9 @@ ncp_get_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg) } #endif /* CONFIG_NCPFS_NLS */ -static int __ncp_ioctl(struct inode *inode, struct file *filp, - unsigned int cmd, unsigned long arg) +static long __ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { + struct inode *inode = filp->f_dentry->d_inode; struct ncp_server *server = NCP_SERVER(inode); int result; struct ncp_ioctl_request request; @@ -841,11 +842,11 @@ static int ncp_ioctl_need_write(unsigned int cmd) } } -int ncp_ioctl(struct inode *inode, struct file *filp, - unsigned int cmd, unsigned long arg) +long ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { - int ret; + long ret; + lock_kernel(); if (ncp_ioctl_need_write(cmd)) { /* * inside the ioctl(), any failures which @@ -853,24 +854,28 @@ int ncp_ioctl(struct inode *inode, struct file *filp, * -EACCESS, so it seems consistent to keep * that here. */ - if (mnt_want_write(filp->f_path.mnt)) - return -EACCES; + if (mnt_want_write(filp->f_path.mnt)) { + ret = -EACCES; + goto out; + } } - ret = __ncp_ioctl(inode, filp, cmd, arg); + ret = __ncp_ioctl(filp, cmd, arg); if (ncp_ioctl_need_write(cmd)) mnt_drop_write(filp->f_path.mnt); + +out: + unlock_kernel(); return ret; } #ifdef CONFIG_COMPAT long ncp_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { - struct inode *inode = file->f_path.dentry->d_inode; - int ret; + long ret; lock_kernel(); arg = (unsigned long) compat_ptr(arg); - ret = ncp_ioctl(inode, file, cmd, arg); + ret = ncp_ioctl(file, cmd, arg); unlock_kernel(); return ret; } diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 2f8b1157daa..04214fc5c30 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1060,7 +1060,7 @@ static int nfs_parse_mount_options(char *raw, goto out_nomem; rc = strict_strtoul(string, 10, &option); kfree(string); - if (rc != 0 || option > USHORT_MAX) + if (rc != 0 || option > USHRT_MAX) goto out_invalid_value; mnt->nfs_server.port = option; break; @@ -1181,7 +1181,7 @@ static int nfs_parse_mount_options(char *raw, goto out_nomem; rc = strict_strtoul(string, 10, &option); kfree(string); - if (rc != 0 || option > USHORT_MAX) + if (rc != 0 || option > USHRT_MAX) goto out_invalid_value; mnt->mount_server.port = option; break; diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 7a9ae3254a4..7e26caab2a2 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -44,8 +44,7 @@ #define NFSDDBG_FACILITY NFSDDBG_PROC /* Globals */ -static struct path rec_dir; -static int rec_dir_init = 0; +static struct file *rec_file; static int nfs4_save_creds(const struct cred **original_creds) @@ -117,33 +116,28 @@ out_no_tfm: return status; } -static void -nfsd4_sync_rec_dir(void) -{ - vfs_fsync(NULL, rec_dir.dentry, 0); -} - int nfsd4_create_clid_dir(struct nfs4_client *clp) { const struct cred *original_cred; char *dname = clp->cl_recdir; - struct dentry *dentry; + struct dentry *dir, *dentry; int status; dprintk("NFSD: nfsd4_create_clid_dir for \"%s\"\n", dname); - if (!rec_dir_init || clp->cl_firststate) + if (!rec_file || clp->cl_firststate) return 0; status = nfs4_save_creds(&original_cred); if (status < 0) return status; + dir = rec_file->f_path.dentry; /* lock the parent */ - mutex_lock(&rec_dir.dentry->d_inode->i_mutex); + mutex_lock(&dir->d_inode->i_mutex); - dentry = lookup_one_len(dname, rec_dir.dentry, HEXDIR_LEN-1); + dentry = lookup_one_len(dname, dir, HEXDIR_LEN-1); if (IS_ERR(dentry)) { status = PTR_ERR(dentry); goto out_unlock; @@ -153,18 +147,18 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) dprintk("NFSD: nfsd4_create_clid_dir: DIRECTORY EXISTS\n"); goto out_put; } - status = mnt_want_write(rec_dir.mnt); + status = mnt_want_write(rec_file->f_path.mnt); if (status) goto out_put; - status = vfs_mkdir(rec_dir.dentry->d_inode, dentry, S_IRWXU); - mnt_drop_write(rec_dir.mnt); + status = vfs_mkdir(dir->d_inode, dentry, S_IRWXU); + mnt_drop_write(rec_file->f_path.mnt); out_put: dput(dentry); out_unlock: - mutex_unlock(&rec_dir.dentry->d_inode->i_mutex); + mutex_unlock(&dir->d_inode->i_mutex); if (status == 0) { clp->cl_firststate = 1; - nfsd4_sync_rec_dir(); + vfs_fsync(rec_file, 0); } nfs4_reset_creds(original_cred); dprintk("NFSD: nfsd4_create_clid_dir returns %d\n", status); @@ -206,14 +200,14 @@ nfsd4_list_rec_dir(struct dentry *dir, recdir_func *f) struct dentry *dentry; int status; - if (!rec_dir_init) + if (!rec_file) return 0; status = nfs4_save_creds(&original_cred); if (status < 0) return status; - filp = dentry_open(dget(dir), mntget(rec_dir.mnt), O_RDONLY, + filp = dentry_open(dget(dir), mntget(rec_file->f_path.mnt), O_RDONLY, current_cred()); status = PTR_ERR(filp); if (IS_ERR(filp)) @@ -250,13 +244,14 @@ out: static int nfsd4_unlink_clid_dir(char *name, int namlen) { - struct dentry *dentry; + struct dentry *dir, *dentry; int status; dprintk("NFSD: nfsd4_unlink_clid_dir. name %.*s\n", namlen, name); - mutex_lock_nested(&rec_dir.dentry->d_inode->i_mutex, I_MUTEX_PARENT); - dentry = lookup_one_len(name, rec_dir.dentry, namlen); + dir = rec_file->f_path.dentry; + mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); + dentry = lookup_one_len(name, dir, namlen); if (IS_ERR(dentry)) { status = PTR_ERR(dentry); goto out_unlock; @@ -264,11 +259,11 @@ nfsd4_unlink_clid_dir(char *name, int namlen) status = -ENOENT; if (!dentry->d_inode) goto out; - status = vfs_rmdir(rec_dir.dentry->d_inode, dentry); + status = vfs_rmdir(dir->d_inode, dentry); out: dput(dentry); out_unlock: - mutex_unlock(&rec_dir.dentry->d_inode->i_mutex); + mutex_unlock(&dir->d_inode->i_mutex); return status; } @@ -278,10 +273,10 @@ nfsd4_remove_clid_dir(struct nfs4_client *clp) const struct cred *original_cred; int status; - if (!rec_dir_init || !clp->cl_firststate) + if (!rec_file || !clp->cl_firststate) return; - status = mnt_want_write(rec_dir.mnt); + status = mnt_want_write(rec_file->f_path.mnt); if (status) goto out; clp->cl_firststate = 0; @@ -293,8 +288,8 @@ nfsd4_remove_clid_dir(struct nfs4_client *clp) status = nfsd4_unlink_clid_dir(clp->cl_recdir, HEXDIR_LEN-1); nfs4_reset_creds(original_cred); if (status == 0) - nfsd4_sync_rec_dir(); - mnt_drop_write(rec_dir.mnt); + vfs_fsync(rec_file, 0); + mnt_drop_write(rec_file->f_path.mnt); out: if (status) printk("NFSD: Failed to remove expired client state directory" @@ -323,19 +318,19 @@ void nfsd4_recdir_purge_old(void) { int status; - if (!rec_dir_init) + if (!rec_file) return; - status = mnt_want_write(rec_dir.mnt); + status = mnt_want_write(rec_file->f_path.mnt); if (status) goto out; - status = nfsd4_list_rec_dir(rec_dir.dentry, purge_old); + status = nfsd4_list_rec_dir(rec_file->f_path.dentry, purge_old); if (status == 0) - nfsd4_sync_rec_dir(); - mnt_drop_write(rec_dir.mnt); + vfs_fsync(rec_file, 0); + mnt_drop_write(rec_file->f_path.mnt); out: if (status) printk("nfsd4: failed to purge old clients from recovery" - " directory %s\n", rec_dir.dentry->d_name.name); + " directory %s\n", rec_file->f_path.dentry->d_name.name); } static int @@ -355,10 +350,13 @@ int nfsd4_recdir_load(void) { int status; - status = nfsd4_list_rec_dir(rec_dir.dentry, load_recdir); + if (!rec_file) + return 0; + + status = nfsd4_list_rec_dir(rec_file->f_path.dentry, load_recdir); if (status) printk("nfsd4: failed loading clients from recovery" - " directory %s\n", rec_dir.dentry->d_name.name); + " directory %s\n", rec_file->f_path.dentry->d_name.name); return status; } @@ -375,7 +373,7 @@ nfsd4_init_recdir(char *rec_dirname) printk("NFSD: Using %s as the NFSv4 state recovery directory\n", rec_dirname); - BUG_ON(rec_dir_init); + BUG_ON(rec_file); status = nfs4_save_creds(&original_cred); if (status < 0) { @@ -385,22 +383,21 @@ nfsd4_init_recdir(char *rec_dirname) return; } - status = kern_path(rec_dirname, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, - &rec_dir); - if (status) + rec_file = filp_open(rec_dirname, O_RDONLY | O_DIRECTORY, 0); + if (IS_ERR(rec_file)) { printk("NFSD: unable to find recovery directory %s\n", rec_dirname); + rec_file = NULL; + } - if (!status) - rec_dir_init = 1; nfs4_reset_creds(original_cred); } void nfsd4_shutdown_recdir(void) { - if (!rec_dir_init) + if (!rec_file) return; - rec_dir_init = 0; - path_put(&rec_dir); + fput(rec_file); + rec_file = NULL; } diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index bc3194ea01f..508941c23af 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -998,7 +998,7 @@ static ssize_t __write_ports_addxprt(char *buf) if (sscanf(buf, "%15s %4u", transport, &port) != 2) return -EINVAL; - if (port < 1 || port > USHORT_MAX) + if (port < 1 || port > USHRT_MAX) return -EINVAL; err = nfsd_create_serv(); @@ -1040,7 +1040,7 @@ static ssize_t __write_ports_delxprt(char *buf) if (sscanf(&buf[1], "%15s %4u", transport, &port) != 2) return -EINVAL; - if (port < 1 || port > USHORT_MAX || nfsd_serv == NULL) + if (port < 1 || port > USHRT_MAX || nfsd_serv == NULL) return -EINVAL; xprt = svc_find_xprt(nfsd_serv, transport, AF_UNSPEC, port); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 23c06f77f4c..ebbf3b6b245 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -999,7 +999,7 @@ static int wait_for_concurrent_writes(struct file *file) if (inode->i_state & I_DIRTY) { dprintk("nfsd: write sync %d\n", task_pid_nr(current)); - err = vfs_fsync(file, file->f_path.dentry, 0); + err = vfs_fsync(file, 0); } last_ino = inode->i_ino; last_dev = inode->i_sb->s_dev; @@ -1175,8 +1175,7 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, if (err) goto out; if (EX_ISSYNC(fhp->fh_export)) { - int err2 = vfs_fsync_range(file, file->f_path.dentry, - offset, end, 0); + int err2 = vfs_fsync_range(file, offset, end, 0); if (err2 != -EINVAL) err = nfserrno(err2); diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c index 7cfb87e692d..d7fd696e595 100644 --- a/fs/nilfs2/alloc.c +++ b/fs/nilfs2/alloc.c @@ -31,6 +31,11 @@ #include "alloc.h" +/** + * nilfs_palloc_groups_per_desc_block - get the number of groups that a group + * descriptor block can maintain + * @inode: inode of metadata file using this allocator + */ static inline unsigned long nilfs_palloc_groups_per_desc_block(const struct inode *inode) { @@ -38,12 +43,21 @@ nilfs_palloc_groups_per_desc_block(const struct inode *inode) sizeof(struct nilfs_palloc_group_desc); } +/** + * nilfs_palloc_groups_count - get maximum number of groups + * @inode: inode of metadata file using this allocator + */ static inline unsigned long nilfs_palloc_groups_count(const struct inode *inode) { return 1UL << (BITS_PER_LONG - (inode->i_blkbits + 3 /* log2(8) */)); } +/** + * nilfs_palloc_init_blockgroup - initialize private variables for allocator + * @inode: inode of metadata file using this allocator + * @entry_size: size of the persistent object + */ int nilfs_palloc_init_blockgroup(struct inode *inode, unsigned entry_size) { struct nilfs_mdt_info *mi = NILFS_MDT(inode); @@ -69,6 +83,12 @@ int nilfs_palloc_init_blockgroup(struct inode *inode, unsigned entry_size) return 0; } +/** + * nilfs_palloc_group - get group number and offset from an entry number + * @inode: inode of metadata file using this allocator + * @nr: serial number of the entry (e.g. inode number) + * @offset: pointer to store offset number in the group + */ static unsigned long nilfs_palloc_group(const struct inode *inode, __u64 nr, unsigned long *offset) { @@ -78,6 +98,14 @@ static unsigned long nilfs_palloc_group(const struct inode *inode, __u64 nr, return group; } +/** + * nilfs_palloc_desc_blkoff - get block offset of a group descriptor block + * @inode: inode of metadata file using this allocator + * @group: group number + * + * nilfs_palloc_desc_blkoff() returns block offset of the descriptor + * block which contains a descriptor of the specified group. + */ static unsigned long nilfs_palloc_desc_blkoff(const struct inode *inode, unsigned long group) { @@ -86,6 +114,14 @@ nilfs_palloc_desc_blkoff(const struct inode *inode, unsigned long group) return desc_block * NILFS_MDT(inode)->mi_blocks_per_desc_block; } +/** + * nilfs_palloc_bitmap_blkoff - get block offset of a bitmap block + * @inode: inode of metadata file using this allocator + * @group: group number + * + * nilfs_palloc_bitmap_blkoff() returns block offset of the bitmap + * block used to allocate/deallocate entries in the specified group. + */ static unsigned long nilfs_palloc_bitmap_blkoff(const struct inode *inode, unsigned long group) { @@ -95,6 +131,12 @@ nilfs_palloc_bitmap_blkoff(const struct inode *inode, unsigned long group) desc_offset * NILFS_MDT(inode)->mi_blocks_per_group; } +/** + * nilfs_palloc_group_desc_nfrees - get the number of free entries in a group + * @inode: inode of metadata file using this allocator + * @group: group number + * @desc: pointer to descriptor structure for the group + */ static unsigned long nilfs_palloc_group_desc_nfrees(struct inode *inode, unsigned long group, const struct nilfs_palloc_group_desc *desc) @@ -107,6 +149,13 @@ nilfs_palloc_group_desc_nfrees(struct inode *inode, unsigned long group, return nfree; } +/** + * nilfs_palloc_group_desc_add_entries - adjust count of free entries + * @inode: inode of metadata file using this allocator + * @group: group number + * @desc: pointer to descriptor structure for the group + * @n: delta to be added + */ static void nilfs_palloc_group_desc_add_entries(struct inode *inode, unsigned long group, @@ -118,6 +167,11 @@ nilfs_palloc_group_desc_add_entries(struct inode *inode, spin_unlock(nilfs_mdt_bgl_lock(inode, group)); } +/** + * nilfs_palloc_entry_blkoff - get block offset of an entry block + * @inode: inode of metadata file using this allocator + * @nr: serial number of the entry (e.g. inode number) + */ static unsigned long nilfs_palloc_entry_blkoff(const struct inode *inode, __u64 nr) { @@ -129,6 +183,12 @@ nilfs_palloc_entry_blkoff(const struct inode *inode, __u64 nr) group_offset / NILFS_MDT(inode)->mi_entries_per_block; } +/** + * nilfs_palloc_desc_block_init - initialize buffer of a group descriptor block + * @inode: inode of metadata file + * @bh: buffer head of the buffer to be initialized + * @kaddr: kernel address mapped for the page including the buffer + */ static void nilfs_palloc_desc_block_init(struct inode *inode, struct buffer_head *bh, void *kaddr) { @@ -179,6 +239,13 @@ static int nilfs_palloc_get_block(struct inode *inode, unsigned long blkoff, return ret; } +/** + * nilfs_palloc_get_desc_block - get buffer head of a group descriptor block + * @inode: inode of metadata file using this allocator + * @group: group number + * @create: create flag + * @bhp: pointer to store the resultant buffer head + */ static int nilfs_palloc_get_desc_block(struct inode *inode, unsigned long group, int create, struct buffer_head **bhp) @@ -191,6 +258,13 @@ static int nilfs_palloc_get_desc_block(struct inode *inode, bhp, &cache->prev_desc, &cache->lock); } +/** + * nilfs_palloc_get_bitmap_block - get buffer head of a bitmap block + * @inode: inode of metadata file using this allocator + * @group: group number + * @create: create flag + * @bhp: pointer to store the resultant buffer head + */ static int nilfs_palloc_get_bitmap_block(struct inode *inode, unsigned long group, int create, struct buffer_head **bhp) @@ -203,6 +277,13 @@ static int nilfs_palloc_get_bitmap_block(struct inode *inode, &cache->prev_bitmap, &cache->lock); } +/** + * nilfs_palloc_get_entry_block - get buffer head of an entry block + * @inode: inode of metadata file using this allocator + * @nr: serial number of the entry (e.g. inode number) + * @create: create flag + * @bhp: pointer to store the resultant buffer head + */ int nilfs_palloc_get_entry_block(struct inode *inode, __u64 nr, int create, struct buffer_head **bhp) { @@ -214,6 +295,13 @@ int nilfs_palloc_get_entry_block(struct inode *inode, __u64 nr, &cache->prev_entry, &cache->lock); } +/** + * nilfs_palloc_block_get_group_desc - get kernel address of a group descriptor + * @inode: inode of metadata file using this allocator + * @group: group number + * @bh: buffer head of the buffer storing the group descriptor block + * @kaddr: kernel address mapped for the page including the buffer + */ static struct nilfs_palloc_group_desc * nilfs_palloc_block_get_group_desc(const struct inode *inode, unsigned long group, @@ -223,6 +311,13 @@ nilfs_palloc_block_get_group_desc(const struct inode *inode, group % nilfs_palloc_groups_per_desc_block(inode); } +/** + * nilfs_palloc_block_get_entry - get kernel address of an entry + * @inode: inode of metadata file using this allocator + * @nr: serial number of the entry (e.g. inode number) + * @bh: buffer head of the buffer storing the entry block + * @kaddr: kernel address mapped for the page including the buffer + */ void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr, const struct buffer_head *bh, void *kaddr) { @@ -235,11 +330,19 @@ void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr, entry_offset * NILFS_MDT(inode)->mi_entry_size; } +/** + * nilfs_palloc_find_available_slot - find available slot in a group + * @inode: inode of metadata file using this allocator + * @group: group number + * @target: offset number of an entry in the group (start point) + * @bitmap: bitmap of the group + * @bsize: size in bits + */ static int nilfs_palloc_find_available_slot(struct inode *inode, unsigned long group, unsigned long target, unsigned char *bitmap, - int bsize) /* size in bits */ + int bsize) { int curr, pos, end, i; @@ -277,6 +380,13 @@ static int nilfs_palloc_find_available_slot(struct inode *inode, return -ENOSPC; } +/** + * nilfs_palloc_rest_groups_in_desc_block - get the remaining number of groups + * in a group descriptor block + * @inode: inode of metadata file using this allocator + * @curr: current group number + * @max: maximum number of groups + */ static unsigned long nilfs_palloc_rest_groups_in_desc_block(const struct inode *inode, unsigned long curr, unsigned long max) @@ -287,6 +397,11 @@ nilfs_palloc_rest_groups_in_desc_block(const struct inode *inode, max - curr + 1); } +/** + * nilfs_palloc_prepare_alloc_entry - prepare to allocate a persistent object + * @inode: inode of metadata file using this allocator + * @req: nilfs_palloc_req structure exchanged for the allocation + */ int nilfs_palloc_prepare_alloc_entry(struct inode *inode, struct nilfs_palloc_req *req) { @@ -366,6 +481,11 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode, return ret; } +/** + * nilfs_palloc_commit_alloc_entry - finish allocation of a persistent object + * @inode: inode of metadata file using this allocator + * @req: nilfs_palloc_req structure exchanged for the allocation + */ void nilfs_palloc_commit_alloc_entry(struct inode *inode, struct nilfs_palloc_req *req) { @@ -377,6 +497,11 @@ void nilfs_palloc_commit_alloc_entry(struct inode *inode, brelse(req->pr_desc_bh); } +/** + * nilfs_palloc_commit_free_entry - finish deallocating a persistent object + * @inode: inode of metadata file using this allocator + * @req: nilfs_palloc_req structure exchanged for the removal + */ void nilfs_palloc_commit_free_entry(struct inode *inode, struct nilfs_palloc_req *req) { @@ -410,6 +535,11 @@ void nilfs_palloc_commit_free_entry(struct inode *inode, brelse(req->pr_desc_bh); } +/** + * nilfs_palloc_abort_alloc_entry - cancel allocation of a persistent object + * @inode: inode of metadata file using this allocator + * @req: nilfs_palloc_req structure exchanged for the allocation + */ void nilfs_palloc_abort_alloc_entry(struct inode *inode, struct nilfs_palloc_req *req) { @@ -442,6 +572,11 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode, req->pr_desc_bh = NULL; } +/** + * nilfs_palloc_prepare_free_entry - prepare to deallocate a persistent object + * @inode: inode of metadata file using this allocator + * @req: nilfs_palloc_req structure exchanged for the removal + */ int nilfs_palloc_prepare_free_entry(struct inode *inode, struct nilfs_palloc_req *req) { @@ -464,6 +599,11 @@ int nilfs_palloc_prepare_free_entry(struct inode *inode, return 0; } +/** + * nilfs_palloc_abort_free_entry - cancel deallocating a persistent object + * @inode: inode of metadata file using this allocator + * @req: nilfs_palloc_req structure exchanged for the removal + */ void nilfs_palloc_abort_free_entry(struct inode *inode, struct nilfs_palloc_req *req) { @@ -475,6 +615,12 @@ void nilfs_palloc_abort_free_entry(struct inode *inode, req->pr_desc_bh = NULL; } +/** + * nilfs_palloc_group_is_in - judge if an entry is in a group + * @inode: inode of metadata file using this allocator + * @group: group number + * @nr: serial number of the entry (e.g. inode number) + */ static int nilfs_palloc_group_is_in(struct inode *inode, unsigned long group, __u64 nr) { @@ -485,6 +631,12 @@ nilfs_palloc_group_is_in(struct inode *inode, unsigned long group, __u64 nr) return (nr >= first) && (nr <= last); } +/** + * nilfs_palloc_freev - deallocate a set of persistent objects + * @inode: inode of metadata file using this allocator + * @entry_nrs: array of entry numbers to be deallocated + * @nitems: number of entries stored in @entry_nrs + */ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) { struct buffer_head *desc_bh, *bitmap_bh; diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h index 5cccf874d69..9af34a7e6e1 100644 --- a/fs/nilfs2/alloc.h +++ b/fs/nilfs2/alloc.h @@ -29,6 +29,13 @@ #include <linux/buffer_head.h> #include <linux/fs.h> +/** + * nilfs_palloc_entries_per_group - get the number of entries per group + * @inode: inode of metadata file using this allocator + * + * The number of entries per group is defined by the number of bits + * that a bitmap block can maintain. + */ static inline unsigned long nilfs_palloc_entries_per_group(const struct inode *inode) { diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index 76c38e3e19d..b27a342c5af 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -31,63 +31,16 @@ #include "alloc.h" #include "dat.h" -/** - * struct nilfs_btree_path - A path on which B-tree operations are executed - * @bp_bh: buffer head of node block - * @bp_sib_bh: buffer head of sibling node block - * @bp_index: index of child node - * @bp_oldreq: ptr end request for old ptr - * @bp_newreq: ptr alloc request for new ptr - * @bp_op: rebalance operation - */ -struct nilfs_btree_path { - struct buffer_head *bp_bh; - struct buffer_head *bp_sib_bh; - int bp_index; - union nilfs_bmap_ptr_req bp_oldreq; - union nilfs_bmap_ptr_req bp_newreq; - struct nilfs_btnode_chkey_ctxt bp_ctxt; - void (*bp_op)(struct nilfs_btree *, struct nilfs_btree_path *, - int, __u64 *, __u64 *); -}; - -/* - * B-tree path operations - */ - -static struct kmem_cache *nilfs_btree_path_cache; - -int __init nilfs_btree_path_cache_init(void) -{ - nilfs_btree_path_cache = - kmem_cache_create("nilfs2_btree_path_cache", - sizeof(struct nilfs_btree_path) * - NILFS_BTREE_LEVEL_MAX, 0, 0, NULL); - return (nilfs_btree_path_cache != NULL) ? 0 : -ENOMEM; -} - -void nilfs_btree_path_cache_destroy(void) -{ - kmem_cache_destroy(nilfs_btree_path_cache); -} - -static inline struct nilfs_btree_path *nilfs_btree_alloc_path(void) -{ - return kmem_cache_alloc(nilfs_btree_path_cache, GFP_NOFS); -} - -static inline void nilfs_btree_free_path(struct nilfs_btree_path *path) +static struct nilfs_btree_path *nilfs_btree_alloc_path(void) { - kmem_cache_free(nilfs_btree_path_cache, path); -} + struct nilfs_btree_path *path; + int level = NILFS_BTREE_LEVEL_DATA; -static void nilfs_btree_init_path(struct nilfs_btree_path *path) -{ - int level; + path = kmem_cache_alloc(nilfs_btree_path_cache, GFP_NOFS); + if (path == NULL) + goto out; - for (level = NILFS_BTREE_LEVEL_DATA; - level < NILFS_BTREE_LEVEL_MAX; - level++) { + for (; level < NILFS_BTREE_LEVEL_MAX; level++) { path[level].bp_bh = NULL; path[level].bp_sib_bh = NULL; path[level].bp_index = 0; @@ -95,15 +48,19 @@ static void nilfs_btree_init_path(struct nilfs_btree_path *path) path[level].bp_newreq.bpr_ptr = NILFS_BMAP_INVALID_PTR; path[level].bp_op = NULL; } + +out: + return path; } -static void nilfs_btree_release_path(struct nilfs_btree_path *path) +static void nilfs_btree_free_path(struct nilfs_btree_path *path) { - int level; + int level = NILFS_BTREE_LEVEL_DATA; - for (level = NILFS_BTREE_LEVEL_DATA; level < NILFS_BTREE_LEVEL_MAX; - level++) + for (; level < NILFS_BTREE_LEVEL_MAX; level++) brelse(path[level].bp_bh); + + kmem_cache_free(nilfs_btree_path_cache, path); } /* @@ -566,14 +523,12 @@ static int nilfs_btree_lookup(const struct nilfs_bmap *bmap, path = nilfs_btree_alloc_path(); if (path == NULL) return -ENOMEM; - nilfs_btree_init_path(path); ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level); if (ptrp != NULL) *ptrp = ptr; - nilfs_btree_release_path(path); nilfs_btree_free_path(path); return ret; @@ -594,7 +549,7 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap, path = nilfs_btree_alloc_path(); if (path == NULL) return -ENOMEM; - nilfs_btree_init_path(path); + ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level); if (ret < 0) goto out; @@ -655,7 +610,6 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap, *ptrp = ptr; ret = cnt; out: - nilfs_btree_release_path(path); nilfs_btree_free_path(path); return ret; } @@ -1123,7 +1077,6 @@ static int nilfs_btree_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) path = nilfs_btree_alloc_path(); if (path == NULL) return -ENOMEM; - nilfs_btree_init_path(path); ret = nilfs_btree_do_lookup(btree, path, key, NULL, NILFS_BTREE_LEVEL_NODE_MIN); @@ -1140,7 +1093,6 @@ static int nilfs_btree_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) nilfs_bmap_add_blocks(bmap, stats.bs_nblocks); out: - nilfs_btree_release_path(path); nilfs_btree_free_path(path); return ret; } @@ -1456,7 +1408,7 @@ static int nilfs_btree_delete(struct nilfs_bmap *bmap, __u64 key) path = nilfs_btree_alloc_path(); if (path == NULL) return -ENOMEM; - nilfs_btree_init_path(path); + ret = nilfs_btree_do_lookup(btree, path, key, NULL, NILFS_BTREE_LEVEL_NODE_MIN); if (ret < 0) @@ -1473,7 +1425,6 @@ static int nilfs_btree_delete(struct nilfs_bmap *bmap, __u64 key) nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks); out: - nilfs_btree_release_path(path); nilfs_btree_free_path(path); return ret; } @@ -1488,11 +1439,9 @@ static int nilfs_btree_last_key(const struct nilfs_bmap *bmap, __u64 *keyp) path = nilfs_btree_alloc_path(); if (path == NULL) return -ENOMEM; - nilfs_btree_init_path(path); ret = nilfs_btree_do_lookup_last(btree, path, keyp, NULL); - nilfs_btree_release_path(path); nilfs_btree_free_path(path); return ret; @@ -1923,7 +1872,6 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap, path = nilfs_btree_alloc_path(); if (path == NULL) return -ENOMEM; - nilfs_btree_init_path(path); if (buffer_nilfs_node(bh)) { node = (struct nilfs_btree_node *)bh->b_data; @@ -1947,7 +1895,6 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap, nilfs_btree_propagate_p(btree, path, level, bh); out: - nilfs_btree_release_path(path); nilfs_btree_free_path(path); return ret; @@ -2108,7 +2055,6 @@ static int nilfs_btree_assign(struct nilfs_bmap *bmap, path = nilfs_btree_alloc_path(); if (path == NULL) return -ENOMEM; - nilfs_btree_init_path(path); if (buffer_nilfs_node(*bh)) { node = (struct nilfs_btree_node *)(*bh)->b_data; @@ -2130,7 +2076,6 @@ static int nilfs_btree_assign(struct nilfs_bmap *bmap, nilfs_btree_assign_p(btree, path, level, bh, blocknr, binfo); out: - nilfs_btree_release_path(path); nilfs_btree_free_path(path); return ret; @@ -2175,7 +2120,6 @@ static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level) path = nilfs_btree_alloc_path(); if (path == NULL) return -ENOMEM; - nilfs_btree_init_path(path); ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level + 1); if (ret < 0) { @@ -2195,7 +2139,6 @@ static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level) nilfs_bmap_set_dirty(&btree->bt_bmap); out: - nilfs_btree_release_path(path); nilfs_btree_free_path(path); return ret; } diff --git a/fs/nilfs2/btree.h b/fs/nilfs2/btree.h index 4b82d84ade7..af638d59e3b 100644 --- a/fs/nilfs2/btree.h +++ b/fs/nilfs2/btree.h @@ -30,9 +30,6 @@ #include "btnode.h" #include "bmap.h" -struct nilfs_btree; -struct nilfs_btree_path; - /** * struct nilfs_btree - B-tree structure * @bt_bmap: bmap base structure @@ -41,6 +38,25 @@ struct nilfs_btree { struct nilfs_bmap bt_bmap; }; +/** + * struct nilfs_btree_path - A path on which B-tree operations are executed + * @bp_bh: buffer head of node block + * @bp_sib_bh: buffer head of sibling node block + * @bp_index: index of child node + * @bp_oldreq: ptr end request for old ptr + * @bp_newreq: ptr alloc request for new ptr + * @bp_op: rebalance operation + */ +struct nilfs_btree_path { + struct buffer_head *bp_bh; + struct buffer_head *bp_sib_bh; + int bp_index; + union nilfs_bmap_ptr_req bp_oldreq; + union nilfs_bmap_ptr_req bp_newreq; + struct nilfs_btnode_chkey_ctxt bp_ctxt; + void (*bp_op)(struct nilfs_btree *, struct nilfs_btree_path *, + int, __u64 *, __u64 *); +}; #define NILFS_BTREE_ROOT_SIZE NILFS_BMAP_SIZE #define NILFS_BTREE_ROOT_NCHILDREN_MAX \ @@ -57,6 +73,7 @@ struct nilfs_btree { #define NILFS_BTREE_KEY_MIN ((__u64)0) #define NILFS_BTREE_KEY_MAX (~(__u64)0) +extern struct kmem_cache *nilfs_btree_path_cache; int nilfs_btree_path_cache_init(void); void nilfs_btree_path_cache_destroy(void); diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 0957b58f909..39e038ac8fc 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -280,16 +280,7 @@ struct inode *nilfs_new_inode(struct inode *dir, int mode) /* reference count of i_bh inherits from nilfs_mdt_read_block() */ atomic_inc(&sbi->s_inodes_count); - - inode->i_uid = current_fsuid(); - if (dir->i_mode & S_ISGID) { - inode->i_gid = dir->i_gid; - if (S_ISDIR(mode)) - mode |= S_ISGID; - } else - inode->i_gid = current_fsgid(); - - inode->i_mode = mode; + inode_init_owner(inode, dir, mode); inode->i_ino = ino; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; @@ -451,7 +442,7 @@ static int __nilfs_read_inode(struct super_block *sb, unsigned long ino, inode->i_op = &nilfs_special_inode_operations; init_special_inode( inode, inode->i_mode, - new_decode_dev(le64_to_cpu(raw_inode->i_device_code))); + huge_decode_dev(le64_to_cpu(raw_inode->i_device_code))); } nilfs_ifile_unmap_inode(sbi->s_ifile, ino, bh); brelse(bh); @@ -511,7 +502,7 @@ void nilfs_write_inode_common(struct inode *inode, nilfs_bmap_write(ii->i_bmap, raw_inode); else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) raw_inode->i_device_code = - cpu_to_le64(new_encode_dev(inode->i_rdev)); + cpu_to_le64(huge_encode_dev(inode->i_rdev)); /* When extending inode, nilfs->ns_inode_size should be checked for substitutions of appended fields */ } diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c index ba43146f3c3..bae2a516b4e 100644 --- a/fs/nilfs2/recovery.c +++ b/fs/nilfs2/recovery.c @@ -105,6 +105,8 @@ static void store_segsum_info(struct nilfs_segsum_info *ssi, ssi->nsumblk = DIV_ROUND_UP(ssi->sumbytes, blocksize); ssi->nfileblk = ssi->nblocks - ssi->nsumblk - !!NILFS_SEG_HAS_SR(ssi); + + /* need to verify ->ss_bytes field if read ->ss_cno */ } /** diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index 17851f77f73..2e6a2723b8f 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -40,35 +40,10 @@ struct nilfs_write_info { sector_t blocknr; }; - static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf, struct the_nilfs *nilfs); static int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf); - -static struct kmem_cache *nilfs_segbuf_cachep; - -static void nilfs_segbuf_init_once(void *obj) -{ - memset(obj, 0, sizeof(struct nilfs_segment_buffer)); -} - -int __init nilfs_init_segbuf_cache(void) -{ - nilfs_segbuf_cachep = - kmem_cache_create("nilfs2_segbuf_cache", - sizeof(struct nilfs_segment_buffer), - 0, SLAB_RECLAIM_ACCOUNT, - nilfs_segbuf_init_once); - - return (nilfs_segbuf_cachep == NULL) ? -ENOMEM : 0; -} - -void nilfs_destroy_segbuf_cache(void) -{ - kmem_cache_destroy(nilfs_segbuf_cachep); -} - struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *sb) { struct nilfs_segment_buffer *segbuf; @@ -81,6 +56,7 @@ struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *sb) INIT_LIST_HEAD(&segbuf->sb_list); INIT_LIST_HEAD(&segbuf->sb_segsum_buffers); INIT_LIST_HEAD(&segbuf->sb_payload_buffers); + segbuf->sb_super_root = NULL; init_completion(&segbuf->sb_bio_event); atomic_set(&segbuf->sb_err, 0); @@ -158,7 +134,7 @@ int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *segbuf, } int nilfs_segbuf_reset(struct nilfs_segment_buffer *segbuf, unsigned flags, - time_t ctime) + time_t ctime, __u64 cno) { int err; @@ -171,6 +147,7 @@ int nilfs_segbuf_reset(struct nilfs_segment_buffer *segbuf, unsigned flags, segbuf->sb_sum.sumbytes = sizeof(struct nilfs_segment_summary); segbuf->sb_sum.nfinfo = segbuf->sb_sum.nfileblk = 0; segbuf->sb_sum.ctime = ctime; + segbuf->sb_sum.cno = cno; return 0; } @@ -196,13 +173,14 @@ void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *segbuf) raw_sum->ss_nfinfo = cpu_to_le32(segbuf->sb_sum.nfinfo); raw_sum->ss_sumbytes = cpu_to_le32(segbuf->sb_sum.sumbytes); raw_sum->ss_pad = 0; + raw_sum->ss_cno = cpu_to_le64(segbuf->sb_sum.cno); } /* * CRC calculation routines */ -void nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *segbuf, - u32 seed) +static void +nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *segbuf, u32 seed) { struct buffer_head *bh; struct nilfs_segment_summary *raw_sum; @@ -229,8 +207,8 @@ void nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *segbuf, raw_sum->ss_sumsum = cpu_to_le32(crc); } -void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *segbuf, - u32 seed) +static void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *segbuf, + u32 seed) { struct buffer_head *bh; struct nilfs_segment_summary *raw_sum; @@ -256,6 +234,20 @@ void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *segbuf, raw_sum->ss_datasum = cpu_to_le32(crc); } +static void +nilfs_segbuf_fill_in_super_root_crc(struct nilfs_segment_buffer *segbuf, + u32 seed) +{ + struct nilfs_super_root *raw_sr; + u32 crc; + + raw_sr = (struct nilfs_super_root *)segbuf->sb_super_root->b_data; + crc = crc32_le(seed, + (unsigned char *)raw_sr + sizeof(raw_sr->sr_sum), + NILFS_SR_BYTES - sizeof(raw_sr->sr_sum)); + raw_sr->sr_sum = cpu_to_le32(crc); +} + static void nilfs_release_buffers(struct list_head *list) { struct buffer_head *bh, *n; @@ -282,6 +274,7 @@ static void nilfs_segbuf_clear(struct nilfs_segment_buffer *segbuf) { nilfs_release_buffers(&segbuf->sb_segsum_buffers); nilfs_release_buffers(&segbuf->sb_payload_buffers); + segbuf->sb_super_root = NULL; } /* @@ -334,6 +327,23 @@ int nilfs_wait_on_logs(struct list_head *logs) return ret; } +/** + * nilfs_add_checksums_on_logs - add checksums on the logs + * @logs: list of segment buffers storing target logs + * @seed: checksum seed value + */ +void nilfs_add_checksums_on_logs(struct list_head *logs, u32 seed) +{ + struct nilfs_segment_buffer *segbuf; + + list_for_each_entry(segbuf, logs, sb_list) { + if (segbuf->sb_super_root) + nilfs_segbuf_fill_in_super_root_crc(segbuf, seed); + nilfs_segbuf_fill_in_segsum_crc(segbuf, seed); + nilfs_segbuf_fill_in_data_crc(segbuf, seed); + } +} + /* * BIO operations */ diff --git a/fs/nilfs2/segbuf.h b/fs/nilfs2/segbuf.h index 94dfd3517bc..fdf1c3b6d67 100644 --- a/fs/nilfs2/segbuf.h +++ b/fs/nilfs2/segbuf.h @@ -37,6 +37,7 @@ * @sumbytes: Byte count of segment summary * @nfileblk: Total number of file blocks * @seg_seq: Segment sequence number + * @cno: Checkpoint number * @ctime: Creation time * @next: Block number of the next full segment */ @@ -48,6 +49,7 @@ struct nilfs_segsum_info { unsigned long sumbytes; unsigned long nfileblk; u64 seg_seq; + __u64 cno; time_t ctime; sector_t next; }; @@ -76,6 +78,7 @@ struct nilfs_segsum_info { * @sb_rest_blocks: Number of residual blocks in the current segment * @sb_segsum_buffers: List of buffers for segment summaries * @sb_payload_buffers: List of buffers for segment payload + * @sb_super_root: Pointer to buffer storing a super root block (if exists) * @sb_nbio: Number of flying bio requests * @sb_err: I/O error status * @sb_bio_event: Completion event of log writing @@ -95,6 +98,7 @@ struct nilfs_segment_buffer { /* Buffers */ struct list_head sb_segsum_buffers; struct list_head sb_payload_buffers; /* including super root */ + struct buffer_head *sb_super_root; /* io status */ int sb_nbio; @@ -121,6 +125,7 @@ struct nilfs_segment_buffer { b_assoc_buffers)) #define NILFS_SEGBUF_BH_IS_LAST(bh, head) ((bh)->b_assoc_buffers.next == head) +extern struct kmem_cache *nilfs_segbuf_cachep; int __init nilfs_init_segbuf_cache(void); void nilfs_destroy_segbuf_cache(void); @@ -132,13 +137,11 @@ void nilfs_segbuf_map_cont(struct nilfs_segment_buffer *segbuf, struct nilfs_segment_buffer *prev); void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *, __u64, struct the_nilfs *); -int nilfs_segbuf_reset(struct nilfs_segment_buffer *, unsigned, time_t); +int nilfs_segbuf_reset(struct nilfs_segment_buffer *, unsigned, time_t, __u64); int nilfs_segbuf_extend_segsum(struct nilfs_segment_buffer *); int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *, struct buffer_head **); void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *); -void nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *, u32); -void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *, u32); static inline void nilfs_segbuf_add_segsum_buffer(struct nilfs_segment_buffer *segbuf, @@ -171,6 +174,7 @@ void nilfs_truncate_logs(struct list_head *logs, struct nilfs_segment_buffer *last); int nilfs_write_logs(struct list_head *logs, struct the_nilfs *nilfs); int nilfs_wait_on_logs(struct list_head *logs); +void nilfs_add_checksums_on_logs(struct list_head *logs, u32 seed); static inline void nilfs_destroy_logs(struct list_head *logs) { diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 6a7dbd8451d..c9201649cc4 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -116,42 +116,6 @@ static void nilfs_dispose_list(struct nilfs_sb_info *, struct list_head *, #define nilfs_cnt32_lt(a, b) nilfs_cnt32_gt(b, a) #define nilfs_cnt32_le(a, b) nilfs_cnt32_ge(b, a) -/* - * Transaction - */ -static struct kmem_cache *nilfs_transaction_cachep; - -/** - * nilfs_init_transaction_cache - create a cache for nilfs_transaction_info - * - * nilfs_init_transaction_cache() creates a slab cache for the struct - * nilfs_transaction_info. - * - * Return Value: On success, it returns 0. On error, one of the following - * negative error code is returned. - * - * %-ENOMEM - Insufficient memory available. - */ -int nilfs_init_transaction_cache(void) -{ - nilfs_transaction_cachep = - kmem_cache_create("nilfs2_transaction_cache", - sizeof(struct nilfs_transaction_info), - 0, SLAB_RECLAIM_ACCOUNT, NULL); - return (nilfs_transaction_cachep == NULL) ? -ENOMEM : 0; -} - -/** - * nilfs_destroy_transaction_cache - destroy the cache for transaction info - * - * nilfs_destroy_transaction_cache() frees the slab cache for the struct - * nilfs_transaction_info. - */ -void nilfs_destroy_transaction_cache(void) -{ - kmem_cache_destroy(nilfs_transaction_cachep); -} - static int nilfs_prepare_segment_lock(struct nilfs_transaction_info *ti) { struct nilfs_transaction_info *cur_ti = current->journal_info; @@ -402,7 +366,8 @@ static int nilfs_segctor_reset_segment_buffer(struct nilfs_sc_info *sci) if (nilfs_doing_gc()) flags = NILFS_SS_GC; - err = nilfs_segbuf_reset(segbuf, flags, sci->sc_seg_ctime); + err = nilfs_segbuf_reset(segbuf, flags, sci->sc_seg_ctime, + sci->sc_sbi->s_nilfs->ns_cno); if (unlikely(err)) return err; @@ -435,7 +400,7 @@ static int nilfs_segctor_add_super_root(struct nilfs_sc_info *sci) return err; segbuf = sci->sc_curseg; } - err = nilfs_segbuf_extend_payload(segbuf, &sci->sc_super_root); + err = nilfs_segbuf_extend_payload(segbuf, &segbuf->sb_super_root); if (likely(!err)) segbuf->sb_sum.flags |= NILFS_SS_SR; return err; @@ -599,7 +564,7 @@ static void nilfs_write_file_node_binfo(struct nilfs_sc_info *sci, *vblocknr = binfo->bi_v.bi_vblocknr; } -struct nilfs_sc_operations nilfs_sc_file_ops = { +static struct nilfs_sc_operations nilfs_sc_file_ops = { .collect_data = nilfs_collect_file_data, .collect_node = nilfs_collect_file_node, .collect_bmap = nilfs_collect_file_bmap, @@ -649,7 +614,7 @@ static void nilfs_write_dat_node_binfo(struct nilfs_sc_info *sci, *binfo_dat = binfo->bi_dat; } -struct nilfs_sc_operations nilfs_sc_dat_ops = { +static struct nilfs_sc_operations nilfs_sc_dat_ops = { .collect_data = nilfs_collect_dat_data, .collect_node = nilfs_collect_file_node, .collect_bmap = nilfs_collect_dat_bmap, @@ -657,7 +622,7 @@ struct nilfs_sc_operations nilfs_sc_dat_ops = { .write_node_binfo = nilfs_write_dat_node_binfo, }; -struct nilfs_sc_operations nilfs_sc_dsync_ops = { +static struct nilfs_sc_operations nilfs_sc_dsync_ops = { .collect_data = nilfs_collect_file_data, .collect_node = NULL, .collect_bmap = NULL, @@ -932,43 +897,16 @@ static void nilfs_segctor_fill_in_file_bmap(struct nilfs_sc_info *sci, } } -/* - * CRC calculation routines - */ -static void nilfs_fill_in_super_root_crc(struct buffer_head *bh_sr, u32 seed) -{ - struct nilfs_super_root *raw_sr = - (struct nilfs_super_root *)bh_sr->b_data; - u32 crc; - - crc = crc32_le(seed, - (unsigned char *)raw_sr + sizeof(raw_sr->sr_sum), - NILFS_SR_BYTES - sizeof(raw_sr->sr_sum)); - raw_sr->sr_sum = cpu_to_le32(crc); -} - -static void nilfs_segctor_fill_in_checksums(struct nilfs_sc_info *sci, - u32 seed) -{ - struct nilfs_segment_buffer *segbuf; - - if (sci->sc_super_root) - nilfs_fill_in_super_root_crc(sci->sc_super_root, seed); - - list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) { - nilfs_segbuf_fill_in_segsum_crc(segbuf, seed); - nilfs_segbuf_fill_in_data_crc(segbuf, seed); - } -} - static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci, struct the_nilfs *nilfs) { - struct buffer_head *bh_sr = sci->sc_super_root; - struct nilfs_super_root *raw_sr = - (struct nilfs_super_root *)bh_sr->b_data; + struct buffer_head *bh_sr; + struct nilfs_super_root *raw_sr; unsigned isz = nilfs->ns_inode_size; + bh_sr = NILFS_LAST_SEGBUF(&sci->sc_segbufs)->sb_super_root; + raw_sr = (struct nilfs_super_root *)bh_sr->b_data; + raw_sr->sr_bytes = cpu_to_le16(NILFS_SR_BYTES); raw_sr->sr_nongc_ctime = cpu_to_le64(nilfs_doing_gc() ? @@ -1491,7 +1429,6 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci, /* Collection retry loop */ for (;;) { - sci->sc_super_root = NULL; sci->sc_nblk_this_inc = 0; sci->sc_curseg = NILFS_FIRST_SEGBUF(&sci->sc_segbufs); @@ -1568,7 +1505,7 @@ nilfs_segctor_update_payload_blocknr(struct nilfs_sc_info *sci, ssp.offset = sizeof(struct nilfs_segment_summary); list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) { - if (bh == sci->sc_super_root) + if (bh == segbuf->sb_super_root) break; if (!finfo) { finfo = nilfs_segctor_map_segsum_entry( @@ -1729,7 +1666,7 @@ static int nilfs_segctor_prepare_write(struct nilfs_sc_info *sci, list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) { - if (bh == sci->sc_super_root) { + if (bh == segbuf->sb_super_root) { if (bh->b_page != bd_page) { lock_page(bd_page); clear_page_dirty_for_io(bd_page); @@ -1848,7 +1785,7 @@ static void nilfs_clear_copied_buffers(struct list_head *list, int err) } static void nilfs_abort_logs(struct list_head *logs, struct page *failed_page, - struct buffer_head *bh_sr, int err) + int err) { struct nilfs_segment_buffer *segbuf; struct page *bd_page = NULL, *fs_page = NULL; @@ -1869,7 +1806,7 @@ static void nilfs_abort_logs(struct list_head *logs, struct page *failed_page, list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) { - if (bh == bh_sr) { + if (bh == segbuf->sb_super_root) { if (bh->b_page != bd_page) { end_page_writeback(bd_page); bd_page = bh->b_page; @@ -1898,7 +1835,7 @@ static void nilfs_segctor_abort_construction(struct nilfs_sc_info *sci, list_splice_tail_init(&sci->sc_write_logs, &logs); ret = nilfs_wait_on_logs(&logs); - nilfs_abort_logs(&logs, NULL, sci->sc_super_root, ret ? : err); + nilfs_abort_logs(&logs, NULL, ret ? : err); list_splice_tail_init(&sci->sc_segbufs, &logs); nilfs_cancel_segusage(&logs, nilfs->ns_sufile); @@ -1914,7 +1851,6 @@ static void nilfs_segctor_abort_construction(struct nilfs_sc_info *sci, } nilfs_destroy_logs(&logs); - sci->sc_super_root = NULL; } static void nilfs_set_next_segment(struct the_nilfs *nilfs, @@ -1933,7 +1869,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci) struct nilfs_segment_buffer *segbuf; struct page *bd_page = NULL, *fs_page = NULL; struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs; - int update_sr = (sci->sc_super_root != NULL); + int update_sr = false; list_for_each_entry(segbuf, &sci->sc_write_logs, sb_list) { struct buffer_head *bh; @@ -1964,11 +1900,12 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci) set_buffer_uptodate(bh); clear_buffer_dirty(bh); clear_buffer_nilfs_volatile(bh); - if (bh == sci->sc_super_root) { + if (bh == segbuf->sb_super_root) { if (bh->b_page != bd_page) { end_page_writeback(bd_page); bd_page = bh->b_page; } + update_sr = true; break; } if (bh->b_page != fs_page) { @@ -2115,7 +2052,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) struct nilfs_sb_info *sbi = sci->sc_sbi; struct the_nilfs *nilfs = sbi->s_nilfs; struct page *failed_page; - int err, has_sr = 0; + int err; sci->sc_stage.scnt = NILFS_ST_INIT; @@ -2143,8 +2080,6 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) if (unlikely(err)) goto failed; - has_sr = (sci->sc_super_root != NULL); - /* Avoid empty segment */ if (sci->sc_stage.scnt == NILFS_ST_DONE && NILFS_SEG_EMPTY(&sci->sc_curseg->sb_sum)) { @@ -2159,7 +2094,8 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) if (sci->sc_stage.flags & NILFS_CF_IFILE_STARTED) nilfs_segctor_fill_in_file_bmap(sci, sbi->s_ifile); - if (has_sr) { + if (mode == SC_LSEG_SR && + sci->sc_stage.scnt >= NILFS_ST_CPFILE) { err = nilfs_segctor_fill_in_checkpoint(sci); if (unlikely(err)) goto failed_to_write; @@ -2171,11 +2107,12 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) /* Write partial segments */ err = nilfs_segctor_prepare_write(sci, &failed_page); if (err) { - nilfs_abort_logs(&sci->sc_segbufs, failed_page, - sci->sc_super_root, err); + nilfs_abort_logs(&sci->sc_segbufs, failed_page, err); goto failed_to_write; } - nilfs_segctor_fill_in_checksums(sci, nilfs->ns_crc_seed); + + nilfs_add_checksums_on_logs(&sci->sc_segbufs, + nilfs->ns_crc_seed); err = nilfs_segctor_write(sci, nilfs); if (unlikely(err)) @@ -2196,8 +2133,6 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) } } while (sci->sc_stage.scnt != NILFS_ST_DONE); - sci->sc_super_root = NULL; - out: nilfs_segctor_check_out_files(sci, sbi); return err; @@ -2224,9 +2159,9 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) static void nilfs_segctor_start_timer(struct nilfs_sc_info *sci) { spin_lock(&sci->sc_state_lock); - if (sci->sc_timer && !(sci->sc_state & NILFS_SEGCTOR_COMMIT)) { - sci->sc_timer->expires = jiffies + sci->sc_interval; - add_timer(sci->sc_timer); + if (!(sci->sc_state & NILFS_SEGCTOR_COMMIT)) { + sci->sc_timer.expires = jiffies + sci->sc_interval; + add_timer(&sci->sc_timer); sci->sc_state |= NILFS_SEGCTOR_COMMIT; } spin_unlock(&sci->sc_state_lock); @@ -2431,9 +2366,7 @@ static void nilfs_segctor_accept(struct nilfs_sc_info *sci) spin_lock(&sci->sc_state_lock); sci->sc_seq_accepted = sci->sc_seq_request; spin_unlock(&sci->sc_state_lock); - - if (sci->sc_timer) - del_timer_sync(sci->sc_timer); + del_timer_sync(&sci->sc_timer); } /** @@ -2459,9 +2392,9 @@ static void nilfs_segctor_notify(struct nilfs_sc_info *sci, int mode, int err) sci->sc_flush_request &= ~FLUSH_DAT_BIT; /* re-enable timer if checkpoint creation was not done */ - if (sci->sc_timer && (sci->sc_state & NILFS_SEGCTOR_COMMIT) && - time_before(jiffies, sci->sc_timer->expires)) - add_timer(sci->sc_timer); + if ((sci->sc_state & NILFS_SEGCTOR_COMMIT) && + time_before(jiffies, sci->sc_timer.expires)) + add_timer(&sci->sc_timer); } spin_unlock(&sci->sc_state_lock); } @@ -2640,13 +2573,10 @@ static int nilfs_segctor_thread(void *arg) { struct nilfs_sc_info *sci = (struct nilfs_sc_info *)arg; struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs; - struct timer_list timer; int timeout = 0; - init_timer(&timer); - timer.data = (unsigned long)current; - timer.function = nilfs_construction_timeout; - sci->sc_timer = &timer; + sci->sc_timer.data = (unsigned long)current; + sci->sc_timer.function = nilfs_construction_timeout; /* start sync. */ sci->sc_task = current; @@ -2695,7 +2625,7 @@ static int nilfs_segctor_thread(void *arg) should_sleep = 0; else if (sci->sc_state & NILFS_SEGCTOR_COMMIT) should_sleep = time_before(jiffies, - sci->sc_timer->expires); + sci->sc_timer.expires); if (should_sleep) { spin_unlock(&sci->sc_state_lock); @@ -2704,7 +2634,7 @@ static int nilfs_segctor_thread(void *arg) } finish_wait(&sci->sc_wait_daemon, &wait); timeout = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) && - time_after_eq(jiffies, sci->sc_timer->expires)); + time_after_eq(jiffies, sci->sc_timer.expires)); if (nilfs_sb_dirty(nilfs) && nilfs_sb_need_update(nilfs)) set_nilfs_discontinued(nilfs); @@ -2713,8 +2643,6 @@ static int nilfs_segctor_thread(void *arg) end_thread: spin_unlock(&sci->sc_state_lock); - del_timer_sync(sci->sc_timer); - sci->sc_timer = NULL; /* end sync. */ sci->sc_task = NULL; @@ -2750,13 +2678,6 @@ static void nilfs_segctor_kill_thread(struct nilfs_sc_info *sci) } } -static int nilfs_segctor_init(struct nilfs_sc_info *sci) -{ - sci->sc_seq_done = sci->sc_seq_request; - - return nilfs_segctor_start_thread(sci); -} - /* * Setup & clean-up functions */ @@ -2780,6 +2701,7 @@ static struct nilfs_sc_info *nilfs_segctor_new(struct nilfs_sb_info *sbi) INIT_LIST_HEAD(&sci->sc_write_logs); INIT_LIST_HEAD(&sci->sc_gc_inodes); INIT_LIST_HEAD(&sci->sc_copied_buffers); + init_timer(&sci->sc_timer); sci->sc_interval = HZ * NILFS_SC_DEFAULT_TIMEOUT; sci->sc_mjcp_freq = HZ * NILFS_SC_DEFAULT_SR_FREQ; @@ -2846,6 +2768,7 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci) down_write(&sbi->s_nilfs->ns_segctor_sem); + del_timer_sync(&sci->sc_timer); kfree(sci); } @@ -2880,7 +2803,7 @@ int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi) return -ENOMEM; nilfs_attach_writer(nilfs, sbi); - err = nilfs_segctor_init(NILFS_SC(sbi)); + err = nilfs_segctor_start_thread(NILFS_SC(sbi)); if (err) { nilfs_detach_writer(nilfs, sbi); kfree(sbi->s_sc_info); diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h index 82dfd6a686b..dca142361cc 100644 --- a/fs/nilfs2/segment.h +++ b/fs/nilfs2/segment.h @@ -100,7 +100,6 @@ struct nilfs_segsum_pointer { * @sc_write_logs: List of segment buffers to hold logs under writing * @sc_segbuf_nblocks: Number of available blocks in segment buffers. * @sc_curseg: Current segment buffer - * @sc_super_root: Pointer to the super root buffer * @sc_stage: Collection stage * @sc_finfo_ptr: pointer to the current finfo struct in the segment summary * @sc_binfo_ptr: pointer to the current binfo struct in the segment summary @@ -148,7 +147,6 @@ struct nilfs_sc_info { struct list_head sc_write_logs; unsigned long sc_segbuf_nblocks; struct nilfs_segment_buffer *sc_curseg; - struct buffer_head *sc_super_root; struct nilfs_cstage sc_stage; @@ -179,7 +177,7 @@ struct nilfs_sc_info { unsigned long sc_lseg_stime; /* in 1/HZ seconds */ unsigned long sc_watermark; - struct timer_list *sc_timer; + struct timer_list sc_timer; struct task_struct *sc_task; }; @@ -219,6 +217,8 @@ enum { */ #define NILFS_SC_DEFAULT_WATERMARK 3600 +/* super.c */ +extern struct kmem_cache *nilfs_transaction_cachep; /* segment.c */ extern int nilfs_init_transaction_cache(void); diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 48145f505a6..03b34b73899 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -67,6 +67,11 @@ MODULE_DESCRIPTION("A New Implementation of the Log-structured Filesystem " "(NILFS)"); MODULE_LICENSE("GPL"); +struct kmem_cache *nilfs_inode_cachep; +struct kmem_cache *nilfs_transaction_cachep; +struct kmem_cache *nilfs_segbuf_cachep; +struct kmem_cache *nilfs_btree_path_cache; + static int nilfs_remount(struct super_block *sb, int *flags, char *data); /** @@ -129,7 +134,6 @@ void nilfs_warning(struct super_block *sb, const char *function, va_end(args); } -static struct kmem_cache *nilfs_inode_cachep; struct inode *nilfs_alloc_inode_common(struct the_nilfs *nilfs) { @@ -155,34 +159,6 @@ void nilfs_destroy_inode(struct inode *inode) kmem_cache_free(nilfs_inode_cachep, NILFS_I(inode)); } -static void init_once(void *obj) -{ - struct nilfs_inode_info *ii = obj; - - INIT_LIST_HEAD(&ii->i_dirty); -#ifdef CONFIG_NILFS_XATTR - init_rwsem(&ii->xattr_sem); -#endif - nilfs_btnode_cache_init_once(&ii->i_btnode_cache); - ii->i_bmap = (struct nilfs_bmap *)&ii->i_bmap_union; - inode_init_once(&ii->vfs_inode); -} - -static int nilfs_init_inode_cache(void) -{ - nilfs_inode_cachep = kmem_cache_create("nilfs2_inode_cache", - sizeof(struct nilfs_inode_info), - 0, SLAB_RECLAIM_ACCOUNT, - init_once); - - return (nilfs_inode_cachep == NULL) ? -ENOMEM : 0; -} - -static inline void nilfs_destroy_inode_cache(void) -{ - kmem_cache_destroy(nilfs_inode_cachep); -} - static void nilfs_clear_inode(struct inode *inode) { struct nilfs_inode_info *ii = NILFS_I(inode); @@ -266,8 +242,8 @@ int nilfs_commit_super(struct nilfs_sb_info *sbi, int dupsb) int err; /* nilfs->sem must be locked by the caller. */ - if (sbp[0]->s_magic != NILFS_SUPER_MAGIC) { - if (sbp[1] && sbp[1]->s_magic == NILFS_SUPER_MAGIC) + if (sbp[0]->s_magic != cpu_to_le16(NILFS_SUPER_MAGIC)) { + if (sbp[1] && sbp[1]->s_magic == cpu_to_le16(NILFS_SUPER_MAGIC)) nilfs_swap_super_block(nilfs); else { printk(KERN_CRIT "NILFS: superblock broke on dev %s\n", @@ -470,10 +446,10 @@ static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs) if (nilfs_test_opt(sbi, SNAPSHOT)) seq_printf(seq, ",cp=%llu", (unsigned long long int)sbi->s_snapshot_cno); - if (nilfs_test_opt(sbi, ERRORS_RO)) - seq_printf(seq, ",errors=remount-ro"); if (nilfs_test_opt(sbi, ERRORS_PANIC)) seq_printf(seq, ",errors=panic"); + if (nilfs_test_opt(sbi, ERRORS_CONT)) + seq_printf(seq, ",errors=continue"); if (nilfs_test_opt(sbi, STRICT_ORDER)) seq_printf(seq, ",order=strict"); if (nilfs_test_opt(sbi, NORECOVERY)) @@ -631,7 +607,7 @@ nilfs_set_default_options(struct nilfs_sb_info *sbi, struct nilfs_super_block *sbp) { sbi->s_mount_opt = - NILFS_MOUNT_ERRORS_CONT | NILFS_MOUNT_BARRIER; + NILFS_MOUNT_ERRORS_RO | NILFS_MOUNT_BARRIER; } static int nilfs_setup_super(struct nilfs_sb_info *sbi) @@ -778,9 +754,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent, goto failed_sbi; } cno = sbi->s_snapshot_cno; - } else - /* Read-only mount */ - sbi->s_snapshot_cno = cno; + } } err = nilfs_attach_checkpoint(sbi, cno); @@ -849,7 +823,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) struct the_nilfs *nilfs = sbi->s_nilfs; unsigned long old_sb_flags; struct nilfs_mount_options old_opts; - int err; + int was_snapshot, err; lock_kernel(); @@ -857,6 +831,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) old_sb_flags = sb->s_flags; old_opts.mount_opt = sbi->s_mount_opt; old_opts.snapshot_cno = sbi->s_snapshot_cno; + was_snapshot = nilfs_test_opt(sbi, SNAPSHOT); if (!parse_options(data, sb)) { err = -EINVAL; @@ -864,20 +839,32 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) } sb->s_flags = (sb->s_flags & ~MS_POSIXACL); - if ((*flags & MS_RDONLY) && - sbi->s_snapshot_cno != old_opts.snapshot_cno) { - printk(KERN_WARNING "NILFS (device %s): couldn't " - "remount to a different snapshot.\n", - sb->s_id); - err = -EINVAL; - goto restore_opts; + err = -EINVAL; + if (was_snapshot) { + if (!(*flags & MS_RDONLY)) { + printk(KERN_ERR "NILFS (device %s): cannot remount " + "snapshot read/write.\n", + sb->s_id); + goto restore_opts; + } else if (sbi->s_snapshot_cno != old_opts.snapshot_cno) { + printk(KERN_ERR "NILFS (device %s): cannot " + "remount to a different snapshot.\n", + sb->s_id); + goto restore_opts; + } + } else { + if (nilfs_test_opt(sbi, SNAPSHOT)) { + printk(KERN_ERR "NILFS (device %s): cannot change " + "a regular mount to a snapshot.\n", + sb->s_id); + goto restore_opts; + } } if (!nilfs_valid_fs(nilfs)) { printk(KERN_WARNING "NILFS (device %s): couldn't " "remount because the filesystem is in an " "incomplete recovery state.\n", sb->s_id); - err = -EINVAL; goto restore_opts; } @@ -888,9 +875,6 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) nilfs_detach_segment_constructor(sbi); sb->s_flags |= MS_RDONLY; - sbi->s_snapshot_cno = nilfs_last_cno(nilfs); - /* nilfs_set_opt(sbi, SNAPSHOT); */ - /* * Remounting a valid RW partition RDONLY, so set * the RDONLY flag and then mark the partition as valid again. @@ -909,24 +893,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) * store the current valid flag. (It may have been changed * by fsck since we originally mounted the partition.) */ - if (nilfs->ns_current && nilfs->ns_current != sbi) { - printk(KERN_WARNING "NILFS (device %s): couldn't " - "remount because an RW-mount exists.\n", - sb->s_id); - err = -EBUSY; - goto restore_opts; - } - if (sbi->s_snapshot_cno != nilfs_last_cno(nilfs)) { - printk(KERN_WARNING "NILFS (device %s): couldn't " - "remount because the current RO-mount is not " - "the latest one.\n", - sb->s_id); - err = -EINVAL; - goto restore_opts; - } sb->s_flags &= ~MS_RDONLY; - nilfs_clear_opt(sbi, SNAPSHOT); - sbi->s_snapshot_cno = 0; err = nilfs_attach_segment_constructor(sbi); if (err) @@ -935,8 +902,6 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) down_write(&nilfs->ns_sem); nilfs_setup_super(sbi); up_write(&nilfs->ns_sem); - - nilfs->ns_current = sbi; } out: up_write(&nilfs->ns_super_sem); @@ -1022,10 +987,14 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags, { struct nilfs_super_data sd; struct super_block *s; + fmode_t mode = FMODE_READ; struct the_nilfs *nilfs; int err, need_to_close = 1; - sd.bdev = open_bdev_exclusive(dev_name, flags, fs_type); + if (!(flags & MS_RDONLY)) + mode |= FMODE_WRITE; + + sd.bdev = open_bdev_exclusive(dev_name, mode, fs_type); if (IS_ERR(sd.bdev)) return PTR_ERR(sd.bdev); @@ -1092,10 +1061,12 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags, /* New superblock instance created */ s->s_flags = flags; + s->s_mode = mode; strlcpy(s->s_id, bdevname(sd.bdev, b), sizeof(s->s_id)); sb_set_blocksize(s, block_size(sd.bdev)); - err = nilfs_fill_super(s, data, flags & MS_VERBOSE, nilfs); + err = nilfs_fill_super(s, data, flags & MS_SILENT ? 1 : 0, + nilfs); if (err) goto cancel_new; @@ -1106,7 +1077,7 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags, mutex_unlock(&nilfs->ns_mount_mutex); put_nilfs(nilfs); if (need_to_close) - close_bdev_exclusive(sd.bdev, flags); + close_bdev_exclusive(sd.bdev, mode); simple_set_mnt(mnt, s); return 0; @@ -1114,7 +1085,7 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags, mutex_unlock(&nilfs->ns_mount_mutex); put_nilfs(nilfs); failed: - close_bdev_exclusive(sd.bdev, flags); + close_bdev_exclusive(sd.bdev, mode); return err; @@ -1124,7 +1095,7 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags, put_nilfs(nilfs); deactivate_locked_super(s); /* - * deactivate_super() invokes close_bdev_exclusive(). + * deactivate_locked_super() invokes close_bdev_exclusive(). * We must finish all post-cleaning before this call; * put_nilfs() needs the block device. */ @@ -1139,54 +1110,93 @@ struct file_system_type nilfs_fs_type = { .fs_flags = FS_REQUIRES_DEV, }; -static int __init init_nilfs_fs(void) +static void nilfs_inode_init_once(void *obj) { - int err; - - err = nilfs_init_inode_cache(); - if (err) - goto failed; + struct nilfs_inode_info *ii = obj; - err = nilfs_init_transaction_cache(); - if (err) - goto failed_inode_cache; + INIT_LIST_HEAD(&ii->i_dirty); +#ifdef CONFIG_NILFS_XATTR + init_rwsem(&ii->xattr_sem); +#endif + nilfs_btnode_cache_init_once(&ii->i_btnode_cache); + ii->i_bmap = (struct nilfs_bmap *)&ii->i_bmap_union; + inode_init_once(&ii->vfs_inode); +} - err = nilfs_init_segbuf_cache(); - if (err) - goto failed_transaction_cache; +static void nilfs_segbuf_init_once(void *obj) +{ + memset(obj, 0, sizeof(struct nilfs_segment_buffer)); +} - err = nilfs_btree_path_cache_init(); - if (err) - goto failed_segbuf_cache; +static void nilfs_destroy_cachep(void) +{ + if (nilfs_inode_cachep) + kmem_cache_destroy(nilfs_inode_cachep); + if (nilfs_transaction_cachep) + kmem_cache_destroy(nilfs_transaction_cachep); + if (nilfs_segbuf_cachep) + kmem_cache_destroy(nilfs_segbuf_cachep); + if (nilfs_btree_path_cache) + kmem_cache_destroy(nilfs_btree_path_cache); +} - err = register_filesystem(&nilfs_fs_type); - if (err) - goto failed_btree_path_cache; +static int __init nilfs_init_cachep(void) +{ + nilfs_inode_cachep = kmem_cache_create("nilfs2_inode_cache", + sizeof(struct nilfs_inode_info), 0, + SLAB_RECLAIM_ACCOUNT, nilfs_inode_init_once); + if (!nilfs_inode_cachep) + goto fail; + + nilfs_transaction_cachep = kmem_cache_create("nilfs2_transaction_cache", + sizeof(struct nilfs_transaction_info), 0, + SLAB_RECLAIM_ACCOUNT, NULL); + if (!nilfs_transaction_cachep) + goto fail; + + nilfs_segbuf_cachep = kmem_cache_create("nilfs2_segbuf_cache", + sizeof(struct nilfs_segment_buffer), 0, + SLAB_RECLAIM_ACCOUNT, nilfs_segbuf_init_once); + if (!nilfs_segbuf_cachep) + goto fail; + + nilfs_btree_path_cache = kmem_cache_create("nilfs2_btree_path_cache", + sizeof(struct nilfs_btree_path) * NILFS_BTREE_LEVEL_MAX, + 0, 0, NULL); + if (!nilfs_btree_path_cache) + goto fail; return 0; - failed_btree_path_cache: - nilfs_btree_path_cache_destroy(); +fail: + nilfs_destroy_cachep(); + return -ENOMEM; +} + +static int __init init_nilfs_fs(void) +{ + int err; - failed_segbuf_cache: - nilfs_destroy_segbuf_cache(); + err = nilfs_init_cachep(); + if (err) + goto fail; - failed_transaction_cache: - nilfs_destroy_transaction_cache(); + err = register_filesystem(&nilfs_fs_type); + if (err) + goto free_cachep; - failed_inode_cache: - nilfs_destroy_inode_cache(); + printk(KERN_INFO "NILFS version 2 loaded\n"); + return 0; - failed: +free_cachep: + nilfs_destroy_cachep(); +fail: return err; } static void __exit exit_nilfs_fs(void) { - nilfs_destroy_segbuf_cache(); - nilfs_destroy_transaction_cache(); - nilfs_destroy_inode_cache(); - nilfs_btree_path_cache_destroy(); + nilfs_destroy_cachep(); unregister_filesystem(&nilfs_fs_type); } diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 33871f7e4f0..8c1097327ab 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -486,11 +486,15 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs, printk(KERN_WARNING "NILFS warning: unable to read secondary superblock\n"); + /* + * Compare two super blocks and set 1 in swp if the secondary + * super block is valid and newer. Otherwise, set 0 in swp. + */ valid[0] = nilfs_valid_sb(sbp[0]); valid[1] = nilfs_valid_sb(sbp[1]); - swp = valid[1] && - (!valid[0] || - le64_to_cpu(sbp[1]->s_wtime) > le64_to_cpu(sbp[0]->s_wtime)); + swp = valid[1] && (!valid[0] || + le64_to_cpu(sbp[1]->s_last_cno) > + le64_to_cpu(sbp[0]->s_last_cno)); if (valid[swp] && nilfs_sb2_bad_offset(sbp[swp], sb2off)) { brelse(sbh[1]); @@ -670,7 +674,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump, start * sects_per_block, nblocks * sects_per_block, GFP_NOFS, - DISCARD_FL_BARRIER); + BLKDEV_IFL_BARRIER); if (ret < 0) return ret; nblocks = 0; @@ -680,7 +684,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump, ret = blkdev_issue_discard(nilfs->ns_bdev, start * sects_per_block, nblocks * sects_per_block, - GFP_NOFS, DISCARD_FL_BARRIER); + GFP_NOFS, BLKDEV_IFL_BARRIER); return ret; } diff --git a/fs/notify/inotify/inotify.c b/fs/notify/inotify/inotify.c index 40b1cf914cc..27b75ebc746 100644 --- a/fs/notify/inotify/inotify.c +++ b/fs/notify/inotify/inotify.c @@ -110,14 +110,10 @@ EXPORT_SYMBOL_GPL(get_inotify_watch); int pin_inotify_watch(struct inotify_watch *watch) { struct super_block *sb = watch->inode->i_sb; - spin_lock(&sb_lock); - if (sb->s_count >= S_BIAS) { - atomic_inc(&sb->s_active); - spin_unlock(&sb_lock); + if (atomic_inc_not_zero(&sb->s_active)) { atomic_inc(&watch->count); return 1; } - spin_unlock(&sb_lock); return 0; } @@ -515,34 +511,8 @@ EXPORT_SYMBOL_GPL(inotify_init_watch); * done. Cleanup is just deactivate_super(). However, that leaves a messy * case - what if we *are* racing with umount() and active references to * superblock can't be acquired anymore? We can bump ->s_count, grab - * ->s_umount, which will almost certainly wait until the superblock is shut - * down and the watch in question is pining for fjords. That's fine, but - * there is a problem - we might have hit the window between ->s_active - * getting to 0 / ->s_count - below S_BIAS (i.e. the moment when superblock - * is past the point of no return and is heading for shutdown) and the - * moment when deactivate_super() acquires ->s_umount. We could just do - * drop_super() yield() and retry, but that's rather antisocial and this - * stuff is luser-triggerable. OTOH, having grabbed ->s_umount and having - * found that we'd got there first (i.e. that ->s_root is non-NULL) we know - * that we won't race with inotify_umount_inodes(). So we could grab a - * reference to watch and do the rest as above, just with drop_super() instead - * of deactivate_super(), right? Wrong. We had to drop ih->mutex before we - * could grab ->s_umount. So the watch could've been gone already. - * - * That still can be dealt with - we need to save watch->wd, do idr_find() - * and compare its result with our pointer. If they match, we either have - * the damn thing still alive or we'd lost not one but two races at once, - * the watch had been killed and a new one got created with the same ->wd - * at the same address. That couldn't have happened in inotify_destroy(), - * but inotify_rm_wd() could run into that. Still, "new one got created" - * is not a problem - we have every right to kill it or leave it alone, - * whatever's more convenient. - * - * So we can use idr_find(...) == watch && watch->inode->i_sb == sb as - * "grab it and kill it" check. If it's been our original watch, we are - * fine, if it's a newcomer - nevermind, just pretend that we'd won the - * race and kill the fscker anyway; we are safe since we know that its - * superblock won't be going away. + * ->s_umount, which will wait until the superblock is shut down and the + * watch in question is pining for fjords. * * And yes, this is far beyond mere "not very pretty"; so's the entire * concept of inotify to start with. @@ -556,57 +526,31 @@ EXPORT_SYMBOL_GPL(inotify_init_watch); * Called with ih->mutex held, drops it. Possible return values: * 0 - nothing to do, it has died * 1 - remove it, drop the reference and deactivate_super() - * 2 - remove it, drop the reference and drop_super(); we tried hard to avoid - * that variant, since it involved a lot of PITA, but that's the best that - * could've been done. */ static int pin_to_kill(struct inotify_handle *ih, struct inotify_watch *watch) { struct super_block *sb = watch->inode->i_sb; - s32 wd = watch->wd; - spin_lock(&sb_lock); - if (sb->s_count >= S_BIAS) { - atomic_inc(&sb->s_active); - spin_unlock(&sb_lock); + if (atomic_inc_not_zero(&sb->s_active)) { get_inotify_watch(watch); mutex_unlock(&ih->mutex); return 1; /* the best outcome */ } + spin_lock(&sb_lock); sb->s_count++; spin_unlock(&sb_lock); mutex_unlock(&ih->mutex); /* can't grab ->s_umount under it */ down_read(&sb->s_umount); - if (likely(!sb->s_root)) { - /* fs is already shut down; the watch is dead */ - drop_super(sb); - return 0; - } - /* raced with the final deactivate_super() */ - mutex_lock(&ih->mutex); - if (idr_find(&ih->idr, wd) != watch || watch->inode->i_sb != sb) { - /* the watch is dead */ - mutex_unlock(&ih->mutex); - drop_super(sb); - return 0; - } - /* still alive or freed and reused with the same sb and wd; kill */ - get_inotify_watch(watch); - mutex_unlock(&ih->mutex); - return 2; + /* fs is already shut down; the watch is dead */ + drop_super(sb); + return 0; } -static void unpin_and_kill(struct inotify_watch *watch, int how) +static void unpin_and_kill(struct inotify_watch *watch) { struct super_block *sb = watch->inode->i_sb; put_inotify_watch(watch); - switch (how) { - case 1: - deactivate_super(sb); - break; - case 2: - drop_super(sb); - } + deactivate_super(sb); } /** @@ -628,7 +572,6 @@ void inotify_destroy(struct inotify_handle *ih) struct list_head *watches; struct super_block *sb; struct inode *inode; - int how; mutex_lock(&ih->mutex); watches = &ih->watches; @@ -638,8 +581,7 @@ void inotify_destroy(struct inotify_handle *ih) } watch = list_first_entry(watches, struct inotify_watch, h_list); sb = watch->inode->i_sb; - how = pin_to_kill(ih, watch); - if (!how) + if (!pin_to_kill(ih, watch)) continue; inode = watch->inode; @@ -654,7 +596,7 @@ void inotify_destroy(struct inotify_handle *ih) mutex_unlock(&ih->mutex); mutex_unlock(&inode->inotify_mutex); - unpin_and_kill(watch, how); + unpin_and_kill(watch); } /* free this handle: the put matching the get in inotify_init() */ @@ -857,7 +799,6 @@ int inotify_rm_wd(struct inotify_handle *ih, u32 wd) struct inotify_watch *watch; struct super_block *sb; struct inode *inode; - int how; mutex_lock(&ih->mutex); watch = idr_find(&ih->idr, wd); @@ -866,8 +807,7 @@ int inotify_rm_wd(struct inotify_handle *ih, u32 wd) return -EINVAL; } sb = watch->inode->i_sb; - how = pin_to_kill(ih, watch); - if (!how) + if (!pin_to_kill(ih, watch)) return 0; inode = watch->inode; @@ -881,7 +821,7 @@ int inotify_rm_wd(struct inotify_handle *ih, u32 wd) mutex_unlock(&ih->mutex); mutex_unlock(&inode->inotify_mutex); - unpin_and_kill(watch, how); + unpin_and_kill(watch); return 0; } diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 8804f093ba7..a1924a0d2ab 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -98,9 +98,6 @@ static int ntfs_file_open(struct inode *vi, struct file *filp) * the page at all. For a more detailed explanation see ntfs_truncate() in * fs/ntfs/inode.c. * - * @cached_page and @lru_pvec are just optimizations for dealing with multiple - * pages. - * * Return 0 on success and -errno on error. In the case that an error is * encountered it is possible that the initialized size will already have been * incremented some way towards @new_init_size but it is guaranteed that if @@ -110,8 +107,7 @@ static int ntfs_file_open(struct inode *vi, struct file *filp) * Locking: i_mutex on the vfs inode corrseponsind to the ntfs inode @ni must be * held by the caller. */ -static int ntfs_attr_extend_initialized(ntfs_inode *ni, const s64 new_init_size, - struct page **cached_page, struct pagevec *lru_pvec) +static int ntfs_attr_extend_initialized(ntfs_inode *ni, const s64 new_init_size) { s64 old_init_size; loff_t old_i_size; @@ -403,18 +399,13 @@ static inline void ntfs_fault_in_pages_readable_iovec(const struct iovec *iov, * Obtain @nr_pages locked page cache pages from the mapping @mapping and * starting at index @index. * - * If a page is newly created, increment its refcount and add it to the - * caller's lru-buffering pagevec @lru_pvec. - * - * This is the same as mm/filemap.c::__grab_cache_page(), except that @nr_pages - * are obtained at once instead of just one page and that 0 is returned on - * success and -errno on error. + * If a page is newly created, add it to lru list * * Note, the page locks are obtained in ascending page index order. */ static inline int __ntfs_grab_cache_pages(struct address_space *mapping, pgoff_t index, const unsigned nr_pages, struct page **pages, - struct page **cached_page, struct pagevec *lru_pvec) + struct page **cached_page) { int err, nr; @@ -430,7 +421,7 @@ static inline int __ntfs_grab_cache_pages(struct address_space *mapping, goto err_out; } } - err = add_to_page_cache(*cached_page, mapping, index, + err = add_to_page_cache_lru(*cached_page, mapping, index, GFP_KERNEL); if (unlikely(err)) { if (err == -EEXIST) @@ -438,9 +429,6 @@ static inline int __ntfs_grab_cache_pages(struct address_space *mapping, goto err_out; } pages[nr] = *cached_page; - page_cache_get(*cached_page); - if (unlikely(!pagevec_add(lru_pvec, *cached_page))) - __pagevec_lru_add_file(lru_pvec); *cached_page = NULL; } index++; @@ -1800,7 +1788,6 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb, ssize_t status, written; unsigned nr_pages; int err; - struct pagevec lru_pvec; ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, " "pos 0x%llx, count 0x%lx.", @@ -1912,7 +1899,6 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb, } } } - pagevec_init(&lru_pvec, 0); written = 0; /* * If the write starts beyond the initialized size, extend it up to the @@ -1925,8 +1911,7 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb, ll = ni->initialized_size; read_unlock_irqrestore(&ni->size_lock, flags); if (pos > ll) { - err = ntfs_attr_extend_initialized(ni, pos, &cached_page, - &lru_pvec); + err = ntfs_attr_extend_initialized(ni, pos); if (err < 0) { ntfs_error(vol->sb, "Cannot perform write to inode " "0x%lx, attribute type 0x%x, because " @@ -2012,7 +1997,7 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb, ntfs_fault_in_pages_readable_iovec(iov, iov_ofs, bytes); /* Get and lock @do_pages starting at index @start_idx. */ status = __ntfs_grab_cache_pages(mapping, start_idx, do_pages, - pages, &cached_page, &lru_pvec); + pages, &cached_page); if (unlikely(status)) break; /* @@ -2077,7 +2062,6 @@ err_out: *ppos = pos; if (cached_page) page_cache_release(cached_page); - pagevec_lru_add_file(&lru_pvec); ntfs_debug("Done. Returning %s (written 0x%lx, status %li).", written ? "written" : "status", (unsigned long)written, (long)status); diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile index 791c0886c06..07d9fd85435 100644 --- a/fs/ocfs2/Makefile +++ b/fs/ocfs2/Makefile @@ -29,6 +29,7 @@ ocfs2-objs := \ mmap.o \ namei.o \ refcounttree.o \ + reservations.o \ resize.o \ slot_map.o \ suballoc.o \ diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index e13fc9e8fcd..da702294d7e 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -489,7 +489,7 @@ cleanup: return ret; } -struct xattr_handler ocfs2_xattr_acl_access_handler = { +const struct xattr_handler ocfs2_xattr_acl_access_handler = { .prefix = POSIX_ACL_XATTR_ACCESS, .flags = ACL_TYPE_ACCESS, .list = ocfs2_xattr_list_acl_access, @@ -497,7 +497,7 @@ struct xattr_handler ocfs2_xattr_acl_access_handler = { .set = ocfs2_xattr_set_acl, }; -struct xattr_handler ocfs2_xattr_acl_default_handler = { +const struct xattr_handler ocfs2_xattr_acl_default_handler = { .prefix = POSIX_ACL_XATTR_DEFAULT, .flags = ACL_TYPE_DEFAULT, .list = ocfs2_xattr_list_acl_default, diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 9f8bd913c51..215e12ce1d8 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -1006,7 +1006,7 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle, int count, status, i; u16 suballoc_bit_start; u32 num_got; - u64 first_blkno; + u64 suballoc_loc, first_blkno; struct ocfs2_super *osb = OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci)); struct ocfs2_extent_block *eb; @@ -1015,10 +1015,10 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle, count = 0; while (count < wanted) { - status = ocfs2_claim_metadata(osb, - handle, + status = ocfs2_claim_metadata(handle, meta_ac, wanted - count, + &suballoc_loc, &suballoc_bit_start, &num_got, &first_blkno); @@ -1052,6 +1052,7 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle, eb->h_fs_generation = cpu_to_le32(osb->fs_generation); eb->h_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot); + eb->h_suballoc_loc = cpu_to_le64(suballoc_loc); eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start); eb->h_list.l_count = cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb)); @@ -1061,11 +1062,7 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle, /* We'll also be dirtied by the caller, so * this isn't absolutely necessary. */ - status = ocfs2_journal_dirty(handle, bhs[i]); - if (status < 0) { - mlog_errno(status); - goto bail; - } + ocfs2_journal_dirty(handle, bhs[i]); } count += num_got; @@ -1129,8 +1126,7 @@ static int ocfs2_adjust_rightmost_branch(handle_t *handle, goto out; } - status = ocfs2_extend_trans(handle, path_num_items(path) + - handle->h_buffer_credits); + status = ocfs2_extend_trans(handle, path_num_items(path)); if (status < 0) { mlog_errno(status); goto out; @@ -1270,12 +1266,7 @@ static int ocfs2_add_branch(handle_t *handle, if (!eb_el->l_tree_depth) new_last_eb_blk = le64_to_cpu(eb->h_blkno); - status = ocfs2_journal_dirty(handle, bh); - if (status < 0) { - mlog_errno(status); - goto bail; - } - + ocfs2_journal_dirty(handle, bh); next_blkno = le64_to_cpu(eb->h_blkno); } @@ -1321,17 +1312,10 @@ static int ocfs2_add_branch(handle_t *handle, eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data; eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk); - status = ocfs2_journal_dirty(handle, *last_eb_bh); - if (status < 0) - mlog_errno(status); - status = ocfs2_journal_dirty(handle, et->et_root_bh); - if (status < 0) - mlog_errno(status); - if (eb_bh) { - status = ocfs2_journal_dirty(handle, eb_bh); - if (status < 0) - mlog_errno(status); - } + ocfs2_journal_dirty(handle, *last_eb_bh); + ocfs2_journal_dirty(handle, et->et_root_bh); + if (eb_bh) + ocfs2_journal_dirty(handle, eb_bh); /* * Some callers want to track the rightmost leaf so pass it @@ -1399,11 +1383,7 @@ static int ocfs2_shift_tree_depth(handle_t *handle, for (i = 0; i < le16_to_cpu(root_el->l_next_free_rec); i++) eb_el->l_recs[i] = root_el->l_recs[i]; - status = ocfs2_journal_dirty(handle, new_eb_bh); - if (status < 0) { - mlog_errno(status); - goto bail; - } + ocfs2_journal_dirty(handle, new_eb_bh); status = ocfs2_et_root_journal_access(handle, et, OCFS2_JOURNAL_ACCESS_WRITE); @@ -1428,11 +1408,7 @@ static int ocfs2_shift_tree_depth(handle_t *handle, if (root_el->l_tree_depth == cpu_to_le16(1)) ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno)); - status = ocfs2_journal_dirty(handle, et->et_root_bh); - if (status < 0) { - mlog_errno(status); - goto bail; - } + ocfs2_journal_dirty(handle, et->et_root_bh); *ret_new_eb_bh = new_eb_bh; new_eb_bh = NULL; @@ -2064,7 +2040,7 @@ static void ocfs2_complete_edge_insert(handle_t *handle, struct ocfs2_path *right_path, int subtree_index) { - int ret, i, idx; + int i, idx; struct ocfs2_extent_list *el, *left_el, *right_el; struct ocfs2_extent_rec *left_rec, *right_rec; struct buffer_head *root_bh = left_path->p_node[subtree_index].bh; @@ -2102,13 +2078,8 @@ static void ocfs2_complete_edge_insert(handle_t *handle, ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec, right_el); - ret = ocfs2_journal_dirty(handle, left_path->p_node[i].bh); - if (ret) - mlog_errno(ret); - - ret = ocfs2_journal_dirty(handle, right_path->p_node[i].bh); - if (ret) - mlog_errno(ret); + ocfs2_journal_dirty(handle, left_path->p_node[i].bh); + ocfs2_journal_dirty(handle, right_path->p_node[i].bh); /* * Setup our list pointers now so that the current @@ -2132,9 +2103,7 @@ static void ocfs2_complete_edge_insert(handle_t *handle, root_bh = left_path->p_node[subtree_index].bh; - ret = ocfs2_journal_dirty(handle, root_bh); - if (ret) - mlog_errno(ret); + ocfs2_journal_dirty(handle, root_bh); } static int ocfs2_rotate_subtree_right(handle_t *handle, @@ -2207,11 +2176,7 @@ static int ocfs2_rotate_subtree_right(handle_t *handle, ocfs2_create_empty_extent(right_el); - ret = ocfs2_journal_dirty(handle, right_leaf_bh); - if (ret) { - mlog_errno(ret); - goto out; - } + ocfs2_journal_dirty(handle, right_leaf_bh); /* Do the copy now. */ i = le16_to_cpu(left_el->l_next_free_rec) - 1; @@ -2230,11 +2195,7 @@ static int ocfs2_rotate_subtree_right(handle_t *handle, memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); le16_add_cpu(&left_el->l_next_free_rec, 1); - ret = ocfs2_journal_dirty(handle, left_leaf_bh); - if (ret) { - mlog_errno(ret); - goto out; - } + ocfs2_journal_dirty(handle, left_leaf_bh); ocfs2_complete_edge_insert(handle, left_path, right_path, subtree_index); @@ -2249,8 +2210,8 @@ out: * * Will return zero if the path passed in is already the leftmost path. */ -static int ocfs2_find_cpos_for_left_leaf(struct super_block *sb, - struct ocfs2_path *path, u32 *cpos) +int ocfs2_find_cpos_for_left_leaf(struct super_block *sb, + struct ocfs2_path *path, u32 *cpos) { int i, j, ret = 0; u64 blkno; @@ -2327,20 +2288,14 @@ static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth, int op_credits, struct ocfs2_path *path) { - int ret; + int ret = 0; int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits; - if (handle->h_buffer_credits < credits) { + if (handle->h_buffer_credits < credits) ret = ocfs2_extend_trans(handle, credits - handle->h_buffer_credits); - if (ret) - return ret; - if (unlikely(handle->h_buffer_credits < credits)) - return ocfs2_extend_trans(handle, credits); - } - - return 0; + return ret; } /* @@ -2584,8 +2539,7 @@ static int ocfs2_update_edge_lengths(handle_t *handle, * records for all the bh in the path. * So we have to allocate extra credits and access them. */ - ret = ocfs2_extend_trans(handle, - handle->h_buffer_credits + subtree_index); + ret = ocfs2_extend_trans(handle, subtree_index); if (ret) { mlog_errno(ret); goto out; @@ -2823,12 +2777,8 @@ static int ocfs2_rotate_subtree_left(handle_t *handle, ocfs2_remove_empty_extent(right_leaf_el); } - ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path)); - if (ret) - mlog_errno(ret); - ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path)); - if (ret) - mlog_errno(ret); + ocfs2_journal_dirty(handle, path_leaf_bh(left_path)); + ocfs2_journal_dirty(handle, path_leaf_bh(right_path)); if (del_right_subtree) { ocfs2_unlink_subtree(handle, et, left_path, right_path, @@ -2851,9 +2801,7 @@ static int ocfs2_rotate_subtree_left(handle_t *handle, if (right_has_empty) ocfs2_remove_empty_extent(left_leaf_el); - ret = ocfs2_journal_dirty(handle, et_root_bh); - if (ret) - mlog_errno(ret); + ocfs2_journal_dirty(handle, et_root_bh); *deleted = 1; } else @@ -2962,10 +2910,7 @@ static int ocfs2_rotate_rightmost_leaf_left(handle_t *handle, } ocfs2_remove_empty_extent(el); - - ret = ocfs2_journal_dirty(handle, bh); - if (ret) - mlog_errno(ret); + ocfs2_journal_dirty(handle, bh); out: return ret; @@ -3506,15 +3451,9 @@ static int ocfs2_merge_rec_right(struct ocfs2_path *left_path, ocfs2_cleanup_merge(el, index); - ret = ocfs2_journal_dirty(handle, bh); - if (ret) - mlog_errno(ret); - + ocfs2_journal_dirty(handle, bh); if (right_path) { - ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path)); - if (ret) - mlog_errno(ret); - + ocfs2_journal_dirty(handle, path_leaf_bh(right_path)); ocfs2_complete_edge_insert(handle, left_path, right_path, subtree_index); } @@ -3683,14 +3622,9 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path, ocfs2_cleanup_merge(el, index); - ret = ocfs2_journal_dirty(handle, bh); - if (ret) - mlog_errno(ret); - + ocfs2_journal_dirty(handle, bh); if (left_path) { - ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path)); - if (ret) - mlog_errno(ret); + ocfs2_journal_dirty(handle, path_leaf_bh(left_path)); /* * In the situation that the right_rec is empty and the extent @@ -4016,10 +3950,7 @@ static void ocfs2_adjust_rightmost_records(handle_t *handle, le32_add_cpu(&rec->e_int_clusters, -le32_to_cpu(rec->e_cpos)); - ret = ocfs2_journal_dirty(handle, bh); - if (ret) - mlog_errno(ret); - + ocfs2_journal_dirty(handle, bh); } } @@ -4203,17 +4134,13 @@ static int ocfs2_insert_path(handle_t *handle, struct buffer_head *leaf_bh = path_leaf_bh(right_path); if (left_path) { - int credits = handle->h_buffer_credits; - /* * There's a chance that left_path got passed back to * us without being accounted for in the * journal. Extend our transaction here to be sure we * can change those blocks. */ - credits += left_path->p_tree_depth; - - ret = ocfs2_extend_trans(handle, credits); + ret = ocfs2_extend_trans(handle, left_path->p_tree_depth); if (ret < 0) { mlog_errno(ret); goto out; @@ -4251,17 +4178,13 @@ static int ocfs2_insert_path(handle_t *handle, * dirty this for us. */ if (left_path) - ret = ocfs2_journal_dirty(handle, - path_leaf_bh(left_path)); - if (ret) - mlog_errno(ret); + ocfs2_journal_dirty(handle, + path_leaf_bh(left_path)); } else ocfs2_insert_at_leaf(et, insert_rec, path_leaf_el(right_path), insert); - ret = ocfs2_journal_dirty(handle, leaf_bh); - if (ret) - mlog_errno(ret); + ocfs2_journal_dirty(handle, leaf_bh); if (left_path) { /* @@ -4384,9 +4307,7 @@ out_update_clusters: ocfs2_et_update_clusters(et, le16_to_cpu(insert_rec->e_leaf_clusters)); - ret = ocfs2_journal_dirty(handle, et->et_root_bh); - if (ret) - mlog_errno(ret); + ocfs2_journal_dirty(handle, et->et_root_bh); out: ocfs2_free_path(left_path); @@ -4866,7 +4787,7 @@ int ocfs2_add_clusters_in_btree(handle_t *handle, goto leave; } - status = __ocfs2_claim_clusters(osb, handle, data_ac, 1, + status = __ocfs2_claim_clusters(handle, data_ac, 1, clusters_to_add, &bit_off, &num_bits); if (status < 0) { if (status != -ENOSPC) @@ -4895,11 +4816,7 @@ int ocfs2_add_clusters_in_btree(handle_t *handle, goto leave; } - status = ocfs2_journal_dirty(handle, et->et_root_bh); - if (status < 0) { - mlog_errno(status); - goto leave; - } + ocfs2_journal_dirty(handle, et->et_root_bh); clusters_to_add -= num_bits; *logical_offset += num_bits; @@ -5309,7 +5226,7 @@ static int ocfs2_split_tree(handle_t *handle, struct ocfs2_extent_tree *et, int index, u32 new_range, struct ocfs2_alloc_context *meta_ac) { - int ret, depth, credits = handle->h_buffer_credits; + int ret, depth, credits; struct buffer_head *last_eb_bh = NULL; struct ocfs2_extent_block *eb; struct ocfs2_extent_list *rightmost_el, *el; @@ -5340,8 +5257,8 @@ static int ocfs2_split_tree(handle_t *handle, struct ocfs2_extent_tree *et, } else rightmost_el = path_leaf_el(path); - credits += path->p_tree_depth + - ocfs2_extend_meta_needed(et->et_root_el); + credits = path->p_tree_depth + + ocfs2_extend_meta_needed(et->et_root_el); ret = ocfs2_extend_trans(handle, credits); if (ret) { mlog_errno(ret); @@ -5671,19 +5588,97 @@ out: return ret; } +/* + * ocfs2_reserve_blocks_for_rec_trunc() would look basically the + * same as ocfs2_lock_alloctors(), except for it accepts a blocks + * number to reserve some extra blocks, and it only handles meta + * data allocations. + * + * Currently, only ocfs2_remove_btree_range() uses it for truncating + * and punching holes. + */ +static int ocfs2_reserve_blocks_for_rec_trunc(struct inode *inode, + struct ocfs2_extent_tree *et, + u32 extents_to_split, + struct ocfs2_alloc_context **ac, + int extra_blocks) +{ + int ret = 0, num_free_extents; + unsigned int max_recs_needed = 2 * extents_to_split; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + *ac = NULL; + + num_free_extents = ocfs2_num_free_extents(osb, et); + if (num_free_extents < 0) { + ret = num_free_extents; + mlog_errno(ret); + goto out; + } + + if (!num_free_extents || + (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) + extra_blocks += ocfs2_extend_meta_needed(et->et_root_el); + + if (extra_blocks) { + ret = ocfs2_reserve_new_metadata_blocks(osb, extra_blocks, ac); + if (ret < 0) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + } + +out: + if (ret) { + if (*ac) { + ocfs2_free_alloc_context(*ac); + *ac = NULL; + } + } + + return ret; +} + int ocfs2_remove_btree_range(struct inode *inode, struct ocfs2_extent_tree *et, - u32 cpos, u32 phys_cpos, u32 len, - struct ocfs2_cached_dealloc_ctxt *dealloc) + u32 cpos, u32 phys_cpos, u32 len, int flags, + struct ocfs2_cached_dealloc_ctxt *dealloc, + u64 refcount_loc) { - int ret; + int ret, credits = 0, extra_blocks = 0; u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct inode *tl_inode = osb->osb_tl_inode; handle_t *handle; struct ocfs2_alloc_context *meta_ac = NULL; + struct ocfs2_refcount_tree *ref_tree = NULL; + + if ((flags & OCFS2_EXT_REFCOUNTED) && len) { + BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & + OCFS2_HAS_REFCOUNT_FL)); + + ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1, + &ref_tree, NULL); + if (ret) { + mlog_errno(ret); + goto out; + } - ret = ocfs2_lock_allocators(inode, et, 0, 1, NULL, &meta_ac); + ret = ocfs2_prepare_refcount_change_for_del(inode, + refcount_loc, + phys_blkno, + len, + &credits, + &extra_blocks); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } + + ret = ocfs2_reserve_blocks_for_rec_trunc(inode, et, 1, &meta_ac, + extra_blocks); if (ret) { mlog_errno(ret); return ret; @@ -5699,7 +5694,8 @@ int ocfs2_remove_btree_range(struct inode *inode, } } - handle = ocfs2_start_trans(osb, ocfs2_remove_extent_credits(osb->sb)); + handle = ocfs2_start_trans(osb, + ocfs2_remove_extent_credits(osb->sb) + credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); mlog_errno(ret); @@ -5724,15 +5720,22 @@ int ocfs2_remove_btree_range(struct inode *inode, ocfs2_et_update_clusters(et, -len); - ret = ocfs2_journal_dirty(handle, et->et_root_bh); - if (ret) { - mlog_errno(ret); - goto out_commit; - } + ocfs2_journal_dirty(handle, et->et_root_bh); - ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len); - if (ret) - mlog_errno(ret); + if (phys_blkno) { + if (flags & OCFS2_EXT_REFCOUNTED) + ret = ocfs2_decrease_refcount(inode, handle, + ocfs2_blocks_to_clusters(osb->sb, + phys_blkno), + len, meta_ac, + dealloc, 1); + else + ret = ocfs2_truncate_log_append(osb, handle, + phys_blkno, len); + if (ret) + mlog_errno(ret); + + } out_commit: ocfs2_commit_trans(osb, handle); @@ -5742,6 +5745,9 @@ out: if (meta_ac) ocfs2_free_alloc_context(meta_ac); + if (ref_tree) + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); + return ret; } @@ -5850,11 +5856,7 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb, } tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters); - status = ocfs2_journal_dirty(handle, tl_bh); - if (status < 0) { - mlog_errno(status); - goto bail; - } + ocfs2_journal_dirty(handle, tl_bh); bail: mlog_exit(status); @@ -5893,11 +5895,7 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb, tl->tl_used = cpu_to_le16(i); - status = ocfs2_journal_dirty(handle, tl_bh); - if (status < 0) { - mlog_errno(status); - goto bail; - } + ocfs2_journal_dirty(handle, tl_bh); /* TODO: Perhaps we can calculate the bulk of the * credits up front rather than extending like @@ -6298,6 +6296,7 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb) */ struct ocfs2_cached_block_free { struct ocfs2_cached_block_free *free_next; + u64 free_bg; u64 free_blk; unsigned int free_bit; }; @@ -6344,8 +6343,11 @@ static int ocfs2_free_cached_blocks(struct ocfs2_super *osb, } while (head) { - bg_blkno = ocfs2_which_suballoc_group(head->free_blk, - head->free_bit); + if (head->free_bg) + bg_blkno = head->free_bg; + else + bg_blkno = ocfs2_which_suballoc_group(head->free_blk, + head->free_bit); mlog(0, "Free bit: (bit %u, blkno %llu)\n", head->free_bit, (unsigned long long)head->free_blk); @@ -6393,7 +6395,7 @@ int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, int ret = 0; struct ocfs2_cached_block_free *item; - item = kmalloc(sizeof(*item), GFP_NOFS); + item = kzalloc(sizeof(*item), GFP_NOFS); if (item == NULL) { ret = -ENOMEM; mlog_errno(ret); @@ -6533,8 +6535,8 @@ ocfs2_find_per_slot_free_list(int type, } int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, - int type, int slot, u64 blkno, - unsigned int bit) + int type, int slot, u64 suballoc, + u64 blkno, unsigned int bit) { int ret; struct ocfs2_per_slot_free_list *fl; @@ -6547,7 +6549,7 @@ int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, goto out; } - item = kmalloc(sizeof(*item), GFP_NOFS); + item = kzalloc(sizeof(*item), GFP_NOFS); if (item == NULL) { ret = -ENOMEM; mlog_errno(ret); @@ -6557,6 +6559,7 @@ int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, mlog(0, "Insert: (type %d, slot %u, bit %u, blk %llu)\n", type, slot, bit, (unsigned long long)blkno); + item->free_bg = suballoc; item->free_blk = blkno; item->free_bit = bit; item->free_next = fl->f_first; @@ -6573,433 +6576,11 @@ static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt, { return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE, le16_to_cpu(eb->h_suballoc_slot), + le64_to_cpu(eb->h_suballoc_loc), le64_to_cpu(eb->h_blkno), le16_to_cpu(eb->h_suballoc_bit)); } -/* This function will figure out whether the currently last extent - * block will be deleted, and if it will, what the new last extent - * block will be so we can update his h_next_leaf_blk field, as well - * as the dinodes i_last_eb_blk */ -static int ocfs2_find_new_last_ext_blk(struct inode *inode, - unsigned int clusters_to_del, - struct ocfs2_path *path, - struct buffer_head **new_last_eb) -{ - int next_free, ret = 0; - u32 cpos; - struct ocfs2_extent_rec *rec; - struct ocfs2_extent_block *eb; - struct ocfs2_extent_list *el; - struct buffer_head *bh = NULL; - - *new_last_eb = NULL; - - /* we have no tree, so of course, no last_eb. */ - if (!path->p_tree_depth) - goto out; - - /* trunc to zero special case - this makes tree_depth = 0 - * regardless of what it is. */ - if (OCFS2_I(inode)->ip_clusters == clusters_to_del) - goto out; - - el = path_leaf_el(path); - BUG_ON(!el->l_next_free_rec); - - /* - * Make sure that this extent list will actually be empty - * after we clear away the data. We can shortcut out if - * there's more than one non-empty extent in the - * list. Otherwise, a check of the remaining extent is - * necessary. - */ - next_free = le16_to_cpu(el->l_next_free_rec); - rec = NULL; - if (ocfs2_is_empty_extent(&el->l_recs[0])) { - if (next_free > 2) - goto out; - - /* We may have a valid extent in index 1, check it. */ - if (next_free == 2) - rec = &el->l_recs[1]; - - /* - * Fall through - no more nonempty extents, so we want - * to delete this leaf. - */ - } else { - if (next_free > 1) - goto out; - - rec = &el->l_recs[0]; - } - - if (rec) { - /* - * Check it we'll only be trimming off the end of this - * cluster. - */ - if (le16_to_cpu(rec->e_leaf_clusters) > clusters_to_del) - goto out; - } - - ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos); - if (ret) { - mlog_errno(ret); - goto out; - } - - ret = ocfs2_find_leaf(INODE_CACHE(inode), path_root_el(path), cpos, &bh); - if (ret) { - mlog_errno(ret); - goto out; - } - - eb = (struct ocfs2_extent_block *) bh->b_data; - el = &eb->h_list; - - /* ocfs2_find_leaf() gets the eb from ocfs2_read_extent_block(). - * Any corruption is a code bug. */ - BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb)); - - *new_last_eb = bh; - get_bh(*new_last_eb); - mlog(0, "returning block %llu, (cpos: %u)\n", - (unsigned long long)le64_to_cpu(eb->h_blkno), cpos); -out: - brelse(bh); - - return ret; -} - -/* - * Trim some clusters off the rightmost edge of a tree. Only called - * during truncate. - * - * The caller needs to: - * - start journaling of each path component. - * - compute and fully set up any new last ext block - */ -static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path, - handle_t *handle, struct ocfs2_truncate_context *tc, - u32 clusters_to_del, u64 *delete_start, u8 *flags) -{ - int ret, i, index = path->p_tree_depth; - u32 new_edge = 0; - u64 deleted_eb = 0; - struct buffer_head *bh; - struct ocfs2_extent_list *el; - struct ocfs2_extent_rec *rec; - - *delete_start = 0; - *flags = 0; - - while (index >= 0) { - bh = path->p_node[index].bh; - el = path->p_node[index].el; - - mlog(0, "traveling tree (index = %d, block = %llu)\n", - index, (unsigned long long)bh->b_blocknr); - - BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0); - - if (index != - (path->p_tree_depth - le16_to_cpu(el->l_tree_depth))) { - ocfs2_error(inode->i_sb, - "Inode %lu has invalid ext. block %llu", - inode->i_ino, - (unsigned long long)bh->b_blocknr); - ret = -EROFS; - goto out; - } - -find_tail_record: - i = le16_to_cpu(el->l_next_free_rec) - 1; - rec = &el->l_recs[i]; - - mlog(0, "Extent list before: record %d: (%u, %u, %llu), " - "next = %u\n", i, le32_to_cpu(rec->e_cpos), - ocfs2_rec_clusters(el, rec), - (unsigned long long)le64_to_cpu(rec->e_blkno), - le16_to_cpu(el->l_next_free_rec)); - - BUG_ON(ocfs2_rec_clusters(el, rec) < clusters_to_del); - - if (le16_to_cpu(el->l_tree_depth) == 0) { - /* - * If the leaf block contains a single empty - * extent and no records, we can just remove - * the block. - */ - if (i == 0 && ocfs2_is_empty_extent(rec)) { - memset(rec, 0, - sizeof(struct ocfs2_extent_rec)); - el->l_next_free_rec = cpu_to_le16(0); - - goto delete; - } - - /* - * Remove any empty extents by shifting things - * left. That should make life much easier on - * the code below. This condition is rare - * enough that we shouldn't see a performance - * hit. - */ - if (ocfs2_is_empty_extent(&el->l_recs[0])) { - le16_add_cpu(&el->l_next_free_rec, -1); - - for(i = 0; - i < le16_to_cpu(el->l_next_free_rec); i++) - el->l_recs[i] = el->l_recs[i + 1]; - - memset(&el->l_recs[i], 0, - sizeof(struct ocfs2_extent_rec)); - - /* - * We've modified our extent list. The - * simplest way to handle this change - * is to being the search from the - * start again. - */ - goto find_tail_record; - } - - le16_add_cpu(&rec->e_leaf_clusters, -clusters_to_del); - - /* - * We'll use "new_edge" on our way back up the - * tree to know what our rightmost cpos is. - */ - new_edge = le16_to_cpu(rec->e_leaf_clusters); - new_edge += le32_to_cpu(rec->e_cpos); - - /* - * The caller will use this to delete data blocks. - */ - *delete_start = le64_to_cpu(rec->e_blkno) - + ocfs2_clusters_to_blocks(inode->i_sb, - le16_to_cpu(rec->e_leaf_clusters)); - *flags = rec->e_flags; - - /* - * If it's now empty, remove this record. - */ - if (le16_to_cpu(rec->e_leaf_clusters) == 0) { - memset(rec, 0, - sizeof(struct ocfs2_extent_rec)); - le16_add_cpu(&el->l_next_free_rec, -1); - } - } else { - if (le64_to_cpu(rec->e_blkno) == deleted_eb) { - memset(rec, 0, - sizeof(struct ocfs2_extent_rec)); - le16_add_cpu(&el->l_next_free_rec, -1); - - goto delete; - } - - /* Can this actually happen? */ - if (le16_to_cpu(el->l_next_free_rec) == 0) - goto delete; - - /* - * We never actually deleted any clusters - * because our leaf was empty. There's no - * reason to adjust the rightmost edge then. - */ - if (new_edge == 0) - goto delete; - - rec->e_int_clusters = cpu_to_le32(new_edge); - le32_add_cpu(&rec->e_int_clusters, - -le32_to_cpu(rec->e_cpos)); - - /* - * A deleted child record should have been - * caught above. - */ - BUG_ON(le32_to_cpu(rec->e_int_clusters) == 0); - } - -delete: - ret = ocfs2_journal_dirty(handle, bh); - if (ret) { - mlog_errno(ret); - goto out; - } - - mlog(0, "extent list container %llu, after: record %d: " - "(%u, %u, %llu), next = %u.\n", - (unsigned long long)bh->b_blocknr, i, - le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec), - (unsigned long long)le64_to_cpu(rec->e_blkno), - le16_to_cpu(el->l_next_free_rec)); - - /* - * We must be careful to only attempt delete of an - * extent block (and not the root inode block). - */ - if (index > 0 && le16_to_cpu(el->l_next_free_rec) == 0) { - struct ocfs2_extent_block *eb = - (struct ocfs2_extent_block *)bh->b_data; - - /* - * Save this for use when processing the - * parent block. - */ - deleted_eb = le64_to_cpu(eb->h_blkno); - - mlog(0, "deleting this extent block.\n"); - - ocfs2_remove_from_cache(INODE_CACHE(inode), bh); - - BUG_ON(ocfs2_rec_clusters(el, &el->l_recs[0])); - BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos)); - BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno)); - - ret = ocfs2_cache_extent_block_free(&tc->tc_dealloc, eb); - /* An error here is not fatal. */ - if (ret < 0) - mlog_errno(ret); - } else { - deleted_eb = 0; - } - - index--; - } - - ret = 0; -out: - return ret; -} - -static int ocfs2_do_truncate(struct ocfs2_super *osb, - unsigned int clusters_to_del, - struct inode *inode, - struct buffer_head *fe_bh, - handle_t *handle, - struct ocfs2_truncate_context *tc, - struct ocfs2_path *path, - struct ocfs2_alloc_context *meta_ac) -{ - int status; - struct ocfs2_dinode *fe; - struct ocfs2_extent_block *last_eb = NULL; - struct ocfs2_extent_list *el; - struct buffer_head *last_eb_bh = NULL; - u64 delete_blk = 0; - u8 rec_flags; - - fe = (struct ocfs2_dinode *) fe_bh->b_data; - - status = ocfs2_find_new_last_ext_blk(inode, clusters_to_del, - path, &last_eb_bh); - if (status < 0) { - mlog_errno(status); - goto bail; - } - - /* - * Each component will be touched, so we might as well journal - * here to avoid having to handle errors later. - */ - status = ocfs2_journal_access_path(INODE_CACHE(inode), handle, path); - if (status < 0) { - mlog_errno(status); - goto bail; - } - - if (last_eb_bh) { - status = ocfs2_journal_access_eb(handle, INODE_CACHE(inode), last_eb_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (status < 0) { - mlog_errno(status); - goto bail; - } - - last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; - } - - el = &(fe->id2.i_list); - - /* - * Lower levels depend on this never happening, but it's best - * to check it up here before changing the tree. - */ - if (el->l_tree_depth && el->l_recs[0].e_int_clusters == 0) { - ocfs2_error(inode->i_sb, - "Inode %lu has an empty extent record, depth %u\n", - inode->i_ino, le16_to_cpu(el->l_tree_depth)); - status = -EROFS; - goto bail; - } - - dquot_free_space_nodirty(inode, - ocfs2_clusters_to_bytes(osb->sb, clusters_to_del)); - spin_lock(&OCFS2_I(inode)->ip_lock); - OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) - - clusters_to_del; - spin_unlock(&OCFS2_I(inode)->ip_lock); - le32_add_cpu(&fe->i_clusters, -clusters_to_del); - inode->i_blocks = ocfs2_inode_sector_count(inode); - - status = ocfs2_trim_tree(inode, path, handle, tc, - clusters_to_del, &delete_blk, &rec_flags); - if (status) { - mlog_errno(status); - goto bail; - } - - if (le32_to_cpu(fe->i_clusters) == 0) { - /* trunc to zero is a special case. */ - el->l_tree_depth = 0; - fe->i_last_eb_blk = 0; - } else if (last_eb) - fe->i_last_eb_blk = last_eb->h_blkno; - - status = ocfs2_journal_dirty(handle, fe_bh); - if (status < 0) { - mlog_errno(status); - goto bail; - } - - if (last_eb) { - /* If there will be a new last extent block, then by - * definition, there cannot be any leaves to the right of - * him. */ - last_eb->h_next_leaf_blk = 0; - status = ocfs2_journal_dirty(handle, last_eb_bh); - if (status < 0) { - mlog_errno(status); - goto bail; - } - } - - if (delete_blk) { - if (rec_flags & OCFS2_EXT_REFCOUNTED) - status = ocfs2_decrease_refcount(inode, handle, - ocfs2_blocks_to_clusters(osb->sb, - delete_blk), - clusters_to_del, meta_ac, - &tc->tc_dealloc, 1); - else - status = ocfs2_truncate_log_append(osb, handle, - delete_blk, - clusters_to_del); - if (status < 0) { - mlog_errno(status); - goto bail; - } - } - status = 0; -bail: - brelse(last_eb_bh); - mlog_exit(status); - return status; -} - static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh) { set_buffer_uptodate(bh); @@ -7307,7 +6888,9 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, goto out_commit; did_quota = 1; - ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, + data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv; + + ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off, &num); if (ret) { mlog_errno(ret); @@ -7406,26 +6989,29 @@ out: */ int ocfs2_commit_truncate(struct ocfs2_super *osb, struct inode *inode, - struct buffer_head *fe_bh, - struct ocfs2_truncate_context *tc) + struct buffer_head *di_bh) { - int status, i, credits, tl_sem = 0; - u32 clusters_to_del, new_highest_cpos, range; + int status = 0, i, flags = 0; + u32 new_highest_cpos, range, trunc_cpos, trunc_len, phys_cpos, coff; u64 blkno = 0; struct ocfs2_extent_list *el; - handle_t *handle = NULL; - struct inode *tl_inode = osb->osb_tl_inode; + struct ocfs2_extent_rec *rec; struct ocfs2_path *path = NULL; - struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data; - struct ocfs2_alloc_context *meta_ac = NULL; - struct ocfs2_refcount_tree *ref_tree = NULL; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_extent_list *root_el = &(di->id2.i_list); + u64 refcount_loc = le64_to_cpu(di->i_refcount_loc); + struct ocfs2_extent_tree et; + struct ocfs2_cached_dealloc_ctxt dealloc; mlog_entry_void(); + ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh); + ocfs2_init_dealloc_ctxt(&dealloc); + new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb, i_size_read(inode)); - path = ocfs2_new_path(fe_bh, &di->id2.i_list, + path = ocfs2_new_path(di_bh, &di->id2.i_list, ocfs2_journal_access_di); if (!path) { status = -ENOMEM; @@ -7444,8 +7030,6 @@ start: goto bail; } - credits = 0; - /* * Truncate always works against the rightmost tree branch. */ @@ -7480,101 +7064,62 @@ start: } i = le16_to_cpu(el->l_next_free_rec) - 1; - range = le32_to_cpu(el->l_recs[i].e_cpos) + - ocfs2_rec_clusters(el, &el->l_recs[i]); - if (i == 0 && ocfs2_is_empty_extent(&el->l_recs[i])) { - clusters_to_del = 0; - } else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) { - clusters_to_del = ocfs2_rec_clusters(el, &el->l_recs[i]); - blkno = le64_to_cpu(el->l_recs[i].e_blkno); + rec = &el->l_recs[i]; + flags = rec->e_flags; + range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec); + + if (i == 0 && ocfs2_is_empty_extent(rec)) { + /* + * Lower levels depend on this never happening, but it's best + * to check it up here before changing the tree. + */ + if (root_el->l_tree_depth && rec->e_int_clusters == 0) { + ocfs2_error(inode->i_sb, "Inode %lu has an empty " + "extent record, depth %u\n", inode->i_ino, + le16_to_cpu(root_el->l_tree_depth)); + status = -EROFS; + goto bail; + } + trunc_cpos = le32_to_cpu(rec->e_cpos); + trunc_len = 0; + blkno = 0; + } else if (le32_to_cpu(rec->e_cpos) >= new_highest_cpos) { + /* + * Truncate entire record. + */ + trunc_cpos = le32_to_cpu(rec->e_cpos); + trunc_len = ocfs2_rec_clusters(el, rec); + blkno = le64_to_cpu(rec->e_blkno); } else if (range > new_highest_cpos) { - clusters_to_del = (ocfs2_rec_clusters(el, &el->l_recs[i]) + - le32_to_cpu(el->l_recs[i].e_cpos)) - - new_highest_cpos; - blkno = le64_to_cpu(el->l_recs[i].e_blkno) + - ocfs2_clusters_to_blocks(inode->i_sb, - ocfs2_rec_clusters(el, &el->l_recs[i]) - - clusters_to_del); + /* + * Partial truncate. it also should be + * the last truncate we're doing. + */ + trunc_cpos = new_highest_cpos; + trunc_len = range - new_highest_cpos; + coff = new_highest_cpos - le32_to_cpu(rec->e_cpos); + blkno = le64_to_cpu(rec->e_blkno) + + ocfs2_clusters_to_blocks(inode->i_sb, coff); } else { + /* + * Truncate completed, leave happily. + */ status = 0; goto bail; } - mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n", - clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr); - - if (el->l_recs[i].e_flags & OCFS2_EXT_REFCOUNTED && clusters_to_del) { - BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & - OCFS2_HAS_REFCOUNT_FL)); - - status = ocfs2_lock_refcount_tree(osb, - le64_to_cpu(di->i_refcount_loc), - 1, &ref_tree, NULL); - if (status) { - mlog_errno(status); - goto bail; - } - - status = ocfs2_prepare_refcount_change_for_del(inode, fe_bh, - blkno, - clusters_to_del, - &credits, - &meta_ac); - if (status < 0) { - mlog_errno(status); - goto bail; - } - } - - mutex_lock(&tl_inode->i_mutex); - tl_sem = 1; - /* ocfs2_truncate_log_needs_flush guarantees us at least one - * record is free for use. If there isn't any, we flush to get - * an empty truncate log. */ - if (ocfs2_truncate_log_needs_flush(osb)) { - status = __ocfs2_flush_truncate_log(osb); - if (status < 0) { - mlog_errno(status); - goto bail; - } - } + phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno); - credits += ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del, - (struct ocfs2_dinode *)fe_bh->b_data, - el); - handle = ocfs2_start_trans(osb, credits); - if (IS_ERR(handle)) { - status = PTR_ERR(handle); - handle = NULL; - mlog_errno(status); - goto bail; - } - - status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle, - tc, path, meta_ac); + status = ocfs2_remove_btree_range(inode, &et, trunc_cpos, + phys_cpos, trunc_len, flags, &dealloc, + refcount_loc); if (status < 0) { mlog_errno(status); goto bail; } - mutex_unlock(&tl_inode->i_mutex); - tl_sem = 0; - - ocfs2_commit_trans(osb, handle); - handle = NULL; - ocfs2_reinit_path(path, 1); - if (meta_ac) { - ocfs2_free_alloc_context(meta_ac); - meta_ac = NULL; - } - - if (ref_tree) { - ocfs2_unlock_refcount_tree(osb, ref_tree, 1); - ref_tree = NULL; - } - /* * The check above will catch the case where we've truncated * away all allocation. @@ -7585,25 +7130,10 @@ bail: ocfs2_schedule_truncate_log_flush(osb, 1); - if (tl_sem) - mutex_unlock(&tl_inode->i_mutex); - - if (handle) - ocfs2_commit_trans(osb, handle); - - if (meta_ac) - ocfs2_free_alloc_context(meta_ac); - - if (ref_tree) - ocfs2_unlock_refcount_tree(osb, ref_tree, 1); - - ocfs2_run_deallocs(osb, &tc->tc_dealloc); + ocfs2_run_deallocs(osb, &dealloc); ocfs2_free_path(path); - /* This will drop the ext_alloc cluster lock for us */ - ocfs2_free_truncate_context(tc); - mlog_exit(status); return status; } diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h index 1db4359ccb9..55762b554b9 100644 --- a/fs/ocfs2/alloc.h +++ b/fs/ocfs2/alloc.h @@ -140,8 +140,9 @@ int ocfs2_remove_extent(handle_t *handle, struct ocfs2_extent_tree *et, struct ocfs2_cached_dealloc_ctxt *dealloc); int ocfs2_remove_btree_range(struct inode *inode, struct ocfs2_extent_tree *et, - u32 cpos, u32 phys_cpos, u32 len, - struct ocfs2_cached_dealloc_ctxt *dealloc); + u32 cpos, u32 phys_cpos, u32 len, int flags, + struct ocfs2_cached_dealloc_ctxt *dealloc, + u64 refcount_loc); int ocfs2_num_free_extents(struct ocfs2_super *osb, struct ocfs2_extent_tree *et); @@ -209,7 +210,7 @@ static inline void ocfs2_init_dealloc_ctxt(struct ocfs2_cached_dealloc_ctxt *c) int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, u64 blkno, unsigned int bit); int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, - int type, int slot, u64 blkno, + int type, int slot, u64 suballoc, u64 blkno, unsigned int bit); static inline int ocfs2_dealloc_has_cluster(struct ocfs2_cached_dealloc_ctxt *c) { @@ -233,8 +234,7 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb, struct ocfs2_truncate_context **tc); int ocfs2_commit_truncate(struct ocfs2_super *osb, struct inode *inode, - struct buffer_head *fe_bh, - struct ocfs2_truncate_context *tc); + struct buffer_head *di_bh); int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh, unsigned int start, unsigned int end, int trunc); @@ -319,6 +319,8 @@ int ocfs2_journal_access_path(struct ocfs2_caching_info *ci, struct ocfs2_path *path); int ocfs2_find_cpos_for_right_leaf(struct super_block *sb, struct ocfs2_path *path, u32 *cpos); +int ocfs2_find_cpos_for_left_leaf(struct super_block *sb, + struct ocfs2_path *path, u32 *cpos); int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et, struct ocfs2_path *left, struct ocfs2_path *right); diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 21441ddb550..3623ca20cc1 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -1735,6 +1735,9 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, goto out; } + if (data_ac) + data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv; + credits = ocfs2_calc_extend_credits(inode->i_sb, &di->id2.i_list, clusters_to_alloc); diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c index b7428c5d0d3..ec6d1233959 100644 --- a/fs/ocfs2/blockcheck.c +++ b/fs/ocfs2/blockcheck.c @@ -403,7 +403,7 @@ void ocfs2_block_check_compute(void *data, size_t blocksize, * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no * larger than 16 bits. */ - BUG_ON(ecc > USHORT_MAX); + BUG_ON(ecc > USHRT_MAX); bc->bc_crc32e = cpu_to_le32(crc); bc->bc_ecc = cpu_to_le16((u16)ecc); @@ -508,7 +508,7 @@ void ocfs2_block_check_compute_bhs(struct buffer_head **bhs, int nr, * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no * larger than 16 bits. */ - BUG_ON(ecc > USHORT_MAX); + BUG_ON(ecc > USHRT_MAX); bc->bc_crc32e = cpu_to_le32(crc); bc->bc_ecc = cpu_to_le16((u16)ecc); diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c index 3bb928a2bf7..c7fba396392 100644 --- a/fs/ocfs2/cluster/masklog.c +++ b/fs/ocfs2/cluster/masklog.c @@ -116,6 +116,7 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = { define_mask(ERROR), define_mask(NOTICE), define_mask(KTHREAD), + define_mask(RESERVATIONS), }; static struct attribute *mlog_attr_ptrs[MLOG_MAX_BITS] = {NULL, }; diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h index 3dfddbec32f..fd96e2a2fa5 100644 --- a/fs/ocfs2/cluster/masklog.h +++ b/fs/ocfs2/cluster/masklog.h @@ -119,6 +119,7 @@ #define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */ #define ML_NOTICE 0x0000000200000000ULL /* setn to KERN_NOTICE */ #define ML_KTHREAD 0x0000000400000000ULL /* kernel thread activity */ +#define ML_RESERVATIONS 0x0000000800000000ULL /* ocfs2 alloc reservations */ #define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE) #define MLOG_INITIAL_NOT_MASK (ML_ENTRY|ML_EXIT) diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 73e743eea2c..aa75ca3f78d 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -583,6 +583,9 @@ static void o2net_state_change(struct sock *sk) o2net_sc_queue_work(sc, &sc->sc_connect_work); break; default: + printk(KERN_INFO "o2net: connection to " SC_NODEF_FMT + " shutdown, state %d\n", + SC_NODEF_ARGS(sc), sk->sk_state); o2net_sc_queue_work(sc, &sc->sc_shutdown_work); break; } diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index efd77d071c8..f04ebcfffc4 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -1194,7 +1194,7 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir, else de->inode = 0; dir->i_version++; - status = ocfs2_journal_dirty(handle, bh); + ocfs2_journal_dirty(handle, bh); goto bail; } i += le16_to_cpu(de->rec_len); @@ -1752,7 +1752,7 @@ int __ocfs2_add_entry(handle_t *handle, ocfs2_recalc_free_list(dir, handle, lookup); dir->i_version++; - status = ocfs2_journal_dirty(handle, insert_bh); + ocfs2_journal_dirty(handle, insert_bh); retval = 0; goto bail; } @@ -2297,12 +2297,7 @@ static int ocfs2_fill_new_dir_id(struct ocfs2_super *osb, } ocfs2_fill_initial_dirents(inode, parent, data->id_data, size); - ocfs2_journal_dirty(handle, di_bh); - if (ret) { - mlog_errno(ret); - goto out; - } i_size_write(inode, size); inode->i_nlink = 2; @@ -2366,11 +2361,7 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb, ocfs2_init_dir_trailer(inode, new_bh, size); } - status = ocfs2_journal_dirty(handle, new_bh); - if (status < 0) { - mlog_errno(status); - goto bail; - } + ocfs2_journal_dirty(handle, new_bh); i_size_write(inode, inode->i_sb->s_blocksize); inode->i_nlink = 2; @@ -2404,15 +2395,15 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb, int ret; struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; u16 dr_suballoc_bit; - u64 dr_blkno; + u64 suballoc_loc, dr_blkno; unsigned int num_bits; struct buffer_head *dx_root_bh = NULL; struct ocfs2_dx_root_block *dx_root; struct ocfs2_dir_block_trailer *trailer = ocfs2_trailer_from_bh(dirdata_bh, dir->i_sb); - ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1, &dr_suballoc_bit, - &num_bits, &dr_blkno); + ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc, + &dr_suballoc_bit, &num_bits, &dr_blkno); if (ret) { mlog_errno(ret); goto out; @@ -2440,6 +2431,7 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb, memset(dx_root, 0, osb->sb->s_blocksize); strcpy(dx_root->dr_signature, OCFS2_DX_ROOT_SIGNATURE); dx_root->dr_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot); + dx_root->dr_suballoc_loc = cpu_to_le64(suballoc_loc); dx_root->dr_suballoc_bit = cpu_to_le16(dr_suballoc_bit); dx_root->dr_fs_generation = cpu_to_le32(osb->fs_generation); dx_root->dr_blkno = cpu_to_le64(dr_blkno); @@ -2458,10 +2450,7 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb, dx_root->dr_list.l_count = cpu_to_le16(ocfs2_extent_recs_per_dx_root(osb->sb)); } - - ret = ocfs2_journal_dirty(handle, dx_root_bh); - if (ret) - mlog_errno(ret); + ocfs2_journal_dirty(handle, dx_root_bh); ret = ocfs2_journal_access_di(handle, INODE_CACHE(dir), di_bh, OCFS2_JOURNAL_ACCESS_CREATE); @@ -2475,9 +2464,7 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb, OCFS2_I(dir)->ip_dyn_features |= OCFS2_INDEXED_DIR_FL; di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features); - ret = ocfs2_journal_dirty(handle, di_bh); - if (ret) - mlog_errno(ret); + ocfs2_journal_dirty(handle, di_bh); *ret_dx_root_bh = dx_root_bh; dx_root_bh = NULL; @@ -2558,7 +2545,7 @@ static int __ocfs2_dx_dir_new_cluster(struct inode *dir, * chance of contiguousness as the directory grows in number * of entries. */ - ret = __ocfs2_claim_clusters(osb, handle, data_ac, 1, 1, &phys, &num); + ret = __ocfs2_claim_clusters(handle, data_ac, 1, 1, &phys, &num); if (ret) { mlog_errno(ret); goto out; @@ -2991,7 +2978,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, * if we only get one now, that's enough to continue. The rest * will be claimed after the conversion to extents. */ - ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, &len); + if (ocfs2_dir_resv_allowed(osb)) + data_ac->ac_resv = &oi->ip_la_data_resv; + ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off, &len); if (ret) { mlog_errno(ret); goto out_commit; @@ -3034,11 +3023,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, ocfs2_init_dir_trailer(dir, dirdata_bh, i); } - ret = ocfs2_journal_dirty(handle, dirdata_bh); - if (ret) { - mlog_errno(ret); - goto out_commit; - } + ocfs2_journal_dirty(handle, dirdata_bh); if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) { /* @@ -3104,11 +3089,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, */ dir->i_blocks = ocfs2_inode_sector_count(dir); - ret = ocfs2_journal_dirty(handle, di_bh); - if (ret) { - mlog_errno(ret); - goto out_commit; - } + ocfs2_journal_dirty(handle, di_bh); if (ocfs2_supports_indexed_dirs(osb)) { ret = ocfs2_dx_dir_attach_index(osb, handle, dir, di_bh, @@ -3138,7 +3119,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, * pass. Claim the 2nd cluster as a separate extent. */ if (alloc > len) { - ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, + ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off, &len); if (ret) { mlog_errno(ret); @@ -3369,6 +3350,9 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb, goto bail; } + if (ocfs2_dir_resv_allowed(osb)) + data_ac->ac_resv = &OCFS2_I(dir)->ip_la_data_resv; + credits = ocfs2_calc_extend_credits(sb, el, 1); } else { spin_unlock(&OCFS2_I(dir)->ip_lock); @@ -3423,11 +3407,7 @@ do_extend: } else { de->rec_len = cpu_to_le16(sb->s_blocksize); } - status = ocfs2_journal_dirty(handle, new_bh); - if (status < 0) { - mlog_errno(status); - goto bail; - } + ocfs2_journal_dirty(handle, new_bh); dir_i_size += dir->i_sb->s_blocksize; i_size_write(dir, dir_i_size); @@ -3906,11 +3886,7 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir, sizeof(struct ocfs2_dx_entry), dx_leaf_sort_cmp, dx_leaf_sort_swap); - ret = ocfs2_journal_dirty(handle, dx_leaf_bh); - if (ret) { - mlog_errno(ret); - goto out_commit; - } + ocfs2_journal_dirty(handle, dx_leaf_bh); ret = ocfs2_dx_dir_find_leaf_split(dx_leaf, leaf_cpos, insert_hash, &split_hash); @@ -4490,7 +4466,10 @@ static int ocfs2_dx_dir_remove_index(struct inode *dir, blk = le64_to_cpu(dx_root->dr_blkno); bit = le16_to_cpu(dx_root->dr_suballoc_bit); - bg_blkno = ocfs2_which_suballoc_group(blk, bit); + if (dx_root->dr_suballoc_loc) + bg_blkno = le64_to_cpu(dx_root->dr_suballoc_loc); + else + bg_blkno = ocfs2_which_suballoc_group(blk, bit); ret = ocfs2_free_suballoc_bits(handle, dx_alloc_inode, dx_alloc_bh, bit, bg_blkno, 1); if (ret) @@ -4551,8 +4530,8 @@ int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh) p_cpos = ocfs2_blocks_to_clusters(dir->i_sb, blkno); - ret = ocfs2_remove_btree_range(dir, &et, cpos, p_cpos, clen, - &dealloc); + ret = ocfs2_remove_btree_range(dir, &et, cpos, p_cpos, clen, 0, + &dealloc, 0); if (ret) { mlog_errno(ret); goto out; diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c index 12d5eb78a11..f4499915683 100644 --- a/fs/ocfs2/dlm/dlmast.c +++ b/fs/ocfs2/dlm/dlmast.c @@ -88,7 +88,7 @@ static int dlm_should_cancel_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock) return 0; } -static void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock) +void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock) { mlog_entry_void(); @@ -145,7 +145,7 @@ void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock) } -static void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock) +void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock) { mlog_entry_void(); @@ -451,7 +451,9 @@ int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, ret = o2net_send_message_vec(DLM_PROXY_AST_MSG, dlm->key, vec, veclen, lock->ml.node, &status); if (ret < 0) - mlog_errno(ret); + mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " + "node %u\n", ret, DLM_PROXY_AST_MSG, dlm->key, + lock->ml.node); else { if (status == DLM_RECOVERING) { mlog(ML_ERROR, "sent AST to node %u, it thinks this " diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index 0102be35980..4b6ae2c13b4 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -37,7 +37,7 @@ #define DLM_THREAD_SHUFFLE_INTERVAL 5 // flush everything every 5 passes #define DLM_THREAD_MS 200 // flush at least every 200 ms -#define DLM_HASH_SIZE_DEFAULT (1 << 14) +#define DLM_HASH_SIZE_DEFAULT (1 << 17) #if DLM_HASH_SIZE_DEFAULT < PAGE_SIZE # define DLM_HASH_PAGES 1 #else @@ -904,6 +904,8 @@ void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm, void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock); void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock); +void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock); +void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock); void dlm_do_local_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, struct dlm_lock *lock); diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c index 90803b47cd8..9f30491e5e8 100644 --- a/fs/ocfs2/dlm/dlmconvert.c +++ b/fs/ocfs2/dlm/dlmconvert.c @@ -390,7 +390,9 @@ static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm, } else if (ret != DLM_NORMAL && ret != DLM_NOTQUEUED) dlm_error(ret); } else { - mlog_errno(tmpret); + mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " + "node %u\n", tmpret, DLM_CONVERT_LOCK_MSG, dlm->key, + res->owner); if (dlm_is_host_down(tmpret)) { /* instead of logging the same network error over * and over, sleep here and wait for the heartbeat diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 988c9055fd4..6b5a492e174 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -511,7 +511,7 @@ static void __dlm_print_nodes(struct dlm_ctxt *dlm) assert_spin_locked(&dlm->spinlock); - printk(KERN_INFO "ocfs2_dlm: Nodes in domain (\"%s\"): ", dlm->name); + printk(KERN_NOTICE "o2dlm: Nodes in domain %s: ", dlm->name); while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES, node + 1)) < O2NM_MAX_NODES) { @@ -534,7 +534,7 @@ static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data, node = exit_msg->node_idx; - printk(KERN_INFO "ocfs2_dlm: Node %u leaves domain %s\n", node, dlm->name); + printk(KERN_NOTICE "o2dlm: Node %u leaves domain %s\n", node, dlm->name); spin_lock(&dlm->spinlock); clear_bit(node, dlm->domain_map); @@ -565,7 +565,9 @@ static int dlm_send_one_domain_exit(struct dlm_ctxt *dlm, status = o2net_send_message(DLM_EXIT_DOMAIN_MSG, dlm->key, &leave_msg, sizeof(leave_msg), node, NULL); - + if (status < 0) + mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " + "node %u\n", status, DLM_EXIT_DOMAIN_MSG, dlm->key, node); mlog(0, "status return %d from o2net_send_message\n", status); return status; @@ -904,7 +906,7 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data, set_bit(assert->node_idx, dlm->domain_map); __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); - printk(KERN_INFO "ocfs2_dlm: Node %u joins domain %s\n", + printk(KERN_NOTICE "o2dlm: Node %u joins domain %s\n", assert->node_idx, dlm->name); __dlm_print_nodes(dlm); @@ -962,7 +964,9 @@ static int dlm_send_one_join_cancel(struct dlm_ctxt *dlm, &cancel_msg, sizeof(cancel_msg), node, NULL); if (status < 0) { - mlog_errno(status); + mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " + "node %u\n", status, DLM_CANCEL_JOIN_MSG, DLM_MOD_KEY, + node); goto bail; } @@ -1029,10 +1033,11 @@ static int dlm_request_join(struct dlm_ctxt *dlm, byte_copymap(join_msg.node_map, dlm->live_nodes_map, O2NM_MAX_NODES); status = o2net_send_message(DLM_QUERY_JOIN_MSG, DLM_MOD_KEY, &join_msg, - sizeof(join_msg), node, - &join_resp); + sizeof(join_msg), node, &join_resp); if (status < 0 && status != -ENOPROTOOPT) { - mlog_errno(status); + mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " + "node %u\n", status, DLM_QUERY_JOIN_MSG, DLM_MOD_KEY, + node); goto bail; } dlm_query_join_wire_to_packet(join_resp, &packet); @@ -1103,7 +1108,9 @@ static int dlm_send_one_join_assert(struct dlm_ctxt *dlm, &assert_msg, sizeof(assert_msg), node, NULL); if (status < 0) - mlog_errno(status); + mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " + "node %u\n", status, DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY, + node); return status; } @@ -1516,7 +1523,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, goto leave; } - dlm->name = kmalloc(strlen(domain) + 1, GFP_KERNEL); + dlm->name = kstrdup(domain, GFP_KERNEL); if (dlm->name == NULL) { mlog_errno(-ENOMEM); kfree(dlm); @@ -1550,7 +1557,6 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, for (i = 0; i < DLM_HASH_BUCKETS; i++) INIT_HLIST_HEAD(dlm_master_hash(dlm, i)); - strcpy(dlm->name, domain); dlm->key = key; dlm->node_num = o2nm_this_node(); diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c index 73333777267..69cf369961c 100644 --- a/fs/ocfs2/dlm/dlmlock.c +++ b/fs/ocfs2/dlm/dlmlock.c @@ -329,7 +329,9 @@ static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm, BUG(); } } else { - mlog_errno(tmpret); + mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " + "node %u\n", tmpret, DLM_CREATE_LOCK_MSG, dlm->key, + res->owner); if (dlm_is_host_down(tmpret)) { ret = DLM_RECOVERING; mlog(0, "node %u died so returning DLM_RECOVERING " @@ -429,7 +431,7 @@ struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie, struct dlm_lock *lock; int kernel_allocated = 0; - lock = (struct dlm_lock *) kmem_cache_zalloc(dlm_lock_cache, GFP_NOFS); + lock = kmem_cache_zalloc(dlm_lock_cache, GFP_NOFS); if (!lock) return NULL; diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 9289b4357d2..4a7506a4e31 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -617,13 +617,11 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm, { struct dlm_lock_resource *res = NULL; - res = (struct dlm_lock_resource *) - kmem_cache_zalloc(dlm_lockres_cache, GFP_NOFS); + res = kmem_cache_zalloc(dlm_lockres_cache, GFP_NOFS); if (!res) goto error; - res->lockname.name = (char *) - kmem_cache_zalloc(dlm_lockname_cache, GFP_NOFS); + res->lockname.name = kmem_cache_zalloc(dlm_lockname_cache, GFP_NOFS); if (!res->lockname.name) goto error; @@ -757,8 +755,7 @@ lookup: spin_unlock(&dlm->spinlock); mlog(0, "allocating a new resource\n"); /* nothing found and we need to allocate one. */ - alloc_mle = (struct dlm_master_list_entry *) - kmem_cache_alloc(dlm_mle_cache, GFP_NOFS); + alloc_mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS); if (!alloc_mle) goto leave; res = dlm_new_lockres(dlm, lockid, namelen); @@ -1542,8 +1539,7 @@ way_up_top: spin_unlock(&dlm->master_lock); spin_unlock(&dlm->spinlock); - mle = (struct dlm_master_list_entry *) - kmem_cache_alloc(dlm_mle_cache, GFP_NOFS); + mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS); if (!mle) { response = DLM_MASTER_RESP_ERROR; mlog_errno(-ENOMEM); @@ -1666,7 +1662,9 @@ again: tmpret = o2net_send_message(DLM_ASSERT_MASTER_MSG, dlm->key, &assert, sizeof(assert), to, &r); if (tmpret < 0) { - mlog(0, "assert_master returned %d!\n", tmpret); + mlog(ML_ERROR, "Error %d when sending message %u (key " + "0x%x) to node %u\n", tmpret, + DLM_ASSERT_MASTER_MSG, dlm->key, to); if (!dlm_is_host_down(tmpret)) { mlog(ML_ERROR, "unhandled error=%d!\n", tmpret); BUG(); @@ -2205,7 +2203,9 @@ int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) ret = o2net_send_message(DLM_DEREF_LOCKRES_MSG, dlm->key, &deref, sizeof(deref), res->owner, &r); if (ret < 0) - mlog_errno(ret); + mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " + "node %u\n", ret, DLM_DEREF_LOCKRES_MSG, dlm->key, + res->owner); else if (r < 0) { /* BAD. other node says I did not have a ref. */ mlog(ML_ERROR,"while dropping ref on %s:%.*s " @@ -2452,8 +2452,7 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm, goto leave; } - mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache, - GFP_NOFS); + mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS); if (!mle) { mlog_errno(ret); goto leave; @@ -2975,7 +2974,9 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm, &migrate, sizeof(migrate), nodenum, &status); if (ret < 0) { - mlog(0, "migrate_request returned %d!\n", ret); + mlog(ML_ERROR, "Error %d when sending message %u (key " + "0x%x) to node %u\n", ret, DLM_MIGRATE_REQUEST_MSG, + dlm->key, nodenum); if (!dlm_is_host_down(ret)) { mlog(ML_ERROR, "unhandled error=%d!\n", ret); BUG(); @@ -3033,8 +3034,7 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data, hash = dlm_lockid_hash(name, namelen); /* preallocate.. if this fails, abort */ - mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache, - GFP_NOFS); + mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS); if (!mle) { ret = -ENOMEM; diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index b4f99de2caf..f8b75ce4be7 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -803,7 +803,9 @@ static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from, /* negative status is handled by caller */ if (ret < 0) - mlog_errno(ret); + mlog(ML_ERROR, "Error %d when sending message %u (key " + "0x%x) to node %u\n", ret, DLM_LOCK_REQUEST_MSG, + dlm->key, request_from); // return from here, then // sleep until all received or error @@ -955,10 +957,10 @@ static int dlm_send_all_done_msg(struct dlm_ctxt *dlm, u8 dead_node, u8 send_to) ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg, sizeof(done_msg), send_to, &tmpret); if (ret < 0) { + mlog(ML_ERROR, "Error %d when sending message %u (key " + "0x%x) to node %u\n", ret, DLM_RECO_DATA_DONE_MSG, + dlm->key, send_to); if (!dlm_is_host_down(ret)) { - mlog_errno(ret); - mlog(ML_ERROR, "%s: unknown error sending data-done " - "to %u\n", dlm->name, send_to); BUG(); } } else @@ -1126,7 +1128,9 @@ static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm, if (ret < 0) { /* XXX: negative status is not handled. * this will end up killing this node. */ - mlog_errno(ret); + mlog(ML_ERROR, "Error %d when sending message %u (key " + "0x%x) to node %u\n", ret, DLM_MIG_LOCKRES_MSG, + dlm->key, send_to); } else { /* might get an -ENOMEM back here */ ret = status; @@ -1642,7 +1646,9 @@ int dlm_do_master_requery(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, &req, sizeof(req), nodenum, &status); /* XXX: negative status not handled properly here. */ if (ret < 0) - mlog_errno(ret); + mlog(ML_ERROR, "Error %d when sending message %u (key " + "0x%x) to node %u\n", ret, DLM_MASTER_REQUERY_MSG, + dlm->key, nodenum); else { BUG_ON(status < 0); BUG_ON(status > DLM_LOCK_RES_OWNER_UNKNOWN); @@ -2640,7 +2646,7 @@ retry: if (dlm_is_host_down(ret)) { /* node is down. not involved in recovery * so just keep going */ - mlog(0, "%s: node %u was down when sending " + mlog(ML_NOTICE, "%s: node %u was down when sending " "begin reco msg (%d)\n", dlm->name, nodenum, ret); ret = 0; } @@ -2660,11 +2666,12 @@ retry: } if (ret < 0) { struct dlm_lock_resource *res; + /* this is now a serious problem, possibly ENOMEM * in the network stack. must retry */ mlog_errno(ret); mlog(ML_ERROR, "begin reco of dlm %s to node %u " - " returned %d\n", dlm->name, nodenum, ret); + "returned %d\n", dlm->name, nodenum, ret); res = dlm_lookup_lockres(dlm, DLM_RECOVERY_LOCK_NAME, DLM_RECOVERY_LOCK_NAME_LEN); if (res) { @@ -2789,7 +2796,9 @@ stage2: if (ret >= 0) ret = status; if (ret < 0) { - mlog_errno(ret); + mlog(ML_ERROR, "Error %d when sending message %u (key " + "0x%x) to node %u\n", ret, DLM_FINALIZE_RECO_MSG, + dlm->key, nodenum); if (dlm_is_host_down(ret)) { /* this has no effect on this recovery * session, so set the status to zero to diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index 11a6d1fd1d3..d4f73ca68fe 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -309,6 +309,7 @@ static void dlm_shuffle_lists(struct dlm_ctxt *dlm, * spinlock, and because we know that it is not migrating/ * recovering/in-progress, it is fine to reserve asts and * basts right before queueing them all throughout */ + assert_spin_locked(&dlm->ast_lock); assert_spin_locked(&res->spinlock); BUG_ON((res->state & (DLM_LOCK_RES_MIGRATING| DLM_LOCK_RES_RECOVERING| @@ -337,7 +338,7 @@ converting: /* queue the BAST if not already */ if (lock->ml.highest_blocked == LKM_IVMODE) { __dlm_lockres_reserve_ast(res); - dlm_queue_bast(dlm, lock); + __dlm_queue_bast(dlm, lock); } /* update the highest_blocked if needed */ if (lock->ml.highest_blocked < target->ml.convert_type) @@ -355,7 +356,7 @@ converting: can_grant = 0; if (lock->ml.highest_blocked == LKM_IVMODE) { __dlm_lockres_reserve_ast(res); - dlm_queue_bast(dlm, lock); + __dlm_queue_bast(dlm, lock); } if (lock->ml.highest_blocked < target->ml.convert_type) lock->ml.highest_blocked = @@ -383,7 +384,7 @@ converting: spin_unlock(&target->spinlock); __dlm_lockres_reserve_ast(res); - dlm_queue_ast(dlm, target); + __dlm_queue_ast(dlm, target); /* go back and check for more */ goto converting; } @@ -402,7 +403,7 @@ blocked: can_grant = 0; if (lock->ml.highest_blocked == LKM_IVMODE) { __dlm_lockres_reserve_ast(res); - dlm_queue_bast(dlm, lock); + __dlm_queue_bast(dlm, lock); } if (lock->ml.highest_blocked < target->ml.type) lock->ml.highest_blocked = target->ml.type; @@ -418,7 +419,7 @@ blocked: can_grant = 0; if (lock->ml.highest_blocked == LKM_IVMODE) { __dlm_lockres_reserve_ast(res); - dlm_queue_bast(dlm, lock); + __dlm_queue_bast(dlm, lock); } if (lock->ml.highest_blocked < target->ml.type) lock->ml.highest_blocked = target->ml.type; @@ -444,7 +445,7 @@ blocked: spin_unlock(&target->spinlock); __dlm_lockres_reserve_ast(res); - dlm_queue_ast(dlm, target); + __dlm_queue_ast(dlm, target); /* go back and check for more */ goto converting; } @@ -674,6 +675,7 @@ static int dlm_thread(void *data) /* lockres can be re-dirtied/re-added to the * dirty_list in this gap, but that is ok */ + spin_lock(&dlm->ast_lock); spin_lock(&res->spinlock); if (res->owner != dlm->node_num) { __dlm_print_one_lock_resource(res); @@ -694,6 +696,7 @@ static int dlm_thread(void *data) /* move it to the tail and keep going */ res->state &= ~DLM_LOCK_RES_DIRTY; spin_unlock(&res->spinlock); + spin_unlock(&dlm->ast_lock); mlog(0, "delaying list shuffling for in-" "progress lockres %.*s, state=%d\n", res->lockname.len, res->lockname.name, @@ -715,6 +718,7 @@ static int dlm_thread(void *data) dlm_shuffle_lists(dlm, res); res->state &= ~DLM_LOCK_RES_DIRTY; spin_unlock(&res->spinlock); + spin_unlock(&dlm->ast_lock); dlm_lockres_calc_usage(dlm, res); diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c index b47c1b92b82..817287c6a6d 100644 --- a/fs/ocfs2/dlm/dlmunlock.c +++ b/fs/ocfs2/dlm/dlmunlock.c @@ -354,7 +354,8 @@ static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm, mlog(0, "master was in-progress. retry\n"); ret = status; } else { - mlog_errno(tmpret); + mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " + "node %u\n", tmpret, DLM_UNLOCK_LOCK_MSG, dlm->key, owner); if (dlm_is_host_down(tmpret)) { /* NOTE: this seems strange, but it is what we want. * when the master goes down during a cancel or diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 50c4ee805da..39eb16ac5f9 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -3897,7 +3897,8 @@ static int ocfs2_refresh_qinfo(struct ocfs2_mem_dqinfo *oinfo) oinfo->dqi_gi.dqi_free_entry = be32_to_cpu(lvb->lvb_free_entry); } else { - status = ocfs2_read_quota_block(oinfo->dqi_gqinode, 0, &bh); + status = ocfs2_read_quota_phys_block(oinfo->dqi_gqinode, + oinfo->dqi_giblk, &bh); if (status) { mlog_errno(status); goto bail; diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index a5fbd9cea96..97e54b9e654 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -278,10 +278,7 @@ int ocfs2_update_inode_atime(struct inode *inode, inode->i_atime = CURRENT_TIME; di->i_atime = cpu_to_le64(inode->i_atime.tv_sec); di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); - - ret = ocfs2_journal_dirty(handle, bh); - if (ret < 0) - mlog_errno(ret); + ocfs2_journal_dirty(handle, bh); out_commit: ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); @@ -430,9 +427,7 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec); di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); - status = ocfs2_journal_dirty(handle, fe_bh); - if (status < 0) - mlog_errno(status); + ocfs2_journal_dirty(handle, fe_bh); out_commit: ocfs2_commit_trans(osb, handle); @@ -449,7 +444,6 @@ static int ocfs2_truncate_file(struct inode *inode, int status = 0; struct ocfs2_dinode *fe = NULL; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - struct ocfs2_truncate_context *tc = NULL; mlog_entry("(inode = %llu, new_i_size = %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno, @@ -488,6 +482,9 @@ static int ocfs2_truncate_file(struct inode *inode, down_write(&OCFS2_I(inode)->ip_alloc_sem); + ocfs2_resv_discard(&osb->osb_la_resmap, + &OCFS2_I(inode)->ip_la_data_resv); + /* * The inode lock forced other nodes to sync and drop their * pages, which (correctly) happens even if we have a truncate @@ -517,13 +514,7 @@ static int ocfs2_truncate_file(struct inode *inode, goto bail_unlock_sem; } - status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc); - if (status < 0) { - mlog_errno(status); - goto bail_unlock_sem; - } - - status = ocfs2_commit_truncate(osb, inode, di_bh, tc); + status = ocfs2_commit_truncate(osb, inode, di_bh); if (status < 0) { mlog_errno(status); goto bail_unlock_sem; @@ -666,11 +657,7 @@ restarted_transaction: goto leave; } - status = ocfs2_journal_dirty(handle, bh); - if (status < 0) { - mlog_errno(status); - goto leave; - } + ocfs2_journal_dirty(handle, bh); spin_lock(&OCFS2_I(inode)->ip_lock); clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters); @@ -946,9 +933,8 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) struct ocfs2_super *osb = OCFS2_SB(sb); struct buffer_head *bh = NULL; handle_t *handle = NULL; - int qtype; - struct dquot *transfer_from[MAXQUOTAS] = { }; struct dquot *transfer_to[MAXQUOTAS] = { }; + int qtype; mlog_entry("(0x%p, '%.*s')\n", dentry, dentry->d_name.len, dentry->d_name.name); @@ -979,10 +965,10 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) if (status) return status; + if (is_quota_modification(inode, attr)) + dquot_initialize(inode); size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE; if (size_change) { - dquot_initialize(inode); - status = ocfs2_rw_lock(inode, 1); if (status < 0) { mlog_errno(status); @@ -1032,9 +1018,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) { transfer_to[USRQUOTA] = dqget(sb, attr->ia_uid, USRQUOTA); - transfer_from[USRQUOTA] = dqget(sb, inode->i_uid, - USRQUOTA); - if (!transfer_to[USRQUOTA] || !transfer_from[USRQUOTA]) { + if (!transfer_to[USRQUOTA]) { status = -ESRCH; goto bail_unlock; } @@ -1044,9 +1028,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) { transfer_to[GRPQUOTA] = dqget(sb, attr->ia_gid, GRPQUOTA); - transfer_from[GRPQUOTA] = dqget(sb, inode->i_gid, - GRPQUOTA); - if (!transfer_to[GRPQUOTA] || !transfer_from[GRPQUOTA]) { + if (!transfer_to[GRPQUOTA]) { status = -ESRCH; goto bail_unlock; } @@ -1058,7 +1040,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) mlog_errno(status); goto bail_unlock; } - status = dquot_transfer(inode, attr); + status = __dquot_transfer(inode, transfer_to); if (status < 0) goto bail_commit; } else { @@ -1098,10 +1080,8 @@ bail: brelse(bh); /* Release quota pointers in case we acquired them */ - for (qtype = 0; qtype < MAXQUOTAS; qtype++) { + for (qtype = 0; qtype < MAXQUOTAS; qtype++) dqput(transfer_to[qtype]); - dqput(transfer_from[qtype]); - } if (!status && attr->ia_valid & ATTR_MODE) { status = ocfs2_acl_chmod(inode); @@ -1195,9 +1175,7 @@ static int __ocfs2_write_remove_suid(struct inode *inode, di = (struct ocfs2_dinode *) bh->b_data; di->i_mode = cpu_to_le16(inode->i_mode); - ret = ocfs2_journal_dirty(handle, bh); - if (ret < 0) - mlog_errno(ret); + ocfs2_journal_dirty(handle, bh); out_trans: ocfs2_commit_trans(osb, handle); @@ -1434,16 +1412,90 @@ out: return ret; } +static int ocfs2_find_rec(struct ocfs2_extent_list *el, u32 pos) +{ + int i; + struct ocfs2_extent_rec *rec = NULL; + + for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) { + + rec = &el->l_recs[i]; + + if (le32_to_cpu(rec->e_cpos) < pos) + break; + } + + return i; +} + +/* + * Helper to calculate the punching pos and length in one run, we handle the + * following three cases in order: + * + * - remove the entire record + * - remove a partial record + * - no record needs to be removed (hole-punching completed) +*/ +static void ocfs2_calc_trunc_pos(struct inode *inode, + struct ocfs2_extent_list *el, + struct ocfs2_extent_rec *rec, + u32 trunc_start, u32 *trunc_cpos, + u32 *trunc_len, u32 *trunc_end, + u64 *blkno, int *done) +{ + int ret = 0; + u32 coff, range; + + range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec); + + if (le32_to_cpu(rec->e_cpos) >= trunc_start) { + *trunc_cpos = le32_to_cpu(rec->e_cpos); + /* + * Skip holes if any. + */ + if (range < *trunc_end) + *trunc_end = range; + *trunc_len = *trunc_end - le32_to_cpu(rec->e_cpos); + *blkno = le64_to_cpu(rec->e_blkno); + *trunc_end = le32_to_cpu(rec->e_cpos); + } else if (range > trunc_start) { + *trunc_cpos = trunc_start; + *trunc_len = *trunc_end - trunc_start; + coff = trunc_start - le32_to_cpu(rec->e_cpos); + *blkno = le64_to_cpu(rec->e_blkno) + + ocfs2_clusters_to_blocks(inode->i_sb, coff); + *trunc_end = trunc_start; + } else { + /* + * It may have two following possibilities: + * + * - last record has been removed + * - trunc_start was within a hole + * + * both two cases mean the completion of hole punching. + */ + ret = 1; + } + + *done = ret; +} + static int ocfs2_remove_inode_range(struct inode *inode, struct buffer_head *di_bh, u64 byte_start, u64 byte_len) { - int ret = 0; - u32 trunc_start, trunc_len, cpos, phys_cpos, alloc_size; + int ret = 0, flags = 0, done = 0, i; + u32 trunc_start, trunc_len, trunc_end, trunc_cpos, phys_cpos; + u32 cluster_in_el; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_cached_dealloc_ctxt dealloc; struct address_space *mapping = inode->i_mapping; struct ocfs2_extent_tree et; + struct ocfs2_path *path = NULL; + struct ocfs2_extent_list *el = NULL; + struct ocfs2_extent_rec *rec = NULL; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + u64 blkno, refcount_loc = le64_to_cpu(di->i_refcount_loc); ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh); ocfs2_init_dealloc_ctxt(&dealloc); @@ -1469,17 +1521,35 @@ static int ocfs2_remove_inode_range(struct inode *inode, goto out; } + /* + * For reflinks, we may need to CoW 2 clusters which might be + * partially zero'd later, if hole's start and end offset were + * within one cluster(means is not exactly aligned to clustersize). + */ + + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) { + + ret = ocfs2_cow_file_pos(inode, di_bh, byte_start); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_cow_file_pos(inode, di_bh, byte_start + byte_len); + if (ret) { + mlog_errno(ret); + goto out; + } + } + trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start); - trunc_len = (byte_start + byte_len) >> osb->s_clustersize_bits; - if (trunc_len >= trunc_start) - trunc_len -= trunc_start; - else - trunc_len = 0; + trunc_end = (byte_start + byte_len) >> osb->s_clustersize_bits; + cluster_in_el = trunc_end; - mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u\n", + mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, cend: %u\n", (unsigned long long)OCFS2_I(inode)->ip_blkno, (unsigned long long)byte_start, - (unsigned long long)byte_len, trunc_start, trunc_len); + (unsigned long long)byte_len, trunc_start, trunc_end); ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len); if (ret) { @@ -1487,31 +1557,79 @@ static int ocfs2_remove_inode_range(struct inode *inode, goto out; } - cpos = trunc_start; - while (trunc_len) { - ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, - &alloc_size, NULL); + path = ocfs2_new_path_from_et(&et); + if (!path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + while (trunc_end > trunc_start) { + + ret = ocfs2_find_path(INODE_CACHE(inode), path, + cluster_in_el); if (ret) { mlog_errno(ret); goto out; } - if (alloc_size > trunc_len) - alloc_size = trunc_len; + el = path_leaf_el(path); + + i = ocfs2_find_rec(el, trunc_end); + /* + * Need to go to previous extent block. + */ + if (i < 0) { + if (path->p_tree_depth == 0) + break; - /* Only do work for non-holes */ - if (phys_cpos != 0) { - ret = ocfs2_remove_btree_range(inode, &et, cpos, - phys_cpos, alloc_size, - &dealloc); + ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, + path, + &cluster_in_el); if (ret) { mlog_errno(ret); goto out; } + + /* + * We've reached the leftmost extent block, + * it's safe to leave. + */ + if (cluster_in_el == 0) + break; + + /* + * The 'pos' searched for previous extent block is + * always one cluster less than actual trunc_end. + */ + trunc_end = cluster_in_el + 1; + + ocfs2_reinit_path(path, 1); + + continue; + + } else + rec = &el->l_recs[i]; + + ocfs2_calc_trunc_pos(inode, el, rec, trunc_start, &trunc_cpos, + &trunc_len, &trunc_end, &blkno, &done); + if (done) + break; + + flags = rec->e_flags; + phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno); + + ret = ocfs2_remove_btree_range(inode, &et, trunc_cpos, + phys_cpos, trunc_len, flags, + &dealloc, refcount_loc); + if (ret < 0) { + mlog_errno(ret); + goto out; } - cpos += alloc_size; - trunc_len -= alloc_size; + cluster_in_el = trunc_end; + + ocfs2_reinit_path(path, 1); } ocfs2_truncate_cluster_pages(inode, byte_start, byte_len); diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index af189887201..abb0a95cc71 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -376,6 +376,10 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, OCFS2_I(inode)->ip_last_used_slot = 0; OCFS2_I(inode)->ip_last_used_group = 0; + + if (S_ISDIR(inode->i_mode)) + ocfs2_resv_set_type(&OCFS2_I(inode)->ip_la_data_resv, + OCFS2_RESV_FLAG_DIR); mlog_exit_void(); } @@ -539,7 +543,6 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb, struct buffer_head *fe_bh) { int status = 0; - struct ocfs2_truncate_context *tc = NULL; struct ocfs2_dinode *fe; handle_t *handle = NULL; @@ -582,13 +585,7 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb, ocfs2_commit_trans(osb, handle); handle = NULL; - status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc); - if (status < 0) { - mlog_errno(status); - goto out; - } - - status = ocfs2_commit_truncate(osb, inode, fe_bh, tc); + status = ocfs2_commit_truncate(osb, inode, fe_bh); if (status < 0) { mlog_errno(status); goto out; @@ -659,12 +656,7 @@ static int ocfs2_remove_inode(struct inode *inode, di->i_dtime = cpu_to_le64(CURRENT_TIME.tv_sec); di->i_flags &= cpu_to_le32(~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL)); - - status = ocfs2_journal_dirty(handle, di_bh); - if (status < 0) { - mlog_errno(status); - goto bail_commit; - } + ocfs2_journal_dirty(handle, di_bh); ocfs2_remove_from_cache(INODE_CACHE(inode), di_bh); dquot_free_inode(inode); @@ -980,7 +972,7 @@ static void ocfs2_cleanup_delete_inode(struct inode *inode, void ocfs2_delete_inode(struct inode *inode) { int wipe, status; - sigset_t blocked, oldset; + sigset_t oldset; struct buffer_head *di_bh = NULL; mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino); @@ -1007,13 +999,7 @@ void ocfs2_delete_inode(struct inode *inode) * messaging paths may return us -ERESTARTSYS. Which would * cause us to exit early, resulting in inodes being orphaned * forever. */ - sigfillset(&blocked); - status = sigprocmask(SIG_BLOCK, &blocked, &oldset); - if (status < 0) { - mlog_errno(status); - ocfs2_cleanup_delete_inode(inode, 1); - goto bail; - } + ocfs2_block_signals(&oldset); /* * Synchronize us against ocfs2_get_dentry. We take this in @@ -1087,9 +1073,7 @@ bail_unlock_nfs_sync: ocfs2_nfs_sync_unlock(OCFS2_SB(inode->i_sb), 0); bail_unblock: - status = sigprocmask(SIG_SETMASK, &oldset, NULL); - if (status < 0) - mlog_errno(status); + ocfs2_unblock_signals(&oldset); bail: clear_inode(inode); mlog_exit_void(); @@ -1123,6 +1107,10 @@ void ocfs2_clear_inode(struct inode *inode) ocfs2_mark_lockres_freeing(&oi->ip_inode_lockres); ocfs2_mark_lockres_freeing(&oi->ip_open_lockres); + ocfs2_resv_discard(&OCFS2_SB(inode->i_sb)->osb_la_resmap, + &oi->ip_la_data_resv); + ocfs2_resv_init_once(&oi->ip_la_data_resv); + /* We very well may get a clear_inode before all an inodes * metadata has hit disk. Of course, we can't drop any cluster * locks until the journal has finished with it. The only @@ -1298,13 +1286,8 @@ int ocfs2_mark_inode_dirty(handle_t *handle, fe->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec); fe->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); - status = ocfs2_journal_dirty(handle, bh); - if (status < 0) - mlog_errno(status); - - status = 0; + ocfs2_journal_dirty(handle, bh); leave: - mlog_exit(status); return status; } diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index 0b28e1921a3..9f5f5fcadc4 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -70,6 +70,8 @@ struct ocfs2_inode_info /* Only valid if the inode is the dir. */ u32 ip_last_used_slot; u64 ip_last_used_group; + + struct ocfs2_alloc_reservation ip_la_data_resv; }; /* diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 9336c60e3a3..47878cf1641 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -402,9 +402,7 @@ int ocfs2_commit_trans(struct ocfs2_super *osb, } /* - * 'nblocks' is what you want to add to the current - * transaction. extend_trans will either extend the current handle by - * nblocks, or commit it and start a new one with nblocks credits. + * 'nblocks' is what you want to add to the current transaction. * * This might call jbd2_journal_restart() which will commit dirty buffers * and then restart the transaction. Before calling @@ -422,11 +420,15 @@ int ocfs2_commit_trans(struct ocfs2_super *osb, */ int ocfs2_extend_trans(handle_t *handle, int nblocks) { - int status; + int status, old_nblocks; BUG_ON(!handle); - BUG_ON(!nblocks); + BUG_ON(nblocks < 0); + + if (!nblocks) + return 0; + old_nblocks = handle->h_buffer_credits; mlog_entry_void(); mlog(0, "Trying to extend transaction by %d blocks\n", nblocks); @@ -445,7 +447,8 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks) mlog(0, "jbd2_journal_extend failed, trying " "jbd2_journal_restart\n"); - status = jbd2_journal_restart(handle, nblocks); + status = jbd2_journal_restart(handle, + old_nblocks + nblocks); if (status < 0) { mlog_errno(status); goto bail; @@ -734,8 +737,7 @@ int ocfs2_journal_access(handle_t *handle, struct ocfs2_caching_info *ci, return __ocfs2_journal_access(handle, ci, bh, NULL, type); } -int ocfs2_journal_dirty(handle_t *handle, - struct buffer_head *bh) +void ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh) { int status; @@ -743,13 +745,9 @@ int ocfs2_journal_dirty(handle_t *handle, (unsigned long long)bh->b_blocknr); status = jbd2_journal_dirty_metadata(handle, bh); - if (status < 0) - mlog(ML_ERROR, "Could not dirty metadata buffer. " - "(bh->b_blocknr=%llu)\n", - (unsigned long long)bh->b_blocknr); + BUG_ON(status); - mlog_exit(status); - return status; + mlog_exit_void(); } #define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE) diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 3f74e09b0d8..b5baaa8e710 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -325,8 +325,7 @@ int ocfs2_journal_access(handle_t *handle, struct ocfs2_caching_info *ci, * <modify the bh> * ocfs2_journal_dirty(handle, bh); */ -int ocfs2_journal_dirty(handle_t *handle, - struct buffer_head *bh); +void ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh); /* * Credit Macros: @@ -562,6 +561,18 @@ static inline int ocfs2_calc_group_alloc_credits(struct super_block *sb, return blocks; } +/* + * Allocating a discontiguous block group requires the credits from + * ocfs2_calc_group_alloc_credits() as well as enough credits to fill + * the group descriptor's extent list. The caller already has started + * the transaction with ocfs2_calc_group_alloc_credits(). They extend + * it with these credits. + */ +static inline int ocfs2_calc_bg_discontig_credits(struct super_block *sb) +{ + return ocfs2_extent_recs_per_gd(sb); +} + static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb, unsigned int clusters_to_del, struct ocfs2_dinode *fe, diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index c983715d8d8..3d7419682dc 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -52,7 +52,8 @@ static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc); static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb, struct ocfs2_dinode *alloc, - u32 numbits); + u32 *numbits, + struct ocfs2_alloc_reservation *resv); static void ocfs2_clear_local_alloc(struct ocfs2_dinode *alloc); @@ -74,6 +75,144 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb, static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb, struct inode *local_alloc_inode); +/* + * ocfs2_la_default_mb() - determine a default size, in megabytes of + * the local alloc. + * + * Generally, we'd like to pick as large a local alloc as + * possible. Performance on large workloads tends to scale + * proportionally to la size. In addition to that, the reservations + * code functions more efficiently as it can reserve more windows for + * write. + * + * Some things work against us when trying to choose a large local alloc: + * + * - We need to ensure our sizing is picked to leave enough space in + * group descriptors for other allocations (such as block groups, + * etc). Picking default sizes which are a multiple of 4 could help + * - block groups are allocated in 2mb and 4mb chunks. + * + * - Likewise, we don't want to starve other nodes of bits on small + * file systems. This can easily be taken care of by limiting our + * default to a reasonable size (256M) on larger cluster sizes. + * + * - Some file systems can't support very large sizes - 4k and 8k in + * particular are limited to less than 128 and 256 megabytes respectively. + * + * The following reference table shows group descriptor and local + * alloc maximums at various cluster sizes (4k blocksize) + * + * csize: 4K group: 126M la: 121M + * csize: 8K group: 252M la: 243M + * csize: 16K group: 504M la: 486M + * csize: 32K group: 1008M la: 972M + * csize: 64K group: 2016M la: 1944M + * csize: 128K group: 4032M la: 3888M + * csize: 256K group: 8064M la: 7776M + * csize: 512K group: 16128M la: 15552M + * csize: 1024K group: 32256M la: 31104M + */ +#define OCFS2_LA_MAX_DEFAULT_MB 256 +#define OCFS2_LA_OLD_DEFAULT 8 +unsigned int ocfs2_la_default_mb(struct ocfs2_super *osb) +{ + unsigned int la_mb; + unsigned int gd_mb; + unsigned int megs_per_slot; + struct super_block *sb = osb->sb; + + gd_mb = ocfs2_clusters_to_megabytes(osb->sb, + 8 * ocfs2_group_bitmap_size(sb, 0, osb->s_feature_incompat)); + + /* + * This takes care of files systems with very small group + * descriptors - 512 byte blocksize at cluster sizes lower + * than 16K and also 1k blocksize with 4k cluster size. + */ + if ((sb->s_blocksize == 512 && osb->s_clustersize <= 8192) + || (sb->s_blocksize == 1024 && osb->s_clustersize == 4096)) + return OCFS2_LA_OLD_DEFAULT; + + /* + * Leave enough room for some block groups and make the final + * value we work from a multiple of 4. + */ + gd_mb -= 16; + gd_mb &= 0xFFFFFFFB; + + la_mb = gd_mb; + + /* + * Keep window sizes down to a reasonable default + */ + if (la_mb > OCFS2_LA_MAX_DEFAULT_MB) { + /* + * Some clustersize / blocksize combinations will have + * given us a larger than OCFS2_LA_MAX_DEFAULT_MB + * default size, but get poor distribution when + * limited to exactly 256 megabytes. + * + * As an example, 16K clustersize at 4K blocksize + * gives us a cluster group size of 504M. Paring the + * local alloc size down to 256 however, would give us + * only one window and around 200MB left in the + * cluster group. Instead, find the first size below + * 256 which would give us an even distribution. + * + * Larger cluster group sizes actually work out pretty + * well when pared to 256, so we don't have to do this + * for any group that fits more than two + * OCFS2_LA_MAX_DEFAULT_MB windows. + */ + if (gd_mb > (2 * OCFS2_LA_MAX_DEFAULT_MB)) + la_mb = 256; + else { + unsigned int gd_mult = gd_mb; + + while (gd_mult > 256) + gd_mult = gd_mult >> 1; + + la_mb = gd_mult; + } + } + + megs_per_slot = osb->osb_clusters_at_boot / osb->max_slots; + megs_per_slot = ocfs2_clusters_to_megabytes(osb->sb, megs_per_slot); + /* Too many nodes, too few disk clusters. */ + if (megs_per_slot < la_mb) + la_mb = megs_per_slot; + + return la_mb; +} + +void ocfs2_la_set_sizes(struct ocfs2_super *osb, int requested_mb) +{ + struct super_block *sb = osb->sb; + unsigned int la_default_mb = ocfs2_la_default_mb(osb); + unsigned int la_max_mb; + + la_max_mb = ocfs2_clusters_to_megabytes(sb, + ocfs2_local_alloc_size(sb) * 8); + + mlog(0, "requested: %dM, max: %uM, default: %uM\n", + requested_mb, la_max_mb, la_default_mb); + + if (requested_mb == -1) { + /* No user request - use defaults */ + osb->local_alloc_default_bits = + ocfs2_megabytes_to_clusters(sb, la_default_mb); + } else if (requested_mb > la_max_mb) { + /* Request is too big, we give the maximum available */ + osb->local_alloc_default_bits = + ocfs2_megabytes_to_clusters(sb, la_max_mb); + } else { + osb->local_alloc_default_bits = + ocfs2_megabytes_to_clusters(sb, requested_mb); + } + + osb->local_alloc_bits = osb->local_alloc_default_bits; +} + static inline int ocfs2_la_state_enabled(struct ocfs2_super *osb) { return (osb->local_alloc_state == OCFS2_LA_THROTTLED || @@ -156,7 +295,7 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb) osb->local_alloc_bits, (osb->bitmap_cpg - 1)); osb->local_alloc_bits = ocfs2_megabytes_to_clusters(osb->sb, - OCFS2_DEFAULT_LOCAL_ALLOC_SIZE); + ocfs2_la_default_mb(osb)); } /* read the alloc off disk */ @@ -262,6 +401,8 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb) osb->local_alloc_state = OCFS2_LA_DISABLED; + ocfs2_resmap_uninit(&osb->osb_la_resmap); + main_bm_inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE, OCFS2_INVALID_SLOT); @@ -305,12 +446,7 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb) } ocfs2_clear_local_alloc(alloc); - - status = ocfs2_journal_dirty(handle, bh); - if (status < 0) { - mlog_errno(status); - goto out_commit; - } + ocfs2_journal_dirty(handle, bh); brelse(bh); osb->local_alloc_bh = NULL; @@ -481,46 +617,6 @@ out: return status; } -/* Check to see if the local alloc window is within ac->ac_max_block */ -static int ocfs2_local_alloc_in_range(struct inode *inode, - struct ocfs2_alloc_context *ac, - u32 bits_wanted) -{ - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - struct ocfs2_dinode *alloc; - struct ocfs2_local_alloc *la; - int start; - u64 block_off; - - if (!ac->ac_max_block) - return 1; - - alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; - la = OCFS2_LOCAL_ALLOC(alloc); - - start = ocfs2_local_alloc_find_clear_bits(osb, alloc, bits_wanted); - if (start == -1) { - mlog_errno(-ENOSPC); - return 0; - } - - /* - * Converting (bm_off + start + bits_wanted) to blocks gives us - * the blkno just past our actual allocation. This is perfect - * to compare with ac_max_block. - */ - block_off = ocfs2_clusters_to_blocks(inode->i_sb, - le32_to_cpu(la->la_bm_off) + - start + bits_wanted); - mlog(0, "Checking %llu against %llu\n", - (unsigned long long)block_off, - (unsigned long long)ac->ac_max_block); - if (block_off > ac->ac_max_block) - return 0; - - return 1; -} - /* * make sure we've got at least bits_wanted contiguous bits in the * local alloc. You lose them when you drop i_mutex. @@ -613,17 +709,6 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, mlog(0, "Calling in_range for max block %llu\n", (unsigned long long)ac->ac_max_block); - if (!ocfs2_local_alloc_in_range(local_alloc_inode, ac, - bits_wanted)) { - /* - * The window is outside ac->ac_max_block. - * This errno tells the caller to keep localalloc enabled - * but to get the allocation from the main bitmap. - */ - status = -EFBIG; - goto bail; - } - ac->ac_inode = local_alloc_inode; /* We should never use localalloc from another slot */ ac->ac_alloc_slot = osb->slot_num; @@ -664,7 +749,8 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb, alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; la = OCFS2_LOCAL_ALLOC(alloc); - start = ocfs2_local_alloc_find_clear_bits(osb, alloc, bits_wanted); + start = ocfs2_local_alloc_find_clear_bits(osb, alloc, &bits_wanted, + ac->ac_resv); if (start == -1) { /* TODO: Shouldn't we just BUG here? */ status = -ENOSPC; @@ -674,8 +760,6 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb, bitmap = la->la_bitmap; *bit_off = le32_to_cpu(la->la_bm_off) + start; - /* local alloc is always contiguous by nature -- we never - * delete bits from it! */ *num_bits = bits_wanted; status = ocfs2_journal_access_di(handle, @@ -687,18 +771,15 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb, goto bail; } + ocfs2_resmap_claimed_bits(&osb->osb_la_resmap, ac->ac_resv, start, + bits_wanted); + while(bits_wanted--) ocfs2_set_bit(start++, bitmap); le32_add_cpu(&alloc->id1.bitmap1.i_used, *num_bits); + ocfs2_journal_dirty(handle, osb->local_alloc_bh); - status = ocfs2_journal_dirty(handle, osb->local_alloc_bh); - if (status < 0) { - mlog_errno(status); - goto bail; - } - - status = 0; bail: mlog_exit(status); return status; @@ -722,13 +803,17 @@ static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc) } static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb, - struct ocfs2_dinode *alloc, - u32 numbits) + struct ocfs2_dinode *alloc, + u32 *numbits, + struct ocfs2_alloc_reservation *resv) { int numfound, bitoff, left, startoff, lastzero; + int local_resv = 0; + struct ocfs2_alloc_reservation r; void *bitmap = NULL; + struct ocfs2_reservation_map *resmap = &osb->osb_la_resmap; - mlog_entry("(numbits wanted = %u)\n", numbits); + mlog_entry("(numbits wanted = %u)\n", *numbits); if (!alloc->id1.bitmap1.i_total) { mlog(0, "No bits in my window!\n"); @@ -736,6 +821,30 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb, goto bail; } + if (!resv) { + local_resv = 1; + ocfs2_resv_init_once(&r); + ocfs2_resv_set_type(&r, OCFS2_RESV_FLAG_TMP); + resv = &r; + } + + numfound = *numbits; + if (ocfs2_resmap_resv_bits(resmap, resv, &bitoff, &numfound) == 0) { + if (numfound < *numbits) + *numbits = numfound; + goto bail; + } + + /* + * Code error. While reservations are enabled, local + * allocation should _always_ go through them. + */ + BUG_ON(osb->osb_resv_level != 0); + + /* + * Reservations are disabled. Handle this the old way. + */ + bitmap = OCFS2_LOCAL_ALLOC(alloc)->la_bitmap; numfound = bitoff = startoff = 0; @@ -761,7 +870,7 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb, startoff = bitoff+1; } /* we got everything we needed */ - if (numfound == numbits) { + if (numfound == *numbits) { /* mlog(0, "Found it all!\n"); */ break; } @@ -770,12 +879,15 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb, mlog(0, "Exiting loop, bitoff = %d, numfound = %d\n", bitoff, numfound); - if (numfound == numbits) + if (numfound == *numbits) bitoff = startoff - numfound; else bitoff = -1; bail: + if (local_resv) + ocfs2_resv_discard(resmap, resv); + mlog_exit(bitoff); return bitoff; } @@ -1049,7 +1161,7 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb, /* we used the generic suballoc reserve function, but we set * everything up nicely, so there's no reason why we can't use * the more specific cluster api to claim bits. */ - status = ocfs2_claim_clusters(osb, handle, ac, osb->local_alloc_bits, + status = ocfs2_claim_clusters(handle, ac, osb->local_alloc_bits, &cluster_off, &cluster_count); if (status == -ENOSPC) { retry_enospc: @@ -1063,7 +1175,7 @@ retry_enospc: goto bail; ac->ac_bits_wanted = osb->local_alloc_default_bits; - status = ocfs2_claim_clusters(osb, handle, ac, + status = ocfs2_claim_clusters(handle, ac, osb->local_alloc_bits, &cluster_off, &cluster_count); @@ -1098,6 +1210,9 @@ retry_enospc: memset(OCFS2_LOCAL_ALLOC(alloc)->la_bitmap, 0, le16_to_cpu(la->la_size)); + ocfs2_resmap_restart(&osb->osb_la_resmap, cluster_count, + OCFS2_LOCAL_ALLOC(alloc)->la_bitmap); + mlog(0, "New window allocated:\n"); mlog(0, "window la_bm_off = %u\n", OCFS2_LOCAL_ALLOC(alloc)->la_bm_off); @@ -1169,12 +1284,7 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb, } ocfs2_clear_local_alloc(alloc); - - status = ocfs2_journal_dirty(handle, osb->local_alloc_bh); - if (status < 0) { - mlog_errno(status); - goto bail; - } + ocfs2_journal_dirty(handle, osb->local_alloc_bh); status = ocfs2_sync_local_to_main(osb, handle, alloc_copy, main_bm_inode, main_bm_bh); @@ -1192,7 +1302,6 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb, atomic_inc(&osb->alloc_stats.moves); - status = 0; bail: if (handle) ocfs2_commit_trans(osb, handle); diff --git a/fs/ocfs2/localalloc.h b/fs/ocfs2/localalloc.h index ac5ea9f8665..1be9b586446 100644 --- a/fs/ocfs2/localalloc.h +++ b/fs/ocfs2/localalloc.h @@ -30,6 +30,9 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb); void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb); +void ocfs2_la_set_sizes(struct ocfs2_super *osb, int requested_mb); +unsigned int ocfs2_la_default_mb(struct ocfs2_super *osb); + int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb, int node_num, struct ocfs2_dinode **alloc_copy); diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index 7898bd3a99f..af2b8fe1f13 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -41,44 +41,20 @@ #include "file.h" #include "inode.h" #include "mmap.h" +#include "super.h" -static inline int ocfs2_vm_op_block_sigs(sigset_t *blocked, sigset_t *oldset) -{ - /* The best way to deal with signals in the vm path is - * to block them upfront, rather than allowing the - * locking paths to return -ERESTARTSYS. */ - sigfillset(blocked); - - /* We should technically never get a bad return value - * from sigprocmask */ - return sigprocmask(SIG_BLOCK, blocked, oldset); -} - -static inline int ocfs2_vm_op_unblock_sigs(sigset_t *oldset) -{ - return sigprocmask(SIG_SETMASK, oldset, NULL); -} static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf) { - sigset_t blocked, oldset; - int error, ret; + sigset_t oldset; + int ret; mlog_entry("(area=%p, page offset=%lu)\n", area, vmf->pgoff); - error = ocfs2_vm_op_block_sigs(&blocked, &oldset); - if (error < 0) { - mlog_errno(error); - ret = VM_FAULT_SIGBUS; - goto out; - } - + ocfs2_block_signals(&oldset); ret = filemap_fault(area, vmf); + ocfs2_unblock_signals(&oldset); - error = ocfs2_vm_op_unblock_sigs(&oldset); - if (error < 0) - mlog_errno(error); -out: mlog_exit_ptr(vmf->page); return ret; } @@ -158,14 +134,10 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) struct page *page = vmf->page; struct inode *inode = vma->vm_file->f_path.dentry->d_inode; struct buffer_head *di_bh = NULL; - sigset_t blocked, oldset; - int ret, ret2; + sigset_t oldset; + int ret; - ret = ocfs2_vm_op_block_sigs(&blocked, &oldset); - if (ret < 0) { - mlog_errno(ret); - return ret; - } + ocfs2_block_signals(&oldset); /* * The cluster locks taken will block a truncate from another @@ -193,9 +165,7 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) ocfs2_inode_unlock(inode, 1); out: - ret2 = ocfs2_vm_op_unblock_sigs(&oldset); - if (ret2 < 0) - mlog_errno(ret2); + ocfs2_unblock_signals(&oldset); if (ret) ret = VM_FAULT_SIGBUS; return ret; diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 4cbb18f26c5..f171b51a74f 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -204,14 +204,7 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, int mode) inode->i_nlink = 2; else inode->i_nlink = 1; - inode->i_uid = current_fsuid(); - if (dir->i_mode & S_ISGID) { - inode->i_gid = dir->i_gid; - if (S_ISDIR(mode)) - mode |= S_ISGID; - } else - inode->i_gid = current_fsgid(); - inode->i_mode = mode; + inode_init_owner(inode, dir, mode); dquot_initialize(inode); return inode; } @@ -239,6 +232,8 @@ static int ocfs2_mknod(struct inode *dir, }; int did_quota_inode = 0; struct ocfs2_dir_lookup_result lookup = { NULL, }; + sigset_t oldset; + int did_block_signals = 0; mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode, (unsigned long)dev, dentry->d_name.len, @@ -350,6 +345,10 @@ static int ocfs2_mknod(struct inode *dir, goto leave; } + /* Starting to change things, restart is no longer possible. */ + ocfs2_block_signals(&oldset); + did_block_signals = 1; + status = dquot_alloc_inode(inode); if (status) goto leave; @@ -384,11 +383,7 @@ static int ocfs2_mknod(struct inode *dir, goto leave; } ocfs2_add_links_count(dirfe, 1); - status = ocfs2_journal_dirty(handle, parent_fe_bh); - if (status < 0) { - mlog_errno(status); - goto leave; - } + ocfs2_journal_dirty(handle, parent_fe_bh); inc_nlink(dir); } @@ -439,6 +434,8 @@ leave: ocfs2_commit_trans(osb, handle); ocfs2_inode_unlock(dir, 1); + if (did_block_signals) + ocfs2_unblock_signals(&oldset); if (status == -ENOSPC) mlog(0, "Disk is full\n"); @@ -487,14 +484,15 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, int status = 0; struct ocfs2_dinode *fe = NULL; struct ocfs2_extent_list *fel; - u64 fe_blkno = 0; + u64 suballoc_loc, fe_blkno = 0; u16 suballoc_bit; u16 feat; *new_fe_bh = NULL; - status = ocfs2_claim_new_inode(osb, handle, dir, parent_fe_bh, - inode_ac, &suballoc_bit, &fe_blkno); + status = ocfs2_claim_new_inode(handle, dir, parent_fe_bh, + inode_ac, &suballoc_loc, + &suballoc_bit, &fe_blkno); if (status < 0) { mlog_errno(status); goto leave; @@ -531,6 +529,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, fe->i_generation = cpu_to_le32(inode->i_generation); fe->i_fs_generation = cpu_to_le32(osb->fs_generation); fe->i_blkno = cpu_to_le64(fe_blkno); + fe->i_suballoc_loc = cpu_to_le64(suballoc_loc); fe->i_suballoc_bit = cpu_to_le16(suballoc_bit); fe->i_suballoc_slot = cpu_to_le16(inode_ac->ac_alloc_slot); fe->i_uid = cpu_to_le32(inode->i_uid); @@ -567,11 +566,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, fel->l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(osb->sb)); } - status = ocfs2_journal_dirty(handle, *new_fe_bh); - if (status < 0) { - mlog_errno(status); - goto leave; - } + ocfs2_journal_dirty(handle, *new_fe_bh); ocfs2_populate_inode(inode, fe, 1); ocfs2_ci_set_new(osb, INODE_CACHE(inode)); @@ -637,6 +632,7 @@ static int ocfs2_link(struct dentry *old_dentry, struct ocfs2_dinode *fe = NULL; struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); struct ocfs2_dir_lookup_result lookup = { NULL, }; + sigset_t oldset; mlog_entry("(inode=%lu, old='%.*s' new='%.*s')\n", inode->i_ino, old_dentry->d_name.len, old_dentry->d_name.name, @@ -693,6 +689,9 @@ static int ocfs2_link(struct dentry *old_dentry, goto out_unlock_inode; } + /* Starting to change things, restart is no longer possible. */ + ocfs2_block_signals(&oldset); + err = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (err < 0) { @@ -705,14 +704,7 @@ static int ocfs2_link(struct dentry *old_dentry, ocfs2_set_links_count(fe, inode->i_nlink); fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); - - err = ocfs2_journal_dirty(handle, fe_bh); - if (err < 0) { - ocfs2_add_links_count(fe, -1); - drop_nlink(inode); - mlog_errno(err); - goto out_commit; - } + ocfs2_journal_dirty(handle, fe_bh); err = ocfs2_add_entry(handle, dentry, inode, OCFS2_I(inode)->ip_blkno, @@ -736,6 +728,7 @@ static int ocfs2_link(struct dentry *old_dentry, out_commit: ocfs2_commit_trans(osb, handle); + ocfs2_unblock_signals(&oldset); out_unlock_inode: ocfs2_inode_unlock(inode, 1); @@ -909,12 +902,7 @@ static int ocfs2_unlink(struct inode *dir, drop_nlink(inode); drop_nlink(inode); ocfs2_set_links_count(fe, inode->i_nlink); - - status = ocfs2_journal_dirty(handle, fe_bh); - if (status < 0) { - mlog_errno(status); - goto leave; - } + ocfs2_journal_dirty(handle, fe_bh); dir->i_ctime = dir->i_mtime = CURRENT_TIME; if (S_ISDIR(inode->i_mode)) @@ -1332,12 +1320,7 @@ static int ocfs2_rename(struct inode *old_dir, ocfs2_set_links_count(newfe, 0); else ocfs2_add_links_count(newfe, -1); - - status = ocfs2_journal_dirty(handle, newfe_bh); - if (status < 0) { - mlog_errno(status); - goto bail; - } + ocfs2_journal_dirty(handle, newfe_bh); } else { /* if the name was not found in new_dir, add it now */ status = ocfs2_add_entry(handle, new_dentry, old_inode, @@ -1356,10 +1339,7 @@ static int ocfs2_rename(struct inode *old_dir, old_di->i_ctime = cpu_to_le64(old_inode->i_ctime.tv_sec); old_di->i_ctime_nsec = cpu_to_le32(old_inode->i_ctime.tv_nsec); - - status = ocfs2_journal_dirty(handle, old_inode_bh); - if (status < 0) - mlog_errno(status); + ocfs2_journal_dirty(handle, old_inode_bh); } else mlog_errno(status); @@ -1431,7 +1411,7 @@ static int ocfs2_rename(struct inode *old_dir, OCFS2_JOURNAL_ACCESS_WRITE); fe = (struct ocfs2_dinode *) old_dir_bh->b_data; ocfs2_set_links_count(fe, old_dir->i_nlink); - status = ocfs2_journal_dirty(handle, old_dir_bh); + ocfs2_journal_dirty(handle, old_dir_bh); } } ocfs2_dentry_move(old_dentry, new_dentry, old_dir, new_dir); @@ -1563,11 +1543,7 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb, (bytes_left > sb->s_blocksize) ? sb->s_blocksize : bytes_left); - status = ocfs2_journal_dirty(handle, bhs[virtual]); - if (status < 0) { - mlog_errno(status); - goto bail; - } + ocfs2_journal_dirty(handle, bhs[virtual]); virtual++; p_blkno++; @@ -1611,6 +1587,8 @@ static int ocfs2_symlink(struct inode *dir, }; int did_quota = 0, did_quota_inode = 0; struct ocfs2_dir_lookup_result lookup = { NULL, }; + sigset_t oldset; + int did_block_signals = 0; mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir, dentry, symname, dentry->d_name.len, dentry->d_name.name); @@ -1706,6 +1684,10 @@ static int ocfs2_symlink(struct inode *dir, goto bail; } + /* Starting to change things, restart is no longer possible. */ + ocfs2_block_signals(&oldset); + did_block_signals = 1; + status = dquot_alloc_inode(inode); if (status) goto bail; @@ -1814,6 +1796,8 @@ bail: ocfs2_commit_trans(osb, handle); ocfs2_inode_unlock(dir, 1); + if (did_block_signals) + ocfs2_unblock_signals(&oldset); brelse(new_fe_bh); brelse(parent_fe_bh); @@ -1961,12 +1945,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb, if (S_ISDIR(inode->i_mode)) ocfs2_add_links_count(orphan_fe, 1); orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe); - - status = ocfs2_journal_dirty(handle, orphan_dir_bh); - if (status < 0) { - mlog_errno(status); - goto leave; - } + ocfs2_journal_dirty(handle, orphan_dir_bh); status = __ocfs2_add_entry(handle, orphan_dir_inode, name, OCFS2_ORPHAN_NAMELEN, inode, @@ -2065,12 +2044,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, if (S_ISDIR(inode->i_mode)) ocfs2_add_links_count(orphan_fe, -1); orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe); - - status = ocfs2_journal_dirty(handle, orphan_dir_bh); - if (status < 0) { - mlog_errno(status); - goto leave; - } + ocfs2_journal_dirty(handle, orphan_dir_bh); leave: ocfs2_free_dir_lookup_result(&lookup); diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index adf5e2ebc2c..c67003b6b5a 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -47,6 +47,7 @@ /* For struct ocfs2_blockcheck_stats */ #include "blockcheck.h" +#include "reservations.h" /* Caching of metadata buffers */ @@ -341,6 +342,9 @@ struct ocfs2_super */ unsigned int local_alloc_bits; unsigned int local_alloc_default_bits; + /* osb_clusters_at_boot can become stale! Do not trust it to + * be up to date. */ + unsigned int osb_clusters_at_boot; enum ocfs2_local_alloc_state local_alloc_state; /* protected * by osb_lock */ @@ -349,6 +353,11 @@ struct ocfs2_super u64 la_last_gd; + struct ocfs2_reservation_map osb_la_resmap; + + unsigned int osb_resv_level; + unsigned int osb_dir_resv_level; + /* Next three fields are for local node slot recovery during * mount. */ int dirty; @@ -482,6 +491,13 @@ static inline int ocfs2_supports_indexed_dirs(struct ocfs2_super *osb) return 0; } +static inline int ocfs2_supports_discontig_bg(struct ocfs2_super *osb) +{ + if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG) + return 1; + return 0; +} + static inline unsigned int ocfs2_link_max(struct ocfs2_super *osb) { if (ocfs2_supports_indexed_dirs(osb)) @@ -763,6 +779,12 @@ static inline unsigned int ocfs2_megabytes_to_clusters(struct super_block *sb, return megs << (20 - OCFS2_SB(sb)->s_clustersize_bits); } +static inline unsigned int ocfs2_clusters_to_megabytes(struct super_block *sb, + unsigned int clusters) +{ + return clusters >> (20 - OCFS2_SB(sb)->s_clustersize_bits); +} + static inline void _ocfs2_set_bit(unsigned int bit, unsigned long *bitmap) { ext2_set_bit(bit, bitmap); diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index bb37218a797..33f1c9a8258 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -100,7 +100,8 @@ | OCFS2_FEATURE_INCOMPAT_XATTR \ | OCFS2_FEATURE_INCOMPAT_META_ECC \ | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS \ - | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE) + | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE \ + | OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG) #define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \ | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \ | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA) @@ -165,6 +166,9 @@ /* Refcount tree support */ #define OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE 0x1000 +/* Discontigous block groups */ +#define OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG 0x2000 + /* * backup superblock flag is used to indicate that this volume * has backup superblocks. @@ -283,14 +287,6 @@ #define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024) /* - * Default local alloc size (in megabytes) - * - * The value chosen should be such that most allocations, including new - * block groups, use local alloc. - */ -#define OCFS2_DEFAULT_LOCAL_ALLOC_SIZE 8 - -/* * Inline extended attribute size (in bytes) * The value chosen should be aligned to 16 byte boundaries. */ @@ -512,7 +508,10 @@ struct ocfs2_extent_block block group */ __le32 h_fs_generation; /* Must match super block */ __le64 h_blkno; /* Offset on disk, in blocks */ -/*20*/ __le64 h_reserved3; +/*20*/ __le64 h_suballoc_loc; /* Suballocator block group this + eb belongs to. Only valid + if allocated from a + discontiguous block group */ __le64 h_next_leaf_blk; /* Offset on disk, in blocks, of next leaf header pointing to data */ @@ -679,7 +678,11 @@ struct ocfs2_dinode { /*80*/ struct ocfs2_block_check i_check; /* Error checking */ /*88*/ __le64 i_dx_root; /* Pointer to dir index root block */ /*90*/ __le64 i_refcount_loc; - __le64 i_reserved2[4]; + __le64 i_suballoc_loc; /* Suballocator block group this + inode belongs to. Only valid + if allocated from a + discontiguous block group */ +/*A0*/ __le64 i_reserved2[3]; /*B8*/ union { __le64 i_pad1; /* Generic way to refer to this 64bit union */ @@ -814,7 +817,12 @@ struct ocfs2_dx_root_block { __le32 dr_reserved2; __le64 dr_free_blk; /* Pointer to head of free * unindexed block list. */ - __le64 dr_reserved3[15]; + __le64 dr_suballoc_loc; /* Suballocator block group + this root belongs to. + Only valid if allocated + from a discontiguous + block group */ + __le64 dr_reserved3[14]; union { struct ocfs2_extent_list dr_list; /* Keep this aligned to 128 * bits for maximum space @@ -840,6 +848,13 @@ struct ocfs2_dx_leaf { }; /* + * Largest bitmap for a block (suballocator) group in bytes. This limit + * does not affect cluster groups (global allocator). Cluster group + * bitmaps run to the end of the block. + */ +#define OCFS2_MAX_BG_BITMAP_SIZE 256 + +/* * On disk allocator group structure for OCFS2 */ struct ocfs2_group_desc @@ -860,7 +875,29 @@ struct ocfs2_group_desc __le64 bg_blkno; /* Offset on disk, in blocks */ /*30*/ struct ocfs2_block_check bg_check; /* Error checking */ __le64 bg_reserved2; -/*40*/ __u8 bg_bitmap[0]; +/*40*/ union { + __u8 bg_bitmap[0]; + struct { + /* + * Block groups may be discontiguous when + * OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG is set. + * The extents of a discontigous block group are + * stored in bg_list. It is a flat list. + * l_tree_depth must always be zero. A + * discontiguous group is signified by a non-zero + * bg_list->l_next_free_rec. Only block groups + * can be discontiguous; Cluster groups cannot. + * We've never made a block group with more than + * 2048 blocks (256 bytes of bg_bitmap). This + * codifies that limit so that we can fit bg_list. + * bg_size of a discontiguous block group will + * be 256 to match bg_bitmap_filler. + */ + __u8 bg_bitmap_filler[OCFS2_MAX_BG_BITMAP_SIZE]; +/*140*/ struct ocfs2_extent_list bg_list; + }; + }; +/* Actual on-disk size is one block */ }; struct ocfs2_refcount_rec { @@ -905,7 +942,11 @@ struct ocfs2_refcount_block { /*40*/ __le32 rf_generation; /* generation number. all be the same * for the same refcount tree. */ __le32 rf_reserved0; - __le64 rf_reserved1[7]; + __le64 rf_suballoc_loc; /* Suballocator block group this + refcount block belongs to. Only + valid if allocated from a + discontiguous block group */ +/*50*/ __le64 rf_reserved1[6]; /*80*/ union { struct ocfs2_refcount_list rf_records; /* List of refcount records */ @@ -1017,7 +1058,10 @@ struct ocfs2_xattr_block { real xattr or a xattr tree. */ __le16 xb_reserved0; __le32 xb_reserved1; - __le64 xb_reserved2; + __le64 xb_suballoc_loc; /* Suballocator block group this + xattr block belongs to. Only + valid if allocated from a + discontiguous block group */ /*30*/ union { struct ocfs2_xattr_header xb_header; /* xattr header if this block contains xattr */ @@ -1254,6 +1298,16 @@ static inline u16 ocfs2_extent_recs_per_eb(struct super_block *sb) return size / sizeof(struct ocfs2_extent_rec); } +static inline u16 ocfs2_extent_recs_per_gd(struct super_block *sb) +{ + int size; + + size = sb->s_blocksize - + offsetof(struct ocfs2_group_desc, bg_list.l_recs); + + return size / sizeof(struct ocfs2_extent_rec); +} + static inline int ocfs2_dx_entries_per_leaf(struct super_block *sb) { int size; @@ -1284,13 +1338,23 @@ static inline u16 ocfs2_local_alloc_size(struct super_block *sb) return size; } -static inline int ocfs2_group_bitmap_size(struct super_block *sb) +static inline int ocfs2_group_bitmap_size(struct super_block *sb, + int suballocator, + u32 feature_incompat) { - int size; - - size = sb->s_blocksize - + int size = sb->s_blocksize - offsetof(struct ocfs2_group_desc, bg_bitmap); + /* + * The cluster allocator uses the entire block. Suballocators have + * never used more than OCFS2_MAX_BG_BITMAP_SIZE. Unfortunately, older + * code expects bg_size set to the maximum. Thus we must keep + * bg_size as-is unless discontig_bg is enabled. + */ + if (suballocator && + (feature_incompat & OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG)) + size = OCFS2_MAX_BG_BITMAP_SIZE; + return size; } @@ -1402,23 +1466,43 @@ static inline int ocfs2_extent_recs_per_eb(int blocksize) return size / sizeof(struct ocfs2_extent_rec); } -static inline int ocfs2_local_alloc_size(int blocksize) +static inline int ocfs2_extent_recs_per_gd(int blocksize) { int size; size = blocksize - - offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap); + offsetof(struct ocfs2_group_desc, bg_list.l_recs); - return size; + return size / sizeof(struct ocfs2_extent_rec); } -static inline int ocfs2_group_bitmap_size(int blocksize) +static inline int ocfs2_local_alloc_size(int blocksize) { int size; size = blocksize - + offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap); + + return size; +} + +static inline int ocfs2_group_bitmap_size(int blocksize, + int suballocator, + uint32_t feature_incompat) +{ + int size = sb->s_blocksize - offsetof(struct ocfs2_group_desc, bg_bitmap); + /* + * The cluster allocator uses the entire block. Suballocators have + * never used more than OCFS2_MAX_BG_BITMAP_SIZE. Unfortunately, older + * code expects bg_size set to the maximum. Thus we must keep + * bg_size as-is unless discontig_bg is enabled. + */ + if (suballocator && + (feature_incompat & OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG)) + size = OCFS2_MAX_BG_BITMAP_SIZE; + return size; } @@ -1491,5 +1575,19 @@ static inline void ocfs2_set_de_type(struct ocfs2_dir_entry *de, de->file_type = ocfs2_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; } +static inline int ocfs2_gd_is_discontig(struct ocfs2_group_desc *gd) +{ + if ((offsetof(struct ocfs2_group_desc, bg_bitmap) + + le16_to_cpu(gd->bg_size)) != + offsetof(struct ocfs2_group_desc, bg_list)) + return 0; + /* + * Only valid to check l_next_free_rec if + * bg_bitmap + bg_size == bg_list. + */ + if (!gd->bg_list.l_next_free_rec) + return 0; + return 1; +} #endif /* _OCFS2_FS_H */ diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h index 123bc520a2c..196fcb52d95 100644 --- a/fs/ocfs2/quota.h +++ b/fs/ocfs2/quota.h @@ -23,6 +23,7 @@ struct ocfs2_dquot { struct dquot dq_dquot; /* Generic VFS dquot */ loff_t dq_local_off; /* Offset in the local quota file */ + u64 dq_local_phys_blk; /* Physical block carrying quota structure */ struct ocfs2_quota_chunk *dq_chunk; /* Chunk dquot is in */ unsigned int dq_use_count; /* Number of nodes having reference to this entry in global quota file */ s64 dq_origspace; /* Last globally synced space usage */ @@ -51,8 +52,9 @@ struct ocfs2_mem_dqinfo { struct ocfs2_lock_res dqi_gqlock; /* Lock protecting quota information structure */ struct buffer_head *dqi_gqi_bh; /* Buffer head with global quota file inode - set only if inode lock is obtained */ int dqi_gqi_count; /* Number of holders of dqi_gqi_bh */ + u64 dqi_giblk; /* Number of block with global information header */ struct buffer_head *dqi_lqi_bh; /* Buffer head with local quota file inode */ - struct buffer_head *dqi_ibh; /* Buffer with information header */ + struct buffer_head *dqi_libh; /* Buffer with local information header */ struct qtree_mem_dqinfo dqi_gi; /* Info about global file */ struct delayed_work dqi_sync_work; /* Work for syncing dquots */ struct ocfs2_quota_recovery *dqi_rec; /* Pointer to recovery @@ -102,8 +104,12 @@ static inline int ocfs2_global_release_dquot(struct dquot *dquot) int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex); void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex); -int ocfs2_read_quota_block(struct inode *inode, u64 v_block, - struct buffer_head **bh); +int ocfs2_validate_quota_block(struct super_block *sb, struct buffer_head *bh); +int ocfs2_read_quota_phys_block(struct inode *inode, u64 p_block, + struct buffer_head **bh); +int ocfs2_create_local_dquot(struct dquot *dquot); +int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot); +int ocfs2_local_write_dquot(struct dquot *dquot); extern const struct dquot_operations ocfs2_quota_operations; extern struct quota_format_type ocfs2_quota_format; diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index ab42a74c753..2bb35fe0051 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -25,8 +25,44 @@ #include "dlmglue.h" #include "uptodate.h" #include "super.h" +#include "buffer_head_io.h" #include "quota.h" +/* + * Locking of quotas with OCFS2 is rather complex. Here are rules that + * should be obeyed by all the functions: + * - any write of quota structure (either to local or global file) is protected + * by dqio_mutex or dquot->dq_lock. + * - any modification of global quota file holds inode cluster lock, i_mutex, + * and ip_alloc_sem of the global quota file (achieved by + * ocfs2_lock_global_qf). It also has to hold qinfo_lock. + * - an allocation of new blocks for local quota file is protected by + * its ip_alloc_sem + * + * A rough sketch of locking dependencies (lf = local file, gf = global file): + * Normal filesystem operation: + * start_trans -> dqio_mutex -> write to lf + * Syncing of local and global file: + * ocfs2_lock_global_qf -> start_trans -> dqio_mutex -> qinfo_lock -> + * write to gf + * -> write to lf + * Acquire dquot for the first time: + * dq_lock -> ocfs2_lock_global_qf -> qinfo_lock -> read from gf + * -> alloc space for gf + * -> start_trans -> qinfo_lock -> write to gf + * -> ip_alloc_sem of lf -> alloc space for lf + * -> write to lf + * Release last reference to dquot: + * dq_lock -> ocfs2_lock_global_qf -> start_trans -> qinfo_lock -> write to gf + * -> write to lf + * Note that all the above operations also hold the inode cluster lock of lf. + * Recovery: + * inode cluster lock of recovered lf + * -> read bitmaps -> ip_alloc_sem of lf + * -> ocfs2_lock_global_qf -> start_trans -> dqio_mutex -> qinfo_lock -> + * write to gf + */ + static struct workqueue_struct *ocfs2_quota_wq = NULL; static void qsync_work_fn(struct work_struct *work); @@ -91,8 +127,7 @@ struct qtree_fmt_operations ocfs2_global_ops = { .is_id = ocfs2_global_is_id, }; -static int ocfs2_validate_quota_block(struct super_block *sb, - struct buffer_head *bh) +int ocfs2_validate_quota_block(struct super_block *sb, struct buffer_head *bh) { struct ocfs2_disk_dqtrailer *dqt = ocfs2_block_dqtrailer(sb->s_blocksize, bh->b_data); @@ -110,54 +145,19 @@ static int ocfs2_validate_quota_block(struct super_block *sb, return ocfs2_validate_meta_ecc(sb, bh->b_data, &dqt->dq_check); } -int ocfs2_read_quota_block(struct inode *inode, u64 v_block, - struct buffer_head **bh) +int ocfs2_read_quota_phys_block(struct inode *inode, u64 p_block, + struct buffer_head **bhp) { - int rc = 0; - struct buffer_head *tmp = *bh; - - if (i_size_read(inode) >> inode->i_sb->s_blocksize_bits <= v_block) { - ocfs2_error(inode->i_sb, - "Quota file %llu is probably corrupted! Requested " - "to read block %Lu but file has size only %Lu\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno, - (unsigned long long)v_block, - (unsigned long long)i_size_read(inode)); - return -EIO; - } - rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, 0, - ocfs2_validate_quota_block); + int rc; + + *bhp = NULL; + rc = ocfs2_read_blocks(INODE_CACHE(inode), p_block, 1, bhp, 0, + ocfs2_validate_quota_block); if (rc) mlog_errno(rc); - - /* If ocfs2_read_virt_blocks() got us a new bh, pass it up. */ - if (!rc && !*bh) - *bh = tmp; - return rc; } -static int ocfs2_get_quota_block(struct inode *inode, int block, - struct buffer_head **bh) -{ - u64 pblock, pcount; - int err; - - down_read(&OCFS2_I(inode)->ip_alloc_sem); - err = ocfs2_extent_map_get_blocks(inode, block, &pblock, &pcount, NULL); - up_read(&OCFS2_I(inode)->ip_alloc_sem); - if (err) { - mlog_errno(err); - return err; - } - *bh = sb_getblk(inode->i_sb, pblock); - if (!*bh) { - err = -EIO; - mlog_errno(err); - } - return err; -} - /* Read data from global quotafile - avoid pagecache and such because we cannot * afford acquiring the locks... We use quota cluster lock to serialize * operations. Caller is responsible for acquiring it. */ @@ -172,6 +172,7 @@ ssize_t ocfs2_quota_read(struct super_block *sb, int type, char *data, int err = 0; struct buffer_head *bh; size_t toread, tocopy; + u64 pblock = 0, pcount = 0; if (off > i_size) return 0; @@ -180,8 +181,19 @@ ssize_t ocfs2_quota_read(struct super_block *sb, int type, char *data, toread = len; while (toread > 0) { tocopy = min_t(size_t, (sb->s_blocksize - offset), toread); + if (!pcount) { + err = ocfs2_extent_map_get_blocks(gqinode, blk, &pblock, + &pcount, NULL); + if (err) { + mlog_errno(err); + return err; + } + } else { + pcount--; + pblock++; + } bh = NULL; - err = ocfs2_read_quota_block(gqinode, blk, &bh); + err = ocfs2_read_quota_phys_block(gqinode, pblock, &bh); if (err) { mlog_errno(err); return err; @@ -209,6 +221,7 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type, int err = 0, new = 0, ja_type; struct buffer_head *bh = NULL; handle_t *handle = journal_current_handle(); + u64 pblock, pcount; if (!handle) { mlog(ML_ERROR, "Quota write (off=%llu, len=%llu) cancelled " @@ -221,12 +234,11 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type, len = sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE - offset; } - mutex_lock_nested(&gqinode->i_mutex, I_MUTEX_QUOTA); if (gqinode->i_size < off + len) { loff_t rounded_end = ocfs2_align_bytes_to_blocks(sb, off + len); - /* Space is already allocated in ocfs2_global_read_dquot() */ + /* Space is already allocated in ocfs2_acquire_dquot() */ err = ocfs2_simple_size_update(gqinode, oinfo->dqi_gqi_bh, rounded_end); @@ -234,13 +246,20 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type, goto out; new = 1; } + err = ocfs2_extent_map_get_blocks(gqinode, blk, &pblock, &pcount, NULL); + if (err) { + mlog_errno(err); + goto out; + } /* Not rewriting whole block? */ if ((offset || len < sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE) && !new) { - err = ocfs2_read_quota_block(gqinode, blk, &bh); + err = ocfs2_read_quota_phys_block(gqinode, pblock, &bh); ja_type = OCFS2_JOURNAL_ACCESS_WRITE; } else { - err = ocfs2_get_quota_block(gqinode, blk, &bh); + bh = sb_getblk(sb, pblock); + if (!bh) + err = -ENOMEM; ja_type = OCFS2_JOURNAL_ACCESS_CREATE; } if (err) { @@ -261,19 +280,15 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type, brelse(bh); goto out; } - err = ocfs2_journal_dirty(handle, bh); + ocfs2_journal_dirty(handle, bh); brelse(bh); - if (err < 0) - goto out; out: if (err) { - mutex_unlock(&gqinode->i_mutex); mlog_errno(err); return err; } gqinode->i_version++; ocfs2_mark_inode_dirty(handle, gqinode, oinfo->dqi_gqi_bh); - mutex_unlock(&gqinode->i_mutex); return len; } @@ -291,11 +306,23 @@ int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex) else WARN_ON(bh != oinfo->dqi_gqi_bh); spin_unlock(&dq_data_lock); + if (ex) { + mutex_lock(&oinfo->dqi_gqinode->i_mutex); + down_write(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem); + } else { + down_read(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem); + } return 0; } void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex) { + if (ex) { + up_write(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem); + mutex_unlock(&oinfo->dqi_gqinode->i_mutex); + } else { + up_read(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem); + } ocfs2_inode_unlock(oinfo->dqi_gqinode, ex); brelse(oinfo->dqi_gqi_bh); spin_lock(&dq_data_lock); @@ -313,6 +340,7 @@ int ocfs2_global_read_info(struct super_block *sb, int type) struct ocfs2_global_disk_dqinfo dinfo; struct mem_dqinfo *info = sb_dqinfo(sb, type); struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv; + u64 pcount; int status; mlog_entry_void(); @@ -339,9 +367,19 @@ int ocfs2_global_read_info(struct super_block *sb, int type) mlog_errno(status); goto out_err; } + + status = ocfs2_extent_map_get_blocks(gqinode, 0, &oinfo->dqi_giblk, + &pcount, NULL); + if (status < 0) + goto out_unlock; + + status = ocfs2_qinfo_lock(oinfo, 0); + if (status < 0) + goto out_unlock; status = sb->s_op->quota_read(sb, type, (char *)&dinfo, sizeof(struct ocfs2_global_disk_dqinfo), OCFS2_GLOBAL_INFO_OFF); + ocfs2_qinfo_unlock(oinfo, 0); ocfs2_unlock_global_qf(oinfo, 0); if (status != sizeof(struct ocfs2_global_disk_dqinfo)) { mlog(ML_ERROR, "Cannot read global quota info (%d).\n", @@ -368,6 +406,10 @@ int ocfs2_global_read_info(struct super_block *sb, int type) out_err: mlog_exit(status); return status; +out_unlock: + ocfs2_unlock_global_qf(oinfo, 0); + mlog_errno(status); + goto out_err; } /* Write information to global quota file. Expects exlusive lock on quota @@ -426,78 +468,10 @@ static int ocfs2_global_qinit_alloc(struct super_block *sb, int type) static int ocfs2_calc_global_qinit_credits(struct super_block *sb, int type) { - /* We modify all the allocated blocks, tree root, and info block */ + /* We modify all the allocated blocks, tree root, info block and + * the inode */ return (ocfs2_global_qinit_alloc(sb, type) + 2) * - OCFS2_QUOTA_BLOCK_UPDATE_CREDITS; -} - -/* Read in information from global quota file and acquire a reference to it. - * dquot_acquire() has already started the transaction and locked quota file */ -int ocfs2_global_read_dquot(struct dquot *dquot) -{ - int err, err2, ex = 0; - struct super_block *sb = dquot->dq_sb; - int type = dquot->dq_type; - struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv; - struct ocfs2_super *osb = OCFS2_SB(sb); - struct inode *gqinode = info->dqi_gqinode; - int need_alloc = ocfs2_global_qinit_alloc(sb, type); - handle_t *handle = NULL; - - err = ocfs2_qinfo_lock(info, 0); - if (err < 0) - goto out; - err = qtree_read_dquot(&info->dqi_gi, dquot); - if (err < 0) - goto out_qlock; - OCFS2_DQUOT(dquot)->dq_use_count++; - OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace; - OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes; - ocfs2_qinfo_unlock(info, 0); - - if (!dquot->dq_off) { /* No real quota entry? */ - ex = 1; - /* - * Add blocks to quota file before we start a transaction since - * locking allocators ranks above a transaction start - */ - WARN_ON(journal_current_handle()); - down_write(&OCFS2_I(gqinode)->ip_alloc_sem); - err = ocfs2_extend_no_holes(gqinode, - gqinode->i_size + (need_alloc << sb->s_blocksize_bits), - gqinode->i_size); - up_write(&OCFS2_I(gqinode)->ip_alloc_sem); - if (err < 0) - goto out; - } - - handle = ocfs2_start_trans(osb, - ocfs2_calc_global_qinit_credits(sb, type)); - if (IS_ERR(handle)) { - err = PTR_ERR(handle); - goto out; - } - err = ocfs2_qinfo_lock(info, ex); - if (err < 0) - goto out_trans; - err = qtree_write_dquot(&info->dqi_gi, dquot); - if (ex && info_dirty(sb_dqinfo(dquot->dq_sb, dquot->dq_type))) { - err2 = __ocfs2_global_write_info(dquot->dq_sb, dquot->dq_type); - if (!err) - err = err2; - } -out_qlock: - if (ex) - ocfs2_qinfo_unlock(info, 1); - else - ocfs2_qinfo_unlock(info, 0); -out_trans: - if (handle) - ocfs2_commit_trans(osb, handle); -out: - if (err < 0) - mlog_errno(err); - return err; + OCFS2_QUOTA_BLOCK_UPDATE_CREDITS + 1; } /* Sync local information about quota modifications with global quota file. @@ -638,14 +612,13 @@ static int ocfs2_sync_dquot_helper(struct dquot *dquot, unsigned long type) } mutex_lock(&sb_dqopt(sb)->dqio_mutex); status = ocfs2_sync_dquot(dquot); - mutex_unlock(&sb_dqopt(sb)->dqio_mutex); if (status < 0) mlog_errno(status); /* We have to write local structure as well... */ - dquot_mark_dquot_dirty(dquot); - status = dquot_commit(dquot); + status = ocfs2_local_write_dquot(dquot); if (status < 0) mlog_errno(status); + mutex_unlock(&sb_dqopt(sb)->dqio_mutex); ocfs2_commit_trans(osb, handle); out_ilock: ocfs2_unlock_global_qf(oinfo, 1); @@ -684,7 +657,9 @@ static int ocfs2_write_dquot(struct dquot *dquot) mlog_errno(status); goto out; } - status = dquot_commit(dquot); + mutex_lock(&sb_dqopt(dquot->dq_sb)->dqio_mutex); + status = ocfs2_local_write_dquot(dquot); + mutex_unlock(&sb_dqopt(dquot->dq_sb)->dqio_mutex); ocfs2_commit_trans(osb, handle); out: mlog_exit(status); @@ -715,6 +690,10 @@ static int ocfs2_release_dquot(struct dquot *dquot) mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type); + mutex_lock(&dquot->dq_lock); + /* Check whether we are not racing with some other dqget() */ + if (atomic_read(&dquot->dq_count) > 1) + goto out; status = ocfs2_lock_global_qf(oinfo, 1); if (status < 0) goto out; @@ -725,30 +704,113 @@ static int ocfs2_release_dquot(struct dquot *dquot) mlog_errno(status); goto out_ilock; } - status = dquot_release(dquot); + + status = ocfs2_global_release_dquot(dquot); + if (status < 0) { + mlog_errno(status); + goto out_trans; + } + status = ocfs2_local_release_dquot(handle, dquot); + /* + * If we fail here, we cannot do much as global structure is + * already released. So just complain... + */ + if (status < 0) + mlog_errno(status); + clear_bit(DQ_ACTIVE_B, &dquot->dq_flags); +out_trans: ocfs2_commit_trans(osb, handle); out_ilock: ocfs2_unlock_global_qf(oinfo, 1); out: + mutex_unlock(&dquot->dq_lock); mlog_exit(status); return status; } +/* + * Read global dquot structure from disk or create it if it does + * not exist. Also update use count of the global structure and + * create structure in node-local quota file. + */ static int ocfs2_acquire_dquot(struct dquot *dquot) { - struct ocfs2_mem_dqinfo *oinfo = - sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; - int status = 0; + int status = 0, err; + int ex = 0; + struct super_block *sb = dquot->dq_sb; + struct ocfs2_super *osb = OCFS2_SB(sb); + int type = dquot->dq_type; + struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv; + struct inode *gqinode = info->dqi_gqinode; + int need_alloc = ocfs2_global_qinit_alloc(sb, type); + handle_t *handle; - mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type); - /* We need an exclusive lock, because we're going to update use count - * and instantiate possibly new dquot structure */ - status = ocfs2_lock_global_qf(oinfo, 1); + mlog_entry("id=%u, type=%d", dquot->dq_id, type); + mutex_lock(&dquot->dq_lock); + /* + * We need an exclusive lock, because we're going to update use count + * and instantiate possibly new dquot structure + */ + status = ocfs2_lock_global_qf(info, 1); if (status < 0) goto out; - status = dquot_acquire(dquot); - ocfs2_unlock_global_qf(oinfo, 1); + if (!test_bit(DQ_READ_B, &dquot->dq_flags)) { + status = ocfs2_qinfo_lock(info, 0); + if (status < 0) + goto out_dq; + status = qtree_read_dquot(&info->dqi_gi, dquot); + ocfs2_qinfo_unlock(info, 0); + if (status < 0) + goto out_dq; + } + set_bit(DQ_READ_B, &dquot->dq_flags); + + OCFS2_DQUOT(dquot)->dq_use_count++; + OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace; + OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes; + if (!dquot->dq_off) { /* No real quota entry? */ + ex = 1; + /* + * Add blocks to quota file before we start a transaction since + * locking allocators ranks above a transaction start + */ + WARN_ON(journal_current_handle()); + status = ocfs2_extend_no_holes(gqinode, + gqinode->i_size + (need_alloc << sb->s_blocksize_bits), + gqinode->i_size); + if (status < 0) + goto out_dq; + } + + handle = ocfs2_start_trans(osb, + ocfs2_calc_global_qinit_credits(sb, type)); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + goto out_dq; + } + status = ocfs2_qinfo_lock(info, ex); + if (status < 0) + goto out_trans; + status = qtree_write_dquot(&info->dqi_gi, dquot); + if (ex && info_dirty(sb_dqinfo(sb, type))) { + err = __ocfs2_global_write_info(sb, type); + if (!status) + status = err; + } + ocfs2_qinfo_unlock(info, ex); +out_trans: + ocfs2_commit_trans(osb, handle); +out_dq: + ocfs2_unlock_global_qf(info, 1); + if (status < 0) + goto out; + + status = ocfs2_create_local_dquot(dquot); + if (status < 0) + goto out; + set_bit(DQ_ACTIVE_B, &dquot->dq_flags); out: + mutex_unlock(&dquot->dq_lock); mlog_exit(status); return status; } @@ -770,7 +832,6 @@ static int ocfs2_mark_dquot_dirty(struct dquot *dquot) struct ocfs2_super *osb = OCFS2_SB(sb); mlog_entry("id=%u, type=%d", dquot->dq_id, type); - dquot_mark_dquot_dirty(dquot); /* In case user set some limits, sync dquot immediately to global * quota file so that information propagates quicker */ @@ -793,14 +854,16 @@ static int ocfs2_mark_dquot_dirty(struct dquot *dquot) mlog_errno(status); goto out_ilock; } + mutex_lock(&sb_dqopt(sb)->dqio_mutex); status = ocfs2_sync_dquot(dquot); if (status < 0) { mlog_errno(status); - goto out_trans; + goto out_dlock; } /* Now write updated local dquot structure */ - status = dquot_commit(dquot); -out_trans: + status = ocfs2_local_write_dquot(dquot); +out_dlock: + mutex_unlock(&sb_dqopt(sb)->dqio_mutex); ocfs2_commit_trans(osb, handle); out_ilock: ocfs2_unlock_global_qf(oinfo, 1); @@ -852,7 +915,7 @@ static void ocfs2_destroy_dquot(struct dquot *dquot) } const struct dquot_operations ocfs2_quota_operations = { - .write_dquot = ocfs2_write_dquot, + /* We never make dquot dirty so .write_dquot is never called */ .acquire_dquot = ocfs2_acquire_dquot, .release_dquot = ocfs2_release_dquot, .mark_dirty = ocfs2_mark_dquot_dirty, diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index 9ad49305f45..8bd70d4d184 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -22,6 +22,7 @@ #include "dlmglue.h" #include "quota.h" #include "uptodate.h" +#include "super.h" /* Number of local quota structures per block */ static inline unsigned int ol_quota_entries_per_block(struct super_block *sb) @@ -119,12 +120,8 @@ static int ocfs2_modify_bh(struct inode *inode, struct buffer_head *bh, lock_buffer(bh); modify(bh, private); unlock_buffer(bh); - status = ocfs2_journal_dirty(handle, bh); - if (status < 0) { - mlog_errno(status); - ocfs2_commit_trans(OCFS2_SB(sb), handle); - return status; - } + ocfs2_journal_dirty(handle, bh); + status = ocfs2_commit_trans(OCFS2_SB(sb), handle); if (status < 0) { mlog_errno(status); @@ -133,6 +130,39 @@ static int ocfs2_modify_bh(struct inode *inode, struct buffer_head *bh, return 0; } +/* + * Read quota block from a given logical offset. + * + * This function acquires ip_alloc_sem and thus it must not be called with a + * transaction started. + */ +static int ocfs2_read_quota_block(struct inode *inode, u64 v_block, + struct buffer_head **bh) +{ + int rc = 0; + struct buffer_head *tmp = *bh; + + if (i_size_read(inode) >> inode->i_sb->s_blocksize_bits <= v_block) { + ocfs2_error(inode->i_sb, + "Quota file %llu is probably corrupted! Requested " + "to read block %Lu but file has size only %Lu\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)v_block, + (unsigned long long)i_size_read(inode)); + return -EIO; + } + rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, 0, + ocfs2_validate_quota_block); + if (rc) + mlog_errno(rc); + + /* If ocfs2_read_virt_blocks() got us a new bh, pass it up. */ + if (!rc && !*bh) + *bh = tmp; + + return rc; +} + /* Check whether we understand format of quota files */ static int ocfs2_local_check_quota_file(struct super_block *sb, int type) { @@ -523,9 +553,7 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode, ocfs2_clear_bit(bit, dchunk->dqc_bitmap); le32_add_cpu(&dchunk->dqc_free, 1); unlock_buffer(qbh); - status = ocfs2_journal_dirty(handle, qbh); - if (status < 0) - mlog_errno(status); + ocfs2_journal_dirty(handle, qbh); out_commit: mutex_unlock(&sb_dqopt(sb)->dqio_mutex); ocfs2_commit_trans(OCFS2_SB(sb), handle); @@ -631,9 +659,7 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb, lock_buffer(bh); ldinfo->dqi_flags = cpu_to_le32(flags | OLQF_CLEAN); unlock_buffer(bh); - status = ocfs2_journal_dirty(handle, bh); - if (status < 0) - mlog_errno(status); + ocfs2_journal_dirty(handle, bh); out_trans: ocfs2_commit_trans(osb, handle); out_bh: @@ -679,7 +705,7 @@ static int ocfs2_local_read_info(struct super_block *sb, int type) INIT_LIST_HEAD(&oinfo->dqi_chunk); oinfo->dqi_rec = NULL; oinfo->dqi_lqi_bh = NULL; - oinfo->dqi_ibh = NULL; + oinfo->dqi_libh = NULL; status = ocfs2_global_read_info(sb, type); if (status < 0) @@ -705,7 +731,7 @@ static int ocfs2_local_read_info(struct super_block *sb, int type) info->dqi_flags = le32_to_cpu(ldinfo->dqi_flags); oinfo->dqi_chunks = le32_to_cpu(ldinfo->dqi_chunks); oinfo->dqi_blocks = le32_to_cpu(ldinfo->dqi_blocks); - oinfo->dqi_ibh = bh; + oinfo->dqi_libh = bh; /* We crashed when using local quota file? */ if (!(info->dqi_flags & OLQF_CLEAN)) { @@ -767,7 +793,7 @@ static int ocfs2_local_write_info(struct super_block *sb, int type) { struct mem_dqinfo *info = sb_dqinfo(sb, type); struct buffer_head *bh = ((struct ocfs2_mem_dqinfo *)info->dqi_priv) - ->dqi_ibh; + ->dqi_libh; int status; status = ocfs2_modify_bh(sb_dqopt(sb)->files[type], bh, olq_update_info, @@ -790,10 +816,6 @@ static int ocfs2_local_free_info(struct super_block *sb, int type) int mark_clean = 1, len; int status; - /* At this point we know there are no more dquots and thus - * even if there's some sync in the pdflush queue, it won't - * find any dquots and return without doing anything */ - cancel_delayed_work_sync(&oinfo->dqi_sync_work); iput(oinfo->dqi_gqinode); ocfs2_simple_drop_lockres(OCFS2_SB(sb), &oinfo->dqi_gqlock); ocfs2_lock_res_free(&oinfo->dqi_gqlock); @@ -828,7 +850,7 @@ static int ocfs2_local_free_info(struct super_block *sb, int type) /* Mark local file as clean */ info->dqi_flags |= OLQF_CLEAN; status = ocfs2_modify_bh(sb_dqopt(sb)->files[type], - oinfo->dqi_ibh, + oinfo->dqi_libh, olq_update_info, info); if (status < 0) { @@ -838,7 +860,7 @@ static int ocfs2_local_free_info(struct super_block *sb, int type) out: ocfs2_inode_unlock(sb_dqopt(sb)->files[type], 1); - brelse(oinfo->dqi_ibh); + brelse(oinfo->dqi_libh); brelse(oinfo->dqi_lqi_bh); kfree(oinfo); return 0; @@ -866,22 +888,21 @@ static void olq_set_dquot(struct buffer_head *bh, void *private) } /* Write dquot to local quota file */ -static int ocfs2_local_write_dquot(struct dquot *dquot) +int ocfs2_local_write_dquot(struct dquot *dquot) { struct super_block *sb = dquot->dq_sb; struct ocfs2_dquot *od = OCFS2_DQUOT(dquot); - struct buffer_head *bh = NULL; + struct buffer_head *bh; + struct inode *lqinode = sb_dqopt(sb)->files[dquot->dq_type]; int status; - status = ocfs2_read_quota_block(sb_dqopt(sb)->files[dquot->dq_type], - ol_dqblk_file_block(sb, od->dq_local_off), - &bh); + status = ocfs2_read_quota_phys_block(lqinode, od->dq_local_phys_blk, + &bh); if (status) { mlog_errno(status); goto out; } - status = ocfs2_modify_bh(sb_dqopt(sb)->files[dquot->dq_type], bh, - olq_set_dquot, od); + status = ocfs2_modify_bh(lqinode, bh, olq_set_dquot, od); if (status < 0) { mlog_errno(status); goto out; @@ -981,10 +1002,8 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk( } /* Initialize chunk header */ - down_read(&OCFS2_I(lqinode)->ip_alloc_sem); status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks, &p_blkno, NULL, NULL); - up_read(&OCFS2_I(lqinode)->ip_alloc_sem); if (status < 0) { mlog_errno(status); goto out_trans; @@ -1009,17 +1028,11 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk( sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) - OCFS2_QBLK_RESERVED_SPACE); unlock_buffer(bh); - status = ocfs2_journal_dirty(handle, bh); - if (status < 0) { - mlog_errno(status); - goto out_trans; - } + ocfs2_journal_dirty(handle, bh); /* Initialize new block with structures */ - down_read(&OCFS2_I(lqinode)->ip_alloc_sem); status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks + 1, &p_blkno, NULL, NULL); - up_read(&OCFS2_I(lqinode)->ip_alloc_sem); if (status < 0) { mlog_errno(status); goto out_trans; @@ -1040,11 +1053,7 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk( lock_buffer(dbh); memset(dbh->b_data, 0, sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE); unlock_buffer(dbh); - status = ocfs2_journal_dirty(handle, dbh); - if (status < 0) { - mlog_errno(status); - goto out_trans; - } + ocfs2_journal_dirty(handle, dbh); /* Update local quotafile info */ oinfo->dqi_blocks += 2; @@ -1120,10 +1129,8 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file( } /* Get buffer from the just added block */ - down_read(&OCFS2_I(lqinode)->ip_alloc_sem); status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks, &p_blkno, NULL, NULL); - up_read(&OCFS2_I(lqinode)->ip_alloc_sem); if (status < 0) { mlog_errno(status); goto out; @@ -1155,11 +1162,8 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file( lock_buffer(bh); memset(bh->b_data, 0, sb->s_blocksize); unlock_buffer(bh); - status = ocfs2_journal_dirty(handle, bh); - if (status < 0) { - mlog_errno(status); - goto out_trans; - } + ocfs2_journal_dirty(handle, bh); + /* Update chunk header */ status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode), chunk->qc_headerbh, @@ -1173,11 +1177,8 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file( lock_buffer(chunk->qc_headerbh); le32_add_cpu(&dchunk->dqc_free, ol_quota_entries_per_block(sb)); unlock_buffer(chunk->qc_headerbh); - status = ocfs2_journal_dirty(handle, chunk->qc_headerbh); - if (status < 0) { - mlog_errno(status); - goto out_trans; - } + ocfs2_journal_dirty(handle, chunk->qc_headerbh); + /* Update file header */ oinfo->dqi_blocks++; status = ocfs2_local_write_info(sb, type); @@ -1210,7 +1211,7 @@ static void olq_alloc_dquot(struct buffer_head *bh, void *private) } /* Create dquot in the local file for given id */ -static int ocfs2_create_local_dquot(struct dquot *dquot) +int ocfs2_create_local_dquot(struct dquot *dquot) { struct super_block *sb = dquot->dq_sb; int type = dquot->dq_type; @@ -1219,17 +1220,27 @@ static int ocfs2_create_local_dquot(struct dquot *dquot) struct ocfs2_dquot *od = OCFS2_DQUOT(dquot); int offset; int status; + u64 pcount; + down_write(&OCFS2_I(lqinode)->ip_alloc_sem); chunk = ocfs2_find_free_entry(sb, type, &offset); if (!chunk) { chunk = ocfs2_extend_local_quota_file(sb, type, &offset); - if (IS_ERR(chunk)) - return PTR_ERR(chunk); + if (IS_ERR(chunk)) { + status = PTR_ERR(chunk); + goto out; + } } else if (IS_ERR(chunk)) { - return PTR_ERR(chunk); + status = PTR_ERR(chunk); + goto out; } od->dq_local_off = ol_dqblk_off(sb, chunk->qc_num, offset); od->dq_chunk = chunk; + status = ocfs2_extent_map_get_blocks(lqinode, + ol_dqblk_block(sb, chunk->qc_num, offset), + &od->dq_local_phys_blk, + &pcount, + NULL); /* Initialize dquot structure on disk */ status = ocfs2_local_write_dquot(dquot); @@ -1246,39 +1257,15 @@ static int ocfs2_create_local_dquot(struct dquot *dquot) goto out; } out: + up_write(&OCFS2_I(lqinode)->ip_alloc_sem); return status; } -/* Create entry in local file for dquot, load data from the global file */ -static int ocfs2_local_read_dquot(struct dquot *dquot) -{ - int status; - - mlog_entry("id=%u, type=%d\n", dquot->dq_id, dquot->dq_type); - - status = ocfs2_global_read_dquot(dquot); - if (status < 0) { - mlog_errno(status); - goto out_err; - } - - /* Now create entry in the local quota file */ - status = ocfs2_create_local_dquot(dquot); - if (status < 0) { - mlog_errno(status); - goto out_err; - } - mlog_exit(0); - return 0; -out_err: - mlog_exit(status); - return status; -} - -/* Release dquot structure from local quota file. ocfs2_release_dquot() has - * already started a transaction and obtained exclusive lock for global - * quota file. */ -static int ocfs2_local_release_dquot(struct dquot *dquot) +/* + * Release dquot structure from local quota file. ocfs2_release_dquot() has + * already started a transaction and written all changes to global quota file + */ +int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot) { int status; int type = dquot->dq_type; @@ -1286,15 +1273,6 @@ static int ocfs2_local_release_dquot(struct dquot *dquot) struct super_block *sb = dquot->dq_sb; struct ocfs2_local_disk_chunk *dchunk; int offset; - handle_t *handle = journal_current_handle(); - - BUG_ON(!handle); - /* First write all local changes to global file */ - status = ocfs2_global_release_dquot(dquot); - if (status < 0) { - mlog_errno(status); - goto out; - } status = ocfs2_journal_access_dq(handle, INODE_CACHE(sb_dqopt(sb)->files[type]), @@ -1312,12 +1290,8 @@ static int ocfs2_local_release_dquot(struct dquot *dquot) ocfs2_clear_bit(offset, dchunk->dqc_bitmap); le32_add_cpu(&dchunk->dqc_free, 1); unlock_buffer(od->dq_chunk->qc_headerbh); - status = ocfs2_journal_dirty(handle, od->dq_chunk->qc_headerbh); - if (status < 0) { - mlog_errno(status); - goto out; - } - status = 0; + ocfs2_journal_dirty(handle, od->dq_chunk->qc_headerbh); + out: /* Clear the read bit so that next time someone uses this * dquot he reads fresh info from disk and allocates local @@ -1331,9 +1305,6 @@ static const struct quota_format_ops ocfs2_format_ops = { .read_file_info = ocfs2_local_read_info, .write_file_info = ocfs2_global_write_info, .free_file_info = ocfs2_local_free_info, - .read_dqblk = ocfs2_local_read_dquot, - .commit_dqblk = ocfs2_local_write_dquot, - .release_dqblk = ocfs2_local_release_dquot, }; struct quota_format_type ocfs2_quota_format = { diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 5cbcd0f008f..4793f36f651 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -570,7 +570,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode, struct ocfs2_refcount_tree *new_tree = NULL, *tree = NULL; u16 suballoc_bit_start; u32 num_got; - u64 first_blkno; + u64 suballoc_loc, first_blkno; BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL); @@ -596,7 +596,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode, goto out_commit; } - ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1, + ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc, &suballoc_bit_start, &num_got, &first_blkno); if (ret) { @@ -626,6 +626,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode, memset(rb, 0, inode->i_sb->s_blocksize); strcpy((void *)rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE); rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot); + rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc); rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); rb->rf_fs_generation = cpu_to_le32(osb->fs_generation); rb->rf_blkno = cpu_to_le64(first_blkno); @@ -790,7 +791,10 @@ int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh) if (le32_to_cpu(rb->rf_count) == 1) { blk = le64_to_cpu(rb->rf_blkno); bit = le16_to_cpu(rb->rf_suballoc_bit); - bg_blkno = ocfs2_which_suballoc_group(blk, bit); + if (rb->rf_suballoc_loc) + bg_blkno = le64_to_cpu(rb->rf_suballoc_loc); + else + bg_blkno = ocfs2_which_suballoc_group(blk, bit); alloc_inode = ocfs2_get_system_file_inode(osb, EXTENT_ALLOC_SYSTEM_INODE, @@ -1268,9 +1272,7 @@ static int ocfs2_change_refcount_rec(handle_t *handle, } else if (merge) ocfs2_refcount_rec_merge(rb, index); - ret = ocfs2_journal_dirty(handle, ref_leaf_bh); - if (ret) - mlog_errno(ret); + ocfs2_journal_dirty(handle, ref_leaf_bh); out: return ret; } @@ -1284,7 +1286,7 @@ static int ocfs2_expand_inline_ref_root(handle_t *handle, int ret; u16 suballoc_bit_start; u32 num_got; - u64 blkno; + u64 suballoc_loc, blkno; struct super_block *sb = ocfs2_metadata_cache_get_super(ci); struct buffer_head *new_bh = NULL; struct ocfs2_refcount_block *new_rb; @@ -1298,7 +1300,7 @@ static int ocfs2_expand_inline_ref_root(handle_t *handle, goto out; } - ret = ocfs2_claim_metadata(OCFS2_SB(sb), handle, meta_ac, 1, + ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc, &suballoc_bit_start, &num_got, &blkno); if (ret) { @@ -1330,6 +1332,7 @@ static int ocfs2_expand_inline_ref_root(handle_t *handle, new_rb = (struct ocfs2_refcount_block *)new_bh->b_data; new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot); + new_rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc); new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); new_rb->rf_blkno = cpu_to_le64(blkno); new_rb->rf_cpos = cpu_to_le32(0); @@ -1524,7 +1527,7 @@ static int ocfs2_new_leaf_refcount_block(handle_t *handle, int ret; u16 suballoc_bit_start; u32 num_got, new_cpos; - u64 blkno; + u64 suballoc_loc, blkno; struct super_block *sb = ocfs2_metadata_cache_get_super(ci); struct ocfs2_refcount_block *root_rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; @@ -1548,7 +1551,7 @@ static int ocfs2_new_leaf_refcount_block(handle_t *handle, goto out; } - ret = ocfs2_claim_metadata(OCFS2_SB(sb), handle, meta_ac, 1, + ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc, &suballoc_bit_start, &num_got, &blkno); if (ret) { @@ -1576,6 +1579,7 @@ static int ocfs2_new_leaf_refcount_block(handle_t *handle, memset(new_rb, 0, sb->s_blocksize); strcpy((void *)new_rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE); new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot); + new_rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc); new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); new_rb->rf_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation); new_rb->rf_blkno = cpu_to_le64(blkno); @@ -1694,7 +1698,7 @@ static int ocfs2_adjust_refcount_rec(handle_t *handle, * 2 more credits, one for the leaf refcount block, one for * the extent block contains the extent rec. */ - ret = ocfs2_extend_trans(handle, handle->h_buffer_credits + 2); + ret = ocfs2_extend_trans(handle, 2); if (ret < 0) { mlog_errno(ret); goto out; @@ -1802,11 +1806,7 @@ static int ocfs2_insert_refcount_rec(handle_t *handle, if (merge) ocfs2_refcount_rec_merge(rb, index); - ret = ocfs2_journal_dirty(handle, ref_leaf_bh); - if (ret) { - mlog_errno(ret); - goto out; - } + ocfs2_journal_dirty(handle, ref_leaf_bh); if (index == 0) { ret = ocfs2_adjust_refcount_rec(handle, ci, @@ -1977,9 +1977,7 @@ static int ocfs2_split_refcount_rec(handle_t *handle, ocfs2_refcount_rec_merge(rb, index); } - ret = ocfs2_journal_dirty(handle, ref_leaf_bh); - if (ret) - mlog_errno(ret); + ocfs2_journal_dirty(handle, ref_leaf_bh); out: brelse(new_bh); @@ -2112,6 +2110,7 @@ static int ocfs2_remove_refcount_extent(handle_t *handle, */ ret = ocfs2_cache_block_dealloc(dealloc, EXTENT_ALLOC_SYSTEM_INODE, le16_to_cpu(rb->rf_suballoc_slot), + le64_to_cpu(rb->rf_suballoc_loc), le64_to_cpu(rb->rf_blkno), le16_to_cpu(rb->rf_suballoc_bit)); if (ret) { @@ -2516,20 +2515,19 @@ out: * * Normally the refcount blocks store these refcount should be * contiguous also, so that we can get the number easily. - * As for meta_ac, we will at most add split 2 refcount record and - * 2 more refcount block, so just check it in a rough way. + * We will at most add split 2 refcount records and 2 more + * refcount blocks, so just check it in a rough way. * * Caller must hold refcount tree lock. */ int ocfs2_prepare_refcount_change_for_del(struct inode *inode, - struct buffer_head *di_bh, + u64 refcount_loc, u64 phys_blkno, u32 clusters, int *credits, - struct ocfs2_alloc_context **meta_ac) + int *ref_blocks) { - int ret, ref_blocks = 0; - struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + int ret; struct ocfs2_inode_info *oi = OCFS2_I(inode); struct buffer_head *ref_root_bh = NULL; struct ocfs2_refcount_tree *tree; @@ -2546,14 +2544,13 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode, BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb), - le64_to_cpu(di->i_refcount_loc), &tree); + refcount_loc, &tree); if (ret) { mlog_errno(ret); goto out; } - ret = ocfs2_read_refcount_block(&tree->rf_ci, - le64_to_cpu(di->i_refcount_loc), + ret = ocfs2_read_refcount_block(&tree->rf_ci, refcount_loc, &ref_root_bh); if (ret) { mlog_errno(ret); @@ -2564,21 +2561,14 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode, &tree->rf_ci, ref_root_bh, start_cpos, clusters, - &ref_blocks, credits); + ref_blocks, credits); if (ret) { mlog_errno(ret); goto out; } - mlog(0, "reserve new metadata %d, credits = %d\n", - ref_blocks, *credits); - - if (ref_blocks) { - ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb), - ref_blocks, meta_ac); - if (ret) - mlog_errno(ret); - } + mlog(0, "reserve new metadata %d blocks, credits = %d\n", + *ref_blocks, *credits); out: brelse(ref_root_bh); @@ -3040,11 +3030,7 @@ static int ocfs2_duplicate_clusters_by_jbd(handle_t *handle, } memcpy(new_bh->b_data, old_bh->b_data, sb->s_blocksize); - ret = ocfs2_journal_dirty(handle, new_bh); - if (ret) { - mlog_errno(ret); - break; - } + ocfs2_journal_dirty(handle, new_bh); brelse(new_bh); brelse(old_bh); @@ -3282,7 +3268,7 @@ static int ocfs2_make_clusters_writable(struct super_block *sb, } else { delete = 1; - ret = __ocfs2_claim_clusters(osb, handle, + ret = __ocfs2_claim_clusters(handle, context->data_ac, 1, set_len, &new_bit, &new_len); diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h index c1d19b1d3ec..9983ba1570e 100644 --- a/fs/ocfs2/refcounttree.h +++ b/fs/ocfs2/refcounttree.h @@ -47,11 +47,11 @@ int ocfs2_decrease_refcount(struct inode *inode, struct ocfs2_cached_dealloc_ctxt *dealloc, int delete); int ocfs2_prepare_refcount_change_for_del(struct inode *inode, - struct buffer_head *di_bh, + u64 refcount_loc, u64 phys_blkno, u32 clusters, int *credits, - struct ocfs2_alloc_context **meta_ac); + int *ref_blocks); int ocfs2_refcount_cow(struct inode *inode, struct buffer_head *di_bh, u32 cpos, u32 write_len, u32 max_cpos); diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c new file mode 100644 index 00000000000..40650021fc2 --- /dev/null +++ b/fs/ocfs2/reservations.c @@ -0,0 +1,847 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * reservations.c + * + * Allocation reservations implementation + * + * Some code borrowed from fs/ext3/balloc.c and is: + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * The rest is copyright (C) 2010 Novell. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include <linux/fs.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/highmem.h> +#include <linux/bitops.h> +#include <linux/list.h> + +#define MLOG_MASK_PREFIX ML_RESERVATIONS +#include <cluster/masklog.h> + +#include "ocfs2.h" + +#ifdef CONFIG_OCFS2_DEBUG_FS +#define OCFS2_CHECK_RESERVATIONS +#endif + +DEFINE_SPINLOCK(resv_lock); + +#define OCFS2_MIN_RESV_WINDOW_BITS 8 +#define OCFS2_MAX_RESV_WINDOW_BITS 1024 + +int ocfs2_dir_resv_allowed(struct ocfs2_super *osb) +{ + return (osb->osb_resv_level && osb->osb_dir_resv_level); +} + +static unsigned int ocfs2_resv_window_bits(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv) +{ + struct ocfs2_super *osb = resmap->m_osb; + unsigned int bits; + + if (!(resv->r_flags & OCFS2_RESV_FLAG_DIR)) { + /* 8, 16, 32, 64, 128, 256, 512, 1024 */ + bits = 4 << osb->osb_resv_level; + } else { + bits = 4 << osb->osb_dir_resv_level; + } + return bits; +} + +static inline unsigned int ocfs2_resv_end(struct ocfs2_alloc_reservation *resv) +{ + if (resv->r_len) + return resv->r_start + resv->r_len - 1; + return resv->r_start; +} + +static inline int ocfs2_resv_empty(struct ocfs2_alloc_reservation *resv) +{ + return !!(resv->r_len == 0); +} + +static inline int ocfs2_resmap_disabled(struct ocfs2_reservation_map *resmap) +{ + if (resmap->m_osb->osb_resv_level == 0) + return 1; + return 0; +} + +static void ocfs2_dump_resv(struct ocfs2_reservation_map *resmap) +{ + struct ocfs2_super *osb = resmap->m_osb; + struct rb_node *node; + struct ocfs2_alloc_reservation *resv; + int i = 0; + + mlog(ML_NOTICE, "Dumping resmap for device %s. Bitmap length: %u\n", + osb->dev_str, resmap->m_bitmap_len); + + node = rb_first(&resmap->m_reservations); + while (node) { + resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node); + + mlog(ML_NOTICE, "start: %u\tend: %u\tlen: %u\tlast_start: %u" + "\tlast_len: %u\n", resv->r_start, + ocfs2_resv_end(resv), resv->r_len, resv->r_last_start, + resv->r_last_len); + + node = rb_next(node); + i++; + } + + mlog(ML_NOTICE, "%d reservations found. LRU follows\n", i); + + i = 0; + list_for_each_entry(resv, &resmap->m_lru, r_lru) { + mlog(ML_NOTICE, "LRU(%d) start: %u\tend: %u\tlen: %u\t" + "last_start: %u\tlast_len: %u\n", i, resv->r_start, + ocfs2_resv_end(resv), resv->r_len, resv->r_last_start, + resv->r_last_len); + + i++; + } +} + +#ifdef OCFS2_CHECK_RESERVATIONS +static int ocfs2_validate_resmap_bits(struct ocfs2_reservation_map *resmap, + int i, + struct ocfs2_alloc_reservation *resv) +{ + char *disk_bitmap = resmap->m_disk_bitmap; + unsigned int start = resv->r_start; + unsigned int end = ocfs2_resv_end(resv); + + while (start <= end) { + if (ocfs2_test_bit(start, disk_bitmap)) { + mlog(ML_ERROR, + "reservation %d covers an allocated area " + "starting at bit %u!\n", i, start); + return 1; + } + + start++; + } + return 0; +} + +static void ocfs2_check_resmap(struct ocfs2_reservation_map *resmap) +{ + unsigned int off = 0; + int i = 0; + struct rb_node *node; + struct ocfs2_alloc_reservation *resv; + + node = rb_first(&resmap->m_reservations); + while (node) { + resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node); + + if (i > 0 && resv->r_start <= off) { + mlog(ML_ERROR, "reservation %d has bad start off!\n", + i); + goto bad; + } + + if (resv->r_len == 0) { + mlog(ML_ERROR, "reservation %d has no length!\n", + i); + goto bad; + } + + if (resv->r_start > ocfs2_resv_end(resv)) { + mlog(ML_ERROR, "reservation %d has invalid range!\n", + i); + goto bad; + } + + if (ocfs2_resv_end(resv) >= resmap->m_bitmap_len) { + mlog(ML_ERROR, "reservation %d extends past bitmap!\n", + i); + goto bad; + } + + if (ocfs2_validate_resmap_bits(resmap, i, resv)) + goto bad; + + off = ocfs2_resv_end(resv); + node = rb_next(node); + + i++; + } + return; + +bad: + ocfs2_dump_resv(resmap); + BUG(); +} +#else +static inline void ocfs2_check_resmap(struct ocfs2_reservation_map *resmap) +{ + +} +#endif + +void ocfs2_resv_init_once(struct ocfs2_alloc_reservation *resv) +{ + memset(resv, 0, sizeof(*resv)); + INIT_LIST_HEAD(&resv->r_lru); +} + +void ocfs2_resv_set_type(struct ocfs2_alloc_reservation *resv, + unsigned int flags) +{ + BUG_ON(flags & ~OCFS2_RESV_TYPES); + + resv->r_flags |= flags; +} + +int ocfs2_resmap_init(struct ocfs2_super *osb, + struct ocfs2_reservation_map *resmap) +{ + memset(resmap, 0, sizeof(*resmap)); + + resmap->m_osb = osb; + resmap->m_reservations = RB_ROOT; + /* m_bitmap_len is initialized to zero by the above memset. */ + INIT_LIST_HEAD(&resmap->m_lru); + + return 0; +} + +static void ocfs2_resv_mark_lru(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv) +{ + assert_spin_locked(&resv_lock); + + if (!list_empty(&resv->r_lru)) + list_del_init(&resv->r_lru); + + list_add_tail(&resv->r_lru, &resmap->m_lru); +} + +static void __ocfs2_resv_trunc(struct ocfs2_alloc_reservation *resv) +{ + resv->r_len = 0; + resv->r_start = 0; +} + +static void ocfs2_resv_remove(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv) +{ + if (resv->r_flags & OCFS2_RESV_FLAG_INUSE) { + list_del_init(&resv->r_lru); + rb_erase(&resv->r_node, &resmap->m_reservations); + resv->r_flags &= ~OCFS2_RESV_FLAG_INUSE; + } +} + +static void __ocfs2_resv_discard(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv) +{ + assert_spin_locked(&resv_lock); + + __ocfs2_resv_trunc(resv); + /* + * last_len and last_start no longer make sense if + * we're changing the range of our allocations. + */ + resv->r_last_len = resv->r_last_start = 0; + + ocfs2_resv_remove(resmap, resv); +} + +/* does nothing if 'resv' is null */ +void ocfs2_resv_discard(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv) +{ + if (resv) { + spin_lock(&resv_lock); + __ocfs2_resv_discard(resmap, resv); + spin_unlock(&resv_lock); + } +} + +static void ocfs2_resmap_clear_all_resv(struct ocfs2_reservation_map *resmap) +{ + struct rb_node *node; + struct ocfs2_alloc_reservation *resv; + + assert_spin_locked(&resv_lock); + + while ((node = rb_last(&resmap->m_reservations)) != NULL) { + resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node); + + __ocfs2_resv_discard(resmap, resv); + } +} + +void ocfs2_resmap_restart(struct ocfs2_reservation_map *resmap, + unsigned int clen, char *disk_bitmap) +{ + if (ocfs2_resmap_disabled(resmap)) + return; + + spin_lock(&resv_lock); + + ocfs2_resmap_clear_all_resv(resmap); + resmap->m_bitmap_len = clen; + resmap->m_disk_bitmap = disk_bitmap; + + spin_unlock(&resv_lock); +} + +void ocfs2_resmap_uninit(struct ocfs2_reservation_map *resmap) +{ + /* Does nothing for now. Keep this around for API symmetry */ +} + +static void ocfs2_resv_insert(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *new) +{ + struct rb_root *root = &resmap->m_reservations; + struct rb_node *parent = NULL; + struct rb_node **p = &root->rb_node; + struct ocfs2_alloc_reservation *tmp; + + assert_spin_locked(&resv_lock); + + mlog(0, "Insert reservation start: %u len: %u\n", new->r_start, + new->r_len); + + while (*p) { + parent = *p; + + tmp = rb_entry(parent, struct ocfs2_alloc_reservation, r_node); + + if (new->r_start < tmp->r_start) { + p = &(*p)->rb_left; + + /* + * This is a good place to check for + * overlapping reservations. + */ + BUG_ON(ocfs2_resv_end(new) >= tmp->r_start); + } else if (new->r_start > ocfs2_resv_end(tmp)) { + p = &(*p)->rb_right; + } else { + /* This should never happen! */ + mlog(ML_ERROR, "Duplicate reservation window!\n"); + BUG(); + } + } + + rb_link_node(&new->r_node, parent, p); + rb_insert_color(&new->r_node, root); + new->r_flags |= OCFS2_RESV_FLAG_INUSE; + + ocfs2_resv_mark_lru(resmap, new); + + ocfs2_check_resmap(resmap); +} + +/** + * ocfs2_find_resv_lhs() - find the window which contains goal + * @resmap: reservation map to search + * @goal: which bit to search for + * + * If a window containing that goal is not found, we return the window + * which comes before goal. Returns NULL on empty rbtree or no window + * before goal. + */ +static struct ocfs2_alloc_reservation * +ocfs2_find_resv_lhs(struct ocfs2_reservation_map *resmap, unsigned int goal) +{ + struct ocfs2_alloc_reservation *resv = NULL; + struct ocfs2_alloc_reservation *prev_resv = NULL; + struct rb_node *node = resmap->m_reservations.rb_node; + + assert_spin_locked(&resv_lock); + + if (!node) + return NULL; + + node = rb_first(&resmap->m_reservations); + while (node) { + resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node); + + if (resv->r_start <= goal && ocfs2_resv_end(resv) >= goal) + break; + + /* Check if we overshot the reservation just before goal? */ + if (resv->r_start > goal) { + resv = prev_resv; + break; + } + + prev_resv = resv; + node = rb_next(node); + } + + return resv; +} + +/* + * We are given a range within the bitmap, which corresponds to a gap + * inside the reservations tree (search_start, search_len). The range + * can be anything from the whole bitmap, to a gap between + * reservations. + * + * The start value of *rstart is insignificant. + * + * This function searches the bitmap range starting at search_start + * with length search_len for a set of contiguous free bits. We try + * to find up to 'wanted' bits, but can sometimes return less. + * + * Returns the length of allocation, 0 if no free bits are found. + * + * *cstart and *clen will also be populated with the result. + */ +static int ocfs2_resmap_find_free_bits(struct ocfs2_reservation_map *resmap, + unsigned int wanted, + unsigned int search_start, + unsigned int search_len, + unsigned int *rstart, + unsigned int *rlen) +{ + void *bitmap = resmap->m_disk_bitmap; + unsigned int best_start, best_len = 0; + int offset, start, found; + + mlog(0, "Find %u bits within range (%u, len %u) resmap len: %u\n", + wanted, search_start, search_len, resmap->m_bitmap_len); + + found = best_start = best_len = 0; + + start = search_start; + while ((offset = ocfs2_find_next_zero_bit(bitmap, resmap->m_bitmap_len, + start)) != -1) { + /* Search reached end of the region */ + if (offset >= (search_start + search_len)) + break; + + if (offset == start) { + /* we found a zero */ + found++; + /* move start to the next bit to test */ + start++; + } else { + /* got a zero after some ones */ + found = 1; + start = offset + 1; + } + if (found > best_len) { + best_len = found; + best_start = start - found; + } + + if (found >= wanted) + break; + } + + if (best_len == 0) + return 0; + + if (best_len >= wanted) + best_len = wanted; + + *rlen = best_len; + *rstart = best_start; + + mlog(0, "Found start: %u len: %u\n", best_start, best_len); + + return *rlen; +} + +static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv, + unsigned int goal, unsigned int wanted) +{ + struct rb_root *root = &resmap->m_reservations; + unsigned int gap_start, gap_end, gap_len; + struct ocfs2_alloc_reservation *prev_resv, *next_resv; + struct rb_node *prev, *next; + unsigned int cstart, clen; + unsigned int best_start = 0, best_len = 0; + + /* + * Nasty cases to consider: + * + * - rbtree is empty + * - our window should be first in all reservations + * - our window should be last in all reservations + * - need to make sure we don't go past end of bitmap + */ + + mlog(0, "resv start: %u resv end: %u goal: %u wanted: %u\n", + resv->r_start, ocfs2_resv_end(resv), goal, wanted); + + assert_spin_locked(&resv_lock); + + if (RB_EMPTY_ROOT(root)) { + /* + * Easiest case - empty tree. We can just take + * whatever window of free bits we want. + */ + + mlog(0, "Empty root\n"); + + clen = ocfs2_resmap_find_free_bits(resmap, wanted, goal, + resmap->m_bitmap_len - goal, + &cstart, &clen); + + /* + * This should never happen - the local alloc window + * will always have free bits when we're called. + */ + BUG_ON(goal == 0 && clen == 0); + + if (clen == 0) + return; + + resv->r_start = cstart; + resv->r_len = clen; + + ocfs2_resv_insert(resmap, resv); + return; + } + + prev_resv = ocfs2_find_resv_lhs(resmap, goal); + + if (prev_resv == NULL) { + mlog(0, "Goal on LHS of leftmost window\n"); + + /* + * A NULL here means that the search code couldn't + * find a window that starts before goal. + * + * However, we can take the first window after goal, + * which is also by definition, the leftmost window in + * the entire tree. If we can find free bits in the + * gap between goal and the LHS window, then the + * reservation can safely be placed there. + * + * Otherwise we fall back to a linear search, checking + * the gaps in between windows for a place to + * allocate. + */ + + next = rb_first(root); + next_resv = rb_entry(next, struct ocfs2_alloc_reservation, + r_node); + + /* + * The search should never return such a window. (see + * comment above + */ + if (next_resv->r_start <= goal) { + mlog(ML_ERROR, "goal: %u next_resv: start %u len %u\n", + goal, next_resv->r_start, next_resv->r_len); + ocfs2_dump_resv(resmap); + BUG(); + } + + clen = ocfs2_resmap_find_free_bits(resmap, wanted, goal, + next_resv->r_start - goal, + &cstart, &clen); + if (clen) { + best_len = clen; + best_start = cstart; + if (best_len == wanted) + goto out_insert; + } + + prev_resv = next_resv; + next_resv = NULL; + } + + prev = &prev_resv->r_node; + + /* Now we do a linear search for a window, starting at 'prev_rsv' */ + while (1) { + next = rb_next(prev); + if (next) { + mlog(0, "One more resv found in linear search\n"); + next_resv = rb_entry(next, + struct ocfs2_alloc_reservation, + r_node); + + gap_start = ocfs2_resv_end(prev_resv) + 1; + gap_end = next_resv->r_start - 1; + gap_len = gap_end - gap_start + 1; + } else { + mlog(0, "No next node\n"); + /* + * We're at the rightmost edge of the + * tree. See if a reservation between this + * window and the end of the bitmap will work. + */ + gap_start = ocfs2_resv_end(prev_resv) + 1; + gap_len = resmap->m_bitmap_len - gap_start; + gap_end = resmap->m_bitmap_len - 1; + } + + /* + * No need to check this gap if we have already found + * a larger region of free bits. + */ + if (gap_len <= best_len) + goto next_resv; + + clen = ocfs2_resmap_find_free_bits(resmap, wanted, gap_start, + gap_len, &cstart, &clen); + if (clen == wanted) { + best_len = clen; + best_start = cstart; + goto out_insert; + } else if (clen > best_len) { + best_len = clen; + best_start = cstart; + } + +next_resv: + if (!next) + break; + + prev = next; + prev_resv = rb_entry(prev, struct ocfs2_alloc_reservation, + r_node); + } + +out_insert: + if (best_len) { + resv->r_start = best_start; + resv->r_len = best_len; + ocfs2_resv_insert(resmap, resv); + } +} + +static void ocfs2_cannibalize_resv(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv, + unsigned int wanted) +{ + struct ocfs2_alloc_reservation *lru_resv; + int tmpwindow = !!(resv->r_flags & OCFS2_RESV_FLAG_TMP); + unsigned int min_bits; + + if (!tmpwindow) + min_bits = ocfs2_resv_window_bits(resmap, resv) >> 1; + else + min_bits = wanted; /* We at know the temp window will use all + * of these bits */ + + /* + * Take the first reservation off the LRU as our 'target'. We + * don't try to be smart about it. There might be a case for + * searching based on size but I don't have enough data to be + * sure. --Mark (3/16/2010) + */ + lru_resv = list_first_entry(&resmap->m_lru, + struct ocfs2_alloc_reservation, r_lru); + + mlog(0, "lru resv: start: %u len: %u end: %u\n", lru_resv->r_start, + lru_resv->r_len, ocfs2_resv_end(lru_resv)); + + /* + * Cannibalize (some or all) of the target reservation and + * feed it to the current window. + */ + if (lru_resv->r_len <= min_bits) { + /* + * Discard completely if size is less than or equal to a + * reasonable threshold - 50% of window bits for non temporary + * windows. + */ + resv->r_start = lru_resv->r_start; + resv->r_len = lru_resv->r_len; + + __ocfs2_resv_discard(resmap, lru_resv); + } else { + unsigned int shrink; + if (tmpwindow) + shrink = min_bits; + else + shrink = lru_resv->r_len / 2; + + lru_resv->r_len -= shrink; + + resv->r_start = ocfs2_resv_end(lru_resv) + 1; + resv->r_len = shrink; + } + + mlog(0, "Reservation now looks like: r_start: %u r_end: %u " + "r_len: %u r_last_start: %u r_last_len: %u\n", + resv->r_start, ocfs2_resv_end(resv), resv->r_len, + resv->r_last_start, resv->r_last_len); + + ocfs2_resv_insert(resmap, resv); +} + +static void ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv, + unsigned int wanted) +{ + unsigned int goal = 0; + + BUG_ON(!ocfs2_resv_empty(resv)); + + /* + * Begin by trying to get a window as close to the previous + * one as possible. Using the most recent allocation as a + * start goal makes sense. + */ + if (resv->r_last_len) { + goal = resv->r_last_start + resv->r_last_len; + if (goal >= resmap->m_bitmap_len) + goal = 0; + } + + __ocfs2_resv_find_window(resmap, resv, goal, wanted); + + /* Search from last alloc didn't work, try once more from beginning. */ + if (ocfs2_resv_empty(resv) && goal != 0) + __ocfs2_resv_find_window(resmap, resv, 0, wanted); + + if (ocfs2_resv_empty(resv)) { + /* + * Still empty? Pull oldest one off the LRU, remove it from + * tree, put this one in it's place. + */ + ocfs2_cannibalize_resv(resmap, resv, wanted); + } + + BUG_ON(ocfs2_resv_empty(resv)); +} + +int ocfs2_resmap_resv_bits(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv, + int *cstart, int *clen) +{ + unsigned int wanted = *clen; + + if (resv == NULL || ocfs2_resmap_disabled(resmap)) + return -ENOSPC; + + spin_lock(&resv_lock); + + /* + * We don't want to over-allocate for temporary + * windows. Otherwise, we run the risk of fragmenting the + * allocation space. + */ + wanted = ocfs2_resv_window_bits(resmap, resv); + if ((resv->r_flags & OCFS2_RESV_FLAG_TMP) || wanted < *clen) + wanted = *clen; + + if (ocfs2_resv_empty(resv)) { + mlog(0, "empty reservation, find new window\n"); + + /* + * Try to get a window here. If it works, we must fall + * through and test the bitmap . This avoids some + * ping-ponging of windows due to non-reserved space + * being allocation before we initialize a window for + * that inode. + */ + ocfs2_resv_find_window(resmap, resv, wanted); + } + + BUG_ON(ocfs2_resv_empty(resv)); + + *cstart = resv->r_start; + *clen = resv->r_len; + + spin_unlock(&resv_lock); + return 0; +} + +static void + ocfs2_adjust_resv_from_alloc(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv, + unsigned int start, unsigned int end) +{ + unsigned int rhs = 0; + unsigned int old_end = ocfs2_resv_end(resv); + + BUG_ON(start != resv->r_start || old_end < end); + + /* + * Completely used? We can remove it then. + */ + if (old_end == end) { + __ocfs2_resv_discard(resmap, resv); + return; + } + + rhs = old_end - end; + + /* + * This should have been trapped above. + */ + BUG_ON(rhs == 0); + + resv->r_start = end + 1; + resv->r_len = old_end - resv->r_start + 1; +} + +void ocfs2_resmap_claimed_bits(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv, + u32 cstart, u32 clen) +{ + unsigned int cend = cstart + clen - 1; + + if (resmap == NULL || ocfs2_resmap_disabled(resmap)) + return; + + if (resv == NULL) + return; + + BUG_ON(cstart != resv->r_start); + + spin_lock(&resv_lock); + + mlog(0, "claim bits: cstart: %u cend: %u clen: %u r_start: %u " + "r_end: %u r_len: %u, r_last_start: %u r_last_len: %u\n", + cstart, cend, clen, resv->r_start, ocfs2_resv_end(resv), + resv->r_len, resv->r_last_start, resv->r_last_len); + + BUG_ON(cstart < resv->r_start); + BUG_ON(cstart > ocfs2_resv_end(resv)); + BUG_ON(cend > ocfs2_resv_end(resv)); + + ocfs2_adjust_resv_from_alloc(resmap, resv, cstart, cend); + resv->r_last_start = cstart; + resv->r_last_len = clen; + + /* + * May have been discarded above from + * ocfs2_adjust_resv_from_alloc(). + */ + if (!ocfs2_resv_empty(resv)) + ocfs2_resv_mark_lru(resmap, resv); + + mlog(0, "Reservation now looks like: r_start: %u r_end: %u " + "r_len: %u r_last_start: %u r_last_len: %u\n", + resv->r_start, ocfs2_resv_end(resv), resv->r_len, + resv->r_last_start, resv->r_last_len); + + ocfs2_check_resmap(resmap); + + spin_unlock(&resv_lock); +} diff --git a/fs/ocfs2/reservations.h b/fs/ocfs2/reservations.h new file mode 100644 index 00000000000..1e49cc29d06 --- /dev/null +++ b/fs/ocfs2/reservations.h @@ -0,0 +1,159 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * reservations.h + * + * Allocation reservations function prototypes and structures. + * + * Copyright (C) 2010 Novell. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#ifndef OCFS2_RESERVATIONS_H +#define OCFS2_RESERVATIONS_H + +#include <linux/rbtree.h> + +#define OCFS2_DEFAULT_RESV_LEVEL 2 +#define OCFS2_MAX_RESV_LEVEL 9 +#define OCFS2_MIN_RESV_LEVEL 0 + +struct ocfs2_alloc_reservation { + struct rb_node r_node; + + unsigned int r_start; /* Begining of current window */ + unsigned int r_len; /* Length of the window */ + + unsigned int r_last_len; /* Length of most recent alloc */ + unsigned int r_last_start; /* Start of most recent alloc */ + struct list_head r_lru; /* LRU list head */ + + unsigned int r_flags; +}; + +#define OCFS2_RESV_FLAG_INUSE 0x01 /* Set when r_node is part of a btree */ +#define OCFS2_RESV_FLAG_TMP 0x02 /* Temporary reservation, will be + * destroyed immedately after use */ +#define OCFS2_RESV_FLAG_DIR 0x04 /* Reservation is for an unindexed + * directory btree */ + +struct ocfs2_reservation_map { + struct rb_root m_reservations; + char *m_disk_bitmap; + + struct ocfs2_super *m_osb; + + /* The following are not initialized to meaningful values until a disk + * bitmap is provided. */ + u32 m_bitmap_len; /* Number of valid + * bits available */ + + struct list_head m_lru; /* LRU of reservations + * structures. */ + +}; + +void ocfs2_resv_init_once(struct ocfs2_alloc_reservation *resv); + +#define OCFS2_RESV_TYPES (OCFS2_RESV_FLAG_TMP|OCFS2_RESV_FLAG_DIR) +void ocfs2_resv_set_type(struct ocfs2_alloc_reservation *resv, + unsigned int flags); + +int ocfs2_dir_resv_allowed(struct ocfs2_super *osb); + +/** + * ocfs2_resv_discard() - truncate a reservation + * @resmap: + * @resv: the reservation to truncate. + * + * After this function is called, the reservation will be empty, and + * unlinked from the rbtree. + */ +void ocfs2_resv_discard(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv); + + +/** + * ocfs2_resmap_init() - Initialize fields of a reservations bitmap + * @resmap: struct ocfs2_reservation_map to initialize + * @obj: unused for now + * @ops: unused for now + * @max_bitmap_bytes: Maximum size of the bitmap (typically blocksize) + * + * Only possible return value other than '0' is -ENOMEM for failure to + * allocation mirror bitmap. + */ +int ocfs2_resmap_init(struct ocfs2_super *osb, + struct ocfs2_reservation_map *resmap); + +/** + * ocfs2_resmap_restart() - "restart" a reservation bitmap + * @resmap: reservations bitmap + * @clen: Number of valid bits in the bitmap + * @disk_bitmap: the disk bitmap this resmap should refer to. + * + * Re-initialize the parameters of a reservation bitmap. This is + * useful for local alloc window slides. + * + * This function will call ocfs2_trunc_resv against all existing + * reservations. A future version will recalculate existing + * reservations based on the new bitmap. + */ +void ocfs2_resmap_restart(struct ocfs2_reservation_map *resmap, + unsigned int clen, char *disk_bitmap); + +/** + * ocfs2_resmap_uninit() - uninitialize a reservation bitmap structure + * @resmap: the struct ocfs2_reservation_map to uninitialize + */ +void ocfs2_resmap_uninit(struct ocfs2_reservation_map *resmap); + +/** + * ocfs2_resmap_resv_bits() - Return still-valid reservation bits + * @resmap: reservations bitmap + * @resv: reservation to base search from + * @cstart: start of proposed allocation + * @clen: length (in clusters) of proposed allocation + * + * Using the reservation data from resv, this function will compare + * resmap and resmap->m_disk_bitmap to determine what part (if any) of + * the reservation window is still clear to use. If resv is empty, + * this function will try to allocate a window for it. + * + * On success, zero is returned and the valid allocation area is set in cstart + * and clen. + * + * Returns -ENOSPC if reservations are disabled. + */ +int ocfs2_resmap_resv_bits(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv, + int *cstart, int *clen); + +/** + * ocfs2_resmap_claimed_bits() - Tell the reservation code that bits were used. + * @resmap: reservations bitmap + * @resv: optional reservation to recalulate based on new bitmap + * @cstart: start of allocation in clusters + * @clen: end of allocation in clusters. + * + * Tell the reservation code that bits were used to fulfill allocation in + * resmap. The bits don't have to have been part of any existing + * reservation. But we must always call this function when bits are claimed. + * Internally, the reservations code will use this information to mark the + * reservations bitmap. If resv is passed, it's next allocation window will be + * calculated. It also expects that 'cstart' is the same as we passed back + * from ocfs2_resmap_resv_bits(). + */ +void ocfs2_resmap_claimed_bits(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv, + u32 cstart, u32 clen); + +#endif /* OCFS2_RESERVATIONS_H */ diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c index 3c3d673a4d2..dacd553d861 100644 --- a/fs/ocfs2/resize.c +++ b/fs/ocfs2/resize.c @@ -134,11 +134,7 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle, le16_add_cpu(&group->bg_free_bits_count, -1 * backups); } - ret = ocfs2_journal_dirty(handle, group_bh); - if (ret < 0) { - mlog_errno(ret); - goto out_rollback; - } + ocfs2_journal_dirty(handle, group_bh); /* update the inode accordingly. */ ret = ocfs2_journal_access_di(handle, INODE_CACHE(bm_inode), bm_bh, @@ -319,7 +315,8 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters) BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); if (le16_to_cpu(fe->id2.i_chain.cl_cpg) != - ocfs2_group_bitmap_size(osb->sb) * 8) { + ocfs2_group_bitmap_size(osb->sb, 0, + osb->s_feature_incompat) * 8) { mlog(ML_ERROR, "The disk is too old and small. " "Force to do offline resize."); ret = -EINVAL; @@ -500,7 +497,8 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input) fe = (struct ocfs2_dinode *)main_bm_bh->b_data; if (le16_to_cpu(fe->id2.i_chain.cl_cpg) != - ocfs2_group_bitmap_size(osb->sb) * 8) { + ocfs2_group_bitmap_size(osb->sb, 0, + osb->s_feature_incompat) * 8) { mlog(ML_ERROR, "The disk is too old and small." " Force to do offline resize."); ret = -EINVAL; @@ -545,12 +543,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input) group = (struct ocfs2_group_desc *)group_bh->b_data; group->bg_next_group = cr->c_blkno; - - ret = ocfs2_journal_dirty(handle, group_bh); - if (ret < 0) { - mlog_errno(ret); - goto out_commit; - } + ocfs2_journal_dirty(handle, group_bh); ret = ocfs2_journal_access_di(handle, INODE_CACHE(main_bm_inode), main_bm_bh, OCFS2_JOURNAL_ACCESS_WRITE); diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 19ba00f2854..f4c2a9eb8c4 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -53,6 +53,15 @@ #define OCFS2_MAX_TO_STEAL 1024 +struct ocfs2_suballoc_result { + u64 sr_bg_blkno; /* The bg we allocated from. Set + to 0 when a block group is + contiguous. */ + u64 sr_blkno; /* The first allocated block */ + unsigned int sr_bit_offset; /* The bit in the bg */ + unsigned int sr_bits; /* How many bits we claimed */ +}; + static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg); static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe); static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl); @@ -60,6 +69,7 @@ static int ocfs2_block_group_fill(handle_t *handle, struct inode *alloc_inode, struct buffer_head *bg_bh, u64 group_blkno, + unsigned int group_clusters, u16 my_chain, struct ocfs2_chain_list *cl); static int ocfs2_block_group_alloc(struct ocfs2_super *osb, @@ -73,20 +83,17 @@ static int ocfs2_cluster_group_search(struct inode *inode, struct buffer_head *group_bh, u32 bits_wanted, u32 min_bits, u64 max_block, - u16 *bit_off, u16 *bits_found); + struct ocfs2_suballoc_result *res); static int ocfs2_block_group_search(struct inode *inode, struct buffer_head *group_bh, u32 bits_wanted, u32 min_bits, u64 max_block, - u16 *bit_off, u16 *bits_found); -static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb, - struct ocfs2_alloc_context *ac, + struct ocfs2_suballoc_result *res); +static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, handle_t *handle, u32 bits_wanted, u32 min_bits, - u16 *bit_off, - unsigned int *num_bits, - u64 *bg_blkno); + struct ocfs2_suballoc_result *res); static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, int nr); static inline int ocfs2_block_group_set_bits(handle_t *handle, @@ -130,6 +137,7 @@ void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac) } brelse(ac->ac_bh); ac->ac_bh = NULL; + ac->ac_resv = NULL; } void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac) @@ -325,14 +333,38 @@ out: return rc; } +static void ocfs2_bg_discontig_add_extent(struct ocfs2_super *osb, + struct ocfs2_group_desc *bg, + struct ocfs2_chain_list *cl, + u64 p_blkno, u32 clusters) +{ + struct ocfs2_extent_list *el = &bg->bg_list; + struct ocfs2_extent_rec *rec; + + BUG_ON(!ocfs2_supports_discontig_bg(osb)); + if (!el->l_next_free_rec) + el->l_count = cpu_to_le16(ocfs2_extent_recs_per_gd(osb->sb)); + rec = &el->l_recs[le16_to_cpu(el->l_next_free_rec)]; + rec->e_blkno = cpu_to_le64(p_blkno); + rec->e_cpos = cpu_to_le32(le16_to_cpu(bg->bg_bits) / + le16_to_cpu(cl->cl_bpc)); + rec->e_leaf_clusters = cpu_to_le32(clusters); + le16_add_cpu(&bg->bg_bits, clusters * le16_to_cpu(cl->cl_bpc)); + le16_add_cpu(&bg->bg_free_bits_count, + clusters * le16_to_cpu(cl->cl_bpc)); + le16_add_cpu(&el->l_next_free_rec, 1); +} + static int ocfs2_block_group_fill(handle_t *handle, struct inode *alloc_inode, struct buffer_head *bg_bh, u64 group_blkno, + unsigned int group_clusters, u16 my_chain, struct ocfs2_chain_list *cl) { int status = 0; + struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb); struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; struct super_block * sb = alloc_inode->i_sb; @@ -359,19 +391,23 @@ static int ocfs2_block_group_fill(handle_t *handle, memset(bg, 0, sb->s_blocksize); strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE); bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation); - bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb)); - bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl)); + bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb, 1, + osb->s_feature_incompat)); bg->bg_chain = cpu_to_le16(my_chain); bg->bg_next_group = cl->cl_recs[my_chain].c_blkno; bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno); bg->bg_blkno = cpu_to_le64(group_blkno); + if (group_clusters == le16_to_cpu(cl->cl_cpg)) + bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl)); + else + ocfs2_bg_discontig_add_extent(osb, bg, cl, group_blkno, + group_clusters); + /* set the 1st bit in the bitmap to account for the descriptor block */ ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap); bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1); - status = ocfs2_journal_dirty(handle, bg_bh); - if (status < 0) - mlog_errno(status); + ocfs2_journal_dirty(handle, bg_bh); /* There is no need to zero out or otherwise initialize the * other blocks in a group - All valid FS metadata in a block @@ -397,6 +433,238 @@ static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl) return best; } +static struct buffer_head * +ocfs2_block_group_alloc_contig(struct ocfs2_super *osb, handle_t *handle, + struct inode *alloc_inode, + struct ocfs2_alloc_context *ac, + struct ocfs2_chain_list *cl) +{ + int status; + u32 bit_off, num_bits; + u64 bg_blkno; + struct buffer_head *bg_bh; + unsigned int alloc_rec = ocfs2_find_smallest_chain(cl); + + status = ocfs2_claim_clusters(handle, ac, + le16_to_cpu(cl->cl_cpg), &bit_off, + &num_bits); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + /* setup the group */ + bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off); + mlog(0, "new descriptor, record %u, at block %llu\n", + alloc_rec, (unsigned long long)bg_blkno); + + bg_bh = sb_getblk(osb->sb, bg_blkno); + if (!bg_bh) { + status = -EIO; + mlog_errno(status); + goto bail; + } + ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh); + + status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh, + bg_blkno, num_bits, alloc_rec, cl); + if (status < 0) { + brelse(bg_bh); + mlog_errno(status); + } + +bail: + return status ? ERR_PTR(status) : bg_bh; +} + +static int ocfs2_block_group_claim_bits(struct ocfs2_super *osb, + handle_t *handle, + struct ocfs2_alloc_context *ac, + unsigned int min_bits, + u32 *bit_off, u32 *num_bits) +{ + int status = 0; + + while (min_bits) { + status = ocfs2_claim_clusters(handle, ac, min_bits, + bit_off, num_bits); + if (status != -ENOSPC) + break; + + min_bits >>= 1; + } + + return status; +} + +static int ocfs2_block_group_grow_discontig(handle_t *handle, + struct inode *alloc_inode, + struct buffer_head *bg_bh, + struct ocfs2_alloc_context *ac, + struct ocfs2_chain_list *cl, + unsigned int min_bits) +{ + int status; + struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb); + struct ocfs2_group_desc *bg = + (struct ocfs2_group_desc *)bg_bh->b_data; + unsigned int needed = le16_to_cpu(cl->cl_cpg) - + le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc); + u32 p_cpos, clusters; + u64 p_blkno; + struct ocfs2_extent_list *el = &bg->bg_list; + + status = ocfs2_journal_access_gd(handle, + INODE_CACHE(alloc_inode), + bg_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + while ((needed > 0) && (le16_to_cpu(el->l_next_free_rec) < + le16_to_cpu(el->l_count))) { + if (min_bits > needed) + min_bits = needed; + status = ocfs2_block_group_claim_bits(osb, handle, ac, + min_bits, &p_cpos, + &clusters); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + p_blkno = ocfs2_clusters_to_blocks(osb->sb, p_cpos); + ocfs2_bg_discontig_add_extent(osb, bg, cl, p_blkno, + clusters); + + min_bits = clusters; + needed = le16_to_cpu(cl->cl_cpg) - + le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc); + } + + if (needed > 0) { + /* + * We have used up all the extent rec but can't fill up + * the cpg. So bail out. + */ + status = -ENOSPC; + goto bail; + } + + ocfs2_journal_dirty(handle, bg_bh); + +bail: + return status; +} + +static void ocfs2_bg_alloc_cleanup(handle_t *handle, + struct ocfs2_alloc_context *cluster_ac, + struct inode *alloc_inode, + struct buffer_head *bg_bh) +{ + int i, ret; + struct ocfs2_group_desc *bg; + struct ocfs2_extent_list *el; + struct ocfs2_extent_rec *rec; + + if (!bg_bh) + return; + + bg = (struct ocfs2_group_desc *)bg_bh->b_data; + el = &bg->bg_list; + for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { + rec = &el->l_recs[i]; + ret = ocfs2_free_clusters(handle, cluster_ac->ac_inode, + cluster_ac->ac_bh, + le64_to_cpu(rec->e_blkno), + le32_to_cpu(rec->e_leaf_clusters)); + if (ret) + mlog_errno(ret); + /* Try all the clusters to free */ + } + + ocfs2_remove_from_cache(INODE_CACHE(alloc_inode), bg_bh); + brelse(bg_bh); +} + +static struct buffer_head * +ocfs2_block_group_alloc_discontig(handle_t *handle, + struct inode *alloc_inode, + struct ocfs2_alloc_context *ac, + struct ocfs2_chain_list *cl) +{ + int status; + u32 bit_off, num_bits; + u64 bg_blkno; + unsigned int min_bits = le16_to_cpu(cl->cl_cpg) >> 1; + struct buffer_head *bg_bh = NULL; + unsigned int alloc_rec = ocfs2_find_smallest_chain(cl); + struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb); + + if (!ocfs2_supports_discontig_bg(osb)) { + status = -ENOSPC; + goto bail; + } + + status = ocfs2_extend_trans(handle, + ocfs2_calc_bg_discontig_credits(osb->sb)); + if (status) { + mlog_errno(status); + goto bail; + } + + /* + * We're going to be grabbing from multiple cluster groups. + * We don't have enough credits to relink them all, and the + * cluster groups will be staying in cache for the duration of + * this operation. + */ + ac->ac_allow_chain_relink = 0; + + /* Claim the first region */ + status = ocfs2_block_group_claim_bits(osb, handle, ac, min_bits, + &bit_off, &num_bits); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + min_bits = num_bits; + + /* setup the group */ + bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off); + mlog(0, "new descriptor, record %u, at block %llu\n", + alloc_rec, (unsigned long long)bg_blkno); + + bg_bh = sb_getblk(osb->sb, bg_blkno); + if (!bg_bh) { + status = -EIO; + mlog_errno(status); + goto bail; + } + ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh); + + status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh, + bg_blkno, num_bits, alloc_rec, cl); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = ocfs2_block_group_grow_discontig(handle, alloc_inode, + bg_bh, ac, cl, min_bits); + if (status) + mlog_errno(status); + +bail: + if (status) + ocfs2_bg_alloc_cleanup(handle, ac, alloc_inode, bg_bh); + return status ? ERR_PTR(status) : bg_bh; +} + /* * We expect the block group allocator to already be locked. */ @@ -412,9 +680,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb, struct ocfs2_chain_list *cl; struct ocfs2_alloc_context *ac = NULL; handle_t *handle = NULL; - u32 bit_off, num_bits; u16 alloc_rec; - u64 bg_blkno; struct buffer_head *bg_bh = NULL; struct ocfs2_group_desc *bg; @@ -447,44 +713,20 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb, (unsigned long long)*last_alloc_group); ac->ac_last_group = *last_alloc_group; } - status = ocfs2_claim_clusters(osb, - handle, - ac, - le16_to_cpu(cl->cl_cpg), - &bit_off, - &num_bits); - if (status < 0) { + + bg_bh = ocfs2_block_group_alloc_contig(osb, handle, alloc_inode, + ac, cl); + if (IS_ERR(bg_bh) && (PTR_ERR(bg_bh) == -ENOSPC)) + bg_bh = ocfs2_block_group_alloc_discontig(handle, + alloc_inode, + ac, cl); + if (IS_ERR(bg_bh)) { + status = PTR_ERR(bg_bh); + bg_bh = NULL; if (status != -ENOSPC) mlog_errno(status); goto bail; } - - alloc_rec = ocfs2_find_smallest_chain(cl); - - /* setup the group */ - bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off); - mlog(0, "new descriptor, record %u, at block %llu\n", - alloc_rec, (unsigned long long)bg_blkno); - - bg_bh = sb_getblk(osb->sb, bg_blkno); - if (!bg_bh) { - status = -EIO; - mlog_errno(status); - goto bail; - } - ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh); - - status = ocfs2_block_group_fill(handle, - alloc_inode, - bg_bh, - bg_blkno, - alloc_rec, - cl); - if (status < 0) { - mlog_errno(status); - goto bail; - } - bg = (struct ocfs2_group_desc *) bg_bh->b_data; status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode), @@ -494,10 +736,12 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb, goto bail; } + alloc_rec = le16_to_cpu(bg->bg_chain); le32_add_cpu(&cl->cl_recs[alloc_rec].c_free, le16_to_cpu(bg->bg_free_bits_count)); - le32_add_cpu(&cl->cl_recs[alloc_rec].c_total, le16_to_cpu(bg->bg_bits)); - cl->cl_recs[alloc_rec].c_blkno = cpu_to_le64(bg_blkno); + le32_add_cpu(&cl->cl_recs[alloc_rec].c_total, + le16_to_cpu(bg->bg_bits)); + cl->cl_recs[alloc_rec].c_blkno = cpu_to_le64(bg->bg_blkno); if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count)) le16_add_cpu(&cl->cl_next_free_rec, 1); @@ -506,11 +750,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb, le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits)); le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg)); - status = ocfs2_journal_dirty(handle, bh); - if (status < 0) { - mlog_errno(status); - goto bail; - } + ocfs2_journal_dirty(handle, bh); spin_lock(&OCFS2_I(alloc_inode)->ip_lock); OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters); @@ -760,7 +1000,7 @@ int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb, status = ocfs2_reserve_suballoc_bits(osb, (*ac), EXTENT_ALLOC_SYSTEM_INODE, (u32)osb->slot_num, NULL, - ALLOC_NEW_GROUP); + ALLOC_GROUPS_FROM_GLOBAL|ALLOC_NEW_GROUP); if (status >= 0) { @@ -946,11 +1186,7 @@ static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb, status = ocfs2_reserve_local_alloc_bits(osb, bits_wanted, *ac); - if (status == -EFBIG) { - /* The local alloc window is outside ac_max_block. - * use the main bitmap. */ - status = -ENOSPC; - } else if ((status < 0) && (status != -ENOSPC)) { + if ((status < 0) && (status != -ENOSPC)) { mlog_errno(status); goto bail; } @@ -1033,8 +1269,7 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb, struct buffer_head *bg_bh, unsigned int bits_wanted, unsigned int total_bits, - u16 *bit_off, - u16 *bits_found) + struct ocfs2_suballoc_result *res) { void *bitmap; u16 best_offset, best_size; @@ -1078,14 +1313,9 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb, } } - /* XXX: I think the first clause is equivalent to the second - * - jlbec */ - if (found == bits_wanted) { - *bit_off = start - found; - *bits_found = found; - } else if (best_size) { - *bit_off = best_offset; - *bits_found = best_size; + if (best_size) { + res->sr_bit_offset = best_offset; + res->sr_bits = best_size; } else { status = -ENOSPC; /* No error log here -- see the comment above @@ -1129,16 +1359,10 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle, } le16_add_cpu(&bg->bg_free_bits_count, -num_bits); - while(num_bits--) ocfs2_set_bit(bit_off++, bitmap); - status = ocfs2_journal_dirty(handle, - group_bh); - if (status < 0) { - mlog_errno(status); - goto bail; - } + ocfs2_journal_dirty(handle, group_bh); bail: mlog_exit(status); @@ -1202,12 +1426,7 @@ static int ocfs2_relink_block_group(handle_t *handle, } prev_bg->bg_next_group = bg->bg_next_group; - - status = ocfs2_journal_dirty(handle, prev_bg_bh); - if (status < 0) { - mlog_errno(status); - goto out_rollback; - } + ocfs2_journal_dirty(handle, prev_bg_bh); status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode), bg_bh, OCFS2_JOURNAL_ACCESS_WRITE); @@ -1217,12 +1436,7 @@ static int ocfs2_relink_block_group(handle_t *handle, } bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno; - - status = ocfs2_journal_dirty(handle, bg_bh); - if (status < 0) { - mlog_errno(status); - goto out_rollback; - } + ocfs2_journal_dirty(handle, bg_bh); status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode), fe_bh, OCFS2_JOURNAL_ACCESS_WRITE); @@ -1232,14 +1446,8 @@ static int ocfs2_relink_block_group(handle_t *handle, } fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno; + ocfs2_journal_dirty(handle, fe_bh); - status = ocfs2_journal_dirty(handle, fe_bh); - if (status < 0) { - mlog_errno(status); - goto out_rollback; - } - - status = 0; out_rollback: if (status < 0) { fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr); @@ -1263,14 +1471,13 @@ static int ocfs2_cluster_group_search(struct inode *inode, struct buffer_head *group_bh, u32 bits_wanted, u32 min_bits, u64 max_block, - u16 *bit_off, u16 *bits_found) + struct ocfs2_suballoc_result *res) { int search = -ENOSPC; int ret; u64 blkoff; struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - u16 tmp_off, tmp_found; unsigned int max_bits, gd_cluster_off; BUG_ON(!ocfs2_is_cluster_bitmap(inode)); @@ -1297,15 +1504,15 @@ static int ocfs2_cluster_group_search(struct inode *inode, ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb), group_bh, bits_wanted, - max_bits, - &tmp_off, &tmp_found); + max_bits, res); if (ret) return ret; if (max_block) { blkoff = ocfs2_clusters_to_blocks(inode->i_sb, gd_cluster_off + - tmp_off + tmp_found); + res->sr_bit_offset + + res->sr_bits); mlog(0, "Checking %llu against %llu\n", (unsigned long long)blkoff, (unsigned long long)max_block); @@ -1317,16 +1524,14 @@ static int ocfs2_cluster_group_search(struct inode *inode, * return success, but we still want to return * -ENOSPC unless it found the minimum number * of bits. */ - if (min_bits <= tmp_found) { - *bit_off = tmp_off; - *bits_found = tmp_found; + if (min_bits <= res->sr_bits) search = 0; /* success */ - } else if (tmp_found) { + else if (res->sr_bits) { /* * Don't show bits which we'll be returning * for allocation to the local alloc bitmap. */ - ocfs2_local_alloc_seen_free_bits(osb, tmp_found); + ocfs2_local_alloc_seen_free_bits(osb, res->sr_bits); } } @@ -1337,7 +1542,7 @@ static int ocfs2_block_group_search(struct inode *inode, struct buffer_head *group_bh, u32 bits_wanted, u32 min_bits, u64 max_block, - u16 *bit_off, u16 *bits_found) + struct ocfs2_suballoc_result *res) { int ret = -ENOSPC; u64 blkoff; @@ -1350,10 +1555,10 @@ static int ocfs2_block_group_search(struct inode *inode, ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb), group_bh, bits_wanted, le16_to_cpu(bg->bg_bits), - bit_off, bits_found); + res); if (!ret && max_block) { - blkoff = le64_to_cpu(bg->bg_blkno) + *bit_off + - *bits_found; + blkoff = le64_to_cpu(bg->bg_blkno) + + res->sr_bit_offset + res->sr_bits; mlog(0, "Checking %llu against %llu\n", (unsigned long long)blkoff, (unsigned long long)max_block); @@ -1386,33 +1591,76 @@ static int ocfs2_alloc_dinode_update_counts(struct inode *inode, tmp_used = le32_to_cpu(di->id1.bitmap1.i_used); di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used); le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits); - - ret = ocfs2_journal_dirty(handle, di_bh); - if (ret < 0) - mlog_errno(ret); + ocfs2_journal_dirty(handle, di_bh); out: return ret; } +static int ocfs2_bg_discontig_fix_by_rec(struct ocfs2_suballoc_result *res, + struct ocfs2_extent_rec *rec, + struct ocfs2_chain_list *cl) +{ + unsigned int bpc = le16_to_cpu(cl->cl_bpc); + unsigned int bitoff = le32_to_cpu(rec->e_cpos) * bpc; + unsigned int bitcount = le32_to_cpu(rec->e_leaf_clusters) * bpc; + + if (res->sr_bit_offset < bitoff) + return 0; + if (res->sr_bit_offset >= (bitoff + bitcount)) + return 0; + res->sr_blkno = le64_to_cpu(rec->e_blkno) + + (res->sr_bit_offset - bitoff); + if ((res->sr_bit_offset + res->sr_bits) > (bitoff + bitcount)) + res->sr_bits = (bitoff + bitcount) - res->sr_bit_offset; + return 1; +} + +static void ocfs2_bg_discontig_fix_result(struct ocfs2_alloc_context *ac, + struct ocfs2_group_desc *bg, + struct ocfs2_suballoc_result *res) +{ + int i; + u64 bg_blkno = res->sr_bg_blkno; /* Save off */ + struct ocfs2_extent_rec *rec; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data; + struct ocfs2_chain_list *cl = &di->id2.i_chain; + + if (ocfs2_is_cluster_bitmap(ac->ac_inode)) { + res->sr_blkno = 0; + return; + } + + res->sr_blkno = res->sr_bg_blkno + res->sr_bit_offset; + res->sr_bg_blkno = 0; /* Clear it for contig block groups */ + if (!ocfs2_supports_discontig_bg(OCFS2_SB(ac->ac_inode->i_sb)) || + !bg->bg_list.l_next_free_rec) + return; + + for (i = 0; i < le16_to_cpu(bg->bg_list.l_next_free_rec); i++) { + rec = &bg->bg_list.l_recs[i]; + if (ocfs2_bg_discontig_fix_by_rec(res, rec, cl)) { + res->sr_bg_blkno = bg_blkno; /* Restore */ + break; + } + } +} + static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac, handle_t *handle, u32 bits_wanted, u32 min_bits, - u16 *bit_off, - unsigned int *num_bits, - u64 gd_blkno, + struct ocfs2_suballoc_result *res, u16 *bits_left) { int ret; - u16 found; struct buffer_head *group_bh = NULL; struct ocfs2_group_desc *gd; struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data; struct inode *alloc_inode = ac->ac_inode; - ret = ocfs2_read_group_descriptor(alloc_inode, di, gd_blkno, - &group_bh); + ret = ocfs2_read_group_descriptor(alloc_inode, di, + res->sr_bg_blkno, &group_bh); if (ret < 0) { mlog_errno(ret); return ret; @@ -1420,17 +1668,18 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac, gd = (struct ocfs2_group_desc *) group_bh->b_data; ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits, - ac->ac_max_block, bit_off, &found); + ac->ac_max_block, res); if (ret < 0) { if (ret != -ENOSPC) mlog_errno(ret); goto out; } - *num_bits = found; + if (!ret) + ocfs2_bg_discontig_fix_result(ac, gd, res); ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh, - *num_bits, + res->sr_bits, le16_to_cpu(gd->bg_chain)); if (ret < 0) { mlog_errno(ret); @@ -1438,7 +1687,7 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac, } ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh, - *bit_off, *num_bits); + res->sr_bit_offset, res->sr_bits); if (ret < 0) mlog_errno(ret); @@ -1454,13 +1703,11 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, handle_t *handle, u32 bits_wanted, u32 min_bits, - u16 *bit_off, - unsigned int *num_bits, - u64 *bg_blkno, + struct ocfs2_suballoc_result *res, u16 *bits_left) { int status; - u16 chain, tmp_bits; + u16 chain; u32 tmp_used; u64 next_group; struct inode *alloc_inode = ac->ac_inode; @@ -1489,8 +1736,8 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, * the 1st group with any empty bits. */ while ((status = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits, - ac->ac_max_block, bit_off, - &tmp_bits)) == -ENOSPC) { + ac->ac_max_block, + res)) == -ENOSPC) { if (!bg->bg_next_group) break; @@ -1515,11 +1762,14 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, } mlog(0, "alloc succeeds: we give %u bits from block group %llu\n", - tmp_bits, (unsigned long long)le64_to_cpu(bg->bg_blkno)); + res->sr_bits, (unsigned long long)le64_to_cpu(bg->bg_blkno)); - *num_bits = tmp_bits; + res->sr_bg_blkno = le64_to_cpu(bg->bg_blkno); + + BUG_ON(res->sr_bits == 0); + if (!status) + ocfs2_bg_discontig_fix_result(ac, bg, res); - BUG_ON(*num_bits == 0); /* * Keep track of previous block descriptor read. When @@ -1536,7 +1786,7 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, */ if (ac->ac_allow_chain_relink && (prev_group_bh) && - (ocfs2_block_group_reasonably_empty(bg, *num_bits))) { + (ocfs2_block_group_reasonably_empty(bg, res->sr_bits))) { status = ocfs2_relink_block_group(handle, alloc_inode, ac->ac_bh, group_bh, prev_group_bh, chain); @@ -1558,31 +1808,24 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, } tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used); - fe->id1.bitmap1.i_used = cpu_to_le32(*num_bits + tmp_used); - le32_add_cpu(&cl->cl_recs[chain].c_free, -(*num_bits)); - - status = ocfs2_journal_dirty(handle, - ac->ac_bh); - if (status < 0) { - mlog_errno(status); - goto bail; - } + fe->id1.bitmap1.i_used = cpu_to_le32(res->sr_bits + tmp_used); + le32_add_cpu(&cl->cl_recs[chain].c_free, -res->sr_bits); + ocfs2_journal_dirty(handle, ac->ac_bh); status = ocfs2_block_group_set_bits(handle, alloc_inode, bg, group_bh, - *bit_off, - *num_bits); + res->sr_bit_offset, + res->sr_bits); if (status < 0) { mlog_errno(status); goto bail; } - mlog(0, "Allocated %u bits from suballocator %llu\n", *num_bits, + mlog(0, "Allocated %u bits from suballocator %llu\n", res->sr_bits, (unsigned long long)le64_to_cpu(fe->i_blkno)); - *bg_blkno = le64_to_cpu(bg->bg_blkno); *bits_left = le16_to_cpu(bg->bg_free_bits_count); bail: brelse(group_bh); @@ -1593,19 +1836,15 @@ bail: } /* will give out up to bits_wanted contiguous bits. */ -static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb, - struct ocfs2_alloc_context *ac, +static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, handle_t *handle, u32 bits_wanted, u32 min_bits, - u16 *bit_off, - unsigned int *num_bits, - u64 *bg_blkno) + struct ocfs2_suballoc_result *res) { int status; u16 victim, i; u16 bits_left = 0; - u64 hint_blkno = ac->ac_last_group; struct ocfs2_chain_list *cl; struct ocfs2_dinode *fe; @@ -1623,7 +1862,8 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb, if (le32_to_cpu(fe->id1.bitmap1.i_used) >= le32_to_cpu(fe->id1.bitmap1.i_total)) { - ocfs2_error(osb->sb, "Chain allocator dinode %llu has %u used " + ocfs2_error(ac->ac_inode->i_sb, + "Chain allocator dinode %llu has %u used " "bits but only %u total.", (unsigned long long)le64_to_cpu(fe->i_blkno), le32_to_cpu(fe->id1.bitmap1.i_used), @@ -1632,22 +1872,16 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb, goto bail; } - if (hint_blkno) { + res->sr_bg_blkno = ac->ac_last_group; + if (res->sr_bg_blkno) { /* Attempt to short-circuit the usual search mechanism * by jumping straight to the most recently used * allocation group. This helps us mantain some * contiguousness across allocations. */ status = ocfs2_search_one_group(ac, handle, bits_wanted, - min_bits, bit_off, num_bits, - hint_blkno, &bits_left); - if (!status) { - /* Be careful to update *bg_blkno here as the - * caller is expecting it to be filled in, and - * ocfs2_search_one_group() won't do that for - * us. */ - *bg_blkno = hint_blkno; + min_bits, res, &bits_left); + if (!status) goto set_hint; - } if (status < 0 && status != -ENOSPC) { mlog_errno(status); goto bail; @@ -1660,8 +1894,8 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb, ac->ac_chain = victim; ac->ac_allow_chain_relink = 1; - status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, bit_off, - num_bits, bg_blkno, &bits_left); + status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, + res, &bits_left); if (!status) goto set_hint; if (status < 0 && status != -ENOSPC) { @@ -1685,8 +1919,7 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb, ac->ac_chain = i; status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, - bit_off, num_bits, bg_blkno, - &bits_left); + res, &bits_left); if (!status) break; if (status < 0 && status != -ENOSPC) { @@ -1703,7 +1936,7 @@ set_hint: if (bits_left < min_bits) ac->ac_last_group = 0; else - ac->ac_last_group = *bg_blkno; + ac->ac_last_group = res->sr_bg_blkno; } bail: @@ -1711,37 +1944,37 @@ bail: return status; } -int ocfs2_claim_metadata(struct ocfs2_super *osb, - handle_t *handle, +int ocfs2_claim_metadata(handle_t *handle, struct ocfs2_alloc_context *ac, u32 bits_wanted, + u64 *suballoc_loc, u16 *suballoc_bit_start, unsigned int *num_bits, u64 *blkno_start) { int status; - u64 bg_blkno; + struct ocfs2_suballoc_result res = { .sr_blkno = 0, }; BUG_ON(!ac); BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted)); BUG_ON(ac->ac_which != OCFS2_AC_USE_META); - status = ocfs2_claim_suballoc_bits(osb, - ac, + status = ocfs2_claim_suballoc_bits(ac, handle, bits_wanted, 1, - suballoc_bit_start, - num_bits, - &bg_blkno); + &res); if (status < 0) { mlog_errno(status); goto bail; } - atomic_inc(&osb->alloc_stats.bg_allocs); + atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs); - *blkno_start = bg_blkno + (u64) *suballoc_bit_start; - ac->ac_bits_given += (*num_bits); + *suballoc_loc = res.sr_bg_blkno; + *suballoc_bit_start = res.sr_bit_offset; + *blkno_start = res.sr_blkno; + ac->ac_bits_given += res.sr_bits; + *num_bits = res.sr_bits; status = 0; bail: mlog_exit(status); @@ -1749,10 +1982,10 @@ bail: } static void ocfs2_init_inode_ac_group(struct inode *dir, - struct buffer_head *parent_fe_bh, + struct buffer_head *parent_di_bh, struct ocfs2_alloc_context *ac) { - struct ocfs2_dinode *fe = (struct ocfs2_dinode *)parent_fe_bh->b_data; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)parent_di_bh->b_data; /* * Try to allocate inodes from some specific group. * @@ -1766,10 +1999,14 @@ static void ocfs2_init_inode_ac_group(struct inode *dir, if (OCFS2_I(dir)->ip_last_used_group && OCFS2_I(dir)->ip_last_used_slot == ac->ac_alloc_slot) ac->ac_last_group = OCFS2_I(dir)->ip_last_used_group; - else if (le16_to_cpu(fe->i_suballoc_slot) == ac->ac_alloc_slot) - ac->ac_last_group = ocfs2_which_suballoc_group( - le64_to_cpu(fe->i_blkno), - le16_to_cpu(fe->i_suballoc_bit)); + else if (le16_to_cpu(di->i_suballoc_slot) == ac->ac_alloc_slot) { + if (di->i_suballoc_loc) + ac->ac_last_group = le64_to_cpu(di->i_suballoc_loc); + else + ac->ac_last_group = ocfs2_which_suballoc_group( + le64_to_cpu(di->i_blkno), + le16_to_cpu(di->i_suballoc_bit)); + } } static inline void ocfs2_save_inode_ac_group(struct inode *dir, @@ -1779,17 +2016,16 @@ static inline void ocfs2_save_inode_ac_group(struct inode *dir, OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot; } -int ocfs2_claim_new_inode(struct ocfs2_super *osb, - handle_t *handle, +int ocfs2_claim_new_inode(handle_t *handle, struct inode *dir, struct buffer_head *parent_fe_bh, struct ocfs2_alloc_context *ac, + u64 *suballoc_loc, u16 *suballoc_bit, u64 *fe_blkno) { int status; - unsigned int num_bits; - u64 bg_blkno; + struct ocfs2_suballoc_result res; mlog_entry_void(); @@ -1800,23 +2036,22 @@ int ocfs2_claim_new_inode(struct ocfs2_super *osb, ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac); - status = ocfs2_claim_suballoc_bits(osb, - ac, + status = ocfs2_claim_suballoc_bits(ac, handle, 1, 1, - suballoc_bit, - &num_bits, - &bg_blkno); + &res); if (status < 0) { mlog_errno(status); goto bail; } - atomic_inc(&osb->alloc_stats.bg_allocs); + atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs); - BUG_ON(num_bits != 1); + BUG_ON(res.sr_bits != 1); - *fe_blkno = bg_blkno + (u64) (*suballoc_bit); + *suballoc_loc = res.sr_bg_blkno; + *suballoc_bit = res.sr_bit_offset; + *fe_blkno = res.sr_blkno; ac->ac_bits_given++; ocfs2_save_inode_ac_group(dir, ac); status = 0; @@ -1886,8 +2121,7 @@ static inline void ocfs2_block_to_cluster_group(struct inode *inode, * contig. allocation, set to '1' to indicate we can deal with extents * of any size. */ -int __ocfs2_claim_clusters(struct ocfs2_super *osb, - handle_t *handle, +int __ocfs2_claim_clusters(handle_t *handle, struct ocfs2_alloc_context *ac, u32 min_clusters, u32 max_clusters, @@ -1896,8 +2130,8 @@ int __ocfs2_claim_clusters(struct ocfs2_super *osb, { int status; unsigned int bits_wanted = max_clusters; - u64 bg_blkno = 0; - u16 bg_bit_off; + struct ocfs2_suballoc_result res = { .sr_blkno = 0, }; + struct ocfs2_super *osb = OCFS2_SB(ac->ac_inode->i_sb); mlog_entry_void(); @@ -1907,6 +2141,8 @@ int __ocfs2_claim_clusters(struct ocfs2_super *osb, && ac->ac_which != OCFS2_AC_USE_MAIN); if (ac->ac_which == OCFS2_AC_USE_LOCAL) { + WARN_ON(min_clusters > 1); + status = ocfs2_claim_local_alloc_bits(osb, handle, ac, @@ -1929,20 +2165,19 @@ int __ocfs2_claim_clusters(struct ocfs2_super *osb, if (bits_wanted > (osb->bitmap_cpg - 1)) bits_wanted = osb->bitmap_cpg - 1; - status = ocfs2_claim_suballoc_bits(osb, - ac, + status = ocfs2_claim_suballoc_bits(ac, handle, bits_wanted, min_clusters, - &bg_bit_off, - num_clusters, - &bg_blkno); + &res); if (!status) { + BUG_ON(res.sr_blkno); /* cluster alloc can't set */ *cluster_start = ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode, - bg_blkno, - bg_bit_off); + res.sr_bg_blkno, + res.sr_bit_offset); atomic_inc(&osb->alloc_stats.bitmap_data); + *num_clusters = res.sr_bits; } } if (status < 0) { @@ -1958,8 +2193,7 @@ bail: return status; } -int ocfs2_claim_clusters(struct ocfs2_super *osb, - handle_t *handle, +int ocfs2_claim_clusters(handle_t *handle, struct ocfs2_alloc_context *ac, u32 min_clusters, u32 *cluster_start, @@ -1967,7 +2201,7 @@ int ocfs2_claim_clusters(struct ocfs2_super *osb, { unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given; - return __ocfs2_claim_clusters(osb, handle, ac, min_clusters, + return __ocfs2_claim_clusters(handle, ac, min_clusters, bits_wanted, cluster_start, num_clusters); } @@ -2023,9 +2257,7 @@ static int ocfs2_block_group_clear_bits(handle_t *handle, if (undo_fn) jbd_unlock_bh_state(group_bh); - status = ocfs2_journal_dirty(handle, group_bh); - if (status < 0) - mlog_errno(status); + ocfs2_journal_dirty(handle, group_bh); bail: return status; } @@ -2092,12 +2324,7 @@ static int _ocfs2_free_suballoc_bits(handle_t *handle, count); tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used); fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count); - - status = ocfs2_journal_dirty(handle, alloc_bh); - if (status < 0) { - mlog_errno(status); - goto bail; - } + ocfs2_journal_dirty(handle, alloc_bh); bail: brelse(group_bh); @@ -2126,6 +2353,8 @@ int ocfs2_free_dinode(handle_t *handle, u16 bit = le16_to_cpu(di->i_suballoc_bit); u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit); + if (di->i_suballoc_loc) + bg_blkno = le64_to_cpu(di->i_suballoc_loc); return ocfs2_free_suballoc_bits(handle, inode_alloc_inode, inode_alloc_bh, bit, bg_blkno, 1); } @@ -2395,7 +2624,7 @@ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb, struct buffer_head *alloc_bh, u64 blkno, u16 bit, int *res) { - struct ocfs2_dinode *alloc_fe; + struct ocfs2_dinode *alloc_di; struct ocfs2_group_desc *group; struct buffer_head *group_bh = NULL; u64 bg_blkno; @@ -2404,17 +2633,20 @@ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb, mlog_entry("blkno: %llu bit: %u\n", (unsigned long long)blkno, (unsigned int)bit); - alloc_fe = (struct ocfs2_dinode *)alloc_bh->b_data; - if ((bit + 1) > ocfs2_bits_per_group(&alloc_fe->id2.i_chain)) { + alloc_di = (struct ocfs2_dinode *)alloc_bh->b_data; + if ((bit + 1) > ocfs2_bits_per_group(&alloc_di->id2.i_chain)) { mlog(ML_ERROR, "suballoc bit %u out of range of %u\n", (unsigned int)bit, - ocfs2_bits_per_group(&alloc_fe->id2.i_chain)); + ocfs2_bits_per_group(&alloc_di->id2.i_chain)); status = -EINVAL; goto bail; } - bg_blkno = ocfs2_which_suballoc_group(blkno, bit); - status = ocfs2_read_group_descriptor(suballoc, alloc_fe, bg_blkno, + if (alloc_di->i_suballoc_loc) + bg_blkno = le64_to_cpu(alloc_di->i_suballoc_loc); + else + bg_blkno = ocfs2_which_suballoc_group(blkno, bit); + status = ocfs2_read_group_descriptor(suballoc, alloc_di, bg_blkno, &group_bh); if (status < 0) { mlog(ML_ERROR, "read group %llu failed %d\n", diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h index e0f46df357e..a017dd3ee7d 100644 --- a/fs/ocfs2/suballoc.h +++ b/fs/ocfs2/suballoc.h @@ -26,13 +26,14 @@ #ifndef _CHAINALLOC_H_ #define _CHAINALLOC_H_ +struct ocfs2_suballoc_result; typedef int (group_search_t)(struct inode *, struct buffer_head *, u32, /* bits_wanted */ u32, /* min_bits */ u64, /* max_block */ - u16 *, /* *bit_off */ - u16 *); /* *bits_found */ + struct ocfs2_suballoc_result *); + /* found bits */ struct ocfs2_alloc_context { struct inode *ac_inode; /* which bitmap are we allocating from? */ @@ -54,6 +55,8 @@ struct ocfs2_alloc_context { u64 ac_last_group; u64 ac_max_block; /* Highest block number to allocate. 0 is is the same as ~0 - unlimited */ + + struct ocfs2_alloc_reservation *ac_resv; }; void ocfs2_init_steal_slots(struct ocfs2_super *osb); @@ -80,22 +83,21 @@ int ocfs2_reserve_clusters(struct ocfs2_super *osb, u32 bits_wanted, struct ocfs2_alloc_context **ac); -int ocfs2_claim_metadata(struct ocfs2_super *osb, - handle_t *handle, +int ocfs2_claim_metadata(handle_t *handle, struct ocfs2_alloc_context *ac, u32 bits_wanted, + u64 *suballoc_loc, u16 *suballoc_bit_start, u32 *num_bits, u64 *blkno_start); -int ocfs2_claim_new_inode(struct ocfs2_super *osb, - handle_t *handle, +int ocfs2_claim_new_inode(handle_t *handle, struct inode *dir, struct buffer_head *parent_fe_bh, struct ocfs2_alloc_context *ac, + u64 *suballoc_loc, u16 *suballoc_bit, u64 *fe_blkno); -int ocfs2_claim_clusters(struct ocfs2_super *osb, - handle_t *handle, +int ocfs2_claim_clusters(handle_t *handle, struct ocfs2_alloc_context *ac, u32 min_clusters, u32 *cluster_start, @@ -104,8 +106,7 @@ int ocfs2_claim_clusters(struct ocfs2_super *osb, * Use this variant of ocfs2_claim_clusters to specify a maxiumum * number of clusters smaller than the allocation reserved. */ -int __ocfs2_claim_clusters(struct ocfs2_super *osb, - handle_t *handle, +int __ocfs2_claim_clusters(handle_t *handle, struct ocfs2_alloc_context *ac, u32 min_clusters, u32 max_clusters, diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index dee03197a49..2c26ce251cb 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -94,7 +94,9 @@ struct mount_options unsigned long mount_opt; unsigned int atime_quantum; signed short slot; - unsigned int localalloc_opt; + int localalloc_opt; + unsigned int resv_level; + int dir_resv_level; char cluster_stack[OCFS2_STACK_LABEL_LEN + 1]; }; @@ -176,6 +178,8 @@ enum { Opt_noacl, Opt_usrquota, Opt_grpquota, + Opt_resv_level, + Opt_dir_resv_level, Opt_err, }; @@ -202,6 +206,8 @@ static const match_table_t tokens = { {Opt_noacl, "noacl"}, {Opt_usrquota, "usrquota"}, {Opt_grpquota, "grpquota"}, + {Opt_resv_level, "resv_level=%u"}, + {Opt_dir_resv_level, "dir_resv_level=%u"}, {Opt_err, NULL} }; @@ -932,12 +938,16 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb) int type; struct inode *inode; struct super_block *sb = osb->sb; + struct ocfs2_mem_dqinfo *oinfo; /* We mostly ignore errors in this function because there's not much * we can do when we see them */ for (type = 0; type < MAXQUOTAS; type++) { if (!sb_has_quota_loaded(sb, type)) continue; + /* Cancel periodic syncing before we grab dqonoff_mutex */ + oinfo = sb_dqinfo(sb, type)->dqi_priv; + cancel_delayed_work_sync(&oinfo->dqi_sync_work); inode = igrab(sb->s_dquot.files[type]); /* Turn off quotas. This will remove all dquot structures from * memory and so they will be automatically synced to global @@ -1028,8 +1038,14 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) osb->s_atime_quantum = parsed_options.atime_quantum; osb->preferred_slot = parsed_options.slot; osb->osb_commit_interval = parsed_options.commit_interval; - osb->local_alloc_default_bits = ocfs2_megabytes_to_clusters(sb, parsed_options.localalloc_opt); - osb->local_alloc_bits = osb->local_alloc_default_bits; + + ocfs2_la_set_sizes(osb, parsed_options.localalloc_opt); + osb->osb_resv_level = parsed_options.resv_level; + osb->osb_dir_resv_level = parsed_options.resv_level; + if (parsed_options.dir_resv_level == -1) + osb->osb_dir_resv_level = parsed_options.resv_level; + else + osb->osb_dir_resv_level = parsed_options.dir_resv_level; status = ocfs2_verify_userspace_stack(osb, &parsed_options); if (status) @@ -1285,11 +1301,13 @@ static int ocfs2_parse_options(struct super_block *sb, options ? options : "(none)"); mopt->commit_interval = 0; - mopt->mount_opt = 0; + mopt->mount_opt = OCFS2_MOUNT_NOINTR; mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM; mopt->slot = OCFS2_INVALID_SLOT; - mopt->localalloc_opt = OCFS2_DEFAULT_LOCAL_ALLOC_SIZE; + mopt->localalloc_opt = -1; mopt->cluster_stack[0] = '\0'; + mopt->resv_level = OCFS2_DEFAULT_RESV_LEVEL; + mopt->dir_resv_level = -1; if (!options) { status = 1; @@ -1380,7 +1398,7 @@ static int ocfs2_parse_options(struct super_block *sb, status = 0; goto bail; } - if (option >= 0 && (option <= ocfs2_local_alloc_size(sb) * 8)) + if (option >= 0) mopt->localalloc_opt = option; break; case Opt_localflocks: @@ -1433,6 +1451,28 @@ static int ocfs2_parse_options(struct super_block *sb, mopt->mount_opt |= OCFS2_MOUNT_NO_POSIX_ACL; mopt->mount_opt &= ~OCFS2_MOUNT_POSIX_ACL; break; + case Opt_resv_level: + if (is_remount) + break; + if (match_int(&args[0], &option)) { + status = 0; + goto bail; + } + if (option >= OCFS2_MIN_RESV_LEVEL && + option < OCFS2_MAX_RESV_LEVEL) + mopt->resv_level = option; + break; + case Opt_dir_resv_level: + if (is_remount) + break; + if (match_int(&args[0], &option)) { + status = 0; + goto bail; + } + if (option >= OCFS2_MIN_RESV_LEVEL && + option < OCFS2_MAX_RESV_LEVEL) + mopt->dir_resv_level = option; + break; default: mlog(ML_ERROR, "Unrecognized mount option \"%s\" " @@ -1487,7 +1527,7 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt) (unsigned) (osb->osb_commit_interval / HZ)); local_alloc_megs = osb->local_alloc_bits >> (20 - osb->s_clustersize_bits); - if (local_alloc_megs != OCFS2_DEFAULT_LOCAL_ALLOC_SIZE) + if (local_alloc_megs != ocfs2_la_default_mb(osb)) seq_printf(s, ",localalloc=%d", local_alloc_megs); if (opts & OCFS2_MOUNT_LOCALFLOCKS) @@ -1514,6 +1554,12 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt) else seq_printf(s, ",noacl"); + if (osb->osb_resv_level != OCFS2_DEFAULT_RESV_LEVEL) + seq_printf(s, ",resv_level=%d", osb->osb_resv_level); + + if (osb->osb_dir_resv_level != osb->osb_resv_level) + seq_printf(s, ",dir_resv_level=%d", osb->osb_resv_level); + return 0; } @@ -1688,6 +1734,8 @@ static void ocfs2_inode_init_once(void *data) oi->ip_blkno = 0ULL; oi->ip_clusters = 0; + ocfs2_resv_init_once(&oi->ip_la_data_resv); + ocfs2_lock_res_init_once(&oi->ip_rw_lockres); ocfs2_lock_res_init_once(&oi->ip_inode_lockres); ocfs2_lock_res_init_once(&oi->ip_open_lockres); @@ -2042,6 +2090,12 @@ static int ocfs2_initialize_super(struct super_block *sb, init_waitqueue_head(&osb->osb_mount_event); + status = ocfs2_resmap_init(osb, &osb->osb_la_resmap); + if (status) { + mlog_errno(status); + goto bail; + } + osb->vol_label = kmalloc(OCFS2_MAX_VOL_LABEL_LEN, GFP_KERNEL); if (!osb->vol_label) { mlog(ML_ERROR, "unable to alloc vol label\n"); @@ -2224,9 +2278,11 @@ static int ocfs2_initialize_super(struct super_block *sb, } osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno; + osb->osb_clusters_at_boot = OCFS2_I(inode)->ip_clusters; iput(inode); - osb->bitmap_cpg = ocfs2_group_bitmap_size(sb) * 8; + osb->bitmap_cpg = ocfs2_group_bitmap_size(sb, 0, + osb->s_feature_incompat) * 8; status = ocfs2_init_slot_info(osb); if (status < 0) { @@ -2509,5 +2565,25 @@ void __ocfs2_abort(struct super_block* sb, ocfs2_handle_error(sb); } +/* + * Void signal blockers, because in-kernel sigprocmask() only fails + * when SIG_* is wrong. + */ +void ocfs2_block_signals(sigset_t *oldset) +{ + int rc; + sigset_t blocked; + + sigfillset(&blocked); + rc = sigprocmask(SIG_BLOCK, &blocked, oldset); + BUG_ON(rc); +} + +void ocfs2_unblock_signals(sigset_t *oldset) +{ + int rc = sigprocmask(SIG_SETMASK, oldset, NULL); + BUG_ON(rc); +} + module_init(ocfs2_init); module_exit(ocfs2_exit); diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h index 783f5270f2a..40c7de084c1 100644 --- a/fs/ocfs2/super.h +++ b/fs/ocfs2/super.h @@ -45,4 +45,11 @@ void __ocfs2_abort(struct super_block *sb, #define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args) +/* + * Void signal blockers, because in-kernel sigprocmask() only fails + * when SIG_* is wrong. + */ +void ocfs2_block_signals(sigset_t *oldset); +void ocfs2_unblock_signals(sigset_t *oldset); + #endif /* OCFS2_SUPER_H */ diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 3e7773089b9..e97b34842cf 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -79,6 +79,7 @@ struct ocfs2_xattr_set_ctxt { struct ocfs2_alloc_context *meta_ac; struct ocfs2_alloc_context *data_ac; struct ocfs2_cached_dealloc_ctxt dealloc; + int set_abort; }; #define OCFS2_XATTR_ROOT_SIZE (sizeof(struct ocfs2_xattr_def_value_root)) @@ -96,7 +97,7 @@ static struct ocfs2_xattr_def_value_root def_xv = { .xv.xr_list.l_count = cpu_to_le16(1), }; -struct xattr_handler *ocfs2_xattr_handlers[] = { +const struct xattr_handler *ocfs2_xattr_handlers[] = { &ocfs2_xattr_user_handler, &ocfs2_xattr_acl_access_handler, &ocfs2_xattr_acl_default_handler, @@ -105,7 +106,7 @@ struct xattr_handler *ocfs2_xattr_handlers[] = { NULL }; -static struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = { +static const struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = { [OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler, [OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS] = &ocfs2_xattr_acl_access_handler, @@ -539,7 +540,7 @@ static int ocfs2_read_xattr_block(struct inode *inode, u64 xb_blkno, static inline const char *ocfs2_xattr_prefix(int name_index) { - struct xattr_handler *handler = NULL; + const struct xattr_handler *handler = NULL; if (name_index > 0 && name_index < OCFS2_XATTR_MAX) handler = ocfs2_xattr_handler_map[name_index]; @@ -739,11 +740,7 @@ static int ocfs2_xattr_extend_allocation(struct inode *inode, goto leave; } - status = ocfs2_journal_dirty(handle, vb->vb_bh); - if (status < 0) { - mlog_errno(status); - goto leave; - } + ocfs2_journal_dirty(handle, vb->vb_bh); clusters_to_add -= le32_to_cpu(vb->vb_xv->xr_clusters) - prev_clusters; @@ -786,12 +783,7 @@ static int __ocfs2_remove_xattr_range(struct inode *inode, } le32_add_cpu(&vb->vb_xv->xr_clusters, -len); - - ret = ocfs2_journal_dirty(handle, vb->vb_bh); - if (ret) { - mlog_errno(ret); - goto out; - } + ocfs2_journal_dirty(handle, vb->vb_bh); if (ext_flags & OCFS2_EXT_REFCOUNTED) ret = ocfs2_decrease_refcount(inode, handle, @@ -1374,11 +1366,7 @@ static int __ocfs2_xattr_set_value_outside(struct inode *inode, memset(bh->b_data + cp_len, 0, blocksize - cp_len); - ret = ocfs2_journal_dirty(handle, bh); - if (ret < 0) { - mlog_errno(ret); - goto out; - } + ocfs2_journal_dirty(handle, bh); brelse(bh); bh = NULL; @@ -2148,15 +2136,19 @@ alloc_value: orig_clusters = ocfs2_xa_value_clusters(loc); rc = ocfs2_xa_value_truncate(loc, xi->xi_value_len, ctxt); if (rc < 0) { - /* - * If we tried to grow an existing external value, - * ocfs2_xa_cleanuP-value_truncate() is going to - * let it stand. We have to restore its original - * value size. - */ - loc->xl_entry->xe_value_size = orig_value_size; + ctxt->set_abort = 1; ocfs2_xa_cleanup_value_truncate(loc, "growing", orig_clusters); + /* + * If we were growing an existing value, + * ocfs2_xa_cleanup_value_truncate() won't remove + * the entry. We need to restore the original value + * size. + */ + if (loc->xl_entry) { + BUG_ON(!orig_value_size); + loc->xl_entry->xe_value_size = orig_value_size; + } mlog_errno(rc); } } @@ -2479,7 +2471,10 @@ static int ocfs2_xattr_free_block(struct inode *inode, xb = (struct ocfs2_xattr_block *)blk_bh->b_data; blk = le64_to_cpu(xb->xb_blkno); bit = le16_to_cpu(xb->xb_suballoc_bit); - bg_blkno = ocfs2_which_suballoc_group(blk, bit); + if (xb->xb_suballoc_loc) + bg_blkno = le64_to_cpu(xb->xb_suballoc_loc); + else + bg_blkno = ocfs2_which_suballoc_group(blk, bit); xb_alloc_inode = ocfs2_get_system_file_inode(osb, EXTENT_ALLOC_SYSTEM_INODE, @@ -2594,9 +2589,7 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh) di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); spin_unlock(&oi->ip_lock); - ret = ocfs2_journal_dirty(handle, di_bh); - if (ret < 0) - mlog_errno(ret); + ocfs2_journal_dirty(handle, di_bh); out_commit: ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); out: @@ -2724,9 +2717,7 @@ static int ocfs2_xattr_ibody_init(struct inode *inode, di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); spin_unlock(&oi->ip_lock); - ret = ocfs2_journal_dirty(ctxt->handle, di_bh); - if (ret < 0) - mlog_errno(ret); + ocfs2_journal_dirty(ctxt->handle, di_bh); out: return ret; @@ -2846,9 +2837,8 @@ static int ocfs2_create_xattr_block(struct inode *inode, int ret; u16 suballoc_bit_start; u32 num_got; - u64 first_blkno; + u64 suballoc_loc, first_blkno; struct ocfs2_dinode *di = (struct ocfs2_dinode *)inode_bh->b_data; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct buffer_head *new_bh = NULL; struct ocfs2_xattr_block *xblk; @@ -2859,9 +2849,9 @@ static int ocfs2_create_xattr_block(struct inode *inode, goto end; } - ret = ocfs2_claim_metadata(osb, ctxt->handle, ctxt->meta_ac, 1, - &suballoc_bit_start, &num_got, - &first_blkno); + ret = ocfs2_claim_metadata(ctxt->handle, ctxt->meta_ac, 1, + &suballoc_loc, &suballoc_bit_start, + &num_got, &first_blkno); if (ret < 0) { mlog_errno(ret); goto end; @@ -2883,8 +2873,10 @@ static int ocfs2_create_xattr_block(struct inode *inode, memset(xblk, 0, inode->i_sb->s_blocksize); strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE); xblk->xb_suballoc_slot = cpu_to_le16(ctxt->meta_ac->ac_alloc_slot); + xblk->xb_suballoc_loc = cpu_to_le64(suballoc_loc); xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start); - xblk->xb_fs_generation = cpu_to_le32(osb->fs_generation); + xblk->xb_fs_generation = + cpu_to_le32(OCFS2_SB(inode->i_sb)->fs_generation); xblk->xb_blkno = cpu_to_le64(first_blkno); if (indexed) { struct ocfs2_xattr_tree_root *xr = &xblk->xb_attrs.xb_root; @@ -2956,7 +2948,7 @@ static int ocfs2_xattr_block_set(struct inode *inode, ret = ocfs2_xa_set(&loc, xi, ctxt); if (!ret) xs->here = loc.xl_entry; - else if (ret != -ENOSPC) + else if ((ret != -ENOSPC) || ctxt->set_abort) goto end; else { ret = ocfs2_xattr_create_index_block(inode, xs, ctxt); @@ -3312,14 +3304,13 @@ static int __ocfs2_xattr_set_handle(struct inode *inode, goto out; } - ret = ocfs2_extend_trans(ctxt->handle, credits + - ctxt->handle->h_buffer_credits); + ret = ocfs2_extend_trans(ctxt->handle, credits); if (ret) { mlog_errno(ret); goto out; } ret = ocfs2_xattr_block_set(inode, xi, xbs, ctxt); - } else if (ret == -ENOSPC) { + } else if ((ret == -ENOSPC) && !ctxt->set_abort) { if (di->i_xattr_loc && !xbs->xattr_bh) { ret = ocfs2_xattr_block_find(inode, xi->xi_name_index, @@ -3343,8 +3334,7 @@ static int __ocfs2_xattr_set_handle(struct inode *inode, goto out; } - ret = ocfs2_extend_trans(ctxt->handle, credits + - ctxt->handle->h_buffer_credits); + ret = ocfs2_extend_trans(ctxt->handle, credits); if (ret) { mlog_errno(ret); goto out; @@ -3378,8 +3368,7 @@ static int __ocfs2_xattr_set_handle(struct inode *inode, goto out; } - ret = ocfs2_extend_trans(ctxt->handle, credits + - ctxt->handle->h_buffer_credits); + ret = ocfs2_extend_trans(ctxt->handle, credits); if (ret) { mlog_errno(ret); goto out; @@ -4249,7 +4238,6 @@ static int ocfs2_xattr_create_index_block(struct inode *inode, u32 bit_off, len; u64 blkno; handle_t *handle = ctxt->handle; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_inode_info *oi = OCFS2_I(inode); struct buffer_head *xb_bh = xs->xattr_bh; struct ocfs2_xattr_block *xb = @@ -4277,7 +4265,7 @@ static int ocfs2_xattr_create_index_block(struct inode *inode, goto out; } - ret = __ocfs2_claim_clusters(osb, handle, ctxt->data_ac, + ret = __ocfs2_claim_clusters(handle, ctxt->data_ac, 1, 1, &bit_off, &len); if (ret) { mlog_errno(ret); @@ -4887,8 +4875,7 @@ static int ocfs2_mv_xattr_buckets(struct inode *inode, handle_t *handle, * We need to update the first bucket of the old extent and all * the buckets going to the new extent. */ - credits = ((num_buckets + 1) * blks_per_bucket) + - handle->h_buffer_credits; + credits = ((num_buckets + 1) * blks_per_bucket); ret = ocfs2_extend_trans(handle, credits); if (ret) { mlog_errno(ret); @@ -4958,7 +4945,7 @@ static int ocfs2_divide_xattr_cluster(struct inode *inode, u32 *first_hash) { u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); - int ret, credits = 2 * blk_per_bucket + handle->h_buffer_credits; + int ret, credits = 2 * blk_per_bucket; BUG_ON(OCFS2_XATTR_BUCKET_SIZE < OCFS2_SB(inode->i_sb)->s_clustersize); @@ -5099,7 +5086,7 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode, goto leave; } - ret = __ocfs2_claim_clusters(osb, handle, ctxt->data_ac, 1, + ret = __ocfs2_claim_clusters(handle, ctxt->data_ac, 1, clusters_to_add, &bit_off, &num_bits); if (ret < 0) { if (ret != -ENOSPC) @@ -5153,9 +5140,7 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode, goto leave; } - ret = ocfs2_journal_dirty(handle, root_bh); - if (ret < 0) - mlog_errno(ret); + ocfs2_journal_dirty(handle, root_bh); leave: return ret; @@ -5200,8 +5185,7 @@ static int ocfs2_extend_xattr_bucket(struct inode *inode, * existing bucket. Then we add the last existing bucket, the * new bucket, and the first bucket (3 * blk_per_bucket). */ - credits = (end_blk - target_blk) + (3 * blk_per_bucket) + - handle->h_buffer_credits; + credits = (end_blk - target_blk) + (3 * blk_per_bucket); ret = ocfs2_extend_trans(handle, credits); if (ret) { mlog_errno(ret); @@ -5477,12 +5461,7 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode, } le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, -len); - - ret = ocfs2_journal_dirty(handle, root_bh); - if (ret) { - mlog_errno(ret); - goto out_commit; - } + ocfs2_journal_dirty(handle, root_bh); ret = ocfs2_truncate_log_append(osb, handle, blkno, len); if (ret) @@ -6935,7 +6914,7 @@ static int ocfs2_reflink_xattr_rec(struct inode *inode, goto out; } - ret = ocfs2_claim_clusters(osb, handle, data_ac, + ret = ocfs2_claim_clusters(handle, data_ac, len, &p_cluster, &num_clusters); if (ret) { mlog_errno(ret); @@ -7234,7 +7213,7 @@ int ocfs2_init_security_set(handle_t *handle, xattr_ac, data_ac); } -struct xattr_handler ocfs2_xattr_security_handler = { +const struct xattr_handler ocfs2_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, .list = ocfs2_xattr_security_list, .get = ocfs2_xattr_security_get, @@ -7278,7 +7257,7 @@ static int ocfs2_xattr_trusted_set(struct dentry *dentry, const char *name, name, value, size, flags); } -struct xattr_handler ocfs2_xattr_trusted_handler = { +const struct xattr_handler ocfs2_xattr_trusted_handler = { .prefix = XATTR_TRUSTED_PREFIX, .list = ocfs2_xattr_trusted_list, .get = ocfs2_xattr_trusted_get, @@ -7334,7 +7313,7 @@ static int ocfs2_xattr_user_set(struct dentry *dentry, const char *name, name, value, size, flags); } -struct xattr_handler ocfs2_xattr_user_handler = { +const struct xattr_handler ocfs2_xattr_user_handler = { .prefix = XATTR_USER_PREFIX, .list = ocfs2_xattr_user_list, .get = ocfs2_xattr_user_get, diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h index abd72a47f52..aa64bb37a65 100644 --- a/fs/ocfs2/xattr.h +++ b/fs/ocfs2/xattr.h @@ -37,12 +37,12 @@ struct ocfs2_security_xattr_info { size_t value_len; }; -extern struct xattr_handler ocfs2_xattr_user_handler; -extern struct xattr_handler ocfs2_xattr_trusted_handler; -extern struct xattr_handler ocfs2_xattr_security_handler; -extern struct xattr_handler ocfs2_xattr_acl_access_handler; -extern struct xattr_handler ocfs2_xattr_acl_default_handler; -extern struct xattr_handler *ocfs2_xattr_handlers[]; +extern const struct xattr_handler ocfs2_xattr_user_handler; +extern const struct xattr_handler ocfs2_xattr_trusted_handler; +extern const struct xattr_handler ocfs2_xattr_security_handler; +extern const struct xattr_handler ocfs2_xattr_acl_access_handler; +extern const struct xattr_handler ocfs2_xattr_acl_default_handler; +extern const struct xattr_handler *ocfs2_xattr_handlers[]; ssize_t ocfs2_listxattr(struct dentry *, char *, size_t); int ocfs2_xattr_get_nolock(struct inode *, struct buffer_head *, int, diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c index b44bb835e8e..089839a6cc6 100644 --- a/fs/omfs/inode.c +++ b/fs/omfs/inode.c @@ -37,9 +37,7 @@ struct inode *omfs_new_inode(struct inode *dir, int mode) goto fail; inode->i_ino = new_block; - inode->i_mode = mode; - inode->i_uid = current_fsuid(); - inode->i_gid = current_fsgid(); + inode_init_owner(inode, NULL, mode); inode->i_mapping->a_ops = &omfs_aops; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; diff --git a/fs/open.c b/fs/open.c index 74e5cd9f718..5463266db9e 100644 --- a/fs/open.c +++ b/fs/open.c @@ -17,7 +17,6 @@ #include <linux/securebits.h> #include <linux/security.h> #include <linux/mount.h> -#include <linux/vfs.h> #include <linux/fcntl.h> #include <linux/slab.h> #include <asm/uaccess.h> @@ -33,171 +32,6 @@ #include "internal.h" -int vfs_statfs(struct dentry *dentry, struct kstatfs *buf) -{ - int retval = -ENODEV; - - if (dentry) { - retval = -ENOSYS; - if (dentry->d_sb->s_op->statfs) { - memset(buf, 0, sizeof(*buf)); - retval = security_sb_statfs(dentry); - if (retval) - return retval; - retval = dentry->d_sb->s_op->statfs(dentry, buf); - if (retval == 0 && buf->f_frsize == 0) - buf->f_frsize = buf->f_bsize; - } - } - return retval; -} - -EXPORT_SYMBOL(vfs_statfs); - -static int vfs_statfs_native(struct dentry *dentry, struct statfs *buf) -{ - struct kstatfs st; - int retval; - - retval = vfs_statfs(dentry, &st); - if (retval) - return retval; - - if (sizeof(*buf) == sizeof(st)) - memcpy(buf, &st, sizeof(st)); - else { - if (sizeof buf->f_blocks == 4) { - if ((st.f_blocks | st.f_bfree | st.f_bavail | - st.f_bsize | st.f_frsize) & - 0xffffffff00000000ULL) - return -EOVERFLOW; - /* - * f_files and f_ffree may be -1; it's okay to stuff - * that into 32 bits - */ - if (st.f_files != -1 && - (st.f_files & 0xffffffff00000000ULL)) - return -EOVERFLOW; - if (st.f_ffree != -1 && - (st.f_ffree & 0xffffffff00000000ULL)) - return -EOVERFLOW; - } - - buf->f_type = st.f_type; - buf->f_bsize = st.f_bsize; - buf->f_blocks = st.f_blocks; - buf->f_bfree = st.f_bfree; - buf->f_bavail = st.f_bavail; - buf->f_files = st.f_files; - buf->f_ffree = st.f_ffree; - buf->f_fsid = st.f_fsid; - buf->f_namelen = st.f_namelen; - buf->f_frsize = st.f_frsize; - memset(buf->f_spare, 0, sizeof(buf->f_spare)); - } - return 0; -} - -static int vfs_statfs64(struct dentry *dentry, struct statfs64 *buf) -{ - struct kstatfs st; - int retval; - - retval = vfs_statfs(dentry, &st); - if (retval) - return retval; - - if (sizeof(*buf) == sizeof(st)) - memcpy(buf, &st, sizeof(st)); - else { - buf->f_type = st.f_type; - buf->f_bsize = st.f_bsize; - buf->f_blocks = st.f_blocks; - buf->f_bfree = st.f_bfree; - buf->f_bavail = st.f_bavail; - buf->f_files = st.f_files; - buf->f_ffree = st.f_ffree; - buf->f_fsid = st.f_fsid; - buf->f_namelen = st.f_namelen; - buf->f_frsize = st.f_frsize; - memset(buf->f_spare, 0, sizeof(buf->f_spare)); - } - return 0; -} - -SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct statfs __user *, buf) -{ - struct path path; - int error; - - error = user_path(pathname, &path); - if (!error) { - struct statfs tmp; - error = vfs_statfs_native(path.dentry, &tmp); - if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) - error = -EFAULT; - path_put(&path); - } - return error; -} - -SYSCALL_DEFINE3(statfs64, const char __user *, pathname, size_t, sz, struct statfs64 __user *, buf) -{ - struct path path; - long error; - - if (sz != sizeof(*buf)) - return -EINVAL; - error = user_path(pathname, &path); - if (!error) { - struct statfs64 tmp; - error = vfs_statfs64(path.dentry, &tmp); - if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) - error = -EFAULT; - path_put(&path); - } - return error; -} - -SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct statfs __user *, buf) -{ - struct file * file; - struct statfs tmp; - int error; - - error = -EBADF; - file = fget(fd); - if (!file) - goto out; - error = vfs_statfs_native(file->f_path.dentry, &tmp); - if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) - error = -EFAULT; - fput(file); -out: - return error; -} - -SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user *, buf) -{ - struct file * file; - struct statfs64 tmp; - int error; - - if (sz != sizeof(*buf)) - return -EINVAL; - - error = -EBADF; - file = fget(fd); - if (!file) - goto out; - error = vfs_statfs64(file->f_path.dentry, &tmp); - if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) - error = -EFAULT; - fput(file); -out: - return error; -} - int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, struct file *filp) { diff --git a/fs/partitions/acorn.c b/fs/partitions/acorn.c index a97b477ac0f..6921e7890be 100644 --- a/fs/partitions/acorn.c +++ b/fs/partitions/acorn.c @@ -70,14 +70,14 @@ struct riscix_record { #if defined(CONFIG_ACORN_PARTITION_CUMANA) || \ defined(CONFIG_ACORN_PARTITION_ADFS) -static int -riscix_partition(struct parsed_partitions *state, struct block_device *bdev, - unsigned long first_sect, int slot, unsigned long nr_sects) +static int riscix_partition(struct parsed_partitions *state, + unsigned long first_sect, int slot, + unsigned long nr_sects) { Sector sect; struct riscix_record *rr; - rr = (struct riscix_record *)read_dev_sector(bdev, first_sect, §); + rr = read_part_sector(state, first_sect, §); if (!rr) return -1; @@ -123,9 +123,9 @@ struct linux_part { #if defined(CONFIG_ACORN_PARTITION_CUMANA) || \ defined(CONFIG_ACORN_PARTITION_ADFS) -static int -linux_partition(struct parsed_partitions *state, struct block_device *bdev, - unsigned long first_sect, int slot, unsigned long nr_sects) +static int linux_partition(struct parsed_partitions *state, + unsigned long first_sect, int slot, + unsigned long nr_sects) { Sector sect; struct linux_part *linuxp; @@ -135,7 +135,7 @@ linux_partition(struct parsed_partitions *state, struct block_device *bdev, put_partition(state, slot++, first_sect, size); - linuxp = (struct linux_part *)read_dev_sector(bdev, first_sect, §); + linuxp = read_part_sector(state, first_sect, §); if (!linuxp) return -1; @@ -157,8 +157,7 @@ linux_partition(struct parsed_partitions *state, struct block_device *bdev, #endif #ifdef CONFIG_ACORN_PARTITION_CUMANA -int -adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev) +int adfspart_check_CUMANA(struct parsed_partitions *state) { unsigned long first_sector = 0; unsigned int start_blk = 0; @@ -185,7 +184,7 @@ adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev struct adfs_discrecord *dr; unsigned int nr_sects; - data = read_dev_sector(bdev, start_blk * 2 + 6, §); + data = read_part_sector(state, start_blk * 2 + 6, §); if (!data) return -1; @@ -217,14 +216,14 @@ adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev #ifdef CONFIG_ACORN_PARTITION_RISCIX case PARTITION_RISCIX_SCSI: /* RISCiX - we don't know how to find the next one. */ - slot = riscix_partition(state, bdev, first_sector, - slot, nr_sects); + slot = riscix_partition(state, first_sector, slot, + nr_sects); break; #endif case PARTITION_LINUX: - slot = linux_partition(state, bdev, first_sector, - slot, nr_sects); + slot = linux_partition(state, first_sector, slot, + nr_sects); break; } put_dev_sector(sect); @@ -249,8 +248,7 @@ adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev * hda1 = ADFS partition on first drive. * hda2 = non-ADFS partition. */ -int -adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev) +int adfspart_check_ADFS(struct parsed_partitions *state) { unsigned long start_sect, nr_sects, sectscyl, heads; Sector sect; @@ -259,7 +257,7 @@ adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev) unsigned char id; int slot = 1; - data = read_dev_sector(bdev, 6, §); + data = read_part_sector(state, 6, §); if (!data) return -1; @@ -278,21 +276,21 @@ adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev) /* * Work out start of non-adfs partition. */ - nr_sects = (bdev->bd_inode->i_size >> 9) - start_sect; + nr_sects = (state->bdev->bd_inode->i_size >> 9) - start_sect; if (start_sect) { switch (id) { #ifdef CONFIG_ACORN_PARTITION_RISCIX case PARTITION_RISCIX_SCSI: case PARTITION_RISCIX_MFM: - slot = riscix_partition(state, bdev, start_sect, - slot, nr_sects); + slot = riscix_partition(state, start_sect, slot, + nr_sects); break; #endif case PARTITION_LINUX: - slot = linux_partition(state, bdev, start_sect, - slot, nr_sects); + slot = linux_partition(state, start_sect, slot, + nr_sects); break; } } @@ -308,10 +306,11 @@ struct ics_part { __le32 size; }; -static int adfspart_check_ICSLinux(struct block_device *bdev, unsigned long block) +static int adfspart_check_ICSLinux(struct parsed_partitions *state, + unsigned long block) { Sector sect; - unsigned char *data = read_dev_sector(bdev, block, §); + unsigned char *data = read_part_sector(state, block, §); int result = 0; if (data) { @@ -349,8 +348,7 @@ static inline int valid_ics_sector(const unsigned char *data) * hda2 = ADFS partition 1 on first drive. * ..etc.. */ -int -adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev) +int adfspart_check_ICS(struct parsed_partitions *state) { const unsigned char *data; const struct ics_part *p; @@ -360,7 +358,7 @@ adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev) /* * Try ICS style partitions - sector 0 contains partition info. */ - data = read_dev_sector(bdev, 0, §); + data = read_part_sector(state, 0, §); if (!data) return -1; @@ -392,7 +390,7 @@ adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev) * partition is. We must not make this visible * to the filesystem. */ - if (size > 1 && adfspart_check_ICSLinux(bdev, start)) { + if (size > 1 && adfspart_check_ICSLinux(state, start)) { start += 1; size -= 1; } @@ -446,8 +444,7 @@ static inline int valid_ptec_sector(const unsigned char *data) * hda2 = ADFS partition 1 on first drive. * ..etc.. */ -int -adfspart_check_POWERTEC(struct parsed_partitions *state, struct block_device *bdev) +int adfspart_check_POWERTEC(struct parsed_partitions *state) { Sector sect; const unsigned char *data; @@ -455,7 +452,7 @@ adfspart_check_POWERTEC(struct parsed_partitions *state, struct block_device *bd int slot = 1; int i; - data = read_dev_sector(bdev, 0, §); + data = read_part_sector(state, 0, §); if (!data) return -1; @@ -508,8 +505,7 @@ static const char eesox_name[] = { * 1. The individual ADFS boot block entries that are placed on the disk. * 2. The start address of the next entry. */ -int -adfspart_check_EESOX(struct parsed_partitions *state, struct block_device *bdev) +int adfspart_check_EESOX(struct parsed_partitions *state) { Sector sect; const unsigned char *data; @@ -518,7 +514,7 @@ adfspart_check_EESOX(struct parsed_partitions *state, struct block_device *bdev) sector_t start = 0; int i, slot = 1; - data = read_dev_sector(bdev, 7, §); + data = read_part_sector(state, 7, §); if (!data) return -1; @@ -545,7 +541,7 @@ adfspart_check_EESOX(struct parsed_partitions *state, struct block_device *bdev) if (i != 0) { sector_t size; - size = get_capacity(bdev->bd_disk); + size = get_capacity(state->bdev->bd_disk); put_partition(state, slot++, start, size - start); printk("\n"); } diff --git a/fs/partitions/acorn.h b/fs/partitions/acorn.h index 81fd50ecc08..ede82852969 100644 --- a/fs/partitions/acorn.h +++ b/fs/partitions/acorn.h @@ -7,8 +7,8 @@ * format, and everyone stick to it? */ -int adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev); -int adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev); -int adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev); -int adfspart_check_POWERTEC(struct parsed_partitions *state, struct block_device *bdev); -int adfspart_check_EESOX(struct parsed_partitions *state, struct block_device *bdev); +int adfspart_check_CUMANA(struct parsed_partitions *state); +int adfspart_check_ADFS(struct parsed_partitions *state); +int adfspart_check_ICS(struct parsed_partitions *state); +int adfspart_check_POWERTEC(struct parsed_partitions *state); +int adfspart_check_EESOX(struct parsed_partitions *state); diff --git a/fs/partitions/amiga.c b/fs/partitions/amiga.c index 9917a8c360f..ba443d4229f 100644 --- a/fs/partitions/amiga.c +++ b/fs/partitions/amiga.c @@ -23,8 +23,7 @@ checksum_block(__be32 *m, int size) return sum; } -int -amiga_partition(struct parsed_partitions *state, struct block_device *bdev) +int amiga_partition(struct parsed_partitions *state) { Sector sect; unsigned char *data; @@ -38,11 +37,11 @@ amiga_partition(struct parsed_partitions *state, struct block_device *bdev) for (blk = 0; ; blk++, put_dev_sector(sect)) { if (blk == RDB_ALLOCATION_LIMIT) goto rdb_done; - data = read_dev_sector(bdev, blk, §); + data = read_part_sector(state, blk, §); if (!data) { if (warn_no_part) printk("Dev %s: unable to read RDB block %d\n", - bdevname(bdev, b), blk); + bdevname(state->bdev, b), blk); res = -1; goto rdb_done; } @@ -64,7 +63,7 @@ amiga_partition(struct parsed_partitions *state, struct block_device *bdev) } printk("Dev %s: RDB in block %d has bad checksum\n", - bdevname(bdev, b), blk); + bdevname(state->bdev, b), blk); } /* blksize is blocks per 512 byte standard block */ @@ -75,11 +74,11 @@ amiga_partition(struct parsed_partitions *state, struct block_device *bdev) put_dev_sector(sect); for (part = 1; blk>0 && part<=16; part++, put_dev_sector(sect)) { blk *= blksize; /* Read in terms partition table understands */ - data = read_dev_sector(bdev, blk, §); + data = read_part_sector(state, blk, §); if (!data) { if (warn_no_part) printk("Dev %s: unable to read partition block %d\n", - bdevname(bdev, b), blk); + bdevname(state->bdev, b), blk); res = -1; goto rdb_done; } diff --git a/fs/partitions/amiga.h b/fs/partitions/amiga.h index 2f3e9ce22d5..d094585cada 100644 --- a/fs/partitions/amiga.h +++ b/fs/partitions/amiga.h @@ -2,5 +2,5 @@ * fs/partitions/amiga.h */ -int amiga_partition(struct parsed_partitions *state, struct block_device *bdev); +int amiga_partition(struct parsed_partitions *state); diff --git a/fs/partitions/atari.c b/fs/partitions/atari.c index 1f3572d5b75..4439ff1b6ce 100644 --- a/fs/partitions/atari.c +++ b/fs/partitions/atari.c @@ -30,7 +30,7 @@ static inline int OK_id(char *s) memcmp (s, "RAW", 3) == 0 ; } -int atari_partition(struct parsed_partitions *state, struct block_device *bdev) +int atari_partition(struct parsed_partitions *state) { Sector sect; struct rootsector *rs; @@ -42,12 +42,12 @@ int atari_partition(struct parsed_partitions *state, struct block_device *bdev) int part_fmt = 0; /* 0:unknown, 1:AHDI, 2:ICD/Supra */ #endif - rs = (struct rootsector *) read_dev_sector(bdev, 0, §); + rs = read_part_sector(state, 0, §); if (!rs) return -1; /* Verify this is an Atari rootsector: */ - hd_size = bdev->bd_inode->i_size >> 9; + hd_size = state->bdev->bd_inode->i_size >> 9; if (!VALID_PARTITION(&rs->part[0], hd_size) && !VALID_PARTITION(&rs->part[1], hd_size) && !VALID_PARTITION(&rs->part[2], hd_size) && @@ -84,7 +84,7 @@ int atari_partition(struct parsed_partitions *state, struct block_device *bdev) printk(" XGM<"); partsect = extensect = be32_to_cpu(pi->st); while (1) { - xrs = (struct rootsector *)read_dev_sector(bdev, partsect, §2); + xrs = read_part_sector(state, partsect, §2); if (!xrs) { printk (" block %ld read failed\n", partsect); put_dev_sector(sect); diff --git a/fs/partitions/atari.h b/fs/partitions/atari.h index 63186b00e13..fe2d32a89f3 100644 --- a/fs/partitions/atari.h +++ b/fs/partitions/atari.h @@ -31,4 +31,4 @@ struct rootsector u16 checksum; /* checksum for bootable disks */ } __attribute__((__packed__)); -int atari_partition(struct parsed_partitions *state, struct block_device *bdev); +int atari_partition(struct parsed_partitions *state); diff --git a/fs/partitions/check.c b/fs/partitions/check.c index e238ab23a9e..5dcd4b0c553 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -45,7 +45,7 @@ extern void md_autodetect_dev(dev_t dev); int warn_no_part = 1; /*This is ugly: should make genhd removable media aware*/ -static int (*check_part[])(struct parsed_partitions *, struct block_device *) = { +static int (*check_part[])(struct parsed_partitions *) = { /* * Probe partition formats with tables at disk address 0 * that also have an ADFS boot block at 0xdc0. @@ -161,10 +161,11 @@ check_partition(struct gendisk *hd, struct block_device *bdev) struct parsed_partitions *state; int i, res, err; - state = kmalloc(sizeof(struct parsed_partitions), GFP_KERNEL); + state = kzalloc(sizeof(struct parsed_partitions), GFP_KERNEL); if (!state) return NULL; + state->bdev = bdev; disk_name(hd, 0, state->name); printk(KERN_INFO " %s:", state->name); if (isdigit(state->name[strlen(state->name)-1])) @@ -174,7 +175,7 @@ check_partition(struct gendisk *hd, struct block_device *bdev) i = res = err = 0; while (!res && check_part[i]) { memset(&state->parts, 0, sizeof(state->parts)); - res = check_part[i++](state, bdev); + res = check_part[i++](state); if (res < 0) { /* We have hit an I/O error which we don't report now. * But record it, and let the others do their job. @@ -186,6 +187,8 @@ check_partition(struct gendisk *hd, struct block_device *bdev) } if (res > 0) return state; + if (state->access_beyond_eod) + err = -ENOSPC; if (err) /* The partition is unrecognized. So report I/O errors if there were any */ res = err; @@ -538,12 +541,33 @@ exit: disk_part_iter_exit(&piter); } +static bool disk_unlock_native_capacity(struct gendisk *disk) +{ + const struct block_device_operations *bdops = disk->fops; + + if (bdops->unlock_native_capacity && + !(disk->flags & GENHD_FL_NATIVE_CAPACITY)) { + printk(KERN_CONT "enabling native capacity\n"); + bdops->unlock_native_capacity(disk); + disk->flags |= GENHD_FL_NATIVE_CAPACITY; + return true; + } else { + printk(KERN_CONT "truncated\n"); + return false; + } +} + int rescan_partitions(struct gendisk *disk, struct block_device *bdev) { + struct parsed_partitions *state = NULL; struct disk_part_iter piter; struct hd_struct *part; - struct parsed_partitions *state; int p, highest, res; +rescan: + if (state && !IS_ERR(state)) { + kfree(state); + state = NULL; + } if (bdev->bd_part_count) return -EBUSY; @@ -562,8 +586,32 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev) bdev->bd_invalidated = 0; if (!get_capacity(disk) || !(state = check_partition(disk, bdev))) return 0; - if (IS_ERR(state)) /* I/O error reading the partition table */ + if (IS_ERR(state)) { + /* + * I/O error reading the partition table. If any + * partition code tried to read beyond EOD, retry + * after unlocking native capacity. + */ + if (PTR_ERR(state) == -ENOSPC) { + printk(KERN_WARNING "%s: partition table beyond EOD, ", + disk->disk_name); + if (disk_unlock_native_capacity(disk)) + goto rescan; + } return -EIO; + } + /* + * If any partition code tried to read beyond EOD, try + * unlocking native capacity even if partition table is + * sucessfully read as we could be missing some partitions. + */ + if (state->access_beyond_eod) { + printk(KERN_WARNING + "%s: partition table partially beyond EOD, ", + disk->disk_name); + if (disk_unlock_native_capacity(disk)) + goto rescan; + } /* tell userspace that the media / partition table may have changed */ kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE); @@ -581,7 +629,7 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev) /* add partitions */ for (p = 1; p < state->limit; p++) { sector_t size, from; -try_scan: + size = state->parts[p].size; if (!size) continue; @@ -589,30 +637,21 @@ try_scan: from = state->parts[p].from; if (from >= get_capacity(disk)) { printk(KERN_WARNING - "%s: p%d ignored, start %llu is behind the end of the disk\n", + "%s: p%d start %llu is beyond EOD, ", disk->disk_name, p, (unsigned long long) from); + if (disk_unlock_native_capacity(disk)) + goto rescan; continue; } if (from + size > get_capacity(disk)) { - const struct block_device_operations *bdops = disk->fops; - unsigned long long capacity; - printk(KERN_WARNING - "%s: p%d size %llu exceeds device capacity, ", + "%s: p%d size %llu extends beyond EOD, ", disk->disk_name, p, (unsigned long long) size); - if (bdops->set_capacity && - (disk->flags & GENHD_FL_NATIVE_CAPACITY) == 0) { - printk(KERN_CONT "enabling native capacity\n"); - capacity = bdops->set_capacity(disk, ~0ULL); - disk->flags |= GENHD_FL_NATIVE_CAPACITY; - if (capacity > get_capacity(disk)) { - set_capacity(disk, capacity); - check_disk_size_change(disk, bdev); - bdev->bd_invalidated = 0; - } - goto try_scan; + if (disk_unlock_native_capacity(disk)) { + /* free state and restart */ + goto rescan; } else { /* * we can not ignore partitions of broken tables @@ -620,7 +659,6 @@ try_scan: * we limit them to the end of the disk to avoid * creating invalid block devices */ - printk(KERN_CONT "limited to end of disk\n"); size = get_capacity(disk) - from; } } diff --git a/fs/partitions/check.h b/fs/partitions/check.h index 98dbe1a8452..52f8bd39939 100644 --- a/fs/partitions/check.h +++ b/fs/partitions/check.h @@ -6,6 +6,7 @@ * description. */ struct parsed_partitions { + struct block_device *bdev; char name[BDEVNAME_SIZE]; struct { sector_t from; @@ -14,8 +15,19 @@ struct parsed_partitions { } parts[DISK_MAX_PARTS]; int next; int limit; + bool access_beyond_eod; }; +static inline void *read_part_sector(struct parsed_partitions *state, + sector_t n, Sector *p) +{ + if (n >= get_capacity(state->bdev->bd_disk)) { + state->access_beyond_eod = true; + return NULL; + } + return read_dev_sector(state->bdev, n, p); +} + static inline void put_partition(struct parsed_partitions *p, int n, sector_t from, sector_t size) { diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c index 91babdae758..9efb2cfe241 100644 --- a/fs/partitions/efi.c +++ b/fs/partitions/efi.c @@ -140,8 +140,7 @@ efi_crc32(const void *buf, unsigned long len) * the part[0] entry for this disk, and is the number of * physical sectors available on the disk. */ -static u64 -last_lba(struct block_device *bdev) +static u64 last_lba(struct block_device *bdev) { if (!bdev || !bdev->bd_inode) return 0; @@ -181,27 +180,28 @@ is_pmbr_valid(legacy_mbr *mbr) /** * read_lba(): Read bytes from disk, starting at given LBA - * @bdev + * @state * @lba * @buffer * @size_t * - * Description: Reads @count bytes from @bdev into @buffer. + * Description: Reads @count bytes from @state->bdev into @buffer. * Returns number of bytes read on success, 0 on error. */ -static size_t -read_lba(struct block_device *bdev, u64 lba, u8 * buffer, size_t count) +static size_t read_lba(struct parsed_partitions *state, + u64 lba, u8 *buffer, size_t count) { size_t totalreadcount = 0; + struct block_device *bdev = state->bdev; sector_t n = lba * (bdev_logical_block_size(bdev) / 512); - if (!bdev || !buffer || lba > last_lba(bdev)) + if (!buffer || lba > last_lba(bdev)) return 0; while (count) { int copied = 512; Sector sect; - unsigned char *data = read_dev_sector(bdev, n++, §); + unsigned char *data = read_part_sector(state, n++, §); if (!data) break; if (copied > count) @@ -217,19 +217,20 @@ read_lba(struct block_device *bdev, u64 lba, u8 * buffer, size_t count) /** * alloc_read_gpt_entries(): reads partition entries from disk - * @bdev + * @state * @gpt - GPT header * * Description: Returns ptes on success, NULL on error. * Allocates space for PTEs based on information found in @gpt. * Notes: remember to free pte when you're done! */ -static gpt_entry * -alloc_read_gpt_entries(struct block_device *bdev, gpt_header *gpt) +static gpt_entry *alloc_read_gpt_entries(struct parsed_partitions *state, + gpt_header *gpt) { size_t count; gpt_entry *pte; - if (!bdev || !gpt) + + if (!gpt) return NULL; count = le32_to_cpu(gpt->num_partition_entries) * @@ -240,7 +241,7 @@ alloc_read_gpt_entries(struct block_device *bdev, gpt_header *gpt) if (!pte) return NULL; - if (read_lba(bdev, le64_to_cpu(gpt->partition_entry_lba), + if (read_lba(state, le64_to_cpu(gpt->partition_entry_lba), (u8 *) pte, count) < count) { kfree(pte); @@ -252,27 +253,24 @@ alloc_read_gpt_entries(struct block_device *bdev, gpt_header *gpt) /** * alloc_read_gpt_header(): Allocates GPT header, reads into it from disk - * @bdev + * @state * @lba is the Logical Block Address of the partition table * * Description: returns GPT header on success, NULL on error. Allocates - * and fills a GPT header starting at @ from @bdev. + * and fills a GPT header starting at @ from @state->bdev. * Note: remember to free gpt when finished with it. */ -static gpt_header * -alloc_read_gpt_header(struct block_device *bdev, u64 lba) +static gpt_header *alloc_read_gpt_header(struct parsed_partitions *state, + u64 lba) { gpt_header *gpt; - unsigned ssz = bdev_logical_block_size(bdev); - - if (!bdev) - return NULL; + unsigned ssz = bdev_logical_block_size(state->bdev); gpt = kzalloc(ssz, GFP_KERNEL); if (!gpt) return NULL; - if (read_lba(bdev, lba, (u8 *) gpt, ssz) < ssz) { + if (read_lba(state, lba, (u8 *) gpt, ssz) < ssz) { kfree(gpt); gpt=NULL; return NULL; @@ -283,7 +281,7 @@ alloc_read_gpt_header(struct block_device *bdev, u64 lba) /** * is_gpt_valid() - tests one GPT header and PTEs for validity - * @bdev + * @state * @lba is the logical block address of the GPT header to test * @gpt is a GPT header ptr, filled on return. * @ptes is a PTEs ptr, filled on return. @@ -291,16 +289,15 @@ alloc_read_gpt_header(struct block_device *bdev, u64 lba) * Description: returns 1 if valid, 0 on error. * If valid, returns pointers to newly allocated GPT header and PTEs. */ -static int -is_gpt_valid(struct block_device *bdev, u64 lba, - gpt_header **gpt, gpt_entry **ptes) +static int is_gpt_valid(struct parsed_partitions *state, u64 lba, + gpt_header **gpt, gpt_entry **ptes) { u32 crc, origcrc; u64 lastlba; - if (!bdev || !gpt || !ptes) + if (!ptes) return 0; - if (!(*gpt = alloc_read_gpt_header(bdev, lba))) + if (!(*gpt = alloc_read_gpt_header(state, lba))) return 0; /* Check the GUID Partition Table signature */ @@ -336,7 +333,7 @@ is_gpt_valid(struct block_device *bdev, u64 lba, /* Check the first_usable_lba and last_usable_lba are * within the disk. */ - lastlba = last_lba(bdev); + lastlba = last_lba(state->bdev); if (le64_to_cpu((*gpt)->first_usable_lba) > lastlba) { pr_debug("GPT: first_usable_lba incorrect: %lld > %lld\n", (unsigned long long)le64_to_cpu((*gpt)->first_usable_lba), @@ -350,7 +347,7 @@ is_gpt_valid(struct block_device *bdev, u64 lba, goto fail; } - if (!(*ptes = alloc_read_gpt_entries(bdev, *gpt))) + if (!(*ptes = alloc_read_gpt_entries(state, *gpt))) goto fail; /* Check the GUID Partition Entry Array CRC */ @@ -495,7 +492,7 @@ compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba) /** * find_valid_gpt() - Search disk for valid GPT headers and PTEs - * @bdev + * @state * @gpt is a GPT header ptr, filled on return. * @ptes is a PTEs ptr, filled on return. * Description: Returns 1 if valid, 0 on error. @@ -508,24 +505,25 @@ compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba) * This protects against devices which misreport their size, and forces * the user to decide to use the Alternate GPT. */ -static int -find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes) +static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt, + gpt_entry **ptes) { int good_pgpt = 0, good_agpt = 0, good_pmbr = 0; gpt_header *pgpt = NULL, *agpt = NULL; gpt_entry *pptes = NULL, *aptes = NULL; legacy_mbr *legacymbr; u64 lastlba; - if (!bdev || !gpt || !ptes) + + if (!ptes) return 0; - lastlba = last_lba(bdev); + lastlba = last_lba(state->bdev); if (!force_gpt) { /* This will be added to the EFI Spec. per Intel after v1.02. */ legacymbr = kzalloc(sizeof (*legacymbr), GFP_KERNEL); if (legacymbr) { - read_lba(bdev, 0, (u8 *) legacymbr, - sizeof (*legacymbr)); + read_lba(state, 0, (u8 *) legacymbr, + sizeof (*legacymbr)); good_pmbr = is_pmbr_valid(legacymbr); kfree(legacymbr); } @@ -533,15 +531,14 @@ find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes) goto fail; } - good_pgpt = is_gpt_valid(bdev, GPT_PRIMARY_PARTITION_TABLE_LBA, + good_pgpt = is_gpt_valid(state, GPT_PRIMARY_PARTITION_TABLE_LBA, &pgpt, &pptes); if (good_pgpt) - good_agpt = is_gpt_valid(bdev, + good_agpt = is_gpt_valid(state, le64_to_cpu(pgpt->alternate_lba), &agpt, &aptes); if (!good_agpt && force_gpt) - good_agpt = is_gpt_valid(bdev, lastlba, - &agpt, &aptes); + good_agpt = is_gpt_valid(state, lastlba, &agpt, &aptes); /* The obviously unsuccessful case */ if (!good_pgpt && !good_agpt) @@ -583,9 +580,8 @@ find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes) } /** - * efi_partition(struct parsed_partitions *state, struct block_device *bdev) + * efi_partition(struct parsed_partitions *state) * @state - * @bdev * * Description: called from check.c, if the disk contains GPT * partitions, sets up partition entries in the kernel. @@ -602,15 +598,14 @@ find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes) * 1 if successful * */ -int -efi_partition(struct parsed_partitions *state, struct block_device *bdev) +int efi_partition(struct parsed_partitions *state) { gpt_header *gpt = NULL; gpt_entry *ptes = NULL; u32 i; - unsigned ssz = bdev_logical_block_size(bdev) / 512; + unsigned ssz = bdev_logical_block_size(state->bdev) / 512; - if (!find_valid_gpt(bdev, &gpt, &ptes) || !gpt || !ptes) { + if (!find_valid_gpt(state, &gpt, &ptes) || !gpt || !ptes) { kfree(gpt); kfree(ptes); return 0; @@ -623,7 +618,7 @@ efi_partition(struct parsed_partitions *state, struct block_device *bdev) u64 size = le64_to_cpu(ptes[i].ending_lba) - le64_to_cpu(ptes[i].starting_lba) + 1ULL; - if (!is_pte_valid(&ptes[i], last_lba(bdev))) + if (!is_pte_valid(&ptes[i], last_lba(state->bdev))) continue; put_partition(state, i+1, start * ssz, size * ssz); @@ -631,7 +626,7 @@ efi_partition(struct parsed_partitions *state, struct block_device *bdev) /* If this is a RAID volume, tell md */ if (!efi_guidcmp(ptes[i].partition_type_guid, PARTITION_LINUX_RAID_GUID)) - state->parts[i+1].flags = 1; + state->parts[i + 1].flags = ADDPART_FLAG_RAID; } kfree(ptes); kfree(gpt); diff --git a/fs/partitions/efi.h b/fs/partitions/efi.h index 6998b589abf..b69ab729558 100644 --- a/fs/partitions/efi.h +++ b/fs/partitions/efi.h @@ -110,7 +110,7 @@ typedef struct _legacy_mbr { } __attribute__ ((packed)) legacy_mbr; /* Functions */ -extern int efi_partition(struct parsed_partitions *state, struct block_device *bdev); +extern int efi_partition(struct parsed_partitions *state); #endif diff --git a/fs/partitions/ibm.c b/fs/partitions/ibm.c index fc71aab0846..3e73de5967f 100644 --- a/fs/partitions/ibm.c +++ b/fs/partitions/ibm.c @@ -58,9 +58,9 @@ cchhb2blk (struct vtoc_cchhb *ptr, struct hd_geometry *geo) { /* */ -int -ibm_partition(struct parsed_partitions *state, struct block_device *bdev) +int ibm_partition(struct parsed_partitions *state) { + struct block_device *bdev = state->bdev; int blocksize, res; loff_t i_size, offset, size, fmt_size; dasd_information2_t *info; @@ -100,7 +100,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev) /* * Get volume label, extract name and type. */ - data = read_dev_sector(bdev, info->label_block*(blocksize/512), §); + data = read_part_sector(state, info->label_block*(blocksize/512), + §); if (data == NULL) goto out_readerr; @@ -193,8 +194,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev) */ blk = cchhb2blk(&label->vol.vtoc, geo) + 1; counter = 0; - data = read_dev_sector(bdev, blk * (blocksize/512), - §); + data = read_part_sector(state, blk * (blocksize/512), + §); while (data != NULL) { struct vtoc_format1_label f1; @@ -208,9 +209,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev) || f1.DS1FMTID == _ascebc['7'] || f1.DS1FMTID == _ascebc['9']) { blk++; - data = read_dev_sector(bdev, blk * - (blocksize/512), - §); + data = read_part_sector(state, + blk * (blocksize/512), §); continue; } @@ -230,9 +230,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev) size * (blocksize >> 9)); counter++; blk++; - data = read_dev_sector(bdev, - blk * (blocksize/512), - §); + data = read_part_sector(state, + blk * (blocksize/512), §); } if (!data) diff --git a/fs/partitions/ibm.h b/fs/partitions/ibm.h index 31f85a6ac45..08fb0804a81 100644 --- a/fs/partitions/ibm.h +++ b/fs/partitions/ibm.h @@ -1 +1 @@ -int ibm_partition(struct parsed_partitions *, struct block_device *); +int ibm_partition(struct parsed_partitions *); diff --git a/fs/partitions/karma.c b/fs/partitions/karma.c index 176d89bcf12..1cc928bb762 100644 --- a/fs/partitions/karma.c +++ b/fs/partitions/karma.c @@ -9,7 +9,7 @@ #include "check.h" #include "karma.h" -int karma_partition(struct parsed_partitions *state, struct block_device *bdev) +int karma_partition(struct parsed_partitions *state) { int i; int slot = 1; @@ -29,7 +29,7 @@ int karma_partition(struct parsed_partitions *state, struct block_device *bdev) } __attribute__((packed)) *label; struct d_partition *p; - data = read_dev_sector(bdev, 0, §); + data = read_part_sector(state, 0, §); if (!data) return -1; diff --git a/fs/partitions/karma.h b/fs/partitions/karma.h index ecf7d3f2a3d..c764b2e9df2 100644 --- a/fs/partitions/karma.h +++ b/fs/partitions/karma.h @@ -4,5 +4,5 @@ #define KARMA_LABEL_MAGIC 0xAB56 -int karma_partition(struct parsed_partitions *state, struct block_device *bdev); +int karma_partition(struct parsed_partitions *state); diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c index 8652fb99e96..648c9d8f335 100644 --- a/fs/partitions/ldm.c +++ b/fs/partitions/ldm.c @@ -26,6 +26,7 @@ #include <linux/slab.h> #include <linux/pagemap.h> #include <linux/stringify.h> +#include <linux/kernel.h> #include "ldm.h" #include "check.h" #include "msdos.h" @@ -77,17 +78,16 @@ static int ldm_parse_hexbyte (const u8 *src) int h; /* high part */ - if ((x = src[0] - '0') <= '9'-'0') h = x; - else if ((x = src[0] - 'a') <= 'f'-'a') h = x+10; - else if ((x = src[0] - 'A') <= 'F'-'A') h = x+10; - else return -1; - h <<= 4; + x = h = hex_to_bin(src[0]); + if (h < 0) + return -1; /* low part */ - if ((x = src[1] - '0') <= '9'-'0') return h | x; - if ((x = src[1] - 'a') <= 'f'-'a') return h | (x+10); - if ((x = src[1] - 'A') <= 'F'-'A') return h | (x+10); - return -1; + h = hex_to_bin(src[1]); + if (h < 0) + return -1; + + return (x << 4) + h; } /** @@ -309,7 +309,7 @@ static bool ldm_compare_tocblocks (const struct tocblock *toc1, /** * ldm_validate_privheads - Compare the primary privhead with its backups - * @bdev: Device holding the LDM Database + * @state: Partition check state including device holding the LDM Database * @ph1: Memory struct to fill with ph contents * * Read and compare all three privheads from disk. @@ -321,8 +321,8 @@ static bool ldm_compare_tocblocks (const struct tocblock *toc1, * Return: 'true' Success * 'false' Error */ -static bool ldm_validate_privheads (struct block_device *bdev, - struct privhead *ph1) +static bool ldm_validate_privheads(struct parsed_partitions *state, + struct privhead *ph1) { static const int off[3] = { OFF_PRIV1, OFF_PRIV2, OFF_PRIV3 }; struct privhead *ph[3] = { ph1 }; @@ -332,7 +332,7 @@ static bool ldm_validate_privheads (struct block_device *bdev, long num_sects; int i; - BUG_ON (!bdev || !ph1); + BUG_ON (!state || !ph1); ph[1] = kmalloc (sizeof (*ph[1]), GFP_KERNEL); ph[2] = kmalloc (sizeof (*ph[2]), GFP_KERNEL); @@ -346,8 +346,8 @@ static bool ldm_validate_privheads (struct block_device *bdev, /* Read and parse privheads */ for (i = 0; i < 3; i++) { - data = read_dev_sector (bdev, - ph[0]->config_start + off[i], §); + data = read_part_sector(state, ph[0]->config_start + off[i], + §); if (!data) { ldm_crit ("Disk read failed."); goto out; @@ -363,7 +363,7 @@ static bool ldm_validate_privheads (struct block_device *bdev, } } - num_sects = bdev->bd_inode->i_size >> 9; + num_sects = state->bdev->bd_inode->i_size >> 9; if ((ph[0]->config_start > num_sects) || ((ph[0]->config_start + ph[0]->config_size) > num_sects)) { @@ -397,20 +397,20 @@ out: /** * ldm_validate_tocblocks - Validate the table of contents and its backups - * @bdev: Device holding the LDM Database - * @base: Offset, into @bdev, of the database + * @state: Partition check state including device holding the LDM Database + * @base: Offset, into @state->bdev, of the database * @ldb: Cache of the database structures * * Find and compare the four tables of contents of the LDM Database stored on - * @bdev and return the parsed information into @toc1. + * @state->bdev and return the parsed information into @toc1. * * The offsets and sizes of the configs are range-checked against a privhead. * * Return: 'true' @toc1 contains validated TOCBLOCK info * 'false' @toc1 contents are undefined */ -static bool ldm_validate_tocblocks(struct block_device *bdev, - unsigned long base, struct ldmdb *ldb) +static bool ldm_validate_tocblocks(struct parsed_partitions *state, + unsigned long base, struct ldmdb *ldb) { static const int off[4] = { OFF_TOCB1, OFF_TOCB2, OFF_TOCB3, OFF_TOCB4}; struct tocblock *tb[4]; @@ -420,7 +420,7 @@ static bool ldm_validate_tocblocks(struct block_device *bdev, int i, nr_tbs; bool result = false; - BUG_ON(!bdev || !ldb); + BUG_ON(!state || !ldb); ph = &ldb->ph; tb[0] = &ldb->toc; tb[1] = kmalloc(sizeof(*tb[1]) * 3, GFP_KERNEL); @@ -437,7 +437,7 @@ static bool ldm_validate_tocblocks(struct block_device *bdev, * skip any that fail as long as we get at least one valid TOCBLOCK. */ for (nr_tbs = i = 0; i < 4; i++) { - data = read_dev_sector(bdev, base + off[i], §); + data = read_part_sector(state, base + off[i], §); if (!data) { ldm_error("Disk read failed for TOCBLOCK %d.", i); continue; @@ -473,7 +473,7 @@ err: /** * ldm_validate_vmdb - Read the VMDB and validate it - * @bdev: Device holding the LDM Database + * @state: Partition check state including device holding the LDM Database * @base: Offset, into @bdev, of the database * @ldb: Cache of the database structures * @@ -483,8 +483,8 @@ err: * Return: 'true' @ldb contains validated VBDB info * 'false' @ldb contents are undefined */ -static bool ldm_validate_vmdb (struct block_device *bdev, unsigned long base, - struct ldmdb *ldb) +static bool ldm_validate_vmdb(struct parsed_partitions *state, + unsigned long base, struct ldmdb *ldb) { Sector sect; u8 *data; @@ -492,12 +492,12 @@ static bool ldm_validate_vmdb (struct block_device *bdev, unsigned long base, struct vmdb *vm; struct tocblock *toc; - BUG_ON (!bdev || !ldb); + BUG_ON (!state || !ldb); vm = &ldb->vm; toc = &ldb->toc; - data = read_dev_sector (bdev, base + OFF_VMDB, §); + data = read_part_sector(state, base + OFF_VMDB, §); if (!data) { ldm_crit ("Disk read failed."); return false; @@ -534,21 +534,21 @@ out: /** * ldm_validate_partition_table - Determine whether bdev might be a dynamic disk - * @bdev: Device holding the LDM Database + * @state: Partition check state including device holding the LDM Database * * This function provides a weak test to decide whether the device is a dynamic * disk or not. It looks for an MS-DOS-style partition table containing at * least one partition of type 0x42 (formerly SFS, now used by Windows for * dynamic disks). * - * N.B. The only possible error can come from the read_dev_sector and that is + * N.B. The only possible error can come from the read_part_sector and that is * only likely to happen if the underlying device is strange. If that IS * the case we should return zero to let someone else try. * - * Return: 'true' @bdev is a dynamic disk - * 'false' @bdev is not a dynamic disk, or an error occurred + * Return: 'true' @state->bdev is a dynamic disk + * 'false' @state->bdev is not a dynamic disk, or an error occurred */ -static bool ldm_validate_partition_table (struct block_device *bdev) +static bool ldm_validate_partition_table(struct parsed_partitions *state) { Sector sect; u8 *data; @@ -556,9 +556,9 @@ static bool ldm_validate_partition_table (struct block_device *bdev) int i; bool result = false; - BUG_ON (!bdev); + BUG_ON(!state); - data = read_dev_sector (bdev, 0, §); + data = read_part_sector(state, 0, §); if (!data) { ldm_crit ("Disk read failed."); return false; @@ -1391,8 +1391,8 @@ static bool ldm_frag_commit (struct list_head *frags, struct ldmdb *ldb) /** * ldm_get_vblks - Read the on-disk database of VBLKs into memory - * @bdev: Device holding the LDM Database - * @base: Offset, into @bdev, of the database + * @state: Partition check state including device holding the LDM Database + * @base: Offset, into @state->bdev, of the database * @ldb: Cache of the database structures * * To use the information from the VBLKs, they need to be read from the disk, @@ -1401,8 +1401,8 @@ static bool ldm_frag_commit (struct list_head *frags, struct ldmdb *ldb) * Return: 'true' All the VBLKs were read successfully * 'false' An error occurred */ -static bool ldm_get_vblks (struct block_device *bdev, unsigned long base, - struct ldmdb *ldb) +static bool ldm_get_vblks(struct parsed_partitions *state, unsigned long base, + struct ldmdb *ldb) { int size, perbuf, skip, finish, s, v, recs; u8 *data = NULL; @@ -1410,7 +1410,7 @@ static bool ldm_get_vblks (struct block_device *bdev, unsigned long base, bool result = false; LIST_HEAD (frags); - BUG_ON (!bdev || !ldb); + BUG_ON(!state || !ldb); size = ldb->vm.vblk_size; perbuf = 512 / size; @@ -1418,7 +1418,7 @@ static bool ldm_get_vblks (struct block_device *bdev, unsigned long base, finish = (size * ldb->vm.last_vblk_seq) >> 9; for (s = skip; s < finish; s++) { /* For each sector */ - data = read_dev_sector (bdev, base + OFF_VMDB + s, §); + data = read_part_sector(state, base + OFF_VMDB + s, §); if (!data) { ldm_crit ("Disk read failed."); goto out; @@ -1474,8 +1474,7 @@ static void ldm_free_vblks (struct list_head *lh) /** * ldm_partition - Find out whether a device is a dynamic disk and handle it - * @pp: List of the partitions parsed so far - * @bdev: Device holding the LDM Database + * @state: Partition check state including device holding the LDM Database * * This determines whether the device @bdev is a dynamic disk and if so creates * the partitions necessary in the gendisk structure pointed to by @hd. @@ -1485,21 +1484,21 @@ static void ldm_free_vblks (struct list_head *lh) * example, if the device is hda, we would have: hda1: LDM database, hda2, hda3, * and so on: the actual data containing partitions. * - * Return: 1 Success, @bdev is a dynamic disk and we handled it - * 0 Success, @bdev is not a dynamic disk + * Return: 1 Success, @state->bdev is a dynamic disk and we handled it + * 0 Success, @state->bdev is not a dynamic disk * -1 An error occurred before enough information had been read - * Or @bdev is a dynamic disk, but it may be corrupted + * Or @state->bdev is a dynamic disk, but it may be corrupted */ -int ldm_partition (struct parsed_partitions *pp, struct block_device *bdev) +int ldm_partition(struct parsed_partitions *state) { struct ldmdb *ldb; unsigned long base; int result = -1; - BUG_ON (!pp || !bdev); + BUG_ON(!state); /* Look for signs of a Dynamic Disk */ - if (!ldm_validate_partition_table (bdev)) + if (!ldm_validate_partition_table(state)) return 0; ldb = kmalloc (sizeof (*ldb), GFP_KERNEL); @@ -1509,15 +1508,15 @@ int ldm_partition (struct parsed_partitions *pp, struct block_device *bdev) } /* Parse and check privheads. */ - if (!ldm_validate_privheads (bdev, &ldb->ph)) + if (!ldm_validate_privheads(state, &ldb->ph)) goto out; /* Already logged */ /* All further references are relative to base (database start). */ base = ldb->ph.config_start; /* Parse and check tocs and vmdb. */ - if (!ldm_validate_tocblocks (bdev, base, ldb) || - !ldm_validate_vmdb (bdev, base, ldb)) + if (!ldm_validate_tocblocks(state, base, ldb) || + !ldm_validate_vmdb(state, base, ldb)) goto out; /* Already logged */ /* Initialize vblk lists in ldmdb struct */ @@ -1527,13 +1526,13 @@ int ldm_partition (struct parsed_partitions *pp, struct block_device *bdev) INIT_LIST_HEAD (&ldb->v_comp); INIT_LIST_HEAD (&ldb->v_part); - if (!ldm_get_vblks (bdev, base, ldb)) { + if (!ldm_get_vblks(state, base, ldb)) { ldm_crit ("Failed to read the VBLKs from the database."); goto cleanup; } /* Finally, create the data partition devices. */ - if (ldm_create_data_partitions (pp, ldb)) { + if (ldm_create_data_partitions(state, ldb)) { ldm_debug ("Parsed LDM database successfully."); result = 1; } diff --git a/fs/partitions/ldm.h b/fs/partitions/ldm.h index 30e08e809c1..d1fb50b28d8 100644 --- a/fs/partitions/ldm.h +++ b/fs/partitions/ldm.h @@ -209,7 +209,7 @@ struct ldmdb { /* Cache of the database */ struct list_head v_part; }; -int ldm_partition (struct parsed_partitions *state, struct block_device *bdev); +int ldm_partition(struct parsed_partitions *state); #endif /* _FS_PT_LDM_H_ */ diff --git a/fs/partitions/mac.c b/fs/partitions/mac.c index d4a0fad3563..74465ff7c26 100644 --- a/fs/partitions/mac.c +++ b/fs/partitions/mac.c @@ -27,7 +27,7 @@ static inline void mac_fix_string(char *stg, int len) stg[i] = 0; } -int mac_partition(struct parsed_partitions *state, struct block_device *bdev) +int mac_partition(struct parsed_partitions *state) { int slot = 1; Sector sect; @@ -42,7 +42,7 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev) struct mac_driver_desc *md; /* Get 0th block and look at the first partition map entry. */ - md = (struct mac_driver_desc *) read_dev_sector(bdev, 0, §); + md = read_part_sector(state, 0, §); if (!md) return -1; if (be16_to_cpu(md->signature) != MAC_DRIVER_MAGIC) { @@ -51,7 +51,7 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev) } secsize = be16_to_cpu(md->block_size); put_dev_sector(sect); - data = read_dev_sector(bdev, secsize/512, §); + data = read_part_sector(state, secsize/512, §); if (!data) return -1; part = (struct mac_partition *) (data + secsize%512); @@ -64,7 +64,7 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev) for (blk = 1; blk <= blocks_in_map; ++blk) { int pos = blk * secsize; put_dev_sector(sect); - data = read_dev_sector(bdev, pos/512, §); + data = read_part_sector(state, pos/512, §); if (!data) return -1; part = (struct mac_partition *) (data + pos%512); @@ -75,7 +75,7 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev) be32_to_cpu(part->block_count) * (secsize/512)); if (!strnicmp(part->type, "Linux_RAID", 10)) - state->parts[slot].flags = 1; + state->parts[slot].flags = ADDPART_FLAG_RAID; #ifdef CONFIG_PPC_PMAC /* * If this is the first bootable partition, tell the @@ -123,7 +123,8 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev) } #ifdef CONFIG_PPC_PMAC if (found_root_goodness) - note_bootable_part(bdev->bd_dev, found_root, found_root_goodness); + note_bootable_part(state->bdev->bd_dev, found_root, + found_root_goodness); #endif put_dev_sector(sect); diff --git a/fs/partitions/mac.h b/fs/partitions/mac.h index bbf26e1386f..3c7d9843638 100644 --- a/fs/partitions/mac.h +++ b/fs/partitions/mac.h @@ -41,4 +41,4 @@ struct mac_driver_desc { /* ... more stuff */ }; -int mac_partition(struct parsed_partitions *state, struct block_device *bdev); +int mac_partition(struct parsed_partitions *state); diff --git a/fs/partitions/msdos.c b/fs/partitions/msdos.c index 90be97f1f5a..15bfb7b1e04 100644 --- a/fs/partitions/msdos.c +++ b/fs/partitions/msdos.c @@ -64,7 +64,7 @@ msdos_magic_present(unsigned char *p) #define AIX_LABEL_MAGIC2 0xC2 #define AIX_LABEL_MAGIC3 0xD4 #define AIX_LABEL_MAGIC4 0xC1 -static int aix_magic_present(unsigned char *p, struct block_device *bdev) +static int aix_magic_present(struct parsed_partitions *state, unsigned char *p) { struct partition *pt = (struct partition *) (p + 0x1be); Sector sect; @@ -85,7 +85,7 @@ static int aix_magic_present(unsigned char *p, struct block_device *bdev) is_extended_partition(pt)) return 0; } - d = read_dev_sector(bdev, 7, §); + d = read_part_sector(state, 7, §); if (d) { if (d[0] == '_' && d[1] == 'L' && d[2] == 'V' && d[3] == 'M') ret = 1; @@ -105,15 +105,14 @@ static int aix_magic_present(unsigned char *p, struct block_device *bdev) * only for the actual data partitions. */ -static void -parse_extended(struct parsed_partitions *state, struct block_device *bdev, - sector_t first_sector, sector_t first_size) +static void parse_extended(struct parsed_partitions *state, + sector_t first_sector, sector_t first_size) { struct partition *p; Sector sect; unsigned char *data; sector_t this_sector, this_size; - sector_t sector_size = bdev_logical_block_size(bdev) / 512; + sector_t sector_size = bdev_logical_block_size(state->bdev) / 512; int loopct = 0; /* number of links followed without finding a data partition */ int i; @@ -126,7 +125,7 @@ parse_extended(struct parsed_partitions *state, struct block_device *bdev, return; if (state->next == state->limit) return; - data = read_dev_sector(bdev, this_sector, §); + data = read_part_sector(state, this_sector, §); if (!data) return; @@ -198,9 +197,8 @@ done: /* james@bpgc.com: Solaris has a nasty indicator: 0x82 which also indicates linux swap. Be careful before believing this is Solaris. */ -static void -parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev, - sector_t offset, sector_t size, int origin) +static void parse_solaris_x86(struct parsed_partitions *state, + sector_t offset, sector_t size, int origin) { #ifdef CONFIG_SOLARIS_X86_PARTITION Sector sect; @@ -208,7 +206,7 @@ parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev, int i; short max_nparts; - v = (struct solaris_x86_vtoc *)read_dev_sector(bdev, offset+1, §); + v = read_part_sector(state, offset + 1, §); if (!v) return; if (le32_to_cpu(v->v_sanity) != SOLARIS_X86_VTOC_SANE) { @@ -245,16 +243,15 @@ parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev, * Create devices for BSD partitions listed in a disklabel, under a * dos-like partition. See parse_extended() for more information. */ -static void -parse_bsd(struct parsed_partitions *state, struct block_device *bdev, - sector_t offset, sector_t size, int origin, char *flavour, - int max_partitions) +static void parse_bsd(struct parsed_partitions *state, + sector_t offset, sector_t size, int origin, char *flavour, + int max_partitions) { Sector sect; struct bsd_disklabel *l; struct bsd_partition *p; - l = (struct bsd_disklabel *)read_dev_sector(bdev, offset+1, §); + l = read_part_sector(state, offset + 1, §); if (!l) return; if (le32_to_cpu(l->d_magic) != BSD_DISKMAGIC) { @@ -291,33 +288,28 @@ parse_bsd(struct parsed_partitions *state, struct block_device *bdev, } #endif -static void -parse_freebsd(struct parsed_partitions *state, struct block_device *bdev, - sector_t offset, sector_t size, int origin) +static void parse_freebsd(struct parsed_partitions *state, + sector_t offset, sector_t size, int origin) { #ifdef CONFIG_BSD_DISKLABEL - parse_bsd(state, bdev, offset, size, origin, - "bsd", BSD_MAXPARTITIONS); + parse_bsd(state, offset, size, origin, "bsd", BSD_MAXPARTITIONS); #endif } -static void -parse_netbsd(struct parsed_partitions *state, struct block_device *bdev, - sector_t offset, sector_t size, int origin) +static void parse_netbsd(struct parsed_partitions *state, + sector_t offset, sector_t size, int origin) { #ifdef CONFIG_BSD_DISKLABEL - parse_bsd(state, bdev, offset, size, origin, - "netbsd", BSD_MAXPARTITIONS); + parse_bsd(state, offset, size, origin, "netbsd", BSD_MAXPARTITIONS); #endif } -static void -parse_openbsd(struct parsed_partitions *state, struct block_device *bdev, - sector_t offset, sector_t size, int origin) +static void parse_openbsd(struct parsed_partitions *state, + sector_t offset, sector_t size, int origin) { #ifdef CONFIG_BSD_DISKLABEL - parse_bsd(state, bdev, offset, size, origin, - "openbsd", OPENBSD_MAXPARTITIONS); + parse_bsd(state, offset, size, origin, "openbsd", + OPENBSD_MAXPARTITIONS); #endif } @@ -325,16 +317,15 @@ parse_openbsd(struct parsed_partitions *state, struct block_device *bdev, * Create devices for Unixware partitions listed in a disklabel, under a * dos-like partition. See parse_extended() for more information. */ -static void -parse_unixware(struct parsed_partitions *state, struct block_device *bdev, - sector_t offset, sector_t size, int origin) +static void parse_unixware(struct parsed_partitions *state, + sector_t offset, sector_t size, int origin) { #ifdef CONFIG_UNIXWARE_DISKLABEL Sector sect; struct unixware_disklabel *l; struct unixware_slice *p; - l = (struct unixware_disklabel *)read_dev_sector(bdev, offset+29, §); + l = read_part_sector(state, offset + 29, §); if (!l) return; if (le32_to_cpu(l->d_magic) != UNIXWARE_DISKMAGIC || @@ -365,9 +356,8 @@ parse_unixware(struct parsed_partitions *state, struct block_device *bdev, * Anand Krishnamurthy <anandk@wiproge.med.ge.com> * Rajeev V. Pillai <rajeevvp@yahoo.com> */ -static void -parse_minix(struct parsed_partitions *state, struct block_device *bdev, - sector_t offset, sector_t size, int origin) +static void parse_minix(struct parsed_partitions *state, + sector_t offset, sector_t size, int origin) { #ifdef CONFIG_MINIX_SUBPARTITION Sector sect; @@ -375,7 +365,7 @@ parse_minix(struct parsed_partitions *state, struct block_device *bdev, struct partition *p; int i; - data = read_dev_sector(bdev, offset, §); + data = read_part_sector(state, offset, §); if (!data) return; @@ -404,8 +394,7 @@ parse_minix(struct parsed_partitions *state, struct block_device *bdev, static struct { unsigned char id; - void (*parse)(struct parsed_partitions *, struct block_device *, - sector_t, sector_t, int); + void (*parse)(struct parsed_partitions *, sector_t, sector_t, int); } subtypes[] = { {FREEBSD_PARTITION, parse_freebsd}, {NETBSD_PARTITION, parse_netbsd}, @@ -417,16 +406,16 @@ static struct { {0, NULL}, }; -int msdos_partition(struct parsed_partitions *state, struct block_device *bdev) +int msdos_partition(struct parsed_partitions *state) { - sector_t sector_size = bdev_logical_block_size(bdev) / 512; + sector_t sector_size = bdev_logical_block_size(state->bdev) / 512; Sector sect; unsigned char *data; struct partition *p; struct fat_boot_sector *fb; int slot; - data = read_dev_sector(bdev, 0, §); + data = read_part_sector(state, 0, §); if (!data) return -1; if (!msdos_magic_present(data + 510)) { @@ -434,7 +423,7 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev) return 0; } - if (aix_magic_present(data, bdev)) { + if (aix_magic_present(state, data)) { put_dev_sector(sect); printk( " [AIX]"); return 0; @@ -503,13 +492,13 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev) put_partition(state, slot, start, n); printk(" <"); - parse_extended(state, bdev, start, size); + parse_extended(state, start, size); printk(" >"); continue; } put_partition(state, slot, start, size); if (SYS_IND(p) == LINUX_RAID_PARTITION) - state->parts[slot].flags = 1; + state->parts[slot].flags = ADDPART_FLAG_RAID; if (SYS_IND(p) == DM6_PARTITION) printk("[DM]"); if (SYS_IND(p) == EZD_PARTITION) @@ -532,8 +521,8 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev) if (!subtypes[n].parse) continue; - subtypes[n].parse(state, bdev, start_sect(p)*sector_size, - nr_sects(p)*sector_size, slot); + subtypes[n].parse(state, start_sect(p) * sector_size, + nr_sects(p) * sector_size, slot); } put_dev_sector(sect); return 1; diff --git a/fs/partitions/msdos.h b/fs/partitions/msdos.h index 01e5e0b6902..38c781c490b 100644 --- a/fs/partitions/msdos.h +++ b/fs/partitions/msdos.h @@ -4,5 +4,5 @@ #define MSDOS_LABEL_MAGIC 0xAA55 -int msdos_partition(struct parsed_partitions *state, struct block_device *bdev); +int msdos_partition(struct parsed_partitions *state); diff --git a/fs/partitions/osf.c b/fs/partitions/osf.c index c05c17bc5df..fc22b85d436 100644 --- a/fs/partitions/osf.c +++ b/fs/partitions/osf.c @@ -10,7 +10,7 @@ #include "check.h" #include "osf.h" -int osf_partition(struct parsed_partitions *state, struct block_device *bdev) +int osf_partition(struct parsed_partitions *state) { int i; int slot = 1; @@ -49,7 +49,7 @@ int osf_partition(struct parsed_partitions *state, struct block_device *bdev) } * label; struct d_partition * partition; - data = read_dev_sector(bdev, 0, §); + data = read_part_sector(state, 0, §); if (!data) return -1; diff --git a/fs/partitions/osf.h b/fs/partitions/osf.h index 427b8eab314..20ed2315ec1 100644 --- a/fs/partitions/osf.h +++ b/fs/partitions/osf.h @@ -4,4 +4,4 @@ #define DISKLABELMAGIC (0x82564557UL) -int osf_partition(struct parsed_partitions *state, struct block_device *bdev); +int osf_partition(struct parsed_partitions *state); diff --git a/fs/partitions/sgi.c b/fs/partitions/sgi.c index ed5ac83fe83..43b1df9aa16 100644 --- a/fs/partitions/sgi.c +++ b/fs/partitions/sgi.c @@ -27,7 +27,7 @@ struct sgi_disklabel { __be32 _unused1; /* Padding */ }; -int sgi_partition(struct parsed_partitions *state, struct block_device *bdev) +int sgi_partition(struct parsed_partitions *state) { int i, csum; __be32 magic; @@ -39,7 +39,7 @@ int sgi_partition(struct parsed_partitions *state, struct block_device *bdev) struct sgi_partition *p; char b[BDEVNAME_SIZE]; - label = (struct sgi_disklabel *) read_dev_sector(bdev, 0, §); + label = read_part_sector(state, 0, §); if (!label) return -1; p = &label->partitions[0]; @@ -57,7 +57,7 @@ int sgi_partition(struct parsed_partitions *state, struct block_device *bdev) } if(csum) { printk(KERN_WARNING "Dev %s SGI disklabel: csum bad, label corrupted\n", - bdevname(bdev, b)); + bdevname(state->bdev, b)); put_dev_sector(sect); return 0; } diff --git a/fs/partitions/sgi.h b/fs/partitions/sgi.h index 5d5595c0992..b9553ebdd5a 100644 --- a/fs/partitions/sgi.h +++ b/fs/partitions/sgi.h @@ -2,7 +2,7 @@ * fs/partitions/sgi.h */ -extern int sgi_partition(struct parsed_partitions *state, struct block_device *bdev); +extern int sgi_partition(struct parsed_partitions *state); #define SGI_LABEL_MAGIC 0x0be5a941 diff --git a/fs/partitions/sun.c b/fs/partitions/sun.c index c95e6a62c01..a32660e25f7 100644 --- a/fs/partitions/sun.c +++ b/fs/partitions/sun.c @@ -10,7 +10,7 @@ #include "check.h" #include "sun.h" -int sun_partition(struct parsed_partitions *state, struct block_device *bdev) +int sun_partition(struct parsed_partitions *state) { int i; __be16 csum; @@ -61,7 +61,7 @@ int sun_partition(struct parsed_partitions *state, struct block_device *bdev) int use_vtoc; int nparts; - label = (struct sun_disklabel *)read_dev_sector(bdev, 0, §); + label = read_part_sector(state, 0, §); if (!label) return -1; @@ -78,7 +78,7 @@ int sun_partition(struct parsed_partitions *state, struct block_device *bdev) csum ^= *ush--; if (csum) { printk("Dev %s Sun disklabel: Csum bad, label corrupted\n", - bdevname(bdev, b)); + bdevname(state->bdev, b)); put_dev_sector(sect); return 0; } diff --git a/fs/partitions/sun.h b/fs/partitions/sun.h index 7f864d1f86d..2424baa8319 100644 --- a/fs/partitions/sun.h +++ b/fs/partitions/sun.h @@ -5,4 +5,4 @@ #define SUN_LABEL_MAGIC 0xDABE #define SUN_VTOC_SANITY 0x600DDEEE -int sun_partition(struct parsed_partitions *state, struct block_device *bdev); +int sun_partition(struct parsed_partitions *state); diff --git a/fs/partitions/sysv68.c b/fs/partitions/sysv68.c index 4eba27b7864..9030c864428 100644 --- a/fs/partitions/sysv68.c +++ b/fs/partitions/sysv68.c @@ -46,7 +46,7 @@ struct slice { }; -int sysv68_partition(struct parsed_partitions *state, struct block_device *bdev) +int sysv68_partition(struct parsed_partitions *state) { int i, slices; int slot = 1; @@ -55,7 +55,7 @@ int sysv68_partition(struct parsed_partitions *state, struct block_device *bdev) struct dkblk0 *b; struct slice *slice; - data = read_dev_sector(bdev, 0, §); + data = read_part_sector(state, 0, §); if (!data) return -1; @@ -68,7 +68,7 @@ int sysv68_partition(struct parsed_partitions *state, struct block_device *bdev) i = be32_to_cpu(b->dk_ios.ios_slcblk); put_dev_sector(sect); - data = read_dev_sector(bdev, i, §); + data = read_part_sector(state, i, §); if (!data) return -1; diff --git a/fs/partitions/sysv68.h b/fs/partitions/sysv68.h index fa733f68431..bf2f5ffa97a 100644 --- a/fs/partitions/sysv68.h +++ b/fs/partitions/sysv68.h @@ -1 +1 @@ -extern int sysv68_partition(struct parsed_partitions *state, struct block_device *bdev); +extern int sysv68_partition(struct parsed_partitions *state); diff --git a/fs/partitions/ultrix.c b/fs/partitions/ultrix.c index ec852c11dce..db9eef26036 100644 --- a/fs/partitions/ultrix.c +++ b/fs/partitions/ultrix.c @@ -9,7 +9,7 @@ #include "check.h" #include "ultrix.h" -int ultrix_partition(struct parsed_partitions *state, struct block_device *bdev) +int ultrix_partition(struct parsed_partitions *state) { int i; Sector sect; @@ -26,7 +26,7 @@ int ultrix_partition(struct parsed_partitions *state, struct block_device *bdev) #define PT_MAGIC 0x032957 /* Partition magic number */ #define PT_VALID 1 /* Indicates if struct is valid */ - data = read_dev_sector(bdev, (16384 - sizeof(*label))/512, §); + data = read_part_sector(state, (16384 - sizeof(*label))/512, §); if (!data) return -1; diff --git a/fs/partitions/ultrix.h b/fs/partitions/ultrix.h index a74bf8e2d37..a3cc00b2bde 100644 --- a/fs/partitions/ultrix.h +++ b/fs/partitions/ultrix.h @@ -2,4 +2,4 @@ * fs/partitions/ultrix.h */ -int ultrix_partition(struct parsed_partitions *state, struct block_device *bdev); +int ultrix_partition(struct parsed_partitions *state); diff --git a/fs/pipe.c b/fs/pipe.c index 37ba29ff315..d79872eba09 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -11,6 +11,7 @@ #include <linux/module.h> #include <linux/init.h> #include <linux/fs.h> +#include <linux/log2.h> #include <linux/mount.h> #include <linux/pipe_fs_i.h> #include <linux/uio.h> @@ -18,11 +19,18 @@ #include <linux/pagemap.h> #include <linux/audit.h> #include <linux/syscalls.h> +#include <linux/fcntl.h> #include <asm/uaccess.h> #include <asm/ioctls.h> /* + * The max size that a non-root user is allowed to grow the pipe. Can + * be set by root in /proc/sys/fs/pipe-max-pages + */ +unsigned int pipe_max_pages = PIPE_DEF_BUFFERS * 16; + +/* * We use a start+len construction, which provides full use of the * allocated memory. * -- Florian Coosmann (FGC) @@ -390,7 +398,7 @@ redo: if (!buf->len) { buf->ops = NULL; ops->release(pipe, buf); - curbuf = (curbuf + 1) & (PIPE_BUFFERS-1); + curbuf = (curbuf + 1) & (pipe->buffers - 1); pipe->curbuf = curbuf; pipe->nrbufs = --bufs; do_wakeup = 1; @@ -472,7 +480,7 @@ pipe_write(struct kiocb *iocb, const struct iovec *_iov, chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */ if (pipe->nrbufs && chars != 0) { int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) & - (PIPE_BUFFERS-1); + (pipe->buffers - 1); struct pipe_buffer *buf = pipe->bufs + lastbuf; const struct pipe_buf_operations *ops = buf->ops; int offset = buf->offset + buf->len; @@ -518,8 +526,8 @@ redo1: break; } bufs = pipe->nrbufs; - if (bufs < PIPE_BUFFERS) { - int newbuf = (pipe->curbuf + bufs) & (PIPE_BUFFERS-1); + if (bufs < pipe->buffers) { + int newbuf = (pipe->curbuf + bufs) & (pipe->buffers-1); struct pipe_buffer *buf = pipe->bufs + newbuf; struct page *page = pipe->tmp_page; char *src; @@ -580,7 +588,7 @@ redo2: if (!total_len) break; } - if (bufs < PIPE_BUFFERS) + if (bufs < pipe->buffers) continue; if (filp->f_flags & O_NONBLOCK) { if (!ret) @@ -640,7 +648,7 @@ static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) nrbufs = pipe->nrbufs; while (--nrbufs >= 0) { count += pipe->bufs[buf].len; - buf = (buf+1) & (PIPE_BUFFERS-1); + buf = (buf+1) & (pipe->buffers - 1); } mutex_unlock(&inode->i_mutex); @@ -671,7 +679,7 @@ pipe_poll(struct file *filp, poll_table *wait) } if (filp->f_mode & FMODE_WRITE) { - mask |= (nrbufs < PIPE_BUFFERS) ? POLLOUT | POLLWRNORM : 0; + mask |= (nrbufs < pipe->buffers) ? POLLOUT | POLLWRNORM : 0; /* * Most Unices do not set POLLERR for FIFOs but on Linux they * behave exactly like pipes for poll(). @@ -877,25 +885,32 @@ struct pipe_inode_info * alloc_pipe_info(struct inode *inode) pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL); if (pipe) { - init_waitqueue_head(&pipe->wait); - pipe->r_counter = pipe->w_counter = 1; - pipe->inode = inode; + pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * PIPE_DEF_BUFFERS, GFP_KERNEL); + if (pipe->bufs) { + init_waitqueue_head(&pipe->wait); + pipe->r_counter = pipe->w_counter = 1; + pipe->inode = inode; + pipe->buffers = PIPE_DEF_BUFFERS; + return pipe; + } + kfree(pipe); } - return pipe; + return NULL; } void __free_pipe_info(struct pipe_inode_info *pipe) { int i; - for (i = 0; i < PIPE_BUFFERS; i++) { + for (i = 0; i < pipe->buffers; i++) { struct pipe_buffer *buf = pipe->bufs + i; if (buf->ops) buf->ops->release(pipe, buf); } if (pipe->tmp_page) __free_page(pipe->tmp_page); + kfree(pipe->bufs); kfree(pipe); } @@ -1094,6 +1109,89 @@ SYSCALL_DEFINE1(pipe, int __user *, fildes) } /* + * Allocate a new array of pipe buffers and copy the info over. Returns the + * pipe size if successful, or return -ERROR on error. + */ +static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg) +{ + struct pipe_buffer *bufs; + + /* + * Must be a power-of-2 currently + */ + if (!is_power_of_2(arg)) + return -EINVAL; + + /* + * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't + * expect a lot of shrink+grow operations, just free and allocate + * again like we would do for growing. If the pipe currently + * contains more buffers than arg, then return busy. + */ + if (arg < pipe->nrbufs) + return -EBUSY; + + bufs = kcalloc(arg, sizeof(struct pipe_buffer), GFP_KERNEL); + if (unlikely(!bufs)) + return -ENOMEM; + + /* + * The pipe array wraps around, so just start the new one at zero + * and adjust the indexes. + */ + if (pipe->nrbufs) { + const unsigned int tail = pipe->nrbufs & (pipe->buffers - 1); + const unsigned int head = pipe->nrbufs - tail; + + if (head) + memcpy(bufs, pipe->bufs + pipe->curbuf, head * sizeof(struct pipe_buffer)); + if (tail) + memcpy(bufs + head, pipe->bufs + pipe->curbuf, tail * sizeof(struct pipe_buffer)); + } + + pipe->curbuf = 0; + kfree(pipe->bufs); + pipe->bufs = bufs; + pipe->buffers = arg; + return arg; +} + +long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct pipe_inode_info *pipe; + long ret; + + pipe = file->f_path.dentry->d_inode->i_pipe; + if (!pipe) + return -EBADF; + + mutex_lock(&pipe->inode->i_mutex); + + switch (cmd) { + case F_SETPIPE_SZ: + if (!capable(CAP_SYS_ADMIN) && arg > pipe_max_pages) + return -EINVAL; + /* + * The pipe needs to be at least 2 pages large to + * guarantee POSIX behaviour. + */ + if (arg < 2) + return -EINVAL; + ret = pipe_set_size(pipe, arg); + break; + case F_GETPIPE_SZ: + ret = pipe->buffers; + break; + default: + ret = -EINVAL; + break; + } + + mutex_unlock(&pipe->inode->i_mutex); + return ret; +} + +/* * pipefs should _never_ be mounted by userland - too much of security hassle, * no real gain from having the whole whorehouse mounted. So we don't need * any operations on the root directory. However, we need a non-trivial diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 47f5b145f56..aea1d3f1ffb 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -634,6 +634,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, return err; } +#ifdef CONFIG_HUGETLB_PAGE static u64 huge_pte_to_pagemap_entry(pte_t pte, int offset) { u64 pme = 0; @@ -664,6 +665,7 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask, return err; } +#endif /* HUGETLB_PAGE */ /* * /proc/pid/pagemap - an array mapping virtual pages to pfns @@ -733,7 +735,9 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, pagemap_walk.pmd_entry = pagemap_pte_range; pagemap_walk.pte_hole = pagemap_pte_hole; +#ifdef CONFIG_HUGETLB_PAGE pagemap_walk.hugetlb_entry = pagemap_hugetlb_range; +#endif pagemap_walk.mm = mm; pagemap_walk.private = ± diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 788b5802a7c..655a4c52b8c 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -82,7 +82,7 @@ /* * There are three quota SMP locks. dq_list_lock protects all lists with quotas - * and quota formats, dqstats structure containing statistics about the lists + * and quota formats. * dq_data_lock protects data from dq_dqb and also mem_dqinfo structures and * also guards consistency of dquot->dq_dqb with inode->i_blocks, i_bytes. * i_blocks and i_bytes updates itself are guarded by i_lock acquired directly @@ -132,7 +132,9 @@ static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_state_lock); __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_data_lock); EXPORT_SYMBOL(dq_data_lock); +#if defined(CONFIG_QUOTA_DEBUG) || defined(CONFIG_PRINT_QUOTA_WARNING) static char *quotatypes[] = INITQFNAMES; +#endif static struct quota_format_type *quota_formats; /* List of registered formats */ static struct quota_module_name module_names[] = INIT_QUOTA_MODULE_NAMES; @@ -226,6 +228,10 @@ static struct hlist_head *dquot_hash; struct dqstats dqstats; EXPORT_SYMBOL(dqstats); +#ifdef CONFIG_SMP +struct dqstats *dqstats_pcpu; +EXPORT_SYMBOL(dqstats_pcpu); +#endif static qsize_t inode_get_rsv_space(struct inode *inode); static void __dquot_initialize(struct inode *inode, int type); @@ -273,7 +279,7 @@ static struct dquot *find_dquot(unsigned int hashent, struct super_block *sb, static inline void put_dquot_last(struct dquot *dquot) { list_add_tail(&dquot->dq_free, &free_dquots); - dqstats.free_dquots++; + dqstats_inc(DQST_FREE_DQUOTS); } static inline void remove_free_dquot(struct dquot *dquot) @@ -281,7 +287,7 @@ static inline void remove_free_dquot(struct dquot *dquot) if (list_empty(&dquot->dq_free)) return; list_del_init(&dquot->dq_free); - dqstats.free_dquots--; + dqstats_dec(DQST_FREE_DQUOTS); } static inline void put_inuse(struct dquot *dquot) @@ -289,12 +295,12 @@ static inline void put_inuse(struct dquot *dquot) /* We add to the back of inuse list so we don't have to restart * when traversing this list and we block */ list_add_tail(&dquot->dq_inuse, &inuse_list); - dqstats.allocated_dquots++; + dqstats_inc(DQST_ALLOC_DQUOTS); } static inline void remove_inuse(struct dquot *dquot) { - dqstats.allocated_dquots--; + dqstats_dec(DQST_ALLOC_DQUOTS); list_del(&dquot->dq_inuse); } /* @@ -317,14 +323,23 @@ static inline int mark_dquot_dirty(struct dquot *dquot) return dquot->dq_sb->dq_op->mark_dirty(dquot); } +/* Mark dquot dirty in atomic manner, and return it's old dirty flag state */ int dquot_mark_dquot_dirty(struct dquot *dquot) { + int ret = 1; + + /* If quota is dirty already, we don't have to acquire dq_list_lock */ + if (test_bit(DQ_MOD_B, &dquot->dq_flags)) + return 1; + spin_lock(&dq_list_lock); - if (!test_and_set_bit(DQ_MOD_B, &dquot->dq_flags)) + if (!test_and_set_bit(DQ_MOD_B, &dquot->dq_flags)) { list_add(&dquot->dq_dirty, &sb_dqopt(dquot->dq_sb)-> info[dquot->dq_type].dqi_dirty_list); + ret = 0; + } spin_unlock(&dq_list_lock); - return 0; + return ret; } EXPORT_SYMBOL(dquot_mark_dquot_dirty); @@ -550,8 +565,8 @@ int dquot_scan_active(struct super_block *sb, continue; /* Now we have active dquot so we can just increase use count */ atomic_inc(&dquot->dq_count); - dqstats.lookups++; spin_unlock(&dq_list_lock); + dqstats_inc(DQST_LOOKUPS); dqput(old_dquot); old_dquot = dquot; ret = fn(dquot, priv); @@ -596,8 +611,8 @@ int vfs_quota_sync(struct super_block *sb, int type, int wait) * holding reference so we can safely just increase * use count */ atomic_inc(&dquot->dq_count); - dqstats.lookups++; spin_unlock(&dq_list_lock); + dqstats_inc(DQST_LOOKUPS); sb->dq_op->write_dquot(dquot); dqput(dquot); spin_lock(&dq_list_lock); @@ -609,9 +624,7 @@ int vfs_quota_sync(struct super_block *sb, int type, int wait) if ((cnt == type || type == -1) && sb_has_quota_active(sb, cnt) && info_dirty(&dqopt->info[cnt])) sb->dq_op->write_info(sb, cnt); - spin_lock(&dq_list_lock); - dqstats.syncs++; - spin_unlock(&dq_list_lock); + dqstats_inc(DQST_SYNCS); mutex_unlock(&dqopt->dqonoff_mutex); if (!wait || (sb_dqopt(sb)->flags & DQUOT_QUOTA_SYS_FILE)) @@ -663,6 +676,22 @@ static void prune_dqcache(int count) } } +static int dqstats_read(unsigned int type) +{ + int count = 0; +#ifdef CONFIG_SMP + int cpu; + for_each_possible_cpu(cpu) + count += per_cpu_ptr(dqstats_pcpu, cpu)->stat[type]; + /* Statistics reading is racy, but absolute accuracy isn't required */ + if (count < 0) + count = 0; +#else + count = dqstats.stat[type]; +#endif + return count; +} + /* * This is called from kswapd when we think we need some * more memory @@ -675,7 +704,7 @@ static int shrink_dqcache_memory(int nr, gfp_t gfp_mask) prune_dqcache(nr); spin_unlock(&dq_list_lock); } - return (dqstats.free_dquots / 100) * sysctl_vfs_cache_pressure; + return (dqstats_read(DQST_FREE_DQUOTS)/100) * sysctl_vfs_cache_pressure; } static struct shrinker dqcache_shrinker = { @@ -703,10 +732,7 @@ void dqput(struct dquot *dquot) BUG(); } #endif - - spin_lock(&dq_list_lock); - dqstats.drops++; - spin_unlock(&dq_list_lock); + dqstats_inc(DQST_DROPS); we_slept: spin_lock(&dq_list_lock); if (atomic_read(&dquot->dq_count) > 1) { @@ -823,15 +849,15 @@ we_slept: put_inuse(dquot); /* hash it first so it can be found */ insert_dquot_hash(dquot); - dqstats.lookups++; spin_unlock(&dq_list_lock); + dqstats_inc(DQST_LOOKUPS); } else { if (!atomic_read(&dquot->dq_count)) remove_free_dquot(dquot); atomic_inc(&dquot->dq_count); - dqstats.cache_hits++; - dqstats.lookups++; spin_unlock(&dq_list_lock); + dqstats_inc(DQST_CACHE_HITS); + dqstats_inc(DQST_LOOKUPS); } /* Wait for dq_lock - after this we know that either dquot_release() is * already finished or it will be canceled due to dq_count > 1 test */ @@ -1677,16 +1703,19 @@ EXPORT_SYMBOL(dquot_free_inode); /* * Transfer the number of inode and blocks from one diskquota to an other. + * On success, dquot references in transfer_to are consumed and references + * to original dquots that need to be released are placed there. On failure, + * references are kept untouched. * * This operation can block, but only after everything is updated * A transaction must be started when entering this function. + * */ -static int __dquot_transfer(struct inode *inode, qid_t *chid, unsigned long mask) +int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) { qsize_t space, cur_space; qsize_t rsv_space = 0; - struct dquot *transfer_from[MAXQUOTAS]; - struct dquot *transfer_to[MAXQUOTAS]; + struct dquot *transfer_from[MAXQUOTAS] = {}; int cnt, ret = 0; char warntype_to[MAXQUOTAS]; char warntype_from_inodes[MAXQUOTAS], warntype_from_space[MAXQUOTAS]; @@ -1696,19 +1725,12 @@ static int __dquot_transfer(struct inode *inode, qid_t *chid, unsigned long mask if (IS_NOQUOTA(inode)) return 0; /* Initialize the arrays */ - for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - transfer_from[cnt] = NULL; - transfer_to[cnt] = NULL; + for (cnt = 0; cnt < MAXQUOTAS; cnt++) warntype_to[cnt] = QUOTA_NL_NOWARN; - } - for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - if (mask & (1 << cnt)) - transfer_to[cnt] = dqget(inode->i_sb, chid[cnt], cnt); - } down_write(&sb_dqopt(inode->i_sb)->dqptr_sem); if (IS_NOQUOTA(inode)) { /* File without quota accounting? */ up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); - goto put_all; + return 0; } spin_lock(&dq_data_lock); cur_space = inode_get_bytes(inode); @@ -1760,47 +1782,41 @@ static int __dquot_transfer(struct inode *inode, qid_t *chid, unsigned long mask mark_all_dquot_dirty(transfer_from); mark_all_dquot_dirty(transfer_to); - /* The reference we got is transferred to the inode */ + /* Pass back references to put */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) - transfer_to[cnt] = NULL; -warn_put_all: + transfer_to[cnt] = transfer_from[cnt]; +warn: flush_warnings(transfer_to, warntype_to); flush_warnings(transfer_from, warntype_from_inodes); flush_warnings(transfer_from, warntype_from_space); -put_all: - dqput_all(transfer_from); - dqput_all(transfer_to); return ret; over_quota: spin_unlock(&dq_data_lock); up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); - /* Clear dquot pointers we don't want to dqput() */ - for (cnt = 0; cnt < MAXQUOTAS; cnt++) - transfer_from[cnt] = NULL; - goto warn_put_all; + goto warn; } +EXPORT_SYMBOL(__dquot_transfer); /* Wrapper for transferring ownership of an inode for uid/gid only * Called from FSXXX_setattr() */ int dquot_transfer(struct inode *inode, struct iattr *iattr) { - qid_t chid[MAXQUOTAS]; - unsigned long mask = 0; + struct dquot *transfer_to[MAXQUOTAS] = {}; + struct super_block *sb = inode->i_sb; + int ret; - if (iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) { - mask |= 1 << USRQUOTA; - chid[USRQUOTA] = iattr->ia_uid; - } - if (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid) { - mask |= 1 << GRPQUOTA; - chid[GRPQUOTA] = iattr->ia_gid; - } - if (sb_any_quota_active(inode->i_sb) && !IS_NOQUOTA(inode)) { - dquot_initialize(inode); - return __dquot_transfer(inode, chid, mask); - } - return 0; + if (!sb_any_quota_active(sb) || IS_NOQUOTA(inode)) + return 0; + + if (iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) + transfer_to[USRQUOTA] = dqget(sb, iattr->ia_uid, USRQUOTA); + if (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid) + transfer_to[GRPQUOTA] = dqget(sb, iattr->ia_uid, GRPQUOTA); + + ret = __dquot_transfer(inode, transfer_to); + dqput_all(transfer_to); + return ret; } EXPORT_SYMBOL(dquot_transfer); @@ -2275,25 +2291,30 @@ static inline qsize_t stoqb(qsize_t space) } /* Generic routine for getting common part of quota structure */ -static void do_get_dqblk(struct dquot *dquot, struct if_dqblk *di) +static void do_get_dqblk(struct dquot *dquot, struct fs_disk_quota *di) { struct mem_dqblk *dm = &dquot->dq_dqb; + memset(di, 0, sizeof(*di)); + di->d_version = FS_DQUOT_VERSION; + di->d_flags = dquot->dq_type == USRQUOTA ? + XFS_USER_QUOTA : XFS_GROUP_QUOTA; + di->d_id = dquot->dq_id; + spin_lock(&dq_data_lock); - di->dqb_bhardlimit = stoqb(dm->dqb_bhardlimit); - di->dqb_bsoftlimit = stoqb(dm->dqb_bsoftlimit); - di->dqb_curspace = dm->dqb_curspace + dm->dqb_rsvspace; - di->dqb_ihardlimit = dm->dqb_ihardlimit; - di->dqb_isoftlimit = dm->dqb_isoftlimit; - di->dqb_curinodes = dm->dqb_curinodes; - di->dqb_btime = dm->dqb_btime; - di->dqb_itime = dm->dqb_itime; - di->dqb_valid = QIF_ALL; + di->d_blk_hardlimit = stoqb(dm->dqb_bhardlimit); + di->d_blk_softlimit = stoqb(dm->dqb_bsoftlimit); + di->d_ino_hardlimit = dm->dqb_ihardlimit; + di->d_ino_softlimit = dm->dqb_isoftlimit; + di->d_bcount = dm->dqb_curspace + dm->dqb_rsvspace; + di->d_icount = dm->dqb_curinodes; + di->d_btimer = dm->dqb_btime; + di->d_itimer = dm->dqb_itime; spin_unlock(&dq_data_lock); } int vfs_get_dqblk(struct super_block *sb, int type, qid_t id, - struct if_dqblk *di) + struct fs_disk_quota *di) { struct dquot *dquot; @@ -2307,51 +2328,70 @@ int vfs_get_dqblk(struct super_block *sb, int type, qid_t id, } EXPORT_SYMBOL(vfs_get_dqblk); +#define VFS_FS_DQ_MASK \ + (FS_DQ_BCOUNT | FS_DQ_BSOFT | FS_DQ_BHARD | \ + FS_DQ_ICOUNT | FS_DQ_ISOFT | FS_DQ_IHARD | \ + FS_DQ_BTIMER | FS_DQ_ITIMER) + /* Generic routine for setting common part of quota structure */ -static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di) +static int do_set_dqblk(struct dquot *dquot, struct fs_disk_quota *di) { struct mem_dqblk *dm = &dquot->dq_dqb; int check_blim = 0, check_ilim = 0; struct mem_dqinfo *dqi = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_type]; - if ((di->dqb_valid & QIF_BLIMITS && - (di->dqb_bhardlimit > dqi->dqi_maxblimit || - di->dqb_bsoftlimit > dqi->dqi_maxblimit)) || - (di->dqb_valid & QIF_ILIMITS && - (di->dqb_ihardlimit > dqi->dqi_maxilimit || - di->dqb_isoftlimit > dqi->dqi_maxilimit))) + if (di->d_fieldmask & ~VFS_FS_DQ_MASK) + return -EINVAL; + + if (((di->d_fieldmask & FS_DQ_BSOFT) && + (di->d_blk_softlimit > dqi->dqi_maxblimit)) || + ((di->d_fieldmask & FS_DQ_BHARD) && + (di->d_blk_hardlimit > dqi->dqi_maxblimit)) || + ((di->d_fieldmask & FS_DQ_ISOFT) && + (di->d_ino_softlimit > dqi->dqi_maxilimit)) || + ((di->d_fieldmask & FS_DQ_IHARD) && + (di->d_ino_hardlimit > dqi->dqi_maxilimit))) return -ERANGE; spin_lock(&dq_data_lock); - if (di->dqb_valid & QIF_SPACE) { - dm->dqb_curspace = di->dqb_curspace - dm->dqb_rsvspace; + if (di->d_fieldmask & FS_DQ_BCOUNT) { + dm->dqb_curspace = di->d_bcount - dm->dqb_rsvspace; check_blim = 1; set_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags); } - if (di->dqb_valid & QIF_BLIMITS) { - dm->dqb_bsoftlimit = qbtos(di->dqb_bsoftlimit); - dm->dqb_bhardlimit = qbtos(di->dqb_bhardlimit); + + if (di->d_fieldmask & FS_DQ_BSOFT) + dm->dqb_bsoftlimit = qbtos(di->d_blk_softlimit); + if (di->d_fieldmask & FS_DQ_BHARD) + dm->dqb_bhardlimit = qbtos(di->d_blk_hardlimit); + if (di->d_fieldmask & (FS_DQ_BSOFT | FS_DQ_BHARD)) { check_blim = 1; set_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags); } - if (di->dqb_valid & QIF_INODES) { - dm->dqb_curinodes = di->dqb_curinodes; + + if (di->d_fieldmask & FS_DQ_ICOUNT) { + dm->dqb_curinodes = di->d_icount; check_ilim = 1; set_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags); } - if (di->dqb_valid & QIF_ILIMITS) { - dm->dqb_isoftlimit = di->dqb_isoftlimit; - dm->dqb_ihardlimit = di->dqb_ihardlimit; + + if (di->d_fieldmask & FS_DQ_ISOFT) + dm->dqb_isoftlimit = di->d_ino_softlimit; + if (di->d_fieldmask & FS_DQ_IHARD) + dm->dqb_ihardlimit = di->d_ino_hardlimit; + if (di->d_fieldmask & (FS_DQ_ISOFT | FS_DQ_IHARD)) { check_ilim = 1; set_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags); } - if (di->dqb_valid & QIF_BTIME) { - dm->dqb_btime = di->dqb_btime; + + if (di->d_fieldmask & FS_DQ_BTIMER) { + dm->dqb_btime = di->d_btimer; check_blim = 1; set_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags); } - if (di->dqb_valid & QIF_ITIME) { - dm->dqb_itime = di->dqb_itime; + + if (di->d_fieldmask & FS_DQ_ITIMER) { + dm->dqb_itime = di->d_itimer; check_ilim = 1; set_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags); } @@ -2361,7 +2401,7 @@ static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di) dm->dqb_curspace < dm->dqb_bsoftlimit) { dm->dqb_btime = 0; clear_bit(DQ_BLKS_B, &dquot->dq_flags); - } else if (!(di->dqb_valid & QIF_BTIME)) + } else if (!(di->d_fieldmask & FS_DQ_BTIMER)) /* Set grace only if user hasn't provided his own... */ dm->dqb_btime = get_seconds() + dqi->dqi_bgrace; } @@ -2370,7 +2410,7 @@ static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di) dm->dqb_curinodes < dm->dqb_isoftlimit) { dm->dqb_itime = 0; clear_bit(DQ_INODES_B, &dquot->dq_flags); - } else if (!(di->dqb_valid & QIF_ITIME)) + } else if (!(di->d_fieldmask & FS_DQ_ITIMER)) /* Set grace only if user hasn't provided his own... */ dm->dqb_itime = get_seconds() + dqi->dqi_igrace; } @@ -2386,7 +2426,7 @@ static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di) } int vfs_set_dqblk(struct super_block *sb, int type, qid_t id, - struct if_dqblk *di) + struct fs_disk_quota *di) { struct dquot *dquot; int rc; @@ -2465,62 +2505,74 @@ const struct quotactl_ops vfs_quotactl_ops = { .set_dqblk = vfs_set_dqblk }; + +static int do_proc_dqstats(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ +#ifdef CONFIG_SMP + /* Update global table */ + unsigned int type = (int *)table->data - dqstats.stat; + dqstats.stat[type] = dqstats_read(type); +#endif + return proc_dointvec(table, write, buffer, lenp, ppos); +} + static ctl_table fs_dqstats_table[] = { { .procname = "lookups", - .data = &dqstats.lookups, + .data = &dqstats.stat[DQST_LOOKUPS], .maxlen = sizeof(int), .mode = 0444, - .proc_handler = proc_dointvec, + .proc_handler = do_proc_dqstats, }, { .procname = "drops", - .data = &dqstats.drops, + .data = &dqstats.stat[DQST_DROPS], .maxlen = sizeof(int), .mode = 0444, - .proc_handler = proc_dointvec, + .proc_handler = do_proc_dqstats, }, { .procname = "reads", - .data = &dqstats.reads, + .data = &dqstats.stat[DQST_READS], .maxlen = sizeof(int), .mode = 0444, - .proc_handler = proc_dointvec, + .proc_handler = do_proc_dqstats, }, { .procname = "writes", - .data = &dqstats.writes, + .data = &dqstats.stat[DQST_WRITES], .maxlen = sizeof(int), .mode = 0444, - .proc_handler = proc_dointvec, + .proc_handler = do_proc_dqstats, }, { .procname = "cache_hits", - .data = &dqstats.cache_hits, + .data = &dqstats.stat[DQST_CACHE_HITS], .maxlen = sizeof(int), .mode = 0444, - .proc_handler = proc_dointvec, + .proc_handler = do_proc_dqstats, }, { .procname = "allocated_dquots", - .data = &dqstats.allocated_dquots, + .data = &dqstats.stat[DQST_ALLOC_DQUOTS], .maxlen = sizeof(int), .mode = 0444, - .proc_handler = proc_dointvec, + .proc_handler = do_proc_dqstats, }, { .procname = "free_dquots", - .data = &dqstats.free_dquots, + .data = &dqstats.stat[DQST_FREE_DQUOTS], .maxlen = sizeof(int), .mode = 0444, - .proc_handler = proc_dointvec, + .proc_handler = do_proc_dqstats, }, { .procname = "syncs", - .data = &dqstats.syncs, + .data = &dqstats.stat[DQST_SYNCS], .maxlen = sizeof(int), .mode = 0444, - .proc_handler = proc_dointvec, + .proc_handler = do_proc_dqstats, }, #ifdef CONFIG_PRINT_QUOTA_WARNING { @@ -2572,6 +2624,13 @@ static int __init dquot_init(void) if (!dquot_hash) panic("Cannot create dquot hash table"); +#ifdef CONFIG_SMP + dqstats_pcpu = alloc_percpu(struct dqstats); + if (!dqstats_pcpu) + panic("Cannot create dquot stats table"); +#endif + memset(&dqstats, 0, sizeof(struct dqstats)); + /* Find power-of-two hlist_heads which can fit into allocation */ nr_hash = (1UL << order) * PAGE_SIZE / sizeof(struct hlist_head); dq_hash_bits = 0; diff --git a/fs/quota/quota.c b/fs/quota/quota.c index 95388f9b735..ce3dfd066f5 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -45,36 +45,22 @@ static int check_quotactl_permission(struct super_block *sb, int type, int cmd, return security_quotactl(cmd, type, id, sb); } +static void quota_sync_one(struct super_block *sb, void *arg) +{ + if (sb->s_qcop && sb->s_qcop->quota_sync) + sb->s_qcop->quota_sync(sb, *(int *)arg, 1); +} + static int quota_sync_all(int type) { - struct super_block *sb; int ret; if (type >= MAXQUOTAS) return -EINVAL; ret = security_quotactl(Q_SYNC, type, 0, NULL); - if (ret) - return ret; - - spin_lock(&sb_lock); -restart: - list_for_each_entry(sb, &super_blocks, s_list) { - if (!sb->s_qcop || !sb->s_qcop->quota_sync) - continue; - - sb->s_count++; - spin_unlock(&sb_lock); - down_read(&sb->s_umount); - if (sb->s_root) - sb->s_qcop->quota_sync(sb, type, 1); - up_read(&sb->s_umount); - spin_lock(&sb_lock); - if (__put_super_and_need_restart(sb)) - goto restart; - } - spin_unlock(&sb_lock); - - return 0; + if (!ret) + iterate_supers(quota_sync_one, &type); + return ret; } static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id, @@ -113,8 +99,6 @@ static int quota_getinfo(struct super_block *sb, int type, void __user *addr) struct if_dqinfo info; int ret; - if (!sb_has_quota_active(sb, type)) - return -ESRCH; if (!sb->s_qcop->get_info) return -ENOSYS; ret = sb->s_qcop->get_info(sb, type, &info); @@ -129,43 +113,80 @@ static int quota_setinfo(struct super_block *sb, int type, void __user *addr) if (copy_from_user(&info, addr, sizeof(info))) return -EFAULT; - if (!sb_has_quota_active(sb, type)) - return -ESRCH; if (!sb->s_qcop->set_info) return -ENOSYS; return sb->s_qcop->set_info(sb, type, &info); } +static void copy_to_if_dqblk(struct if_dqblk *dst, struct fs_disk_quota *src) +{ + dst->dqb_bhardlimit = src->d_blk_hardlimit; + dst->dqb_bsoftlimit = src->d_blk_softlimit; + dst->dqb_curspace = src->d_bcount; + dst->dqb_ihardlimit = src->d_ino_hardlimit; + dst->dqb_isoftlimit = src->d_ino_softlimit; + dst->dqb_curinodes = src->d_icount; + dst->dqb_btime = src->d_btimer; + dst->dqb_itime = src->d_itimer; + dst->dqb_valid = QIF_ALL; +} + static int quota_getquota(struct super_block *sb, int type, qid_t id, void __user *addr) { + struct fs_disk_quota fdq; struct if_dqblk idq; int ret; - if (!sb_has_quota_active(sb, type)) - return -ESRCH; if (!sb->s_qcop->get_dqblk) return -ENOSYS; - ret = sb->s_qcop->get_dqblk(sb, type, id, &idq); + ret = sb->s_qcop->get_dqblk(sb, type, id, &fdq); if (ret) return ret; + copy_to_if_dqblk(&idq, &fdq); if (copy_to_user(addr, &idq, sizeof(idq))) return -EFAULT; return 0; } +static void copy_from_if_dqblk(struct fs_disk_quota *dst, struct if_dqblk *src) +{ + dst->d_blk_hardlimit = src->dqb_bhardlimit; + dst->d_blk_softlimit = src->dqb_bsoftlimit; + dst->d_bcount = src->dqb_curspace; + dst->d_ino_hardlimit = src->dqb_ihardlimit; + dst->d_ino_softlimit = src->dqb_isoftlimit; + dst->d_icount = src->dqb_curinodes; + dst->d_btimer = src->dqb_btime; + dst->d_itimer = src->dqb_itime; + + dst->d_fieldmask = 0; + if (src->dqb_valid & QIF_BLIMITS) + dst->d_fieldmask |= FS_DQ_BSOFT | FS_DQ_BHARD; + if (src->dqb_valid & QIF_SPACE) + dst->d_fieldmask |= FS_DQ_BCOUNT; + if (src->dqb_valid & QIF_ILIMITS) + dst->d_fieldmask |= FS_DQ_ISOFT | FS_DQ_IHARD; + if (src->dqb_valid & QIF_INODES) + dst->d_fieldmask |= FS_DQ_ICOUNT; + if (src->dqb_valid & QIF_BTIME) + dst->d_fieldmask |= FS_DQ_BTIMER; + if (src->dqb_valid & QIF_ITIME) + dst->d_fieldmask |= FS_DQ_ITIMER; +} + static int quota_setquota(struct super_block *sb, int type, qid_t id, void __user *addr) { + struct fs_disk_quota fdq; struct if_dqblk idq; if (copy_from_user(&idq, addr, sizeof(idq))) return -EFAULT; - if (!sb_has_quota_active(sb, type)) - return -ESRCH; if (!sb->s_qcop->set_dqblk) return -ENOSYS; - return sb->s_qcop->set_dqblk(sb, type, id, &idq); + copy_from_if_dqblk(&fdq, &idq); + return sb->s_qcop->set_dqblk(sb, type, id, &fdq); } static int quota_setxstate(struct super_block *sb, int cmd, void __user *addr) @@ -199,9 +220,9 @@ static int quota_setxquota(struct super_block *sb, int type, qid_t id, if (copy_from_user(&fdq, addr, sizeof(fdq))) return -EFAULT; - if (!sb->s_qcop->set_xquota) + if (!sb->s_qcop->set_dqblk) return -ENOSYS; - return sb->s_qcop->set_xquota(sb, type, id, &fdq); + return sb->s_qcop->set_dqblk(sb, type, id, &fdq); } static int quota_getxquota(struct super_block *sb, int type, qid_t id, @@ -210,9 +231,9 @@ static int quota_getxquota(struct super_block *sb, int type, qid_t id, struct fs_disk_quota fdq; int ret; - if (!sb->s_qcop->get_xquota) + if (!sb->s_qcop->get_dqblk) return -ENOSYS; - ret = sb->s_qcop->get_xquota(sb, type, id, &fdq); + ret = sb->s_qcop->get_dqblk(sb, type, id, &fdq); if (!ret && copy_to_user(addr, &fdq, sizeof(fdq))) return -EFAULT; return ret; diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c index f81f4bcfb17..24f03407eeb 100644 --- a/fs/quota/quota_tree.c +++ b/fs/quota/quota_tree.c @@ -60,9 +60,17 @@ static ssize_t read_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf) static ssize_t write_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf) { struct super_block *sb = info->dqi_sb; + ssize_t ret; - return sb->s_op->quota_write(sb, info->dqi_type, buf, + ret = sb->s_op->quota_write(sb, info->dqi_type, buf, info->dqi_usable_bs, blk << info->dqi_blocksize_bits); + if (ret != info->dqi_usable_bs) { + q_warn(KERN_WARNING "VFS: dquota write failed on " + "dev %s\n", sb->s_id); + if (ret >= 0) + ret = -EIO; + } + return ret; } /* Remove empty block from list and return it */ @@ -152,7 +160,7 @@ static int remove_free_dqentry(struct qtree_mem_dqinfo *info, char *buf, dh->dqdh_next_free = dh->dqdh_prev_free = cpu_to_le32(0); /* No matter whether write succeeds block is out of list */ if (write_blk(info, blk, buf) < 0) - printk(KERN_ERR + q_warn(KERN_ERR "VFS: Can't write block (%u) with free entries.\n", blk); return 0; @@ -244,7 +252,7 @@ static uint find_free_dqentry(struct qtree_mem_dqinfo *info, if (le16_to_cpu(dh->dqdh_entries) + 1 >= qtree_dqstr_in_blk(info)) { *err = remove_free_dqentry(info, buf, blk); if (*err < 0) { - printk(KERN_ERR "VFS: find_free_dqentry(): Can't " + q_warn(KERN_ERR "VFS: find_free_dqentry(): Can't " "remove block (%u) from entry free list.\n", blk); goto out_buf; @@ -268,7 +276,7 @@ static uint find_free_dqentry(struct qtree_mem_dqinfo *info, #endif *err = write_blk(info, blk, buf); if (*err < 0) { - printk(KERN_ERR "VFS: find_free_dqentry(): Can't write quota " + q_warn(KERN_ERR "VFS: find_free_dqentry(): Can't write quota " "data block %u.\n", blk); goto out_buf; } @@ -303,7 +311,7 @@ static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot, } else { ret = read_blk(info, *treeblk, buf); if (ret < 0) { - printk(KERN_ERR "VFS: Can't read tree quota block " + q_warn(KERN_ERR "VFS: Can't read tree quota block " "%u.\n", *treeblk); goto out_buf; } @@ -365,7 +373,7 @@ int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) if (!dquot->dq_off) { ret = dq_insert_tree(info, dquot); if (ret < 0) { - printk(KERN_ERR "VFS: Error %zd occurred while " + q_warn(KERN_ERR "VFS: Error %zd occurred while " "creating quota.\n", ret); kfree(ddquot); return ret; @@ -377,14 +385,14 @@ int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) ret = sb->s_op->quota_write(sb, type, ddquot, info->dqi_entry_size, dquot->dq_off); if (ret != info->dqi_entry_size) { - printk(KERN_WARNING "VFS: dquota write failed on dev %s\n", + q_warn(KERN_WARNING "VFS: dquota write failed on dev %s\n", sb->s_id); if (ret >= 0) ret = -ENOSPC; } else { ret = 0; } - dqstats.writes++; + dqstats_inc(DQST_WRITES); kfree(ddquot); return ret; @@ -402,14 +410,14 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot, if (!buf) return -ENOMEM; if (dquot->dq_off >> info->dqi_blocksize_bits != blk) { - printk(KERN_ERR "VFS: Quota structure has offset to other " + q_warn(KERN_ERR "VFS: Quota structure has offset to other " "block (%u) than it should (%u).\n", blk, (uint)(dquot->dq_off >> info->dqi_blocksize_bits)); goto out_buf; } ret = read_blk(info, blk, buf); if (ret < 0) { - printk(KERN_ERR "VFS: Can't read quota data block %u\n", blk); + q_warn(KERN_ERR "VFS: Can't read quota data block %u\n", blk); goto out_buf; } dh = (struct qt_disk_dqdbheader *)buf; @@ -419,7 +427,7 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot, if (ret >= 0) ret = put_free_dqblk(info, buf, blk); if (ret < 0) { - printk(KERN_ERR "VFS: Can't move quota data block (%u) " + q_warn(KERN_ERR "VFS: Can't move quota data block (%u) " "to free list.\n", blk); goto out_buf; } @@ -432,14 +440,14 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot, /* Insert will write block itself */ ret = insert_free_dqentry(info, buf, blk); if (ret < 0) { - printk(KERN_ERR "VFS: Can't insert quota data " + q_warn(KERN_ERR "VFS: Can't insert quota data " "block (%u) to free entry list.\n", blk); goto out_buf; } } else { ret = write_blk(info, blk, buf); if (ret < 0) { - printk(KERN_ERR "VFS: Can't write quota data " + q_warn(KERN_ERR "VFS: Can't write quota data " "block %u\n", blk); goto out_buf; } @@ -464,7 +472,7 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot, return -ENOMEM; ret = read_blk(info, *blk, buf); if (ret < 0) { - printk(KERN_ERR "VFS: Can't read quota data block %u\n", *blk); + q_warn(KERN_ERR "VFS: Can't read quota data block %u\n", *blk); goto out_buf; } newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]); @@ -488,7 +496,7 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot, } else { ret = write_blk(info, *blk, buf); if (ret < 0) - printk(KERN_ERR "VFS: Can't write quota tree " + q_warn(KERN_ERR "VFS: Can't write quota tree " "block %u.\n", *blk); } } @@ -521,7 +529,7 @@ static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info, return -ENOMEM; ret = read_blk(info, blk, buf); if (ret < 0) { - printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk); + q_warn(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk); goto out_buf; } ddquot = buf + sizeof(struct qt_disk_dqdbheader); @@ -531,7 +539,7 @@ static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info, ddquot += info->dqi_entry_size; } if (i == qtree_dqstr_in_blk(info)) { - printk(KERN_ERR "VFS: Quota for id %u referenced " + q_warn(KERN_ERR "VFS: Quota for id %u referenced " "but not present.\n", dquot->dq_id); ret = -EIO; goto out_buf; @@ -556,7 +564,7 @@ static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info, return -ENOMEM; ret = read_blk(info, blk, buf); if (ret < 0) { - printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk); + q_warn(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk); goto out_buf; } ret = 0; @@ -599,7 +607,7 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) offset = find_dqentry(info, dquot); if (offset <= 0) { /* Entry not present? */ if (offset < 0) - printk(KERN_ERR "VFS: Can't read quota " + q_warn(KERN_ERR "VFS: Can't read quota " "structure for id %u.\n", dquot->dq_id); dquot->dq_off = 0; set_bit(DQ_FAKE_B, &dquot->dq_flags); @@ -617,7 +625,7 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) if (ret != info->dqi_entry_size) { if (ret >= 0) ret = -EIO; - printk(KERN_ERR "VFS: Error while reading quota " + q_warn(KERN_ERR "VFS: Error while reading quota " "structure for id %u.\n", dquot->dq_id); set_bit(DQ_FAKE_B, &dquot->dq_flags); memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk)); @@ -634,7 +642,7 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) spin_unlock(&dq_data_lock); kfree(ddquot); out: - dqstats.reads++; + dqstats_inc(DQST_READS); return ret; } EXPORT_SYMBOL(qtree_read_dquot); diff --git a/fs/quota/quota_tree.h b/fs/quota/quota_tree.h index a1ab8db81a5..ccc3e71fb1d 100644 --- a/fs/quota/quota_tree.h +++ b/fs/quota/quota_tree.h @@ -22,4 +22,10 @@ struct qt_disk_dqdbheader { #define QT_TREEOFF 1 /* Offset of tree in file in blocks */ +#define q_warn(fmt, args...) \ +do { \ + if (printk_ratelimit()) \ + printk(fmt, ## args); \ +} while(0) + #endif /* _LINUX_QUOTAIO_TREE_H */ diff --git a/fs/quota/quota_v1.c b/fs/quota/quota_v1.c index 2ae757e9c00..4af344c5852 100644 --- a/fs/quota/quota_v1.c +++ b/fs/quota/quota_v1.c @@ -71,7 +71,7 @@ static int v1_read_dqblk(struct dquot *dquot) dquot->dq_dqb.dqb_ihardlimit == 0 && dquot->dq_dqb.dqb_isoftlimit == 0) set_bit(DQ_FAKE_B, &dquot->dq_flags); - dqstats.reads++; + dqstats_inc(DQST_READS); return 0; } @@ -104,7 +104,7 @@ static int v1_commit_dqblk(struct dquot *dquot) ret = 0; out: - dqstats.writes++; + dqstats_inc(DQST_WRITES); return ret; } diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c index e3da02f4986..135206af145 100644 --- a/fs/quota/quota_v2.c +++ b/fs/quota/quota_v2.c @@ -63,7 +63,7 @@ static int v2_read_header(struct super_block *sb, int type, size = sb->s_op->quota_read(sb, type, (char *)dqhead, sizeof(struct v2_disk_dqheader), 0); if (size != sizeof(struct v2_disk_dqheader)) { - printk(KERN_WARNING "quota_v2: Failed header read:" + q_warn(KERN_WARNING "quota_v2: Failed header read:" " expected=%zd got=%zd\n", sizeof(struct v2_disk_dqheader), size); return 0; @@ -106,7 +106,7 @@ static int v2_read_file_info(struct super_block *sb, int type) size = sb->s_op->quota_read(sb, type, (char *)&dinfo, sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF); if (size != sizeof(struct v2_disk_dqinfo)) { - printk(KERN_WARNING "quota_v2: Can't read info structure on device %s.\n", + q_warn(KERN_WARNING "quota_v2: Can't read info structure on device %s.\n", sb->s_id); return -1; } @@ -167,7 +167,7 @@ static int v2_write_file_info(struct super_block *sb, int type) size = sb->s_op->quota_write(sb, type, (char *)&dinfo, sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF); if (size != sizeof(struct v2_disk_dqinfo)) { - printk(KERN_WARNING "Can't write info structure on device %s.\n", + q_warn(KERN_WARNING "Can't write info structure on device %s.\n", sb->s_id); return -1; } diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index c94853473ca..a5ebae70dc6 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -52,14 +52,13 @@ static struct backing_dev_info ramfs_backing_dev_info = { BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP, }; -struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev) +struct inode *ramfs_get_inode(struct super_block *sb, + const struct inode *dir, int mode, dev_t dev) { struct inode * inode = new_inode(sb); if (inode) { - inode->i_mode = mode; - inode->i_uid = current_fsuid(); - inode->i_gid = current_fsgid(); + inode_init_owner(inode, dir, mode); inode->i_mapping->a_ops = &ramfs_aops; inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info; mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); @@ -95,15 +94,10 @@ struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev) static int ramfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) { - struct inode * inode = ramfs_get_inode(dir->i_sb, mode, dev); + struct inode * inode = ramfs_get_inode(dir->i_sb, dir, mode, dev); int error = -ENOSPC; if (inode) { - if (dir->i_mode & S_ISGID) { - inode->i_gid = dir->i_gid; - if (S_ISDIR(mode)) - inode->i_mode |= S_ISGID; - } d_instantiate(dentry, inode); dget(dentry); /* Extra count - pin the dentry in core */ error = 0; @@ -130,13 +124,11 @@ static int ramfs_symlink(struct inode * dir, struct dentry *dentry, const char * struct inode *inode; int error = -ENOSPC; - inode = ramfs_get_inode(dir->i_sb, S_IFLNK|S_IRWXUGO, 0); + inode = ramfs_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0); if (inode) { int l = strlen(symname)+1; error = page_symlink(inode, symname, l); if (!error) { - if (dir->i_mode & S_ISGID) - inode->i_gid = dir->i_gid; d_instantiate(dentry, inode); dget(dentry); dir->i_mtime = dir->i_ctime = CURRENT_TIME; @@ -214,7 +206,7 @@ static int ramfs_parse_options(char *data, struct ramfs_mount_opts *opts) return 0; } -static int ramfs_fill_super(struct super_block * sb, void * data, int silent) +int ramfs_fill_super(struct super_block *sb, void *data, int silent) { struct ramfs_fs_info *fsi; struct inode *inode = NULL; @@ -241,7 +233,7 @@ static int ramfs_fill_super(struct super_block * sb, void * data, int silent) sb->s_op = &ramfs_ops; sb->s_time_gran = 1; - inode = ramfs_get_inode(sb, S_IFDIR | fsi->mount_opts.mode, 0); + inode = ramfs_get_inode(sb, NULL, S_IFDIR | fsi->mount_opts.mode, 0); if (!inode) { err = -ENOMEM; goto fail; diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index 1d9c12714c5..9977df9f3a5 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -147,7 +147,8 @@ static int reiserfs_sync_file(struct file *filp, barrier_done = reiserfs_commit_for_inode(inode); reiserfs_write_unlock(inode->i_sb); if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb)) - blkdev_issue_flush(inode->i_sb->s_bdev, NULL); + blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL, + BLKDEV_IFL_WAIT); if (barrier_done < 0) return barrier_done; return (err < 0) ? -EIO : 0; diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index dc2c65e0485..0f22fdaf54a 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -3076,9 +3076,10 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) ia_valid = attr->ia_valid &= ~(ATTR_KILL_SUID|ATTR_KILL_SGID); depth = reiserfs_write_lock_once(inode->i_sb); - if (attr->ia_valid & ATTR_SIZE) { + if (is_quota_modification(inode, attr)) dquot_initialize(inode); + if (attr->ia_valid & ATTR_SIZE) { /* version 2 items will be caught by the s_maxbytes check ** done for us in vmtruncate */ diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index d0c43cb99ff..ee78d4a0086 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -561,23 +561,13 @@ static int drop_new_inode(struct inode *inode) */ static int new_inode_init(struct inode *inode, struct inode *dir, int mode) { - - /* the quota init calls have to know who to charge the quota to, so - ** we have to set uid and gid here - */ - inode->i_uid = current_fsuid(); - inode->i_mode = mode; /* Make inode invalid - just in case we are going to drop it before * the initialization happens */ INODE_PKEY(inode)->k_objectid = 0; - - if (dir->i_mode & S_ISGID) { - inode->i_gid = dir->i_gid; - if (S_ISDIR(mode)) - inode->i_mode |= S_ISGID; - } else { - inode->i_gid = current_fsgid(); - } + /* the quota init calls have to know who to charge the quota to, so + ** we have to set uid and gid here + */ + inode_init_owner(inode, dir, mode); dquot_initialize(inode); return 0; } diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index e7cc00e636d..8c4cf273c67 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -723,11 +723,11 @@ out: (handler) = *(handlers)++) /* This is the implementation for the xattr plugin infrastructure */ -static inline struct xattr_handler * -find_xattr_handler_prefix(struct xattr_handler **handlers, +static inline const struct xattr_handler * +find_xattr_handler_prefix(const struct xattr_handler **handlers, const char *name) { - struct xattr_handler *xah; + const struct xattr_handler *xah; if (!handlers) return NULL; @@ -748,7 +748,7 @@ ssize_t reiserfs_getxattr(struct dentry * dentry, const char *name, void *buffer, size_t size) { - struct xattr_handler *handler; + const struct xattr_handler *handler; handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name); @@ -767,7 +767,7 @@ int reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { - struct xattr_handler *handler; + const struct xattr_handler *handler; handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name); @@ -784,7 +784,7 @@ reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value, */ int reiserfs_removexattr(struct dentry *dentry, const char *name) { - struct xattr_handler *handler; + const struct xattr_handler *handler; handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name); if (!handler || get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1) @@ -807,7 +807,7 @@ static int listxattr_filler(void *buf, const char *name, int namelen, size_t size; if (name[0] != '.' || (namelen != 1 && (name[1] != '.' || namelen != 2))) { - struct xattr_handler *handler; + const struct xattr_handler *handler; handler = find_xattr_handler_prefix(b->dentry->d_sb->s_xattr, name); if (!handler) /* Unsupported xattr name */ @@ -920,7 +920,7 @@ static int create_privroot(struct dentry *dentry) { return 0; } #endif /* Actual operations that are exported to VFS-land */ -struct xattr_handler *reiserfs_xattr_handlers[] = { +const struct xattr_handler *reiserfs_xattr_handlers[] = { #ifdef CONFIG_REISERFS_FS_XATTR &reiserfs_xattr_user_handler, &reiserfs_xattr_trusted_handler, diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index 9cdb759645a..536d697a8a2 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -500,7 +500,7 @@ static size_t posix_acl_access_list(struct dentry *dentry, char *list, return size; } -struct xattr_handler reiserfs_posix_acl_access_handler = { +const struct xattr_handler reiserfs_posix_acl_access_handler = { .prefix = POSIX_ACL_XATTR_ACCESS, .flags = ACL_TYPE_ACCESS, .get = posix_acl_get, @@ -520,7 +520,7 @@ static size_t posix_acl_default_list(struct dentry *dentry, char *list, return size; } -struct xattr_handler reiserfs_posix_acl_default_handler = { +const struct xattr_handler reiserfs_posix_acl_default_handler = { .prefix = POSIX_ACL_XATTR_DEFAULT, .flags = ACL_TYPE_DEFAULT, .get = posix_acl_get, diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c index 7271a477c04..237c6928d3c 100644 --- a/fs/reiserfs/xattr_security.c +++ b/fs/reiserfs/xattr_security.c @@ -111,7 +111,7 @@ void reiserfs_security_free(struct reiserfs_security_handle *sec) sec->value = NULL; } -struct xattr_handler reiserfs_xattr_security_handler = { +const struct xattr_handler reiserfs_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, .get = security_get, .set = security_set, diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c index 5b08aaca3da..9883736ce3e 100644 --- a/fs/reiserfs/xattr_trusted.c +++ b/fs/reiserfs/xattr_trusted.c @@ -48,7 +48,7 @@ static size_t trusted_list(struct dentry *dentry, char *list, size_t list_size, return len; } -struct xattr_handler reiserfs_xattr_trusted_handler = { +const struct xattr_handler reiserfs_xattr_trusted_handler = { .prefix = XATTR_TRUSTED_PREFIX, .get = trusted_get, .set = trusted_set, diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c index 75d59c49b91..45ae1a00013 100644 --- a/fs/reiserfs/xattr_user.c +++ b/fs/reiserfs/xattr_user.c @@ -44,7 +44,7 @@ static size_t user_list(struct dentry *dentry, char *list, size_t list_size, return len; } -struct xattr_handler reiserfs_xattr_user_handler = { +const struct xattr_handler reiserfs_xattr_user_handler = { .prefix = XATTR_USER_PREFIX, .get = user_get, .set = user_set, diff --git a/fs/smbfs/dir.c b/fs/smbfs/dir.c index 3e4803b4427..6c978428892 100644 --- a/fs/smbfs/dir.c +++ b/fs/smbfs/dir.c @@ -39,7 +39,7 @@ const struct file_operations smb_dir_operations = { .read = generic_read_dir, .readdir = smb_readdir, - .ioctl = smb_ioctl, + .unlocked_ioctl = smb_ioctl, .open = smb_dir_open, }; diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c index dbf6548bbf0..84ecf0e43f9 100644 --- a/fs/smbfs/file.c +++ b/fs/smbfs/file.c @@ -437,7 +437,7 @@ const struct file_operations smb_file_operations = .aio_read = smb_file_aio_read, .write = do_sync_write, .aio_write = smb_file_aio_write, - .ioctl = smb_ioctl, + .unlocked_ioctl = smb_ioctl, .mmap = smb_file_mmap, .open = smb_file_open, .release = smb_file_release, diff --git a/fs/smbfs/ioctl.c b/fs/smbfs/ioctl.c index dbae1f8ea26..07215312ad3 100644 --- a/fs/smbfs/ioctl.c +++ b/fs/smbfs/ioctl.c @@ -13,6 +13,7 @@ #include <linux/time.h> #include <linux/mm.h> #include <linux/highuid.h> +#include <linux/smp_lock.h> #include <linux/net.h> #include <linux/smb_fs.h> @@ -22,14 +23,14 @@ #include "proto.h" -int -smb_ioctl(struct inode *inode, struct file *filp, - unsigned int cmd, unsigned long arg) +long +smb_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { - struct smb_sb_info *server = server_from_inode(inode); + struct smb_sb_info *server = server_from_inode(filp->f_path.dentry->d_inode); struct smb_conn_opt opt; int result = -EINVAL; + lock_kernel(); switch (cmd) { uid16_t uid16; uid_t uid32; @@ -62,6 +63,7 @@ smb_ioctl(struct inode *inode, struct file *filp, default: break; } + unlock_kernel(); return result; } diff --git a/fs/smbfs/proto.h b/fs/smbfs/proto.h index 03f456c1b7d..05939a6f43e 100644 --- a/fs/smbfs/proto.h +++ b/fs/smbfs/proto.h @@ -67,7 +67,7 @@ extern const struct address_space_operations smb_file_aops; extern const struct file_operations smb_file_operations; extern const struct inode_operations smb_file_inode_operations; /* ioctl.c */ -extern int smb_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg); +extern long smb_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); /* smbiod.c */ extern void smbiod_wake_up(void); extern int smbiod_register_server(struct smb_sb_info *server); diff --git a/fs/smbfs/symlink.c b/fs/smbfs/symlink.c index 54350b59046..00b2909bd46 100644 --- a/fs/smbfs/symlink.c +++ b/fs/smbfs/symlink.c @@ -15,7 +15,6 @@ #include <linux/pagemap.h> #include <linux/net.h> #include <linux/namei.h> -#include <linux/slab.h> #include <asm/uaccess.h> #include <asm/system.h> diff --git a/fs/splice.c b/fs/splice.c index 9313b6124a2..ac22b00d86c 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -193,8 +193,8 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, break; } - if (pipe->nrbufs < PIPE_BUFFERS) { - int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1); + if (pipe->nrbufs < pipe->buffers) { + int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1); struct pipe_buffer *buf = pipe->bufs + newbuf; buf->page = spd->pages[page_nr]; @@ -214,7 +214,7 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, if (!--spd->nr_pages) break; - if (pipe->nrbufs < PIPE_BUFFERS) + if (pipe->nrbufs < pipe->buffers) continue; break; @@ -265,6 +265,36 @@ static void spd_release_page(struct splice_pipe_desc *spd, unsigned int i) page_cache_release(spd->pages[i]); } +/* + * Check if we need to grow the arrays holding pages and partial page + * descriptions. + */ +int splice_grow_spd(struct pipe_inode_info *pipe, struct splice_pipe_desc *spd) +{ + if (pipe->buffers <= PIPE_DEF_BUFFERS) + return 0; + + spd->pages = kmalloc(pipe->buffers * sizeof(struct page *), GFP_KERNEL); + spd->partial = kmalloc(pipe->buffers * sizeof(struct partial_page), GFP_KERNEL); + + if (spd->pages && spd->partial) + return 0; + + kfree(spd->pages); + kfree(spd->partial); + return -ENOMEM; +} + +void splice_shrink_spd(struct pipe_inode_info *pipe, + struct splice_pipe_desc *spd) +{ + if (pipe->buffers <= PIPE_DEF_BUFFERS) + return; + + kfree(spd->pages); + kfree(spd->partial); +} + static int __generic_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, @@ -272,8 +302,8 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, { struct address_space *mapping = in->f_mapping; unsigned int loff, nr_pages, req_pages; - struct page *pages[PIPE_BUFFERS]; - struct partial_page partial[PIPE_BUFFERS]; + struct page *pages[PIPE_DEF_BUFFERS]; + struct partial_page partial[PIPE_DEF_BUFFERS]; struct page *page; pgoff_t index, end_index; loff_t isize; @@ -286,15 +316,18 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, .spd_release = spd_release_page, }; + if (splice_grow_spd(pipe, &spd)) + return -ENOMEM; + index = *ppos >> PAGE_CACHE_SHIFT; loff = *ppos & ~PAGE_CACHE_MASK; req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - nr_pages = min(req_pages, (unsigned)PIPE_BUFFERS); + nr_pages = min(req_pages, pipe->buffers); /* * Lookup the (hopefully) full range of pages we need. */ - spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages); + spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, spd.pages); index += spd.nr_pages; /* @@ -335,7 +368,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, unlock_page(page); } - pages[spd.nr_pages++] = page; + spd.pages[spd.nr_pages++] = page; index++; } @@ -356,7 +389,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, * this_len is the max we'll use from this page */ this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff); - page = pages[page_nr]; + page = spd.pages[page_nr]; if (PageReadahead(page)) page_cache_async_readahead(mapping, &in->f_ra, in, @@ -393,8 +426,8 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, error = -ENOMEM; break; } - page_cache_release(pages[page_nr]); - pages[page_nr] = page; + page_cache_release(spd.pages[page_nr]); + spd.pages[page_nr] = page; } /* * page was already under io and is now done, great @@ -451,8 +484,8 @@ fill_it: len = this_len; } - partial[page_nr].offset = loff; - partial[page_nr].len = this_len; + spd.partial[page_nr].offset = loff; + spd.partial[page_nr].len = this_len; len -= this_len; loff = 0; spd.nr_pages++; @@ -464,12 +497,13 @@ fill_it: * we got, 'nr_pages' is how many pages are in the map. */ while (page_nr < nr_pages) - page_cache_release(pages[page_nr++]); + page_cache_release(spd.pages[page_nr++]); in->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; if (spd.nr_pages) - return splice_to_pipe(pipe, &spd); + error = splice_to_pipe(pipe, &spd); + splice_shrink_spd(pipe, &spd); return error; } @@ -560,9 +594,9 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos, unsigned int nr_pages; unsigned int nr_freed; size_t offset; - struct page *pages[PIPE_BUFFERS]; - struct partial_page partial[PIPE_BUFFERS]; - struct iovec vec[PIPE_BUFFERS]; + struct page *pages[PIPE_DEF_BUFFERS]; + struct partial_page partial[PIPE_DEF_BUFFERS]; + struct iovec *vec, __vec[PIPE_DEF_BUFFERS]; pgoff_t index; ssize_t res; size_t this_len; @@ -576,11 +610,22 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos, .spd_release = spd_release_page, }; + if (splice_grow_spd(pipe, &spd)) + return -ENOMEM; + + res = -ENOMEM; + vec = __vec; + if (pipe->buffers > PIPE_DEF_BUFFERS) { + vec = kmalloc(pipe->buffers * sizeof(struct iovec), GFP_KERNEL); + if (!vec) + goto shrink_ret; + } + index = *ppos >> PAGE_CACHE_SHIFT; offset = *ppos & ~PAGE_CACHE_MASK; nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - for (i = 0; i < nr_pages && i < PIPE_BUFFERS && len; i++) { + for (i = 0; i < nr_pages && i < pipe->buffers && len; i++) { struct page *page; page = alloc_page(GFP_USER); @@ -591,7 +636,7 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos, this_len = min_t(size_t, len, PAGE_CACHE_SIZE - offset); vec[i].iov_base = (void __user *) page_address(page); vec[i].iov_len = this_len; - pages[i] = page; + spd.pages[i] = page; spd.nr_pages++; len -= this_len; offset = 0; @@ -610,11 +655,11 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos, nr_freed = 0; for (i = 0; i < spd.nr_pages; i++) { this_len = min_t(size_t, vec[i].iov_len, res); - partial[i].offset = 0; - partial[i].len = this_len; + spd.partial[i].offset = 0; + spd.partial[i].len = this_len; if (!this_len) { - __free_page(pages[i]); - pages[i] = NULL; + __free_page(spd.pages[i]); + spd.pages[i] = NULL; nr_freed++; } res -= this_len; @@ -625,13 +670,18 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos, if (res > 0) *ppos += res; +shrink_ret: + if (vec != __vec) + kfree(vec); + splice_shrink_spd(pipe, &spd); return res; err: for (i = 0; i < spd.nr_pages; i++) - __free_page(pages[i]); + __free_page(spd.pages[i]); - return error; + res = error; + goto shrink_ret; } EXPORT_SYMBOL(default_file_splice_read); @@ -784,7 +834,7 @@ int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd, if (!buf->len) { buf->ops = NULL; ops->release(pipe, buf); - pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1); + pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1); pipe->nrbufs--; if (pipe->inode) sd->need_wakeup = true; @@ -1211,7 +1261,7 @@ out_release: * If we did an incomplete transfer we must release * the pipe buffers in question: */ - for (i = 0; i < PIPE_BUFFERS; i++) { + for (i = 0; i < pipe->buffers; i++) { struct pipe_buffer *buf = pipe->bufs + i; if (buf->ops) { @@ -1371,7 +1421,8 @@ static long do_splice(struct file *in, loff_t __user *off_in, */ static int get_iovec_page_array(const struct iovec __user *iov, unsigned int nr_vecs, struct page **pages, - struct partial_page *partial, int aligned) + struct partial_page *partial, int aligned, + unsigned int pipe_buffers) { int buffers = 0, error = 0; @@ -1414,8 +1465,8 @@ static int get_iovec_page_array(const struct iovec __user *iov, break; npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (npages > PIPE_BUFFERS - buffers) - npages = PIPE_BUFFERS - buffers; + if (npages > pipe_buffers - buffers) + npages = pipe_buffers - buffers; error = get_user_pages_fast((unsigned long)base, npages, 0, &pages[buffers]); @@ -1450,7 +1501,7 @@ static int get_iovec_page_array(const struct iovec __user *iov, * or if we mapped the max number of pages that we have * room for. */ - if (error < npages || buffers == PIPE_BUFFERS) + if (error < npages || buffers == pipe_buffers) break; nr_vecs--; @@ -1593,8 +1644,8 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov, unsigned long nr_segs, unsigned int flags) { struct pipe_inode_info *pipe; - struct page *pages[PIPE_BUFFERS]; - struct partial_page partial[PIPE_BUFFERS]; + struct page *pages[PIPE_DEF_BUFFERS]; + struct partial_page partial[PIPE_DEF_BUFFERS]; struct splice_pipe_desc spd = { .pages = pages, .partial = partial, @@ -1602,17 +1653,25 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov, .ops = &user_page_pipe_buf_ops, .spd_release = spd_release_page, }; + long ret; pipe = pipe_info(file->f_path.dentry->d_inode); if (!pipe) return -EBADF; - spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial, - flags & SPLICE_F_GIFT); + if (splice_grow_spd(pipe, &spd)) + return -ENOMEM; + + spd.nr_pages = get_iovec_page_array(iov, nr_segs, spd.pages, + spd.partial, flags & SPLICE_F_GIFT, + pipe->buffers); if (spd.nr_pages <= 0) - return spd.nr_pages; + ret = spd.nr_pages; + else + ret = splice_to_pipe(pipe, &spd); - return splice_to_pipe(pipe, &spd); + splice_shrink_spd(pipe, &spd); + return ret; } /* @@ -1738,13 +1797,13 @@ static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) * Check ->nrbufs without the inode lock first. This function * is speculative anyways, so missing one is ok. */ - if (pipe->nrbufs < PIPE_BUFFERS) + if (pipe->nrbufs < pipe->buffers) return 0; ret = 0; pipe_lock(pipe); - while (pipe->nrbufs >= PIPE_BUFFERS) { + while (pipe->nrbufs >= pipe->buffers) { if (!pipe->readers) { send_sig(SIGPIPE, current, 0); ret = -EPIPE; @@ -1810,7 +1869,7 @@ retry: * Cannot make any progress, because either the input * pipe is empty or the output pipe is full. */ - if (!ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS) { + if (!ipipe->nrbufs || opipe->nrbufs >= opipe->buffers) { /* Already processed some buffers, break */ if (ret) break; @@ -1831,7 +1890,7 @@ retry: } ibuf = ipipe->bufs + ipipe->curbuf; - nbuf = (opipe->curbuf + opipe->nrbufs) % PIPE_BUFFERS; + nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1); obuf = opipe->bufs + nbuf; if (len >= ibuf->len) { @@ -1841,7 +1900,7 @@ retry: *obuf = *ibuf; ibuf->ops = NULL; opipe->nrbufs++; - ipipe->curbuf = (ipipe->curbuf + 1) % PIPE_BUFFERS; + ipipe->curbuf = (ipipe->curbuf + 1) & (ipipe->buffers - 1); ipipe->nrbufs--; input_wakeup = true; } else { @@ -1914,11 +1973,11 @@ static int link_pipe(struct pipe_inode_info *ipipe, * If we have iterated all input buffers or ran out of * output room, break. */ - if (i >= ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS) + if (i >= ipipe->nrbufs || opipe->nrbufs >= opipe->buffers) break; - ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1)); - nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1); + ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (ipipe->buffers-1)); + nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1); /* * Get a reference to this pipe buffer, diff --git a/fs/statfs.c b/fs/statfs.c new file mode 100644 index 00000000000..4ef021f3b61 --- /dev/null +++ b/fs/statfs.c @@ -0,0 +1,196 @@ +#include <linux/syscalls.h> +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/namei.h> +#include <linux/statfs.h> +#include <linux/security.h> +#include <linux/uaccess.h> + +int vfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + int retval = -ENODEV; + + if (dentry) { + retval = -ENOSYS; + if (dentry->d_sb->s_op->statfs) { + memset(buf, 0, sizeof(*buf)); + retval = security_sb_statfs(dentry); + if (retval) + return retval; + retval = dentry->d_sb->s_op->statfs(dentry, buf); + if (retval == 0 && buf->f_frsize == 0) + buf->f_frsize = buf->f_bsize; + } + } + return retval; +} + +EXPORT_SYMBOL(vfs_statfs); + +static int vfs_statfs_native(struct dentry *dentry, struct statfs *buf) +{ + struct kstatfs st; + int retval; + + retval = vfs_statfs(dentry, &st); + if (retval) + return retval; + + if (sizeof(*buf) == sizeof(st)) + memcpy(buf, &st, sizeof(st)); + else { + if (sizeof buf->f_blocks == 4) { + if ((st.f_blocks | st.f_bfree | st.f_bavail | + st.f_bsize | st.f_frsize) & + 0xffffffff00000000ULL) + return -EOVERFLOW; + /* + * f_files and f_ffree may be -1; it's okay to stuff + * that into 32 bits + */ + if (st.f_files != -1 && + (st.f_files & 0xffffffff00000000ULL)) + return -EOVERFLOW; + if (st.f_ffree != -1 && + (st.f_ffree & 0xffffffff00000000ULL)) + return -EOVERFLOW; + } + + buf->f_type = st.f_type; + buf->f_bsize = st.f_bsize; + buf->f_blocks = st.f_blocks; + buf->f_bfree = st.f_bfree; + buf->f_bavail = st.f_bavail; + buf->f_files = st.f_files; + buf->f_ffree = st.f_ffree; + buf->f_fsid = st.f_fsid; + buf->f_namelen = st.f_namelen; + buf->f_frsize = st.f_frsize; + memset(buf->f_spare, 0, sizeof(buf->f_spare)); + } + return 0; +} + +static int vfs_statfs64(struct dentry *dentry, struct statfs64 *buf) +{ + struct kstatfs st; + int retval; + + retval = vfs_statfs(dentry, &st); + if (retval) + return retval; + + if (sizeof(*buf) == sizeof(st)) + memcpy(buf, &st, sizeof(st)); + else { + buf->f_type = st.f_type; + buf->f_bsize = st.f_bsize; + buf->f_blocks = st.f_blocks; + buf->f_bfree = st.f_bfree; + buf->f_bavail = st.f_bavail; + buf->f_files = st.f_files; + buf->f_ffree = st.f_ffree; + buf->f_fsid = st.f_fsid; + buf->f_namelen = st.f_namelen; + buf->f_frsize = st.f_frsize; + memset(buf->f_spare, 0, sizeof(buf->f_spare)); + } + return 0; +} + +SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct statfs __user *, buf) +{ + struct path path; + int error; + + error = user_path(pathname, &path); + if (!error) { + struct statfs tmp; + error = vfs_statfs_native(path.dentry, &tmp); + if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) + error = -EFAULT; + path_put(&path); + } + return error; +} + +SYSCALL_DEFINE3(statfs64, const char __user *, pathname, size_t, sz, struct statfs64 __user *, buf) +{ + struct path path; + long error; + + if (sz != sizeof(*buf)) + return -EINVAL; + error = user_path(pathname, &path); + if (!error) { + struct statfs64 tmp; + error = vfs_statfs64(path.dentry, &tmp); + if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) + error = -EFAULT; + path_put(&path); + } + return error; +} + +SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct statfs __user *, buf) +{ + struct file *file; + struct statfs tmp; + int error; + + error = -EBADF; + file = fget(fd); + if (!file) + goto out; + error = vfs_statfs_native(file->f_path.dentry, &tmp); + if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) + error = -EFAULT; + fput(file); +out: + return error; +} + +SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user *, buf) +{ + struct file *file; + struct statfs64 tmp; + int error; + + if (sz != sizeof(*buf)) + return -EINVAL; + + error = -EBADF; + file = fget(fd); + if (!file) + goto out; + error = vfs_statfs64(file->f_path.dentry, &tmp); + if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) + error = -EFAULT; + fput(file); +out: + return error; +} + +SYSCALL_DEFINE2(ustat, unsigned, dev, struct ustat __user *, ubuf) +{ + struct super_block *s; + struct ustat tmp; + struct kstatfs sbuf; + int err; + + s = user_get_super(new_decode_dev(dev)); + if (!s) + return -EINVAL; + + err = vfs_statfs(s->s_root, &sbuf); + drop_super(s); + if (err) + return err; + + memset(&tmp,0,sizeof(struct ustat)); + tmp.f_tfree = sbuf.f_bfree; + tmp.f_tinode = sbuf.f_ffree; + + return copy_to_user(ubuf, &tmp, sizeof(struct ustat)) ? -EFAULT : 0; +} diff --git a/fs/super.c b/fs/super.c index 1527e6a0ee3..69688b15f1f 100644 --- a/fs/super.c +++ b/fs/super.c @@ -22,23 +22,15 @@ #include <linux/module.h> #include <linux/slab.h> -#include <linux/init.h> -#include <linux/smp_lock.h> #include <linux/acct.h> #include <linux/blkdev.h> #include <linux/quotaops.h> -#include <linux/namei.h> #include <linux/mount.h> #include <linux/security.h> -#include <linux/syscalls.h> -#include <linux/vfs.h> #include <linux/writeback.h> /* for the emergency remount stuff */ #include <linux/idr.h> -#include <linux/kobject.h> #include <linux/mutex.h> -#include <linux/file.h> #include <linux/backing-dev.h> -#include <asm/uaccess.h> #include "internal.h" @@ -93,9 +85,10 @@ static struct super_block *alloc_super(struct file_system_type *type) * subclass. */ down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING); - s->s_count = S_BIAS; + s->s_count = 1; atomic_set(&s->s_active, 1); mutex_init(&s->s_vfs_rename_mutex); + lockdep_set_class(&s->s_vfs_rename_mutex, &type->s_vfs_rename_key); mutex_init(&s->s_dquot.dqio_mutex); mutex_init(&s->s_dquot.dqonoff_mutex); init_rwsem(&s->s_dquot.dqptr_sem); @@ -127,39 +120,14 @@ static inline void destroy_super(struct super_block *s) /* Superblock refcounting */ /* - * Drop a superblock's refcount. Returns non-zero if the superblock was - * destroyed. The caller must hold sb_lock. + * Drop a superblock's refcount. The caller must hold sb_lock. */ -static int __put_super(struct super_block *sb) +void __put_super(struct super_block *sb) { - int ret = 0; - if (!--sb->s_count) { + list_del_init(&sb->s_list); destroy_super(sb); - ret = 1; } - return ret; -} - -/* - * Drop a superblock's refcount. - * Returns non-zero if the superblock is about to be destroyed and - * at least is already removed from super_blocks list, so if we are - * making a loop through super blocks then we need to restart. - * The caller must hold sb_lock. - */ -int __put_super_and_need_restart(struct super_block *sb) -{ - /* check for race with generic_shutdown_super() */ - if (list_empty(&sb->s_list)) { - /* super block is removed, need to restart... */ - __put_super(sb); - return 1; - } - /* can't be the last, since s_list is still in use */ - sb->s_count--; - BUG_ON(sb->s_count == 0); - return 0; } /** @@ -178,57 +146,48 @@ void put_super(struct super_block *sb) /** - * deactivate_super - drop an active reference to superblock + * deactivate_locked_super - drop an active reference to superblock * @s: superblock to deactivate * - * Drops an active reference to superblock, acquiring a temprory one if - * there is no active references left. In that case we lock superblock, + * Drops an active reference to superblock, converting it into a temprory + * one if there is no other active references left. In that case we * tell fs driver to shut it down and drop the temporary reference we * had just acquired. + * + * Caller holds exclusive lock on superblock; that lock is released. */ -void deactivate_super(struct super_block *s) +void deactivate_locked_super(struct super_block *s) { struct file_system_type *fs = s->s_type; - if (atomic_dec_and_lock(&s->s_active, &sb_lock)) { - s->s_count -= S_BIAS-1; - spin_unlock(&sb_lock); + if (atomic_dec_and_test(&s->s_active)) { vfs_dq_off(s, 0); - down_write(&s->s_umount); fs->kill_sb(s); put_filesystem(fs); put_super(s); + } else { + up_write(&s->s_umount); } } -EXPORT_SYMBOL(deactivate_super); +EXPORT_SYMBOL(deactivate_locked_super); /** - * deactivate_locked_super - drop an active reference to superblock + * deactivate_super - drop an active reference to superblock * @s: superblock to deactivate * - * Equivalent of up_write(&s->s_umount); deactivate_super(s);, except that - * it does not unlock it until it's all over. As the result, it's safe to - * use to dispose of new superblock on ->get_sb() failure exits - nobody - * will see the sucker until it's all over. Equivalent using up_write + - * deactivate_super is safe for that purpose only if superblock is either - * safe to use or has NULL ->s_root when we unlock. + * Variant of deactivate_locked_super(), except that superblock is *not* + * locked by caller. If we are going to drop the final active reference, + * lock will be acquired prior to that. */ -void deactivate_locked_super(struct super_block *s) +void deactivate_super(struct super_block *s) { - struct file_system_type *fs = s->s_type; - if (atomic_dec_and_lock(&s->s_active, &sb_lock)) { - s->s_count -= S_BIAS-1; - spin_unlock(&sb_lock); - vfs_dq_off(s, 0); - fs->kill_sb(s); - put_filesystem(fs); - put_super(s); - } else { - up_write(&s->s_umount); + if (!atomic_add_unless(&s->s_active, -1, 1)) { + down_write(&s->s_umount); + deactivate_locked_super(s); } } -EXPORT_SYMBOL(deactivate_locked_super); +EXPORT_SYMBOL(deactivate_super); /** * grab_super - acquire an active reference @@ -243,22 +202,17 @@ EXPORT_SYMBOL(deactivate_locked_super); */ static int grab_super(struct super_block *s) __releases(sb_lock) { + if (atomic_inc_not_zero(&s->s_active)) { + spin_unlock(&sb_lock); + return 1; + } + /* it's going away */ s->s_count++; spin_unlock(&sb_lock); + /* wait for it to die */ down_write(&s->s_umount); - if (s->s_root) { - spin_lock(&sb_lock); - if (s->s_count > S_BIAS) { - atomic_inc(&s->s_active); - s->s_count--; - spin_unlock(&sb_lock); - return 1; - } - spin_unlock(&sb_lock); - } up_write(&s->s_umount); put_super(s); - yield(); return 0; } @@ -321,8 +275,7 @@ void generic_shutdown_super(struct super_block *sb) } spin_lock(&sb_lock); /* should be initialized for __put_super_and_need_restart() */ - list_del_init(&sb->s_list); - list_del(&sb->s_instances); + list_del_init(&sb->s_instances); spin_unlock(&sb_lock); up_write(&sb->s_umount); } @@ -357,6 +310,7 @@ retry: up_write(&s->s_umount); destroy_super(s); } + down_write(&old->s_umount); return old; } } @@ -408,11 +362,12 @@ EXPORT_SYMBOL(drop_super); */ void sync_supers(void) { - struct super_block *sb; + struct super_block *sb, *n; spin_lock(&sb_lock); -restart: - list_for_each_entry(sb, &super_blocks, s_list) { + list_for_each_entry_safe(sb, n, &super_blocks, s_list) { + if (list_empty(&sb->s_instances)) + continue; if (sb->s_op->write_super && sb->s_dirt) { sb->s_count++; spin_unlock(&sb_lock); @@ -423,14 +378,43 @@ restart: up_read(&sb->s_umount); spin_lock(&sb_lock); - if (__put_super_and_need_restart(sb)) - goto restart; + __put_super(sb); } } spin_unlock(&sb_lock); } /** + * iterate_supers - call function for all active superblocks + * @f: function to call + * @arg: argument to pass to it + * + * Scans the superblock list and calls given function, passing it + * locked superblock and given argument. + */ +void iterate_supers(void (*f)(struct super_block *, void *), void *arg) +{ + struct super_block *sb, *n; + + spin_lock(&sb_lock); + list_for_each_entry_safe(sb, n, &super_blocks, s_list) { + if (list_empty(&sb->s_instances)) + continue; + sb->s_count++; + spin_unlock(&sb_lock); + + down_read(&sb->s_umount); + if (sb->s_root) + f(sb, arg); + up_read(&sb->s_umount); + + spin_lock(&sb_lock); + __put_super(sb); + } + spin_unlock(&sb_lock); +} + +/** * get_super - get the superblock of a device * @bdev: device to get the superblock for * @@ -438,7 +422,7 @@ restart: * mounted on the device given. %NULL is returned if no match is found. */ -struct super_block * get_super(struct block_device *bdev) +struct super_block *get_super(struct block_device *bdev) { struct super_block *sb; @@ -448,17 +432,20 @@ struct super_block * get_super(struct block_device *bdev) spin_lock(&sb_lock); rescan: list_for_each_entry(sb, &super_blocks, s_list) { + if (list_empty(&sb->s_instances)) + continue; if (sb->s_bdev == bdev) { sb->s_count++; spin_unlock(&sb_lock); down_read(&sb->s_umount); + /* still alive? */ if (sb->s_root) return sb; up_read(&sb->s_umount); - /* restart only when sb is no longer on the list */ + /* nope, got unmounted */ spin_lock(&sb_lock); - if (__put_super_and_need_restart(sb)) - goto rescan; + __put_super(sb); + goto rescan; } } spin_unlock(&sb_lock); @@ -473,7 +460,7 @@ EXPORT_SYMBOL(get_super); * * Scans the superblock list and finds the superblock of the file system * mounted on the device given. Returns the superblock with an active - * reference and s_umount held exclusively or %NULL if none was found. + * reference or %NULL if none was found. */ struct super_block *get_active_super(struct block_device *bdev) { @@ -482,81 +469,49 @@ struct super_block *get_active_super(struct block_device *bdev) if (!bdev) return NULL; +restart: spin_lock(&sb_lock); list_for_each_entry(sb, &super_blocks, s_list) { - if (sb->s_bdev != bdev) + if (list_empty(&sb->s_instances)) continue; - - sb->s_count++; - spin_unlock(&sb_lock); - down_write(&sb->s_umount); - if (sb->s_root) { - spin_lock(&sb_lock); - if (sb->s_count > S_BIAS) { - atomic_inc(&sb->s_active); - sb->s_count--; - spin_unlock(&sb_lock); + if (sb->s_bdev == bdev) { + if (grab_super(sb)) /* drops sb_lock */ return sb; - } - spin_unlock(&sb_lock); + else + goto restart; } - up_write(&sb->s_umount); - put_super(sb); - yield(); - spin_lock(&sb_lock); } spin_unlock(&sb_lock); return NULL; } -struct super_block * user_get_super(dev_t dev) +struct super_block *user_get_super(dev_t dev) { struct super_block *sb; spin_lock(&sb_lock); rescan: list_for_each_entry(sb, &super_blocks, s_list) { + if (list_empty(&sb->s_instances)) + continue; if (sb->s_dev == dev) { sb->s_count++; spin_unlock(&sb_lock); down_read(&sb->s_umount); + /* still alive? */ if (sb->s_root) return sb; up_read(&sb->s_umount); - /* restart only when sb is no longer on the list */ + /* nope, got unmounted */ spin_lock(&sb_lock); - if (__put_super_and_need_restart(sb)) - goto rescan; + __put_super(sb); + goto rescan; } } spin_unlock(&sb_lock); return NULL; } -SYSCALL_DEFINE2(ustat, unsigned, dev, struct ustat __user *, ubuf) -{ - struct super_block *s; - struct ustat tmp; - struct kstatfs sbuf; - int err = -EINVAL; - - s = user_get_super(new_decode_dev(dev)); - if (s == NULL) - goto out; - err = vfs_statfs(s->s_root, &sbuf); - drop_super(s); - if (err) - goto out; - - memset(&tmp,0,sizeof(struct ustat)); - tmp.f_tfree = sbuf.f_bfree; - tmp.f_tinode = sbuf.f_ffree; - - err = copy_to_user(ubuf,&tmp,sizeof(struct ustat)) ? -EFAULT : 0; -out: - return err; -} - /** * do_remount_sb - asks filesystem to change mount options. * @sb: superblock in question @@ -622,24 +577,24 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) static void do_emergency_remount(struct work_struct *work) { - struct super_block *sb; + struct super_block *sb, *n; spin_lock(&sb_lock); - list_for_each_entry(sb, &super_blocks, s_list) { + list_for_each_entry_safe(sb, n, &super_blocks, s_list) { + if (list_empty(&sb->s_instances)) + continue; sb->s_count++; spin_unlock(&sb_lock); down_write(&sb->s_umount); if (sb->s_root && sb->s_bdev && !(sb->s_flags & MS_RDONLY)) { /* - * ->remount_fs needs lock_kernel(). - * * What lock protects sb->s_flags?? */ do_remount_sb(sb, MS_RDONLY, NULL, 1); } up_write(&sb->s_umount); - put_super(sb); spin_lock(&sb_lock); + __put_super(sb); } spin_unlock(&sb_lock); kfree(work); @@ -990,6 +945,96 @@ out: EXPORT_SYMBOL_GPL(vfs_kern_mount); +/** + * freeze_super -- lock the filesystem and force it into a consistent state + * @super: the super to lock + * + * Syncs the super to make sure the filesystem is consistent and calls the fs's + * freeze_fs. Subsequent calls to this without first thawing the fs will return + * -EBUSY. + */ +int freeze_super(struct super_block *sb) +{ + int ret; + + atomic_inc(&sb->s_active); + down_write(&sb->s_umount); + if (sb->s_frozen) { + deactivate_locked_super(sb); + return -EBUSY; + } + + if (sb->s_flags & MS_RDONLY) { + sb->s_frozen = SB_FREEZE_TRANS; + smp_wmb(); + up_write(&sb->s_umount); + return 0; + } + + sb->s_frozen = SB_FREEZE_WRITE; + smp_wmb(); + + sync_filesystem(sb); + + sb->s_frozen = SB_FREEZE_TRANS; + smp_wmb(); + + sync_blockdev(sb->s_bdev); + if (sb->s_op->freeze_fs) { + ret = sb->s_op->freeze_fs(sb); + if (ret) { + printk(KERN_ERR + "VFS:Filesystem freeze failed\n"); + sb->s_frozen = SB_UNFROZEN; + deactivate_locked_super(sb); + return ret; + } + } + up_write(&sb->s_umount); + return 0; +} +EXPORT_SYMBOL(freeze_super); + +/** + * thaw_super -- unlock filesystem + * @sb: the super to thaw + * + * Unlocks the filesystem and marks it writeable again after freeze_super(). + */ +int thaw_super(struct super_block *sb) +{ + int error; + + down_write(&sb->s_umount); + if (sb->s_frozen == SB_UNFROZEN) { + up_write(&sb->s_umount); + return -EINVAL; + } + + if (sb->s_flags & MS_RDONLY) + goto out; + + if (sb->s_op->unfreeze_fs) { + error = sb->s_op->unfreeze_fs(sb); + if (error) { + printk(KERN_ERR + "VFS:Filesystem thaw failed\n"); + sb->s_frozen = SB_FREEZE_TRANS; + up_write(&sb->s_umount); + return error; + } + } + +out: + sb->s_frozen = SB_UNFROZEN; + smp_wmb(); + wake_up(&sb->s_wait_unfrozen); + deactivate_locked_super(sb); + + return 0; +} +EXPORT_SYMBOL(thaw_super); + static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype) { int err; diff --git a/fs/sync.c b/fs/sync.c index 92b228176f7..e8cbd415e50 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -42,7 +42,7 @@ static int __sync_filesystem(struct super_block *sb, int wait) if (wait) sync_inodes_sb(sb); else - writeback_inodes_sb(sb); + writeback_inodes_sb_locked(sb); if (sb->s_op->sync_fs) sb->s_op->sync_fs(sb, wait); @@ -77,50 +77,18 @@ int sync_filesystem(struct super_block *sb) } EXPORT_SYMBOL_GPL(sync_filesystem); +static void sync_one_sb(struct super_block *sb, void *arg) +{ + if (!(sb->s_flags & MS_RDONLY) && sb->s_bdi) + __sync_filesystem(sb, *(int *)arg); +} /* * Sync all the data for all the filesystems (called by sys_sync() and * emergency sync) - * - * This operation is careful to avoid the livelock which could easily happen - * if two or more filesystems are being continuously dirtied. s_need_sync - * is used only here. We set it against all filesystems and then clear it as - * we sync them. So redirtied filesystems are skipped. - * - * But if process A is currently running sync_filesystems and then process B - * calls sync_filesystems as well, process B will set all the s_need_sync - * flags again, which will cause process A to resync everything. Fix that with - * a local mutex. */ static void sync_filesystems(int wait) { - struct super_block *sb; - static DEFINE_MUTEX(mutex); - - mutex_lock(&mutex); /* Could be down_interruptible */ - spin_lock(&sb_lock); - list_for_each_entry(sb, &super_blocks, s_list) - sb->s_need_sync = 1; - -restart: - list_for_each_entry(sb, &super_blocks, s_list) { - if (!sb->s_need_sync) - continue; - sb->s_need_sync = 0; - sb->s_count++; - spin_unlock(&sb_lock); - - down_read(&sb->s_umount); - if (!(sb->s_flags & MS_RDONLY) && sb->s_root && sb->s_bdi) - __sync_filesystem(sb, wait); - up_read(&sb->s_umount); - - /* restart only when sb is no longer on the list */ - spin_lock(&sb_lock); - if (__put_super_and_need_restart(sb)) - goto restart; - } - spin_unlock(&sb_lock); - mutex_unlock(&mutex); + iterate_supers(sync_one_sb, &wait); } /* @@ -190,7 +158,6 @@ EXPORT_SYMBOL(file_fsync); /** * vfs_fsync_range - helper to sync a range of data & metadata to disk * @file: file to sync - * @dentry: dentry of @file * @start: offset in bytes of the beginning of data range to sync * @end: offset in bytes of the end of data range (inclusive) * @datasync: perform only datasync @@ -198,32 +165,13 @@ EXPORT_SYMBOL(file_fsync); * Write back data in range @start..@end and metadata for @file to disk. If * @datasync is set only metadata needed to access modified file data is * written. - * - * In case this function is called from nfsd @file may be %NULL and - * only @dentry is set. This can only happen when the filesystem - * implements the export_operations API. */ -int vfs_fsync_range(struct file *file, struct dentry *dentry, loff_t start, - loff_t end, int datasync) +int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync) { - const struct file_operations *fop; - struct address_space *mapping; + struct address_space *mapping = file->f_mapping; int err, ret; - /* - * Get mapping and operations from the file in case we have - * as file, or get the default values for them in case we - * don't have a struct file available. Damn nfsd.. - */ - if (file) { - mapping = file->f_mapping; - fop = file->f_op; - } else { - mapping = dentry->d_inode->i_mapping; - fop = dentry->d_inode->i_fop; - } - - if (!fop || !fop->fsync) { + if (!file->f_op || !file->f_op->fsync) { ret = -EINVAL; goto out; } @@ -235,7 +183,7 @@ int vfs_fsync_range(struct file *file, struct dentry *dentry, loff_t start, * livelocks in fsync_buffers_list(). */ mutex_lock(&mapping->host->i_mutex); - err = fop->fsync(file, dentry, datasync); + err = file->f_op->fsync(file, file->f_path.dentry, datasync); if (!ret) ret = err; mutex_unlock(&mapping->host->i_mutex); @@ -248,19 +196,14 @@ EXPORT_SYMBOL(vfs_fsync_range); /** * vfs_fsync - perform a fsync or fdatasync on a file * @file: file to sync - * @dentry: dentry of @file * @datasync: only perform a fdatasync operation * * Write back data and metadata for @file to disk. If @datasync is * set only metadata needed to access modified file data is written. - * - * In case this function is called from nfsd @file may be %NULL and - * only @dentry is set. This can only happen when the filesystem - * implements the export_operations API. */ -int vfs_fsync(struct file *file, struct dentry *dentry, int datasync) +int vfs_fsync(struct file *file, int datasync) { - return vfs_fsync_range(file, dentry, 0, LLONG_MAX, datasync); + return vfs_fsync_range(file, 0, LLONG_MAX, datasync); } EXPORT_SYMBOL(vfs_fsync); @@ -271,7 +214,7 @@ static int do_fsync(unsigned int fd, int datasync) file = fget(fd); if (file) { - ret = vfs_fsync(file, file->f_path.dentry, datasync); + ret = vfs_fsync(file, datasync); fput(file); } return ret; @@ -299,8 +242,7 @@ int generic_write_sync(struct file *file, loff_t pos, loff_t count) { if (!(file->f_flags & O_DSYNC) && !IS_SYNC(file->f_mapping->host)) return 0; - return vfs_fsync_range(file, file->f_path.dentry, pos, - pos + count - 1, + return vfs_fsync_range(file, pos, pos + count - 1, (file->f_flags & __O_SYNC) ? 0 : 1); } EXPORT_SYMBOL(generic_write_sync); diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c index e9d293593e5..4e321f7353f 100644 --- a/fs/sysfs/bin.c +++ b/fs/sysfs/bin.c @@ -46,9 +46,9 @@ struct bin_buffer { }; static int -fill_read(struct dentry *dentry, char *buffer, loff_t off, size_t count) +fill_read(struct file *file, char *buffer, loff_t off, size_t count) { - struct sysfs_dirent *attr_sd = dentry->d_fsdata; + struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr; struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; int rc; @@ -59,7 +59,7 @@ fill_read(struct dentry *dentry, char *buffer, loff_t off, size_t count) rc = -EIO; if (attr->read) - rc = attr->read(kobj, attr, buffer, off, count); + rc = attr->read(file, kobj, attr, buffer, off, count); sysfs_put_active(attr_sd); @@ -70,8 +70,7 @@ static ssize_t read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off) { struct bin_buffer *bb = file->private_data; - struct dentry *dentry = file->f_path.dentry; - int size = dentry->d_inode->i_size; + int size = file->f_path.dentry->d_inode->i_size; loff_t offs = *off; int count = min_t(size_t, bytes, PAGE_SIZE); char *temp; @@ -92,7 +91,7 @@ read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off) mutex_lock(&bb->mutex); - count = fill_read(dentry, bb->buffer, offs, count); + count = fill_read(file, bb->buffer, offs, count); if (count < 0) { mutex_unlock(&bb->mutex); goto out_free; @@ -117,9 +116,9 @@ read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off) } static int -flush_write(struct dentry *dentry, char *buffer, loff_t offset, size_t count) +flush_write(struct file *file, char *buffer, loff_t offset, size_t count) { - struct sysfs_dirent *attr_sd = dentry->d_fsdata; + struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr; struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; int rc; @@ -130,7 +129,7 @@ flush_write(struct dentry *dentry, char *buffer, loff_t offset, size_t count) rc = -EIO; if (attr->write) - rc = attr->write(kobj, attr, buffer, offset, count); + rc = attr->write(file, kobj, attr, buffer, offset, count); sysfs_put_active(attr_sd); @@ -141,8 +140,7 @@ static ssize_t write(struct file *file, const char __user *userbuf, size_t bytes, loff_t *off) { struct bin_buffer *bb = file->private_data; - struct dentry *dentry = file->f_path.dentry; - int size = dentry->d_inode->i_size; + int size = file->f_path.dentry->d_inode->i_size; loff_t offs = *off; int count = min_t(size_t, bytes, PAGE_SIZE); char *temp; @@ -165,7 +163,7 @@ static ssize_t write(struct file *file, const char __user *userbuf, memcpy(bb->buffer, temp, count); - count = flush_write(dentry, bb->buffer, offs, count); + count = flush_write(file, bb->buffer, offs, count); mutex_unlock(&bb->mutex); if (count > 0) @@ -363,7 +361,7 @@ static int mmap(struct file *file, struct vm_area_struct *vma) if (!attr->mmap) goto out_put; - rc = attr->mmap(kobj, attr, vma); + rc = attr->mmap(file, kobj, attr, vma); if (rc) goto out_put; @@ -501,7 +499,7 @@ int sysfs_create_bin_file(struct kobject *kobj, void sysfs_remove_bin_file(struct kobject *kobj, const struct bin_attribute *attr) { - sysfs_hash_and_remove(kobj->sd, attr->attr.name); + sysfs_hash_and_remove(kobj->sd, NULL, attr->attr.name); } EXPORT_SYMBOL_GPL(sysfs_create_bin_file); diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 590717861c7..7e54bac8c4b 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -380,7 +380,7 @@ int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) { struct sysfs_inode_attrs *ps_iattr; - if (sysfs_find_dirent(acxt->parent_sd, sd->s_name)) + if (sysfs_find_dirent(acxt->parent_sd, sd->s_ns, sd->s_name)) return -EEXIST; sd->s_parent = sysfs_get(acxt->parent_sd); @@ -533,13 +533,17 @@ void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt) * Pointer to sysfs_dirent if found, NULL if not. */ struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd, + const void *ns, const unsigned char *name) { struct sysfs_dirent *sd; - for (sd = parent_sd->s_dir.children; sd; sd = sd->s_sibling) + for (sd = parent_sd->s_dir.children; sd; sd = sd->s_sibling) { + if (ns && sd->s_ns && (sd->s_ns != ns)) + continue; if (!strcmp(sd->s_name, name)) return sd; + } return NULL; } @@ -558,12 +562,13 @@ struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd, * Pointer to sysfs_dirent if found, NULL if not. */ struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd, + const void *ns, const unsigned char *name) { struct sysfs_dirent *sd; mutex_lock(&sysfs_mutex); - sd = sysfs_find_dirent(parent_sd, name); + sd = sysfs_find_dirent(parent_sd, ns, name); sysfs_get(sd); mutex_unlock(&sysfs_mutex); @@ -572,7 +577,8 @@ struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd, EXPORT_SYMBOL_GPL(sysfs_get_dirent); static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd, - const char *name, struct sysfs_dirent **p_sd) + enum kobj_ns_type type, const void *ns, const char *name, + struct sysfs_dirent **p_sd) { umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO; struct sysfs_addrm_cxt acxt; @@ -583,6 +589,9 @@ static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd, sd = sysfs_new_dirent(name, mode, SYSFS_DIR); if (!sd) return -ENOMEM; + + sd->s_flags |= (type << SYSFS_NS_TYPE_SHIFT); + sd->s_ns = ns; sd->s_dir.kobj = kobj; /* link in */ @@ -601,7 +610,33 @@ static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd, int sysfs_create_subdir(struct kobject *kobj, const char *name, struct sysfs_dirent **p_sd) { - return create_dir(kobj, kobj->sd, name, p_sd); + return create_dir(kobj, kobj->sd, + KOBJ_NS_TYPE_NONE, NULL, name, p_sd); +} + +/** + * sysfs_read_ns_type: return associated ns_type + * @kobj: the kobject being queried + * + * Each kobject can be tagged with exactly one namespace type + * (i.e. network or user). Return the ns_type associated with + * this object if any + */ +static enum kobj_ns_type sysfs_read_ns_type(struct kobject *kobj) +{ + const struct kobj_ns_type_operations *ops; + enum kobj_ns_type type; + + ops = kobj_child_ns_ops(kobj); + if (!ops) + return KOBJ_NS_TYPE_NONE; + + type = ops->type; + BUG_ON(type <= KOBJ_NS_TYPE_NONE); + BUG_ON(type >= KOBJ_NS_TYPES); + BUG_ON(!kobj_ns_type_registered(type)); + + return type; } /** @@ -610,7 +645,9 @@ int sysfs_create_subdir(struct kobject *kobj, const char *name, */ int sysfs_create_dir(struct kobject * kobj) { + enum kobj_ns_type type; struct sysfs_dirent *parent_sd, *sd; + const void *ns = NULL; int error = 0; BUG_ON(!kobj); @@ -620,7 +657,11 @@ int sysfs_create_dir(struct kobject * kobj) else parent_sd = &sysfs_root; - error = create_dir(kobj, parent_sd, kobject_name(kobj), &sd); + if (sysfs_ns_type(parent_sd)) + ns = kobj->ktype->namespace(kobj); + type = sysfs_read_ns_type(kobj); + + error = create_dir(kobj, parent_sd, type, ns, kobject_name(kobj), &sd); if (!error) kobj->sd = sd; return error; @@ -630,13 +671,19 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { struct dentry *ret = NULL; - struct sysfs_dirent *parent_sd = dentry->d_parent->d_fsdata; + struct dentry *parent = dentry->d_parent; + struct sysfs_dirent *parent_sd = parent->d_fsdata; struct sysfs_dirent *sd; struct inode *inode; + enum kobj_ns_type type; + const void *ns; mutex_lock(&sysfs_mutex); - sd = sysfs_find_dirent(parent_sd, dentry->d_name.name); + type = sysfs_ns_type(parent_sd); + ns = sysfs_info(dir->i_sb)->ns[type]; + + sd = sysfs_find_dirent(parent_sd, ns, dentry->d_name.name); /* no such entry */ if (!sd) { @@ -735,7 +782,8 @@ void sysfs_remove_dir(struct kobject * kobj) } int sysfs_rename(struct sysfs_dirent *sd, - struct sysfs_dirent *new_parent_sd, const char *new_name) + struct sysfs_dirent *new_parent_sd, const void *new_ns, + const char *new_name) { const char *dup_name = NULL; int error; @@ -743,12 +791,12 @@ int sysfs_rename(struct sysfs_dirent *sd, mutex_lock(&sysfs_mutex); error = 0; - if ((sd->s_parent == new_parent_sd) && + if ((sd->s_parent == new_parent_sd) && (sd->s_ns == new_ns) && (strcmp(sd->s_name, new_name) == 0)) goto out; /* nothing to rename */ error = -EEXIST; - if (sysfs_find_dirent(new_parent_sd, new_name)) + if (sysfs_find_dirent(new_parent_sd, new_ns, new_name)) goto out; /* rename sysfs_dirent */ @@ -770,6 +818,7 @@ int sysfs_rename(struct sysfs_dirent *sd, sd->s_parent = new_parent_sd; sysfs_link_sibling(sd); } + sd->s_ns = new_ns; error = 0; out: @@ -780,19 +829,28 @@ int sysfs_rename(struct sysfs_dirent *sd, int sysfs_rename_dir(struct kobject *kobj, const char *new_name) { - return sysfs_rename(kobj->sd, kobj->sd->s_parent, new_name); + struct sysfs_dirent *parent_sd = kobj->sd->s_parent; + const void *new_ns = NULL; + + if (sysfs_ns_type(parent_sd)) + new_ns = kobj->ktype->namespace(kobj); + + return sysfs_rename(kobj->sd, parent_sd, new_ns, new_name); } int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent_kobj) { struct sysfs_dirent *sd = kobj->sd; struct sysfs_dirent *new_parent_sd; + const void *new_ns = NULL; BUG_ON(!sd->s_parent); + if (sysfs_ns_type(sd->s_parent)) + new_ns = kobj->ktype->namespace(kobj); new_parent_sd = new_parent_kobj && new_parent_kobj->sd ? new_parent_kobj->sd : &sysfs_root; - return sysfs_rename(sd, new_parent_sd, sd->s_name); + return sysfs_rename(sd, new_parent_sd, new_ns, sd->s_name); } /* Relationship between s_mode and the DT_xxx types */ @@ -807,32 +865,35 @@ static int sysfs_dir_release(struct inode *inode, struct file *filp) return 0; } -static struct sysfs_dirent *sysfs_dir_pos(struct sysfs_dirent *parent_sd, - ino_t ino, struct sysfs_dirent *pos) +static struct sysfs_dirent *sysfs_dir_pos(const void *ns, + struct sysfs_dirent *parent_sd, ino_t ino, struct sysfs_dirent *pos) { if (pos) { int valid = !(pos->s_flags & SYSFS_FLAG_REMOVED) && pos->s_parent == parent_sd && ino == pos->s_ino; sysfs_put(pos); - if (valid) - return pos; + if (!valid) + pos = NULL; } - pos = NULL; - if ((ino > 1) && (ino < INT_MAX)) { + if (!pos && (ino > 1) && (ino < INT_MAX)) { pos = parent_sd->s_dir.children; while (pos && (ino > pos->s_ino)) pos = pos->s_sibling; } + while (pos && pos->s_ns && pos->s_ns != ns) + pos = pos->s_sibling; return pos; } -static struct sysfs_dirent *sysfs_dir_next_pos(struct sysfs_dirent *parent_sd, - ino_t ino, struct sysfs_dirent *pos) +static struct sysfs_dirent *sysfs_dir_next_pos(const void *ns, + struct sysfs_dirent *parent_sd, ino_t ino, struct sysfs_dirent *pos) { - pos = sysfs_dir_pos(parent_sd, ino, pos); + pos = sysfs_dir_pos(ns, parent_sd, ino, pos); if (pos) pos = pos->s_sibling; + while (pos && pos->s_ns && pos->s_ns != ns) + pos = pos->s_sibling; return pos; } @@ -841,8 +902,13 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir) struct dentry *dentry = filp->f_path.dentry; struct sysfs_dirent * parent_sd = dentry->d_fsdata; struct sysfs_dirent *pos = filp->private_data; + enum kobj_ns_type type; + const void *ns; ino_t ino; + type = sysfs_ns_type(parent_sd); + ns = sysfs_info(dentry->d_sb)->ns[type]; + if (filp->f_pos == 0) { ino = parent_sd->s_ino; if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) == 0) @@ -857,9 +923,9 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir) filp->f_pos++; } mutex_lock(&sysfs_mutex); - for (pos = sysfs_dir_pos(parent_sd, filp->f_pos, pos); + for (pos = sysfs_dir_pos(ns, parent_sd, filp->f_pos, pos); pos; - pos = sysfs_dir_next_pos(parent_sd, filp->f_pos, pos)) { + pos = sysfs_dir_next_pos(ns, parent_sd, filp->f_pos, pos)) { const char * name; unsigned int type; int len, ret; diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index e222b258274..1beaa739d0a 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -478,9 +478,12 @@ void sysfs_notify(struct kobject *k, const char *dir, const char *attr) mutex_lock(&sysfs_mutex); if (sd && dir) - sd = sysfs_find_dirent(sd, dir); + /* Only directories are tagged, so no need to pass + * a tag explicitly. + */ + sd = sysfs_find_dirent(sd, NULL, dir); if (sd && attr) - sd = sysfs_find_dirent(sd, attr); + sd = sysfs_find_dirent(sd, NULL, attr); if (sd) sysfs_notify_dirent(sd); @@ -569,7 +572,7 @@ int sysfs_add_file_to_group(struct kobject *kobj, int error; if (group) - dir_sd = sysfs_get_dirent(kobj->sd, group); + dir_sd = sysfs_get_dirent(kobj->sd, NULL, group); else dir_sd = sysfs_get(kobj->sd); @@ -599,7 +602,7 @@ int sysfs_chmod_file(struct kobject *kobj, struct attribute *attr, mode_t mode) mutex_lock(&sysfs_mutex); rc = -ENOENT; - sd = sysfs_find_dirent(kobj->sd, attr->name); + sd = sysfs_find_dirent(kobj->sd, NULL, attr->name); if (!sd) goto out; @@ -624,7 +627,7 @@ EXPORT_SYMBOL_GPL(sysfs_chmod_file); void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr) { - sysfs_hash_and_remove(kobj->sd, attr->name); + sysfs_hash_and_remove(kobj->sd, NULL, attr->name); } void sysfs_remove_files(struct kobject * kobj, const struct attribute **ptr) @@ -646,11 +649,11 @@ void sysfs_remove_file_from_group(struct kobject *kobj, struct sysfs_dirent *dir_sd; if (group) - dir_sd = sysfs_get_dirent(kobj->sd, group); + dir_sd = sysfs_get_dirent(kobj->sd, NULL, group); else dir_sd = sysfs_get(kobj->sd); if (dir_sd) { - sysfs_hash_and_remove(dir_sd, attr->name); + sysfs_hash_and_remove(dir_sd, NULL, attr->name); sysfs_put(dir_sd); } } diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index fe611949a7f..23c1e598792 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c @@ -23,7 +23,7 @@ static void remove_files(struct sysfs_dirent *dir_sd, struct kobject *kobj, int i; for (i = 0, attr = grp->attrs; *attr; i++, attr++) - sysfs_hash_and_remove(dir_sd, (*attr)->name); + sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name); } static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj, @@ -39,7 +39,7 @@ static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj, * visibility. Do this by first removing then * re-adding (if required) the file */ if (update) - sysfs_hash_and_remove(dir_sd, (*attr)->name); + sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name); if (grp->is_visible) { mode = grp->is_visible(kobj, *attr, i); if (!mode) @@ -132,7 +132,7 @@ void sysfs_remove_group(struct kobject * kobj, struct sysfs_dirent *sd; if (grp->name) { - sd = sysfs_get_dirent(dir_sd, grp->name); + sd = sysfs_get_dirent(dir_sd, NULL, grp->name); if (!sd) { WARN(!sd, KERN_WARNING "sysfs group %p not found for " "kobject '%s'\n", grp, kobject_name(kobj)); diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index a4a0a941971..bbd77e95cf7 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -324,7 +324,7 @@ void sysfs_delete_inode(struct inode *inode) sysfs_put(sd); } -int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name) +int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, const char *name) { struct sysfs_addrm_cxt acxt; struct sysfs_dirent *sd; @@ -334,7 +334,9 @@ int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name) sysfs_addrm_start(&acxt, dir_sd); - sd = sysfs_find_dirent(dir_sd, name); + sd = sysfs_find_dirent(dir_sd, ns, name); + if (sd && (sd->s_ns != ns)) + sd = NULL; if (sd) sysfs_remove_one(&acxt, sd); diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 776137828dc..281c0c9bc39 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -35,7 +35,7 @@ static const struct super_operations sysfs_ops = { struct sysfs_dirent sysfs_root = { .s_name = "", .s_count = ATOMIC_INIT(1), - .s_flags = SYSFS_DIR, + .s_flags = SYSFS_DIR | (KOBJ_NS_TYPE_NONE << SYSFS_NS_TYPE_SHIFT), .s_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO, .s_ino = 1, }; @@ -72,18 +72,107 @@ static int sysfs_fill_super(struct super_block *sb, void *data, int silent) return 0; } +static int sysfs_test_super(struct super_block *sb, void *data) +{ + struct sysfs_super_info *sb_info = sysfs_info(sb); + struct sysfs_super_info *info = data; + enum kobj_ns_type type; + int found = 1; + + for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) { + if (sb_info->ns[type] != info->ns[type]) + found = 0; + } + return found; +} + +static int sysfs_set_super(struct super_block *sb, void *data) +{ + int error; + error = set_anon_super(sb, data); + if (!error) + sb->s_fs_info = data; + return error; +} + static int sysfs_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_single(fs_type, flags, data, sysfs_fill_super, mnt); + struct sysfs_super_info *info; + enum kobj_ns_type type; + struct super_block *sb; + int error; + + error = -ENOMEM; + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) + goto out; + + for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) + info->ns[type] = kobj_ns_current(type); + + sb = sget(fs_type, sysfs_test_super, sysfs_set_super, info); + if (IS_ERR(sb) || sb->s_fs_info != info) + kfree(info); + if (IS_ERR(sb)) { + error = PTR_ERR(sb); + goto out; + } + if (!sb->s_root) { + sb->s_flags = flags; + error = sysfs_fill_super(sb, data, flags & MS_SILENT ? 1 : 0); + if (error) { + deactivate_locked_super(sb); + goto out; + } + sb->s_flags |= MS_ACTIVE; + } + + simple_set_mnt(mnt, sb); + error = 0; +out: + return error; +} + +static void sysfs_kill_sb(struct super_block *sb) +{ + struct sysfs_super_info *info = sysfs_info(sb); + + /* Remove the superblock from fs_supers/s_instances + * so we can't find it, before freeing sysfs_super_info. + */ + kill_anon_super(sb); + kfree(info); } static struct file_system_type sysfs_fs_type = { .name = "sysfs", .get_sb = sysfs_get_sb, - .kill_sb = kill_anon_super, + .kill_sb = sysfs_kill_sb, }; +void sysfs_exit_ns(enum kobj_ns_type type, const void *ns) +{ + struct super_block *sb; + + mutex_lock(&sysfs_mutex); + spin_lock(&sb_lock); + list_for_each_entry(sb, &sysfs_fs_type.fs_supers, s_instances) { + struct sysfs_super_info *info = sysfs_info(sb); + /* + * If we see a superblock on the fs_supers/s_instances + * list the unmount has not completed and sb->s_fs_info + * points to a valid struct sysfs_super_info. + */ + /* Ignore superblocks with the wrong ns */ + if (info->ns[type] != ns) + continue; + info->ns[type] = NULL; + } + spin_unlock(&sb_lock); + mutex_unlock(&sysfs_mutex); +} + int __init sysfs_init(void) { int err = -ENOMEM; diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index b93ec51fa7a..f71246bebfe 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -58,6 +58,8 @@ static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target, if (!sd) goto out_put; + if (sysfs_ns_type(parent_sd)) + sd->s_ns = target->ktype->namespace(target); sd->s_symlink.target_sd = target_sd; target_sd = NULL; /* reference is now owned by the symlink */ @@ -107,6 +109,26 @@ int sysfs_create_link_nowarn(struct kobject *kobj, struct kobject *target, } /** + * sysfs_delete_link - remove symlink in object's directory. + * @kobj: object we're acting for. + * @targ: object we're pointing to. + * @name: name of the symlink to remove. + * + * Unlike sysfs_remove_link sysfs_delete_link has enough information + * to successfully delete symlinks in tagged directories. + */ +void sysfs_delete_link(struct kobject *kobj, struct kobject *targ, + const char *name) +{ + const void *ns = NULL; + spin_lock(&sysfs_assoc_lock); + if (targ->sd) + ns = targ->sd->s_ns; + spin_unlock(&sysfs_assoc_lock); + sysfs_hash_and_remove(kobj->sd, ns, name); +} + +/** * sysfs_remove_link - remove symlink in object's directory. * @kobj: object we're acting for. * @name: name of the symlink to remove. @@ -121,7 +143,7 @@ void sysfs_remove_link(struct kobject * kobj, const char * name) else parent_sd = kobj->sd; - sysfs_hash_and_remove(parent_sd, name); + sysfs_hash_and_remove(parent_sd, NULL, name); } /** @@ -137,6 +159,7 @@ int sysfs_rename_link(struct kobject *kobj, struct kobject *targ, const char *old, const char *new) { struct sysfs_dirent *parent_sd, *sd = NULL; + const void *old_ns = NULL, *new_ns = NULL; int result; if (!kobj) @@ -144,8 +167,11 @@ int sysfs_rename_link(struct kobject *kobj, struct kobject *targ, else parent_sd = kobj->sd; + if (targ->sd) + old_ns = targ->sd->s_ns; + result = -ENOENT; - sd = sysfs_get_dirent(parent_sd, old); + sd = sysfs_get_dirent(parent_sd, old_ns, old); if (!sd) goto out; @@ -155,7 +181,10 @@ int sysfs_rename_link(struct kobject *kobj, struct kobject *targ, if (sd->s_symlink.target_sd->s_dir.kobj != targ) goto out; - result = sysfs_rename(sd, parent_sd, new); + if (sysfs_ns_type(parent_sd)) + new_ns = targ->ktype->namespace(targ); + + result = sysfs_rename(sd, parent_sd, new_ns, new); out: sysfs_put(sd); @@ -261,3 +290,4 @@ const struct inode_operations sysfs_symlink_inode_operations = { EXPORT_SYMBOL_GPL(sysfs_create_link); EXPORT_SYMBOL_GPL(sysfs_remove_link); +EXPORT_SYMBOL_GPL(sysfs_rename_link); diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 30f5a44fb5d..6a13105b559 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -58,6 +58,7 @@ struct sysfs_dirent { struct sysfs_dirent *s_sibling; const char *s_name; + const void *s_ns; /* namespace tag */ union { struct sysfs_elem_dir s_dir; struct sysfs_elem_symlink s_symlink; @@ -81,14 +82,27 @@ struct sysfs_dirent { #define SYSFS_COPY_NAME (SYSFS_DIR | SYSFS_KOBJ_LINK) #define SYSFS_ACTIVE_REF (SYSFS_KOBJ_ATTR | SYSFS_KOBJ_BIN_ATTR) -#define SYSFS_FLAG_MASK ~SYSFS_TYPE_MASK -#define SYSFS_FLAG_REMOVED 0x0200 +/* identify any namespace tag on sysfs_dirents */ +#define SYSFS_NS_TYPE_MASK 0xff00 +#define SYSFS_NS_TYPE_SHIFT 8 + +#define SYSFS_FLAG_MASK ~(SYSFS_NS_TYPE_MASK|SYSFS_TYPE_MASK) +#define SYSFS_FLAG_REMOVED 0x020000 static inline unsigned int sysfs_type(struct sysfs_dirent *sd) { return sd->s_flags & SYSFS_TYPE_MASK; } +/* + * Return any namespace tags on this dirent. + * enum kobj_ns_type is defined in linux/kobject.h + */ +static inline enum kobj_ns_type sysfs_ns_type(struct sysfs_dirent *sd) +{ + return (sd->s_flags & SYSFS_NS_TYPE_MASK) >> SYSFS_NS_TYPE_SHIFT; +} + #ifdef CONFIG_DEBUG_LOCK_ALLOC #define sysfs_dirent_init_lockdep(sd) \ do { \ @@ -114,6 +128,16 @@ struct sysfs_addrm_cxt { /* * mount.c */ + +/* + * Each sb is associated with a set of namespace tags (i.e. + * the network namespace of the task which mounted this sysfs + * instance). + */ +struct sysfs_super_info { + const void *ns[KOBJ_NS_TYPES]; +}; +#define sysfs_info(SB) ((struct sysfs_super_info *)(SB->s_fs_info)) extern struct sysfs_dirent sysfs_root; extern struct kmem_cache *sysfs_dir_cachep; @@ -137,8 +161,10 @@ void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd); void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt); struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd, + const void *ns, const unsigned char *name); struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd, + const void *ns, const unsigned char *name); struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type); @@ -149,7 +175,7 @@ int sysfs_create_subdir(struct kobject *kobj, const char *name, void sysfs_remove_subdir(struct sysfs_dirent *sd); int sysfs_rename(struct sysfs_dirent *sd, - struct sysfs_dirent *new_parent_sd, const char *new_name); + struct sysfs_dirent *new_parent_sd, const void *ns, const char *new_name); static inline struct sysfs_dirent *__sysfs_get(struct sysfs_dirent *sd) { @@ -179,7 +205,7 @@ int sysfs_setattr(struct dentry *dentry, struct iattr *iattr); int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags); -int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name); +int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, const char *name); int sysfs_inode_init(void); /* diff --git a/fs/sysv/ialloc.c b/fs/sysv/ialloc.c index 241e9765cfa..bbd69bdb0fa 100644 --- a/fs/sysv/ialloc.c +++ b/fs/sysv/ialloc.c @@ -159,15 +159,7 @@ struct inode * sysv_new_inode(const struct inode * dir, mode_t mode) *sbi->s_sb_fic_count = cpu_to_fs16(sbi, count); fs16_add(sbi, sbi->s_sb_total_free_inodes, -1); dirty_sb(sb); - - if (dir->i_mode & S_ISGID) { - inode->i_gid = dir->i_gid; - if (S_ISDIR(mode)) - mode |= S_ISGID; - } else - inode->i_gid = current_fsgid(); - - inode->i_uid = current_fsuid(); + inode_init_owner(inode, dir, mode); inode->i_ino = fs16_to_cpu(sbi, ino); inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; inode->i_blocks = 0; @@ -176,7 +168,6 @@ struct inode * sysv_new_inode(const struct inode * dir, mode_t mode) insert_inode_hash(inode); mark_inode_dirty(inode); - inode->i_mode = mode; /* for sysv_write_inode() */ sysv_write_inode(inode, 0); /* ensure inode not allocated again */ mark_inode_dirty(inode); /* cleared by sysv_write_inode() */ /* That's it. */ diff --git a/fs/timerfd.c b/fs/timerfd.c index 98158de91d2..b86ab8eff79 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -110,31 +110,14 @@ static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count, struct timerfd_ctx *ctx = file->private_data; ssize_t res; u64 ticks = 0; - DECLARE_WAITQUEUE(wait, current); if (count < sizeof(ticks)) return -EINVAL; spin_lock_irq(&ctx->wqh.lock); - res = -EAGAIN; - if (!ctx->ticks && !(file->f_flags & O_NONBLOCK)) { - __add_wait_queue(&ctx->wqh, &wait); - for (res = 0;;) { - set_current_state(TASK_INTERRUPTIBLE); - if (ctx->ticks) { - res = 0; - break; - } - if (signal_pending(current)) { - res = -ERESTARTSYS; - break; - } - spin_unlock_irq(&ctx->wqh.lock); - schedule(); - spin_lock_irq(&ctx->wqh.lock); - } - __remove_wait_queue(&ctx->wqh, &wait); - __set_current_state(TASK_RUNNING); - } + if (file->f_flags & O_NONBLOCK) + res = -EAGAIN; + else + res = wait_event_interruptible_locked_irq(ctx->wqh, ctx->ticks); if (ctx->ticks) { ticks = ctx->ticks; if (ctx->expired && ctx->tintv.tv64) { diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 401e503d44a..87ebcce7221 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -104,14 +104,7 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir, */ inode->i_flags |= (S_NOCMTIME); - inode->i_uid = current_fsuid(); - if (dir->i_mode & S_ISGID) { - inode->i_gid = dir->i_gid; - if (S_ISDIR(mode)) - mode |= S_ISGID; - } else - inode->i_gid = current_fsgid(); - inode->i_mode = mode; + inode_init_owner(inode, dir, mode); inode->i_mtime = inode->i_atime = inode->i_ctime = ubifs_current_time(inode); inode->i_mapping->nrpages = 0; diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c index 77d5cf4a754..bcf5a16f30b 100644 --- a/fs/ubifs/io.c +++ b/fs/ubifs/io.c @@ -64,6 +64,7 @@ void ubifs_ro_mode(struct ubifs_info *c, int err) if (!c->ro_media) { c->ro_media = 1; c->no_chk_data_crc = 0; + c->vfs_sb->s_flags |= MS_RDONLY; ubifs_warn("switched to read-only mode, error %d", err); dbg_dump_stack(); } diff --git a/fs/udf/dir.c b/fs/udf/dir.c index f0f2a436251..3a84455c2a7 100644 --- a/fs/udf/dir.c +++ b/fs/udf/dir.c @@ -209,6 +209,6 @@ static int udf_readdir(struct file *filp, void *dirent, filldir_t filldir) const struct file_operations udf_dir_operations = { .read = generic_read_dir, .readdir = udf_readdir, - .ioctl = udf_ioctl, + .unlocked_ioctl = udf_ioctl, .fsync = simple_fsync, }; diff --git a/fs/udf/file.c b/fs/udf/file.c index 4b6a46ccbf4..baae3a72394 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -37,6 +37,7 @@ #include <linux/quotaops.h> #include <linux/buffer_head.h> #include <linux/aio.h> +#include <linux/smp_lock.h> #include "udf_i.h" #include "udf_sb.h" @@ -144,50 +145,60 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov, return retval; } -int udf_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, - unsigned long arg) +long udf_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { + struct inode *inode = filp->f_dentry->d_inode; long old_block, new_block; int result = -EINVAL; + lock_kernel(); + if (file_permission(filp, MAY_READ) != 0) { - udf_debug("no permission to access inode %lu\n", - inode->i_ino); - return -EPERM; + udf_debug("no permission to access inode %lu\n", inode->i_ino); + result = -EPERM; + goto out; } if (!arg) { udf_debug("invalid argument to udf_ioctl\n"); - return -EINVAL; + result = -EINVAL; + goto out; } switch (cmd) { case UDF_GETVOLIDENT: if (copy_to_user((char __user *)arg, UDF_SB(inode->i_sb)->s_volume_ident, 32)) - return -EFAULT; + result = -EFAULT; else - return 0; + result = 0; + goto out; case UDF_RELOCATE_BLOCKS: - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; - if (get_user(old_block, (long __user *)arg)) - return -EFAULT; + if (!capable(CAP_SYS_ADMIN)) { + result = -EACCES; + goto out; + } + if (get_user(old_block, (long __user *)arg)) { + result = -EFAULT; + goto out; + } result = udf_relocate_blocks(inode->i_sb, old_block, &new_block); if (result == 0) result = put_user(new_block, (long __user *)arg); - return result; + goto out; case UDF_GETEASIZE: result = put_user(UDF_I(inode)->i_lenEAttr, (int __user *)arg); - break; + goto out; case UDF_GETEABLOCK: result = copy_to_user((char __user *)arg, UDF_I(inode)->i_ext.i_data, UDF_I(inode)->i_lenEAttr) ? -EFAULT : 0; - break; + goto out; } +out: + unlock_kernel(); return result; } @@ -207,7 +218,7 @@ static int udf_release_file(struct inode *inode, struct file *filp) const struct file_operations udf_file_operations = { .read = do_sync_read, .aio_read = generic_file_aio_read, - .ioctl = udf_ioctl, + .unlocked_ioctl = udf_ioctl, .open = dquot_file_open, .mmap = generic_file_mmap, .write = do_sync_write, @@ -227,7 +238,7 @@ int udf_setattr(struct dentry *dentry, struct iattr *iattr) if (error) return error; - if (iattr->ia_valid & ATTR_SIZE) + if (is_quota_modification(inode, iattr)) dquot_initialize(inode); if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) || diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c index fb68c9cd0c3..2b5586c7f02 100644 --- a/fs/udf/ialloc.c +++ b/fs/udf/ialloc.c @@ -124,15 +124,8 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err) udf_updated_lvid(sb); } mutex_unlock(&sbi->s_alloc_mutex); - inode->i_mode = mode; - inode->i_uid = current_fsuid(); - if (dir->i_mode & S_ISGID) { - inode->i_gid = dir->i_gid; - if (S_ISDIR(mode)) - mode |= S_ISGID; - } else { - inode->i_gid = current_fsgid(); - } + + inode_init_owner(inode, dir, mode); iinfo->i_location.logicalBlockNum = block; iinfo->i_location.partitionReferenceNum = diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 75816025f95..585f733615d 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -579,7 +579,6 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode, inode->i_data.a_ops = &udf_aops; inode->i_op = &udf_file_inode_operations; inode->i_fop = &udf_file_operations; - inode->i_mode = mode; mark_inode_dirty(inode); fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); @@ -627,7 +626,6 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode, goto out; iinfo = UDF_I(inode); - inode->i_uid = current_fsuid(); init_special_inode(inode, mode, rdev); fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); if (!fi) { @@ -674,7 +672,7 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode) goto out; err = -EIO; - inode = udf_new_inode(dir, S_IFDIR, &err); + inode = udf_new_inode(dir, S_IFDIR | mode, &err); if (!inode) goto out; @@ -697,9 +695,6 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode) FID_FILE_CHAR_DIRECTORY | FID_FILE_CHAR_PARENT; udf_write_fi(inode, &cfi, fi, &fibh, NULL, NULL); brelse(fibh.sbh); - inode->i_mode = S_IFDIR | mode; - if (dir->i_mode & S_ISGID) - inode->i_mode |= S_ISGID; mark_inode_dirty(inode); fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); @@ -912,7 +907,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry, dquot_initialize(dir); lock_kernel(); - inode = udf_new_inode(dir, S_IFLNK, &err); + inode = udf_new_inode(dir, S_IFLNK | S_IRWXUGO, &err); if (!inode) goto out; @@ -923,7 +918,6 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry, } iinfo = UDF_I(inode); - inode->i_mode = S_IFLNK | S_IRWXUGO; inode->i_data.a_ops = &udf_symlink_aops; inode->i_op = &udf_symlink_inode_operations; diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index 702a1148e70..9079ff7d625 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h @@ -130,8 +130,7 @@ extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *, uint8_t *, uint8_t *); /* file.c */ -extern int udf_ioctl(struct inode *, struct file *, unsigned int, - unsigned long); +extern long udf_ioctl(struct file *, unsigned int, unsigned long); extern int udf_setattr(struct dentry *dentry, struct iattr *iattr); /* inode.c */ extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *); diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c index 230ecf60802..3a959d55084 100644 --- a/fs/ufs/ialloc.c +++ b/fs/ufs/ialloc.c @@ -303,15 +303,7 @@ cg_found: sb->s_dirt = 1; inode->i_ino = cg * uspi->s_ipg + bit; - inode->i_mode = mode; - inode->i_uid = current_fsuid(); - if (dir->i_mode & S_ISGID) { - inode->i_gid = dir->i_gid; - if (S_ISDIR(mode)) - inode->i_mode |= S_ISGID; - } else - inode->i_gid = current_fsgid(); - + inode_init_owner(inode, dir, mode); inode->i_blocks = 0; inode->i_generation = 0; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 80b68c3702d..cffa756f104 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -603,7 +603,7 @@ static void ufs_set_inode_ops(struct inode *inode) if (!inode->i_blocks) inode->i_op = &ufs_fast_symlink_inode_operations; else { - inode->i_op = &page_symlink_inode_operations; + inode->i_op = &ufs_symlink_inode_operations; inode->i_mapping->a_ops = &ufs_aops; } } else diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index 118556243e7..eabc02eb129 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c @@ -148,7 +148,7 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry, if (l > UFS_SB(sb)->s_uspi->s_maxsymlinklen) { /* slow symlink */ - inode->i_op = &page_symlink_inode_operations; + inode->i_op = &ufs_symlink_inode_operations; inode->i_mapping->a_ops = &ufs_aops; err = page_symlink(inode, symname, l); if (err) diff --git a/fs/ufs/symlink.c b/fs/ufs/symlink.c index c0156eda44b..d283628b477 100644 --- a/fs/ufs/symlink.c +++ b/fs/ufs/symlink.c @@ -42,4 +42,12 @@ static void *ufs_follow_link(struct dentry *dentry, struct nameidata *nd) const struct inode_operations ufs_fast_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = ufs_follow_link, + .setattr = ufs_setattr, +}; + +const struct inode_operations ufs_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, + .setattr = ufs_setattr, }; diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c index d3b6270cb37..f294c44577d 100644 --- a/fs/ufs/truncate.c +++ b/fs/ufs/truncate.c @@ -508,7 +508,7 @@ out: * - there is no way to know old size * - there is no way inform user about error, if it happens in `truncate' */ -static int ufs_setattr(struct dentry *dentry, struct iattr *attr) +int ufs_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = dentry->d_inode; unsigned int ia_valid = attr->ia_valid; @@ -518,18 +518,18 @@ static int ufs_setattr(struct dentry *dentry, struct iattr *attr) if (error) return error; + if (is_quota_modification(inode, attr)) + dquot_initialize(inode); + if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { error = dquot_transfer(inode, attr); if (error) return error; } - if (ia_valid & ATTR_SIZE && - attr->ia_size != i_size_read(inode)) { + if (ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) { loff_t old_i_size = inode->i_size; - dquot_initialize(inode); - error = vmtruncate(inode, attr->ia_size); if (error) return error; diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h index 43f9f5d5670..179ae6b3180 100644 --- a/fs/ufs/ufs.h +++ b/fs/ufs/ufs.h @@ -122,9 +122,11 @@ extern void ufs_panic (struct super_block *, const char *, const char *, ...) __ /* symlink.c */ extern const struct inode_operations ufs_fast_symlink_inode_operations; +extern const struct inode_operations ufs_symlink_inode_operations; /* truncate.c */ extern int ufs_truncate (struct inode *, loff_t); +extern int ufs_setattr(struct dentry *dentry, struct iattr *attr); static inline struct ufs_sb_info *UFS_SB(struct super_block *sb) { diff --git a/fs/xattr.c b/fs/xattr.c index 46f87e828b4..01bb8135e14 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -590,10 +590,10 @@ strcmp_prefix(const char *a, const char *a_prefix) /* * Find the xattr_handler with the matching prefix. */ -static struct xattr_handler * -xattr_resolve_name(struct xattr_handler **handlers, const char **name) +static const struct xattr_handler * +xattr_resolve_name(const struct xattr_handler **handlers, const char **name) { - struct xattr_handler *handler; + const struct xattr_handler *handler; if (!*name) return NULL; @@ -614,7 +614,7 @@ xattr_resolve_name(struct xattr_handler **handlers, const char **name) ssize_t generic_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t size) { - struct xattr_handler *handler; + const struct xattr_handler *handler; handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name); if (!handler) @@ -629,7 +629,7 @@ generic_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t s ssize_t generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) { - struct xattr_handler *handler, **handlers = dentry->d_sb->s_xattr; + const struct xattr_handler *handler, **handlers = dentry->d_sb->s_xattr; unsigned int size = 0; if (!buffer) { @@ -659,7 +659,7 @@ generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) int generic_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { - struct xattr_handler *handler; + const struct xattr_handler *handler; if (size == 0) value = ""; /* empty EA, do not remove */ @@ -676,7 +676,7 @@ generic_setxattr(struct dentry *dentry, const char *name, const void *value, siz int generic_removexattr(struct dentry *dentry, const char *name) { - struct xattr_handler *handler; + const struct xattr_handler *handler; handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name); if (!handler) diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index b4769e40e8b..c8fb13f83b3 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -77,6 +77,7 @@ xfs-y += xfs_alloc.o \ xfs_itable.o \ xfs_dfrag.o \ xfs_log.o \ + xfs_log_cil.o \ xfs_log_recover.o \ xfs_mount.o \ xfs_mru_cache.o \ diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c index a7bc925c4d6..9f769b5b38f 100644 --- a/fs/xfs/linux-2.6/xfs_acl.c +++ b/fs/xfs/linux-2.6/xfs_acl.c @@ -440,14 +440,14 @@ xfs_xattr_acl_set(struct dentry *dentry, const char *name, return error; } -struct xattr_handler xfs_xattr_acl_access_handler = { +const struct xattr_handler xfs_xattr_acl_access_handler = { .prefix = POSIX_ACL_XATTR_ACCESS, .flags = ACL_TYPE_ACCESS, .get = xfs_xattr_acl_get, .set = xfs_xattr_acl_set, }; -struct xattr_handler xfs_xattr_acl_default_handler = { +const struct xattr_handler xfs_xattr_acl_default_handler = { .prefix = POSIX_ACL_XATTR_DEFAULT, .flags = ACL_TYPE_DEFAULT, .get = xfs_xattr_acl_get, diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 0f8b9968a80..089eaca860b 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -45,6 +45,15 @@ #include <linux/pagevec.h> #include <linux/writeback.h> +/* + * Types of I/O for bmap clustering and I/O completion tracking. + */ +enum { + IO_READ, /* mapping for a read */ + IO_DELAY, /* mapping covers delalloc region */ + IO_UNWRITTEN, /* mapping covers allocated but uninitialized data */ + IO_NEW /* just allocated */ +}; /* * Prime number of hash buckets since address is used as the key. @@ -103,8 +112,9 @@ xfs_count_page_state( STATIC struct block_device * xfs_find_bdev_for_inode( - struct xfs_inode *ip) + struct inode *inode) { + struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; if (XFS_IS_REALTIME_INODE(ip)) @@ -183,7 +193,7 @@ xfs_setfilesize( xfs_fsize_t isize; ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG); - ASSERT(ioend->io_type != IOMAP_READ); + ASSERT(ioend->io_type != IO_READ); if (unlikely(ioend->io_error)) return 0; @@ -214,7 +224,7 @@ xfs_finish_ioend( if (atomic_dec_and_test(&ioend->io_remaining)) { struct workqueue_struct *wq; - wq = (ioend->io_type == IOMAP_UNWRITTEN) ? + wq = (ioend->io_type == IO_UNWRITTEN) ? xfsconvertd_workqueue : xfsdatad_workqueue; queue_work(wq, &ioend->io_work); if (wait) @@ -237,7 +247,7 @@ xfs_end_io( * For unwritten extents we need to issue transactions to convert a * range to normal written extens after the data I/O has finished. */ - if (ioend->io_type == IOMAP_UNWRITTEN && + if (ioend->io_type == IO_UNWRITTEN && likely(!ioend->io_error && !XFS_FORCED_SHUTDOWN(ip->i_mount))) { error = xfs_iomap_write_unwritten(ip, ioend->io_offset, @@ -250,7 +260,7 @@ xfs_end_io( * We might have to update the on-disk file size after extending * writes. */ - if (ioend->io_type != IOMAP_READ) { + if (ioend->io_type != IO_READ) { error = xfs_setfilesize(ioend); ASSERT(!error || error == EAGAIN); } @@ -309,21 +319,25 @@ xfs_map_blocks( struct inode *inode, loff_t offset, ssize_t count, - xfs_iomap_t *mapp, + struct xfs_bmbt_irec *imap, int flags) { int nmaps = 1; + int new = 0; - return -xfs_iomap(XFS_I(inode), offset, count, flags, mapp, &nmaps); + return -xfs_iomap(XFS_I(inode), offset, count, flags, imap, &nmaps, &new); } STATIC int -xfs_iomap_valid( - xfs_iomap_t *iomapp, - loff_t offset) +xfs_imap_valid( + struct inode *inode, + struct xfs_bmbt_irec *imap, + xfs_off_t offset) { - return offset >= iomapp->iomap_offset && - offset < iomapp->iomap_offset + iomapp->iomap_bsize; + offset >>= inode->i_blkbits; + + return offset >= imap->br_startoff && + offset < imap->br_startoff + imap->br_blockcount; } /* @@ -554,19 +568,23 @@ xfs_add_to_ioend( STATIC void xfs_map_buffer( + struct inode *inode, struct buffer_head *bh, - xfs_iomap_t *mp, - xfs_off_t offset, - uint block_bits) + struct xfs_bmbt_irec *imap, + xfs_off_t offset) { sector_t bn; + struct xfs_mount *m = XFS_I(inode)->i_mount; + xfs_off_t iomap_offset = XFS_FSB_TO_B(m, imap->br_startoff); + xfs_daddr_t iomap_bn = xfs_fsb_to_db(XFS_I(inode), imap->br_startblock); - ASSERT(mp->iomap_bn != IOMAP_DADDR_NULL); + ASSERT(imap->br_startblock != HOLESTARTBLOCK); + ASSERT(imap->br_startblock != DELAYSTARTBLOCK); - bn = (mp->iomap_bn >> (block_bits - BBSHIFT)) + - ((offset - mp->iomap_offset) >> block_bits); + bn = (iomap_bn >> (inode->i_blkbits - BBSHIFT)) + + ((offset - iomap_offset) >> inode->i_blkbits); - ASSERT(bn || (mp->iomap_flags & IOMAP_REALTIME)); + ASSERT(bn || XFS_IS_REALTIME_INODE(XFS_I(inode))); bh->b_blocknr = bn; set_buffer_mapped(bh); @@ -574,17 +592,17 @@ xfs_map_buffer( STATIC void xfs_map_at_offset( + struct inode *inode, struct buffer_head *bh, - loff_t offset, - int block_bits, - xfs_iomap_t *iomapp) + struct xfs_bmbt_irec *imap, + xfs_off_t offset) { - ASSERT(!(iomapp->iomap_flags & IOMAP_HOLE)); - ASSERT(!(iomapp->iomap_flags & IOMAP_DELAY)); + ASSERT(imap->br_startblock != HOLESTARTBLOCK); + ASSERT(imap->br_startblock != DELAYSTARTBLOCK); lock_buffer(bh); - xfs_map_buffer(bh, iomapp, offset, block_bits); - bh->b_bdev = iomapp->iomap_target->bt_bdev; + xfs_map_buffer(inode, bh, imap, offset); + bh->b_bdev = xfs_find_bdev_for_inode(inode); set_buffer_mapped(bh); clear_buffer_delay(bh); clear_buffer_unwritten(bh); @@ -713,11 +731,11 @@ xfs_is_delayed_page( bh = head = page_buffers(page); do { if (buffer_unwritten(bh)) - acceptable = (type == IOMAP_UNWRITTEN); + acceptable = (type == IO_UNWRITTEN); else if (buffer_delay(bh)) - acceptable = (type == IOMAP_DELAY); + acceptable = (type == IO_DELAY); else if (buffer_dirty(bh) && buffer_mapped(bh)) - acceptable = (type == IOMAP_NEW); + acceptable = (type == IO_NEW); else break; } while ((bh = bh->b_this_page) != head); @@ -740,7 +758,7 @@ xfs_convert_page( struct inode *inode, struct page *page, loff_t tindex, - xfs_iomap_t *mp, + struct xfs_bmbt_irec *imap, xfs_ioend_t **ioendp, struct writeback_control *wbc, int startio, @@ -750,7 +768,6 @@ xfs_convert_page( xfs_off_t end_offset; unsigned long p_offset; unsigned int type; - int bbits = inode->i_blkbits; int len, page_dirty; int count = 0, done = 0, uptodate = 1; xfs_off_t offset = page_offset(page); @@ -802,19 +819,19 @@ xfs_convert_page( if (buffer_unwritten(bh) || buffer_delay(bh)) { if (buffer_unwritten(bh)) - type = IOMAP_UNWRITTEN; + type = IO_UNWRITTEN; else - type = IOMAP_DELAY; + type = IO_DELAY; - if (!xfs_iomap_valid(mp, offset)) { + if (!xfs_imap_valid(inode, imap, offset)) { done = 1; continue; } - ASSERT(!(mp->iomap_flags & IOMAP_HOLE)); - ASSERT(!(mp->iomap_flags & IOMAP_DELAY)); + ASSERT(imap->br_startblock != HOLESTARTBLOCK); + ASSERT(imap->br_startblock != DELAYSTARTBLOCK); - xfs_map_at_offset(bh, offset, bbits, mp); + xfs_map_at_offset(inode, bh, imap, offset); if (startio) { xfs_add_to_ioend(inode, bh, offset, type, ioendp, done); @@ -826,7 +843,7 @@ xfs_convert_page( page_dirty--; count++; } else { - type = IOMAP_NEW; + type = IO_NEW; if (buffer_mapped(bh) && all_bh && startio) { lock_buffer(bh); xfs_add_to_ioend(inode, bh, offset, @@ -866,7 +883,7 @@ STATIC void xfs_cluster_write( struct inode *inode, pgoff_t tindex, - xfs_iomap_t *iomapp, + struct xfs_bmbt_irec *imap, xfs_ioend_t **ioendp, struct writeback_control *wbc, int startio, @@ -885,7 +902,7 @@ xfs_cluster_write( for (i = 0; i < pagevec_count(&pvec); i++) { done = xfs_convert_page(inode, pvec.pages[i], tindex++, - iomapp, ioendp, wbc, startio, all_bh); + imap, ioendp, wbc, startio, all_bh); if (done) break; } @@ -930,7 +947,7 @@ xfs_aops_discard_page( loff_t offset = page_offset(page); ssize_t len = 1 << inode->i_blkbits; - if (!xfs_is_delayed_page(page, IOMAP_DELAY)) + if (!xfs_is_delayed_page(page, IO_DELAY)) goto out_invalidate; if (XFS_FORCED_SHUTDOWN(ip->i_mount)) @@ -1042,15 +1059,15 @@ xfs_page_state_convert( int unmapped) /* also implies page uptodate */ { struct buffer_head *bh, *head; - xfs_iomap_t iomap; + struct xfs_bmbt_irec imap; xfs_ioend_t *ioend = NULL, *iohead = NULL; loff_t offset; unsigned long p_offset = 0; unsigned int type; __uint64_t end_offset; - pgoff_t end_index, last_index, tlast; + pgoff_t end_index, last_index; ssize_t size, len; - int flags, err, iomap_valid = 0, uptodate = 1; + int flags, err, imap_valid = 0, uptodate = 1; int page_dirty, count = 0; int trylock = 0; int all_bh = unmapped; @@ -1097,7 +1114,7 @@ xfs_page_state_convert( bh = head = page_buffers(page); offset = page_offset(page); flags = BMAPI_READ; - type = IOMAP_NEW; + type = IO_NEW; /* TODO: cleanup count and page_dirty */ @@ -1111,12 +1128,12 @@ xfs_page_state_convert( * the iomap is actually still valid, but the ioend * isn't. shouldn't happen too often. */ - iomap_valid = 0; + imap_valid = 0; continue; } - if (iomap_valid) - iomap_valid = xfs_iomap_valid(&iomap, offset); + if (imap_valid) + imap_valid = xfs_imap_valid(inode, &imap, offset); /* * First case, map an unwritten extent and prepare for @@ -1137,20 +1154,20 @@ xfs_page_state_convert( * Make sure we don't use a read-only iomap */ if (flags == BMAPI_READ) - iomap_valid = 0; + imap_valid = 0; if (buffer_unwritten(bh)) { - type = IOMAP_UNWRITTEN; + type = IO_UNWRITTEN; flags = BMAPI_WRITE | BMAPI_IGNSTATE; } else if (buffer_delay(bh)) { - type = IOMAP_DELAY; + type = IO_DELAY; flags = BMAPI_ALLOCATE | trylock; } else { - type = IOMAP_NEW; + type = IO_NEW; flags = BMAPI_WRITE | BMAPI_MMAP; } - if (!iomap_valid) { + if (!imap_valid) { /* * if we didn't have a valid mapping then we * need to ensure that we put the new mapping @@ -1160,7 +1177,7 @@ xfs_page_state_convert( * for unwritten extent conversion. */ new_ioend = 1; - if (type == IOMAP_NEW) { + if (type == IO_NEW) { size = xfs_probe_cluster(inode, page, bh, head, 0); } else { @@ -1168,14 +1185,14 @@ xfs_page_state_convert( } err = xfs_map_blocks(inode, offset, size, - &iomap, flags); + &imap, flags); if (err) goto error; - iomap_valid = xfs_iomap_valid(&iomap, offset); + imap_valid = xfs_imap_valid(inode, &imap, + offset); } - if (iomap_valid) { - xfs_map_at_offset(bh, offset, - inode->i_blkbits, &iomap); + if (imap_valid) { + xfs_map_at_offset(inode, bh, &imap, offset); if (startio) { xfs_add_to_ioend(inode, bh, offset, type, &ioend, @@ -1194,40 +1211,41 @@ xfs_page_state_convert( * That means it must already have extents allocated * underneath it. Map the extent by reading it. */ - if (!iomap_valid || flags != BMAPI_READ) { + if (!imap_valid || flags != BMAPI_READ) { flags = BMAPI_READ; size = xfs_probe_cluster(inode, page, bh, head, 1); err = xfs_map_blocks(inode, offset, size, - &iomap, flags); + &imap, flags); if (err) goto error; - iomap_valid = xfs_iomap_valid(&iomap, offset); + imap_valid = xfs_imap_valid(inode, &imap, + offset); } /* - * We set the type to IOMAP_NEW in case we are doing a + * We set the type to IO_NEW in case we are doing a * small write at EOF that is extending the file but * without needing an allocation. We need to update the * file size on I/O completion in this case so it is * the same case as having just allocated a new extent * that we are writing into for the first time. */ - type = IOMAP_NEW; + type = IO_NEW; if (trylock_buffer(bh)) { ASSERT(buffer_mapped(bh)); - if (iomap_valid) + if (imap_valid) all_bh = 1; xfs_add_to_ioend(inode, bh, offset, type, - &ioend, !iomap_valid); + &ioend, !imap_valid); page_dirty--; count++; } else { - iomap_valid = 0; + imap_valid = 0; } } else if ((buffer_uptodate(bh) || PageUptodate(page)) && (unmapped || startio)) { - iomap_valid = 0; + imap_valid = 0; } if (!iohead) @@ -1241,12 +1259,23 @@ xfs_page_state_convert( if (startio) xfs_start_page_writeback(page, 1, count); - if (ioend && iomap_valid) { - offset = (iomap.iomap_offset + iomap.iomap_bsize - 1) >> - PAGE_CACHE_SHIFT; - tlast = min_t(pgoff_t, offset, last_index); - xfs_cluster_write(inode, page->index + 1, &iomap, &ioend, - wbc, startio, all_bh, tlast); + if (ioend && imap_valid) { + xfs_off_t end_index; + + end_index = imap.br_startoff + imap.br_blockcount; + + /* to bytes */ + end_index <<= inode->i_blkbits; + + /* to pages */ + end_index = (end_index - 1) >> PAGE_CACHE_SHIFT; + + /* check against file size */ + if (end_index > last_index) + end_index = last_index; + + xfs_cluster_write(inode, page->index + 1, &imap, &ioend, + wbc, startio, all_bh, end_index); } if (iohead) @@ -1448,10 +1477,11 @@ __xfs_get_blocks( int direct, bmapi_flags_t flags) { - xfs_iomap_t iomap; + struct xfs_bmbt_irec imap; xfs_off_t offset; ssize_t size; - int niomap = 1; + int nimap = 1; + int new = 0; int error; offset = (xfs_off_t)iblock << inode->i_blkbits; @@ -1462,22 +1492,21 @@ __xfs_get_blocks( return 0; error = xfs_iomap(XFS_I(inode), offset, size, - create ? flags : BMAPI_READ, &iomap, &niomap); + create ? flags : BMAPI_READ, &imap, &nimap, &new); if (error) return -error; - if (niomap == 0) + if (nimap == 0) return 0; - if (iomap.iomap_bn != IOMAP_DADDR_NULL) { + if (imap.br_startblock != HOLESTARTBLOCK && + imap.br_startblock != DELAYSTARTBLOCK) { /* * For unwritten extents do not report a disk address on * the read case (treat as if we're reading into a hole). */ - if (create || !(iomap.iomap_flags & IOMAP_UNWRITTEN)) { - xfs_map_buffer(bh_result, &iomap, offset, - inode->i_blkbits); - } - if (create && (iomap.iomap_flags & IOMAP_UNWRITTEN)) { + if (create || !ISUNWRITTEN(&imap)) + xfs_map_buffer(inode, bh_result, &imap, offset); + if (create && ISUNWRITTEN(&imap)) { if (direct) bh_result->b_private = inode; set_buffer_unwritten(bh_result); @@ -1488,7 +1517,7 @@ __xfs_get_blocks( * If this is a realtime file, data may be on a different device. * to that pointed to from the buffer_head b_bdev currently. */ - bh_result->b_bdev = iomap.iomap_target->bt_bdev; + bh_result->b_bdev = xfs_find_bdev_for_inode(inode); /* * If we previously allocated a block out beyond eof and we are now @@ -1502,10 +1531,10 @@ __xfs_get_blocks( if (create && ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) || (offset >= i_size_read(inode)) || - (iomap.iomap_flags & (IOMAP_NEW|IOMAP_UNWRITTEN)))) + (new || ISUNWRITTEN(&imap)))) set_buffer_new(bh_result); - if (iomap.iomap_flags & IOMAP_DELAY) { + if (imap.br_startblock == DELAYSTARTBLOCK) { BUG_ON(direct); if (create) { set_buffer_uptodate(bh_result); @@ -1514,11 +1543,23 @@ __xfs_get_blocks( } } + /* + * If this is O_DIRECT or the mpage code calling tell them how large + * the mapping is, so that we can avoid repeated get_blocks calls. + */ if (direct || size > (1 << inode->i_blkbits)) { - ASSERT(iomap.iomap_bsize - iomap.iomap_delta > 0); - offset = min_t(xfs_off_t, - iomap.iomap_bsize - iomap.iomap_delta, size); - bh_result->b_size = (ssize_t)min_t(xfs_off_t, LONG_MAX, offset); + xfs_off_t mapping_size; + + mapping_size = imap.br_startoff + imap.br_blockcount - iblock; + mapping_size <<= inode->i_blkbits; + + ASSERT(mapping_size > 0); + if (mapping_size > size) + mapping_size = size; + if (mapping_size > LONG_MAX) + mapping_size = LONG_MAX; + + bh_result->b_size = mapping_size; } return 0; @@ -1576,7 +1617,7 @@ xfs_end_io_direct( */ ioend->io_offset = offset; ioend->io_size = size; - if (ioend->io_type == IOMAP_READ) { + if (ioend->io_type == IO_READ) { xfs_finish_ioend(ioend, 0); } else if (private && size > 0) { xfs_finish_ioend(ioend, is_sync_kiocb(iocb)); @@ -1587,7 +1628,7 @@ xfs_end_io_direct( * didn't map an unwritten extent so switch it's completion * handler. */ - ioend->io_type = IOMAP_NEW; + ioend->io_type = IO_NEW; xfs_finish_ioend(ioend, 0); } @@ -1612,10 +1653,10 @@ xfs_vm_direct_IO( struct block_device *bdev; ssize_t ret; - bdev = xfs_find_bdev_for_inode(XFS_I(inode)); + bdev = xfs_find_bdev_for_inode(inode); iocb->private = xfs_alloc_ioend(inode, rw == WRITE ? - IOMAP_UNWRITTEN : IOMAP_READ); + IO_UNWRITTEN : IO_READ); ret = blockdev_direct_IO_no_locking(rw, iocb, inode, bdev, iov, offset, nr_segs, diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 44c2b0ef9a4..649ade8ef59 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -37,6 +37,7 @@ #include "xfs_sb.h" #include "xfs_inum.h" +#include "xfs_log.h" #include "xfs_ag.h" #include "xfs_dmapi.h" #include "xfs_mount.h" @@ -850,6 +851,12 @@ xfs_buf_lock_value( * Note that this in no way locks the underlying pages, so it is only * useful for synchronizing concurrent use of buffer objects, not for * synchronizing independent access to the underlying pages. + * + * If we come across a stale, pinned, locked buffer, we know that we + * are being asked to lock a buffer that has been reallocated. Because + * it is pinned, we know that the log has not been pushed to disk and + * hence it will still be locked. Rather than sleeping until someone + * else pushes the log, push it ourselves before trying to get the lock. */ void xfs_buf_lock( @@ -857,6 +864,8 @@ xfs_buf_lock( { trace_xfs_buf_lock(bp, _RET_IP_); + if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE)) + xfs_log_force(bp->b_mount, 0); if (atomic_read(&bp->b_io_remaining)) blk_run_address_space(bp->b_target->bt_mapping); down(&bp->b_sema); @@ -1007,25 +1016,20 @@ xfs_bwrite( struct xfs_mount *mp, struct xfs_buf *bp) { - int iowait = (bp->b_flags & XBF_ASYNC) == 0; - int error = 0; + int error; bp->b_strat = xfs_bdstrat_cb; bp->b_mount = mp; bp->b_flags |= XBF_WRITE; - if (!iowait) - bp->b_flags |= _XBF_RUN_QUEUES; + bp->b_flags &= ~(XBF_ASYNC | XBF_READ); xfs_buf_delwri_dequeue(bp); xfs_buf_iostrategy(bp); - if (iowait) { - error = xfs_buf_iowait(bp); - if (error) - xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); - xfs_buf_relse(bp); - } - + error = xfs_buf_iowait(bp); + if (error) + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); + xfs_buf_relse(bp); return error; } @@ -1614,7 +1618,8 @@ xfs_mapping_buftarg( STATIC int xfs_alloc_delwrite_queue( - xfs_buftarg_t *btp) + xfs_buftarg_t *btp, + const char *fsname) { int error = 0; @@ -1622,7 +1627,7 @@ xfs_alloc_delwrite_queue( INIT_LIST_HEAD(&btp->bt_delwrite_queue); spin_lock_init(&btp->bt_delwrite_lock); btp->bt_flags = 0; - btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd"); + btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname); if (IS_ERR(btp->bt_task)) { error = PTR_ERR(btp->bt_task); goto out_error; @@ -1635,7 +1640,8 @@ out_error: xfs_buftarg_t * xfs_alloc_buftarg( struct block_device *bdev, - int external) + int external, + const char *fsname) { xfs_buftarg_t *btp; @@ -1647,7 +1653,7 @@ xfs_alloc_buftarg( goto error; if (xfs_mapping_buftarg(btp, bdev)) goto error; - if (xfs_alloc_delwrite_queue(btp)) + if (xfs_alloc_delwrite_queue(btp, fsname)) goto error; xfs_alloc_bufhash(btp, external); return btp; diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index 386e7361e50..5fbecefa5df 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -390,7 +390,7 @@ static inline void xfs_buf_relse(xfs_buf_t *bp) /* * Handling of buftargs. */ -extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int); +extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int, const char *); extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *); extern void xfs_wait_buftarg(xfs_buftarg_t *); extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int); diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index 42dd3bcfba6..d8fb1b5d6cb 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -115,6 +115,8 @@ xfs_file_fsync( xfs_iflags_clear(ip, XFS_ITRUNCATED); + xfs_ioend_wait(ip); + /* * We always need to make sure that the required inode state is safe on * disk. The inode might be clean but we still might need to force the diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c index 7b26cc2fd28..699b60cbab9 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/linux-2.6/xfs_ioctl.c @@ -527,6 +527,10 @@ xfs_attrmulti_by_handle( if (copy_from_user(&am_hreq, arg, sizeof(xfs_fsop_attrmulti_handlereq_t))) return -XFS_ERROR(EFAULT); + /* overflow check */ + if (am_hreq.opcount >= INT_MAX / sizeof(xfs_attr_multiop_t)) + return -E2BIG; + dentry = xfs_handlereq_to_dentry(parfilp, &am_hreq.hreq); if (IS_ERR(dentry)) return PTR_ERR(dentry); diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c index 593c05b4df8..9287135e9bf 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl32.c +++ b/fs/xfs/linux-2.6/xfs_ioctl32.c @@ -420,6 +420,10 @@ xfs_compat_attrmulti_by_handle( sizeof(compat_xfs_fsop_attrmulti_handlereq_t))) return -XFS_ERROR(EFAULT); + /* overflow check */ + if (am_hreq.opcount >= INT_MAX / sizeof(compat_xfs_attr_multiop_t)) + return -E2BIG; + dentry = xfs_compat_handlereq_to_dentry(parfilp, &am_hreq.hreq); if (IS_ERR(dentry)) return PTR_ERR(dentry); diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index e65a7937f3a..9c8019c78c9 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -673,7 +673,10 @@ xfs_vn_fiemap( bm.bmv_length = BTOBB(length); /* We add one because in getbmap world count includes the header */ - bm.bmv_count = fieinfo->fi_extents_max + 1; + bm.bmv_count = !fieinfo->fi_extents_max ? MAXEXTNUM : + fieinfo->fi_extents_max + 1; + bm.bmv_count = min_t(__s32, bm.bmv_count, + (PAGE_SIZE * 16 / sizeof(struct getbmapx))); bm.bmv_iflags = BMV_IF_PREALLOC; if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) bm.bmv_iflags |= BMV_IF_ATTRFORK; diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c index 1947514ce1a..9ac8aea9152 100644 --- a/fs/xfs/linux-2.6/xfs_quotaops.c +++ b/fs/xfs/linux-2.6/xfs_quotaops.c @@ -19,6 +19,7 @@ #include "xfs_dmapi.h" #include "xfs_sb.h" #include "xfs_inum.h" +#include "xfs_log.h" #include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_quota.h" @@ -97,7 +98,7 @@ xfs_fs_set_xstate( } STATIC int -xfs_fs_get_xquota( +xfs_fs_get_dqblk( struct super_block *sb, int type, qid_t id, @@ -114,7 +115,7 @@ xfs_fs_get_xquota( } STATIC int -xfs_fs_set_xquota( +xfs_fs_set_dqblk( struct super_block *sb, int type, qid_t id, @@ -135,6 +136,6 @@ xfs_fs_set_xquota( const struct quotactl_ops xfs_quotactl_operations = { .get_xstate = xfs_fs_get_xstate, .set_xstate = xfs_fs_set_xstate, - .get_xquota = xfs_fs_get_xquota, - .set_xquota = xfs_fs_set_xquota, + .get_dqblk = xfs_fs_get_dqblk, + .set_dqblk = xfs_fs_set_dqblk, }; diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 29f1edca76d..f2d1718c916 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -119,6 +119,8 @@ mempool_t *xfs_ioend_pool; #define MNTOPT_DMAPI "dmapi" /* DMI enabled (DMAPI / XDSM) */ #define MNTOPT_XDSM "xdsm" /* DMI enabled (DMAPI / XDSM) */ #define MNTOPT_DMI "dmi" /* DMI enabled (DMAPI / XDSM) */ +#define MNTOPT_DELAYLOG "delaylog" /* Delayed loging enabled */ +#define MNTOPT_NODELAYLOG "nodelaylog" /* Delayed loging disabled */ /* * Table driven mount option parser. @@ -374,6 +376,13 @@ xfs_parseargs( mp->m_flags |= XFS_MOUNT_DMAPI; } else if (!strcmp(this_char, MNTOPT_DMI)) { mp->m_flags |= XFS_MOUNT_DMAPI; + } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) { + mp->m_flags |= XFS_MOUNT_DELAYLOG; + cmn_err(CE_WARN, + "Enabling EXPERIMENTAL delayed logging feature " + "- use at your own risk.\n"); + } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) { + mp->m_flags &= ~XFS_MOUNT_DELAYLOG; } else if (!strcmp(this_char, "ihashsize")) { cmn_err(CE_WARN, "XFS: ihashsize no longer used, option is deprecated."); @@ -535,6 +544,7 @@ xfs_showargs( { XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM }, { XFS_MOUNT_DMAPI, "," MNTOPT_DMAPI }, { XFS_MOUNT_GRPID, "," MNTOPT_GRPID }, + { XFS_MOUNT_DELAYLOG, "," MNTOPT_DELAYLOG }, { 0, NULL } }; static struct proc_xfs_info xfs_info_unset[] = { @@ -725,7 +735,8 @@ void xfs_blkdev_issue_flush( xfs_buftarg_t *buftarg) { - blkdev_issue_flush(buftarg->bt_bdev, NULL); + blkdev_issue_flush(buftarg->bt_bdev, GFP_KERNEL, NULL, + BLKDEV_IFL_WAIT); } STATIC void @@ -789,18 +800,18 @@ xfs_open_devices( * Setup xfs_mount buffer target pointers */ error = ENOMEM; - mp->m_ddev_targp = xfs_alloc_buftarg(ddev, 0); + mp->m_ddev_targp = xfs_alloc_buftarg(ddev, 0, mp->m_fsname); if (!mp->m_ddev_targp) goto out_close_rtdev; if (rtdev) { - mp->m_rtdev_targp = xfs_alloc_buftarg(rtdev, 1); + mp->m_rtdev_targp = xfs_alloc_buftarg(rtdev, 1, mp->m_fsname); if (!mp->m_rtdev_targp) goto out_free_ddev_targ; } if (logdev && logdev != ddev) { - mp->m_logdev_targp = xfs_alloc_buftarg(logdev, 1); + mp->m_logdev_targp = xfs_alloc_buftarg(logdev, 1, mp->m_fsname); if (!mp->m_logdev_targp) goto out_free_rtdev_targ; } else { @@ -902,7 +913,8 @@ xfsaild_start( struct xfs_ail *ailp) { ailp->xa_target = 0; - ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild"); + ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild/%s", + ailp->xa_mount->m_fsname); if (IS_ERR(ailp->xa_task)) return -PTR_ERR(ailp->xa_task); return 0; @@ -1092,6 +1104,7 @@ xfs_fs_write_inode( * the code will only flush the inode if it isn't already * being flushed. */ + xfs_ioend_wait(ip); xfs_ilock(ip, XFS_ILOCK_SHARED); if (ip->i_update_core) { error = xfs_log_inode(ip); @@ -1752,7 +1765,7 @@ xfs_init_zones(void) * but it is much faster. */ xfs_buf_item_zone = kmem_zone_init((sizeof(xfs_buf_log_item_t) + - (((XFS_MAX_BLOCKSIZE / XFS_BLI_CHUNK) / + (((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) / NBWORD) * sizeof(int))), "xfs_buf_item"); if (!xfs_buf_item_zone) goto out_destroy_trans_zone; diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h index 233d4b9881b..519618e9279 100644 --- a/fs/xfs/linux-2.6/xfs_super.h +++ b/fs/xfs/linux-2.6/xfs_super.h @@ -85,7 +85,7 @@ extern __uint64_t xfs_max_file_offset(unsigned int); extern void xfs_blkdev_issue_flush(struct xfs_buftarg *); extern const struct export_operations xfs_export_operations; -extern struct xattr_handler *xfs_xattr_handlers[]; +extern const struct xattr_handler *xfs_xattr_handlers[]; extern const struct quotactl_ops xfs_quotactl_operations; #define XFS_M(sb) ((struct xfs_mount *)((sb)->s_fs_info)) diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index a427c638d90..3884e20bc14 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -356,68 +356,23 @@ xfs_commit_dummy_trans( STATIC int xfs_sync_fsdata( - struct xfs_mount *mp, - int flags) + struct xfs_mount *mp) { struct xfs_buf *bp; - struct xfs_buf_log_item *bip; - int error = 0; /* - * If this is xfssyncd() then only sync the superblock if we can - * lock it without sleeping and it is not pinned. + * If the buffer is pinned then push on the log so we won't get stuck + * waiting in the write for someone, maybe ourselves, to flush the log. + * + * Even though we just pushed the log above, we did not have the + * superblock buffer locked at that point so it can become pinned in + * between there and here. */ - if (flags & SYNC_TRYLOCK) { - ASSERT(!(flags & SYNC_WAIT)); - - bp = xfs_getsb(mp, XBF_TRYLOCK); - if (!bp) - goto out; - - bip = XFS_BUF_FSPRIVATE(bp, struct xfs_buf_log_item *); - if (!bip || !xfs_buf_item_dirty(bip) || XFS_BUF_ISPINNED(bp)) - goto out_brelse; - } else { - bp = xfs_getsb(mp, 0); - - /* - * If the buffer is pinned then push on the log so we won't - * get stuck waiting in the write for someone, maybe - * ourselves, to flush the log. - * - * Even though we just pushed the log above, we did not have - * the superblock buffer locked at that point so it can - * become pinned in between there and here. - */ - if (XFS_BUF_ISPINNED(bp)) - xfs_log_force(mp, 0); - } - - - if (flags & SYNC_WAIT) - XFS_BUF_UNASYNC(bp); - else - XFS_BUF_ASYNC(bp); - - error = xfs_bwrite(mp, bp); - if (error) - return error; - - /* - * If this is a data integrity sync make sure all pending buffers - * are flushed out for the log coverage check below. - */ - if (flags & SYNC_WAIT) - xfs_flush_buftarg(mp->m_ddev_targp, 1); - - if (xfs_log_need_covered(mp)) - error = xfs_commit_dummy_trans(mp, flags); - return error; + bp = xfs_getsb(mp, 0); + if (XFS_BUF_ISPINNED(bp)) + xfs_log_force(mp, 0); - out_brelse: - xfs_buf_relse(bp); - out: - return error; + return xfs_bwrite(mp, bp); } /* @@ -441,7 +396,7 @@ int xfs_quiesce_data( struct xfs_mount *mp) { - int error; + int error, error2 = 0; /* push non-blocking */ xfs_sync_data(mp, 0); @@ -452,13 +407,20 @@ xfs_quiesce_data( xfs_qm_sync(mp, SYNC_WAIT); /* write superblock and hoover up shutdown errors */ - error = xfs_sync_fsdata(mp, SYNC_WAIT); + error = xfs_sync_fsdata(mp); + + /* make sure all delwri buffers are written out */ + xfs_flush_buftarg(mp->m_ddev_targp, 1); + + /* mark the log as covered if needed */ + if (xfs_log_need_covered(mp)) + error2 = xfs_commit_dummy_trans(mp, SYNC_WAIT); /* flush data-only devices */ if (mp->m_rtdev_targp) XFS_bflush(mp->m_rtdev_targp); - return error; + return error ? error : error2; } STATIC void @@ -581,9 +543,9 @@ xfs_flush_inodes( } /* - * Every sync period we need to unpin all items, reclaim inodes, sync - * quota and write out the superblock. We might need to cover the log - * to indicate it is idle. + * Every sync period we need to unpin all items, reclaim inodes and sync + * disk quotas. We might need to cover the log to indicate that the + * filesystem is idle. */ STATIC void xfs_sync_worker( @@ -597,7 +559,8 @@ xfs_sync_worker( xfs_reclaim_inodes(mp, 0); /* dgc: errors ignored here */ error = xfs_qm_sync(mp, SYNC_TRYLOCK); - error = xfs_sync_fsdata(mp, SYNC_TRYLOCK); + if (xfs_log_need_covered(mp)) + error = xfs_commit_dummy_trans(mp, 0); } mp->m_sync_seq++; wake_up(&mp->m_wait_single_sync_task); @@ -660,7 +623,7 @@ xfs_syncd_init( mp->m_sync_work.w_syncer = xfs_sync_worker; mp->m_sync_work.w_mount = mp; mp->m_sync_work.w_completion = NULL; - mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd"); + mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd/%s", mp->m_fsname); if (IS_ERR(mp->m_sync_task)) return -PTR_ERR(mp->m_sync_task); return 0; diff --git a/fs/xfs/linux-2.6/xfs_trace.c b/fs/xfs/linux-2.6/xfs_trace.c index 5a107601e96..207fa77f63a 100644 --- a/fs/xfs/linux-2.6/xfs_trace.c +++ b/fs/xfs/linux-2.6/xfs_trace.c @@ -41,7 +41,6 @@ #include "xfs_alloc.h" #include "xfs_bmap.h" #include "xfs_attr.h" -#include "xfs_attr_sf.h" #include "xfs_attr_leaf.h" #include "xfs_log_priv.h" #include "xfs_buf_item.h" @@ -50,6 +49,9 @@ #include "xfs_aops.h" #include "quota/xfs_dquot_item.h" #include "quota/xfs_dquot.h" +#include "xfs_log_recover.h" +#include "xfs_buf_item.h" +#include "xfs_inode_item.h" /* * We include this last to have the helpers above available for the trace diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h index fcaa62f0799..ff6bc797baf 100644 --- a/fs/xfs/linux-2.6/xfs_trace.h +++ b/fs/xfs/linux-2.6/xfs_trace.h @@ -32,6 +32,10 @@ struct xfs_da_node_entry; struct xfs_dquot; struct xlog_ticket; struct log; +struct xlog_recover; +struct xlog_recover_item; +struct xfs_buf_log_format; +struct xfs_inode_log_format; DECLARE_EVENT_CLASS(xfs_attr_list_class, TP_PROTO(struct xfs_attr_list_context *ctx), @@ -562,18 +566,21 @@ DECLARE_EVENT_CLASS(xfs_inode_class, __field(dev_t, dev) __field(xfs_ino_t, ino) __field(int, count) + __field(int, pincount) __field(unsigned long, caller_ip) ), TP_fast_assign( __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->ino = ip->i_ino; __entry->count = atomic_read(&VFS_I(ip)->i_count); + __entry->pincount = atomic_read(&ip->i_pincount); __entry->caller_ip = caller_ip; ), - TP_printk("dev %d:%d ino 0x%llx count %d caller %pf", + TP_printk("dev %d:%d ino 0x%llx count %d pincount %d caller %pf", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->count, + __entry->pincount, (char *)__entry->caller_ip) ) @@ -583,6 +590,10 @@ DEFINE_EVENT(xfs_inode_class, name, \ TP_ARGS(ip, caller_ip)) DEFINE_INODE_EVENT(xfs_ihold); DEFINE_INODE_EVENT(xfs_irele); +DEFINE_INODE_EVENT(xfs_inode_pin); +DEFINE_INODE_EVENT(xfs_inode_unpin); +DEFINE_INODE_EVENT(xfs_inode_unpin_nowait); + /* the old xfs_itrace_entry tracer - to be replaced by s.th. in the VFS */ DEFINE_INODE_EVENT(xfs_inode); #define xfs_itrace_entry(ip) \ @@ -642,8 +653,6 @@ DEFINE_EVENT(xfs_dquot_class, name, \ TP_PROTO(struct xfs_dquot *dqp), \ TP_ARGS(dqp)) DEFINE_DQUOT_EVENT(xfs_dqadjust); -DEFINE_DQUOT_EVENT(xfs_dqshake_dirty); -DEFINE_DQUOT_EVENT(xfs_dqshake_unlink); DEFINE_DQUOT_EVENT(xfs_dqreclaim_want); DEFINE_DQUOT_EVENT(xfs_dqreclaim_dirty); DEFINE_DQUOT_EVENT(xfs_dqreclaim_unlink); @@ -658,7 +667,6 @@ DEFINE_DQUOT_EVENT(xfs_dqread_fail); DEFINE_DQUOT_EVENT(xfs_dqlookup_found); DEFINE_DQUOT_EVENT(xfs_dqlookup_want); DEFINE_DQUOT_EVENT(xfs_dqlookup_freelist); -DEFINE_DQUOT_EVENT(xfs_dqlookup_move); DEFINE_DQUOT_EVENT(xfs_dqlookup_done); DEFINE_DQUOT_EVENT(xfs_dqget_hit); DEFINE_DQUOT_EVENT(xfs_dqget_miss); @@ -1051,83 +1059,112 @@ TRACE_EVENT(xfs_bunmap, ); +#define XFS_BUSY_SYNC \ + { 0, "async" }, \ + { 1, "sync" } + TRACE_EVENT(xfs_alloc_busy, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, - xfs_extlen_t len, int slot), - TP_ARGS(mp, agno, agbno, len, slot), + TP_PROTO(struct xfs_trans *trans, xfs_agnumber_t agno, + xfs_agblock_t agbno, xfs_extlen_t len, int sync), + TP_ARGS(trans, agno, agbno, len, sync), TP_STRUCT__entry( __field(dev_t, dev) + __field(struct xfs_trans *, tp) + __field(int, tid) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, agbno) __field(xfs_extlen_t, len) - __field(int, slot) + __field(int, sync) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; + __entry->dev = trans->t_mountp->m_super->s_dev; + __entry->tp = trans; + __entry->tid = trans->t_ticket->t_tid; __entry->agno = agno; __entry->agbno = agbno; __entry->len = len; - __entry->slot = slot; + __entry->sync = sync; ), - TP_printk("dev %d:%d agno %u agbno %u len %u slot %d", + TP_printk("dev %d:%d trans 0x%p tid 0x%x agno %u agbno %u len %u %s", MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->tp, + __entry->tid, __entry->agno, __entry->agbno, __entry->len, - __entry->slot) + __print_symbolic(__entry->sync, XFS_BUSY_SYNC)) ); -#define XFS_BUSY_STATES \ - { 0, "found" }, \ - { 1, "missing" } - TRACE_EVENT(xfs_alloc_unbusy, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, - int slot, int found), - TP_ARGS(mp, agno, slot, found), + xfs_agblock_t agbno, xfs_extlen_t len), + TP_ARGS(mp, agno, agbno, len), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) - __field(int, slot) - __field(int, found) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->agno = agno; - __entry->slot = slot; - __entry->found = found; + __entry->agbno = agbno; + __entry->len = len; ), - TP_printk("dev %d:%d agno %u slot %d %s", + TP_printk("dev %d:%d agno %u agbno %u len %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, - __entry->slot, - __print_symbolic(__entry->found, XFS_BUSY_STATES)) + __entry->agbno, + __entry->len) ); +#define XFS_BUSY_STATES \ + { 0, "missing" }, \ + { 1, "found" } + TRACE_EVENT(xfs_alloc_busysearch, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, - xfs_extlen_t len, xfs_lsn_t lsn), - TP_ARGS(mp, agno, agbno, len, lsn), + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t agbno, xfs_extlen_t len, int found), + TP_ARGS(mp, agno, agbno, len, found), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, agbno) __field(xfs_extlen_t, len) - __field(xfs_lsn_t, lsn) + __field(int, found) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->agno = agno; __entry->agbno = agbno; __entry->len = len; - __entry->lsn = lsn; + __entry->found = found; ), - TP_printk("dev %d:%d agno %u agbno %u len %u force lsn 0x%llx", + TP_printk("dev %d:%d agno %u agbno %u len %u %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->agbno, __entry->len, + __print_symbolic(__entry->found, XFS_BUSY_STATES)) +); + +TRACE_EVENT(xfs_trans_commit_lsn, + TP_PROTO(struct xfs_trans *trans), + TP_ARGS(trans), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(struct xfs_trans *, tp) + __field(xfs_lsn_t, lsn) + ), + TP_fast_assign( + __entry->dev = trans->t_mountp->m_super->s_dev; + __entry->tp = trans; + __entry->lsn = trans->t_commit_lsn; + ), + TP_printk("dev %d:%d trans 0x%p commit_lsn 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->tp, __entry->lsn) ); @@ -1495,6 +1532,140 @@ DEFINE_EVENT(xfs_swap_extent_class, name, \ DEFINE_SWAPEXT_EVENT(xfs_swap_extent_before); DEFINE_SWAPEXT_EVENT(xfs_swap_extent_after); +DECLARE_EVENT_CLASS(xfs_log_recover_item_class, + TP_PROTO(struct log *log, struct xlog_recover *trans, + struct xlog_recover_item *item, int pass), + TP_ARGS(log, trans, item, pass), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned long, item) + __field(xlog_tid_t, tid) + __field(int, type) + __field(int, pass) + __field(int, count) + __field(int, total) + ), + TP_fast_assign( + __entry->dev = log->l_mp->m_super->s_dev; + __entry->item = (unsigned long)item; + __entry->tid = trans->r_log_tid; + __entry->type = ITEM_TYPE(item); + __entry->pass = pass; + __entry->count = item->ri_cnt; + __entry->total = item->ri_total; + ), + TP_printk("dev %d:%d trans 0x%x, pass %d, item 0x%p, item type %s " + "item region count/total %d/%d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->tid, + __entry->pass, + (void *)__entry->item, + __print_symbolic(__entry->type, XFS_LI_TYPE_DESC), + __entry->count, + __entry->total) +) + +#define DEFINE_LOG_RECOVER_ITEM(name) \ +DEFINE_EVENT(xfs_log_recover_item_class, name, \ + TP_PROTO(struct log *log, struct xlog_recover *trans, \ + struct xlog_recover_item *item, int pass), \ + TP_ARGS(log, trans, item, pass)) + +DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_add); +DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_add_cont); +DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_reorder_head); +DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_reorder_tail); +DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_recover); + +DECLARE_EVENT_CLASS(xfs_log_recover_buf_item_class, + TP_PROTO(struct log *log, struct xfs_buf_log_format *buf_f), + TP_ARGS(log, buf_f), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(__int64_t, blkno) + __field(unsigned short, len) + __field(unsigned short, flags) + __field(unsigned short, size) + __field(unsigned int, map_size) + ), + TP_fast_assign( + __entry->dev = log->l_mp->m_super->s_dev; + __entry->blkno = buf_f->blf_blkno; + __entry->len = buf_f->blf_len; + __entry->flags = buf_f->blf_flags; + __entry->size = buf_f->blf_size; + __entry->map_size = buf_f->blf_map_size; + ), + TP_printk("dev %d:%d blkno 0x%llx, len %u, flags 0x%x, size %d, " + "map_size %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->blkno, + __entry->len, + __entry->flags, + __entry->size, + __entry->map_size) +) + +#define DEFINE_LOG_RECOVER_BUF_ITEM(name) \ +DEFINE_EVENT(xfs_log_recover_buf_item_class, name, \ + TP_PROTO(struct log *log, struct xfs_buf_log_format *buf_f), \ + TP_ARGS(log, buf_f)) + +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_not_cancel); +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel); +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel_add); +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel_ref_inc); +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_recover); +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_inode_buf); +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_reg_buf); +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_dquot_buf); + +DECLARE_EVENT_CLASS(xfs_log_recover_ino_item_class, + TP_PROTO(struct log *log, struct xfs_inode_log_format *in_f), + TP_ARGS(log, in_f), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(unsigned short, size) + __field(int, fields) + __field(unsigned short, asize) + __field(unsigned short, dsize) + __field(__int64_t, blkno) + __field(int, len) + __field(int, boffset) + ), + TP_fast_assign( + __entry->dev = log->l_mp->m_super->s_dev; + __entry->ino = in_f->ilf_ino; + __entry->size = in_f->ilf_size; + __entry->fields = in_f->ilf_fields; + __entry->asize = in_f->ilf_asize; + __entry->dsize = in_f->ilf_dsize; + __entry->blkno = in_f->ilf_blkno; + __entry->len = in_f->ilf_len; + __entry->boffset = in_f->ilf_boffset; + ), + TP_printk("dev %d:%d ino 0x%llx, size %u, fields 0x%x, asize %d, " + "dsize %d, blkno 0x%llx, len %d, boffset %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->size, + __entry->fields, + __entry->asize, + __entry->dsize, + __entry->blkno, + __entry->len, + __entry->boffset) +) +#define DEFINE_LOG_RECOVER_INO_ITEM(name) \ +DEFINE_EVENT(xfs_log_recover_ino_item_class, name, \ + TP_PROTO(struct log *log, struct xfs_inode_log_format *in_f), \ + TP_ARGS(log, in_f)) + +DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_recover); +DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_cancel); +DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_skip); + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/xfs/linux-2.6/xfs_xattr.c b/fs/xfs/linux-2.6/xfs_xattr.c index fa01b9daba6..87d3e03878c 100644 --- a/fs/xfs/linux-2.6/xfs_xattr.c +++ b/fs/xfs/linux-2.6/xfs_xattr.c @@ -72,28 +72,28 @@ xfs_xattr_set(struct dentry *dentry, const char *name, const void *value, (void *)value, size, xflags); } -static struct xattr_handler xfs_xattr_user_handler = { +static const struct xattr_handler xfs_xattr_user_handler = { .prefix = XATTR_USER_PREFIX, .flags = 0, /* no flags implies user namespace */ .get = xfs_xattr_get, .set = xfs_xattr_set, }; -static struct xattr_handler xfs_xattr_trusted_handler = { +static const struct xattr_handler xfs_xattr_trusted_handler = { .prefix = XATTR_TRUSTED_PREFIX, .flags = ATTR_ROOT, .get = xfs_xattr_get, .set = xfs_xattr_set, }; -static struct xattr_handler xfs_xattr_security_handler = { +static const struct xattr_handler xfs_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, .flags = ATTR_SECURE, .get = xfs_xattr_get, .set = xfs_xattr_set, }; -struct xattr_handler *xfs_xattr_handlers[] = { +const struct xattr_handler *xfs_xattr_handlers[] = { &xfs_xattr_user_handler, &xfs_xattr_trusted_handler, &xfs_xattr_security_handler, diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c index 5f79dd78626..585e7633dfc 100644 --- a/fs/xfs/quota/xfs_dquot.c +++ b/fs/xfs/quota/xfs_dquot.c @@ -101,7 +101,7 @@ xfs_qm_dqinit( * No need to re-initialize these if this is a reclaimed dquot. */ if (brandnewdquot) { - dqp->dq_flnext = dqp->dq_flprev = dqp; + INIT_LIST_HEAD(&dqp->q_freelist); mutex_init(&dqp->q_qlock); init_waitqueue_head(&dqp->q_pinwait); @@ -119,20 +119,20 @@ xfs_qm_dqinit( * Only the q_core portion was zeroed in dqreclaim_one(). * So, we need to reset others. */ - dqp->q_nrefs = 0; - dqp->q_blkno = 0; - dqp->MPL_NEXT = dqp->HL_NEXT = NULL; - dqp->HL_PREVP = dqp->MPL_PREVP = NULL; - dqp->q_bufoffset = 0; - dqp->q_fileoffset = 0; - dqp->q_transp = NULL; - dqp->q_gdquot = NULL; - dqp->q_res_bcount = 0; - dqp->q_res_icount = 0; - dqp->q_res_rtbcount = 0; - atomic_set(&dqp->q_pincount, 0); - dqp->q_hash = NULL; - ASSERT(dqp->dq_flnext == dqp->dq_flprev); + dqp->q_nrefs = 0; + dqp->q_blkno = 0; + INIT_LIST_HEAD(&dqp->q_mplist); + INIT_LIST_HEAD(&dqp->q_hashlist); + dqp->q_bufoffset = 0; + dqp->q_fileoffset = 0; + dqp->q_transp = NULL; + dqp->q_gdquot = NULL; + dqp->q_res_bcount = 0; + dqp->q_res_icount = 0; + dqp->q_res_rtbcount = 0; + atomic_set(&dqp->q_pincount, 0); + dqp->q_hash = NULL; + ASSERT(list_empty(&dqp->q_freelist)); trace_xfs_dqreuse(dqp); } @@ -158,7 +158,7 @@ void xfs_qm_dqdestroy( xfs_dquot_t *dqp) { - ASSERT(! XFS_DQ_IS_ON_FREELIST(dqp)); + ASSERT(list_empty(&dqp->q_freelist)); mutex_destroy(&dqp->q_qlock); sv_destroy(&dqp->q_pinwait); @@ -252,7 +252,7 @@ xfs_qm_adjust_dqtimers( (be64_to_cpu(d->d_bcount) >= be64_to_cpu(d->d_blk_hardlimit)))) { d->d_btimer = cpu_to_be32(get_seconds() + - XFS_QI_BTIMELIMIT(mp)); + mp->m_quotainfo->qi_btimelimit); } else { d->d_bwarns = 0; } @@ -275,7 +275,7 @@ xfs_qm_adjust_dqtimers( (be64_to_cpu(d->d_icount) >= be64_to_cpu(d->d_ino_hardlimit)))) { d->d_itimer = cpu_to_be32(get_seconds() + - XFS_QI_ITIMELIMIT(mp)); + mp->m_quotainfo->qi_itimelimit); } else { d->d_iwarns = 0; } @@ -298,7 +298,7 @@ xfs_qm_adjust_dqtimers( (be64_to_cpu(d->d_rtbcount) >= be64_to_cpu(d->d_rtb_hardlimit)))) { d->d_rtbtimer = cpu_to_be32(get_seconds() + - XFS_QI_RTBTIMELIMIT(mp)); + mp->m_quotainfo->qi_rtbtimelimit); } else { d->d_rtbwarns = 0; } @@ -325,6 +325,7 @@ xfs_qm_init_dquot_blk( uint type, xfs_buf_t *bp) { + struct xfs_quotainfo *q = mp->m_quotainfo; xfs_dqblk_t *d; int curid, i; @@ -337,16 +338,16 @@ xfs_qm_init_dquot_blk( /* * ID of the first dquot in the block - id's are zero based. */ - curid = id - (id % XFS_QM_DQPERBLK(mp)); + curid = id - (id % q->qi_dqperchunk); ASSERT(curid >= 0); - memset(d, 0, BBTOB(XFS_QI_DQCHUNKLEN(mp))); - for (i = 0; i < XFS_QM_DQPERBLK(mp); i++, d++, curid++) + memset(d, 0, BBTOB(q->qi_dqchunklen)); + for (i = 0; i < q->qi_dqperchunk; i++, d++, curid++) xfs_qm_dqinit_core(curid, type, d); xfs_trans_dquot_buf(tp, bp, - (type & XFS_DQ_USER ? XFS_BLI_UDQUOT_BUF : - ((type & XFS_DQ_PROJ) ? XFS_BLI_PDQUOT_BUF : - XFS_BLI_GDQUOT_BUF))); - xfs_trans_log_buf(tp, bp, 0, BBTOB(XFS_QI_DQCHUNKLEN(mp)) - 1); + (type & XFS_DQ_USER ? XFS_BLF_UDQUOT_BUF : + ((type & XFS_DQ_PROJ) ? XFS_BLF_PDQUOT_BUF : + XFS_BLF_GDQUOT_BUF))); + xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1); } @@ -419,7 +420,7 @@ xfs_qm_dqalloc( /* now we can just get the buffer (there's nothing to read yet) */ bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, dqp->q_blkno, - XFS_QI_DQCHUNKLEN(mp), + mp->m_quotainfo->qi_dqchunklen, 0); if (!bp || (error = XFS_BUF_GETERROR(bp))) goto error1; @@ -500,7 +501,8 @@ xfs_qm_dqtobp( */ if (dqp->q_blkno == (xfs_daddr_t) 0) { /* We use the id as an index */ - dqp->q_fileoffset = (xfs_fileoff_t)id / XFS_QM_DQPERBLK(mp); + dqp->q_fileoffset = (xfs_fileoff_t)id / + mp->m_quotainfo->qi_dqperchunk; nmaps = 1; quotip = XFS_DQ_TO_QIP(dqp); xfs_ilock(quotip, XFS_ILOCK_SHARED); @@ -529,7 +531,7 @@ xfs_qm_dqtobp( /* * offset of dquot in the (fixed sized) dquot chunk. */ - dqp->q_bufoffset = (id % XFS_QM_DQPERBLK(mp)) * + dqp->q_bufoffset = (id % mp->m_quotainfo->qi_dqperchunk) * sizeof(xfs_dqblk_t); if (map.br_startblock == HOLESTARTBLOCK) { /* @@ -559,15 +561,13 @@ xfs_qm_dqtobp( * Read in the buffer, unless we've just done the allocation * (in which case we already have the buf). */ - if (! newdquot) { + if (!newdquot) { trace_xfs_dqtobp_read(dqp); - if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, - dqp->q_blkno, - XFS_QI_DQCHUNKLEN(mp), - 0, &bp))) { - return (error); - } + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, + dqp->q_blkno, + mp->m_quotainfo->qi_dqchunklen, + 0, &bp); if (error || !bp) return XFS_ERROR(error); } @@ -689,14 +689,14 @@ xfs_qm_idtodq( tp = NULL; if (flags & XFS_QMOPT_DQALLOC) { tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC); - if ((error = xfs_trans_reserve(tp, - XFS_QM_DQALLOC_SPACE_RES(mp), - XFS_WRITE_LOG_RES(mp) + - BBTOB(XFS_QI_DQCHUNKLEN(mp)) - 1 + - 128, - 0, - XFS_TRANS_PERM_LOG_RES, - XFS_WRITE_LOG_COUNT))) { + error = xfs_trans_reserve(tp, XFS_QM_DQALLOC_SPACE_RES(mp), + XFS_WRITE_LOG_RES(mp) + + BBTOB(mp->m_quotainfo->qi_dqchunklen) - 1 + + 128, + 0, + XFS_TRANS_PERM_LOG_RES, + XFS_WRITE_LOG_COUNT); + if (error) { cancelflags = 0; goto error0; } @@ -751,7 +751,6 @@ xfs_qm_dqlookup( { xfs_dquot_t *dqp; uint flist_locked; - xfs_dquot_t *d; ASSERT(mutex_is_locked(&qh->qh_lock)); @@ -760,7 +759,7 @@ xfs_qm_dqlookup( /* * Traverse the hashchain looking for a match */ - for (dqp = qh->qh_next; dqp != NULL; dqp = dqp->HL_NEXT) { + list_for_each_entry(dqp, &qh->qh_list, q_hashlist) { /* * We already have the hashlock. We don't need the * dqlock to look at the id field of the dquot, since the @@ -772,12 +771,12 @@ xfs_qm_dqlookup( /* * All in core dquots must be on the dqlist of mp */ - ASSERT(dqp->MPL_PREVP != NULL); + ASSERT(!list_empty(&dqp->q_mplist)); xfs_dqlock(dqp); if (dqp->q_nrefs == 0) { - ASSERT (XFS_DQ_IS_ON_FREELIST(dqp)); - if (! xfs_qm_freelist_lock_nowait(xfs_Gqm)) { + ASSERT(!list_empty(&dqp->q_freelist)); + if (!mutex_trylock(&xfs_Gqm->qm_dqfrlist_lock)) { trace_xfs_dqlookup_want(dqp); /* @@ -787,7 +786,7 @@ xfs_qm_dqlookup( */ dqp->dq_flags |= XFS_DQ_WANT; xfs_dqunlock(dqp); - xfs_qm_freelist_lock(xfs_Gqm); + mutex_lock(&xfs_Gqm->qm_dqfrlist_lock); xfs_dqlock(dqp); dqp->dq_flags &= ~(XFS_DQ_WANT); } @@ -802,46 +801,28 @@ xfs_qm_dqlookup( if (flist_locked) { if (dqp->q_nrefs != 0) { - xfs_qm_freelist_unlock(xfs_Gqm); + mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); flist_locked = B_FALSE; } else { - /* - * take it off the freelist - */ + /* take it off the freelist */ trace_xfs_dqlookup_freelist(dqp); - XQM_FREELIST_REMOVE(dqp); - /* xfs_qm_freelist_print(&(xfs_Gqm-> - qm_dqfreelist), - "after removal"); */ + list_del_init(&dqp->q_freelist); + xfs_Gqm->qm_dqfrlist_cnt--; } } - /* - * grab a reference - */ XFS_DQHOLD(dqp); if (flist_locked) - xfs_qm_freelist_unlock(xfs_Gqm); + mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); /* * move the dquot to the front of the hashchain */ ASSERT(mutex_is_locked(&qh->qh_lock)); - if (dqp->HL_PREVP != &qh->qh_next) { - trace_xfs_dqlookup_move(dqp); - if ((d = dqp->HL_NEXT)) - d->HL_PREVP = dqp->HL_PREVP; - *(dqp->HL_PREVP) = d; - d = qh->qh_next; - d->HL_PREVP = &dqp->HL_NEXT; - dqp->HL_NEXT = d; - dqp->HL_PREVP = &qh->qh_next; - qh->qh_next = dqp; - } + list_move(&dqp->q_hashlist, &qh->qh_list); trace_xfs_dqlookup_done(dqp); *O_dqpp = dqp; - ASSERT(mutex_is_locked(&qh->qh_lock)); - return (0); + return 0; } } @@ -975,16 +956,17 @@ xfs_qm_dqget( */ if (ip) { xfs_ilock(ip, XFS_ILOCK_EXCL); - if (! XFS_IS_DQTYPE_ON(mp, type)) { - /* inode stays locked on return */ - xfs_qm_dqdestroy(dqp); - return XFS_ERROR(ESRCH); - } + /* * A dquot could be attached to this inode by now, since * we had dropped the ilock. */ if (type == XFS_DQ_USER) { + if (!XFS_IS_UQUOTA_ON(mp)) { + /* inode stays locked on return */ + xfs_qm_dqdestroy(dqp); + return XFS_ERROR(ESRCH); + } if (ip->i_udquot) { xfs_qm_dqdestroy(dqp); dqp = ip->i_udquot; @@ -992,6 +974,11 @@ xfs_qm_dqget( goto dqret; } } else { + if (!XFS_IS_OQUOTA_ON(mp)) { + /* inode stays locked on return */ + xfs_qm_dqdestroy(dqp); + return XFS_ERROR(ESRCH); + } if (ip->i_gdquot) { xfs_qm_dqdestroy(dqp); dqp = ip->i_gdquot; @@ -1033,13 +1020,14 @@ xfs_qm_dqget( */ ASSERT(mutex_is_locked(&h->qh_lock)); dqp->q_hash = h; - XQM_HASHLIST_INSERT(h, dqp); + list_add(&dqp->q_hashlist, &h->qh_list); + h->qh_version++; /* * Attach this dquot to this filesystem's list of all dquots, * kept inside the mount structure in m_quotainfo field */ - xfs_qm_mplist_lock(mp); + mutex_lock(&mp->m_quotainfo->qi_dqlist_lock); /* * We return a locked dquot to the caller, with a reference taken @@ -1047,9 +1035,9 @@ xfs_qm_dqget( xfs_dqlock(dqp); dqp->q_nrefs = 1; - XQM_MPLIST_INSERT(&(XFS_QI_MPL_LIST(mp)), dqp); - - xfs_qm_mplist_unlock(mp); + list_add(&dqp->q_mplist, &mp->m_quotainfo->qi_dqlist); + mp->m_quotainfo->qi_dquots++; + mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock); mutex_unlock(&h->qh_lock); dqret: ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL)); @@ -1086,10 +1074,10 @@ xfs_qm_dqput( * drop the dqlock and acquire the freelist and dqlock * in the right order; but try to get it out-of-order first */ - if (! xfs_qm_freelist_lock_nowait(xfs_Gqm)) { + if (!mutex_trylock(&xfs_Gqm->qm_dqfrlist_lock)) { trace_xfs_dqput_wait(dqp); xfs_dqunlock(dqp); - xfs_qm_freelist_lock(xfs_Gqm); + mutex_lock(&xfs_Gqm->qm_dqfrlist_lock); xfs_dqlock(dqp); } @@ -1100,10 +1088,8 @@ xfs_qm_dqput( if (--dqp->q_nrefs == 0) { trace_xfs_dqput_free(dqp); - /* - * insert at end of the freelist. - */ - XQM_FREELIST_INSERT(&(xfs_Gqm->qm_dqfreelist), dqp); + list_add_tail(&dqp->q_freelist, &xfs_Gqm->qm_dqfrlist); + xfs_Gqm->qm_dqfrlist_cnt++; /* * If we just added a udquot to the freelist, then @@ -1118,10 +1104,6 @@ xfs_qm_dqput( xfs_dqlock(gdqp); dqp->q_gdquot = NULL; } - - /* xfs_qm_freelist_print(&(xfs_Gqm->qm_dqfreelist), - "@@@@@++ Free list (after append) @@@@@+"); - */ } xfs_dqunlock(dqp); @@ -1133,7 +1115,7 @@ xfs_qm_dqput( break; dqp = gdqp; } - xfs_qm_freelist_unlock(xfs_Gqm); + mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); } /* @@ -1386,10 +1368,10 @@ int xfs_qm_dqpurge( xfs_dquot_t *dqp) { - xfs_dqhash_t *thishash; + xfs_dqhash_t *qh = dqp->q_hash; xfs_mount_t *mp = dqp->q_mount; - ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp)); + ASSERT(mutex_is_locked(&mp->m_quotainfo->qi_dqlist_lock)); ASSERT(mutex_is_locked(&dqp->q_hash->qh_lock)); xfs_dqlock(dqp); @@ -1407,7 +1389,7 @@ xfs_qm_dqpurge( return (1); } - ASSERT(XFS_DQ_IS_ON_FREELIST(dqp)); + ASSERT(!list_empty(&dqp->q_freelist)); /* * If we're turning off quotas, we have to make sure that, for @@ -1452,14 +1434,16 @@ xfs_qm_dqpurge( ASSERT(XFS_FORCED_SHUTDOWN(mp) || !(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL)); - thishash = dqp->q_hash; - XQM_HASHLIST_REMOVE(thishash, dqp); - XQM_MPLIST_REMOVE(&(XFS_QI_MPL_LIST(mp)), dqp); + list_del_init(&dqp->q_hashlist); + qh->qh_version++; + list_del_init(&dqp->q_mplist); + mp->m_quotainfo->qi_dqreclaims++; + mp->m_quotainfo->qi_dquots--; /* * XXX Move this to the front of the freelist, if we can get the * freelist lock. */ - ASSERT(XFS_DQ_IS_ON_FREELIST(dqp)); + ASSERT(!list_empty(&dqp->q_freelist)); dqp->q_mount = NULL; dqp->q_hash = NULL; @@ -1467,7 +1451,7 @@ xfs_qm_dqpurge( memset(&dqp->q_core, 0, sizeof(dqp->q_core)); xfs_dqfunlock(dqp); xfs_dqunlock(dqp); - mutex_unlock(&thishash->qh_lock); + mutex_unlock(&qh->qh_lock); return (0); } @@ -1517,6 +1501,7 @@ void xfs_qm_dqflock_pushbuf_wait( xfs_dquot_t *dqp) { + xfs_mount_t *mp = dqp->q_mount; xfs_buf_t *bp; /* @@ -1525,14 +1510,14 @@ xfs_qm_dqflock_pushbuf_wait( * out immediately. We'll be able to acquire * the flush lock when the I/O completes. */ - bp = xfs_incore(dqp->q_mount->m_ddev_targp, dqp->q_blkno, - XFS_QI_DQCHUNKLEN(dqp->q_mount), XBF_TRYLOCK); + bp = xfs_incore(mp->m_ddev_targp, dqp->q_blkno, + mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK); if (!bp) goto out_lock; if (XFS_BUF_ISDELAYWRITE(bp)) { if (XFS_BUF_ISPINNED(bp)) - xfs_log_force(dqp->q_mount, 0); + xfs_log_force(mp, 0); xfs_buf_delwri_promote(bp); wake_up_process(bp->b_target->bt_task); } diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/quota/xfs_dquot.h index a0f7da586d1..5da3a23b820 100644 --- a/fs/xfs/quota/xfs_dquot.h +++ b/fs/xfs/quota/xfs_dquot.h @@ -33,40 +33,23 @@ * The hash chain headers (hash buckets) */ typedef struct xfs_dqhash { - struct xfs_dquot *qh_next; + struct list_head qh_list; struct mutex qh_lock; uint qh_version; /* ever increasing version */ uint qh_nelems; /* number of dquots on the list */ } xfs_dqhash_t; -typedef struct xfs_dqlink { - struct xfs_dquot *ql_next; /* forward link */ - struct xfs_dquot **ql_prevp; /* pointer to prev ql_next */ -} xfs_dqlink_t; - struct xfs_mount; struct xfs_trans; /* - * This is the marker which is designed to occupy the first few - * bytes of the xfs_dquot_t structure. Even inside this, the freelist pointers - * must come first. - * This serves as the marker ("sentinel") when we have to restart list - * iterations because of locking considerations. - */ -typedef struct xfs_dqmarker { - struct xfs_dquot*dqm_flnext; /* link to freelist: must be first */ - struct xfs_dquot*dqm_flprev; - xfs_dqlink_t dqm_mplist; /* link to mount's list of dquots */ - xfs_dqlink_t dqm_hashlist; /* link to the hash chain */ - uint dqm_flags; /* various flags (XFS_DQ_*) */ -} xfs_dqmarker_t; - -/* * The incore dquot structure */ typedef struct xfs_dquot { - xfs_dqmarker_t q_lists; /* list ptrs, q_flags (marker) */ + uint dq_flags; /* various flags (XFS_DQ_*) */ + struct list_head q_freelist; /* global free list of dquots */ + struct list_head q_mplist; /* mount's list of dquots */ + struct list_head q_hashlist; /* gloabl hash list of dquots */ xfs_dqhash_t *q_hash; /* the hashchain header */ struct xfs_mount*q_mount; /* filesystem this relates to */ struct xfs_trans*q_transp; /* trans this belongs to currently */ @@ -87,13 +70,6 @@ typedef struct xfs_dquot { wait_queue_head_t q_pinwait; /* dquot pinning wait queue */ } xfs_dquot_t; - -#define dq_flnext q_lists.dqm_flnext -#define dq_flprev q_lists.dqm_flprev -#define dq_mplist q_lists.dqm_mplist -#define dq_hashlist q_lists.dqm_hashlist -#define dq_flags q_lists.dqm_flags - /* * Lock hierarchy for q_qlock: * XFS_QLOCK_NORMAL is the implicit default, @@ -127,7 +103,6 @@ static inline void xfs_dqfunlock(xfs_dquot_t *dqp) } #define XFS_DQ_IS_LOCKED(dqp) (mutex_is_locked(&((dqp)->q_qlock))) -#define XFS_DQ_IS_ON_FREELIST(dqp) ((dqp)->dq_flnext != (dqp)) #define XFS_DQ_IS_DIRTY(dqp) ((dqp)->dq_flags & XFS_DQ_DIRTY) #define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER) #define XFS_QM_ISPDQ(dqp) ((dqp)->dq_flags & XFS_DQ_PROJ) diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c index 4e4ee9a5719..8d89a24ae32 100644 --- a/fs/xfs/quota/xfs_dquot_item.c +++ b/fs/xfs/quota/xfs_dquot_item.c @@ -107,8 +107,7 @@ xfs_qm_dquot_logitem_pin( /* ARGSUSED */ STATIC void xfs_qm_dquot_logitem_unpin( - xfs_dq_logitem_t *logitem, - int stale) + xfs_dq_logitem_t *logitem) { xfs_dquot_t *dqp = logitem->qli_dquot; @@ -123,7 +122,7 @@ xfs_qm_dquot_logitem_unpin_remove( xfs_dq_logitem_t *logitem, xfs_trans_t *tp) { - xfs_qm_dquot_logitem_unpin(logitem, 0); + xfs_qm_dquot_logitem_unpin(logitem); } /* @@ -228,7 +227,7 @@ xfs_qm_dquot_logitem_pushbuf( } mp = dqp->q_mount; bp = xfs_incore(mp->m_ddev_targp, qip->qli_format.qlf_blkno, - XFS_QI_DQCHUNKLEN(mp), XBF_TRYLOCK); + mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK); xfs_dqunlock(dqp); if (!bp) return; @@ -329,8 +328,7 @@ static struct xfs_item_ops xfs_dquot_item_ops = { .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) xfs_qm_dquot_logitem_format, .iop_pin = (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_pin, - .iop_unpin = (void(*)(xfs_log_item_t*, int)) - xfs_qm_dquot_logitem_unpin, + .iop_unpin = (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_unpin, .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*)) xfs_qm_dquot_logitem_unpin_remove, .iop_trylock = (uint(*)(xfs_log_item_t*)) @@ -357,9 +355,8 @@ xfs_qm_dquot_logitem_init( xfs_dq_logitem_t *lp; lp = &dqp->q_logitem; - lp->qli_item.li_type = XFS_LI_DQUOT; - lp->qli_item.li_ops = &xfs_dquot_item_ops; - lp->qli_item.li_mountp = dqp->q_mount; + xfs_log_item_init(dqp->q_mount, &lp->qli_item, XFS_LI_DQUOT, + &xfs_dquot_item_ops); lp->qli_dquot = dqp; lp->qli_format.qlf_type = XFS_LI_DQUOT; lp->qli_format.qlf_id = be32_to_cpu(dqp->q_core.d_id); @@ -426,7 +423,7 @@ xfs_qm_qoff_logitem_pin(xfs_qoff_logitem_t *qf) */ /*ARGSUSED*/ STATIC void -xfs_qm_qoff_logitem_unpin(xfs_qoff_logitem_t *qf, int stale) +xfs_qm_qoff_logitem_unpin(xfs_qoff_logitem_t *qf) { return; } @@ -537,8 +534,7 @@ static struct xfs_item_ops xfs_qm_qoffend_logitem_ops = { .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) xfs_qm_qoff_logitem_format, .iop_pin = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_pin, - .iop_unpin = (void(*)(xfs_log_item_t* ,int)) - xfs_qm_qoff_logitem_unpin, + .iop_unpin = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_unpin, .iop_unpin_remove = (void(*)(xfs_log_item_t*,xfs_trans_t*)) xfs_qm_qoff_logitem_unpin_remove, .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_trylock, @@ -559,8 +555,7 @@ static struct xfs_item_ops xfs_qm_qoff_logitem_ops = { .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) xfs_qm_qoff_logitem_format, .iop_pin = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_pin, - .iop_unpin = (void(*)(xfs_log_item_t*, int)) - xfs_qm_qoff_logitem_unpin, + .iop_unpin = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_unpin, .iop_unpin_remove = (void(*)(xfs_log_item_t*,xfs_trans_t*)) xfs_qm_qoff_logitem_unpin_remove, .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_trylock, @@ -586,11 +581,8 @@ xfs_qm_qoff_logitem_init( qf = (xfs_qoff_logitem_t*) kmem_zalloc(sizeof(xfs_qoff_logitem_t), KM_SLEEP); - qf->qql_item.li_type = XFS_LI_QUOTAOFF; - if (start) - qf->qql_item.li_ops = &xfs_qm_qoffend_logitem_ops; - else - qf->qql_item.li_ops = &xfs_qm_qoff_logitem_ops; + xfs_log_item_init(mp, &qf->qql_item, XFS_LI_QUOTAOFF, start ? + &xfs_qm_qoffend_logitem_ops : &xfs_qm_qoff_logitem_ops); qf->qql_item.li_mountp = mp; qf->qql_format.qf_type = XFS_LI_QUOTAOFF; qf->qql_format.qf_flags = flags; diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c index 417e61e3d9d..38e76414664 100644 --- a/fs/xfs/quota/xfs_qm.c +++ b/fs/xfs/quota/xfs_qm.c @@ -67,9 +67,6 @@ static cred_t xfs_zerocr; STATIC void xfs_qm_list_init(xfs_dqlist_t *, char *, int); STATIC void xfs_qm_list_destroy(xfs_dqlist_t *); -STATIC void xfs_qm_freelist_init(xfs_frlist_t *); -STATIC void xfs_qm_freelist_destroy(xfs_frlist_t *); - STATIC int xfs_qm_init_quotainos(xfs_mount_t *); STATIC int xfs_qm_init_quotainfo(xfs_mount_t *); STATIC int xfs_qm_shake(int, gfp_t); @@ -84,21 +81,25 @@ extern struct mutex qcheck_lock; #endif #ifdef QUOTADEBUG -#define XQM_LIST_PRINT(l, NXT, title) \ -{ \ - xfs_dquot_t *dqp; int i = 0; \ - cmn_err(CE_DEBUG, "%s (#%d)", title, (int) (l)->qh_nelems); \ - for (dqp = (l)->qh_next; dqp != NULL; dqp = dqp->NXT) { \ - cmn_err(CE_DEBUG, " %d. \"%d (%s)\" " \ - "bcnt = %d, icnt = %d, refs = %d", \ - ++i, (int) be32_to_cpu(dqp->q_core.d_id), \ - DQFLAGTO_TYPESTR(dqp), \ - (int) be64_to_cpu(dqp->q_core.d_bcount), \ - (int) be64_to_cpu(dqp->q_core.d_icount), \ - (int) dqp->q_nrefs); } \ +static void +xfs_qm_dquot_list_print( + struct xfs_mount *mp) +{ + xfs_dquot_t *dqp; + int i = 0; + + list_for_each_entry(dqp, &mp->m_quotainfo->qi_dqlist_lock, qi_mplist) { + cmn_err(CE_DEBUG, " %d. \"%d (%s)\" " + "bcnt = %lld, icnt = %lld, refs = %d", + i++, be32_to_cpu(dqp->q_core.d_id), + DQFLAGTO_TYPESTR(dqp), + (long long)be64_to_cpu(dqp->q_core.d_bcount), + (long long)be64_to_cpu(dqp->q_core.d_icount), + dqp->q_nrefs); + } } #else -#define XQM_LIST_PRINT(l, NXT, title) do { } while (0) +static void xfs_qm_dquot_list_print(struct xfs_mount *mp) { } #endif /* @@ -144,7 +145,9 @@ xfs_Gqm_init(void) /* * Freelist of all dquots of all file systems */ - xfs_qm_freelist_init(&(xqm->qm_dqfreelist)); + INIT_LIST_HEAD(&xqm->qm_dqfrlist); + xqm->qm_dqfrlist_cnt = 0; + mutex_init(&xqm->qm_dqfrlist_lock); /* * dquot zone. we register our own low-memory callback. @@ -189,6 +192,7 @@ STATIC void xfs_qm_destroy( struct xfs_qm *xqm) { + struct xfs_dquot *dqp, *n; int hsize, i; ASSERT(xqm != NULL); @@ -204,7 +208,21 @@ xfs_qm_destroy( xqm->qm_usr_dqhtable = NULL; xqm->qm_grp_dqhtable = NULL; xqm->qm_dqhashmask = 0; - xfs_qm_freelist_destroy(&(xqm->qm_dqfreelist)); + + /* frlist cleanup */ + mutex_lock(&xqm->qm_dqfrlist_lock); + list_for_each_entry_safe(dqp, n, &xqm->qm_dqfrlist, q_freelist) { + xfs_dqlock(dqp); +#ifdef QUOTADEBUG + cmn_err(CE_DEBUG, "FREELIST destroy 0x%p", dqp); +#endif + list_del_init(&dqp->q_freelist); + xfs_Gqm->qm_dqfrlist_cnt--; + xfs_dqunlock(dqp); + xfs_qm_dqdestroy(dqp); + } + mutex_unlock(&xqm->qm_dqfrlist_lock); + mutex_destroy(&xqm->qm_dqfrlist_lock); #ifdef DEBUG mutex_destroy(&qcheck_lock); #endif @@ -256,7 +274,7 @@ STATIC void xfs_qm_rele_quotafs_ref( struct xfs_mount *mp) { - xfs_dquot_t *dqp, *nextdqp; + xfs_dquot_t *dqp, *n; ASSERT(xfs_Gqm); ASSERT(xfs_Gqm->qm_nrefs > 0); @@ -264,26 +282,24 @@ xfs_qm_rele_quotafs_ref( /* * Go thru the freelist and destroy all inactive dquots. */ - xfs_qm_freelist_lock(xfs_Gqm); + mutex_lock(&xfs_Gqm->qm_dqfrlist_lock); - for (dqp = xfs_Gqm->qm_dqfreelist.qh_next; - dqp != (xfs_dquot_t *)&(xfs_Gqm->qm_dqfreelist); ) { + list_for_each_entry_safe(dqp, n, &xfs_Gqm->qm_dqfrlist, q_freelist) { xfs_dqlock(dqp); - nextdqp = dqp->dq_flnext; if (dqp->dq_flags & XFS_DQ_INACTIVE) { ASSERT(dqp->q_mount == NULL); ASSERT(! XFS_DQ_IS_DIRTY(dqp)); - ASSERT(dqp->HL_PREVP == NULL); - ASSERT(dqp->MPL_PREVP == NULL); - XQM_FREELIST_REMOVE(dqp); + ASSERT(list_empty(&dqp->q_hashlist)); + ASSERT(list_empty(&dqp->q_mplist)); + list_del_init(&dqp->q_freelist); + xfs_Gqm->qm_dqfrlist_cnt--; xfs_dqunlock(dqp); xfs_qm_dqdestroy(dqp); } else { xfs_dqunlock(dqp); } - dqp = nextdqp; } - xfs_qm_freelist_unlock(xfs_Gqm); + mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); /* * Destroy the entire XQM. If somebody mounts with quotaon, this'll @@ -305,7 +321,7 @@ xfs_qm_unmount( struct xfs_mount *mp) { if (mp->m_quotainfo) { - xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_UMOUNTING); + xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL); xfs_qm_destroy_quotainfo(mp); } } @@ -449,20 +465,21 @@ xfs_qm_unmount_quotas( */ STATIC int xfs_qm_dqflush_all( - xfs_mount_t *mp, - int sync_mode) + struct xfs_mount *mp, + int sync_mode) { - int recl; - xfs_dquot_t *dqp; - int niters; - int error; + struct xfs_quotainfo *q = mp->m_quotainfo; + int recl; + struct xfs_dquot *dqp; + int niters; + int error; - if (mp->m_quotainfo == NULL) + if (!q) return 0; niters = 0; again: - xfs_qm_mplist_lock(mp); - FOREACH_DQUOT_IN_MP(dqp, mp) { + mutex_lock(&q->qi_dqlist_lock); + list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) { xfs_dqlock(dqp); if (! XFS_DQ_IS_DIRTY(dqp)) { xfs_dqunlock(dqp); @@ -470,7 +487,7 @@ again: } /* XXX a sentinel would be better */ - recl = XFS_QI_MPLRECLAIMS(mp); + recl = q->qi_dqreclaims; if (!xfs_dqflock_nowait(dqp)) { /* * If we can't grab the flush lock then check @@ -485,21 +502,21 @@ again: * Let go of the mplist lock. We don't want to hold it * across a disk write. */ - xfs_qm_mplist_unlock(mp); + mutex_unlock(&q->qi_dqlist_lock); error = xfs_qm_dqflush(dqp, sync_mode); xfs_dqunlock(dqp); if (error) return error; - xfs_qm_mplist_lock(mp); - if (recl != XFS_QI_MPLRECLAIMS(mp)) { - xfs_qm_mplist_unlock(mp); + mutex_lock(&q->qi_dqlist_lock); + if (recl != q->qi_dqreclaims) { + mutex_unlock(&q->qi_dqlist_lock); /* XXX restart limit */ goto again; } } - xfs_qm_mplist_unlock(mp); + mutex_unlock(&q->qi_dqlist_lock); /* return ! busy */ return 0; } @@ -509,15 +526,15 @@ again: */ STATIC void xfs_qm_detach_gdquots( - xfs_mount_t *mp) + struct xfs_mount *mp) { - xfs_dquot_t *dqp, *gdqp; - int nrecl; + struct xfs_quotainfo *q = mp->m_quotainfo; + struct xfs_dquot *dqp, *gdqp; + int nrecl; again: - ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp)); - dqp = XFS_QI_MPLNEXT(mp); - while (dqp) { + ASSERT(mutex_is_locked(&q->qi_dqlist_lock)); + list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) { xfs_dqlock(dqp); if ((gdqp = dqp->q_gdquot)) { xfs_dqlock(gdqp); @@ -530,15 +547,14 @@ xfs_qm_detach_gdquots( * Can't hold the mplist lock across a dqput. * XXXmust convert to marker based iterations here. */ - nrecl = XFS_QI_MPLRECLAIMS(mp); - xfs_qm_mplist_unlock(mp); + nrecl = q->qi_dqreclaims; + mutex_unlock(&q->qi_dqlist_lock); xfs_qm_dqput(gdqp); - xfs_qm_mplist_lock(mp); - if (nrecl != XFS_QI_MPLRECLAIMS(mp)) + mutex_lock(&q->qi_dqlist_lock); + if (nrecl != q->qi_dqreclaims) goto again; } - dqp = dqp->MPL_NEXT; } } @@ -550,23 +566,23 @@ xfs_qm_detach_gdquots( */ STATIC int xfs_qm_dqpurge_int( - xfs_mount_t *mp, - uint flags) /* QUOTAOFF/UMOUNTING/UQUOTA/PQUOTA/GQUOTA */ + struct xfs_mount *mp, + uint flags) { - xfs_dquot_t *dqp; - uint dqtype; - int nrecl; - xfs_dquot_t *nextdqp; - int nmisses; + struct xfs_quotainfo *q = mp->m_quotainfo; + struct xfs_dquot *dqp, *n; + uint dqtype; + int nrecl; + int nmisses; - if (mp->m_quotainfo == NULL) + if (!q) return 0; dqtype = (flags & XFS_QMOPT_UQUOTA) ? XFS_DQ_USER : 0; dqtype |= (flags & XFS_QMOPT_PQUOTA) ? XFS_DQ_PROJ : 0; dqtype |= (flags & XFS_QMOPT_GQUOTA) ? XFS_DQ_GROUP : 0; - xfs_qm_mplist_lock(mp); + mutex_lock(&q->qi_dqlist_lock); /* * In the first pass through all incore dquots of this filesystem, @@ -578,28 +594,25 @@ xfs_qm_dqpurge_int( again: nmisses = 0; - ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp)); + ASSERT(mutex_is_locked(&q->qi_dqlist_lock)); /* * Try to get rid of all of the unwanted dquots. The idea is to * get them off mplist and hashlist, but leave them on freelist. */ - dqp = XFS_QI_MPLNEXT(mp); - while (dqp) { + list_for_each_entry_safe(dqp, n, &q->qi_dqlist, q_mplist) { /* * It's OK to look at the type without taking dqlock here. * We're holding the mplist lock here, and that's needed for * a dqreclaim. */ - if ((dqp->dq_flags & dqtype) == 0) { - dqp = dqp->MPL_NEXT; + if ((dqp->dq_flags & dqtype) == 0) continue; - } if (!mutex_trylock(&dqp->q_hash->qh_lock)) { - nrecl = XFS_QI_MPLRECLAIMS(mp); - xfs_qm_mplist_unlock(mp); + nrecl = q->qi_dqreclaims; + mutex_unlock(&q->qi_dqlist_lock); mutex_lock(&dqp->q_hash->qh_lock); - xfs_qm_mplist_lock(mp); + mutex_lock(&q->qi_dqlist_lock); /* * XXXTheoretically, we can get into a very long @@ -607,7 +620,7 @@ xfs_qm_dqpurge_int( * No one can be adding dquots to the mplist at * this point, but somebody might be taking things off. */ - if (nrecl != XFS_QI_MPLRECLAIMS(mp)) { + if (nrecl != q->qi_dqreclaims) { mutex_unlock(&dqp->q_hash->qh_lock); goto again; } @@ -617,11 +630,9 @@ xfs_qm_dqpurge_int( * Take the dquot off the mplist and hashlist. It may remain on * freelist in INACTIVE state. */ - nextdqp = dqp->MPL_NEXT; nmisses += xfs_qm_dqpurge(dqp); - dqp = nextdqp; } - xfs_qm_mplist_unlock(mp); + mutex_unlock(&q->qi_dqlist_lock); return nmisses; } @@ -921,12 +932,13 @@ xfs_qm_dqdetach( int xfs_qm_sync( - xfs_mount_t *mp, - int flags) + struct xfs_mount *mp, + int flags) { - int recl, restarts; - xfs_dquot_t *dqp; - int error; + struct xfs_quotainfo *q = mp->m_quotainfo; + int recl, restarts; + struct xfs_dquot *dqp; + int error; if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) return 0; @@ -934,18 +946,19 @@ xfs_qm_sync( restarts = 0; again: - xfs_qm_mplist_lock(mp); + mutex_lock(&q->qi_dqlist_lock); /* * dqpurge_all() also takes the mplist lock and iterate thru all dquots * in quotaoff. However, if the QUOTA_ACTIVE bits are not cleared * when we have the mplist lock, we know that dquots will be consistent * as long as we have it locked. */ - if (! XFS_IS_QUOTA_ON(mp)) { - xfs_qm_mplist_unlock(mp); + if (!XFS_IS_QUOTA_ON(mp)) { + mutex_unlock(&q->qi_dqlist_lock); return 0; } - FOREACH_DQUOT_IN_MP(dqp, mp) { + ASSERT(mutex_is_locked(&q->qi_dqlist_lock)); + list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) { /* * If this is vfs_sync calling, then skip the dquots that * don't 'seem' to be dirty. ie. don't acquire dqlock. @@ -969,7 +982,7 @@ xfs_qm_sync( } /* XXX a sentinel would be better */ - recl = XFS_QI_MPLRECLAIMS(mp); + recl = q->qi_dqreclaims; if (!xfs_dqflock_nowait(dqp)) { if (flags & SYNC_TRYLOCK) { xfs_dqunlock(dqp); @@ -989,7 +1002,7 @@ xfs_qm_sync( * Let go of the mplist lock. We don't want to hold it * across a disk write */ - xfs_qm_mplist_unlock(mp); + mutex_unlock(&q->qi_dqlist_lock); error = xfs_qm_dqflush(dqp, flags); xfs_dqunlock(dqp); if (error && XFS_FORCED_SHUTDOWN(mp)) @@ -997,17 +1010,17 @@ xfs_qm_sync( else if (error) return error; - xfs_qm_mplist_lock(mp); - if (recl != XFS_QI_MPLRECLAIMS(mp)) { + mutex_lock(&q->qi_dqlist_lock); + if (recl != q->qi_dqreclaims) { if (++restarts >= XFS_QM_SYNC_MAX_RESTARTS) break; - xfs_qm_mplist_unlock(mp); + mutex_unlock(&q->qi_dqlist_lock); goto again; } } - xfs_qm_mplist_unlock(mp); + mutex_unlock(&q->qi_dqlist_lock); return 0; } @@ -1052,8 +1065,9 @@ xfs_qm_init_quotainfo( return error; } - xfs_qm_list_init(&qinf->qi_dqlist, "mpdqlist", 0); - lockdep_set_class(&qinf->qi_dqlist.qh_lock, &xfs_quota_mplist_class); + INIT_LIST_HEAD(&qinf->qi_dqlist); + mutex_init(&qinf->qi_dqlist_lock); + lockdep_set_class(&qinf->qi_dqlist_lock, &xfs_quota_mplist_class); qinf->qi_dqreclaims = 0; @@ -1150,7 +1164,8 @@ xfs_qm_destroy_quotainfo( */ xfs_qm_rele_quotafs_ref(mp); - xfs_qm_list_destroy(&qi->qi_dqlist); + ASSERT(list_empty(&qi->qi_dqlist)); + mutex_destroy(&qi->qi_dqlist_lock); if (qi->qi_uquotaip) { IRELE(qi->qi_uquotaip); @@ -1177,7 +1192,7 @@ xfs_qm_list_init( int n) { mutex_init(&list->qh_lock); - list->qh_next = NULL; + INIT_LIST_HEAD(&list->qh_list); list->qh_version = 0; list->qh_nelems = 0; } @@ -1316,9 +1331,6 @@ xfs_qm_qino_alloc( */ spin_lock(&mp->m_sb_lock); if (flags & XFS_QMOPT_SBVERSION) { -#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY) - unsigned oldv = mp->m_sb.sb_versionnum; -#endif ASSERT(!xfs_sb_version_hasquota(&mp->m_sb)); ASSERT((sbfields & (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | XFS_SB_GQUOTINO | XFS_SB_QFLAGS)) == @@ -1331,11 +1343,6 @@ xfs_qm_qino_alloc( /* qflags will get updated _after_ quotacheck */ mp->m_sb.sb_qflags = 0; -#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY) - cmn_err(CE_NOTE, - "Old superblock version %x, converting to %x.", - oldv, mp->m_sb.sb_versionnum); -#endif } if (flags & XFS_QMOPT_UQUOTA) mp->m_sb.sb_uquotino = (*ip)->i_ino; @@ -1371,10 +1378,10 @@ xfs_qm_reset_dqcounts( #ifdef DEBUG j = XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB); do_div(j, sizeof(xfs_dqblk_t)); - ASSERT(XFS_QM_DQPERBLK(mp) == j); + ASSERT(mp->m_quotainfo->qi_dqperchunk == j); #endif ddq = (xfs_disk_dquot_t *)XFS_BUF_PTR(bp); - for (j = 0; j < XFS_QM_DQPERBLK(mp); j++) { + for (j = 0; j < mp->m_quotainfo->qi_dqperchunk; j++) { /* * Do a sanity check, and if needed, repair the dqblk. Don't * output any warnings because it's perfectly possible to @@ -1429,7 +1436,7 @@ xfs_qm_dqiter_bufs( while (blkcnt--) { error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, bno), - (int)XFS_QI_DQCHUNKLEN(mp), 0, &bp); + mp->m_quotainfo->qi_dqchunklen, 0, &bp); if (error) break; @@ -1439,7 +1446,7 @@ xfs_qm_dqiter_bufs( * goto the next block. */ bno++; - firstid += XFS_QM_DQPERBLK(mp); + firstid += mp->m_quotainfo->qi_dqperchunk; } return error; } @@ -1505,7 +1512,7 @@ xfs_qm_dqiterate( continue; firstid = (xfs_dqid_t) map[i].br_startoff * - XFS_QM_DQPERBLK(mp); + mp->m_quotainfo->qi_dqperchunk; /* * Do a read-ahead on the next extent. */ @@ -1516,7 +1523,7 @@ xfs_qm_dqiterate( while (rablkcnt--) { xfs_baread(mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, rablkno), - (int)XFS_QI_DQCHUNKLEN(mp)); + mp->m_quotainfo->qi_dqchunklen); rablkno++; } } @@ -1576,8 +1583,10 @@ xfs_qm_quotacheck_dqadjust( /* * Set default limits, adjust timers (since we changed usages) + * + * There are no timers for the default values set in the root dquot. */ - if (! XFS_IS_SUSER_DQUOT(dqp)) { + if (dqp->q_core.d_id) { xfs_qm_adjust_dqlimits(dqp->q_mount, &dqp->q_core); xfs_qm_adjust_dqtimers(dqp->q_mount, &dqp->q_core); } @@ -1747,14 +1756,14 @@ xfs_qm_quotacheck( lastino = 0; flags = 0; - ASSERT(XFS_QI_UQIP(mp) || XFS_QI_GQIP(mp)); + ASSERT(mp->m_quotainfo->qi_uquotaip || mp->m_quotainfo->qi_gquotaip); ASSERT(XFS_IS_QUOTA_RUNNING(mp)); /* * There should be no cached dquots. The (simplistic) quotacheck * algorithm doesn't like that. */ - ASSERT(XFS_QI_MPLNDQUOTS(mp) == 0); + ASSERT(list_empty(&mp->m_quotainfo->qi_dqlist)); cmn_err(CE_NOTE, "XFS quotacheck %s: Please wait.", mp->m_fsname); @@ -1763,15 +1772,19 @@ xfs_qm_quotacheck( * their counters to zero. We need a clean slate. * We don't log our changes till later. */ - if ((uip = XFS_QI_UQIP(mp))) { - if ((error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA))) + uip = mp->m_quotainfo->qi_uquotaip; + if (uip) { + error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA); + if (error) goto error_return; flags |= XFS_UQUOTA_CHKD; } - if ((gip = XFS_QI_GQIP(mp))) { - if ((error = xfs_qm_dqiterate(mp, gip, XFS_IS_GQUOTA_ON(mp) ? - XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA))) + gip = mp->m_quotainfo->qi_gquotaip; + if (gip) { + error = xfs_qm_dqiterate(mp, gip, XFS_IS_GQUOTA_ON(mp) ? + XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA); + if (error) goto error_return; flags |= XFS_OQUOTA_CHKD; } @@ -1804,7 +1817,7 @@ xfs_qm_quotacheck( * at this point (because we intentionally didn't in dqget_noattach). */ if (error) { - xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_QUOTAOFF); + xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL); goto error_return; } @@ -1825,7 +1838,7 @@ xfs_qm_quotacheck( mp->m_qflags &= ~(XFS_OQUOTA_CHKD | XFS_UQUOTA_CHKD); mp->m_qflags |= flags; - XQM_LIST_PRINT(&(XFS_QI_MPL_LIST(mp)), MPL_NEXT, "++++ Mp list +++"); + xfs_qm_dquot_list_print(mp); error_return: if (error) { @@ -1920,59 +1933,53 @@ xfs_qm_init_quotainos( } } - XFS_QI_UQIP(mp) = uip; - XFS_QI_GQIP(mp) = gip; + mp->m_quotainfo->qi_uquotaip = uip; + mp->m_quotainfo->qi_gquotaip = gip; return 0; } + /* - * Traverse the freelist of dquots and attempt to reclaim a maximum of - * 'howmany' dquots. This operation races with dqlookup(), and attempts to - * favor the lookup function ... - * XXXsup merge this with qm_reclaim_one(). + * Just pop the least recently used dquot off the freelist and + * recycle it. The returned dquot is locked. */ -STATIC int -xfs_qm_shake_freelist( - int howmany) +STATIC xfs_dquot_t * +xfs_qm_dqreclaim_one(void) { - int nreclaimed; - xfs_dqhash_t *hash; - xfs_dquot_t *dqp, *nextdqp; + xfs_dquot_t *dqpout; + xfs_dquot_t *dqp; int restarts; - int nflushes; - - if (howmany <= 0) - return 0; - nreclaimed = 0; restarts = 0; - nflushes = 0; + dqpout = NULL; -#ifdef QUOTADEBUG - cmn_err(CE_DEBUG, "Shake free 0x%x", howmany); -#endif - /* lock order is : hashchainlock, freelistlock, mplistlock */ - tryagain: - xfs_qm_freelist_lock(xfs_Gqm); + /* lockorder: hashchainlock, freelistlock, mplistlock, dqlock, dqflock */ +startagain: + mutex_lock(&xfs_Gqm->qm_dqfrlist_lock); - for (dqp = xfs_Gqm->qm_dqfreelist.qh_next; - ((dqp != (xfs_dquot_t *) &xfs_Gqm->qm_dqfreelist) && - nreclaimed < howmany); ) { + list_for_each_entry(dqp, &xfs_Gqm->qm_dqfrlist, q_freelist) { + struct xfs_mount *mp = dqp->q_mount; xfs_dqlock(dqp); /* * We are racing with dqlookup here. Naturally we don't - * want to reclaim a dquot that lookup wants. + * want to reclaim a dquot that lookup wants. We release the + * freelist lock and start over, so that lookup will grab + * both the dquot and the freelistlock. */ if (dqp->dq_flags & XFS_DQ_WANT) { + ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE)); + + trace_xfs_dqreclaim_want(dqp); + xfs_dqunlock(dqp); - xfs_qm_freelist_unlock(xfs_Gqm); + mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS) - return nreclaimed; + return NULL; XQM_STATS_INC(xqmstats.xs_qm_dqwants); - goto tryagain; + goto startagain; } /* @@ -1981,23 +1988,27 @@ xfs_qm_shake_freelist( * life easier. */ if (dqp->dq_flags & XFS_DQ_INACTIVE) { - ASSERT(dqp->q_mount == NULL); + ASSERT(mp == NULL); ASSERT(! XFS_DQ_IS_DIRTY(dqp)); - ASSERT(dqp->HL_PREVP == NULL); - ASSERT(dqp->MPL_PREVP == NULL); + ASSERT(list_empty(&dqp->q_hashlist)); + ASSERT(list_empty(&dqp->q_mplist)); + list_del_init(&dqp->q_freelist); + xfs_Gqm->qm_dqfrlist_cnt--; + xfs_dqunlock(dqp); + dqpout = dqp; XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims); - nextdqp = dqp->dq_flnext; - goto off_freelist; + break; } - ASSERT(dqp->MPL_PREVP); + ASSERT(dqp->q_hash); + ASSERT(!list_empty(&dqp->q_mplist)); + /* * Try to grab the flush lock. If this dquot is in the process of * getting flushed to disk, we don't want to reclaim it. */ if (!xfs_dqflock_nowait(dqp)) { xfs_dqunlock(dqp); - dqp = dqp->dq_flnext; continue; } @@ -2010,21 +2021,21 @@ xfs_qm_shake_freelist( if (XFS_DQ_IS_DIRTY(dqp)) { int error; - trace_xfs_dqshake_dirty(dqp); + trace_xfs_dqreclaim_dirty(dqp); /* * We flush it delayed write, so don't bother - * releasing the mplock. + * releasing the freelist lock. */ error = xfs_qm_dqflush(dqp, 0); if (error) { - xfs_fs_cmn_err(CE_WARN, dqp->q_mount, - "xfs_qm_dqflush_all: dquot %p flush failed", dqp); + xfs_fs_cmn_err(CE_WARN, mp, + "xfs_qm_dqreclaim: dquot %p flush failed", dqp); } xfs_dqunlock(dqp); /* dqflush unlocks dqflock */ - dqp = dqp->dq_flnext; continue; } + /* * We're trying to get the hashlock out of order. This races * with dqlookup; so, we giveup and goto the next dquot if @@ -2033,56 +2044,74 @@ xfs_qm_shake_freelist( * waiting for the freelist lock. */ if (!mutex_trylock(&dqp->q_hash->qh_lock)) { - xfs_dqfunlock(dqp); - xfs_dqunlock(dqp); - dqp = dqp->dq_flnext; - continue; + restarts++; + goto dqfunlock; } + /* * This races with dquot allocation code as well as dqflush_all * and reclaim code. So, if we failed to grab the mplist lock, * giveup everything and start over. */ - hash = dqp->q_hash; - ASSERT(hash); - if (! xfs_qm_mplist_nowait(dqp->q_mount)) { - /* XXX put a sentinel so that we can come back here */ + if (!mutex_trylock(&mp->m_quotainfo->qi_dqlist_lock)) { + restarts++; + mutex_unlock(&dqp->q_hash->qh_lock); xfs_dqfunlock(dqp); xfs_dqunlock(dqp); - mutex_unlock(&hash->qh_lock); - xfs_qm_freelist_unlock(xfs_Gqm); - if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS) - return nreclaimed; - goto tryagain; + mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); + if (restarts++ >= XFS_QM_RECLAIM_MAX_RESTARTS) + return NULL; + goto startagain; } - trace_xfs_dqshake_unlink(dqp); - -#ifdef QUOTADEBUG - cmn_err(CE_DEBUG, "Shake 0x%p, ID 0x%x\n", - dqp, be32_to_cpu(dqp->q_core.d_id)); -#endif ASSERT(dqp->q_nrefs == 0); - nextdqp = dqp->dq_flnext; - XQM_MPLIST_REMOVE(&(XFS_QI_MPL_LIST(dqp->q_mount)), dqp); - XQM_HASHLIST_REMOVE(hash, dqp); + list_del_init(&dqp->q_mplist); + mp->m_quotainfo->qi_dquots--; + mp->m_quotainfo->qi_dqreclaims++; + list_del_init(&dqp->q_hashlist); + dqp->q_hash->qh_version++; + list_del_init(&dqp->q_freelist); + xfs_Gqm->qm_dqfrlist_cnt--; + dqpout = dqp; + mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock); + mutex_unlock(&dqp->q_hash->qh_lock); +dqfunlock: xfs_dqfunlock(dqp); - xfs_qm_mplist_unlock(dqp->q_mount); - mutex_unlock(&hash->qh_lock); - - off_freelist: - XQM_FREELIST_REMOVE(dqp); xfs_dqunlock(dqp); - nreclaimed++; - XQM_STATS_INC(xqmstats.xs_qm_dqshake_reclaims); + if (dqpout) + break; + if (restarts >= XFS_QM_RECLAIM_MAX_RESTARTS) + return NULL; + } + mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); + return dqpout; +} + +/* + * Traverse the freelist of dquots and attempt to reclaim a maximum of + * 'howmany' dquots. This operation races with dqlookup(), and attempts to + * favor the lookup function ... + */ +STATIC int +xfs_qm_shake_freelist( + int howmany) +{ + int nreclaimed = 0; + xfs_dquot_t *dqp; + + if (howmany <= 0) + return 0; + + while (nreclaimed < howmany) { + dqp = xfs_qm_dqreclaim_one(); + if (!dqp) + return nreclaimed; xfs_qm_dqdestroy(dqp); - dqp = nextdqp; + nreclaimed++; } - xfs_qm_freelist_unlock(xfs_Gqm); return nreclaimed; } - /* * The kmem_shake interface is invoked when memory is running low. */ @@ -2097,7 +2126,7 @@ xfs_qm_shake(int nr_to_scan, gfp_t gfp_mask) if (!xfs_Gqm) return 0; - nfree = xfs_Gqm->qm_dqfreelist.qh_nelems; /* free dquots */ + nfree = xfs_Gqm->qm_dqfrlist_cnt; /* free dquots */ /* incore dquots in all f/s's */ ndqused = atomic_read(&xfs_Gqm->qm_totaldquots) - nfree; @@ -2113,131 +2142,6 @@ xfs_qm_shake(int nr_to_scan, gfp_t gfp_mask) } -/* - * Just pop the least recently used dquot off the freelist and - * recycle it. The returned dquot is locked. - */ -STATIC xfs_dquot_t * -xfs_qm_dqreclaim_one(void) -{ - xfs_dquot_t *dqpout; - xfs_dquot_t *dqp; - int restarts; - int nflushes; - - restarts = 0; - dqpout = NULL; - nflushes = 0; - - /* lockorder: hashchainlock, freelistlock, mplistlock, dqlock, dqflock */ - startagain: - xfs_qm_freelist_lock(xfs_Gqm); - - FOREACH_DQUOT_IN_FREELIST(dqp, &(xfs_Gqm->qm_dqfreelist)) { - xfs_dqlock(dqp); - - /* - * We are racing with dqlookup here. Naturally we don't - * want to reclaim a dquot that lookup wants. We release the - * freelist lock and start over, so that lookup will grab - * both the dquot and the freelistlock. - */ - if (dqp->dq_flags & XFS_DQ_WANT) { - ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE)); - - trace_xfs_dqreclaim_want(dqp); - - xfs_dqunlock(dqp); - xfs_qm_freelist_unlock(xfs_Gqm); - if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS) - return NULL; - XQM_STATS_INC(xqmstats.xs_qm_dqwants); - goto startagain; - } - - /* - * If the dquot is inactive, we are assured that it is - * not on the mplist or the hashlist, and that makes our - * life easier. - */ - if (dqp->dq_flags & XFS_DQ_INACTIVE) { - ASSERT(dqp->q_mount == NULL); - ASSERT(! XFS_DQ_IS_DIRTY(dqp)); - ASSERT(dqp->HL_PREVP == NULL); - ASSERT(dqp->MPL_PREVP == NULL); - XQM_FREELIST_REMOVE(dqp); - xfs_dqunlock(dqp); - dqpout = dqp; - XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims); - break; - } - - ASSERT(dqp->q_hash); - ASSERT(dqp->MPL_PREVP); - - /* - * Try to grab the flush lock. If this dquot is in the process of - * getting flushed to disk, we don't want to reclaim it. - */ - if (!xfs_dqflock_nowait(dqp)) { - xfs_dqunlock(dqp); - continue; - } - - /* - * We have the flush lock so we know that this is not in the - * process of being flushed. So, if this is dirty, flush it - * DELWRI so that we don't get a freelist infested with - * dirty dquots. - */ - if (XFS_DQ_IS_DIRTY(dqp)) { - int error; - - trace_xfs_dqreclaim_dirty(dqp); - - /* - * We flush it delayed write, so don't bother - * releasing the freelist lock. - */ - error = xfs_qm_dqflush(dqp, 0); - if (error) { - xfs_fs_cmn_err(CE_WARN, dqp->q_mount, - "xfs_qm_dqreclaim: dquot %p flush failed", dqp); - } - xfs_dqunlock(dqp); /* dqflush unlocks dqflock */ - continue; - } - - if (! xfs_qm_mplist_nowait(dqp->q_mount)) { - xfs_dqfunlock(dqp); - xfs_dqunlock(dqp); - continue; - } - - if (!mutex_trylock(&dqp->q_hash->qh_lock)) - goto mplistunlock; - - trace_xfs_dqreclaim_unlink(dqp); - - ASSERT(dqp->q_nrefs == 0); - XQM_MPLIST_REMOVE(&(XFS_QI_MPL_LIST(dqp->q_mount)), dqp); - XQM_HASHLIST_REMOVE(dqp->q_hash, dqp); - XQM_FREELIST_REMOVE(dqp); - dqpout = dqp; - mutex_unlock(&dqp->q_hash->qh_lock); - mplistunlock: - xfs_qm_mplist_unlock(dqp->q_mount); - xfs_dqfunlock(dqp); - xfs_dqunlock(dqp); - if (dqpout) - break; - } - - xfs_qm_freelist_unlock(xfs_Gqm); - return dqpout; -} - - /*------------------------------------------------------------------*/ /* @@ -2662,66 +2566,3 @@ xfs_qm_vop_create_dqattach( } } -/* ------------- list stuff -----------------*/ -STATIC void -xfs_qm_freelist_init(xfs_frlist_t *ql) -{ - ql->qh_next = ql->qh_prev = (xfs_dquot_t *) ql; - mutex_init(&ql->qh_lock); - ql->qh_version = 0; - ql->qh_nelems = 0; -} - -STATIC void -xfs_qm_freelist_destroy(xfs_frlist_t *ql) -{ - xfs_dquot_t *dqp, *nextdqp; - - mutex_lock(&ql->qh_lock); - for (dqp = ql->qh_next; - dqp != (xfs_dquot_t *)ql; ) { - xfs_dqlock(dqp); - nextdqp = dqp->dq_flnext; -#ifdef QUOTADEBUG - cmn_err(CE_DEBUG, "FREELIST destroy 0x%p", dqp); -#endif - XQM_FREELIST_REMOVE(dqp); - xfs_dqunlock(dqp); - xfs_qm_dqdestroy(dqp); - dqp = nextdqp; - } - mutex_unlock(&ql->qh_lock); - mutex_destroy(&ql->qh_lock); - - ASSERT(ql->qh_nelems == 0); -} - -STATIC void -xfs_qm_freelist_insert(xfs_frlist_t *ql, xfs_dquot_t *dq) -{ - dq->dq_flnext = ql->qh_next; - dq->dq_flprev = (xfs_dquot_t *)ql; - ql->qh_next = dq; - dq->dq_flnext->dq_flprev = dq; - xfs_Gqm->qm_dqfreelist.qh_nelems++; - xfs_Gqm->qm_dqfreelist.qh_version++; -} - -void -xfs_qm_freelist_unlink(xfs_dquot_t *dq) -{ - xfs_dquot_t *next = dq->dq_flnext; - xfs_dquot_t *prev = dq->dq_flprev; - - next->dq_flprev = prev; - prev->dq_flnext = next; - dq->dq_flnext = dq->dq_flprev = dq; - xfs_Gqm->qm_dqfreelist.qh_nelems--; - xfs_Gqm->qm_dqfreelist.qh_version++; -} - -void -xfs_qm_freelist_append(xfs_frlist_t *ql, xfs_dquot_t *dq) -{ - xfs_qm_freelist_insert((xfs_frlist_t *)ql->qh_prev, dq); -} diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h index 495564b8af3..c9446f1c726 100644 --- a/fs/xfs/quota/xfs_qm.h +++ b/fs/xfs/quota/xfs_qm.h @@ -72,17 +72,6 @@ extern kmem_zone_t *qm_dqtrxzone; #define XFS_QM_MAX_DQCLUSTER_LOGSZ 3 typedef xfs_dqhash_t xfs_dqlist_t; -/* - * The freelist head. The first two fields match the first two in the - * xfs_dquot_t structure (in xfs_dqmarker_t) - */ -typedef struct xfs_frlist { - struct xfs_dquot *qh_next; - struct xfs_dquot *qh_prev; - struct mutex qh_lock; - uint qh_version; - uint qh_nelems; -} xfs_frlist_t; /* * Quota Manager (global) structure. Lives only in core. @@ -91,7 +80,9 @@ typedef struct xfs_qm { xfs_dqlist_t *qm_usr_dqhtable;/* udquot hash table */ xfs_dqlist_t *qm_grp_dqhtable;/* gdquot hash table */ uint qm_dqhashmask; /* # buckets in dq hashtab - 1 */ - xfs_frlist_t qm_dqfreelist; /* freelist of dquots */ + struct list_head qm_dqfrlist; /* freelist of dquots */ + struct mutex qm_dqfrlist_lock; + int qm_dqfrlist_cnt; atomic_t qm_totaldquots; /* total incore dquots */ uint qm_nrefs; /* file systems with quota on */ int qm_dqfree_ratio;/* ratio of free to inuse dquots */ @@ -106,7 +97,9 @@ typedef struct xfs_qm { typedef struct xfs_quotainfo { xfs_inode_t *qi_uquotaip; /* user quota inode */ xfs_inode_t *qi_gquotaip; /* group quota inode */ - xfs_dqlist_t qi_dqlist; /* all dquots in filesys */ + struct list_head qi_dqlist; /* all dquots in filesys */ + struct mutex qi_dqlist_lock; + int qi_dquots; int qi_dqreclaims; /* a change here indicates a removal in the dqlist */ time_t qi_btimelimit; /* limit for blks timer */ @@ -175,10 +168,6 @@ extern int xfs_qm_scall_getqstat(xfs_mount_t *, fs_quota_stat_t *); extern int xfs_qm_scall_quotaon(xfs_mount_t *, uint); extern int xfs_qm_scall_quotaoff(xfs_mount_t *, uint); -/* list stuff */ -extern void xfs_qm_freelist_append(xfs_frlist_t *, xfs_dquot_t *); -extern void xfs_qm_freelist_unlink(xfs_dquot_t *); - #ifdef DEBUG extern int xfs_qm_internalqcheck(xfs_mount_t *); #else diff --git a/fs/xfs/quota/xfs_qm_stats.c b/fs/xfs/quota/xfs_qm_stats.c index 83e7ea3e25f..3d1fc79532e 100644 --- a/fs/xfs/quota/xfs_qm_stats.c +++ b/fs/xfs/quota/xfs_qm_stats.c @@ -55,7 +55,7 @@ static int xqm_proc_show(struct seq_file *m, void *v) ndquot, xfs_Gqm? atomic_read(&xfs_Gqm->qm_totaldquots) : 0, xfs_Gqm? xfs_Gqm->qm_dqfree_ratio : 0, - xfs_Gqm? xfs_Gqm->qm_dqfreelist.qh_nelems : 0); + xfs_Gqm? xfs_Gqm->qm_dqfrlist_cnt : 0); return 0; } diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c index 50bee07d6b0..92b002f1805 100644 --- a/fs/xfs/quota/xfs_qm_syscalls.c +++ b/fs/xfs/quota/xfs_qm_syscalls.c @@ -79,6 +79,7 @@ xfs_qm_scall_quotaoff( xfs_mount_t *mp, uint flags) { + struct xfs_quotainfo *q = mp->m_quotainfo; uint dqtype; int error; uint inactivate_flags; @@ -102,11 +103,8 @@ xfs_qm_scall_quotaoff( * critical thing. * If quotaoff, then we must be dealing with the root filesystem. */ - ASSERT(mp->m_quotainfo); - if (mp->m_quotainfo) - mutex_lock(&(XFS_QI_QOFFLOCK(mp))); - - ASSERT(mp->m_quotainfo); + ASSERT(q); + mutex_lock(&q->qi_quotaofflock); /* * If we're just turning off quota enforcement, change mp and go. @@ -117,7 +115,7 @@ xfs_qm_scall_quotaoff( spin_lock(&mp->m_sb_lock); mp->m_sb.sb_qflags = mp->m_qflags; spin_unlock(&mp->m_sb_lock); - mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); + mutex_unlock(&q->qi_quotaofflock); /* XXX what to do if error ? Revert back to old vals incore ? */ error = xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS); @@ -150,10 +148,8 @@ xfs_qm_scall_quotaoff( * Nothing to do? Don't complain. This happens when we're just * turning off quota enforcement. */ - if ((mp->m_qflags & flags) == 0) { - mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); - return (0); - } + if ((mp->m_qflags & flags) == 0) + goto out_unlock; /* * Write the LI_QUOTAOFF log record, and do SB changes atomically, @@ -162,7 +158,7 @@ xfs_qm_scall_quotaoff( */ error = xfs_qm_log_quotaoff(mp, &qoffstart, flags); if (error) - goto out_error; + goto out_unlock; /* * Next we clear the XFS_MOUNT_*DQ_ACTIVE bit(s) in the mount struct @@ -204,7 +200,7 @@ xfs_qm_scall_quotaoff( * So, if we couldn't purge all the dquots from the filesystem, * we can't get rid of the incore data structures. */ - while ((nculprits = xfs_qm_dqpurge_all(mp, dqtype|XFS_QMOPT_QUOTAOFF))) + while ((nculprits = xfs_qm_dqpurge_all(mp, dqtype))) delay(10 * nculprits); /* @@ -222,7 +218,7 @@ xfs_qm_scall_quotaoff( if (error) { /* We're screwed now. Shutdown is the only option. */ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); - goto out_error; + goto out_unlock; } /* @@ -230,27 +226,26 @@ xfs_qm_scall_quotaoff( */ if (((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET1) || ((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET2)) { - mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); + mutex_unlock(&q->qi_quotaofflock); xfs_qm_destroy_quotainfo(mp); return (0); } /* - * Release our quotainode references, and vn_purge them, - * if we don't need them anymore. + * Release our quotainode references if we don't need them anymore. */ - if ((dqtype & XFS_QMOPT_UQUOTA) && XFS_QI_UQIP(mp)) { - IRELE(XFS_QI_UQIP(mp)); - XFS_QI_UQIP(mp) = NULL; + if ((dqtype & XFS_QMOPT_UQUOTA) && q->qi_uquotaip) { + IRELE(q->qi_uquotaip); + q->qi_uquotaip = NULL; } - if ((dqtype & (XFS_QMOPT_GQUOTA|XFS_QMOPT_PQUOTA)) && XFS_QI_GQIP(mp)) { - IRELE(XFS_QI_GQIP(mp)); - XFS_QI_GQIP(mp) = NULL; + if ((dqtype & (XFS_QMOPT_GQUOTA|XFS_QMOPT_PQUOTA)) && q->qi_gquotaip) { + IRELE(q->qi_gquotaip); + q->qi_gquotaip = NULL; } -out_error: - mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); - return (error); +out_unlock: + mutex_unlock(&q->qi_quotaofflock); + return error; } int @@ -379,9 +374,9 @@ xfs_qm_scall_quotaon( /* * Switch on quota enforcement in core. */ - mutex_lock(&(XFS_QI_QOFFLOCK(mp))); + mutex_lock(&mp->m_quotainfo->qi_quotaofflock); mp->m_qflags |= (flags & XFS_ALL_QUOTA_ENFD); - mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); + mutex_unlock(&mp->m_quotainfo->qi_quotaofflock); return (0); } @@ -392,11 +387,12 @@ xfs_qm_scall_quotaon( */ int xfs_qm_scall_getqstat( - xfs_mount_t *mp, - fs_quota_stat_t *out) + struct xfs_mount *mp, + struct fs_quota_stat *out) { - xfs_inode_t *uip, *gip; - boolean_t tempuqip, tempgqip; + struct xfs_quotainfo *q = mp->m_quotainfo; + struct xfs_inode *uip, *gip; + boolean_t tempuqip, tempgqip; uip = gip = NULL; tempuqip = tempgqip = B_FALSE; @@ -415,9 +411,9 @@ xfs_qm_scall_getqstat( out->qs_uquota.qfs_ino = mp->m_sb.sb_uquotino; out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino; - if (mp->m_quotainfo) { - uip = mp->m_quotainfo->qi_uquotaip; - gip = mp->m_quotainfo->qi_gquotaip; + if (q) { + uip = q->qi_uquotaip; + gip = q->qi_gquotaip; } if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) { if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, @@ -441,17 +437,20 @@ xfs_qm_scall_getqstat( if (tempgqip) IRELE(gip); } - if (mp->m_quotainfo) { - out->qs_incoredqs = XFS_QI_MPLNDQUOTS(mp); - out->qs_btimelimit = XFS_QI_BTIMELIMIT(mp); - out->qs_itimelimit = XFS_QI_ITIMELIMIT(mp); - out->qs_rtbtimelimit = XFS_QI_RTBTIMELIMIT(mp); - out->qs_bwarnlimit = XFS_QI_BWARNLIMIT(mp); - out->qs_iwarnlimit = XFS_QI_IWARNLIMIT(mp); + if (q) { + out->qs_incoredqs = q->qi_dquots; + out->qs_btimelimit = q->qi_btimelimit; + out->qs_itimelimit = q->qi_itimelimit; + out->qs_rtbtimelimit = q->qi_rtbtimelimit; + out->qs_bwarnlimit = q->qi_bwarnlimit; + out->qs_iwarnlimit = q->qi_iwarnlimit; } - return (0); + return 0; } +#define XFS_DQ_MASK \ + (FS_DQ_LIMIT_MASK | FS_DQ_TIMER_MASK | FS_DQ_WARNS_MASK) + /* * Adjust quota limits, and start/stop timers accordingly. */ @@ -462,15 +461,17 @@ xfs_qm_scall_setqlim( uint type, fs_disk_quota_t *newlim) { + struct xfs_quotainfo *q = mp->m_quotainfo; xfs_disk_dquot_t *ddq; xfs_dquot_t *dqp; xfs_trans_t *tp; int error; xfs_qcnt_t hard, soft; - if ((newlim->d_fieldmask & - (FS_DQ_LIMIT_MASK|FS_DQ_TIMER_MASK|FS_DQ_WARNS_MASK)) == 0) - return (0); + if (newlim->d_fieldmask & ~XFS_DQ_MASK) + return EINVAL; + if ((newlim->d_fieldmask & XFS_DQ_MASK) == 0) + return 0; tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM); if ((error = xfs_trans_reserve(tp, 0, sizeof(xfs_disk_dquot_t) + 128, @@ -485,7 +486,7 @@ xfs_qm_scall_setqlim( * a quotaoff from happening). (XXXThis doesn't currently happen * because we take the vfslock before calling xfs_qm_sysent). */ - mutex_lock(&(XFS_QI_QOFFLOCK(mp))); + mutex_lock(&q->qi_quotaofflock); /* * Get the dquot (locked), and join it to the transaction. @@ -493,9 +494,8 @@ xfs_qm_scall_setqlim( */ if ((error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp))) { xfs_trans_cancel(tp, XFS_TRANS_ABORT); - mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); ASSERT(error != ENOENT); - return (error); + goto out_unlock; } xfs_trans_dqjoin(tp, dqp); ddq = &dqp->q_core; @@ -513,8 +513,8 @@ xfs_qm_scall_setqlim( ddq->d_blk_hardlimit = cpu_to_be64(hard); ddq->d_blk_softlimit = cpu_to_be64(soft); if (id == 0) { - mp->m_quotainfo->qi_bhardlimit = hard; - mp->m_quotainfo->qi_bsoftlimit = soft; + q->qi_bhardlimit = hard; + q->qi_bsoftlimit = soft; } } else { qdprintk("blkhard %Ld < blksoft %Ld\n", hard, soft); @@ -529,8 +529,8 @@ xfs_qm_scall_setqlim( ddq->d_rtb_hardlimit = cpu_to_be64(hard); ddq->d_rtb_softlimit = cpu_to_be64(soft); if (id == 0) { - mp->m_quotainfo->qi_rtbhardlimit = hard; - mp->m_quotainfo->qi_rtbsoftlimit = soft; + q->qi_rtbhardlimit = hard; + q->qi_rtbsoftlimit = soft; } } else { qdprintk("rtbhard %Ld < rtbsoft %Ld\n", hard, soft); @@ -546,8 +546,8 @@ xfs_qm_scall_setqlim( ddq->d_ino_hardlimit = cpu_to_be64(hard); ddq->d_ino_softlimit = cpu_to_be64(soft); if (id == 0) { - mp->m_quotainfo->qi_ihardlimit = hard; - mp->m_quotainfo->qi_isoftlimit = soft; + q->qi_ihardlimit = hard; + q->qi_isoftlimit = soft; } } else { qdprintk("ihard %Ld < isoft %Ld\n", hard, soft); @@ -572,23 +572,23 @@ xfs_qm_scall_setqlim( * for warnings. */ if (newlim->d_fieldmask & FS_DQ_BTIMER) { - mp->m_quotainfo->qi_btimelimit = newlim->d_btimer; + q->qi_btimelimit = newlim->d_btimer; ddq->d_btimer = cpu_to_be32(newlim->d_btimer); } if (newlim->d_fieldmask & FS_DQ_ITIMER) { - mp->m_quotainfo->qi_itimelimit = newlim->d_itimer; + q->qi_itimelimit = newlim->d_itimer; ddq->d_itimer = cpu_to_be32(newlim->d_itimer); } if (newlim->d_fieldmask & FS_DQ_RTBTIMER) { - mp->m_quotainfo->qi_rtbtimelimit = newlim->d_rtbtimer; + q->qi_rtbtimelimit = newlim->d_rtbtimer; ddq->d_rtbtimer = cpu_to_be32(newlim->d_rtbtimer); } if (newlim->d_fieldmask & FS_DQ_BWARNS) - mp->m_quotainfo->qi_bwarnlimit = newlim->d_bwarns; + q->qi_bwarnlimit = newlim->d_bwarns; if (newlim->d_fieldmask & FS_DQ_IWARNS) - mp->m_quotainfo->qi_iwarnlimit = newlim->d_iwarns; + q->qi_iwarnlimit = newlim->d_iwarns; if (newlim->d_fieldmask & FS_DQ_RTBWARNS) - mp->m_quotainfo->qi_rtbwarnlimit = newlim->d_rtbwarns; + q->qi_rtbwarnlimit = newlim->d_rtbwarns; } else { /* * If the user is now over quota, start the timelimit. @@ -605,8 +605,9 @@ xfs_qm_scall_setqlim( error = xfs_trans_commit(tp, 0); xfs_qm_dqprint(dqp); xfs_qm_dqrele(dqp); - mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); + out_unlock: + mutex_unlock(&q->qi_quotaofflock); return error; } @@ -853,7 +854,8 @@ xfs_dqrele_inode( int error; /* skip quota inodes */ - if (ip == XFS_QI_UQIP(ip->i_mount) || ip == XFS_QI_GQIP(ip->i_mount)) { + if (ip == ip->i_mount->m_quotainfo->qi_uquotaip || + ip == ip->i_mount->m_quotainfo->qi_gquotaip) { ASSERT(ip->i_udquot == NULL); ASSERT(ip->i_gdquot == NULL); read_unlock(&pag->pag_ici_lock); @@ -931,7 +933,8 @@ struct mutex qcheck_lock; } typedef struct dqtest { - xfs_dqmarker_t q_lists; + uint dq_flags; /* various flags (XFS_DQ_*) */ + struct list_head q_hashlist; xfs_dqhash_t *q_hash; /* the hashchain header */ xfs_mount_t *q_mount; /* filesystem this relates to */ xfs_dqid_t d_id; /* user id or group id */ @@ -942,14 +945,9 @@ typedef struct dqtest { STATIC void xfs_qm_hashinsert(xfs_dqhash_t *h, xfs_dqtest_t *dqp) { - xfs_dquot_t *d; - if (((d) = (h)->qh_next)) - (d)->HL_PREVP = &((dqp)->HL_NEXT); - (dqp)->HL_NEXT = d; - (dqp)->HL_PREVP = &((h)->qh_next); - (h)->qh_next = (xfs_dquot_t *)dqp; - (h)->qh_version++; - (h)->qh_nelems++; + list_add(&dqp->q_hashlist, &h->qh_list); + h->qh_version++; + h->qh_nelems++; } STATIC void xfs_qm_dqtest_print( @@ -1061,9 +1059,7 @@ xfs_qm_internalqcheck_dqget( xfs_dqhash_t *h; h = DQTEST_HASH(mp, id, type); - for (d = (xfs_dqtest_t *) h->qh_next; d != NULL; - d = (xfs_dqtest_t *) d->HL_NEXT) { - /* DQTEST_LIST_PRINT(h, HL_NEXT, "@@@@@ dqtestlist @@@@@"); */ + list_for_each_entry(d, &h->qh_list, q_hashlist) { if (d->d_id == id && mp == d->q_mount) { *O_dq = d; return (0); @@ -1074,6 +1070,7 @@ xfs_qm_internalqcheck_dqget( d->d_id = id; d->q_mount = mp; d->q_hash = h; + INIT_LIST_HEAD(&d->q_hashlist); xfs_qm_hashinsert(h, d); *O_dq = d; return (0); @@ -1180,8 +1177,6 @@ xfs_qm_internalqcheck( xfs_ino_t lastino; int done, count; int i; - xfs_dqtest_t *d, *e; - xfs_dqhash_t *h1; int error; lastino = 0; @@ -1221,19 +1216,18 @@ xfs_qm_internalqcheck( } cmn_err(CE_DEBUG, "Checking results against system dquots"); for (i = 0; i < qmtest_hashmask; i++) { - h1 = &qmtest_udqtab[i]; - for (d = (xfs_dqtest_t *) h1->qh_next; d != NULL; ) { + xfs_dqtest_t *d, *n; + xfs_dqhash_t *h; + + h = &qmtest_udqtab[i]; + list_for_each_entry_safe(d, n, &h->qh_list, q_hashlist) { xfs_dqtest_cmp(d); - e = (xfs_dqtest_t *) d->HL_NEXT; kmem_free(d); - d = e; } - h1 = &qmtest_gdqtab[i]; - for (d = (xfs_dqtest_t *) h1->qh_next; d != NULL; ) { + h = &qmtest_gdqtab[i]; + list_for_each_entry_safe(d, n, &h->qh_list, q_hashlist) { xfs_dqtest_cmp(d); - e = (xfs_dqtest_t *) d->HL_NEXT; kmem_free(d); - d = e; } } diff --git a/fs/xfs/quota/xfs_quota_priv.h b/fs/xfs/quota/xfs_quota_priv.h index 8286b2842b6..94a3d927d71 100644 --- a/fs/xfs/quota/xfs_quota_priv.h +++ b/fs/xfs/quota/xfs_quota_priv.h @@ -24,43 +24,6 @@ */ #define XFS_DQITER_MAP_SIZE 10 -/* Number of dquots that fit in to a dquot block */ -#define XFS_QM_DQPERBLK(mp) ((mp)->m_quotainfo->qi_dqperchunk) - -#define XFS_DQ_IS_ADDEDTO_TRX(t, d) ((d)->q_transp == (t)) - -#define XFS_QI_MPLRECLAIMS(mp) ((mp)->m_quotainfo->qi_dqreclaims) -#define XFS_QI_UQIP(mp) ((mp)->m_quotainfo->qi_uquotaip) -#define XFS_QI_GQIP(mp) ((mp)->m_quotainfo->qi_gquotaip) -#define XFS_QI_DQCHUNKLEN(mp) ((mp)->m_quotainfo->qi_dqchunklen) -#define XFS_QI_BTIMELIMIT(mp) ((mp)->m_quotainfo->qi_btimelimit) -#define XFS_QI_RTBTIMELIMIT(mp) ((mp)->m_quotainfo->qi_rtbtimelimit) -#define XFS_QI_ITIMELIMIT(mp) ((mp)->m_quotainfo->qi_itimelimit) -#define XFS_QI_BWARNLIMIT(mp) ((mp)->m_quotainfo->qi_bwarnlimit) -#define XFS_QI_RTBWARNLIMIT(mp) ((mp)->m_quotainfo->qi_rtbwarnlimit) -#define XFS_QI_IWARNLIMIT(mp) ((mp)->m_quotainfo->qi_iwarnlimit) -#define XFS_QI_QOFFLOCK(mp) ((mp)->m_quotainfo->qi_quotaofflock) - -#define XFS_QI_MPL_LIST(mp) ((mp)->m_quotainfo->qi_dqlist) -#define XFS_QI_MPLNEXT(mp) ((mp)->m_quotainfo->qi_dqlist.qh_next) -#define XFS_QI_MPLNDQUOTS(mp) ((mp)->m_quotainfo->qi_dqlist.qh_nelems) - -#define xfs_qm_mplist_lock(mp) \ - mutex_lock(&(XFS_QI_MPL_LIST(mp).qh_lock)) -#define xfs_qm_mplist_nowait(mp) \ - mutex_trylock(&(XFS_QI_MPL_LIST(mp).qh_lock)) -#define xfs_qm_mplist_unlock(mp) \ - mutex_unlock(&(XFS_QI_MPL_LIST(mp).qh_lock)) -#define XFS_QM_IS_MPLIST_LOCKED(mp) \ - mutex_is_locked(&(XFS_QI_MPL_LIST(mp).qh_lock)) - -#define xfs_qm_freelist_lock(qm) \ - mutex_lock(&((qm)->qm_dqfreelist.qh_lock)) -#define xfs_qm_freelist_lock_nowait(qm) \ - mutex_trylock(&((qm)->qm_dqfreelist.qh_lock)) -#define xfs_qm_freelist_unlock(qm) \ - mutex_unlock(&((qm)->qm_dqfreelist.qh_lock)) - /* * Hash into a bucket in the dquot hash table, based on <mp, id>. */ @@ -72,9 +35,6 @@ XFS_DQ_HASHVAL(mp, id)) : \ (xfs_Gqm->qm_grp_dqhtable + \ XFS_DQ_HASHVAL(mp, id))) -#define XFS_IS_DQTYPE_ON(mp, type) (type == XFS_DQ_USER ? \ - XFS_IS_UQUOTA_ON(mp) : \ - XFS_IS_OQUOTA_ON(mp)) #define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \ !dqp->q_core.d_blk_hardlimit && \ !dqp->q_core.d_blk_softlimit && \ @@ -86,68 +46,6 @@ !dqp->q_core.d_rtbcount && \ !dqp->q_core.d_icount) -#define HL_PREVP dq_hashlist.ql_prevp -#define HL_NEXT dq_hashlist.ql_next -#define MPL_PREVP dq_mplist.ql_prevp -#define MPL_NEXT dq_mplist.ql_next - - -#define _LIST_REMOVE(h, dqp, PVP, NXT) \ - { \ - xfs_dquot_t *d; \ - if (((d) = (dqp)->NXT)) \ - (d)->PVP = (dqp)->PVP; \ - *((dqp)->PVP) = d; \ - (dqp)->NXT = NULL; \ - (dqp)->PVP = NULL; \ - (h)->qh_version++; \ - (h)->qh_nelems--; \ - } - -#define _LIST_INSERT(h, dqp, PVP, NXT) \ - { \ - xfs_dquot_t *d; \ - if (((d) = (h)->qh_next)) \ - (d)->PVP = &((dqp)->NXT); \ - (dqp)->NXT = d; \ - (dqp)->PVP = &((h)->qh_next); \ - (h)->qh_next = dqp; \ - (h)->qh_version++; \ - (h)->qh_nelems++; \ - } - -#define FOREACH_DQUOT_IN_MP(dqp, mp) \ - for ((dqp) = XFS_QI_MPLNEXT(mp); (dqp) != NULL; (dqp) = (dqp)->MPL_NEXT) - -#define FOREACH_DQUOT_IN_FREELIST(dqp, qlist) \ -for ((dqp) = (qlist)->qh_next; (dqp) != (xfs_dquot_t *)(qlist); \ - (dqp) = (dqp)->dq_flnext) - -#define XQM_HASHLIST_INSERT(h, dqp) \ - _LIST_INSERT(h, dqp, HL_PREVP, HL_NEXT) - -#define XQM_FREELIST_INSERT(h, dqp) \ - xfs_qm_freelist_append(h, dqp) - -#define XQM_MPLIST_INSERT(h, dqp) \ - _LIST_INSERT(h, dqp, MPL_PREVP, MPL_NEXT) - -#define XQM_HASHLIST_REMOVE(h, dqp) \ - _LIST_REMOVE(h, dqp, HL_PREVP, HL_NEXT) -#define XQM_FREELIST_REMOVE(dqp) \ - xfs_qm_freelist_unlink(dqp) -#define XQM_MPLIST_REMOVE(h, dqp) \ - { _LIST_REMOVE(h, dqp, MPL_PREVP, MPL_NEXT); \ - XFS_QI_MPLRECLAIMS((dqp)->q_mount)++; } - -#define XFS_DQ_IS_LOGITEM_INITD(dqp) ((dqp)->q_logitem.qli_dquot == (dqp)) - -#define XFS_QM_DQP_TO_DQACCT(tp, dqp) (XFS_QM_ISUDQ(dqp) ? \ - (tp)->t_dqinfo->dqa_usrdquots : \ - (tp)->t_dqinfo->dqa_grpdquots) -#define XFS_IS_SUSER_DQUOT(dqp) \ - (!((dqp)->q_core.d_id)) - #define DQFLAGTO_TYPESTR(d) (((d)->dq_flags & XFS_DQ_USER) ? "USR" : \ (((d)->dq_flags & XFS_DQ_GROUP) ? "GRP" : \ (((d)->dq_flags & XFS_DQ_PROJ) ? "PRJ":"???"))) diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/quota/xfs_trans_dquot.c index c3ab75cb1d9..061d827da33 100644 --- a/fs/xfs/quota/xfs_trans_dquot.c +++ b/fs/xfs/quota/xfs_trans_dquot.c @@ -59,12 +59,11 @@ xfs_trans_dqjoin( xfs_trans_t *tp, xfs_dquot_t *dqp) { - xfs_dq_logitem_t *lp; + xfs_dq_logitem_t *lp = &dqp->q_logitem; - ASSERT(! XFS_DQ_IS_ADDEDTO_TRX(tp, dqp)); + ASSERT(dqp->q_transp != tp); ASSERT(XFS_DQ_IS_LOCKED(dqp)); - ASSERT(XFS_DQ_IS_LOGITEM_INITD(dqp)); - lp = &dqp->q_logitem; + ASSERT(lp->qli_dquot == dqp); /* * Get a log_item_desc to point at the new item. @@ -96,7 +95,7 @@ xfs_trans_log_dquot( { xfs_log_item_desc_t *lidp; - ASSERT(XFS_DQ_IS_ADDEDTO_TRX(tp, dqp)); + ASSERT(dqp->q_transp == tp); ASSERT(XFS_DQ_IS_LOCKED(dqp)); lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)(&dqp->q_logitem)); @@ -198,16 +197,16 @@ xfs_trans_get_dqtrx( int i; xfs_dqtrx_t *qa; - for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) { - qa = XFS_QM_DQP_TO_DQACCT(tp, dqp); + qa = XFS_QM_ISUDQ(dqp) ? + tp->t_dqinfo->dqa_usrdquots : tp->t_dqinfo->dqa_grpdquots; + for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) { if (qa[i].qt_dquot == NULL || - qa[i].qt_dquot == dqp) { - return (&qa[i]); - } + qa[i].qt_dquot == dqp) + return &qa[i]; } - return (NULL); + return NULL; } /* @@ -381,7 +380,7 @@ xfs_trans_apply_dquot_deltas( break; ASSERT(XFS_DQ_IS_LOCKED(dqp)); - ASSERT(XFS_DQ_IS_ADDEDTO_TRX(tp, dqp)); + ASSERT(dqp->q_transp == tp); /* * adjust the actual number of blocks used @@ -639,7 +638,7 @@ xfs_trans_dqresv( softlimit = q->qi_bsoftlimit; timer = be32_to_cpu(dqp->q_core.d_btimer); warns = be16_to_cpu(dqp->q_core.d_bwarns); - warnlimit = XFS_QI_BWARNLIMIT(dqp->q_mount); + warnlimit = dqp->q_mount->m_quotainfo->qi_bwarnlimit; resbcountp = &dqp->q_res_bcount; } else { ASSERT(flags & XFS_TRANS_DQ_RES_RTBLKS); @@ -651,7 +650,7 @@ xfs_trans_dqresv( softlimit = q->qi_rtbsoftlimit; timer = be32_to_cpu(dqp->q_core.d_rtbtimer); warns = be16_to_cpu(dqp->q_core.d_rtbwarns); - warnlimit = XFS_QI_RTBWARNLIMIT(dqp->q_mount); + warnlimit = dqp->q_mount->m_quotainfo->qi_rtbwarnlimit; resbcountp = &dqp->q_res_rtbcount; } @@ -691,7 +690,7 @@ xfs_trans_dqresv( count = be64_to_cpu(dqp->q_core.d_icount); timer = be32_to_cpu(dqp->q_core.d_itimer); warns = be16_to_cpu(dqp->q_core.d_iwarns); - warnlimit = XFS_QI_IWARNLIMIT(dqp->q_mount); + warnlimit = dqp->q_mount->m_quotainfo->qi_iwarnlimit; hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit); if (!hardlimit) hardlimit = q->qi_ihardlimit; diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h index d13eeba2c8f..0135e2a669d 100644 --- a/fs/xfs/xfs_acl.h +++ b/fs/xfs/xfs_acl.h @@ -49,8 +49,8 @@ extern int xfs_acl_chmod(struct inode *inode); extern int posix_acl_access_exists(struct inode *inode); extern int posix_acl_default_exists(struct inode *inode); -extern struct xattr_handler xfs_xattr_acl_access_handler; -extern struct xattr_handler xfs_xattr_acl_default_handler; +extern const struct xattr_handler xfs_xattr_acl_access_handler; +extern const struct xattr_handler xfs_xattr_acl_default_handler; #else # define xfs_check_acl NULL # define xfs_get_acl(inode, type) NULL diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h index abb8222b88c..401f364ad36 100644 --- a/fs/xfs/xfs_ag.h +++ b/fs/xfs/xfs_ag.h @@ -175,14 +175,20 @@ typedef struct xfs_agfl { } xfs_agfl_t; /* - * Busy block/extent entry. Used in perag to mark blocks that have been freed - * but whose transactions aren't committed to disk yet. + * Busy block/extent entry. Indexed by a rbtree in perag to mark blocks that + * have been freed but whose transactions aren't committed to disk yet. + * + * Note that we use the transaction ID to record the transaction, not the + * transaction structure itself. See xfs_alloc_busy_insert() for details. */ -typedef struct xfs_perag_busy { - xfs_agblock_t busy_start; - xfs_extlen_t busy_length; - struct xfs_trans *busy_tp; /* transaction that did the free */ -} xfs_perag_busy_t; +struct xfs_busy_extent { + struct rb_node rb_node; /* ag by-bno indexed search tree */ + struct list_head list; /* transaction busy extent list */ + xfs_agnumber_t agno; + xfs_agblock_t bno; + xfs_extlen_t length; + xlog_tid_t tid; /* transaction that created this */ +}; /* * Per-ag incore structure, copies of information in agf and agi, @@ -216,7 +222,8 @@ typedef struct xfs_perag { xfs_agino_t pagl_leftrec; xfs_agino_t pagl_rightrec; #ifdef __KERNEL__ - spinlock_t pagb_lock; /* lock for pagb_list */ + spinlock_t pagb_lock; /* lock for pagb_tree */ + struct rb_root pagb_tree; /* ordered tree of busy extents */ atomic_t pagf_fstrms; /* # of filestreams active in this AG */ @@ -226,7 +233,6 @@ typedef struct xfs_perag { int pag_ici_reclaimable; /* reclaimable inodes */ #endif int pagb_count; /* pagb slots in use */ - xfs_perag_busy_t pagb_list[XFS_PAGB_NUM_SLOTS]; /* unstable blocks */ } xfs_perag_t; /* diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 94cddbfb256..a7fbe8a99b1 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -46,11 +46,9 @@ #define XFSA_FIXUP_BNO_OK 1 #define XFSA_FIXUP_CNT_OK 2 -STATIC void -xfs_alloc_search_busy(xfs_trans_t *tp, - xfs_agnumber_t agno, - xfs_agblock_t bno, - xfs_extlen_t len); +static int +xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t bno, xfs_extlen_t len); /* * Prototypes for per-ag allocation routines @@ -540,9 +538,16 @@ xfs_alloc_ag_vextent( be32_to_cpu(agf->agf_length)); xfs_alloc_log_agf(args->tp, args->agbp, XFS_AGF_FREEBLKS); - /* search the busylist for these blocks */ - xfs_alloc_search_busy(args->tp, args->agno, - args->agbno, args->len); + /* + * Search the busylist for these blocks and mark the + * transaction as synchronous if blocks are found. This + * avoids the need to block due to a synchronous log + * force to ensure correct ordering as the synchronous + * transaction will guarantee that for us. + */ + if (xfs_alloc_busy_search(args->mp, args->agno, + args->agbno, args->len)) + xfs_trans_set_sync(args->tp); } if (!args->isfl) xfs_trans_mod_sb(args->tp, @@ -1693,7 +1698,7 @@ xfs_free_ag_extent( * when the iclog commits to disk. If a busy block is allocated, * the iclog is pushed up to the LSN that freed the block. */ - xfs_alloc_mark_busy(tp, agno, bno, len); + xfs_alloc_busy_insert(tp, agno, bno, len); return 0; error0: @@ -1989,14 +1994,20 @@ xfs_alloc_get_freelist( *bnop = bno; /* - * As blocks are freed, they are added to the per-ag busy list - * and remain there until the freeing transaction is committed to - * disk. Now that we have allocated blocks, this list must be - * searched to see if a block is being reused. If one is, then - * the freeing transaction must be pushed to disk NOW by forcing - * to disk all iclogs up that transaction's LSN. + * As blocks are freed, they are added to the per-ag busy list and + * remain there until the freeing transaction is committed to disk. + * Now that we have allocated blocks, this list must be searched to see + * if a block is being reused. If one is, then the freeing transaction + * must be pushed to disk before this transaction. + * + * We do this by setting the current transaction to a sync transaction + * which guarantees that the freeing transaction is on disk before this + * transaction. This is done instead of a synchronous log force here so + * that we don't sit and wait with the AGF locked in the transaction + * during the log force. */ - xfs_alloc_search_busy(tp, be32_to_cpu(agf->agf_seqno), bno, 1); + if (xfs_alloc_busy_search(mp, be32_to_cpu(agf->agf_seqno), bno, 1)) + xfs_trans_set_sync(tp); return 0; } @@ -2201,7 +2212,7 @@ xfs_alloc_read_agf( be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]); spin_lock_init(&pag->pagb_lock); pag->pagb_count = 0; - memset(pag->pagb_list, 0, sizeof(pag->pagb_list)); + pag->pagb_tree = RB_ROOT; pag->pagf_init = 1; } #ifdef DEBUG @@ -2479,127 +2490,263 @@ error0: * list is reused, the transaction that freed it must be forced to disk * before continuing to use the block. * - * xfs_alloc_mark_busy - add to the per-ag busy list - * xfs_alloc_clear_busy - remove an item from the per-ag busy list + * xfs_alloc_busy_insert - add to the per-ag busy list + * xfs_alloc_busy_clear - remove an item from the per-ag busy list + * xfs_alloc_busy_search - search for a busy extent + */ + +/* + * Insert a new extent into the busy tree. + * + * The busy extent tree is indexed by the start block of the busy extent. + * there can be multiple overlapping ranges in the busy extent tree but only + * ever one entry at a given start block. The reason for this is that + * multi-block extents can be freed, then smaller chunks of that extent + * allocated and freed again before the first transaction commit is on disk. + * If the exact same start block is freed a second time, we have to wait for + * that busy extent to pass out of the tree before the new extent is inserted. + * There are two main cases we have to handle here. + * + * The first case is a transaction that triggers a "free - allocate - free" + * cycle. This can occur during btree manipulations as a btree block is freed + * to the freelist, then allocated from the free list, then freed again. In + * this case, the second extxpnet free is what triggers the duplicate and as + * such the transaction IDs should match. Because the extent was allocated in + * this transaction, the transaction must be marked as synchronous. This is + * true for all cases where the free/alloc/free occurs in the one transaction, + * hence the addition of the ASSERT(tp->t_flags & XFS_TRANS_SYNC) to this case. + * This serves to catch violations of the second case quite effectively. + * + * The second case is where the free/alloc/free occur in different + * transactions. In this case, the thread freeing the extent the second time + * can't mark the extent busy immediately because it is already tracked in a + * transaction that may be committing. When the log commit for the existing + * busy extent completes, the busy extent will be removed from the tree. If we + * allow the second busy insert to continue using that busy extent structure, + * it can be freed before this transaction is safely in the log. Hence our + * only option in this case is to force the log to remove the existing busy + * extent from the list before we insert the new one with the current + * transaction ID. + * + * The problem we are trying to avoid in the free-alloc-free in separate + * transactions is most easily described with a timeline: + * + * Thread 1 Thread 2 Thread 3 xfslogd + * xact alloc + * free X + * mark busy + * commit xact + * free xact + * xact alloc + * alloc X + * busy search + * mark xact sync + * commit xact + * free xact + * force log + * checkpoint starts + * .... + * xact alloc + * free X + * mark busy + * finds match + * *** KABOOM! *** + * .... + * log IO completes + * unbusy X + * checkpoint completes + * + * By issuing a log force in thread 3 @ "KABOOM", the thread will block until + * the checkpoint completes, and the busy extent it matched will have been + * removed from the tree when it is woken. Hence it can then continue safely. + * + * However, to ensure this matching process is robust, we need to use the + * transaction ID for identifying transaction, as delayed logging results in + * the busy extent and transaction lifecycles being different. i.e. the busy + * extent is active for a lot longer than the transaction. Hence the + * transaction structure can be freed and reallocated, then mark the same + * extent busy again in the new transaction. In this case the new transaction + * will have a different tid but can have the same address, and hence we need + * to check against the tid. + * + * Future: for delayed logging, we could avoid the log force if the extent was + * first freed in the current checkpoint sequence. This, however, requires the + * ability to pin the current checkpoint in memory until this transaction + * commits to ensure that both the original free and the current one combine + * logically into the one checkpoint. If the checkpoint sequences are + * different, however, we still need to wait on a log force. */ void -xfs_alloc_mark_busy(xfs_trans_t *tp, - xfs_agnumber_t agno, - xfs_agblock_t bno, - xfs_extlen_t len) +xfs_alloc_busy_insert( + struct xfs_trans *tp, + xfs_agnumber_t agno, + xfs_agblock_t bno, + xfs_extlen_t len) { - xfs_perag_busy_t *bsy; + struct xfs_busy_extent *new; + struct xfs_busy_extent *busyp; struct xfs_perag *pag; - int n; + struct rb_node **rbp; + struct rb_node *parent; + int match; - pag = xfs_perag_get(tp->t_mountp, agno); - spin_lock(&pag->pagb_lock); - /* search pagb_list for an open slot */ - for (bsy = pag->pagb_list, n = 0; - n < XFS_PAGB_NUM_SLOTS; - bsy++, n++) { - if (bsy->busy_tp == NULL) { - break; - } + new = kmem_zalloc(sizeof(struct xfs_busy_extent), KM_MAYFAIL); + if (!new) { + /* + * No Memory! Since it is now not possible to track the free + * block, make this a synchronous transaction to insure that + * the block is not reused before this transaction commits. + */ + trace_xfs_alloc_busy(tp, agno, bno, len, 1); + xfs_trans_set_sync(tp); + return; } - trace_xfs_alloc_busy(tp->t_mountp, agno, bno, len, n); + new->agno = agno; + new->bno = bno; + new->length = len; + new->tid = xfs_log_get_trans_ident(tp); - if (n < XFS_PAGB_NUM_SLOTS) { - bsy = &pag->pagb_list[n]; - pag->pagb_count++; - bsy->busy_start = bno; - bsy->busy_length = len; - bsy->busy_tp = tp; - xfs_trans_add_busy(tp, agno, n); - } else { + INIT_LIST_HEAD(&new->list); + + /* trace before insert to be able to see failed inserts */ + trace_xfs_alloc_busy(tp, agno, bno, len, 0); + + pag = xfs_perag_get(tp->t_mountp, new->agno); +restart: + spin_lock(&pag->pagb_lock); + rbp = &pag->pagb_tree.rb_node; + parent = NULL; + busyp = NULL; + match = 0; + while (*rbp && match >= 0) { + parent = *rbp; + busyp = rb_entry(parent, struct xfs_busy_extent, rb_node); + + if (new->bno < busyp->bno) { + /* may overlap, but exact start block is lower */ + rbp = &(*rbp)->rb_left; + if (new->bno + new->length > busyp->bno) + match = busyp->tid == new->tid ? 1 : -1; + } else if (new->bno > busyp->bno) { + /* may overlap, but exact start block is higher */ + rbp = &(*rbp)->rb_right; + if (bno < busyp->bno + busyp->length) + match = busyp->tid == new->tid ? 1 : -1; + } else { + match = busyp->tid == new->tid ? 1 : -1; + break; + } + } + if (match < 0) { + /* overlap marked busy in different transaction */ + spin_unlock(&pag->pagb_lock); + xfs_log_force(tp->t_mountp, XFS_LOG_SYNC); + goto restart; + } + if (match > 0) { /* - * The busy list is full! Since it is now not possible to - * track the free block, make this a synchronous transaction - * to insure that the block is not reused before this - * transaction commits. + * overlap marked busy in same transaction. Update if exact + * start block match, otherwise combine the busy extents into + * a single range. */ - xfs_trans_set_sync(tp); - } + if (busyp->bno == new->bno) { + busyp->length = max(busyp->length, new->length); + spin_unlock(&pag->pagb_lock); + ASSERT(tp->t_flags & XFS_TRANS_SYNC); + xfs_perag_put(pag); + kmem_free(new); + return; + } + rb_erase(&busyp->rb_node, &pag->pagb_tree); + new->length = max(busyp->bno + busyp->length, + new->bno + new->length) - + min(busyp->bno, new->bno); + new->bno = min(busyp->bno, new->bno); + } else + busyp = NULL; + rb_link_node(&new->rb_node, parent, rbp); + rb_insert_color(&new->rb_node, &pag->pagb_tree); + + list_add(&new->list, &tp->t_busy); spin_unlock(&pag->pagb_lock); xfs_perag_put(pag); + kmem_free(busyp); } -void -xfs_alloc_clear_busy(xfs_trans_t *tp, - xfs_agnumber_t agno, - int idx) +/* + * Search for a busy extent within the range of the extent we are about to + * allocate. You need to be holding the busy extent tree lock when calling + * xfs_alloc_busy_search(). This function returns 0 for no overlapping busy + * extent, -1 for an overlapping but not exact busy extent, and 1 for an exact + * match. This is done so that a non-zero return indicates an overlap that + * will require a synchronous transaction, but it can still be + * used to distinguish between a partial or exact match. + */ +static int +xfs_alloc_busy_search( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_agblock_t bno, + xfs_extlen_t len) { struct xfs_perag *pag; - xfs_perag_busy_t *list; + struct rb_node *rbp; + struct xfs_busy_extent *busyp; + int match = 0; - ASSERT(idx < XFS_PAGB_NUM_SLOTS); - pag = xfs_perag_get(tp->t_mountp, agno); + pag = xfs_perag_get(mp, agno); spin_lock(&pag->pagb_lock); - list = pag->pagb_list; - trace_xfs_alloc_unbusy(tp->t_mountp, agno, idx, list[idx].busy_tp == tp); - - if (list[idx].busy_tp == tp) { - list[idx].busy_tp = NULL; - pag->pagb_count--; + rbp = pag->pagb_tree.rb_node; + + /* find closest start bno overlap */ + while (rbp) { + busyp = rb_entry(rbp, struct xfs_busy_extent, rb_node); + if (bno < busyp->bno) { + /* may overlap, but exact start block is lower */ + if (bno + len > busyp->bno) + match = -1; + rbp = rbp->rb_left; + } else if (bno > busyp->bno) { + /* may overlap, but exact start block is higher */ + if (bno < busyp->bno + busyp->length) + match = -1; + rbp = rbp->rb_right; + } else { + /* bno matches busyp, length determines exact match */ + match = (busyp->length == len) ? 1 : -1; + break; + } } - spin_unlock(&pag->pagb_lock); + trace_xfs_alloc_busysearch(mp, agno, bno, len, !!match); xfs_perag_put(pag); + return match; } - -/* - * If we find the extent in the busy list, force the log out to get the - * extent out of the busy list so the caller can use it straight away. - */ -STATIC void -xfs_alloc_search_busy(xfs_trans_t *tp, - xfs_agnumber_t agno, - xfs_agblock_t bno, - xfs_extlen_t len) +void +xfs_alloc_busy_clear( + struct xfs_mount *mp, + struct xfs_busy_extent *busyp) { struct xfs_perag *pag; - xfs_perag_busy_t *bsy; - xfs_agblock_t uend, bend; - xfs_lsn_t lsn = 0; - int cnt; - pag = xfs_perag_get(tp->t_mountp, agno); - spin_lock(&pag->pagb_lock); - cnt = pag->pagb_count; + trace_xfs_alloc_unbusy(mp, busyp->agno, busyp->bno, + busyp->length); - /* - * search pagb_list for this slot, skipping open slots. We have to - * search the entire array as there may be multiple overlaps and - * we have to get the most recent LSN for the log force to push out - * all the transactions that span the range. - */ - uend = bno + len - 1; - for (cnt = 0; cnt < pag->pagb_count; cnt++) { - bsy = &pag->pagb_list[cnt]; - if (!bsy->busy_tp) - continue; + ASSERT(xfs_alloc_busy_search(mp, busyp->agno, busyp->bno, + busyp->length) == 1); - bend = bsy->busy_start + bsy->busy_length - 1; - if (bno > bend || uend < bsy->busy_start) - continue; + list_del_init(&busyp->list); - /* (start1,length1) within (start2, length2) */ - if (XFS_LSN_CMP(bsy->busy_tp->t_commit_lsn, lsn) > 0) - lsn = bsy->busy_tp->t_commit_lsn; - } + pag = xfs_perag_get(mp, busyp->agno); + spin_lock(&pag->pagb_lock); + rb_erase(&busyp->rb_node, &pag->pagb_tree); spin_unlock(&pag->pagb_lock); xfs_perag_put(pag); - trace_xfs_alloc_busysearch(tp->t_mountp, agno, bno, len, lsn); - /* - * If a block was found, force the log through the LSN of the - * transaction that freed the block - */ - if (lsn) - xfs_log_force_lsn(tp->t_mountp, lsn, XFS_LOG_SYNC); + kmem_free(busyp); } diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h index 599bffa3978..6d05199b667 100644 --- a/fs/xfs/xfs_alloc.h +++ b/fs/xfs/xfs_alloc.h @@ -22,6 +22,7 @@ struct xfs_buf; struct xfs_mount; struct xfs_perag; struct xfs_trans; +struct xfs_busy_extent; /* * Freespace allocation types. Argument to xfs_alloc_[v]extent. @@ -119,15 +120,13 @@ xfs_alloc_longest_free_extent(struct xfs_mount *mp, #ifdef __KERNEL__ void -xfs_alloc_mark_busy(xfs_trans_t *tp, +xfs_alloc_busy_insert(xfs_trans_t *tp, xfs_agnumber_t agno, xfs_agblock_t bno, xfs_extlen_t len); void -xfs_alloc_clear_busy(xfs_trans_t *tp, - xfs_agnumber_t ag, - int idx); +xfs_alloc_busy_clear(struct xfs_mount *mp, struct xfs_busy_extent *busyp); #endif /* __KERNEL__ */ diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c index b726e10d2c1..83f49421875 100644 --- a/fs/xfs/xfs_alloc_btree.c +++ b/fs/xfs/xfs_alloc_btree.c @@ -134,7 +134,7 @@ xfs_allocbt_free_block( * disk. If a busy block is allocated, the iclog is pushed up to the * LSN that freed the block. */ - xfs_alloc_mark_busy(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1); + xfs_alloc_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1); xfs_trans_agbtree_delta(cur->bc_tp, -1); return 0; } diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 5c11e4d1701..99587ded043 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -3829,7 +3829,7 @@ xfs_bmap_add_attrfork( } if ((error = xfs_bmap_finish(&tp, &flist, &committed))) goto error2; - error = xfs_trans_commit(tp, XFS_TRANS_PERM_LOG_RES); + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); ASSERT(ip->i_df.if_ext_max == XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t)); return error; diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index f3c49e69eab..02a80984aa0 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -64,7 +64,7 @@ xfs_buf_item_log_debug( nbytes = last - first + 1; bfset(bip->bli_logged, first, nbytes); for (x = 0; x < nbytes; x++) { - chunk_num = byte >> XFS_BLI_SHIFT; + chunk_num = byte >> XFS_BLF_SHIFT; word_num = chunk_num >> BIT_TO_WORD_SHIFT; bit_num = chunk_num & (NBWORD - 1); wordp = &(bip->bli_format.blf_data_map[word_num]); @@ -166,7 +166,7 @@ xfs_buf_item_size( * cancel flag in it. */ trace_xfs_buf_item_size_stale(bip); - ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); + ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); return 1; } @@ -197,9 +197,9 @@ xfs_buf_item_size( } else if (next_bit != last_bit + 1) { last_bit = next_bit; nvecs++; - } else if (xfs_buf_offset(bp, next_bit * XFS_BLI_CHUNK) != - (xfs_buf_offset(bp, last_bit * XFS_BLI_CHUNK) + - XFS_BLI_CHUNK)) { + } else if (xfs_buf_offset(bp, next_bit * XFS_BLF_CHUNK) != + (xfs_buf_offset(bp, last_bit * XFS_BLF_CHUNK) + + XFS_BLF_CHUNK)) { last_bit = next_bit; nvecs++; } else { @@ -254,6 +254,20 @@ xfs_buf_item_format( vecp++; nvecs = 1; + /* + * If it is an inode buffer, transfer the in-memory state to the + * format flags and clear the in-memory state. We do not transfer + * this state if the inode buffer allocation has not yet been committed + * to the log as setting the XFS_BLI_INODE_BUF flag will prevent + * correct replay of the inode allocation. + */ + if (bip->bli_flags & XFS_BLI_INODE_BUF) { + if (!((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && + xfs_log_item_in_current_chkpt(&bip->bli_item))) + bip->bli_format.blf_flags |= XFS_BLF_INODE_BUF; + bip->bli_flags &= ~XFS_BLI_INODE_BUF; + } + if (bip->bli_flags & XFS_BLI_STALE) { /* * The buffer is stale, so all we need to log @@ -261,7 +275,7 @@ xfs_buf_item_format( * cancel flag in it. */ trace_xfs_buf_item_format_stale(bip); - ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); + ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); bip->bli_format.blf_size = nvecs; return; } @@ -294,28 +308,28 @@ xfs_buf_item_format( * keep counting and scanning. */ if (next_bit == -1) { - buffer_offset = first_bit * XFS_BLI_CHUNK; + buffer_offset = first_bit * XFS_BLF_CHUNK; vecp->i_addr = xfs_buf_offset(bp, buffer_offset); - vecp->i_len = nbits * XFS_BLI_CHUNK; + vecp->i_len = nbits * XFS_BLF_CHUNK; vecp->i_type = XLOG_REG_TYPE_BCHUNK; nvecs++; break; } else if (next_bit != last_bit + 1) { - buffer_offset = first_bit * XFS_BLI_CHUNK; + buffer_offset = first_bit * XFS_BLF_CHUNK; vecp->i_addr = xfs_buf_offset(bp, buffer_offset); - vecp->i_len = nbits * XFS_BLI_CHUNK; + vecp->i_len = nbits * XFS_BLF_CHUNK; vecp->i_type = XLOG_REG_TYPE_BCHUNK; nvecs++; vecp++; first_bit = next_bit; last_bit = next_bit; nbits = 1; - } else if (xfs_buf_offset(bp, next_bit << XFS_BLI_SHIFT) != - (xfs_buf_offset(bp, last_bit << XFS_BLI_SHIFT) + - XFS_BLI_CHUNK)) { - buffer_offset = first_bit * XFS_BLI_CHUNK; + } else if (xfs_buf_offset(bp, next_bit << XFS_BLF_SHIFT) != + (xfs_buf_offset(bp, last_bit << XFS_BLF_SHIFT) + + XFS_BLF_CHUNK)) { + buffer_offset = first_bit * XFS_BLF_CHUNK; vecp->i_addr = xfs_buf_offset(bp, buffer_offset); - vecp->i_len = nbits * XFS_BLI_CHUNK; + vecp->i_len = nbits * XFS_BLF_CHUNK; vecp->i_type = XLOG_REG_TYPE_BCHUNK; /* You would think we need to bump the nvecs here too, but we do not * this number is used by recovery, and it gets confused by the boundary @@ -341,10 +355,15 @@ xfs_buf_item_format( } /* - * This is called to pin the buffer associated with the buf log - * item in memory so it cannot be written out. Simply call bpin() - * on the buffer to do this. + * This is called to pin the buffer associated with the buf log item in memory + * so it cannot be written out. Simply call bpin() on the buffer to do this. + * + * We also always take a reference to the buffer log item here so that the bli + * is held while the item is pinned in memory. This means that we can + * unconditionally drop the reference count a transaction holds when the + * transaction is completed. */ + STATIC void xfs_buf_item_pin( xfs_buf_log_item_t *bip) @@ -356,6 +375,7 @@ xfs_buf_item_pin( ASSERT(atomic_read(&bip->bli_refcount) > 0); ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || (bip->bli_flags & XFS_BLI_STALE)); + atomic_inc(&bip->bli_refcount); trace_xfs_buf_item_pin(bip); xfs_bpin(bp); } @@ -372,12 +392,12 @@ xfs_buf_item_pin( */ STATIC void xfs_buf_item_unpin( - xfs_buf_log_item_t *bip, - int stale) + xfs_buf_log_item_t *bip) { struct xfs_ail *ailp; xfs_buf_t *bp; int freed; + int stale = bip->bli_flags & XFS_BLI_STALE; bp = bip->bli_buf; ASSERT(bp != NULL); @@ -393,7 +413,7 @@ xfs_buf_item_unpin( ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); ASSERT(!(XFS_BUF_ISDELAYWRITE(bp))); ASSERT(XFS_BUF_ISSTALE(bp)); - ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); + ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); trace_xfs_buf_item_unpin_stale(bip); /* @@ -428,40 +448,34 @@ xfs_buf_item_unpin_remove( xfs_buf_log_item_t *bip, xfs_trans_t *tp) { - xfs_buf_t *bp; - xfs_log_item_desc_t *lidp; - int stale = 0; - - bp = bip->bli_buf; - /* - * will xfs_buf_item_unpin() call xfs_buf_item_relse()? - */ + /* will xfs_buf_item_unpin() call xfs_buf_item_relse()? */ if ((atomic_read(&bip->bli_refcount) == 1) && (bip->bli_flags & XFS_BLI_STALE)) { + /* + * yes -- We can safely do some work here and then call + * buf_item_unpin to do the rest because we are + * are holding the buffer locked so no one else will be + * able to bump up the refcount. We have to remove the + * log item from the transaction as we are about to release + * our reference to the buffer. If we don't, the unlock that + * occurs later in the xfs_trans_uncommit() will try to + * reference the buffer which we no longer have a hold on. + */ + struct xfs_log_item_desc *lidp; + ASSERT(XFS_BUF_VALUSEMA(bip->bli_buf) <= 0); trace_xfs_buf_item_unpin_stale(bip); - /* - * yes -- clear the xaction descriptor in-use flag - * and free the chunk if required. We can safely - * do some work here and then call buf_item_unpin - * to do the rest because if the if is true, then - * we are holding the buffer locked so no one else - * will be able to bump up the refcount. - */ - lidp = xfs_trans_find_item(tp, (xfs_log_item_t *) bip); - stale = lidp->lid_flags & XFS_LID_BUF_STALE; + lidp = xfs_trans_find_item(tp, (xfs_log_item_t *)bip); xfs_trans_free_item(tp, lidp); + /* - * Since the transaction no longer refers to the buffer, - * the buffer should no longer refer to the transaction. + * Since the transaction no longer refers to the buffer, the + * buffer should no longer refer to the transaction. */ - XFS_BUF_SET_FSPRIVATE2(bp, NULL); + XFS_BUF_SET_FSPRIVATE2(bip->bli_buf, NULL); } - - xfs_buf_item_unpin(bip, stale); - - return; + xfs_buf_item_unpin(bip); } /* @@ -495,20 +509,23 @@ xfs_buf_item_trylock( } /* - * Release the buffer associated with the buf log item. - * If there is no dirty logged data associated with the - * buffer recorded in the buf log item, then free the - * buf log item and remove the reference to it in the - * buffer. + * Release the buffer associated with the buf log item. If there is no dirty + * logged data associated with the buffer recorded in the buf log item, then + * free the buf log item and remove the reference to it in the buffer. + * + * This call ignores the recursion count. It is only called when the buffer + * should REALLY be unlocked, regardless of the recursion count. * - * This call ignores the recursion count. It is only called - * when the buffer should REALLY be unlocked, regardless - * of the recursion count. + * We unconditionally drop the transaction's reference to the log item. If the + * item was logged, then another reference was taken when it was pinned, so we + * can safely drop the transaction reference now. This also allows us to avoid + * potential races with the unpin code freeing the bli by not referencing the + * bli after we've dropped the reference count. * - * If the XFS_BLI_HOLD flag is set in the buf log item, then - * free the log item if necessary but do not unlock the buffer. - * This is for support of xfs_trans_bhold(). Make sure the - * XFS_BLI_HOLD field is cleared if we don't free the item. + * If the XFS_BLI_HOLD flag is set in the buf log item, then free the log item + * if necessary but do not unlock the buffer. This is for support of + * xfs_trans_bhold(). Make sure the XFS_BLI_HOLD field is cleared if we don't + * free the item. */ STATIC void xfs_buf_item_unlock( @@ -520,73 +537,54 @@ xfs_buf_item_unlock( bp = bip->bli_buf; - /* - * Clear the buffer's association with this transaction. - */ + /* Clear the buffer's association with this transaction. */ XFS_BUF_SET_FSPRIVATE2(bp, NULL); /* - * If this is a transaction abort, don't return early. - * Instead, allow the brelse to happen. - * Normally it would be done for stale (cancelled) buffers - * at unpin time, but we'll never go through the pin/unpin - * cycle if we abort inside commit. + * If this is a transaction abort, don't return early. Instead, allow + * the brelse to happen. Normally it would be done for stale + * (cancelled) buffers at unpin time, but we'll never go through the + * pin/unpin cycle if we abort inside commit. */ aborted = (bip->bli_item.li_flags & XFS_LI_ABORTED) != 0; /* - * If the buf item is marked stale, then don't do anything. - * We'll unlock the buffer and free the buf item when the - * buffer is unpinned for the last time. + * Before possibly freeing the buf item, determine if we should + * release the buffer at the end of this routine. */ - if (bip->bli_flags & XFS_BLI_STALE) { - bip->bli_flags &= ~XFS_BLI_LOGGED; - trace_xfs_buf_item_unlock_stale(bip); - ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); - if (!aborted) - return; - } + hold = bip->bli_flags & XFS_BLI_HOLD; + + /* Clear the per transaction state. */ + bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD); /* - * Drop the transaction's reference to the log item if - * it was not logged as part of the transaction. Otherwise - * we'll drop the reference in xfs_buf_item_unpin() when - * the transaction is really through with the buffer. + * If the buf item is marked stale, then don't do anything. We'll + * unlock the buffer and free the buf item when the buffer is unpinned + * for the last time. */ - if (!(bip->bli_flags & XFS_BLI_LOGGED)) { - atomic_dec(&bip->bli_refcount); - } else { - /* - * Clear the logged flag since this is per - * transaction state. - */ - bip->bli_flags &= ~XFS_BLI_LOGGED; + if (bip->bli_flags & XFS_BLI_STALE) { + trace_xfs_buf_item_unlock_stale(bip); + ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); + if (!aborted) { + atomic_dec(&bip->bli_refcount); + return; + } } - /* - * Before possibly freeing the buf item, determine if we should - * release the buffer at the end of this routine. - */ - hold = bip->bli_flags & XFS_BLI_HOLD; trace_xfs_buf_item_unlock(bip); /* - * If the buf item isn't tracking any data, free it. - * Otherwise, if XFS_BLI_HOLD is set clear it. + * If the buf item isn't tracking any data, free it, otherwise drop the + * reference we hold to it. */ if (xfs_bitmap_empty(bip->bli_format.blf_data_map, - bip->bli_format.blf_map_size)) { + bip->bli_format.blf_map_size)) xfs_buf_item_relse(bp); - } else if (hold) { - bip->bli_flags &= ~XFS_BLI_HOLD; - } + else + atomic_dec(&bip->bli_refcount); - /* - * Release the buffer if XFS_BLI_HOLD was not set. - */ - if (!hold) { + if (!hold) xfs_buf_relse(bp); - } } /* @@ -675,7 +673,7 @@ static struct xfs_item_ops xfs_buf_item_ops = { .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) xfs_buf_item_format, .iop_pin = (void(*)(xfs_log_item_t*))xfs_buf_item_pin, - .iop_unpin = (void(*)(xfs_log_item_t*, int))xfs_buf_item_unpin, + .iop_unpin = (void(*)(xfs_log_item_t*))xfs_buf_item_unpin, .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t *)) xfs_buf_item_unpin_remove, .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_buf_item_trylock, @@ -723,20 +721,17 @@ xfs_buf_item_init( } /* - * chunks is the number of XFS_BLI_CHUNK size pieces + * chunks is the number of XFS_BLF_CHUNK size pieces * the buffer can be divided into. Make sure not to * truncate any pieces. map_size is the size of the * bitmap needed to describe the chunks of the buffer. */ - chunks = (int)((XFS_BUF_COUNT(bp) + (XFS_BLI_CHUNK - 1)) >> XFS_BLI_SHIFT); + chunks = (int)((XFS_BUF_COUNT(bp) + (XFS_BLF_CHUNK - 1)) >> XFS_BLF_SHIFT); map_size = (int)((chunks + NBWORD) >> BIT_TO_WORD_SHIFT); bip = (xfs_buf_log_item_t*)kmem_zone_zalloc(xfs_buf_item_zone, KM_SLEEP); - bip->bli_item.li_type = XFS_LI_BUF; - bip->bli_item.li_ops = &xfs_buf_item_ops; - bip->bli_item.li_mountp = mp; - bip->bli_item.li_ailp = mp->m_ail; + xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops); bip->bli_buf = bp; xfs_buf_hold(bp); bip->bli_format.blf_type = XFS_LI_BUF; @@ -799,8 +794,8 @@ xfs_buf_item_log( /* * Convert byte offsets to bit numbers. */ - first_bit = first >> XFS_BLI_SHIFT; - last_bit = last >> XFS_BLI_SHIFT; + first_bit = first >> XFS_BLF_SHIFT; + last_bit = last >> XFS_BLF_SHIFT; /* * Calculate the total number of bits to be set. diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index 217f34af00c..f20bb472d58 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -26,7 +26,7 @@ extern kmem_zone_t *xfs_buf_item_zone; * have been logged. * For 6.2 and beyond, this is XFS_LI_BUF. We use this to log everything. */ -typedef struct xfs_buf_log_format_t { +typedef struct xfs_buf_log_format { unsigned short blf_type; /* buf log item type indicator */ unsigned short blf_size; /* size of this item */ ushort blf_flags; /* misc state */ @@ -41,22 +41,22 @@ typedef struct xfs_buf_log_format_t { * This flag indicates that the buffer contains on disk inodes * and requires special recovery handling. */ -#define XFS_BLI_INODE_BUF 0x1 +#define XFS_BLF_INODE_BUF 0x1 /* * This flag indicates that the buffer should not be replayed * during recovery because its blocks are being freed. */ -#define XFS_BLI_CANCEL 0x2 +#define XFS_BLF_CANCEL 0x2 /* * This flag indicates that the buffer contains on disk * user or group dquots and may require special recovery handling. */ -#define XFS_BLI_UDQUOT_BUF 0x4 -#define XFS_BLI_PDQUOT_BUF 0x8 -#define XFS_BLI_GDQUOT_BUF 0x10 +#define XFS_BLF_UDQUOT_BUF 0x4 +#define XFS_BLF_PDQUOT_BUF 0x8 +#define XFS_BLF_GDQUOT_BUF 0x10 -#define XFS_BLI_CHUNK 128 -#define XFS_BLI_SHIFT 7 +#define XFS_BLF_CHUNK 128 +#define XFS_BLF_SHIFT 7 #define BIT_TO_WORD_SHIFT 5 #define NBWORD (NBBY * sizeof(unsigned int)) @@ -69,6 +69,7 @@ typedef struct xfs_buf_log_format_t { #define XFS_BLI_LOGGED 0x08 #define XFS_BLI_INODE_ALLOC_BUF 0x10 #define XFS_BLI_STALE_INODE 0x20 +#define XFS_BLI_INODE_BUF 0x40 #define XFS_BLI_FLAGS \ { XFS_BLI_HOLD, "HOLD" }, \ @@ -76,7 +77,8 @@ typedef struct xfs_buf_log_format_t { { XFS_BLI_STALE, "STALE" }, \ { XFS_BLI_LOGGED, "LOGGED" }, \ { XFS_BLI_INODE_ALLOC_BUF, "INODE_ALLOC" }, \ - { XFS_BLI_STALE_INODE, "STALE_INODE" } + { XFS_BLI_STALE_INODE, "STALE_INODE" }, \ + { XFS_BLI_INODE_BUF, "INODE_BUF" } #ifdef __KERNEL__ diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 92d5cd5bf4f..047b8a8e5c2 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -170,7 +170,7 @@ xfs_cmn_err(int panic_tag, int level, xfs_mount_t *mp, char *fmt, ...) va_list ap; #ifdef DEBUG - xfs_panic_mask |= XFS_PTAG_SHUTDOWN_CORRUPT; + xfs_panic_mask |= (XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_LOGRES); #endif if (xfs_panic_mask && (xfs_panic_mask & panic_tag) @@ -186,18 +186,18 @@ xfs_cmn_err(int panic_tag, int level, xfs_mount_t *mp, char *fmt, ...) void xfs_error_report( - char *tag, - int level, - xfs_mount_t *mp, - char *fname, - int linenum, - inst_t *ra) + const char *tag, + int level, + struct xfs_mount *mp, + const char *filename, + int linenum, + inst_t *ra) { if (level <= xfs_error_level) { xfs_cmn_err(XFS_PTAG_ERROR_REPORT, CE_ALERT, mp, "XFS internal error %s at line %d of file %s. Caller 0x%p\n", - tag, linenum, fname, ra); + tag, linenum, filename, ra); xfs_stack_trace(); } @@ -205,15 +205,15 @@ xfs_error_report( void xfs_corruption_error( - char *tag, - int level, - xfs_mount_t *mp, - void *p, - char *fname, - int linenum, - inst_t *ra) + const char *tag, + int level, + struct xfs_mount *mp, + void *p, + const char *filename, + int linenum, + inst_t *ra) { if (level <= xfs_error_level) xfs_hex_dump(p, 16); - xfs_error_report(tag, level, mp, fname, linenum, ra); + xfs_error_report(tag, level, mp, filename, linenum, ra); } diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 0c93051c465..c2c1a072bb8 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -29,10 +29,11 @@ extern int xfs_error_trap(int); struct xfs_mount; -extern void xfs_error_report(char *tag, int level, struct xfs_mount *mp, - char *fname, int linenum, inst_t *ra); -extern void xfs_corruption_error(char *tag, int level, struct xfs_mount *mp, - void *p, char *fname, int linenum, inst_t *ra); +extern void xfs_error_report(const char *tag, int level, struct xfs_mount *mp, + const char *filename, int linenum, inst_t *ra); +extern void xfs_corruption_error(const char *tag, int level, + struct xfs_mount *mp, void *p, const char *filename, + int linenum, inst_t *ra); #define XFS_ERROR_REPORT(e, lvl, mp) \ xfs_error_report(e, lvl, mp, __FILE__, __LINE__, __return_address) diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 6f35ed1b39b..409fe81585f 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -106,7 +106,7 @@ xfs_efi_item_pin(xfs_efi_log_item_t *efip) */ /*ARGSUSED*/ STATIC void -xfs_efi_item_unpin(xfs_efi_log_item_t *efip, int stale) +xfs_efi_item_unpin(xfs_efi_log_item_t *efip) { struct xfs_ail *ailp = efip->efi_item.li_ailp; @@ -224,7 +224,7 @@ static struct xfs_item_ops xfs_efi_item_ops = { .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) xfs_efi_item_format, .iop_pin = (void(*)(xfs_log_item_t*))xfs_efi_item_pin, - .iop_unpin = (void(*)(xfs_log_item_t*, int))xfs_efi_item_unpin, + .iop_unpin = (void(*)(xfs_log_item_t*))xfs_efi_item_unpin, .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t *)) xfs_efi_item_unpin_remove, .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_efi_item_trylock, @@ -259,10 +259,7 @@ xfs_efi_init(xfs_mount_t *mp, KM_SLEEP); } - efip->efi_item.li_type = XFS_LI_EFI; - efip->efi_item.li_ops = &xfs_efi_item_ops; - efip->efi_item.li_mountp = mp; - efip->efi_item.li_ailp = mp->m_ail; + xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops); efip->efi_format.efi_nextents = nextents; efip->efi_format.efi_id = (__psint_t)(void*)efip; @@ -428,7 +425,7 @@ xfs_efd_item_pin(xfs_efd_log_item_t *efdp) */ /*ARGSUSED*/ STATIC void -xfs_efd_item_unpin(xfs_efd_log_item_t *efdp, int stale) +xfs_efd_item_unpin(xfs_efd_log_item_t *efdp) { return; } @@ -518,7 +515,7 @@ static struct xfs_item_ops xfs_efd_item_ops = { .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) xfs_efd_item_format, .iop_pin = (void(*)(xfs_log_item_t*))xfs_efd_item_pin, - .iop_unpin = (void(*)(xfs_log_item_t*, int))xfs_efd_item_unpin, + .iop_unpin = (void(*)(xfs_log_item_t*))xfs_efd_item_unpin, .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*)) xfs_efd_item_unpin_remove, .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_efd_item_trylock, @@ -554,10 +551,7 @@ xfs_efd_init(xfs_mount_t *mp, KM_SLEEP); } - efdp->efd_item.li_type = XFS_LI_EFD; - efdp->efd_item.li_ops = &xfs_efd_item_ops; - efdp->efd_item.li_mountp = mp; - efdp->efd_item.li_ailp = mp->m_ail; + xfs_log_item_init(mp, &efdp->efd_item, XFS_LI_EFD, &xfs_efd_item_ops); efdp->efd_efip = efip; efdp->efd_format.efd_nextents = nextents; efdp->efd_format.efd_efi_id = efip->efi_format.efi_id; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 0ffd5644704..8cd6e8d8fe9 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2449,6 +2449,8 @@ xfs_iunpin_nowait( { ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); + trace_xfs_inode_unpin_nowait(ip, _RET_IP_); + /* Give the log a push to start the unpinning I/O */ xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, 0); diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 7bfea854015..cf8249a6000 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -543,6 +543,7 @@ xfs_inode_item_pin( { ASSERT(xfs_isilocked(iip->ili_inode, XFS_ILOCK_EXCL)); + trace_xfs_inode_pin(iip->ili_inode, _RET_IP_); atomic_inc(&iip->ili_inode->i_pincount); } @@ -556,11 +557,11 @@ xfs_inode_item_pin( /* ARGSUSED */ STATIC void xfs_inode_item_unpin( - xfs_inode_log_item_t *iip, - int stale) + xfs_inode_log_item_t *iip) { struct xfs_inode *ip = iip->ili_inode; + trace_xfs_inode_unpin(ip, _RET_IP_); ASSERT(atomic_read(&ip->i_pincount) > 0); if (atomic_dec_and_test(&ip->i_pincount)) wake_up(&ip->i_ipin_wait); @@ -572,7 +573,7 @@ xfs_inode_item_unpin_remove( xfs_inode_log_item_t *iip, xfs_trans_t *tp) { - xfs_inode_item_unpin(iip, 0); + xfs_inode_item_unpin(iip); } /* @@ -838,7 +839,7 @@ static struct xfs_item_ops xfs_inode_item_ops = { .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) xfs_inode_item_format, .iop_pin = (void(*)(xfs_log_item_t*))xfs_inode_item_pin, - .iop_unpin = (void(*)(xfs_log_item_t*, int))xfs_inode_item_unpin, + .iop_unpin = (void(*)(xfs_log_item_t*))xfs_inode_item_unpin, .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*)) xfs_inode_item_unpin_remove, .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_inode_item_trylock, @@ -865,17 +866,9 @@ xfs_inode_item_init( ASSERT(ip->i_itemp == NULL); iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, KM_SLEEP); - iip->ili_item.li_type = XFS_LI_INODE; - iip->ili_item.li_ops = &xfs_inode_item_ops; - iip->ili_item.li_mountp = mp; - iip->ili_item.li_ailp = mp->m_ail; iip->ili_inode = ip; - - /* - We have zeroed memory. No need ... - iip->ili_extents_buf = NULL; - */ - + xfs_log_item_init(mp, &iip->ili_item, XFS_LI_INODE, + &xfs_inode_item_ops); iip->ili_format.ilf_type = XFS_LI_INODE; iip->ili_format.ilf_ino = ip->i_ino; iip->ili_format.ilf_blkno = ip->i_imap.im_blkno; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 0b65039951a..ef14943829d 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -55,71 +55,33 @@ #define XFS_STRAT_WRITE_IMAPS 2 #define XFS_WRITE_IMAPS XFS_BMAP_MAX_NMAP -STATIC int -xfs_imap_to_bmap( - xfs_inode_t *ip, - xfs_off_t offset, - xfs_bmbt_irec_t *imap, - xfs_iomap_t *iomapp, - int imaps, /* Number of imap entries */ - int iomaps, /* Number of iomap entries */ - int flags) -{ - xfs_mount_t *mp = ip->i_mount; - int pbm; - xfs_fsblock_t start_block; - - - for (pbm = 0; imaps && pbm < iomaps; imaps--, iomapp++, imap++, pbm++) { - iomapp->iomap_offset = XFS_FSB_TO_B(mp, imap->br_startoff); - iomapp->iomap_delta = offset - iomapp->iomap_offset; - iomapp->iomap_bsize = XFS_FSB_TO_B(mp, imap->br_blockcount); - iomapp->iomap_flags = flags; - - if (XFS_IS_REALTIME_INODE(ip)) { - iomapp->iomap_flags |= IOMAP_REALTIME; - iomapp->iomap_target = mp->m_rtdev_targp; - } else { - iomapp->iomap_target = mp->m_ddev_targp; - } - start_block = imap->br_startblock; - if (start_block == HOLESTARTBLOCK) { - iomapp->iomap_bn = IOMAP_DADDR_NULL; - iomapp->iomap_flags |= IOMAP_HOLE; - } else if (start_block == DELAYSTARTBLOCK) { - iomapp->iomap_bn = IOMAP_DADDR_NULL; - iomapp->iomap_flags |= IOMAP_DELAY; - } else { - iomapp->iomap_bn = xfs_fsb_to_db(ip, start_block); - if (ISUNWRITTEN(imap)) - iomapp->iomap_flags |= IOMAP_UNWRITTEN; - } - - offset += iomapp->iomap_bsize - iomapp->iomap_delta; - } - return pbm; /* Return the number filled */ -} +STATIC int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t, + int, struct xfs_bmbt_irec *, int *); +STATIC int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t, int, + struct xfs_bmbt_irec *, int *); +STATIC int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t, + struct xfs_bmbt_irec *, int *); int xfs_iomap( - xfs_inode_t *ip, - xfs_off_t offset, - ssize_t count, - int flags, - xfs_iomap_t *iomapp, - int *niomaps) + struct xfs_inode *ip, + xfs_off_t offset, + ssize_t count, + int flags, + struct xfs_bmbt_irec *imap, + int *nimaps, + int *new) { - xfs_mount_t *mp = ip->i_mount; - xfs_fileoff_t offset_fsb, end_fsb; - int error = 0; - int lockmode = 0; - xfs_bmbt_irec_t imap; - int nimaps = 1; - int bmapi_flags = 0; - int iomap_flags = 0; + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t offset_fsb, end_fsb; + int error = 0; + int lockmode = 0; + int bmapi_flags = 0; ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG); + *new = 0; + if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); @@ -160,8 +122,8 @@ xfs_iomap( error = xfs_bmapi(NULL, ip, offset_fsb, (xfs_filblks_t)(end_fsb - offset_fsb), - bmapi_flags, NULL, 0, &imap, - &nimaps, NULL, NULL); + bmapi_flags, NULL, 0, imap, + nimaps, NULL, NULL); if (error) goto out; @@ -169,46 +131,41 @@ xfs_iomap( switch (flags & (BMAPI_WRITE|BMAPI_ALLOCATE)) { case BMAPI_WRITE: /* If we found an extent, return it */ - if (nimaps && - (imap.br_startblock != HOLESTARTBLOCK) && - (imap.br_startblock != DELAYSTARTBLOCK)) { - trace_xfs_iomap_found(ip, offset, count, flags, &imap); + if (*nimaps && + (imap->br_startblock != HOLESTARTBLOCK) && + (imap->br_startblock != DELAYSTARTBLOCK)) { + trace_xfs_iomap_found(ip, offset, count, flags, imap); break; } if (flags & (BMAPI_DIRECT|BMAPI_MMAP)) { error = xfs_iomap_write_direct(ip, offset, count, flags, - &imap, &nimaps, nimaps); + imap, nimaps); } else { error = xfs_iomap_write_delay(ip, offset, count, flags, - &imap, &nimaps); + imap, nimaps); } if (!error) { - trace_xfs_iomap_alloc(ip, offset, count, flags, &imap); + trace_xfs_iomap_alloc(ip, offset, count, flags, imap); } - iomap_flags = IOMAP_NEW; + *new = 1; break; case BMAPI_ALLOCATE: /* If we found an extent, return it */ xfs_iunlock(ip, lockmode); lockmode = 0; - if (nimaps && !isnullstartblock(imap.br_startblock)) { - trace_xfs_iomap_found(ip, offset, count, flags, &imap); + if (*nimaps && !isnullstartblock(imap->br_startblock)) { + trace_xfs_iomap_found(ip, offset, count, flags, imap); break; } error = xfs_iomap_write_allocate(ip, offset, count, - &imap, &nimaps); + imap, nimaps); break; } - if (nimaps) { - *niomaps = xfs_imap_to_bmap(ip, offset, &imap, - iomapp, nimaps, *niomaps, iomap_flags); - } else if (niomaps) { - *niomaps = 0; - } + ASSERT(*nimaps <= 1); out: if (lockmode) @@ -216,7 +173,6 @@ out: return XFS_ERROR(error); } - STATIC int xfs_iomap_eof_align_last_fsb( xfs_mount_t *mp, @@ -285,15 +241,14 @@ xfs_cmn_err_fsblock_zero( return EFSCORRUPTED; } -int +STATIC int xfs_iomap_write_direct( xfs_inode_t *ip, xfs_off_t offset, size_t count, int flags, xfs_bmbt_irec_t *ret_imap, - int *nmaps, - int found) + int *nmaps) { xfs_mount_t *mp = ip->i_mount; xfs_fileoff_t offset_fsb; @@ -330,7 +285,7 @@ xfs_iomap_write_direct( if (error) goto error_out; } else { - if (found && (ret_imap->br_startblock == HOLESTARTBLOCK)) + if (*nmaps && (ret_imap->br_startblock == HOLESTARTBLOCK)) last_fsb = MIN(last_fsb, (xfs_fileoff_t) ret_imap->br_blockcount + ret_imap->br_startoff); @@ -485,7 +440,7 @@ xfs_iomap_eof_want_preallocate( return 0; } -int +STATIC int xfs_iomap_write_delay( xfs_inode_t *ip, xfs_off_t offset, @@ -588,7 +543,7 @@ retry: * We no longer bother to look at the incoming map - all we have to * guarantee is that whatever we allocate fills the required range. */ -int +STATIC int xfs_iomap_write_allocate( xfs_inode_t *ip, xfs_off_t offset, diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index 174f2999099..81ac4afd45b 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -18,19 +18,6 @@ #ifndef __XFS_IOMAP_H__ #define __XFS_IOMAP_H__ -#define IOMAP_DADDR_NULL ((xfs_daddr_t) (-1LL)) - - -typedef enum { /* iomap_flags values */ - IOMAP_READ = 0, /* mapping for a read */ - IOMAP_HOLE = 0x02, /* mapping covers a hole */ - IOMAP_DELAY = 0x04, /* mapping covers delalloc region */ - IOMAP_REALTIME = 0x10, /* mapping on the realtime device */ - IOMAP_UNWRITTEN = 0x20, /* mapping covers allocated */ - /* but uninitialized file data */ - IOMAP_NEW = 0x40 /* just allocate */ -} iomap_flags_t; - typedef enum { /* base extent manipulation calls */ BMAPI_READ = (1 << 0), /* read extents */ @@ -52,43 +39,11 @@ typedef enum { { BMAPI_MMAP, "MMAP" }, \ { BMAPI_TRYLOCK, "TRYLOCK" } -/* - * xfs_iomap_t: File system I/O map - * - * The iomap_bn field is expressed in 512-byte blocks, and is where the - * mapping starts on disk. - * - * The iomap_offset, iomap_bsize and iomap_delta fields are in bytes. - * iomap_offset is the offset of the mapping in the file itself. - * iomap_bsize is the size of the mapping, iomap_delta is the - * desired data's offset into the mapping, given the offset supplied - * to the file I/O map routine. - * - * When a request is made to read beyond the logical end of the object, - * iomap_size may be set to 0, but iomap_offset and iomap_length should be set - * to the actual amount of underlying storage that has been allocated, if any. - */ - -typedef struct xfs_iomap { - xfs_daddr_t iomap_bn; /* first 512B blk of mapping */ - xfs_buftarg_t *iomap_target; - xfs_off_t iomap_offset; /* offset of mapping, bytes */ - xfs_off_t iomap_bsize; /* size of mapping, bytes */ - xfs_off_t iomap_delta; /* offset into mapping, bytes */ - iomap_flags_t iomap_flags; -} xfs_iomap_t; - struct xfs_inode; struct xfs_bmbt_irec; extern int xfs_iomap(struct xfs_inode *, xfs_off_t, ssize_t, int, - struct xfs_iomap *, int *); -extern int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t, - int, struct xfs_bmbt_irec *, int *, int); -extern int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t, int, - struct xfs_bmbt_irec *, int *); -extern int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t, - struct xfs_bmbt_irec *, int *); + struct xfs_bmbt_irec *, int *, int *); extern int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, size_t); #endif /* __XFS_IOMAP_H__*/ diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 2be01913628..5215abc8023 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -44,13 +44,8 @@ kmem_zone_t *xfs_log_ticket_zone; -#define xlog_write_adv_cnt(ptr, len, off, bytes) \ - { (ptr) += (bytes); \ - (len) -= (bytes); \ - (off) += (bytes);} - /* Local miscellaneous function prototypes */ -STATIC int xlog_commit_record(xfs_mount_t *mp, xlog_ticket_t *ticket, +STATIC int xlog_commit_record(struct log *log, struct xlog_ticket *ticket, xlog_in_core_t **, xfs_lsn_t *); STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp, xfs_buftarg_t *log_target, @@ -59,11 +54,6 @@ STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp, STATIC int xlog_space_left(xlog_t *log, int cycle, int bytes); STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog); STATIC void xlog_dealloc_log(xlog_t *log); -STATIC int xlog_write(xfs_mount_t *mp, xfs_log_iovec_t region[], - int nentries, struct xlog_ticket *tic, - xfs_lsn_t *start_lsn, - xlog_in_core_t **commit_iclog, - uint flags); /* local state machine functions */ STATIC void xlog_state_done_syncing(xlog_in_core_t *iclog, int); @@ -93,16 +83,8 @@ STATIC int xlog_regrant_write_log_space(xlog_t *log, STATIC void xlog_ungrant_log_space(xlog_t *log, xlog_ticket_t *ticket); - -/* local ticket functions */ -STATIC xlog_ticket_t *xlog_ticket_alloc(xlog_t *log, - int unit_bytes, - int count, - char clientid, - uint flags); - #if defined(DEBUG) -STATIC void xlog_verify_dest_ptr(xlog_t *log, __psint_t ptr); +STATIC void xlog_verify_dest_ptr(xlog_t *log, char *ptr); STATIC void xlog_verify_grant_head(xlog_t *log, int equals); STATIC void xlog_verify_iclog(xlog_t *log, xlog_in_core_t *iclog, int count, boolean_t syncing); @@ -258,7 +240,7 @@ xfs_log_done( * If we get an error, just continue and give back the log ticket. */ (((ticket->t_flags & XLOG_TIC_INITED) == 0) && - (xlog_commit_record(mp, ticket, iclog, &lsn)))) { + (xlog_commit_record(log, ticket, iclog, &lsn)))) { lsn = (xfs_lsn_t) -1; if (ticket->t_flags & XLOG_TIC_PERM_RESERV) { flags |= XFS_LOG_REL_PERM_RESERV; @@ -367,6 +349,15 @@ xfs_log_reserve( ASSERT(flags & XFS_LOG_PERM_RESERV); internal_ticket = *ticket; + /* + * this is a new transaction on the ticket, so we need to + * change the transaction ID so that the next transaction has a + * different TID in the log. Just add one to the existing tid + * so that we can see chains of rolling transactions in the log + * easily. + */ + internal_ticket->t_tid++; + trace_xfs_log_reserve(log, internal_ticket); xlog_grant_push_ail(mp, internal_ticket->t_unit_res); @@ -374,7 +365,8 @@ xfs_log_reserve( } else { /* may sleep if need to allocate more tickets */ internal_ticket = xlog_ticket_alloc(log, unit_bytes, cnt, - client, flags); + client, flags, + KM_SLEEP|KM_MAYFAIL); if (!internal_ticket) return XFS_ERROR(ENOMEM); internal_ticket->t_trans_type = t_type; @@ -459,6 +451,13 @@ xfs_log_mount( /* Normal transactions can now occur */ mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY; + /* + * Now the log has been fully initialised and we know were our + * space grant counters are, we can initialise the permanent ticket + * needed for delayed logging to work. + */ + xlog_cil_init_post_recovery(mp->m_log); + return 0; out_destroy_ail: @@ -516,18 +515,10 @@ xfs_log_unmount_write(xfs_mount_t *mp) #ifdef DEBUG xlog_in_core_t *first_iclog; #endif - xfs_log_iovec_t reg[1]; xlog_ticket_t *tic = NULL; xfs_lsn_t lsn; int error; - /* the data section must be 32 bit size aligned */ - struct { - __uint16_t magic; - __uint16_t pad1; - __uint32_t pad2; /* may as well make it 64 bits */ - } magic = { XLOG_UNMOUNT_TYPE, 0, 0 }; - /* * Don't write out unmount record on read-only mounts. * Or, if we are doing a forced umount (typically because of IO errors). @@ -549,16 +540,30 @@ xfs_log_unmount_write(xfs_mount_t *mp) } while (iclog != first_iclog); #endif if (! (XLOG_FORCED_SHUTDOWN(log))) { - reg[0].i_addr = (void*)&magic; - reg[0].i_len = sizeof(magic); - reg[0].i_type = XLOG_REG_TYPE_UNMOUNT; - error = xfs_log_reserve(mp, 600, 1, &tic, XFS_LOG, 0, XLOG_UNMOUNT_REC_TYPE); if (!error) { + /* the data section must be 32 bit size aligned */ + struct { + __uint16_t magic; + __uint16_t pad1; + __uint32_t pad2; /* may as well make it 64 bits */ + } magic = { + .magic = XLOG_UNMOUNT_TYPE, + }; + struct xfs_log_iovec reg = { + .i_addr = (void *)&magic, + .i_len = sizeof(magic), + .i_type = XLOG_REG_TYPE_UNMOUNT, + }; + struct xfs_log_vec vec = { + .lv_niovecs = 1, + .lv_iovecp = ®, + }; + /* remove inited flag */ - ((xlog_ticket_t *)tic)->t_flags = 0; - error = xlog_write(mp, reg, 1, tic, &lsn, + tic->t_flags = 0; + error = xlog_write(log, &vec, tic, &lsn, NULL, XLOG_UNMOUNT_TRANS); /* * At this point, we're umounting anyway, @@ -648,10 +653,30 @@ xfs_log_unmount(xfs_mount_t *mp) xlog_dealloc_log(mp->m_log); } +void +xfs_log_item_init( + struct xfs_mount *mp, + struct xfs_log_item *item, + int type, + struct xfs_item_ops *ops) +{ + item->li_mountp = mp; + item->li_ailp = mp->m_ail; + item->li_type = type; + item->li_ops = ops; + item->li_lv = NULL; + + INIT_LIST_HEAD(&item->li_ail); + INIT_LIST_HEAD(&item->li_cil); +} + /* * Write region vectors to log. The write happens using the space reservation * of the ticket (tic). It is not a requirement that all writes for a given - * transaction occur with one call to xfs_log_write(). + * transaction occur with one call to xfs_log_write(). However, it is important + * to note that the transaction reservation code makes an assumption about the + * number of log headers a transaction requires that may be violated if you + * don't pass all the transaction vectors in one call.... */ int xfs_log_write( @@ -663,11 +688,15 @@ xfs_log_write( { struct log *log = mp->m_log; int error; + struct xfs_log_vec vec = { + .lv_niovecs = nentries, + .lv_iovecp = reg, + }; if (XLOG_FORCED_SHUTDOWN(log)) return XFS_ERROR(EIO); - error = xlog_write(mp, reg, nentries, tic, start_lsn, NULL, 0); + error = xlog_write(log, &vec, tic, start_lsn, NULL, 0); if (error) xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); return error; @@ -1020,6 +1049,7 @@ xlog_alloc_log(xfs_mount_t *mp, int i; int iclogsize; int error = ENOMEM; + uint log2_size = 0; log = kmem_zalloc(sizeof(xlog_t), KM_MAYFAIL); if (!log) { @@ -1045,29 +1075,30 @@ xlog_alloc_log(xfs_mount_t *mp, error = EFSCORRUPTED; if (xfs_sb_version_hassector(&mp->m_sb)) { - log->l_sectbb_log = mp->m_sb.sb_logsectlog - BBSHIFT; - if (log->l_sectbb_log < 0 || - log->l_sectbb_log > mp->m_sectbb_log) { - xlog_warn("XFS: Log sector size (0x%x) out of range.", - log->l_sectbb_log); + log2_size = mp->m_sb.sb_logsectlog; + if (log2_size < BBSHIFT) { + xlog_warn("XFS: Log sector size too small " + "(0x%x < 0x%x)", log2_size, BBSHIFT); goto out_free_log; } - /* for larger sector sizes, must have v2 or external log */ - if (log->l_sectbb_log != 0 && - (log->l_logBBstart != 0 && - !xfs_sb_version_haslogv2(&mp->m_sb))) { - xlog_warn("XFS: log sector size (0x%x) invalid " - "for configuration.", log->l_sectbb_log); + log2_size -= BBSHIFT; + if (log2_size > mp->m_sectbb_log) { + xlog_warn("XFS: Log sector size too large " + "(0x%x > 0x%x)", log2_size, mp->m_sectbb_log); goto out_free_log; } - if (mp->m_sb.sb_logsectlog < BBSHIFT) { - xlog_warn("XFS: Log sector log (0x%x) too small.", - mp->m_sb.sb_logsectlog); + + /* for larger sector sizes, must have v2 or external log */ + if (log2_size && log->l_logBBstart > 0 && + !xfs_sb_version_haslogv2(&mp->m_sb)) { + + xlog_warn("XFS: log sector size (0x%x) invalid " + "for configuration.", log2_size); goto out_free_log; } } - log->l_sectbb_mask = (1 << log->l_sectbb_log) - 1; + log->l_sectBBsize = 1 << log2_size; xlog_get_iclog_buffer_size(mp, log); @@ -1147,6 +1178,9 @@ xlog_alloc_log(xfs_mount_t *mp, *iclogp = log->l_iclog; /* complete ring */ log->l_iclog->ic_prev = prev_iclog; /* re-write 1st prev ptr */ + error = xlog_cil_init(log); + if (error) + goto out_free_iclog; return log; out_free_iclog: @@ -1174,26 +1208,31 @@ out: * ticket. Return the lsn of the commit record. */ STATIC int -xlog_commit_record(xfs_mount_t *mp, - xlog_ticket_t *ticket, - xlog_in_core_t **iclog, - xfs_lsn_t *commitlsnp) +xlog_commit_record( + struct log *log, + struct xlog_ticket *ticket, + struct xlog_in_core **iclog, + xfs_lsn_t *commitlsnp) { - int error; - xfs_log_iovec_t reg[1]; - - reg[0].i_addr = NULL; - reg[0].i_len = 0; - reg[0].i_type = XLOG_REG_TYPE_COMMIT; + struct xfs_mount *mp = log->l_mp; + int error; + struct xfs_log_iovec reg = { + .i_addr = NULL, + .i_len = 0, + .i_type = XLOG_REG_TYPE_COMMIT, + }; + struct xfs_log_vec vec = { + .lv_niovecs = 1, + .lv_iovecp = ®, + }; ASSERT_ALWAYS(iclog); - if ((error = xlog_write(mp, reg, 1, ticket, commitlsnp, - iclog, XLOG_COMMIT_TRANS))) { + error = xlog_write(log, &vec, ticket, commitlsnp, iclog, + XLOG_COMMIT_TRANS); + if (error) xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); - } return error; -} /* xlog_commit_record */ - +} /* * Push on the buffer cache code if we ever use more than 75% of the on-disk @@ -1468,6 +1507,8 @@ xlog_dealloc_log(xlog_t *log) xlog_in_core_t *iclog, *next_iclog; int i; + xlog_cil_destroy(log); + iclog = log->l_iclog; for (i=0; i<log->l_iclog_bufs; i++) { sv_destroy(&iclog->ic_force_wait); @@ -1510,8 +1551,10 @@ xlog_state_finish_copy(xlog_t *log, * print out info relating to regions written which consume * the reservation */ -STATIC void -xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket) +void +xlog_print_tic_res( + struct xfs_mount *mp, + struct xlog_ticket *ticket) { uint i; uint ophdr_spc = ticket->t_res_num_ophdrs * (uint)sizeof(xlog_op_header_t); @@ -1611,6 +1654,196 @@ xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket) "bad-rtype" : res_type_str[r_type-1]), ticket->t_res_arr[i].r_len); } + + xfs_cmn_err(XFS_PTAG_LOGRES, CE_ALERT, mp, + "xfs_log_write: reservation ran out. Need to up reservation"); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); +} + +/* + * Calculate the potential space needed by the log vector. Each region gets + * its own xlog_op_header_t and may need to be double word aligned. + */ +static int +xlog_write_calc_vec_length( + struct xlog_ticket *ticket, + struct xfs_log_vec *log_vector) +{ + struct xfs_log_vec *lv; + int headers = 0; + int len = 0; + int i; + + /* acct for start rec of xact */ + if (ticket->t_flags & XLOG_TIC_INITED) + headers++; + + for (lv = log_vector; lv; lv = lv->lv_next) { + headers += lv->lv_niovecs; + + for (i = 0; i < lv->lv_niovecs; i++) { + struct xfs_log_iovec *vecp = &lv->lv_iovecp[i]; + + len += vecp->i_len; + xlog_tic_add_region(ticket, vecp->i_len, vecp->i_type); + } + } + + ticket->t_res_num_ophdrs += headers; + len += headers * sizeof(struct xlog_op_header); + + return len; +} + +/* + * If first write for transaction, insert start record We can't be trying to + * commit if we are inited. We can't have any "partial_copy" if we are inited. + */ +static int +xlog_write_start_rec( + struct xlog_op_header *ophdr, + struct xlog_ticket *ticket) +{ + if (!(ticket->t_flags & XLOG_TIC_INITED)) + return 0; + + ophdr->oh_tid = cpu_to_be32(ticket->t_tid); + ophdr->oh_clientid = ticket->t_clientid; + ophdr->oh_len = 0; + ophdr->oh_flags = XLOG_START_TRANS; + ophdr->oh_res2 = 0; + + ticket->t_flags &= ~XLOG_TIC_INITED; + + return sizeof(struct xlog_op_header); +} + +static xlog_op_header_t * +xlog_write_setup_ophdr( + struct log *log, + struct xlog_op_header *ophdr, + struct xlog_ticket *ticket, + uint flags) +{ + ophdr->oh_tid = cpu_to_be32(ticket->t_tid); + ophdr->oh_clientid = ticket->t_clientid; + ophdr->oh_res2 = 0; + + /* are we copying a commit or unmount record? */ + ophdr->oh_flags = flags; + + /* + * We've seen logs corrupted with bad transaction client ids. This + * makes sure that XFS doesn't generate them on. Turn this into an EIO + * and shut down the filesystem. + */ + switch (ophdr->oh_clientid) { + case XFS_TRANSACTION: + case XFS_VOLUME: + case XFS_LOG: + break; + default: + xfs_fs_cmn_err(CE_WARN, log->l_mp, + "Bad XFS transaction clientid 0x%x in ticket 0x%p", + ophdr->oh_clientid, ticket); + return NULL; + } + + return ophdr; +} + +/* + * Set up the parameters of the region copy into the log. This has + * to handle region write split across multiple log buffers - this + * state is kept external to this function so that this code can + * can be written in an obvious, self documenting manner. + */ +static int +xlog_write_setup_copy( + struct xlog_ticket *ticket, + struct xlog_op_header *ophdr, + int space_available, + int space_required, + int *copy_off, + int *copy_len, + int *last_was_partial_copy, + int *bytes_consumed) +{ + int still_to_copy; + + still_to_copy = space_required - *bytes_consumed; + *copy_off = *bytes_consumed; + + if (still_to_copy <= space_available) { + /* write of region completes here */ + *copy_len = still_to_copy; + ophdr->oh_len = cpu_to_be32(*copy_len); + if (*last_was_partial_copy) + ophdr->oh_flags |= (XLOG_END_TRANS|XLOG_WAS_CONT_TRANS); + *last_was_partial_copy = 0; + *bytes_consumed = 0; + return 0; + } + + /* partial write of region, needs extra log op header reservation */ + *copy_len = space_available; + ophdr->oh_len = cpu_to_be32(*copy_len); + ophdr->oh_flags |= XLOG_CONTINUE_TRANS; + if (*last_was_partial_copy) + ophdr->oh_flags |= XLOG_WAS_CONT_TRANS; + *bytes_consumed += *copy_len; + (*last_was_partial_copy)++; + + /* account for new log op header */ + ticket->t_curr_res -= sizeof(struct xlog_op_header); + ticket->t_res_num_ophdrs++; + + return sizeof(struct xlog_op_header); +} + +static int +xlog_write_copy_finish( + struct log *log, + struct xlog_in_core *iclog, + uint flags, + int *record_cnt, + int *data_cnt, + int *partial_copy, + int *partial_copy_len, + int log_offset, + struct xlog_in_core **commit_iclog) +{ + if (*partial_copy) { + /* + * This iclog has already been marked WANT_SYNC by + * xlog_state_get_iclog_space. + */ + xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt); + *record_cnt = 0; + *data_cnt = 0; + return xlog_state_release_iclog(log, iclog); + } + + *partial_copy = 0; + *partial_copy_len = 0; + + if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) { + /* no more space in this iclog - push it. */ + xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt); + *record_cnt = 0; + *data_cnt = 0; + + spin_lock(&log->l_icloglock); + xlog_state_want_sync(log, iclog); + spin_unlock(&log->l_icloglock); + + if (!commit_iclog) + return xlog_state_release_iclog(log, iclog); + ASSERT(flags & XLOG_COMMIT_TRANS); + *commit_iclog = iclog; + } + + return 0; } /* @@ -1653,211 +1886,163 @@ xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket) * we don't update ic_offset until the end when we know exactly how many * bytes have been written out. */ -STATIC int +int xlog_write( - struct xfs_mount *mp, - struct xfs_log_iovec reg[], - int nentries, + struct log *log, + struct xfs_log_vec *log_vector, struct xlog_ticket *ticket, xfs_lsn_t *start_lsn, struct xlog_in_core **commit_iclog, uint flags) { - xlog_t *log = mp->m_log; - xlog_in_core_t *iclog = NULL; /* ptr to current in-core log */ - xlog_op_header_t *logop_head; /* ptr to log operation header */ - __psint_t ptr; /* copy address into data region */ - int len; /* # xlog_write() bytes 2 still copy */ - int index; /* region index currently copying */ - int log_offset; /* offset (from 0) into data region */ - int start_rec_copy; /* # bytes to copy for start record */ - int partial_copy; /* did we split a region? */ - int partial_copy_len;/* # bytes copied if split region */ - int need_copy; /* # bytes need to memcpy this region */ - int copy_len; /* # bytes actually memcpy'ing */ - int copy_off; /* # bytes from entry start */ - int contwr; /* continued write of in-core log? */ - int error; - int record_cnt = 0, data_cnt = 0; - - partial_copy_len = partial_copy = 0; - - /* Calculate potential maximum space. Each region gets its own - * xlog_op_header_t and may need to be double word aligned. - */ - len = 0; - if (ticket->t_flags & XLOG_TIC_INITED) { /* acct for start rec of xact */ - len += sizeof(xlog_op_header_t); - ticket->t_res_num_ophdrs++; - } - - for (index = 0; index < nentries; index++) { - len += sizeof(xlog_op_header_t); /* each region gets >= 1 */ - ticket->t_res_num_ophdrs++; - len += reg[index].i_len; - xlog_tic_add_region(ticket, reg[index].i_len, reg[index].i_type); - } - contwr = *start_lsn = 0; + struct xlog_in_core *iclog = NULL; + struct xfs_log_iovec *vecp; + struct xfs_log_vec *lv; + int len; + int index; + int partial_copy = 0; + int partial_copy_len = 0; + int contwr = 0; + int record_cnt = 0; + int data_cnt = 0; + int error; - if (ticket->t_curr_res < len) { - xlog_print_tic_res(mp, ticket); -#ifdef DEBUG - xlog_panic( - "xfs_log_write: reservation ran out. Need to up reservation"); -#else - /* Customer configurable panic */ - xfs_cmn_err(XFS_PTAG_LOGRES, CE_ALERT, mp, - "xfs_log_write: reservation ran out. Need to up reservation"); - /* If we did not panic, shutdown the filesystem */ - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); -#endif - } else - ticket->t_curr_res -= len; + *start_lsn = 0; - for (index = 0; index < nentries; ) { - if ((error = xlog_state_get_iclog_space(log, len, &iclog, ticket, - &contwr, &log_offset))) - return error; + len = xlog_write_calc_vec_length(ticket, log_vector); + if (log->l_cilp) { + /* + * Region headers and bytes are already accounted for. + * We only need to take into account start records and + * split regions in this function. + */ + if (ticket->t_flags & XLOG_TIC_INITED) + ticket->t_curr_res -= sizeof(xlog_op_header_t); - ASSERT(log_offset <= iclog->ic_size - 1); - ptr = (__psint_t) ((char *)iclog->ic_datap+log_offset); + /* + * Commit record headers need to be accounted for. These + * come in as separate writes so are easy to detect. + */ + if (flags & (XLOG_COMMIT_TRANS | XLOG_UNMOUNT_TRANS)) + ticket->t_curr_res -= sizeof(xlog_op_header_t); + } else + ticket->t_curr_res -= len; + + if (ticket->t_curr_res < 0) + xlog_print_tic_res(log->l_mp, ticket); + + index = 0; + lv = log_vector; + vecp = lv->lv_iovecp; + while (lv && index < lv->lv_niovecs) { + void *ptr; + int log_offset; + + error = xlog_state_get_iclog_space(log, len, &iclog, ticket, + &contwr, &log_offset); + if (error) + return error; - /* start_lsn is the first lsn written to. That's all we need. */ - if (! *start_lsn) - *start_lsn = be64_to_cpu(iclog->ic_header.h_lsn); + ASSERT(log_offset <= iclog->ic_size - 1); + ptr = iclog->ic_datap + log_offset; - /* This loop writes out as many regions as can fit in the amount - * of space which was allocated by xlog_state_get_iclog_space(). - */ - while (index < nentries) { - ASSERT(reg[index].i_len % sizeof(__int32_t) == 0); - ASSERT((__psint_t)ptr % sizeof(__int32_t) == 0); - start_rec_copy = 0; - - /* If first write for transaction, insert start record. - * We can't be trying to commit if we are inited. We can't - * have any "partial_copy" if we are inited. - */ - if (ticket->t_flags & XLOG_TIC_INITED) { - logop_head = (xlog_op_header_t *)ptr; - logop_head->oh_tid = cpu_to_be32(ticket->t_tid); - logop_head->oh_clientid = ticket->t_clientid; - logop_head->oh_len = 0; - logop_head->oh_flags = XLOG_START_TRANS; - logop_head->oh_res2 = 0; - ticket->t_flags &= ~XLOG_TIC_INITED; /* clear bit */ - record_cnt++; - - start_rec_copy = sizeof(xlog_op_header_t); - xlog_write_adv_cnt(ptr, len, log_offset, start_rec_copy); - } + /* start_lsn is the first lsn written to. That's all we need. */ + if (!*start_lsn) + *start_lsn = be64_to_cpu(iclog->ic_header.h_lsn); - /* Copy log operation header directly into data section */ - logop_head = (xlog_op_header_t *)ptr; - logop_head->oh_tid = cpu_to_be32(ticket->t_tid); - logop_head->oh_clientid = ticket->t_clientid; - logop_head->oh_res2 = 0; + /* + * This loop writes out as many regions as can fit in the amount + * of space which was allocated by xlog_state_get_iclog_space(). + */ + while (lv && index < lv->lv_niovecs) { + struct xfs_log_iovec *reg = &vecp[index]; + struct xlog_op_header *ophdr; + int start_rec_copy; + int copy_len; + int copy_off; + + ASSERT(reg->i_len % sizeof(__int32_t) == 0); + ASSERT((unsigned long)ptr % sizeof(__int32_t) == 0); + + start_rec_copy = xlog_write_start_rec(ptr, ticket); + if (start_rec_copy) { + record_cnt++; + xlog_write_adv_cnt(&ptr, &len, &log_offset, + start_rec_copy); + } - /* header copied directly */ - xlog_write_adv_cnt(ptr, len, log_offset, sizeof(xlog_op_header_t)); + ophdr = xlog_write_setup_ophdr(log, ptr, ticket, flags); + if (!ophdr) + return XFS_ERROR(EIO); - /* are we copying a commit or unmount record? */ - logop_head->oh_flags = flags; + xlog_write_adv_cnt(&ptr, &len, &log_offset, + sizeof(struct xlog_op_header)); + + len += xlog_write_setup_copy(ticket, ophdr, + iclog->ic_size-log_offset, + reg->i_len, + ©_off, ©_len, + &partial_copy, + &partial_copy_len); + xlog_verify_dest_ptr(log, ptr); + + /* copy region */ + ASSERT(copy_len >= 0); + memcpy(ptr, reg->i_addr + copy_off, copy_len); + xlog_write_adv_cnt(&ptr, &len, &log_offset, copy_len); + + copy_len += start_rec_copy + sizeof(xlog_op_header_t); + record_cnt++; + data_cnt += contwr ? copy_len : 0; + + error = xlog_write_copy_finish(log, iclog, flags, + &record_cnt, &data_cnt, + &partial_copy, + &partial_copy_len, + log_offset, + commit_iclog); + if (error) + return error; - /* - * We've seen logs corrupted with bad transaction client - * ids. This makes sure that XFS doesn't generate them on. - * Turn this into an EIO and shut down the filesystem. - */ - switch (logop_head->oh_clientid) { - case XFS_TRANSACTION: - case XFS_VOLUME: - case XFS_LOG: - break; - default: - xfs_fs_cmn_err(CE_WARN, mp, - "Bad XFS transaction clientid 0x%x in ticket 0x%p", - logop_head->oh_clientid, ticket); - return XFS_ERROR(EIO); - } + /* + * if we had a partial copy, we need to get more iclog + * space but we don't want to increment the region + * index because there is still more is this region to + * write. + * + * If we completed writing this region, and we flushed + * the iclog (indicated by resetting of the record + * count), then we also need to get more log space. If + * this was the last record, though, we are done and + * can just return. + */ + if (partial_copy) + break; - /* Partial write last time? => (partial_copy != 0) - * need_copy is the amount we'd like to copy if everything could - * fit in the current memcpy. - */ - need_copy = reg[index].i_len - partial_copy_len; - - copy_off = partial_copy_len; - if (need_copy <= iclog->ic_size - log_offset) { /*complete write */ - copy_len = need_copy; - logop_head->oh_len = cpu_to_be32(copy_len); - if (partial_copy) - logop_head->oh_flags|= (XLOG_END_TRANS|XLOG_WAS_CONT_TRANS); - partial_copy_len = partial_copy = 0; - } else { /* partial write */ - copy_len = iclog->ic_size - log_offset; - logop_head->oh_len = cpu_to_be32(copy_len); - logop_head->oh_flags |= XLOG_CONTINUE_TRANS; - if (partial_copy) - logop_head->oh_flags |= XLOG_WAS_CONT_TRANS; - partial_copy_len += copy_len; - partial_copy++; - len += sizeof(xlog_op_header_t); /* from splitting of region */ - /* account for new log op header */ - ticket->t_curr_res -= sizeof(xlog_op_header_t); - ticket->t_res_num_ophdrs++; - } - xlog_verify_dest_ptr(log, ptr); - - /* copy region */ - ASSERT(copy_len >= 0); - memcpy((xfs_caddr_t)ptr, reg[index].i_addr + copy_off, copy_len); - xlog_write_adv_cnt(ptr, len, log_offset, copy_len); - - /* make copy_len total bytes copied, including headers */ - copy_len += start_rec_copy + sizeof(xlog_op_header_t); - record_cnt++; - data_cnt += contwr ? copy_len : 0; - if (partial_copy) { /* copied partial region */ - /* already marked WANT_SYNC by xlog_state_get_iclog_space */ - xlog_state_finish_copy(log, iclog, record_cnt, data_cnt); - record_cnt = data_cnt = 0; - if ((error = xlog_state_release_iclog(log, iclog))) - return error; - break; /* don't increment index */ - } else { /* copied entire region */ - index++; - partial_copy_len = partial_copy = 0; - - if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) { - xlog_state_finish_copy(log, iclog, record_cnt, data_cnt); - record_cnt = data_cnt = 0; - spin_lock(&log->l_icloglock); - xlog_state_want_sync(log, iclog); - spin_unlock(&log->l_icloglock); - if (commit_iclog) { - ASSERT(flags & XLOG_COMMIT_TRANS); - *commit_iclog = iclog; - } else if ((error = xlog_state_release_iclog(log, iclog))) - return error; - if (index == nentries) - return 0; /* we are done */ - else - break; + if (++index == lv->lv_niovecs) { + lv = lv->lv_next; + index = 0; + if (lv) + vecp = lv->lv_iovecp; + } + if (record_cnt == 0) { + if (!lv) + return 0; + break; + } } - } /* if (partial_copy) */ - } /* while (index < nentries) */ - } /* for (index = 0; index < nentries; ) */ - ASSERT(len == 0); + } + + ASSERT(len == 0); + + xlog_state_finish_copy(log, iclog, record_cnt, data_cnt); + if (!commit_iclog) + return xlog_state_release_iclog(log, iclog); - xlog_state_finish_copy(log, iclog, record_cnt, data_cnt); - if (commit_iclog) { ASSERT(flags & XLOG_COMMIT_TRANS); *commit_iclog = iclog; return 0; - } - return xlog_state_release_iclog(log, iclog); -} /* xlog_write */ +} /***************************************************************************** @@ -2840,6 +3025,8 @@ _xfs_log_force( XFS_STATS_INC(xs_log_force); + xlog_cil_push(log, 1); + spin_lock(&log->l_icloglock); iclog = log->l_iclog; @@ -2989,6 +3176,12 @@ _xfs_log_force_lsn( XFS_STATS_INC(xs_log_force); + if (log->l_cilp) { + lsn = xlog_cil_push_lsn(log, lsn); + if (lsn == NULLCOMMITLSN) + return 0; + } + try_again: spin_lock(&log->l_icloglock); iclog = log->l_iclog; @@ -3153,20 +3346,30 @@ xfs_log_ticket_get( return ticket; } +xlog_tid_t +xfs_log_get_trans_ident( + struct xfs_trans *tp) +{ + return tp->t_ticket->t_tid; +} + /* * Allocate and initialise a new log ticket. */ -STATIC xlog_ticket_t * -xlog_ticket_alloc(xlog_t *log, - int unit_bytes, - int cnt, - char client, - uint xflags) +xlog_ticket_t * +xlog_ticket_alloc( + struct log *log, + int unit_bytes, + int cnt, + char client, + uint xflags, + int alloc_flags) { - xlog_ticket_t *tic; + struct xlog_ticket *tic; uint num_headers; + int iclog_space; - tic = kmem_zone_zalloc(xfs_log_ticket_zone, KM_SLEEP|KM_MAYFAIL); + tic = kmem_zone_zalloc(xfs_log_ticket_zone, alloc_flags); if (!tic) return NULL; @@ -3208,16 +3411,40 @@ xlog_ticket_alloc(xlog_t *log, /* for start-rec */ unit_bytes += sizeof(xlog_op_header_t); - /* for LR headers */ - num_headers = ((unit_bytes + log->l_iclog_size-1) >> log->l_iclog_size_log); + /* + * for LR headers - the space for data in an iclog is the size minus + * the space used for the headers. If we use the iclog size, then we + * undercalculate the number of headers required. + * + * Furthermore - the addition of op headers for split-recs might + * increase the space required enough to require more log and op + * headers, so take that into account too. + * + * IMPORTANT: This reservation makes the assumption that if this + * transaction is the first in an iclog and hence has the LR headers + * accounted to it, then the remaining space in the iclog is + * exclusively for this transaction. i.e. if the transaction is larger + * than the iclog, it will be the only thing in that iclog. + * Fundamentally, this means we must pass the entire log vector to + * xlog_write to guarantee this. + */ + iclog_space = log->l_iclog_size - log->l_iclog_hsize; + num_headers = howmany(unit_bytes, iclog_space); + + /* for split-recs - ophdrs added when data split over LRs */ + unit_bytes += sizeof(xlog_op_header_t) * num_headers; + + /* add extra header reservations if we overrun */ + while (!num_headers || + howmany(unit_bytes, iclog_space) > num_headers) { + unit_bytes += sizeof(xlog_op_header_t); + num_headers++; + } unit_bytes += log->l_iclog_hsize * num_headers; /* for commit-rec LR header - note: padding will subsume the ophdr */ unit_bytes += log->l_iclog_hsize; - /* for split-recs - ophdrs added when data split over LRs */ - unit_bytes += sizeof(xlog_op_header_t) * num_headers; - /* for roundoff padding for transaction data and one for commit record */ if (xfs_sb_version_haslogv2(&log->l_mp->m_sb) && log->l_mp->m_sb.sb_logsunit > 1) { @@ -3233,13 +3460,13 @@ xlog_ticket_alloc(xlog_t *log, tic->t_curr_res = unit_bytes; tic->t_cnt = cnt; tic->t_ocnt = cnt; - tic->t_tid = (xlog_tid_t)((__psint_t)tic & 0xffffffff); + tic->t_tid = random32(); tic->t_clientid = client; tic->t_flags = XLOG_TIC_INITED; tic->t_trans_type = 0; if (xflags & XFS_LOG_PERM_RESERV) tic->t_flags |= XLOG_TIC_PERM_RESERV; - sv_init(&(tic->t_wait), SV_DEFAULT, "logtick"); + sv_init(&tic->t_wait, SV_DEFAULT, "logtick"); xlog_tic_reset_res(tic); @@ -3260,20 +3487,22 @@ xlog_ticket_alloc(xlog_t *log, * part of the log in case we trash the log structure. */ void -xlog_verify_dest_ptr(xlog_t *log, - __psint_t ptr) +xlog_verify_dest_ptr( + struct log *log, + char *ptr) { int i; int good_ptr = 0; - for (i=0; i < log->l_iclog_bufs; i++) { - if (ptr >= (__psint_t)log->l_iclog_bak[i] && - ptr <= (__psint_t)log->l_iclog_bak[i]+log->l_iclog_size) + for (i = 0; i < log->l_iclog_bufs; i++) { + if (ptr >= log->l_iclog_bak[i] && + ptr <= log->l_iclog_bak[i] + log->l_iclog_size) good_ptr++; } - if (! good_ptr) + + if (!good_ptr) xlog_panic("xlog_verify_dest_ptr: invalid ptr"); -} /* xlog_verify_dest_ptr */ +} STATIC void xlog_verify_grant_head(xlog_t *log, int equals) @@ -3459,6 +3688,11 @@ xlog_state_ioerror( * c. nothing new gets queued up after (a) and (b) are done. * d. if !logerror, flush the iclogs to disk, then seal them off * for business. + * + * Note: for delayed logging the !logerror case needs to flush the regions + * held in memory out to the iclogs before flushing them to disk. This needs + * to be done before the log is marked as shutdown, otherwise the flush to the + * iclogs will fail. */ int xfs_log_force_umount( @@ -3492,6 +3726,16 @@ xfs_log_force_umount( return 1; } retval = 0; + + /* + * Flush the in memory commit item list before marking the log as + * being shut down. We need to do it in this order to ensure all the + * completed transactions are flushed to disk with the xfs_log_force() + * call below. + */ + if (!logerror && (mp->m_flags & XFS_MOUNT_DELAYLOG)) + xlog_cil_push(log, 1); + /* * We must hold both the GRANT lock and the LOG lock, * before we mark the filesystem SHUTDOWN and wake diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 97a24c7795a..04c78e642cc 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -19,7 +19,6 @@ #define __XFS_LOG_H__ /* get lsn fields */ - #define CYCLE_LSN(lsn) ((uint)((lsn)>>32)) #define BLOCK_LSN(lsn) ((uint)(lsn)) @@ -110,6 +109,15 @@ typedef struct xfs_log_iovec { uint i_type; /* type of region */ } xfs_log_iovec_t; +struct xfs_log_vec { + struct xfs_log_vec *lv_next; /* next lv in build list */ + int lv_niovecs; /* number of iovecs in lv */ + struct xfs_log_iovec *lv_iovecp; /* iovec array */ + struct xfs_log_item *lv_item; /* owner */ + char *lv_buf; /* formatted buffer */ + int lv_buf_len; /* size of formatted buffer */ +}; + /* * Structure used to pass callback function and the function's argument * to the log manager. @@ -126,6 +134,14 @@ typedef struct xfs_log_callback { struct xfs_mount; struct xlog_in_core; struct xlog_ticket; +struct xfs_log_item; +struct xfs_item_ops; +struct xfs_trans; + +void xfs_log_item_init(struct xfs_mount *mp, + struct xfs_log_item *item, + int type, + struct xfs_item_ops *ops); xfs_lsn_t xfs_log_done(struct xfs_mount *mp, struct xlog_ticket *ticket, @@ -174,9 +190,16 @@ int xfs_log_need_covered(struct xfs_mount *mp); void xlog_iodone(struct xfs_buf *); -struct xlog_ticket * xfs_log_ticket_get(struct xlog_ticket *ticket); +struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket); void xfs_log_ticket_put(struct xlog_ticket *ticket); +xlog_tid_t xfs_log_get_trans_ident(struct xfs_trans *tp); + +int xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, + struct xfs_log_vec *log_vector, + xfs_lsn_t *commit_lsn, int flags); +bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); + #endif diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c new file mode 100644 index 00000000000..bb17cc044bf --- /dev/null +++ b/fs/xfs/xfs_log_cil.c @@ -0,0 +1,725 @@ +/* + * Copyright (c) 2010 Red Hat, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" +#include "xfs_log_priv.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir2.h" +#include "xfs_dmapi.h" +#include "xfs_mount.h" +#include "xfs_error.h" +#include "xfs_alloc.h" + +/* + * Perform initial CIL structure initialisation. If the CIL is not + * enabled in this filesystem, ensure the log->l_cilp is null so + * we can check this conditional to determine if we are doing delayed + * logging or not. + */ +int +xlog_cil_init( + struct log *log) +{ + struct xfs_cil *cil; + struct xfs_cil_ctx *ctx; + + log->l_cilp = NULL; + if (!(log->l_mp->m_flags & XFS_MOUNT_DELAYLOG)) + return 0; + + cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL); + if (!cil) + return ENOMEM; + + ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL); + if (!ctx) { + kmem_free(cil); + return ENOMEM; + } + + INIT_LIST_HEAD(&cil->xc_cil); + INIT_LIST_HEAD(&cil->xc_committing); + spin_lock_init(&cil->xc_cil_lock); + init_rwsem(&cil->xc_ctx_lock); + sv_init(&cil->xc_commit_wait, SV_DEFAULT, "cilwait"); + + INIT_LIST_HEAD(&ctx->committing); + INIT_LIST_HEAD(&ctx->busy_extents); + ctx->sequence = 1; + ctx->cil = cil; + cil->xc_ctx = ctx; + + cil->xc_log = log; + log->l_cilp = cil; + return 0; +} + +void +xlog_cil_destroy( + struct log *log) +{ + if (!log->l_cilp) + return; + + if (log->l_cilp->xc_ctx) { + if (log->l_cilp->xc_ctx->ticket) + xfs_log_ticket_put(log->l_cilp->xc_ctx->ticket); + kmem_free(log->l_cilp->xc_ctx); + } + + ASSERT(list_empty(&log->l_cilp->xc_cil)); + kmem_free(log->l_cilp); +} + +/* + * Allocate a new ticket. Failing to get a new ticket makes it really hard to + * recover, so we don't allow failure here. Also, we allocate in a context that + * we don't want to be issuing transactions from, so we need to tell the + * allocation code this as well. + * + * We don't reserve any space for the ticket - we are going to steal whatever + * space we require from transactions as they commit. To ensure we reserve all + * the space required, we need to set the current reservation of the ticket to + * zero so that we know to steal the initial transaction overhead from the + * first transaction commit. + */ +static struct xlog_ticket * +xlog_cil_ticket_alloc( + struct log *log) +{ + struct xlog_ticket *tic; + + tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0, + KM_SLEEP|KM_NOFS); + tic->t_trans_type = XFS_TRANS_CHECKPOINT; + + /* + * set the current reservation to zero so we know to steal the basic + * transaction overhead reservation from the first transaction commit. + */ + tic->t_curr_res = 0; + return tic; +} + +/* + * After the first stage of log recovery is done, we know where the head and + * tail of the log are. We need this log initialisation done before we can + * initialise the first CIL checkpoint context. + * + * Here we allocate a log ticket to track space usage during a CIL push. This + * ticket is passed to xlog_write() directly so that we don't slowly leak log + * space by failing to account for space used by log headers and additional + * region headers for split regions. + */ +void +xlog_cil_init_post_recovery( + struct log *log) +{ + if (!log->l_cilp) + return; + + log->l_cilp->xc_ctx->ticket = xlog_cil_ticket_alloc(log); + log->l_cilp->xc_ctx->sequence = 1; + log->l_cilp->xc_ctx->commit_lsn = xlog_assign_lsn(log->l_curr_cycle, + log->l_curr_block); +} + +/* + * Insert the log item into the CIL and calculate the difference in space + * consumed by the item. Add the space to the checkpoint ticket and calculate + * if the change requires additional log metadata. If it does, take that space + * as well. Remove the amount of space we addded to the checkpoint ticket from + * the current transaction ticket so that the accounting works out correctly. + * + * If this is the first time the item is being placed into the CIL in this + * context, pin it so it can't be written to disk until the CIL is flushed to + * the iclog and the iclog written to disk. + */ +static void +xlog_cil_insert( + struct log *log, + struct xlog_ticket *ticket, + struct xfs_log_item *item, + struct xfs_log_vec *lv) +{ + struct xfs_cil *cil = log->l_cilp; + struct xfs_log_vec *old = lv->lv_item->li_lv; + struct xfs_cil_ctx *ctx = cil->xc_ctx; + int len; + int diff_iovecs; + int iclog_space; + + if (old) { + /* existing lv on log item, space used is a delta */ + ASSERT(!list_empty(&item->li_cil)); + ASSERT(old->lv_buf && old->lv_buf_len && old->lv_niovecs); + + len = lv->lv_buf_len - old->lv_buf_len; + diff_iovecs = lv->lv_niovecs - old->lv_niovecs; + kmem_free(old->lv_buf); + kmem_free(old); + } else { + /* new lv, must pin the log item */ + ASSERT(!lv->lv_item->li_lv); + ASSERT(list_empty(&item->li_cil)); + + len = lv->lv_buf_len; + diff_iovecs = lv->lv_niovecs; + IOP_PIN(lv->lv_item); + + } + len += diff_iovecs * sizeof(xlog_op_header_t); + + /* attach new log vector to log item */ + lv->lv_item->li_lv = lv; + + spin_lock(&cil->xc_cil_lock); + list_move_tail(&item->li_cil, &cil->xc_cil); + ctx->nvecs += diff_iovecs; + + /* + * If this is the first time the item is being committed to the CIL, + * store the sequence number on the log item so we can tell + * in future commits whether this is the first checkpoint the item is + * being committed into. + */ + if (!item->li_seq) + item->li_seq = ctx->sequence; + + /* + * Now transfer enough transaction reservation to the context ticket + * for the checkpoint. The context ticket is special - the unit + * reservation has to grow as well as the current reservation as we + * steal from tickets so we can correctly determine the space used + * during the transaction commit. + */ + if (ctx->ticket->t_curr_res == 0) { + /* first commit in checkpoint, steal the header reservation */ + ASSERT(ticket->t_curr_res >= ctx->ticket->t_unit_res + len); + ctx->ticket->t_curr_res = ctx->ticket->t_unit_res; + ticket->t_curr_res -= ctx->ticket->t_unit_res; + } + + /* do we need space for more log record headers? */ + iclog_space = log->l_iclog_size - log->l_iclog_hsize; + if (len > 0 && (ctx->space_used / iclog_space != + (ctx->space_used + len) / iclog_space)) { + int hdrs; + + hdrs = (len + iclog_space - 1) / iclog_space; + /* need to take into account split region headers, too */ + hdrs *= log->l_iclog_hsize + sizeof(struct xlog_op_header); + ctx->ticket->t_unit_res += hdrs; + ctx->ticket->t_curr_res += hdrs; + ticket->t_curr_res -= hdrs; + ASSERT(ticket->t_curr_res >= len); + } + ticket->t_curr_res -= len; + ctx->space_used += len; + + spin_unlock(&cil->xc_cil_lock); +} + +/* + * Format log item into a flat buffers + * + * For delayed logging, we need to hold a formatted buffer containing all the + * changes on the log item. This enables us to relog the item in memory and + * write it out asynchronously without needing to relock the object that was + * modified at the time it gets written into the iclog. + * + * This function builds a vector for the changes in each log item in the + * transaction. It then works out the length of the buffer needed for each log + * item, allocates them and formats the vector for the item into the buffer. + * The buffer is then attached to the log item are then inserted into the + * Committed Item List for tracking until the next checkpoint is written out. + * + * We don't set up region headers during this process; we simply copy the + * regions into the flat buffer. We can do this because we still have to do a + * formatting step to write the regions into the iclog buffer. Writing the + * ophdrs during the iclog write means that we can support splitting large + * regions across iclog boundares without needing a change in the format of the + * item/region encapsulation. + * + * Hence what we need to do now is change the rewrite the vector array to point + * to the copied region inside the buffer we just allocated. This allows us to + * format the regions into the iclog as though they are being formatted + * directly out of the objects themselves. + */ +static void +xlog_cil_format_items( + struct log *log, + struct xfs_log_vec *log_vector, + struct xlog_ticket *ticket, + xfs_lsn_t *start_lsn) +{ + struct xfs_log_vec *lv; + + if (start_lsn) + *start_lsn = log->l_cilp->xc_ctx->sequence; + + ASSERT(log_vector); + for (lv = log_vector; lv; lv = lv->lv_next) { + void *ptr; + int index; + int len = 0; + + /* build the vector array and calculate it's length */ + IOP_FORMAT(lv->lv_item, lv->lv_iovecp); + for (index = 0; index < lv->lv_niovecs; index++) + len += lv->lv_iovecp[index].i_len; + + lv->lv_buf_len = len; + lv->lv_buf = kmem_zalloc(lv->lv_buf_len, KM_SLEEP|KM_NOFS); + ptr = lv->lv_buf; + + for (index = 0; index < lv->lv_niovecs; index++) { + struct xfs_log_iovec *vec = &lv->lv_iovecp[index]; + + memcpy(ptr, vec->i_addr, vec->i_len); + vec->i_addr = ptr; + ptr += vec->i_len; + } + ASSERT(ptr == lv->lv_buf + lv->lv_buf_len); + + xlog_cil_insert(log, ticket, lv->lv_item, lv); + } +} + +static void +xlog_cil_free_logvec( + struct xfs_log_vec *log_vector) +{ + struct xfs_log_vec *lv; + + for (lv = log_vector; lv; ) { + struct xfs_log_vec *next = lv->lv_next; + kmem_free(lv->lv_buf); + kmem_free(lv); + lv = next; + } +} + +/* + * Commit a transaction with the given vector to the Committed Item List. + * + * To do this, we need to format the item, pin it in memory if required and + * account for the space used by the transaction. Once we have done that we + * need to release the unused reservation for the transaction, attach the + * transaction to the checkpoint context so we carry the busy extents through + * to checkpoint completion, and then unlock all the items in the transaction. + * + * For more specific information about the order of operations in + * xfs_log_commit_cil() please refer to the comments in + * xfs_trans_commit_iclog(). + * + * Called with the context lock already held in read mode to lock out + * background commit, returns without it held once background commits are + * allowed again. + */ +int +xfs_log_commit_cil( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_log_vec *log_vector, + xfs_lsn_t *commit_lsn, + int flags) +{ + struct log *log = mp->m_log; + int log_flags = 0; + int push = 0; + + if (flags & XFS_TRANS_RELEASE_LOG_RES) + log_flags = XFS_LOG_REL_PERM_RESERV; + + if (XLOG_FORCED_SHUTDOWN(log)) { + xlog_cil_free_logvec(log_vector); + return XFS_ERROR(EIO); + } + + /* lock out background commit */ + down_read(&log->l_cilp->xc_ctx_lock); + xlog_cil_format_items(log, log_vector, tp->t_ticket, commit_lsn); + + /* check we didn't blow the reservation */ + if (tp->t_ticket->t_curr_res < 0) + xlog_print_tic_res(log->l_mp, tp->t_ticket); + + /* attach the transaction to the CIL if it has any busy extents */ + if (!list_empty(&tp->t_busy)) { + spin_lock(&log->l_cilp->xc_cil_lock); + list_splice_init(&tp->t_busy, + &log->l_cilp->xc_ctx->busy_extents); + spin_unlock(&log->l_cilp->xc_cil_lock); + } + + tp->t_commit_lsn = *commit_lsn; + xfs_log_done(mp, tp->t_ticket, NULL, log_flags); + xfs_trans_unreserve_and_mod_sb(tp); + + /* check for background commit before unlock */ + if (log->l_cilp->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log)) + push = 1; + up_read(&log->l_cilp->xc_ctx_lock); + + /* + * We need to push CIL every so often so we don't cache more than we + * can fit in the log. The limit really is that a checkpoint can't be + * more than half the log (the current checkpoint is not allowed to + * overwrite the previous checkpoint), but commit latency and memory + * usage limit this to a smaller size in most cases. + */ + if (push) + xlog_cil_push(log, 0); + return 0; +} + +/* + * Mark all items committed and clear busy extents. We free the log vector + * chains in a separate pass so that we unpin the log items as quickly as + * possible. + */ +static void +xlog_cil_committed( + void *args, + int abort) +{ + struct xfs_cil_ctx *ctx = args; + struct xfs_log_vec *lv; + int abortflag = abort ? XFS_LI_ABORTED : 0; + struct xfs_busy_extent *busyp, *n; + + /* unpin all the log items */ + for (lv = ctx->lv_chain; lv; lv = lv->lv_next ) { + xfs_trans_item_committed(lv->lv_item, ctx->start_lsn, + abortflag); + } + + list_for_each_entry_safe(busyp, n, &ctx->busy_extents, list) + xfs_alloc_busy_clear(ctx->cil->xc_log->l_mp, busyp); + + spin_lock(&ctx->cil->xc_cil_lock); + list_del(&ctx->committing); + spin_unlock(&ctx->cil->xc_cil_lock); + + xlog_cil_free_logvec(ctx->lv_chain); + kmem_free(ctx); +} + +/* + * Push the Committed Item List to the log. If the push_now flag is not set, + * then it is a background flush and so we can chose to ignore it. + */ +int +xlog_cil_push( + struct log *log, + int push_now) +{ + struct xfs_cil *cil = log->l_cilp; + struct xfs_log_vec *lv; + struct xfs_cil_ctx *ctx; + struct xfs_cil_ctx *new_ctx; + struct xlog_in_core *commit_iclog; + struct xlog_ticket *tic; + int num_lv; + int num_iovecs; + int len; + int error = 0; + struct xfs_trans_header thdr; + struct xfs_log_iovec lhdr; + struct xfs_log_vec lvhdr = { NULL }; + xfs_lsn_t commit_lsn; + + if (!cil) + return 0; + + new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS); + new_ctx->ticket = xlog_cil_ticket_alloc(log); + + /* lock out transaction commit, but don't block on background push */ + if (!down_write_trylock(&cil->xc_ctx_lock)) { + if (!push_now) + goto out_free_ticket; + down_write(&cil->xc_ctx_lock); + } + ctx = cil->xc_ctx; + + /* check if we've anything to push */ + if (list_empty(&cil->xc_cil)) + goto out_skip; + + /* check for spurious background flush */ + if (!push_now && cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) + goto out_skip; + + /* + * pull all the log vectors off the items in the CIL, and + * remove the items from the CIL. We don't need the CIL lock + * here because it's only needed on the transaction commit + * side which is currently locked out by the flush lock. + */ + lv = NULL; + num_lv = 0; + num_iovecs = 0; + len = 0; + while (!list_empty(&cil->xc_cil)) { + struct xfs_log_item *item; + int i; + + item = list_first_entry(&cil->xc_cil, + struct xfs_log_item, li_cil); + list_del_init(&item->li_cil); + if (!ctx->lv_chain) + ctx->lv_chain = item->li_lv; + else + lv->lv_next = item->li_lv; + lv = item->li_lv; + item->li_lv = NULL; + + num_lv++; + num_iovecs += lv->lv_niovecs; + for (i = 0; i < lv->lv_niovecs; i++) + len += lv->lv_iovecp[i].i_len; + } + + /* + * initialise the new context and attach it to the CIL. Then attach + * the current context to the CIL committing lsit so it can be found + * during log forces to extract the commit lsn of the sequence that + * needs to be forced. + */ + INIT_LIST_HEAD(&new_ctx->committing); + INIT_LIST_HEAD(&new_ctx->busy_extents); + new_ctx->sequence = ctx->sequence + 1; + new_ctx->cil = cil; + cil->xc_ctx = new_ctx; + + /* + * The switch is now done, so we can drop the context lock and move out + * of a shared context. We can't just go straight to the commit record, + * though - we need to synchronise with previous and future commits so + * that the commit records are correctly ordered in the log to ensure + * that we process items during log IO completion in the correct order. + * + * For example, if we get an EFI in one checkpoint and the EFD in the + * next (e.g. due to log forces), we do not want the checkpoint with + * the EFD to be committed before the checkpoint with the EFI. Hence + * we must strictly order the commit records of the checkpoints so + * that: a) the checkpoint callbacks are attached to the iclogs in the + * correct order; and b) the checkpoints are replayed in correct order + * in log recovery. + * + * Hence we need to add this context to the committing context list so + * that higher sequences will wait for us to write out a commit record + * before they do. + */ + spin_lock(&cil->xc_cil_lock); + list_add(&ctx->committing, &cil->xc_committing); + spin_unlock(&cil->xc_cil_lock); + up_write(&cil->xc_ctx_lock); + + /* + * Build a checkpoint transaction header and write it to the log to + * begin the transaction. We need to account for the space used by the + * transaction header here as it is not accounted for in xlog_write(). + * + * The LSN we need to pass to the log items on transaction commit is + * the LSN reported by the first log vector write. If we use the commit + * record lsn then we can move the tail beyond the grant write head. + */ + tic = ctx->ticket; + thdr.th_magic = XFS_TRANS_HEADER_MAGIC; + thdr.th_type = XFS_TRANS_CHECKPOINT; + thdr.th_tid = tic->t_tid; + thdr.th_num_items = num_iovecs; + lhdr.i_addr = (xfs_caddr_t)&thdr; + lhdr.i_len = sizeof(xfs_trans_header_t); + lhdr.i_type = XLOG_REG_TYPE_TRANSHDR; + tic->t_curr_res -= lhdr.i_len + sizeof(xlog_op_header_t); + + lvhdr.lv_niovecs = 1; + lvhdr.lv_iovecp = &lhdr; + lvhdr.lv_next = ctx->lv_chain; + + error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0); + if (error) + goto out_abort; + + /* + * now that we've written the checkpoint into the log, strictly + * order the commit records so replay will get them in the right order. + */ +restart: + spin_lock(&cil->xc_cil_lock); + list_for_each_entry(new_ctx, &cil->xc_committing, committing) { + /* + * Higher sequences will wait for this one so skip them. + * Don't wait for own own sequence, either. + */ + if (new_ctx->sequence >= ctx->sequence) + continue; + if (!new_ctx->commit_lsn) { + /* + * It is still being pushed! Wait for the push to + * complete, then start again from the beginning. + */ + sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0); + goto restart; + } + } + spin_unlock(&cil->xc_cil_lock); + + commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0); + if (error || commit_lsn == -1) + goto out_abort; + + /* attach all the transactions w/ busy extents to iclog */ + ctx->log_cb.cb_func = xlog_cil_committed; + ctx->log_cb.cb_arg = ctx; + error = xfs_log_notify(log->l_mp, commit_iclog, &ctx->log_cb); + if (error) + goto out_abort; + + /* + * now the checkpoint commit is complete and we've attached the + * callbacks to the iclog we can assign the commit LSN to the context + * and wake up anyone who is waiting for the commit to complete. + */ + spin_lock(&cil->xc_cil_lock); + ctx->commit_lsn = commit_lsn; + sv_broadcast(&cil->xc_commit_wait); + spin_unlock(&cil->xc_cil_lock); + + /* release the hounds! */ + return xfs_log_release_iclog(log->l_mp, commit_iclog); + +out_skip: + up_write(&cil->xc_ctx_lock); +out_free_ticket: + xfs_log_ticket_put(new_ctx->ticket); + kmem_free(new_ctx); + return 0; + +out_abort: + xlog_cil_committed(ctx, XFS_LI_ABORTED); + return XFS_ERROR(EIO); +} + +/* + * Conditionally push the CIL based on the sequence passed in. + * + * We only need to push if we haven't already pushed the sequence + * number given. Hence the only time we will trigger a push here is + * if the push sequence is the same as the current context. + * + * We return the current commit lsn to allow the callers to determine if a + * iclog flush is necessary following this call. + * + * XXX: Initially, just push the CIL unconditionally and return whatever + * commit lsn is there. It'll be empty, so this is broken for now. + */ +xfs_lsn_t +xlog_cil_push_lsn( + struct log *log, + xfs_lsn_t push_seq) +{ + struct xfs_cil *cil = log->l_cilp; + struct xfs_cil_ctx *ctx; + xfs_lsn_t commit_lsn = NULLCOMMITLSN; + +restart: + down_write(&cil->xc_ctx_lock); + ASSERT(push_seq <= cil->xc_ctx->sequence); + + /* check to see if we need to force out the current context */ + if (push_seq == cil->xc_ctx->sequence) { + up_write(&cil->xc_ctx_lock); + xlog_cil_push(log, 1); + goto restart; + } + + /* + * See if we can find a previous sequence still committing. + * We can drop the flush lock as soon as we have the cil lock + * because we are now only comparing contexts protected by + * the cil lock. + * + * We need to wait for all previous sequence commits to complete + * before allowing the force of push_seq to go ahead. Hence block + * on commits for those as well. + */ + spin_lock(&cil->xc_cil_lock); + up_write(&cil->xc_ctx_lock); + list_for_each_entry(ctx, &cil->xc_committing, committing) { + if (ctx->sequence > push_seq) + continue; + if (!ctx->commit_lsn) { + /* + * It is still being pushed! Wait for the push to + * complete, then start again from the beginning. + */ + sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0); + goto restart; + } + if (ctx->sequence != push_seq) + continue; + /* found it! */ + commit_lsn = ctx->commit_lsn; + } + spin_unlock(&cil->xc_cil_lock); + return commit_lsn; +} + +/* + * Check if the current log item was first committed in this sequence. + * We can't rely on just the log item being in the CIL, we have to check + * the recorded commit sequence number. + * + * Note: for this to be used in a non-racy manner, it has to be called with + * CIL flushing locked out. As a result, it should only be used during the + * transaction commit process when deciding what to format into the item. + */ +bool +xfs_log_item_in_current_chkpt( + struct xfs_log_item *lip) +{ + struct xfs_cil_ctx *ctx; + + if (!(lip->li_mountp->m_flags & XFS_MOUNT_DELAYLOG)) + return false; + if (list_empty(&lip->li_cil)) + return false; + + ctx = lip->li_mountp->m_log->l_cilp->xc_ctx; + + /* + * li_seq is written on the first commit of a log item to record the + * first checkpoint it is written to. Hence if it is different to the + * current sequence, we're in a new checkpoint. + */ + if (XFS_LSN_CMP(lip->li_seq, ctx->sequence) != 0) + return false; + return true; +} diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index fd02a18facd..8c072618965 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -152,8 +152,6 @@ static inline uint xlog_get_client_id(__be32 i) #define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */ #define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being shutdown */ -typedef __uint32_t xlog_tid_t; - #ifdef __KERNEL__ /* @@ -379,6 +377,99 @@ typedef struct xlog_in_core { } xlog_in_core_t; /* + * The CIL context is used to aggregate per-transaction details as well be + * passed to the iclog for checkpoint post-commit processing. After being + * passed to the iclog, another context needs to be allocated for tracking the + * next set of transactions to be aggregated into a checkpoint. + */ +struct xfs_cil; + +struct xfs_cil_ctx { + struct xfs_cil *cil; + xfs_lsn_t sequence; /* chkpt sequence # */ + xfs_lsn_t start_lsn; /* first LSN of chkpt commit */ + xfs_lsn_t commit_lsn; /* chkpt commit record lsn */ + struct xlog_ticket *ticket; /* chkpt ticket */ + int nvecs; /* number of regions */ + int space_used; /* aggregate size of regions */ + struct list_head busy_extents; /* busy extents in chkpt */ + struct xfs_log_vec *lv_chain; /* logvecs being pushed */ + xfs_log_callback_t log_cb; /* completion callback hook. */ + struct list_head committing; /* ctx committing list */ +}; + +/* + * Committed Item List structure + * + * This structure is used to track log items that have been committed but not + * yet written into the log. It is used only when the delayed logging mount + * option is enabled. + * + * This structure tracks the list of committing checkpoint contexts so + * we can avoid the problem of having to hold out new transactions during a + * flush until we have a the commit record LSN of the checkpoint. We can + * traverse the list of committing contexts in xlog_cil_push_lsn() to find a + * sequence match and extract the commit LSN directly from there. If the + * checkpoint is still in the process of committing, we can block waiting for + * the commit LSN to be determined as well. This should make synchronous + * operations almost as efficient as the old logging methods. + */ +struct xfs_cil { + struct log *xc_log; + struct list_head xc_cil; + spinlock_t xc_cil_lock; + struct xfs_cil_ctx *xc_ctx; + struct rw_semaphore xc_ctx_lock; + struct list_head xc_committing; + sv_t xc_commit_wait; +}; + +/* + * The amount of log space we should the CIL to aggregate is difficult to size. + * Whatever we chose we have to make we can get a reservation for the log space + * effectively, that it is large enough to capture sufficient relogging to + * reduce log buffer IO significantly, but it is not too large for the log or + * induces too much latency when writing out through the iclogs. We track both + * space consumed and the number of vectors in the checkpoint context, so we + * need to decide which to use for limiting. + * + * Every log buffer we write out during a push needs a header reserved, which + * is at least one sector and more for v2 logs. Hence we need a reservation of + * at least 512 bytes per 32k of log space just for the LR headers. That means + * 16KB of reservation per megabyte of delayed logging space we will consume, + * plus various headers. The number of headers will vary based on the num of + * io vectors, so limiting on a specific number of vectors is going to result + * in transactions of varying size. IOWs, it is more consistent to track and + * limit space consumed in the log rather than by the number of objects being + * logged in order to prevent checkpoint ticket overruns. + * + * Further, use of static reservations through the log grant mechanism is + * problematic. It introduces a lot of complexity (e.g. reserve grant vs write + * grant) and a significant deadlock potential because regranting write space + * can block on log pushes. Hence if we have to regrant log space during a log + * push, we can deadlock. + * + * However, we can avoid this by use of a dynamic "reservation stealing" + * technique during transaction commit whereby unused reservation space in the + * transaction ticket is transferred to the CIL ctx commit ticket to cover the + * space needed by the checkpoint transaction. This means that we never need to + * specifically reserve space for the CIL checkpoint transaction, nor do we + * need to regrant space once the checkpoint completes. This also means the + * checkpoint transaction ticket is specific to the checkpoint context, rather + * than the CIL itself. + * + * With dynamic reservations, we can basically make up arbitrary limits for the + * checkpoint size so long as they don't violate any other size rules. Hence + * the initial maximum size for the checkpoint transaction will be set to a + * quarter of the log or 8MB, which ever is smaller. 8MB is an arbitrary limit + * right now based on the latency of writing out a large amount of data through + * the circular iclog buffers. + */ + +#define XLOG_CIL_SPACE_LIMIT(log) \ + (min((log->l_logsize >> 2), (8 * 1024 * 1024))) + +/* * The reservation head lsn is not made up of a cycle number and block number. * Instead, it uses a cycle number and byte number. Logs don't expect to * overflow 31 bits worth of byte offset, so using a byte number will mean @@ -388,6 +479,7 @@ typedef struct log { /* The following fields don't need locking */ struct xfs_mount *l_mp; /* mount point */ struct xfs_ail *l_ailp; /* AIL log is working with */ + struct xfs_cil *l_cilp; /* CIL log is working with */ struct xfs_buf *l_xbuf; /* extra buffer for log * wrapping */ struct xfs_buftarg *l_targ; /* buftarg of log */ @@ -396,9 +488,7 @@ typedef struct log { struct xfs_buf_cancel **l_buf_cancel_table; int l_iclog_hsize; /* size of iclog header */ int l_iclog_heads; /* # of iclog header sectors */ - uint l_sectbb_log; /* log2 of sector size in BBs */ - uint l_sectbb_mask; /* sector size (in BBs) - * alignment mask */ + uint l_sectBBsize; /* sector size in BBs (2^n) */ int l_iclog_size; /* size of log in bytes */ int l_iclog_size_log; /* log power size of log */ int l_iclog_bufs; /* number of iclog buffers */ @@ -440,14 +530,40 @@ typedef struct log { #define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR) - /* common routines */ extern xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp); extern int xlog_recover(xlog_t *log); extern int xlog_recover_finish(xlog_t *log); extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int); -extern kmem_zone_t *xfs_log_ticket_zone; +extern kmem_zone_t *xfs_log_ticket_zone; +struct xlog_ticket *xlog_ticket_alloc(struct log *log, int unit_bytes, + int count, char client, uint xflags, + int alloc_flags); + + +static inline void +xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes) +{ + *ptr += bytes; + *len -= bytes; + *off += bytes; +} + +void xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket); +int xlog_write(struct log *log, struct xfs_log_vec *log_vector, + struct xlog_ticket *tic, xfs_lsn_t *start_lsn, + xlog_in_core_t **commit_iclog, uint flags); + +/* + * Committed Item List interfaces + */ +int xlog_cil_init(struct log *log); +void xlog_cil_init_post_recovery(struct log *log); +void xlog_cil_destroy(struct log *log); + +int xlog_cil_push(struct log *log, int push_now); +xfs_lsn_t xlog_cil_push_lsn(struct log *log, xfs_lsn_t push_sequence); /* * Unmount record type is used as a pseudo transaction type for the ticket. diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 22e6efdc17e..14a69aec2c0 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -56,33 +56,61 @@ STATIC void xlog_recover_check_summary(xlog_t *); #define xlog_recover_check_summary(log) #endif - /* * Sector aligned buffer routines for buffer create/read/write/access */ -#define XLOG_SECTOR_ROUNDUP_BBCOUNT(log, bbs) \ - ( ((log)->l_sectbb_mask && (bbs & (log)->l_sectbb_mask)) ? \ - ((bbs + (log)->l_sectbb_mask + 1) & ~(log)->l_sectbb_mask) : (bbs) ) -#define XLOG_SECTOR_ROUNDDOWN_BLKNO(log, bno) ((bno) & ~(log)->l_sectbb_mask) +/* + * Verify the given count of basic blocks is valid number of blocks + * to specify for an operation involving the given XFS log buffer. + * Returns nonzero if the count is valid, 0 otherwise. + */ +static inline int +xlog_buf_bbcount_valid( + xlog_t *log, + int bbcount) +{ + return bbcount > 0 && bbcount <= log->l_logBBsize; +} + +/* + * Allocate a buffer to hold log data. The buffer needs to be able + * to map to a range of nbblks basic blocks at any valid (basic + * block) offset within the log. + */ STATIC xfs_buf_t * xlog_get_bp( xlog_t *log, int nbblks) { - if (nbblks <= 0 || nbblks > log->l_logBBsize) { - xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks); - XFS_ERROR_REPORT("xlog_get_bp(1)", - XFS_ERRLEVEL_HIGH, log->l_mp); + if (!xlog_buf_bbcount_valid(log, nbblks)) { + xlog_warn("XFS: Invalid block length (0x%x) given for buffer", + nbblks); + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); return NULL; } - if (log->l_sectbb_log) { - if (nbblks > 1) - nbblks += XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1); - nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks); - } + /* + * We do log I/O in units of log sectors (a power-of-2 + * multiple of the basic block size), so we round up the + * requested size to acommodate the basic blocks required + * for complete log sectors. + * + * In addition, the buffer may be used for a non-sector- + * aligned block offset, in which case an I/O of the + * requested size could extend beyond the end of the + * buffer. If the requested size is only 1 basic block it + * will never straddle a sector boundary, so this won't be + * an issue. Nor will this be a problem if the log I/O is + * done in basic blocks (sector size 1). But otherwise we + * extend the buffer by one extra log sector to ensure + * there's space to accomodate this possiblility. + */ + if (nbblks > 1 && log->l_sectBBsize > 1) + nbblks += log->l_sectBBsize; + nbblks = round_up(nbblks, log->l_sectBBsize); + return xfs_buf_get_noaddr(BBTOB(nbblks), log->l_mp->m_logdev_targp); } @@ -93,6 +121,10 @@ xlog_put_bp( xfs_buf_free(bp); } +/* + * Return the address of the start of the given block number's data + * in a log buffer. The buffer covers a log sector-aligned region. + */ STATIC xfs_caddr_t xlog_align( xlog_t *log, @@ -100,14 +132,14 @@ xlog_align( int nbblks, xfs_buf_t *bp) { + xfs_daddr_t offset; xfs_caddr_t ptr; - if (!log->l_sectbb_log) - return XFS_BUF_PTR(bp); + offset = blk_no & ((xfs_daddr_t) log->l_sectBBsize - 1); + ptr = XFS_BUF_PTR(bp) + BBTOB(offset); + + ASSERT(ptr + BBTOB(nbblks) <= XFS_BUF_PTR(bp) + XFS_BUF_SIZE(bp)); - ptr = XFS_BUF_PTR(bp) + BBTOB((int)blk_no & log->l_sectbb_mask); - ASSERT(XFS_BUF_SIZE(bp) >= - BBTOB(nbblks + (blk_no & log->l_sectbb_mask))); return ptr; } @@ -124,21 +156,18 @@ xlog_bread_noalign( { int error; - if (nbblks <= 0 || nbblks > log->l_logBBsize) { - xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks); - XFS_ERROR_REPORT("xlog_bread(1)", - XFS_ERRLEVEL_HIGH, log->l_mp); + if (!xlog_buf_bbcount_valid(log, nbblks)) { + xlog_warn("XFS: Invalid block length (0x%x) given for buffer", + nbblks); + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); return EFSCORRUPTED; } - if (log->l_sectbb_log) { - blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no); - nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks); - } + blk_no = round_down(blk_no, log->l_sectBBsize); + nbblks = round_up(nbblks, log->l_sectBBsize); ASSERT(nbblks > 0); ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp)); - ASSERT(bp); XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no); XFS_BUF_READ(bp); @@ -186,17 +215,15 @@ xlog_bwrite( { int error; - if (nbblks <= 0 || nbblks > log->l_logBBsize) { - xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks); - XFS_ERROR_REPORT("xlog_bwrite(1)", - XFS_ERRLEVEL_HIGH, log->l_mp); + if (!xlog_buf_bbcount_valid(log, nbblks)) { + xlog_warn("XFS: Invalid block length (0x%x) given for buffer", + nbblks); + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); return EFSCORRUPTED; } - if (log->l_sectbb_log) { - blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no); - nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks); - } + blk_no = round_down(blk_no, log->l_sectBBsize); + nbblks = round_up(nbblks, log->l_sectBBsize); ASSERT(nbblks > 0); ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp)); @@ -327,39 +354,38 @@ xlog_find_cycle_start( { xfs_caddr_t offset; xfs_daddr_t mid_blk; + xfs_daddr_t end_blk; uint mid_cycle; int error; - mid_blk = BLK_AVG(first_blk, *last_blk); - while (mid_blk != first_blk && mid_blk != *last_blk) { + end_blk = *last_blk; + mid_blk = BLK_AVG(first_blk, end_blk); + while (mid_blk != first_blk && mid_blk != end_blk) { error = xlog_bread(log, mid_blk, 1, bp, &offset); if (error) return error; mid_cycle = xlog_get_cycle(offset); - if (mid_cycle == cycle) { - *last_blk = mid_blk; - /* last_half_cycle == mid_cycle */ - } else { - first_blk = mid_blk; - /* first_half_cycle == mid_cycle */ - } - mid_blk = BLK_AVG(first_blk, *last_blk); + if (mid_cycle == cycle) + end_blk = mid_blk; /* last_half_cycle == mid_cycle */ + else + first_blk = mid_blk; /* first_half_cycle == mid_cycle */ + mid_blk = BLK_AVG(first_blk, end_blk); } - ASSERT((mid_blk == first_blk && mid_blk+1 == *last_blk) || - (mid_blk == *last_blk && mid_blk-1 == first_blk)); + ASSERT((mid_blk == first_blk && mid_blk+1 == end_blk) || + (mid_blk == end_blk && mid_blk-1 == first_blk)); + + *last_blk = end_blk; return 0; } /* - * Check that the range of blocks does not contain the cycle number - * given. The scan needs to occur from front to back and the ptr into the - * region must be updated since a later routine will need to perform another - * test. If the region is completely good, we end up returning the same - * last block number. - * - * Set blkno to -1 if we encounter no errors. This is an invalid block number - * since we don't ever expect logs to get this large. + * Check that a range of blocks does not contain stop_on_cycle_no. + * Fill in *new_blk with the block offset where such a block is + * found, or with -1 (an invalid block number) if there is no such + * block in the range. The scan needs to occur from front to back + * and the pointer into the region must be updated since a later + * routine will need to perform another test. */ STATIC int xlog_find_verify_cycle( @@ -376,12 +402,16 @@ xlog_find_verify_cycle( xfs_caddr_t buf = NULL; int error = 0; + /* + * Greedily allocate a buffer big enough to handle the full + * range of basic blocks we'll be examining. If that fails, + * try a smaller size. We need to be able to read at least + * a log sector, or we're out of luck. + */ bufblks = 1 << ffs(nbblks); - while (!(bp = xlog_get_bp(log, bufblks))) { - /* can't get enough memory to do everything in one big buffer */ bufblks >>= 1; - if (bufblks <= log->l_sectbb_log) + if (bufblks < log->l_sectBBsize) return ENOMEM; } @@ -629,7 +659,7 @@ xlog_find_head( * In this case we want to find the first block with cycle * number matching last_half_cycle. We expect the log to be * some variation on - * x + 1 ... | x ... + * x + 1 ... | x ... | x * The first block with cycle number x (last_half_cycle) will * be where the new head belongs. First we do a binary search * for the first occurrence of last_half_cycle. The binary @@ -639,11 +669,13 @@ xlog_find_head( * the log, then we look for occurrences of last_half_cycle - 1 * at the end of the log. The cases we're looking for look * like - * x + 1 ... | x | x + 1 | x ... - * ^ binary search stopped here + * v binary search stopped here + * x + 1 ... | x | x + 1 | x ... | x + * ^ but we want to locate this spot * or - * x + 1 ... | x ... | x - 1 | x * <---------> less than scan distance + * x + 1 ... | x ... | x - 1 | x + * ^ we want to locate this spot */ stop_on_cycle = last_half_cycle; if ((error = xlog_find_cycle_start(log, bp, first_blk, @@ -699,16 +731,16 @@ xlog_find_head( * certainly not the head of the log. By searching for * last_half_cycle-1 we accomplish that. */ - start_blk = log_bbnum - num_scan_bblks + head_blk; ASSERT(head_blk <= INT_MAX && - (xfs_daddr_t) num_scan_bblks - head_blk >= 0); + (xfs_daddr_t) num_scan_bblks >= head_blk); + start_blk = log_bbnum - (num_scan_bblks - head_blk); if ((error = xlog_find_verify_cycle(log, start_blk, num_scan_bblks - (int)head_blk, (stop_on_cycle - 1), &new_blk))) goto bp_err; if (new_blk != -1) { head_blk = new_blk; - goto bad_blk; + goto validate_head; } /* @@ -726,7 +758,7 @@ xlog_find_head( head_blk = new_blk; } - bad_blk: +validate_head: /* * Now we need to make sure head_blk is not pointing to a block in * the middle of a log record. @@ -748,7 +780,7 @@ xlog_find_head( if ((error = xlog_find_verify_log_record(log, start_blk, &head_blk, 0)) == -1) { /* We hit the beginning of the log during our search */ - start_blk = log_bbnum - num_scan_bblks + head_blk; + start_blk = log_bbnum - (num_scan_bblks - head_blk); new_blk = log_bbnum; ASSERT(start_blk <= INT_MAX && (xfs_daddr_t) log_bbnum-start_blk >= 0); @@ -833,12 +865,12 @@ xlog_find_tail( if (*head_blk == 0) { /* special case */ error = xlog_bread(log, 0, 1, bp, &offset); if (error) - goto bread_err; + goto done; if (xlog_get_cycle(offset) == 0) { *tail_blk = 0; /* leave all other log inited values alone */ - goto exit; + goto done; } } @@ -849,7 +881,7 @@ xlog_find_tail( for (i = (int)(*head_blk) - 1; i >= 0; i--) { error = xlog_bread(log, i, 1, bp, &offset); if (error) - goto bread_err; + goto done; if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(*(__be32 *)offset)) { found = 1; @@ -866,7 +898,7 @@ xlog_find_tail( for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) { error = xlog_bread(log, i, 1, bp, &offset); if (error) - goto bread_err; + goto done; if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(*(__be32 *)offset)) { @@ -941,7 +973,7 @@ xlog_find_tail( umount_data_blk = (i + hblks) % log->l_logBBsize; error = xlog_bread(log, umount_data_blk, 1, bp, &offset); if (error) - goto bread_err; + goto done; op_head = (xlog_op_header_t *)offset; if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) { @@ -987,12 +1019,10 @@ xlog_find_tail( * But... if the -device- itself is readonly, just skip this. * We can't recover this device anyway, so it won't matter. */ - if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp)) { + if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp)) error = xlog_clear_stale_blocks(log, tail_lsn); - } -bread_err: -exit: +done: xlog_put_bp(bp); if (error) @@ -1152,16 +1182,22 @@ xlog_write_log_records( xfs_caddr_t offset; xfs_buf_t *bp; int balign, ealign; - int sectbb = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1); + int sectbb = log->l_sectBBsize; int end_block = start_block + blocks; int bufblks; int error = 0; int i, j = 0; + /* + * Greedily allocate a buffer big enough to handle the full + * range of basic blocks to be written. If that fails, try + * a smaller size. We need to be able to write at least a + * log sector, or we're out of luck. + */ bufblks = 1 << ffs(blocks); while (!(bp = xlog_get_bp(log, bufblks))) { bufblks >>= 1; - if (bufblks <= log->l_sectbb_log) + if (bufblks < sectbb) return ENOMEM; } @@ -1169,7 +1205,7 @@ xlog_write_log_records( * the buffer in the starting sector not covered by the first * write below. */ - balign = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, start_block); + balign = round_down(start_block, sectbb); if (balign != start_block) { error = xlog_bread_noalign(log, start_block, 1, bp); if (error) @@ -1188,7 +1224,7 @@ xlog_write_log_records( * the buffer in the final sector not covered by the write. * If this is the same sector as the above read, skip it. */ - ealign = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, end_block); + ealign = round_down(end_block, sectbb); if (j == 0 && (start_block + endcount > ealign)) { offset = XFS_BUF_PTR(bp); balign = BBTOB(ealign - start_block); @@ -1408,6 +1444,7 @@ xlog_recover_add_item( STATIC int xlog_recover_add_to_cont_trans( + struct log *log, xlog_recover_t *trans, xfs_caddr_t dp, int len) @@ -1434,6 +1471,7 @@ xlog_recover_add_to_cont_trans( memcpy(&ptr[old_len], dp, len); /* d, s, l */ item->ri_buf[item->ri_cnt-1].i_len += len; item->ri_buf[item->ri_cnt-1].i_addr = ptr; + trace_xfs_log_recover_item_add_cont(log, trans, item, 0); return 0; } @@ -1452,6 +1490,7 @@ xlog_recover_add_to_cont_trans( */ STATIC int xlog_recover_add_to_trans( + struct log *log, xlog_recover_t *trans, xfs_caddr_t dp, int len) @@ -1510,6 +1549,7 @@ xlog_recover_add_to_trans( item->ri_buf[item->ri_cnt].i_addr = ptr; item->ri_buf[item->ri_cnt].i_len = len; item->ri_cnt++; + trace_xfs_log_recover_item_add(log, trans, item, 0); return 0; } @@ -1521,7 +1561,9 @@ xlog_recover_add_to_trans( */ STATIC int xlog_recover_reorder_trans( - xlog_recover_t *trans) + struct log *log, + xlog_recover_t *trans, + int pass) { xlog_recover_item_t *item, *n; LIST_HEAD(sort_list); @@ -1534,7 +1576,9 @@ xlog_recover_reorder_trans( switch (ITEM_TYPE(item)) { case XFS_LI_BUF: - if (!(buf_f->blf_flags & XFS_BLI_CANCEL)) { + if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) { + trace_xfs_log_recover_item_reorder_head(log, + trans, item, pass); list_move(&item->ri_list, &trans->r_itemq); break; } @@ -1543,6 +1587,8 @@ xlog_recover_reorder_trans( case XFS_LI_QUOTAOFF: case XFS_LI_EFD: case XFS_LI_EFI: + trace_xfs_log_recover_item_reorder_tail(log, + trans, item, pass); list_move_tail(&item->ri_list, &trans->r_itemq); break; default: @@ -1592,8 +1638,10 @@ xlog_recover_do_buffer_pass1( /* * If this isn't a cancel buffer item, then just return. */ - if (!(flags & XFS_BLI_CANCEL)) + if (!(flags & XFS_BLF_CANCEL)) { + trace_xfs_log_recover_buf_not_cancel(log, buf_f); return; + } /* * Insert an xfs_buf_cancel record into the hash table of @@ -1627,6 +1675,7 @@ xlog_recover_do_buffer_pass1( while (nextp != NULL) { if (nextp->bc_blkno == blkno && nextp->bc_len == len) { nextp->bc_refcount++; + trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f); return; } prevp = nextp; @@ -1640,13 +1689,14 @@ xlog_recover_do_buffer_pass1( bcp->bc_refcount = 1; bcp->bc_next = NULL; prevp->bc_next = bcp; + trace_xfs_log_recover_buf_cancel_add(log, buf_f); } /* * Check to see whether the buffer being recovered has a corresponding * entry in the buffer cancel record table. If it does then return 1 * so that it will be cancelled, otherwise return 0. If the buffer is - * actually a buffer cancel item (XFS_BLI_CANCEL is set), then decrement + * actually a buffer cancel item (XFS_BLF_CANCEL is set), then decrement * the refcount on the entry in the table and remove it from the table * if this is the last reference. * @@ -1671,7 +1721,7 @@ xlog_check_buffer_cancelled( * There is nothing in the table built in pass one, * so this buffer must not be cancelled. */ - ASSERT(!(flags & XFS_BLI_CANCEL)); + ASSERT(!(flags & XFS_BLF_CANCEL)); return 0; } @@ -1683,7 +1733,7 @@ xlog_check_buffer_cancelled( * There is no corresponding entry in the table built * in pass one, so this buffer has not been cancelled. */ - ASSERT(!(flags & XFS_BLI_CANCEL)); + ASSERT(!(flags & XFS_BLF_CANCEL)); return 0; } @@ -1702,7 +1752,7 @@ xlog_check_buffer_cancelled( * one in the table and remove it if this is the * last reference. */ - if (flags & XFS_BLI_CANCEL) { + if (flags & XFS_BLF_CANCEL) { bcp->bc_refcount--; if (bcp->bc_refcount == 0) { if (prevp == NULL) { @@ -1722,7 +1772,7 @@ xlog_check_buffer_cancelled( * We didn't find a corresponding entry in the table, so * return 0 so that the buffer is NOT cancelled. */ - ASSERT(!(flags & XFS_BLI_CANCEL)); + ASSERT(!(flags & XFS_BLF_CANCEL)); return 0; } @@ -1779,6 +1829,8 @@ xlog_recover_do_inode_buffer( unsigned int *data_map = NULL; unsigned int map_size = 0; + trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f); + switch (buf_f->blf_type) { case XFS_LI_BUF: data_map = buf_f->blf_data_map; @@ -1822,8 +1874,8 @@ xlog_recover_do_inode_buffer( nbits = xfs_contig_bits(data_map, map_size, bit); ASSERT(nbits > 0); - reg_buf_offset = bit << XFS_BLI_SHIFT; - reg_buf_bytes = nbits << XFS_BLI_SHIFT; + reg_buf_offset = bit << XFS_BLF_SHIFT; + reg_buf_bytes = nbits << XFS_BLF_SHIFT; item_index++; } @@ -1837,7 +1889,7 @@ xlog_recover_do_inode_buffer( } ASSERT(item->ri_buf[item_index].i_addr != NULL); - ASSERT((item->ri_buf[item_index].i_len % XFS_BLI_CHUNK) == 0); + ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0); ASSERT((reg_buf_offset + reg_buf_bytes) <= XFS_BUF_COUNT(bp)); /* @@ -1874,6 +1926,7 @@ xlog_recover_do_inode_buffer( /*ARGSUSED*/ STATIC void xlog_recover_do_reg_buffer( + struct xfs_mount *mp, xlog_recover_item_t *item, xfs_buf_t *bp, xfs_buf_log_format_t *buf_f) @@ -1885,6 +1938,8 @@ xlog_recover_do_reg_buffer( unsigned int map_size = 0; int error; + trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f); + switch (buf_f->blf_type) { case XFS_LI_BUF: data_map = buf_f->blf_data_map; @@ -1900,9 +1955,9 @@ xlog_recover_do_reg_buffer( nbits = xfs_contig_bits(data_map, map_size, bit); ASSERT(nbits > 0); ASSERT(item->ri_buf[i].i_addr != NULL); - ASSERT(item->ri_buf[i].i_len % XFS_BLI_CHUNK == 0); + ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0); ASSERT(XFS_BUF_COUNT(bp) >= - ((uint)bit << XFS_BLI_SHIFT)+(nbits<<XFS_BLI_SHIFT)); + ((uint)bit << XFS_BLF_SHIFT)+(nbits<<XFS_BLF_SHIFT)); /* * Do a sanity check if this is a dquot buffer. Just checking @@ -1911,7 +1966,7 @@ xlog_recover_do_reg_buffer( */ error = 0; if (buf_f->blf_flags & - (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) { + (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { if (item->ri_buf[i].i_addr == NULL) { cmn_err(CE_ALERT, "XFS: NULL dquot in %s.", __func__); @@ -1932,9 +1987,9 @@ xlog_recover_do_reg_buffer( } memcpy(xfs_buf_offset(bp, - (uint)bit << XFS_BLI_SHIFT), /* dest */ + (uint)bit << XFS_BLF_SHIFT), /* dest */ item->ri_buf[i].i_addr, /* source */ - nbits<<XFS_BLI_SHIFT); /* length */ + nbits<<XFS_BLF_SHIFT); /* length */ next: i++; bit += nbits; @@ -2083,6 +2138,8 @@ xlog_recover_do_dquot_buffer( { uint type; + trace_xfs_log_recover_buf_dquot_buf(log, buf_f); + /* * Filesystems are required to send in quota flags at mount time. */ @@ -2091,11 +2148,11 @@ xlog_recover_do_dquot_buffer( } type = 0; - if (buf_f->blf_flags & XFS_BLI_UDQUOT_BUF) + if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF) type |= XFS_DQ_USER; - if (buf_f->blf_flags & XFS_BLI_PDQUOT_BUF) + if (buf_f->blf_flags & XFS_BLF_PDQUOT_BUF) type |= XFS_DQ_PROJ; - if (buf_f->blf_flags & XFS_BLI_GDQUOT_BUF) + if (buf_f->blf_flags & XFS_BLF_GDQUOT_BUF) type |= XFS_DQ_GROUP; /* * This type of quotas was turned off, so ignore this buffer @@ -2103,7 +2160,7 @@ xlog_recover_do_dquot_buffer( if (log->l_quotaoffs_flag & type) return; - xlog_recover_do_reg_buffer(item, bp, buf_f); + xlog_recover_do_reg_buffer(mp, item, bp, buf_f); } /* @@ -2116,7 +2173,7 @@ xlog_recover_do_dquot_buffer( * here which overlaps that may be stale. * * When meta-data buffers are freed at run time we log a buffer item - * with the XFS_BLI_CANCEL bit set to indicate that previous copies + * with the XFS_BLF_CANCEL bit set to indicate that previous copies * of the buffer in the log should not be replayed at recovery time. * This is so that if the blocks covered by the buffer are reused for * file data before we crash we don't end up replaying old, freed @@ -2150,7 +2207,7 @@ xlog_recover_do_buffer_trans( if (pass == XLOG_RECOVER_PASS1) { /* * In this pass we're only looking for buf items - * with the XFS_BLI_CANCEL bit set. + * with the XFS_BLF_CANCEL bit set. */ xlog_recover_do_buffer_pass1(log, buf_f); return 0; @@ -2164,9 +2221,11 @@ xlog_recover_do_buffer_trans( */ cancel = xlog_recover_do_buffer_pass2(log, buf_f); if (cancel) { + trace_xfs_log_recover_buf_cancel(log, buf_f); return 0; } } + trace_xfs_log_recover_buf_recover(log, buf_f); switch (buf_f->blf_type) { case XFS_LI_BUF: blkno = buf_f->blf_blkno; @@ -2185,7 +2244,7 @@ xlog_recover_do_buffer_trans( mp = log->l_mp; buf_flags = XBF_LOCK; - if (!(flags & XFS_BLI_INODE_BUF)) + if (!(flags & XFS_BLF_INODE_BUF)) buf_flags |= XBF_MAPPED; bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, buf_flags); @@ -2198,13 +2257,13 @@ xlog_recover_do_buffer_trans( } error = 0; - if (flags & XFS_BLI_INODE_BUF) { + if (flags & XFS_BLF_INODE_BUF) { error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); } else if (flags & - (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) { + (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f); } else { - xlog_recover_do_reg_buffer(item, bp, buf_f); + xlog_recover_do_reg_buffer(mp, item, bp, buf_f); } if (error) return XFS_ERROR(error); @@ -2284,8 +2343,10 @@ xlog_recover_do_inode_trans( if (xlog_check_buffer_cancelled(log, in_f->ilf_blkno, in_f->ilf_len, 0)) { error = 0; + trace_xfs_log_recover_inode_cancel(log, in_f); goto error; } + trace_xfs_log_recover_inode_recover(log, in_f); bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, XBF_LOCK); @@ -2337,6 +2398,7 @@ xlog_recover_do_inode_trans( /* do nothing */ } else { xfs_buf_relse(bp); + trace_xfs_log_recover_inode_skip(log, in_f); error = 0; goto error; } @@ -2758,11 +2820,12 @@ xlog_recover_do_trans( int error = 0; xlog_recover_item_t *item; - error = xlog_recover_reorder_trans(trans); + error = xlog_recover_reorder_trans(log, trans, pass); if (error) return error; list_for_each_entry(item, &trans->r_itemq, ri_list) { + trace_xfs_log_recover_item_recover(log, trans, item, pass); switch (ITEM_TYPE(item)) { case XFS_LI_BUF: error = xlog_recover_do_buffer_trans(log, item, pass); @@ -2919,8 +2982,9 @@ xlog_recover_process_data( error = xlog_recover_unmount_trans(trans); break; case XLOG_WAS_CONT_TRANS: - error = xlog_recover_add_to_cont_trans(trans, - dp, be32_to_cpu(ohead->oh_len)); + error = xlog_recover_add_to_cont_trans(log, + trans, dp, + be32_to_cpu(ohead->oh_len)); break; case XLOG_START_TRANS: xlog_warn( @@ -2930,7 +2994,7 @@ xlog_recover_process_data( break; case 0: case XLOG_CONTINUE_TRANS: - error = xlog_recover_add_to_trans(trans, + error = xlog_recover_add_to_trans(log, trans, dp, be32_to_cpu(ohead->oh_len)); break; default: @@ -3331,42 +3395,6 @@ xlog_pack_data( } } -#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY) -STATIC void -xlog_unpack_data_checksum( - xlog_rec_header_t *rhead, - xfs_caddr_t dp, - xlog_t *log) -{ - __be32 *up = (__be32 *)dp; - uint chksum = 0; - int i; - - /* divide length by 4 to get # words */ - for (i=0; i < be32_to_cpu(rhead->h_len) >> 2; i++) { - chksum ^= be32_to_cpu(*up); - up++; - } - if (chksum != be32_to_cpu(rhead->h_chksum)) { - if (rhead->h_chksum || - ((log->l_flags & XLOG_CHKSUM_MISMATCH) == 0)) { - cmn_err(CE_DEBUG, - "XFS: LogR chksum mismatch: was (0x%x) is (0x%x)\n", - be32_to_cpu(rhead->h_chksum), chksum); - cmn_err(CE_DEBUG, -"XFS: Disregard message if filesystem was created with non-DEBUG kernel"); - if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { - cmn_err(CE_DEBUG, - "XFS: LogR this is a LogV2 filesystem\n"); - } - log->l_flags |= XLOG_CHKSUM_MISMATCH; - } - } -} -#else -#define xlog_unpack_data_checksum(rhead, dp, log) -#endif - STATIC void xlog_unpack_data( xlog_rec_header_t *rhead, @@ -3390,8 +3418,6 @@ xlog_unpack_data( dp += BBSIZE; } } - - xlog_unpack_data_checksum(rhead, dp, log); } STATIC int @@ -3490,7 +3516,7 @@ xlog_do_recovery_pass( hblks = 1; } } else { - ASSERT(log->l_sectbb_log == 0); + ASSERT(log->l_sectBBsize == 1); hblks = 1; hbp = xlog_get_bp(log, 1); h_size = XLOG_BIG_RECORD_BSIZE; @@ -3946,10 +3972,6 @@ xlog_recover_check_summary( xfs_agf_t *agfp; xfs_buf_t *agfbp; xfs_buf_t *agibp; - xfs_buf_t *sbbp; -#ifdef XFS_LOUD_RECOVERY - xfs_sb_t *sbp; -#endif xfs_agnumber_t agno; __uint64_t freeblks; __uint64_t itotal; @@ -3984,30 +4006,5 @@ xlog_recover_check_summary( xfs_buf_relse(agibp); } } - - sbbp = xfs_getsb(mp, 0); -#ifdef XFS_LOUD_RECOVERY - sbp = &mp->m_sb; - xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(sbbp)); - cmn_err(CE_NOTE, - "xlog_recover_check_summary: sb_icount %Lu itotal %Lu", - sbp->sb_icount, itotal); - cmn_err(CE_NOTE, - "xlog_recover_check_summary: sb_ifree %Lu itotal %Lu", - sbp->sb_ifree, ifree); - cmn_err(CE_NOTE, - "xlog_recover_check_summary: sb_fdblocks %Lu freeblks %Lu", - sbp->sb_fdblocks, freeblks); -#if 0 - /* - * This is turned off until I account for the allocation - * btree blocks which live in free space. - */ - ASSERT(sbp->sb_icount == itotal); - ASSERT(sbp->sb_ifree == ifree); - ASSERT(sbp->sb_fdblocks == freeblks); -#endif -#endif - xfs_buf_relse(sbbp); } #endif /* DEBUG */ diff --git a/fs/xfs/xfs_log_recover.h b/fs/xfs/xfs_log_recover.h index 75d74920725..1c55ccbb379 100644 --- a/fs/xfs/xfs_log_recover.h +++ b/fs/xfs/xfs_log_recover.h @@ -28,7 +28,7 @@ #define XLOG_RHASH(tid) \ ((((__uint32_t)tid)>>XLOG_RHASH_SHIFT) & (XLOG_RHASH_SIZE-1)) -#define XLOG_MAX_REGIONS_IN_ITEM (XFS_MAX_BLOCKSIZE / XFS_BLI_CHUNK / 2 + 1) +#define XLOG_MAX_REGIONS_IN_ITEM (XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK / 2 + 1) /* diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index e79b56b4bca..d7bf38c8cd1 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1405,13 +1405,6 @@ xfs_mountfs( xfs_qm_mount_quotas(mp); } -#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY) - if (XFS_IS_QUOTA_ON(mp)) - xfs_fs_cmn_err(CE_NOTE, mp, "Disk quotas turned on"); - else - xfs_fs_cmn_err(CE_NOTE, mp, "Disk quotas not turned on"); -#endif - /* * Now we are mounted, reserve a small amount of unused space for * privileged transactions. This is needed so that transaction diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 9ff48a16a7e..1d2c7eed4ed 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -268,6 +268,7 @@ typedef struct xfs_mount { #define XFS_MOUNT_WSYNC (1ULL << 0) /* for nfs - all metadata ops must be synchronous except for space allocations */ +#define XFS_MOUNT_DELAYLOG (1ULL << 1) /* delayed logging is enabled */ #define XFS_MOUNT_DMAPI (1ULL << 2) /* dmapi is enabled */ #define XFS_MOUNT_WAS_CLEAN (1ULL << 3) #define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index fdcab3f81dd..e0e64b113bd 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -201,9 +201,6 @@ typedef struct xfs_qoff_logformat { #define XFS_QMOPT_FORCE_RES 0x0000010 /* ignore quota limits */ #define XFS_QMOPT_DQSUSER 0x0000020 /* don't cache super users dquot */ #define XFS_QMOPT_SBVERSION 0x0000040 /* change superblock version num */ -#define XFS_QMOPT_QUOTAOFF 0x0000080 /* quotas are being turned off */ -#define XFS_QMOPT_UMOUNTING 0x0000100 /* filesys is being unmounted */ -#define XFS_QMOPT_DOLOG 0x0000200 /* log buf changes (in quotacheck) */ #define XFS_QMOPT_DOWARN 0x0000400 /* increase warning cnt if needed */ #define XFS_QMOPT_DQREPAIR 0x0001000 /* repair dquot if damaged */ #define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */ diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index f73e358bae8..ce558efa2ea 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -44,24 +44,14 @@ #include "xfs_trans_priv.h" #include "xfs_trans_space.h" #include "xfs_inode_item.h" - - -STATIC void xfs_trans_apply_sb_deltas(xfs_trans_t *); -STATIC uint xfs_trans_count_vecs(xfs_trans_t *); -STATIC void xfs_trans_fill_vecs(xfs_trans_t *, xfs_log_iovec_t *); -STATIC void xfs_trans_uncommit(xfs_trans_t *, uint); -STATIC void xfs_trans_committed(xfs_trans_t *, int); -STATIC void xfs_trans_chunk_committed(xfs_log_item_chunk_t *, xfs_lsn_t, int); -STATIC void xfs_trans_free(xfs_trans_t *); +#include "xfs_trace.h" kmem_zone_t *xfs_trans_zone; - /* * Reservation functions here avoid a huge stack in xfs_trans_init * due to register overflow from temporaries in the calculations. */ - STATIC uint xfs_calc_write_reservation(xfs_mount_t *mp) { @@ -254,13 +244,30 @@ _xfs_trans_alloc( tp->t_type = type; tp->t_mountp = mp; tp->t_items_free = XFS_LIC_NUM_SLOTS; - tp->t_busy_free = XFS_LBC_NUM_SLOTS; xfs_lic_init(&(tp->t_items)); - XFS_LBC_INIT(&(tp->t_busy)); + INIT_LIST_HEAD(&tp->t_busy); return tp; } /* + * Free the transaction structure. If there is more clean up + * to do when the structure is freed, add it here. + */ +STATIC void +xfs_trans_free( + struct xfs_trans *tp) +{ + struct xfs_busy_extent *busyp, *n; + + list_for_each_entry_safe(busyp, n, &tp->t_busy, list) + xfs_alloc_busy_clear(tp->t_mountp, busyp); + + atomic_dec(&tp->t_mountp->m_active_trans); + xfs_trans_free_dqinfo(tp); + kmem_zone_free(xfs_trans_zone, tp); +} + +/* * This is called to create a new transaction which will share the * permanent log reservation of the given transaction. The remaining * unused block and rt extent reservations are also inherited. This @@ -283,9 +290,8 @@ xfs_trans_dup( ntp->t_type = tp->t_type; ntp->t_mountp = tp->t_mountp; ntp->t_items_free = XFS_LIC_NUM_SLOTS; - ntp->t_busy_free = XFS_LBC_NUM_SLOTS; xfs_lic_init(&(ntp->t_items)); - XFS_LBC_INIT(&(ntp->t_busy)); + INIT_LIST_HEAD(&ntp->t_busy); ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); ASSERT(tp->t_ticket != NULL); @@ -421,7 +427,6 @@ undo_blocks: return error; } - /* * Record the indicated change to the given field for application * to the file system's superblock when the transaction commits. @@ -650,7 +655,7 @@ xfs_trans_apply_sb_deltas( * XFS_TRANS_SB_DIRTY will not be set when the transaction is updated but we * still need to update the incore superblock with the changes. */ -STATIC void +void xfs_trans_unreserve_and_mod_sb( xfs_trans_t *tp) { @@ -764,94 +769,256 @@ xfs_trans_unreserve_and_mod_sb( } } +/* + * Total up the number of log iovecs needed to commit this + * transaction. The transaction itself needs one for the + * transaction header. Ask each dirty item in turn how many + * it needs to get the total. + */ +static uint +xfs_trans_count_vecs( + struct xfs_trans *tp) +{ + int nvecs; + xfs_log_item_desc_t *lidp; + + nvecs = 1; + lidp = xfs_trans_first_item(tp); + ASSERT(lidp != NULL); + + /* In the non-debug case we need to start bailing out if we + * didn't find a log_item here, return zero and let trans_commit + * deal with it. + */ + if (lidp == NULL) + return 0; + + while (lidp != NULL) { + /* + * Skip items which aren't dirty in this transaction. + */ + if (!(lidp->lid_flags & XFS_LID_DIRTY)) { + lidp = xfs_trans_next_item(tp, lidp); + continue; + } + lidp->lid_size = IOP_SIZE(lidp->lid_item); + nvecs += lidp->lid_size; + lidp = xfs_trans_next_item(tp, lidp); + } + + return nvecs; +} /* - * xfs_trans_commit + * Fill in the vector with pointers to data to be logged + * by this transaction. The transaction header takes + * the first vector, and then each dirty item takes the + * number of vectors it indicated it needed in xfs_trans_count_vecs(). * - * Commit the given transaction to the log a/synchronously. + * As each item fills in the entries it needs, also pin the item + * so that it cannot be flushed out until the log write completes. + */ +static void +xfs_trans_fill_vecs( + struct xfs_trans *tp, + struct xfs_log_iovec *log_vector) +{ + xfs_log_item_desc_t *lidp; + struct xfs_log_iovec *vecp; + uint nitems; + + /* + * Skip over the entry for the transaction header, we'll + * fill that in at the end. + */ + vecp = log_vector + 1; + + nitems = 0; + lidp = xfs_trans_first_item(tp); + ASSERT(lidp); + while (lidp) { + /* Skip items which aren't dirty in this transaction. */ + if (!(lidp->lid_flags & XFS_LID_DIRTY)) { + lidp = xfs_trans_next_item(tp, lidp); + continue; + } + + /* + * The item may be marked dirty but not log anything. This can + * be used to get called when a transaction is committed. + */ + if (lidp->lid_size) + nitems++; + IOP_FORMAT(lidp->lid_item, vecp); + vecp += lidp->lid_size; + IOP_PIN(lidp->lid_item); + lidp = xfs_trans_next_item(tp, lidp); + } + + /* + * Now that we've counted the number of items in this transaction, fill + * in the transaction header. Note that the transaction header does not + * have a log item. + */ + tp->t_header.th_magic = XFS_TRANS_HEADER_MAGIC; + tp->t_header.th_type = tp->t_type; + tp->t_header.th_num_items = nitems; + log_vector->i_addr = (xfs_caddr_t)&tp->t_header; + log_vector->i_len = sizeof(xfs_trans_header_t); + log_vector->i_type = XLOG_REG_TYPE_TRANSHDR; +} + +/* + * The committed item processing consists of calling the committed routine of + * each logged item, updating the item's position in the AIL if necessary, and + * unpinning each item. If the committed routine returns -1, then do nothing + * further with the item because it may have been freed. * - * XFS disk error handling mechanism is not based on a typical - * transaction abort mechanism. Logically after the filesystem - * gets marked 'SHUTDOWN', we can't let any new transactions - * be durable - ie. committed to disk - because some metadata might - * be inconsistent. In such cases, this returns an error, and the - * caller may assume that all locked objects joined to the transaction - * have already been unlocked as if the commit had succeeded. - * Do not reference the transaction structure after this call. + * Since items are unlocked when they are copied to the incore log, it is + * possible for two transactions to be completing and manipulating the same + * item simultaneously. The AIL lock will protect the lsn field of each item. + * The value of this field can never go backwards. + * + * We unpin the items after repositioning them in the AIL, because otherwise + * they could be immediately flushed and we'd have to race with the flusher + * trying to pull the item from the AIL as we add it. */ - /*ARGSUSED*/ -int -_xfs_trans_commit( - xfs_trans_t *tp, - uint flags, - int *log_flushed) +void +xfs_trans_item_committed( + struct xfs_log_item *lip, + xfs_lsn_t commit_lsn, + int aborted) { - xfs_log_iovec_t *log_vector; - int nvec; - xfs_mount_t *mp; - xfs_lsn_t commit_lsn; - /* REFERENCED */ - int error; - int log_flags; - int sync; -#define XFS_TRANS_LOGVEC_COUNT 16 - xfs_log_iovec_t log_vector_fast[XFS_TRANS_LOGVEC_COUNT]; - struct xlog_in_core *commit_iclog; - int shutdown; + xfs_lsn_t item_lsn; + struct xfs_ail *ailp; - commit_lsn = -1; + if (aborted) + lip->li_flags |= XFS_LI_ABORTED; + item_lsn = IOP_COMMITTED(lip, commit_lsn); + + /* If the committed routine returns -1, item has been freed. */ + if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0) + return; /* - * Determine whether this commit is releasing a permanent - * log reservation or not. + * If the returned lsn is greater than what it contained before, update + * the location of the item in the AIL. If it is not, then do nothing. + * Items can never move backwards in the AIL. + * + * While the new lsn should usually be greater, it is possible that a + * later transaction completing simultaneously with an earlier one + * using the same item could complete first with a higher lsn. This + * would cause the earlier transaction to fail the test below. */ - if (flags & XFS_TRANS_RELEASE_LOG_RES) { - ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); - log_flags = XFS_LOG_REL_PERM_RESERV; + ailp = lip->li_ailp; + spin_lock(&ailp->xa_lock); + if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0) { + /* + * This will set the item's lsn to item_lsn and update the + * position of the item in the AIL. + * + * xfs_trans_ail_update() drops the AIL lock. + */ + xfs_trans_ail_update(ailp, lip, item_lsn); } else { - log_flags = 0; + spin_unlock(&ailp->xa_lock); } - mp = tp->t_mountp; /* - * If there is nothing to be logged by the transaction, - * then unlock all of the items associated with the - * transaction and free the transaction structure. - * Also make sure to return any reserved blocks to - * the free pool. + * Now that we've repositioned the item in the AIL, unpin it so it can + * be flushed. Pass information about buffer stale state down from the + * log item flags, if anyone else stales the buffer we do not want to + * pay any attention to it. */ -shut_us_down: - shutdown = XFS_FORCED_SHUTDOWN(mp) ? EIO : 0; - if (!(tp->t_flags & XFS_TRANS_DIRTY) || shutdown) { - xfs_trans_unreserve_and_mod_sb(tp); + IOP_UNPIN(lip); +} + +/* + * This is typically called by the LM when a transaction has been fully + * committed to disk. It needs to unpin the items which have + * been logged by the transaction and update their positions + * in the AIL if necessary. + * + * This also gets called when the transactions didn't get written out + * because of an I/O error. Abortflag & XFS_LI_ABORTED is set then. + */ +STATIC void +xfs_trans_committed( + struct xfs_trans *tp, + int abortflag) +{ + xfs_log_item_desc_t *lidp; + xfs_log_item_chunk_t *licp; + xfs_log_item_chunk_t *next_licp; + + /* Call the transaction's completion callback if there is one. */ + if (tp->t_callback != NULL) + tp->t_callback(tp, tp->t_callarg); + + for (lidp = xfs_trans_first_item(tp); + lidp != NULL; + lidp = xfs_trans_next_item(tp, lidp)) { + xfs_trans_item_committed(lidp->lid_item, tp->t_lsn, abortflag); + } + + /* free the item chunks, ignoring the embedded chunk */ + for (licp = tp->t_items.lic_next; licp != NULL; licp = next_licp) { + next_licp = licp->lic_next; + kmem_free(licp); + } + + xfs_trans_free(tp); +} + +/* + * Called from the trans_commit code when we notice that + * the filesystem is in the middle of a forced shutdown. + */ +STATIC void +xfs_trans_uncommit( + struct xfs_trans *tp, + uint flags) +{ + xfs_log_item_desc_t *lidp; + + for (lidp = xfs_trans_first_item(tp); + lidp != NULL; + lidp = xfs_trans_next_item(tp, lidp)) { /* - * It is indeed possible for the transaction to be - * not dirty but the dqinfo portion to be. All that - * means is that we have some (non-persistent) quota - * reservations that need to be unreserved. + * Unpin all but those that aren't dirty. */ - xfs_trans_unreserve_and_mod_dquots(tp); - if (tp->t_ticket) { - commit_lsn = xfs_log_done(mp, tp->t_ticket, - NULL, log_flags); - if (commit_lsn == -1 && !shutdown) - shutdown = XFS_ERROR(EIO); - } - current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); - xfs_trans_free_items(tp, shutdown? XFS_TRANS_ABORT : 0); - xfs_trans_free_busy(tp); - xfs_trans_free(tp); - XFS_STATS_INC(xs_trans_empty); - return (shutdown); + if (lidp->lid_flags & XFS_LID_DIRTY) + IOP_UNPIN_REMOVE(lidp->lid_item, tp); } - ASSERT(tp->t_ticket != NULL); - /* - * If we need to update the superblock, then do it now. - */ - if (tp->t_flags & XFS_TRANS_SB_DIRTY) - xfs_trans_apply_sb_deltas(tp); - xfs_trans_apply_dquot_deltas(tp); + xfs_trans_unreserve_and_mod_sb(tp); + xfs_trans_unreserve_and_mod_dquots(tp); + + xfs_trans_free_items(tp, NULLCOMMITLSN, flags); + xfs_trans_free(tp); +} + +/* + * Format the transaction direct to the iclog. This isolates the physical + * transaction commit operation from the logical operation and hence allows + * other methods to be introduced without affecting the existing commit path. + */ +static int +xfs_trans_commit_iclog( + struct xfs_mount *mp, + struct xfs_trans *tp, + xfs_lsn_t *commit_lsn, + int flags) +{ + int shutdown; + int error; + int log_flags = 0; + struct xlog_in_core *commit_iclog; +#define XFS_TRANS_LOGVEC_COUNT 16 + struct xfs_log_iovec log_vector_fast[XFS_TRANS_LOGVEC_COUNT]; + struct xfs_log_iovec *log_vector; + uint nvec; + /* * Ask each log item how many log_vector entries it will @@ -861,8 +1028,7 @@ shut_us_down: */ nvec = xfs_trans_count_vecs(tp); if (nvec == 0) { - xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); - goto shut_us_down; + return ENOMEM; /* triggers a shutdown! */ } else if (nvec <= XFS_TRANS_LOGVEC_COUNT) { log_vector = log_vector_fast; } else { @@ -877,6 +1043,9 @@ shut_us_down: */ xfs_trans_fill_vecs(tp, log_vector); + if (flags & XFS_TRANS_RELEASE_LOG_RES) + log_flags = XFS_LOG_REL_PERM_RESERV; + error = xfs_log_write(mp, log_vector, nvec, tp->t_ticket, &(tp->t_lsn)); /* @@ -884,18 +1053,19 @@ shut_us_down: * at any time after this call. However, all the items associated * with the transaction are still locked and pinned in memory. */ - commit_lsn = xfs_log_done(mp, tp->t_ticket, &commit_iclog, log_flags); + *commit_lsn = xfs_log_done(mp, tp->t_ticket, &commit_iclog, log_flags); - tp->t_commit_lsn = commit_lsn; - if (nvec > XFS_TRANS_LOGVEC_COUNT) { + tp->t_commit_lsn = *commit_lsn; + trace_xfs_trans_commit_lsn(tp); + + if (nvec > XFS_TRANS_LOGVEC_COUNT) kmem_free(log_vector); - } /* * If we got a log write error. Unpin the logitems that we * had pinned, clean up, free trans structure, and return error. */ - if (error || commit_lsn == -1) { + if (error || *commit_lsn == -1) { current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); xfs_trans_uncommit(tp, flags|XFS_TRANS_ABORT); return XFS_ERROR(EIO); @@ -909,8 +1079,6 @@ shut_us_down: */ xfs_trans_unreserve_and_mod_sb(tp); - sync = tp->t_flags & XFS_TRANS_SYNC; - /* * Tell the LM to call the transaction completion routine * when the log write with LSN commit_lsn completes (e.g. @@ -953,7 +1121,7 @@ shut_us_down: * the commit lsn of this transaction for dependency tracking * purposes. */ - xfs_trans_unlock_items(tp, commit_lsn); + xfs_trans_unlock_items(tp, *commit_lsn); /* * If we detected a log error earlier, finish committing @@ -973,156 +1141,204 @@ shut_us_down: * and the items are released we can finally allow the iclog to * go to disk. */ - error = xfs_log_release_iclog(mp, commit_iclog); - - /* - * If the transaction needs to be synchronous, then force the - * log out now and wait for it. - */ - if (sync) { - if (!error) { - error = _xfs_log_force_lsn(mp, commit_lsn, - XFS_LOG_SYNC, log_flushed); - } - XFS_STATS_INC(xs_trans_sync); - } else { - XFS_STATS_INC(xs_trans_async); - } - - return (error); + return xfs_log_release_iclog(mp, commit_iclog); } - /* - * Total up the number of log iovecs needed to commit this - * transaction. The transaction itself needs one for the - * transaction header. Ask each dirty item in turn how many - * it needs to get the total. + * Walk the log items and allocate log vector structures for + * each item large enough to fit all the vectors they require. + * Note that this format differs from the old log vector format in + * that there is no transaction header in these log vectors. */ -STATIC uint -xfs_trans_count_vecs( +STATIC struct xfs_log_vec * +xfs_trans_alloc_log_vecs( xfs_trans_t *tp) { - int nvecs; xfs_log_item_desc_t *lidp; + struct xfs_log_vec *lv = NULL; + struct xfs_log_vec *ret_lv = NULL; - nvecs = 1; lidp = xfs_trans_first_item(tp); - ASSERT(lidp != NULL); - /* In the non-debug case we need to start bailing out if we - * didn't find a log_item here, return zero and let trans_commit - * deal with it. - */ - if (lidp == NULL) - return 0; + /* Bail out if we didn't find a log item. */ + if (!lidp) { + ASSERT(0); + return NULL; + } while (lidp != NULL) { - /* - * Skip items which aren't dirty in this transaction. - */ + struct xfs_log_vec *new_lv; + + /* Skip items which aren't dirty in this transaction. */ if (!(lidp->lid_flags & XFS_LID_DIRTY)) { lidp = xfs_trans_next_item(tp, lidp); continue; } + + /* Skip items that do not have any vectors for writing */ lidp->lid_size = IOP_SIZE(lidp->lid_item); - nvecs += lidp->lid_size; + if (!lidp->lid_size) { + lidp = xfs_trans_next_item(tp, lidp); + continue; + } + + new_lv = kmem_zalloc(sizeof(*new_lv) + + lidp->lid_size * sizeof(struct xfs_log_iovec), + KM_SLEEP); + + /* The allocated iovec region lies beyond the log vector. */ + new_lv->lv_iovecp = (struct xfs_log_iovec *)&new_lv[1]; + new_lv->lv_niovecs = lidp->lid_size; + new_lv->lv_item = lidp->lid_item; + if (!ret_lv) + ret_lv = new_lv; + else + lv->lv_next = new_lv; + lv = new_lv; lidp = xfs_trans_next_item(tp, lidp); } - return nvecs; + return ret_lv; } -/* - * Called from the trans_commit code when we notice that - * the filesystem is in the middle of a forced shutdown. - */ -STATIC void -xfs_trans_uncommit( - xfs_trans_t *tp, - uint flags) +static int +xfs_trans_commit_cil( + struct xfs_mount *mp, + struct xfs_trans *tp, + xfs_lsn_t *commit_lsn, + int flags) { - xfs_log_item_desc_t *lidp; + struct xfs_log_vec *log_vector; + int error; - for (lidp = xfs_trans_first_item(tp); - lidp != NULL; - lidp = xfs_trans_next_item(tp, lidp)) { - /* - * Unpin all but those that aren't dirty. - */ - if (lidp->lid_flags & XFS_LID_DIRTY) - IOP_UNPIN_REMOVE(lidp->lid_item, tp); - } + /* + * Get each log item to allocate a vector structure for + * the log item to to pass to the log write code. The + * CIL commit code will format the vector and save it away. + */ + log_vector = xfs_trans_alloc_log_vecs(tp); + if (!log_vector) + return ENOMEM; - xfs_trans_unreserve_and_mod_sb(tp); - xfs_trans_unreserve_and_mod_dquots(tp); + error = xfs_log_commit_cil(mp, tp, log_vector, commit_lsn, flags); + if (error) + return error; - xfs_trans_free_items(tp, flags); - xfs_trans_free_busy(tp); + current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + + /* xfs_trans_free_items() unlocks them first */ + xfs_trans_free_items(tp, *commit_lsn, 0); xfs_trans_free(tp); + return 0; } /* - * Fill in the vector with pointers to data to be logged - * by this transaction. The transaction header takes - * the first vector, and then each dirty item takes the - * number of vectors it indicated it needed in xfs_trans_count_vecs(). + * xfs_trans_commit * - * As each item fills in the entries it needs, also pin the item - * so that it cannot be flushed out until the log write completes. + * Commit the given transaction to the log a/synchronously. + * + * XFS disk error handling mechanism is not based on a typical + * transaction abort mechanism. Logically after the filesystem + * gets marked 'SHUTDOWN', we can't let any new transactions + * be durable - ie. committed to disk - because some metadata might + * be inconsistent. In such cases, this returns an error, and the + * caller may assume that all locked objects joined to the transaction + * have already been unlocked as if the commit had succeeded. + * Do not reference the transaction structure after this call. */ -STATIC void -xfs_trans_fill_vecs( - xfs_trans_t *tp, - xfs_log_iovec_t *log_vector) +int +_xfs_trans_commit( + struct xfs_trans *tp, + uint flags, + int *log_flushed) { - xfs_log_item_desc_t *lidp; - xfs_log_iovec_t *vecp; - uint nitems; + struct xfs_mount *mp = tp->t_mountp; + xfs_lsn_t commit_lsn = -1; + int error = 0; + int log_flags = 0; + int sync = tp->t_flags & XFS_TRANS_SYNC; /* - * Skip over the entry for the transaction header, we'll - * fill that in at the end. + * Determine whether this commit is releasing a permanent + * log reservation or not. */ - vecp = log_vector + 1; /* pointer arithmetic */ + if (flags & XFS_TRANS_RELEASE_LOG_RES) { + ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); + log_flags = XFS_LOG_REL_PERM_RESERV; + } - nitems = 0; - lidp = xfs_trans_first_item(tp); - ASSERT(lidp != NULL); - while (lidp != NULL) { - /* - * Skip items which aren't dirty in this transaction. - */ - if (!(lidp->lid_flags & XFS_LID_DIRTY)) { - lidp = xfs_trans_next_item(tp, lidp); - continue; - } - /* - * The item may be marked dirty but not log anything. - * This can be used to get called when a transaction - * is committed. - */ - if (lidp->lid_size) { - nitems++; + /* + * If there is nothing to be logged by the transaction, + * then unlock all of the items associated with the + * transaction and free the transaction structure. + * Also make sure to return any reserved blocks to + * the free pool. + */ + if (!(tp->t_flags & XFS_TRANS_DIRTY)) + goto out_unreserve; + + if (XFS_FORCED_SHUTDOWN(mp)) { + error = XFS_ERROR(EIO); + goto out_unreserve; + } + + ASSERT(tp->t_ticket != NULL); + + /* + * If we need to update the superblock, then do it now. + */ + if (tp->t_flags & XFS_TRANS_SB_DIRTY) + xfs_trans_apply_sb_deltas(tp); + xfs_trans_apply_dquot_deltas(tp); + + if (mp->m_flags & XFS_MOUNT_DELAYLOG) + error = xfs_trans_commit_cil(mp, tp, &commit_lsn, flags); + else + error = xfs_trans_commit_iclog(mp, tp, &commit_lsn, flags); + + if (error == ENOMEM) { + xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); + error = XFS_ERROR(EIO); + goto out_unreserve; + } + + /* + * If the transaction needs to be synchronous, then force the + * log out now and wait for it. + */ + if (sync) { + if (!error) { + error = _xfs_log_force_lsn(mp, commit_lsn, + XFS_LOG_SYNC, log_flushed); } - IOP_FORMAT(lidp->lid_item, vecp); - vecp += lidp->lid_size; /* pointer arithmetic */ - IOP_PIN(lidp->lid_item); - lidp = xfs_trans_next_item(tp, lidp); + XFS_STATS_INC(xs_trans_sync); + } else { + XFS_STATS_INC(xs_trans_async); } + return error; + +out_unreserve: + xfs_trans_unreserve_and_mod_sb(tp); + /* - * Now that we've counted the number of items in this - * transaction, fill in the transaction header. + * It is indeed possible for the transaction to be not dirty but + * the dqinfo portion to be. All that means is that we have some + * (non-persistent) quota reservations that need to be unreserved. */ - tp->t_header.th_magic = XFS_TRANS_HEADER_MAGIC; - tp->t_header.th_type = tp->t_type; - tp->t_header.th_num_items = nitems; - log_vector->i_addr = (xfs_caddr_t)&tp->t_header; - log_vector->i_len = sizeof(xfs_trans_header_t); - log_vector->i_type = XLOG_REG_TYPE_TRANSHDR; -} + xfs_trans_unreserve_and_mod_dquots(tp); + if (tp->t_ticket) { + commit_lsn = xfs_log_done(mp, tp->t_ticket, NULL, log_flags); + if (commit_lsn == -1 && !error) + error = XFS_ERROR(EIO); + } + current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + xfs_trans_free_items(tp, NULLCOMMITLSN, error ? XFS_TRANS_ABORT : 0); + xfs_trans_free(tp); + XFS_STATS_INC(xs_trans_empty); + return error; +} /* * Unlock all of the transaction's items and free the transaction. @@ -1195,25 +1411,10 @@ xfs_trans_cancel( /* mark this thread as no longer being in a transaction */ current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); - xfs_trans_free_items(tp, flags); - xfs_trans_free_busy(tp); + xfs_trans_free_items(tp, NULLCOMMITLSN, flags); xfs_trans_free(tp); } - -/* - * Free the transaction structure. If there is more clean up - * to do when the structure is freed, add it here. - */ -STATIC void -xfs_trans_free( - xfs_trans_t *tp) -{ - atomic_dec(&tp->t_mountp->m_active_trans); - xfs_trans_free_dqinfo(tp); - kmem_zone_free(xfs_trans_zone, tp); -} - /* * Roll from one trans in the sequence of PERMANENT transactions to * the next: permanent transactions are only flushed out when @@ -1283,174 +1484,3 @@ xfs_trans_roll( xfs_trans_ihold(trans, dp); return 0; } - -/* - * THIS SHOULD BE REWRITTEN TO USE xfs_trans_next_item(). - * - * This is typically called by the LM when a transaction has been fully - * committed to disk. It needs to unpin the items which have - * been logged by the transaction and update their positions - * in the AIL if necessary. - * This also gets called when the transactions didn't get written out - * because of an I/O error. Abortflag & XFS_LI_ABORTED is set then. - * - * Call xfs_trans_chunk_committed() to process the items in - * each chunk. - */ -STATIC void -xfs_trans_committed( - xfs_trans_t *tp, - int abortflag) -{ - xfs_log_item_chunk_t *licp; - xfs_log_item_chunk_t *next_licp; - xfs_log_busy_chunk_t *lbcp; - xfs_log_busy_slot_t *lbsp; - int i; - - /* - * Call the transaction's completion callback if there - * is one. - */ - if (tp->t_callback != NULL) { - tp->t_callback(tp, tp->t_callarg); - } - - /* - * Special case the chunk embedded in the transaction. - */ - licp = &(tp->t_items); - if (!(xfs_lic_are_all_free(licp))) { - xfs_trans_chunk_committed(licp, tp->t_lsn, abortflag); - } - - /* - * Process the items in each chunk in turn. - */ - licp = licp->lic_next; - while (licp != NULL) { - ASSERT(!xfs_lic_are_all_free(licp)); - xfs_trans_chunk_committed(licp, tp->t_lsn, abortflag); - next_licp = licp->lic_next; - kmem_free(licp); - licp = next_licp; - } - - /* - * Clear all the per-AG busy list items listed in this transaction - */ - lbcp = &tp->t_busy; - while (lbcp != NULL) { - for (i = 0, lbsp = lbcp->lbc_busy; i < lbcp->lbc_unused; i++, lbsp++) { - if (!XFS_LBC_ISFREE(lbcp, i)) { - xfs_alloc_clear_busy(tp, lbsp->lbc_ag, - lbsp->lbc_idx); - } - } - lbcp = lbcp->lbc_next; - } - xfs_trans_free_busy(tp); - - /* - * That's it for the transaction structure. Free it. - */ - xfs_trans_free(tp); -} - -/* - * This is called to perform the commit processing for each - * item described by the given chunk. - * - * The commit processing consists of unlocking items which were - * held locked with the SYNC_UNLOCK attribute, calling the committed - * routine of each logged item, updating the item's position in the AIL - * if necessary, and unpinning each item. If the committed routine - * returns -1, then do nothing further with the item because it - * may have been freed. - * - * Since items are unlocked when they are copied to the incore - * log, it is possible for two transactions to be completing - * and manipulating the same item simultaneously. The AIL lock - * will protect the lsn field of each item. The value of this - * field can never go backwards. - * - * We unpin the items after repositioning them in the AIL, because - * otherwise they could be immediately flushed and we'd have to race - * with the flusher trying to pull the item from the AIL as we add it. - */ -STATIC void -xfs_trans_chunk_committed( - xfs_log_item_chunk_t *licp, - xfs_lsn_t lsn, - int aborted) -{ - xfs_log_item_desc_t *lidp; - xfs_log_item_t *lip; - xfs_lsn_t item_lsn; - int i; - - lidp = licp->lic_descs; - for (i = 0; i < licp->lic_unused; i++, lidp++) { - struct xfs_ail *ailp; - - if (xfs_lic_isfree(licp, i)) { - continue; - } - - lip = lidp->lid_item; - if (aborted) - lip->li_flags |= XFS_LI_ABORTED; - - /* - * Send in the ABORTED flag to the COMMITTED routine - * so that it knows whether the transaction was aborted - * or not. - */ - item_lsn = IOP_COMMITTED(lip, lsn); - - /* - * If the committed routine returns -1, make - * no more references to the item. - */ - if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0) { - continue; - } - - /* - * If the returned lsn is greater than what it - * contained before, update the location of the - * item in the AIL. If it is not, then do nothing. - * Items can never move backwards in the AIL. - * - * While the new lsn should usually be greater, it - * is possible that a later transaction completing - * simultaneously with an earlier one using the - * same item could complete first with a higher lsn. - * This would cause the earlier transaction to fail - * the test below. - */ - ailp = lip->li_ailp; - spin_lock(&ailp->xa_lock); - if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0) { - /* - * This will set the item's lsn to item_lsn - * and update the position of the item in - * the AIL. - * - * xfs_trans_ail_update() drops the AIL lock. - */ - xfs_trans_ail_update(ailp, lip, item_lsn); - } else { - spin_unlock(&ailp->xa_lock); - } - - /* - * Now that we've repositioned the item in the AIL, - * unpin it so it can be flushed. Pass information - * about buffer stale state down from the log item - * flags, if anyone else stales the buffer we do not - * want to pay any attention to it. - */ - IOP_UNPIN(lip, lidp->lid_flags & XFS_LID_BUF_STALE); - } -} diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 79c8bab9dff..8c69e7824f6 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -49,6 +49,15 @@ typedef struct xfs_trans_header { #define XFS_LI_DQUOT 0x123d #define XFS_LI_QUOTAOFF 0x123e +#define XFS_LI_TYPE_DESC \ + { XFS_LI_EFI, "XFS_LI_EFI" }, \ + { XFS_LI_EFD, "XFS_LI_EFD" }, \ + { XFS_LI_IUNLINK, "XFS_LI_IUNLINK" }, \ + { XFS_LI_INODE, "XFS_LI_INODE" }, \ + { XFS_LI_BUF, "XFS_LI_BUF" }, \ + { XFS_LI_DQUOT, "XFS_LI_DQUOT" }, \ + { XFS_LI_QUOTAOFF, "XFS_LI_QUOTAOFF" } + /* * Transaction types. Used to distinguish types of buffers. */ @@ -97,7 +106,8 @@ typedef struct xfs_trans_header { #define XFS_TRANS_GROWFSRT_FREE 39 #define XFS_TRANS_SWAPEXT 40 #define XFS_TRANS_SB_COUNT 41 -#define XFS_TRANS_TYPE_MAX 41 +#define XFS_TRANS_CHECKPOINT 42 +#define XFS_TRANS_TYPE_MAX 42 /* new transaction types need to be reflected in xfs_logprint(8) */ #define XFS_TRANS_TYPES \ @@ -139,6 +149,7 @@ typedef struct xfs_trans_header { { XFS_TRANS_GROWFSRT_FREE, "GROWFSRT_FREE" }, \ { XFS_TRANS_SWAPEXT, "SWAPEXT" }, \ { XFS_TRANS_SB_COUNT, "SB_COUNT" }, \ + { XFS_TRANS_CHECKPOINT, "CHECKPOINT" }, \ { XFS_TRANS_DUMMY1, "DUMMY1" }, \ { XFS_TRANS_DUMMY2, "DUMMY2" }, \ { XLOG_UNMOUNT_REC_TYPE, "UNMOUNT" } @@ -159,7 +170,6 @@ typedef struct xfs_log_item_desc { #define XFS_LID_DIRTY 0x1 #define XFS_LID_PINNED 0x2 -#define XFS_LID_BUF_STALE 0x8 /* * This structure is used to maintain a chunk list of log_item_desc @@ -805,6 +815,7 @@ struct xfs_log_item_desc; struct xfs_mount; struct xfs_trans; struct xfs_dquot_acct; +struct xfs_busy_extent; typedef struct xfs_log_item { struct list_head li_ail; /* AIL pointers */ @@ -820,6 +831,11 @@ typedef struct xfs_log_item { /* buffer item iodone */ /* callback func */ struct xfs_item_ops *li_ops; /* function list */ + + /* delayed logging */ + struct list_head li_cil; /* CIL pointers */ + struct xfs_log_vec *li_lv; /* active log vector */ + xfs_lsn_t li_seq; /* CIL commit seq */ } xfs_log_item_t; #define XFS_LI_IN_AIL 0x1 @@ -833,7 +849,7 @@ typedef struct xfs_item_ops { uint (*iop_size)(xfs_log_item_t *); void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *); void (*iop_pin)(xfs_log_item_t *); - void (*iop_unpin)(xfs_log_item_t *, int); + void (*iop_unpin)(xfs_log_item_t *); void (*iop_unpin_remove)(xfs_log_item_t *, struct xfs_trans *); uint (*iop_trylock)(xfs_log_item_t *); void (*iop_unlock)(xfs_log_item_t *); @@ -846,7 +862,7 @@ typedef struct xfs_item_ops { #define IOP_SIZE(ip) (*(ip)->li_ops->iop_size)(ip) #define IOP_FORMAT(ip,vp) (*(ip)->li_ops->iop_format)(ip, vp) #define IOP_PIN(ip) (*(ip)->li_ops->iop_pin)(ip) -#define IOP_UNPIN(ip, flags) (*(ip)->li_ops->iop_unpin)(ip, flags) +#define IOP_UNPIN(ip) (*(ip)->li_ops->iop_unpin)(ip) #define IOP_UNPIN_REMOVE(ip,tp) (*(ip)->li_ops->iop_unpin_remove)(ip, tp) #define IOP_TRYLOCK(ip) (*(ip)->li_ops->iop_trylock)(ip) #define IOP_UNLOCK(ip) (*(ip)->li_ops->iop_unlock)(ip) @@ -864,34 +880,6 @@ typedef struct xfs_item_ops { #define XFS_ITEM_PUSHBUF 3 /* - * This structure is used to maintain a list of block ranges that have been - * freed in the transaction. The ranges are listed in the perag[] busy list - * between when they're freed and the transaction is committed to disk. - */ - -typedef struct xfs_log_busy_slot { - xfs_agnumber_t lbc_ag; - ushort lbc_idx; /* index in perag.busy[] */ -} xfs_log_busy_slot_t; - -#define XFS_LBC_NUM_SLOTS 31 -typedef struct xfs_log_busy_chunk { - struct xfs_log_busy_chunk *lbc_next; - uint lbc_free; /* free slots bitmask */ - ushort lbc_unused; /* first unused */ - xfs_log_busy_slot_t lbc_busy[XFS_LBC_NUM_SLOTS]; -} xfs_log_busy_chunk_t; - -#define XFS_LBC_MAX_SLOT (XFS_LBC_NUM_SLOTS - 1) -#define XFS_LBC_FREEMASK ((1U << XFS_LBC_NUM_SLOTS) - 1) - -#define XFS_LBC_INIT(cp) ((cp)->lbc_free = XFS_LBC_FREEMASK) -#define XFS_LBC_CLAIM(cp, slot) ((cp)->lbc_free &= ~(1 << (slot))) -#define XFS_LBC_SLOT(cp, slot) (&((cp)->lbc_busy[(slot)])) -#define XFS_LBC_VACANCY(cp) (((cp)->lbc_free) & XFS_LBC_FREEMASK) -#define XFS_LBC_ISFREE(cp, slot) ((cp)->lbc_free & (1 << (slot))) - -/* * This is the type of function which can be given to xfs_trans_callback() * to be called upon the transaction's commit to disk. */ @@ -942,8 +930,7 @@ typedef struct xfs_trans { unsigned int t_items_free; /* log item descs free */ xfs_log_item_chunk_t t_items; /* first log item desc chunk */ xfs_trans_header_t t_header; /* header for in-log trans */ - unsigned int t_busy_free; /* busy descs free */ - xfs_log_busy_chunk_t t_busy; /* busy/async free blocks */ + struct list_head t_busy; /* list of busy extents */ unsigned long t_pflags; /* saved process flags state */ } xfs_trans_t; @@ -1017,9 +1004,6 @@ int _xfs_trans_commit(xfs_trans_t *, void xfs_trans_cancel(xfs_trans_t *, int); int xfs_trans_ail_init(struct xfs_mount *); void xfs_trans_ail_destroy(struct xfs_mount *); -xfs_log_busy_slot_t *xfs_trans_add_busy(xfs_trans_t *tp, - xfs_agnumber_t ag, - xfs_extlen_t idx); extern kmem_zone_t *xfs_trans_zone; diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index fb586360d1c..63d81a22f4f 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -40,11 +40,51 @@ #include "xfs_rw.h" #include "xfs_trace.h" +/* + * Check to see if a buffer matching the given parameters is already + * a part of the given transaction. + */ +STATIC struct xfs_buf * +xfs_trans_buf_item_match( + struct xfs_trans *tp, + struct xfs_buftarg *target, + xfs_daddr_t blkno, + int len) +{ + xfs_log_item_chunk_t *licp; + xfs_log_item_desc_t *lidp; + xfs_buf_log_item_t *blip; + int i; + + len = BBTOB(len); + for (licp = &tp->t_items; licp != NULL; licp = licp->lic_next) { + if (xfs_lic_are_all_free(licp)) { + ASSERT(licp == &tp->t_items); + ASSERT(licp->lic_next == NULL); + return NULL; + } + + for (i = 0; i < licp->lic_unused; i++) { + /* + * Skip unoccupied slots. + */ + if (xfs_lic_isfree(licp, i)) + continue; + + lidp = xfs_lic_slot(licp, i); + blip = (xfs_buf_log_item_t *)lidp->lid_item; + if (blip->bli_item.li_type != XFS_LI_BUF) + continue; + + if (XFS_BUF_TARGET(blip->bli_buf) == target && + XFS_BUF_ADDR(blip->bli_buf) == blkno && + XFS_BUF_COUNT(blip->bli_buf) == len) + return blip->bli_buf; + } + } -STATIC xfs_buf_t *xfs_trans_buf_item_match(xfs_trans_t *, xfs_buftarg_t *, - xfs_daddr_t, int); -STATIC xfs_buf_t *xfs_trans_buf_item_match_all(xfs_trans_t *, xfs_buftarg_t *, - xfs_daddr_t, int); + return NULL; +} /* * Add the locked buffer to the transaction. @@ -74,7 +114,7 @@ _xfs_trans_bjoin( xfs_buf_item_init(bp, tp->t_mountp); bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); - ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); + ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL)); ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); if (reset_recur) bip->bli_recur = 0; @@ -112,14 +152,6 @@ xfs_trans_bjoin( * within the transaction, just increment its lock recursion count * and return a pointer to it. * - * Use the fast path function xfs_trans_buf_item_match() or the buffer - * cache routine incore_match() to find the buffer - * if it is already owned by this transaction. - * - * If we don't already own the buffer, use get_buf() to get it. - * If it doesn't yet have an associated xfs_buf_log_item structure, - * then allocate one and add the item to this transaction. - * * If the transaction pointer is NULL, make this just a normal * get_buf() call. */ @@ -149,11 +181,7 @@ xfs_trans_get_buf(xfs_trans_t *tp, * have it locked. In this case we just increment the lock * recursion count and return the buffer to the caller. */ - if (tp->t_items.lic_next == NULL) { - bp = xfs_trans_buf_item_match(tp, target_dev, blkno, len); - } else { - bp = xfs_trans_buf_item_match_all(tp, target_dev, blkno, len); - } + bp = xfs_trans_buf_item_match(tp, target_dev, blkno, len); if (bp != NULL) { ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); if (XFS_FORCED_SHUTDOWN(tp->t_mountp)) @@ -259,14 +287,6 @@ int xfs_error_mod = 33; * within the transaction and already read in, just increment its * lock recursion count and return a pointer to it. * - * Use the fast path function xfs_trans_buf_item_match() or the buffer - * cache routine incore_match() to find the buffer - * if it is already owned by this transaction. - * - * If we don't already own the buffer, use read_buf() to get it. - * If it doesn't yet have an associated xfs_buf_log_item structure, - * then allocate one and add the item to this transaction. - * * If the transaction pointer is NULL, make this just a normal * read_buf() call. */ @@ -328,11 +348,7 @@ xfs_trans_read_buf( * If the buffer is not yet read in, then we read it in, increment * the lock recursion count, and return it to the caller. */ - if (tp->t_items.lic_next == NULL) { - bp = xfs_trans_buf_item_match(tp, target, blkno, len); - } else { - bp = xfs_trans_buf_item_match_all(tp, target, blkno, len); - } + bp = xfs_trans_buf_item_match(tp, target, blkno, len); if (bp != NULL) { ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); @@ -495,7 +511,7 @@ xfs_trans_brelse(xfs_trans_t *tp, bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); ASSERT(bip->bli_item.li_type == XFS_LI_BUF); ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); - ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); + ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL)); ASSERT(atomic_read(&bip->bli_refcount) > 0); /* @@ -603,7 +619,7 @@ xfs_trans_bhold(xfs_trans_t *tp, bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); - ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); + ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL)); ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_flags |= XFS_BLI_HOLD; trace_xfs_trans_bhold(bip); @@ -625,7 +641,7 @@ xfs_trans_bhold_release(xfs_trans_t *tp, bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); - ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); + ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL)); ASSERT(atomic_read(&bip->bli_refcount) > 0); ASSERT(bip->bli_flags & XFS_BLI_HOLD); bip->bli_flags &= ~XFS_BLI_HOLD; @@ -688,7 +704,7 @@ xfs_trans_log_buf(xfs_trans_t *tp, bip->bli_flags &= ~XFS_BLI_STALE; ASSERT(XFS_BUF_ISSTALE(bp)); XFS_BUF_UNSTALE(bp); - bip->bli_format.blf_flags &= ~XFS_BLI_CANCEL; + bip->bli_format.blf_flags &= ~XFS_BLF_CANCEL; } lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)bip); @@ -696,7 +712,6 @@ xfs_trans_log_buf(xfs_trans_t *tp, tp->t_flags |= XFS_TRANS_DIRTY; lidp->lid_flags |= XFS_LID_DIRTY; - lidp->lid_flags &= ~XFS_LID_BUF_STALE; bip->bli_flags |= XFS_BLI_LOGGED; xfs_buf_item_log(bip, first, last); } @@ -747,8 +762,8 @@ xfs_trans_binval( ASSERT(!(XFS_BUF_ISDELAYWRITE(bp))); ASSERT(XFS_BUF_ISSTALE(bp)); ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY))); - ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_INODE_BUF)); - ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); + ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_INODE_BUF)); + ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); ASSERT(lidp->lid_flags & XFS_LID_DIRTY); ASSERT(tp->t_flags & XFS_TRANS_DIRTY); return; @@ -759,7 +774,7 @@ xfs_trans_binval( * in the buf log item. The STALE flag will be used in * xfs_buf_item_unpin() to determine if it should clean up * when the last reference to the buf item is given up. - * We set the XFS_BLI_CANCEL flag in the buf log format structure + * We set the XFS_BLF_CANCEL flag in the buf log format structure * and log the buf item. This will be used at recovery time * to determine that copies of the buffer in the log before * this should not be replayed. @@ -777,26 +792,26 @@ xfs_trans_binval( XFS_BUF_UNDELAYWRITE(bp); XFS_BUF_STALE(bp); bip->bli_flags |= XFS_BLI_STALE; - bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_DIRTY); - bip->bli_format.blf_flags &= ~XFS_BLI_INODE_BUF; - bip->bli_format.blf_flags |= XFS_BLI_CANCEL; + bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY); + bip->bli_format.blf_flags &= ~XFS_BLF_INODE_BUF; + bip->bli_format.blf_flags |= XFS_BLF_CANCEL; memset((char *)(bip->bli_format.blf_data_map), 0, (bip->bli_format.blf_map_size * sizeof(uint))); - lidp->lid_flags |= XFS_LID_DIRTY|XFS_LID_BUF_STALE; + lidp->lid_flags |= XFS_LID_DIRTY; tp->t_flags |= XFS_TRANS_DIRTY; } /* - * This call is used to indicate that the buffer contains on-disk - * inodes which must be handled specially during recovery. They - * require special handling because only the di_next_unlinked from - * the inodes in the buffer should be recovered. The rest of the - * data in the buffer is logged via the inodes themselves. + * This call is used to indicate that the buffer contains on-disk inodes which + * must be handled specially during recovery. They require special handling + * because only the di_next_unlinked from the inodes in the buffer should be + * recovered. The rest of the data in the buffer is logged via the inodes + * themselves. * - * All we do is set the XFS_BLI_INODE_BUF flag in the buffer's log - * format structure so that we'll know what to do at recovery time. + * All we do is set the XFS_BLI_INODE_BUF flag in the items flags so it can be + * transferred to the buffer's log format structure so that we'll know what to + * do at recovery time. */ -/* ARGSUSED */ void xfs_trans_inode_buf( xfs_trans_t *tp, @@ -811,7 +826,7 @@ xfs_trans_inode_buf( bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); ASSERT(atomic_read(&bip->bli_refcount) > 0); - bip->bli_format.blf_flags |= XFS_BLI_INODE_BUF; + bip->bli_flags |= XFS_BLI_INODE_BUF; } /* @@ -893,120 +908,12 @@ xfs_trans_dquot_buf( ASSERT(XFS_BUF_ISBUSY(bp)); ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); - ASSERT(type == XFS_BLI_UDQUOT_BUF || - type == XFS_BLI_PDQUOT_BUF || - type == XFS_BLI_GDQUOT_BUF); + ASSERT(type == XFS_BLF_UDQUOT_BUF || + type == XFS_BLF_PDQUOT_BUF || + type == XFS_BLF_GDQUOT_BUF); bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_format.blf_flags |= type; } - -/* - * Check to see if a buffer matching the given parameters is already - * a part of the given transaction. Only check the first, embedded - * chunk, since we don't want to spend all day scanning large transactions. - */ -STATIC xfs_buf_t * -xfs_trans_buf_item_match( - xfs_trans_t *tp, - xfs_buftarg_t *target, - xfs_daddr_t blkno, - int len) -{ - xfs_log_item_chunk_t *licp; - xfs_log_item_desc_t *lidp; - xfs_buf_log_item_t *blip; - xfs_buf_t *bp; - int i; - - bp = NULL; - len = BBTOB(len); - licp = &tp->t_items; - if (!xfs_lic_are_all_free(licp)) { - for (i = 0; i < licp->lic_unused; i++) { - /* - * Skip unoccupied slots. - */ - if (xfs_lic_isfree(licp, i)) { - continue; - } - - lidp = xfs_lic_slot(licp, i); - blip = (xfs_buf_log_item_t *)lidp->lid_item; - if (blip->bli_item.li_type != XFS_LI_BUF) { - continue; - } - - bp = blip->bli_buf; - if ((XFS_BUF_TARGET(bp) == target) && - (XFS_BUF_ADDR(bp) == blkno) && - (XFS_BUF_COUNT(bp) == len)) { - /* - * We found it. Break out and - * return the pointer to the buffer. - */ - break; - } else { - bp = NULL; - } - } - } - return bp; -} - -/* - * Check to see if a buffer matching the given parameters is already - * a part of the given transaction. Check all the chunks, we - * want to be thorough. - */ -STATIC xfs_buf_t * -xfs_trans_buf_item_match_all( - xfs_trans_t *tp, - xfs_buftarg_t *target, - xfs_daddr_t blkno, - int len) -{ - xfs_log_item_chunk_t *licp; - xfs_log_item_desc_t *lidp; - xfs_buf_log_item_t *blip; - xfs_buf_t *bp; - int i; - - bp = NULL; - len = BBTOB(len); - for (licp = &tp->t_items; licp != NULL; licp = licp->lic_next) { - if (xfs_lic_are_all_free(licp)) { - ASSERT(licp == &tp->t_items); - ASSERT(licp->lic_next == NULL); - return NULL; - } - for (i = 0; i < licp->lic_unused; i++) { - /* - * Skip unoccupied slots. - */ - if (xfs_lic_isfree(licp, i)) { - continue; - } - - lidp = xfs_lic_slot(licp, i); - blip = (xfs_buf_log_item_t *)lidp->lid_item; - if (blip->bli_item.li_type != XFS_LI_BUF) { - continue; - } - - bp = blip->bli_buf; - if ((XFS_BUF_TARGET(bp) == target) && - (XFS_BUF_ADDR(bp) == blkno) && - (XFS_BUF_COUNT(bp) == len)) { - /* - * We found it. Break out and - * return the pointer to the buffer. - */ - return bp; - } - } - } - return NULL; -} diff --git a/fs/xfs/xfs_trans_item.c b/fs/xfs/xfs_trans_item.c index eb3fc57f9ee..f11d37d06dc 100644 --- a/fs/xfs/xfs_trans_item.c +++ b/fs/xfs/xfs_trans_item.c @@ -299,6 +299,7 @@ xfs_trans_next_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp) void xfs_trans_free_items( xfs_trans_t *tp, + xfs_lsn_t commit_lsn, int flags) { xfs_log_item_chunk_t *licp; @@ -311,7 +312,7 @@ xfs_trans_free_items( * Special case the embedded chunk so we don't free it below. */ if (!xfs_lic_are_all_free(licp)) { - (void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN); + (void) xfs_trans_unlock_chunk(licp, 1, abort, commit_lsn); xfs_lic_all_free(licp); licp->lic_unused = 0; } @@ -322,7 +323,7 @@ xfs_trans_free_items( */ while (licp != NULL) { ASSERT(!xfs_lic_are_all_free(licp)); - (void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN); + (void) xfs_trans_unlock_chunk(licp, 1, abort, commit_lsn); next_licp = licp->lic_next; kmem_free(licp); licp = next_licp; @@ -438,112 +439,3 @@ xfs_trans_unlock_chunk( return freed; } - - -/* - * This is called to add the given busy item to the transaction's - * list of busy items. It must find a free busy item descriptor - * or allocate a new one and add the item to that descriptor. - * The function returns a pointer to busy descriptor used to point - * to the new busy entry. The log busy entry will now point to its new - * descriptor with its ???? field. - */ -xfs_log_busy_slot_t * -xfs_trans_add_busy(xfs_trans_t *tp, xfs_agnumber_t ag, xfs_extlen_t idx) -{ - xfs_log_busy_chunk_t *lbcp; - xfs_log_busy_slot_t *lbsp; - int i=0; - - /* - * If there are no free descriptors, allocate a new chunk - * of them and put it at the front of the chunk list. - */ - if (tp->t_busy_free == 0) { - lbcp = (xfs_log_busy_chunk_t*) - kmem_alloc(sizeof(xfs_log_busy_chunk_t), KM_SLEEP); - ASSERT(lbcp != NULL); - /* - * Initialize the chunk, and then - * claim the first slot in the newly allocated chunk. - */ - XFS_LBC_INIT(lbcp); - XFS_LBC_CLAIM(lbcp, 0); - lbcp->lbc_unused = 1; - lbsp = XFS_LBC_SLOT(lbcp, 0); - - /* - * Link in the new chunk and update the free count. - */ - lbcp->lbc_next = tp->t_busy.lbc_next; - tp->t_busy.lbc_next = lbcp; - tp->t_busy_free = XFS_LIC_NUM_SLOTS - 1; - - /* - * Initialize the descriptor and the generic portion - * of the log item. - * - * Point the new slot at this item and return it. - * Also point the log item at its currently active - * descriptor and set the item's mount pointer. - */ - lbsp->lbc_ag = ag; - lbsp->lbc_idx = idx; - return lbsp; - } - - /* - * Find the free descriptor. It is somewhere in the chunklist - * of descriptors. - */ - lbcp = &tp->t_busy; - while (lbcp != NULL) { - if (XFS_LBC_VACANCY(lbcp)) { - if (lbcp->lbc_unused <= XFS_LBC_MAX_SLOT) { - i = lbcp->lbc_unused; - break; - } else { - /* out-of-order vacancy */ - cmn_err(CE_DEBUG, "OOO vacancy lbcp 0x%p\n", lbcp); - ASSERT(0); - } - } - lbcp = lbcp->lbc_next; - } - ASSERT(lbcp != NULL); - /* - * If we find a free descriptor, claim it, - * initialize it, and return it. - */ - XFS_LBC_CLAIM(lbcp, i); - if (lbcp->lbc_unused <= i) { - lbcp->lbc_unused = i + 1; - } - lbsp = XFS_LBC_SLOT(lbcp, i); - tp->t_busy_free--; - lbsp->lbc_ag = ag; - lbsp->lbc_idx = idx; - return lbsp; -} - - -/* - * xfs_trans_free_busy - * Free all of the busy lists from a transaction - */ -void -xfs_trans_free_busy(xfs_trans_t *tp) -{ - xfs_log_busy_chunk_t *lbcp; - xfs_log_busy_chunk_t *lbcq; - - lbcp = tp->t_busy.lbc_next; - while (lbcp != NULL) { - lbcq = lbcp->lbc_next; - kmem_free(lbcp); - lbcp = lbcq; - } - - XFS_LBC_INIT(&tp->t_busy); - tp->t_busy.lbc_unused = 0; -} diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index 73e2ad39743..c6e4f2c8de6 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -35,13 +35,14 @@ struct xfs_log_item_desc *xfs_trans_find_item(struct xfs_trans *, struct xfs_log_item_desc *xfs_trans_first_item(struct xfs_trans *); struct xfs_log_item_desc *xfs_trans_next_item(struct xfs_trans *, struct xfs_log_item_desc *); -void xfs_trans_free_items(struct xfs_trans *, int); -void xfs_trans_unlock_items(struct xfs_trans *, - xfs_lsn_t); -void xfs_trans_free_busy(xfs_trans_t *tp); -xfs_log_busy_slot_t *xfs_trans_add_busy(xfs_trans_t *tp, - xfs_agnumber_t ag, - xfs_extlen_t idx); + +void xfs_trans_unlock_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn); +void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn, + int flags); + +void xfs_trans_item_committed(struct xfs_log_item *lip, + xfs_lsn_t commit_lsn, int aborted); +void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp); /* * AIL traversal cursor. diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h index b09904555d0..320775295e3 100644 --- a/fs/xfs/xfs_types.h +++ b/fs/xfs/xfs_types.h @@ -75,6 +75,8 @@ typedef __uint32_t xfs_dahash_t; /* dir/attr hash value */ typedef __uint16_t xfs_prid_t; /* prid_t truncated to 16bits in XFS */ +typedef __uint32_t xlog_tid_t; /* transaction ID type */ + /* * These types are 64 bits on disk but are either 32 or 64 bits in memory. * Disk based types: |