diff options
Diffstat (limited to 'fs')
125 files changed, 6166 insertions, 4817 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index ef966188611..2b78014a124 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -132,21 +132,19 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) options = tmp_options; while ((p = strsep(&options, ",")) != NULL) { - int token; + int token, r; if (!*p) continue; token = match_token(p, tokens, args); - if (token < Opt_uname) { - int r = match_int(&args[0], &option); + switch (token) { + case Opt_debug: + r = match_int(&args[0], &option); if (r < 0) { P9_DPRINTK(P9_DEBUG_ERROR, - "integer field, but no integer?\n"); + "integer field, but no integer?\n"); ret = r; continue; } - } - switch (token) { - case Opt_debug: v9ses->debug = option; #ifdef CONFIG_NET_9P_DEBUG p9_debug_level = option; @@ -154,12 +152,33 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) break; case Opt_dfltuid: + r = match_int(&args[0], &option); + if (r < 0) { + P9_DPRINTK(P9_DEBUG_ERROR, + "integer field, but no integer?\n"); + ret = r; + continue; + } v9ses->dfltuid = option; break; case Opt_dfltgid: + r = match_int(&args[0], &option); + if (r < 0) { + P9_DPRINTK(P9_DEBUG_ERROR, + "integer field, but no integer?\n"); + ret = r; + continue; + } v9ses->dfltgid = option; break; case Opt_afid: + r = match_int(&args[0], &option); + if (r < 0) { + P9_DPRINTK(P9_DEBUG_ERROR, + "integer field, but no integer?\n"); + ret = r; + continue; + } v9ses->afid = option; break; case Opt_uname: diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index 9c2bdda5cd9..598fff1a54e 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -165,9 +165,8 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) } while (rdir->head < rdir->tail) { p9stat_init(&st); - err = p9stat_read(rdir->buf + rdir->head, - rdir->tail - rdir->head, &st, - fid->clnt->proto_version); + err = p9stat_read(fid->clnt, rdir->buf + rdir->head, + rdir->tail - rdir->head, &st); if (err) { P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err); err = -EIO; @@ -231,7 +230,7 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent, while (err == 0) { if (rdir->tail == rdir->head) { err = p9_client_readdir(fid, rdir->buf, buflen, - filp->f_pos); + filp->f_pos); if (err <= 0) goto unlock_and_exit; @@ -241,10 +240,9 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent, while (rdir->head < rdir->tail) { - err = p9dirent_read(rdir->buf + rdir->head, - rdir->tail - rdir->head, - &curdirent, - fid->clnt->proto_version); + err = p9dirent_read(fid->clnt, rdir->buf + rdir->head, + rdir->tail - rdir->head, + &curdirent); if (err < 0) { P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err); err = -EIO; diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index e3c03db3c78..b5a1076aaa6 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -278,10 +278,8 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses, case S_IFSOCK: if (v9fs_proto_dotl(v9ses)) { inode->i_op = &v9fs_file_inode_operations_dotl; - inode->i_fop = &v9fs_file_operations_dotl; } else if (v9fs_proto_dotu(v9ses)) { inode->i_op = &v9fs_file_inode_operations; - inode->i_fop = &v9fs_file_operations; } else { P9_DPRINTK(P9_DEBUG_ERROR, "special files without extended mode\n"); diff --git a/fs/Makefile b/fs/Makefile index afc109691a9..d2c3353d547 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -120,6 +120,6 @@ obj-$(CONFIG_DEBUG_FS) += debugfs/ obj-$(CONFIG_OCFS2_FS) += ocfs2/ obj-$(CONFIG_BTRFS_FS) += btrfs/ obj-$(CONFIG_GFS2_FS) += gfs2/ -obj-$(CONFIG_EXOFS_FS) += exofs/ +obj-y += exofs/ # Multiple modules obj-$(CONFIG_CEPH_FS) += ceph/ obj-$(CONFIG_PSTORE) += pstore/ diff --git a/fs/attr.c b/fs/attr.c index 538e27959d3..7ee7ba48831 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -13,6 +13,7 @@ #include <linux/fsnotify.h> #include <linux/fcntl.h> #include <linux/security.h> +#include <linux/evm.h> /** * inode_change_ok - check if attribute changes to an inode are allowed @@ -237,8 +238,10 @@ int notify_change(struct dentry * dentry, struct iattr * attr) else error = simple_setattr(dentry, attr); - if (!error) + if (!error) { fsnotify_change(dentry, ia_valid); + evm_inode_post_setattr(dentry, ia_valid); + } return error; } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 538f65a79ec..dae5dfe41ba 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1047,7 +1047,16 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, if (!max_to_defrag) max_to_defrag = last_index - 1; - while (i <= last_index && defrag_count < max_to_defrag) { + /* + * make writeback starts from i, so the defrag range can be + * written sequentially. + */ + if (i < inode->i_mapping->writeback_index) + inode->i_mapping->writeback_index = i; + + while (i <= last_index && defrag_count < max_to_defrag && + (i < (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT)) { /* * make sure we stop running if someone unmounts * the FS diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 69565e5fc6a..426aa464f1a 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -383,36 +383,36 @@ int btrfs_removexattr(struct dentry *dentry, const char *name) XATTR_REPLACE); } -int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, - struct inode *inode, struct inode *dir, - const struct qstr *qstr) +int btrfs_initxattrs(struct inode *inode, const struct xattr *xattr_array, + void *fs_info) { - int err; - size_t len; - void *value; - char *suffix; + const struct xattr *xattr; + struct btrfs_trans_handle *trans = fs_info; char *name; + int err = 0; - err = security_inode_init_security(inode, dir, qstr, &suffix, &value, - &len); - if (err) { - if (err == -EOPNOTSUPP) - return 0; - return err; - } - - name = kmalloc(XATTR_SECURITY_PREFIX_LEN + strlen(suffix) + 1, - GFP_NOFS); - if (!name) { - err = -ENOMEM; - } else { + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + name = kmalloc(XATTR_SECURITY_PREFIX_LEN + + strlen(xattr->name) + 1, GFP_NOFS); + if (!name) { + err = -ENOMEM; + break; + } strcpy(name, XATTR_SECURITY_PREFIX); - strcpy(name + XATTR_SECURITY_PREFIX_LEN, suffix); - err = __btrfs_setxattr(trans, inode, name, value, len, 0); + strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name); + err = __btrfs_setxattr(trans, inode, name, + xattr->value, xattr->value_len, 0); kfree(name); + if (err < 0) + break; } - - kfree(suffix); - kfree(value); return err; } + +int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, + struct inode *inode, struct inode *dir, + const struct qstr *qstr) +{ + return security_inode_init_security(inode, dir, qstr, + &btrfs_initxattrs, trans); +} diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index f70d87d6ba6..d545a95c30e 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -3018,9 +3018,9 @@ cleanup_volume_info_contents(struct smb_vol *volume_info) { kfree(volume_info->username); kzfree(volume_info->password); - kfree(volume_info->UNC); if (volume_info->UNCip != volume_info->UNC + 2) kfree(volume_info->UNCip); + kfree(volume_info->UNC); kfree(volume_info->domainname); kfree(volume_info->iocharset); kfree(volume_info->prepath); diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index fde15b1a1eb..45f07c46f3e 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -22,6 +22,7 @@ #include <linux/fs.h> #include <linux/posix_acl_xattr.h> #include <linux/slab.h> +#include <linux/xattr.h> #include "cifsfs.h" #include "cifspdu.h" #include "cifsglob.h" @@ -31,16 +32,8 @@ #define MAX_EA_VALUE_SIZE 65535 #define CIFS_XATTR_DOS_ATTRIB "user.DosAttrib" #define CIFS_XATTR_CIFS_ACL "system.cifs_acl" -#define CIFS_XATTR_USER_PREFIX "user." -#define CIFS_XATTR_SYSTEM_PREFIX "system." -#define CIFS_XATTR_OS2_PREFIX "os2." -#define CIFS_XATTR_SECURITY_PREFIX "security." -#define CIFS_XATTR_TRUSTED_PREFIX "trusted." -#define XATTR_TRUSTED_PREFIX_LEN 8 -#define XATTR_SECURITY_PREFIX_LEN 9 -/* BB need to add server (Samba e.g) support for security and trusted prefix */ - +/* BB need to add server (Samba e.g) support for security and trusted prefix */ int cifs_removexattr(struct dentry *direntry, const char *ea_name) { @@ -76,8 +69,8 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name) } if (ea_name == NULL) { cFYI(1, "Null xattr names not supported"); - } else if (strncmp(ea_name, CIFS_XATTR_USER_PREFIX, 5) - && (strncmp(ea_name, CIFS_XATTR_OS2_PREFIX, 4))) { + } else if (strncmp(ea_name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) + && (strncmp(ea_name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN))) { cFYI(1, "illegal xattr request %s (only user namespace supported)", ea_name); @@ -88,7 +81,7 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name) if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) goto remove_ea_exit; - ea_name += 5; /* skip past user. prefix */ + ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */ rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, NULL, (__u16)0, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); @@ -149,21 +142,23 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name, if (ea_name == NULL) { cFYI(1, "Null xattr names not supported"); - } else if (strncmp(ea_name, CIFS_XATTR_USER_PREFIX, 5) == 0) { + } else if (strncmp(ea_name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) + == 0) { if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) goto set_ea_exit; if (strncmp(ea_name, CIFS_XATTR_DOS_ATTRIB, 14) == 0) cFYI(1, "attempt to set cifs inode metadata"); - ea_name += 5; /* skip past user. prefix */ + ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */ rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, ea_value, (__u16)value_size, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); - } else if (strncmp(ea_name, CIFS_XATTR_OS2_PREFIX, 4) == 0) { + } else if (strncmp(ea_name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) + == 0) { if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) goto set_ea_exit; - ea_name += 4; /* skip past os2. prefix */ + ea_name += XATTR_OS2_PREFIX_LEN; /* skip past os2. prefix */ rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, ea_value, (__u16)value_size, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); @@ -269,7 +264,8 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name, /* return alt name if available as pseudo attr */ if (ea_name == NULL) { cFYI(1, "Null xattr names not supported"); - } else if (strncmp(ea_name, CIFS_XATTR_USER_PREFIX, 5) == 0) { + } else if (strncmp(ea_name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) + == 0) { if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) goto get_ea_exit; @@ -277,15 +273,15 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name, cFYI(1, "attempt to query cifs inode metadata"); /* revalidate/getattr then populate from inode */ } /* BB add else when above is implemented */ - ea_name += 5; /* skip past user. prefix */ + ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */ rc = CIFSSMBQAllEAs(xid, pTcon, full_path, ea_name, ea_value, buf_size, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); - } else if (strncmp(ea_name, CIFS_XATTR_OS2_PREFIX, 4) == 0) { + } else if (strncmp(ea_name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) { if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) goto get_ea_exit; - ea_name += 4; /* skip past os2. prefix */ + ea_name += XATTR_OS2_PREFIX_LEN; /* skip past os2. prefix */ rc = CIFSSMBQAllEAs(xid, pTcon, full_path, ea_name, ea_value, buf_size, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); @@ -339,10 +335,10 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name, cFYI(1, "Query CIFS ACL not supported yet"); #endif /* CONFIG_CIFS_ACL */ } else if (strncmp(ea_name, - CIFS_XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) == 0) { + XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) == 0) { cFYI(1, "Trusted xattr namespace not supported yet"); } else if (strncmp(ea_name, - CIFS_XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) == 0) { + XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) == 0) { cFYI(1, "Security xattr namespace not supported yet"); } else cFYI(1, diff --git a/fs/coda/coda_linux.h b/fs/coda/coda_linux.h index 44e17e9c21a..cc0ea9fe5ec 100644 --- a/fs/coda/coda_linux.h +++ b/fs/coda/coda_linux.h @@ -59,12 +59,11 @@ void coda_sysctl_clean(void); #define CODA_ALLOC(ptr, cast, size) do { \ if (size < PAGE_SIZE) \ - ptr = kmalloc((unsigned long) size, GFP_KERNEL); \ + ptr = kzalloc((unsigned long) size, GFP_KERNEL); \ else \ - ptr = (cast)vmalloc((unsigned long) size); \ + ptr = (cast)vzalloc((unsigned long) size); \ if (!ptr) \ printk("kernel malloc returns 0 at %s:%d\n", __FILE__, __LINE__); \ - else memset( ptr, 0, size ); \ } while (0) diff --git a/fs/compat.c b/fs/compat.c index 58b1da45989..05e3f3d2cd7 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -37,7 +37,6 @@ #include <linux/dirent.h> #include <linux/fsnotify.h> #include <linux/highuid.h> -#include <linux/nfsd/syscall.h> #include <linux/personality.h> #include <linux/rwsem.h> #include <linux/tsacct_kern.h> diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index c83f4768eea..ca418aaf635 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -23,7 +23,8 @@ * * configfs Copyright (C) 2005 Oracle. All rights reserved. * - * Please see Documentation/filesystems/configfs.txt for more information. + * Please see Documentation/filesystems/configfs/configfs.txt for more + * information. */ #undef DEBUG diff --git a/fs/configfs/item.c b/fs/configfs/item.c index 76dc4c3e5d5..50cee7f9110 100644 --- a/fs/configfs/item.c +++ b/fs/configfs/item.c @@ -23,7 +23,7 @@ * * configfs Copyright (C) 2005 Oracle. All rights reserved. * - * Please see the file Documentation/filesystems/configfs.txt for + * Please see the file Documentation/filesystems/configfs/configfs.txt for * critical information about using the config_item interface. */ diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index e7a7a2f0732..f3a257d7a98 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -1,5 +1,5 @@ /* - * file.c - part of debugfs, a tiny little debug file system + * inode.c - part of debugfs, a tiny little debug file system * * Copyright (C) 2004 Greg Kroah-Hartman <greg@kroah.com> * Copyright (C) 2004 IBM Inc. diff --git a/fs/eventpoll.c b/fs/eventpoll.c index fe047d966dc..9026fc91fe3 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -700,7 +700,7 @@ static const struct file_operations eventpoll_fops = { .llseek = noop_llseek, }; -/* Fast test to see if the file is an evenpoll file */ +/* Fast test to see if the file is an eventpoll file */ static inline int is_file_epoll(struct file *f) { return f->f_op == &eventpoll_fops; diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild index c5a5855a6c4..352ba149d23 100644 --- a/fs/exofs/Kbuild +++ b/fs/exofs/Kbuild @@ -13,7 +13,8 @@ # # ore module library -obj-$(CONFIG_ORE) += ore.o +libore-y := ore.o ore_raid.o +obj-$(CONFIG_ORE) += libore.o exofs-y := inode.o file.o symlink.o namei.o dir.o super.o obj-$(CONFIG_EXOFS_FS) += exofs.o diff --git a/fs/exofs/Kconfig b/fs/exofs/Kconfig index 70bae414929..fa9a286c877 100644 --- a/fs/exofs/Kconfig +++ b/fs/exofs/Kconfig @@ -1,10 +1,17 @@ +# Note ORE needs to "select ASYNC_XOR". So Not to force multiple selects +# for every ORE user we do it like this. Any user should add itself here +# at the "depends on EXOFS_FS || ..." with an ||. The dependencies are +# selected here, and we default to "ON". So in effect it is like been +# selected by any of the users. config ORE tristate + depends on EXOFS_FS + select ASYNC_XOR + default SCSI_OSD_ULD config EXOFS_FS tristate "exofs: OSD based file system support" depends on SCSI_OSD_ULD - select ORE help EXOFS is a file system that uses an OSD storage device, as its backing storage. diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h index f4e442ec744..51f4b4c40f0 100644 --- a/fs/exofs/exofs.h +++ b/fs/exofs/exofs.h @@ -53,6 +53,10 @@ /* u64 has problems with printk this will cast it to unsigned long long */ #define _LLU(x) (unsigned long long)(x) +struct exofs_dev { + struct ore_dev ored; + unsigned did; +}; /* * our extension to the in-memory superblock */ @@ -66,13 +70,9 @@ struct exofs_sb_info { u32 s_next_generation; /* next gen # to use */ atomic_t s_curr_pending; /* number of pending commands */ - struct pnfs_osd_data_map data_map; /* Default raid to use - * FIXME: Needed ? - */ struct ore_layout layout; /* Default files layout */ struct ore_comp one_comp; /* id & cred of partition id=0*/ - struct ore_components comps; /* comps for the partition */ - struct osd_dev *_min_one_dev[1]; /* Place holder for one dev */ + struct ore_components oc; /* comps for the partition */ }; /* @@ -86,7 +86,7 @@ struct exofs_i_info { uint32_t i_dir_start_lookup; /* which page to start lookup */ uint64_t i_commit_size; /* the object's written length */ struct ore_comp one_comp; /* same component for all devices */ - struct ore_components comps; /* inode view of the device table */ + struct ore_components oc; /* inode view of the device table */ }; static inline osd_id exofs_oi_objno(struct exofs_i_info *oi) @@ -207,7 +207,7 @@ extern const struct inode_operations exofs_fast_symlink_inode_operations; * bigger and that the device table repeats twice. * See: exofs_read_lookup_dev_table() */ -static inline void exofs_init_comps(struct ore_components *comps, +static inline void exofs_init_comps(struct ore_components *oc, struct ore_comp *one_comp, struct exofs_sb_info *sbi, osd_id oid) { @@ -217,13 +217,15 @@ static inline void exofs_init_comps(struct ore_components *comps, one_comp->obj.id = oid; exofs_make_credential(one_comp->cred, &one_comp->obj); - comps->numdevs = sbi->comps.numdevs; - comps->single_comp = EC_SINGLE_COMP; - comps->comps = one_comp; + oc->first_dev = 0; + oc->numdevs = sbi->layout.group_width * sbi->layout.mirrors_p1 * + sbi->layout.group_count; + oc->single_comp = EC_SINGLE_COMP; + oc->comps = one_comp; /* Round robin device view of the table */ - first_dev = (dev_mod * sbi->layout.mirrors_p1) % sbi->comps.numdevs; - comps->ods = sbi->comps.ods + first_dev; + first_dev = (dev_mod * sbi->layout.mirrors_p1) % sbi->oc.numdevs; + oc->ods = &sbi->oc.ods[first_dev]; } #endif diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index f39a38fc234..3e5f3a6be90 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -37,11 +37,7 @@ #define EXOFS_DBGMSG2(M...) do {} while (0) -enum { BIO_MAX_PAGES_KMALLOC = - (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec), - MAX_PAGES_KMALLOC = - PAGE_SIZE / sizeof(struct page *), -}; +enum {MAX_PAGES_KMALLOC = PAGE_SIZE / sizeof(struct page *), }; unsigned exofs_max_io_pages(struct ore_layout *layout, unsigned expected_pages) @@ -49,8 +45,7 @@ unsigned exofs_max_io_pages(struct ore_layout *layout, unsigned pages = min_t(unsigned, expected_pages, MAX_PAGES_KMALLOC); /* TODO: easily support bio chaining */ - pages = min_t(unsigned, pages, - layout->group_width * BIO_MAX_PAGES_KMALLOC); + pages = min_t(unsigned, pages, layout->max_io_length / PAGE_SIZE); return pages; } @@ -68,6 +63,7 @@ struct page_collect { bool read_4_write; /* This means two things: that the read is sync * And the pages should not be unlocked. */ + struct page *that_locked_page; }; static void _pcol_init(struct page_collect *pcol, unsigned expected_pages, @@ -86,6 +82,7 @@ static void _pcol_init(struct page_collect *pcol, unsigned expected_pages, pcol->length = 0; pcol->pg_first = -1; pcol->read_4_write = false; + pcol->that_locked_page = NULL; } static void _pcol_reset(struct page_collect *pcol) @@ -98,6 +95,7 @@ static void _pcol_reset(struct page_collect *pcol) pcol->length = 0; pcol->pg_first = -1; pcol->ios = NULL; + pcol->that_locked_page = NULL; /* this is probably the end of the loop but in writes * it might not end here. don't be left with nothing @@ -149,14 +147,17 @@ static int pcol_add_page(struct page_collect *pcol, struct page *page, return 0; } +enum {PAGE_WAS_NOT_IN_IO = 17}; static int update_read_page(struct page *page, int ret) { - if (ret == 0) { + switch (ret) { + case 0: /* Everything is OK */ SetPageUptodate(page); if (PageError(page)) ClearPageError(page); - } else if (ret == -EFAULT) { + break; + case -EFAULT: /* In this case we were trying to read something that wasn't on * disk yet - return a page full of zeroes. This should be OK, * because the object should be empty (if there was a write @@ -167,16 +168,22 @@ static int update_read_page(struct page *page, int ret) SetPageUptodate(page); if (PageError(page)) ClearPageError(page); - ret = 0; /* recovered error */ EXOFS_DBGMSG("recovered read error\n"); - } else /* Error */ + /* fall through */ + case PAGE_WAS_NOT_IN_IO: + ret = 0; /* recovered error */ + break; + default: SetPageError(page); - + } return ret; } static void update_write_page(struct page *page, int ret) { + if (unlikely(ret == PAGE_WAS_NOT_IN_IO)) + return; /* don't pass start don't collect $200 */ + if (ret) { mapping_set_error(page->mapping, ret); SetPageError(page); @@ -190,15 +197,16 @@ static void update_write_page(struct page *page, int ret) static int __readpages_done(struct page_collect *pcol) { int i; - u64 resid; u64 good_bytes; u64 length = 0; - int ret = ore_check_io(pcol->ios, &resid); + int ret = ore_check_io(pcol->ios, NULL); - if (likely(!ret)) + if (likely(!ret)) { good_bytes = pcol->length; - else - good_bytes = pcol->length - resid; + ret = PAGE_WAS_NOT_IN_IO; + } else { + good_bytes = 0; + } EXOFS_DBGMSG2("readpages_done(0x%lx) good_bytes=0x%llx" " length=0x%lx nr_pages=%u\n", @@ -259,6 +267,46 @@ static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw) } } +static int _maybe_not_all_in_one_io(struct ore_io_state *ios, + struct page_collect *pcol_src, struct page_collect *pcol) +{ + /* length was wrong or offset was not page aligned */ + BUG_ON(pcol_src->nr_pages < ios->nr_pages); + + if (pcol_src->nr_pages > ios->nr_pages) { + struct page **src_page; + unsigned pages_less = pcol_src->nr_pages - ios->nr_pages; + unsigned long len_less = pcol_src->length - ios->length; + unsigned i; + int ret; + + /* This IO was trimmed */ + pcol_src->nr_pages = ios->nr_pages; + pcol_src->length = ios->length; + + /* Left over pages are passed to the next io */ + pcol->expected_pages += pages_less; + pcol->nr_pages = pages_less; + pcol->length = len_less; + src_page = pcol_src->pages + pcol_src->nr_pages; + pcol->pg_first = (*src_page)->index; + + ret = pcol_try_alloc(pcol); + if (unlikely(ret)) + return ret; + + for (i = 0; i < pages_less; ++i) + pcol->pages[i] = *src_page++; + + EXOFS_DBGMSG("Length was adjusted nr_pages=0x%x " + "pages_less=0x%x expected_pages=0x%x " + "next_offset=0x%llx next_len=0x%lx\n", + pcol_src->nr_pages, pages_less, pcol->expected_pages, + pcol->pg_first * PAGE_SIZE, pcol->length); + } + return 0; +} + static int read_exec(struct page_collect *pcol) { struct exofs_i_info *oi = exofs_i(pcol->inode); @@ -270,7 +318,7 @@ static int read_exec(struct page_collect *pcol) return 0; if (!pcol->ios) { - int ret = ore_get_rw_state(&pcol->sbi->layout, &oi->comps, true, + int ret = ore_get_rw_state(&pcol->sbi->layout, &oi->oc, true, pcol->pg_first << PAGE_CACHE_SHIFT, pcol->length, &pcol->ios); @@ -280,7 +328,6 @@ static int read_exec(struct page_collect *pcol) ios = pcol->ios; ios->pages = pcol->pages; - ios->nr_pages = pcol->nr_pages; if (pcol->read_4_write) { ore_read(pcol->ios); @@ -296,17 +343,23 @@ static int read_exec(struct page_collect *pcol) *pcol_copy = *pcol; ios->done = readpages_done; ios->private = pcol_copy; + + /* pages ownership was passed to pcol_copy */ + _pcol_reset(pcol); + + ret = _maybe_not_all_in_one_io(ios, pcol_copy, pcol); + if (unlikely(ret)) + goto err; + + EXOFS_DBGMSG2("read_exec(0x%lx) offset=0x%llx length=0x%llx\n", + pcol->inode->i_ino, _LLU(ios->offset), _LLU(ios->length)); + ret = ore_read(ios); if (unlikely(ret)) goto err; atomic_inc(&pcol->sbi->s_curr_pending); - EXOFS_DBGMSG2("read_exec obj=0x%llx start=0x%llx length=0x%lx\n", - oi->one_comp.obj.id, _LLU(ios->offset), pcol->length); - - /* pages ownership was passed to pcol_copy */ - _pcol_reset(pcol); return 0; err: @@ -341,6 +394,8 @@ static int readpage_strip(void *data, struct page *page) EXOFS_ERR("PageUptodate(0x%lx, 0x%lx)\n", pcol->inode->i_ino, page->index); + pcol->that_locked_page = page; + if (page->index < end_index) len = PAGE_CACHE_SIZE; else if (page->index == end_index) @@ -429,6 +484,10 @@ static int exofs_readpages(struct file *file, struct address_space *mapping, return ret; } + ret = read_exec(&pcol); + if (unlikely(ret)) + return ret; + return read_exec(&pcol); } @@ -462,17 +521,18 @@ static void writepages_done(struct ore_io_state *ios, void *p) { struct page_collect *pcol = p; int i; - u64 resid; u64 good_bytes; u64 length = 0; - int ret = ore_check_io(ios, &resid); + int ret = ore_check_io(ios, NULL); atomic_dec(&pcol->sbi->s_curr_pending); - if (likely(!ret)) + if (likely(!ret)) { good_bytes = pcol->length; - else - good_bytes = pcol->length - resid; + ret = PAGE_WAS_NOT_IN_IO; + } else { + good_bytes = 0; + } EXOFS_DBGMSG2("writepages_done(0x%lx) good_bytes=0x%llx" " length=0x%lx nr_pages=%u\n", @@ -505,6 +565,56 @@ static void writepages_done(struct ore_io_state *ios, void *p) EXOFS_DBGMSG2("writepages_done END\n"); } +static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate) +{ + struct page_collect *pcol = priv; + pgoff_t index = offset / PAGE_SIZE; + + if (!pcol->that_locked_page || + (pcol->that_locked_page->index != index)) { + struct page *page = find_get_page(pcol->inode->i_mapping, index); + + if (!page) { + page = find_or_create_page(pcol->inode->i_mapping, + index, GFP_NOFS); + if (unlikely(!page)) { + EXOFS_DBGMSG("grab_cache_page Failed " + "index=0x%llx\n", _LLU(index)); + return NULL; + } + unlock_page(page); + } + if (PageDirty(page) || PageWriteback(page)) + *uptodate = true; + else + *uptodate = PageUptodate(page); + EXOFS_DBGMSG("index=0x%lx uptodate=%d\n", index, *uptodate); + return page; + } else { + EXOFS_DBGMSG("YES that_locked_page index=0x%lx\n", + pcol->that_locked_page->index); + *uptodate = true; + return pcol->that_locked_page; + } +} + +static void __r4w_put_page(void *priv, struct page *page) +{ + struct page_collect *pcol = priv; + + if (pcol->that_locked_page != page) { + EXOFS_DBGMSG("index=0x%lx\n", page->index); + page_cache_release(page); + return; + } + EXOFS_DBGMSG("that_locked_page index=0x%lx\n", page->index); +} + +static const struct _ore_r4w_op _r4w_op = { + .get_page = &__r4w_get_page, + .put_page = &__r4w_put_page, +}; + static int write_exec(struct page_collect *pcol) { struct exofs_i_info *oi = exofs_i(pcol->inode); @@ -516,10 +626,9 @@ static int write_exec(struct page_collect *pcol) return 0; BUG_ON(pcol->ios); - ret = ore_get_rw_state(&pcol->sbi->layout, &oi->comps, false, + ret = ore_get_rw_state(&pcol->sbi->layout, &oi->oc, false, pcol->pg_first << PAGE_CACHE_SHIFT, pcol->length, &pcol->ios); - if (unlikely(ret)) goto err; @@ -534,10 +643,20 @@ static int write_exec(struct page_collect *pcol) ios = pcol->ios; ios->pages = pcol_copy->pages; - ios->nr_pages = pcol_copy->nr_pages; ios->done = writepages_done; + ios->r4w = &_r4w_op; ios->private = pcol_copy; + /* pages ownership was passed to pcol_copy */ + _pcol_reset(pcol); + + ret = _maybe_not_all_in_one_io(ios, pcol_copy, pcol); + if (unlikely(ret)) + goto err; + + EXOFS_DBGMSG2("write_exec(0x%lx) offset=0x%llx length=0x%llx\n", + pcol->inode->i_ino, _LLU(ios->offset), _LLU(ios->length)); + ret = ore_write(ios); if (unlikely(ret)) { EXOFS_ERR("write_exec: ore_write() Failed\n"); @@ -545,11 +664,6 @@ static int write_exec(struct page_collect *pcol) } atomic_inc(&pcol->sbi->s_curr_pending); - EXOFS_DBGMSG2("write_exec(0x%lx, 0x%llx) start=0x%llx length=0x%lx\n", - pcol->inode->i_ino, pcol->pg_first, _LLU(ios->offset), - pcol->length); - /* pages ownership was passed to pcol_copy */ - _pcol_reset(pcol); return 0; err: @@ -689,14 +803,33 @@ static int exofs_writepages(struct address_space *mapping, _pcol_init(&pcol, expected_pages, mapping->host); ret = write_cache_pages(mapping, wbc, writepage_strip, &pcol); - if (ret) { + if (unlikely(ret)) { EXOFS_ERR("write_cache_pages => %d\n", ret); return ret; } - return write_exec(&pcol); + ret = write_exec(&pcol); + if (unlikely(ret)) + return ret; + + if (wbc->sync_mode == WB_SYNC_ALL) { + return write_exec(&pcol); /* pump the last reminder */ + } else if (pcol.nr_pages) { + /* not SYNC let the reminder join the next writeout */ + unsigned i; + + for (i = 0; i < pcol.nr_pages; i++) { + struct page *page = pcol.pages[i]; + + end_page_writeback(page); + set_page_dirty(page); + unlock_page(page); + } + } + return 0; } +/* static int exofs_writepage(struct page *page, struct writeback_control *wbc) { struct page_collect pcol; @@ -712,7 +845,7 @@ static int exofs_writepage(struct page *page, struct writeback_control *wbc) return write_exec(&pcol); } - +*/ /* i_mutex held using inode->i_size directly */ static void _write_failed(struct inode *inode, loff_t to) { @@ -818,7 +951,7 @@ static void exofs_invalidatepage(struct page *page, unsigned long offset) const struct address_space_operations exofs_aops = { .readpage = exofs_readpage, .readpages = exofs_readpages, - .writepage = exofs_writepage, + .writepage = NULL, .writepages = exofs_writepages, .write_begin = exofs_write_begin_export, .write_end = exofs_write_end, @@ -860,7 +993,7 @@ static int _do_truncate(struct inode *inode, loff_t newsize) inode->i_mtime = inode->i_ctime = CURRENT_TIME; - ret = ore_truncate(&sbi->layout, &oi->comps, (u64)newsize); + ret = ore_truncate(&sbi->layout, &oi->oc, (u64)newsize); if (likely(!ret)) truncate_setsize(inode, newsize); @@ -927,14 +1060,14 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, struct exofs_on_disk_inode_layout *layout; int ret; - ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios); + ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios); if (unlikely(ret)) { EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); return ret; } - attrs[1].len = exofs_on_disk_inode_layout_size(sbi->comps.numdevs); - attrs[2].len = exofs_on_disk_inode_layout_size(sbi->comps.numdevs); + attrs[1].len = exofs_on_disk_inode_layout_size(sbi->oc.numdevs); + attrs[2].len = exofs_on_disk_inode_layout_size(sbi->oc.numdevs); ios->in_attr = attrs; ios->in_attr_len = ARRAY_SIZE(attrs); @@ -1018,7 +1151,7 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino) return inode; oi = exofs_i(inode); __oi_init(oi); - exofs_init_comps(&oi->comps, &oi->one_comp, sb->s_fs_info, + exofs_init_comps(&oi->oc, &oi->one_comp, sb->s_fs_info, exofs_oi_objno(oi)); /* read the inode from the osd */ @@ -1172,13 +1305,13 @@ struct inode *exofs_new_inode(struct inode *dir, int mode) spin_unlock(&sbi->s_next_gen_lock); insert_inode_hash(inode); - exofs_init_comps(&oi->comps, &oi->one_comp, sb->s_fs_info, + exofs_init_comps(&oi->oc, &oi->one_comp, sb->s_fs_info, exofs_oi_objno(oi)); exofs_sbi_write_stats(sbi); /* Make sure new sbi->s_nextid is on disk */ mark_inode_dirty(inode); - ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios); + ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios); if (unlikely(ret)) { EXOFS_ERR("exofs_new_inode: ore_get_io_state failed\n"); return ERR_PTR(ret); @@ -1267,7 +1400,7 @@ static int exofs_update_inode(struct inode *inode, int do_sync) } else memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data)); - ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios); + ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios); if (unlikely(ret)) { EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); goto free_args; @@ -1350,7 +1483,7 @@ void exofs_evict_inode(struct inode *inode) /* ignore the error, attempt a remove anyway */ /* Now Remove the OSD objects */ - ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios); + ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios); if (unlikely(ret)) { EXOFS_ERR("%s: ore_get_io_state failed\n", __func__); return; diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index 25305af8819..fcfa86ae6fa 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -24,76 +24,287 @@ #include <linux/slab.h> #include <asm/div64.h> +#include <linux/lcm.h> -#include <scsi/osd_ore.h> +#include "ore_raid.h" -#define ORE_ERR(fmt, a...) printk(KERN_ERR "ore: " fmt, ##a) +MODULE_AUTHOR("Boaz Harrosh <bharrosh@panasas.com>"); +MODULE_DESCRIPTION("Objects Raid Engine ore.ko"); +MODULE_LICENSE("GPL"); + +/* ore_verify_layout does a couple of things: + * 1. Given a minimum number of needed parameters fixes up the rest of the + * members to be operatonals for the ore. The needed parameters are those + * that are defined by the pnfs-objects layout STD. + * 2. Check to see if the current ore code actually supports these parameters + * for example stripe_unit must be a multple of the system PAGE_SIZE, + * and etc... + * 3. Cache some havily used calculations that will be needed by users. + */ + +enum { BIO_MAX_PAGES_KMALLOC = + (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),}; -#ifdef CONFIG_EXOFS_DEBUG -#define ORE_DBGMSG(fmt, a...) \ - printk(KERN_NOTICE "ore @%s:%d: " fmt, __func__, __LINE__, ##a) -#else -#define ORE_DBGMSG(fmt, a...) \ - do { if (0) printk(fmt, ##a); } while (0) -#endif +int ore_verify_layout(unsigned total_comps, struct ore_layout *layout) +{ + u64 stripe_length; + + switch (layout->raid_algorithm) { + case PNFS_OSD_RAID_0: + layout->parity = 0; + break; + case PNFS_OSD_RAID_5: + layout->parity = 1; + break; + case PNFS_OSD_RAID_PQ: + case PNFS_OSD_RAID_4: + default: + ORE_ERR("Only RAID_0/5 for now\n"); + return -EINVAL; + } + if (0 != (layout->stripe_unit & ~PAGE_MASK)) { + ORE_ERR("Stripe Unit(0x%llx)" + " must be Multples of PAGE_SIZE(0x%lx)\n", + _LLU(layout->stripe_unit), PAGE_SIZE); + return -EINVAL; + } + if (layout->group_width) { + if (!layout->group_depth) { + ORE_ERR("group_depth == 0 && group_width != 0\n"); + return -EINVAL; + } + if (total_comps < (layout->group_width * layout->mirrors_p1)) { + ORE_ERR("Data Map wrong, " + "numdevs=%d < group_width=%d * mirrors=%d\n", + total_comps, layout->group_width, + layout->mirrors_p1); + return -EINVAL; + } + layout->group_count = total_comps / layout->mirrors_p1 / + layout->group_width; + } else { + if (layout->group_depth) { + printk(KERN_NOTICE "Warning: group_depth ignored " + "group_width == 0 && group_depth == %lld\n", + _LLU(layout->group_depth)); + } + layout->group_width = total_comps / layout->mirrors_p1; + layout->group_depth = -1; + layout->group_count = 1; + } -/* u64 has problems with printk this will cast it to unsigned long long */ -#define _LLU(x) (unsigned long long)(x) + stripe_length = (u64)layout->group_width * layout->stripe_unit; + if (stripe_length >= (1ULL << 32)) { + ORE_ERR("Stripe_length(0x%llx) >= 32bit is not supported\n", + _LLU(stripe_length)); + return -EINVAL; + } -#define ORE_DBGMSG2(M...) do {} while (0) -/* #define ORE_DBGMSG2 ORE_DBGMSG */ + layout->max_io_length = + (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE - layout->stripe_unit) * + layout->group_width; + if (layout->parity) { + unsigned stripe_length = + (layout->group_width - layout->parity) * + layout->stripe_unit; -MODULE_AUTHOR("Boaz Harrosh <bharrosh@panasas.com>"); -MODULE_DESCRIPTION("Objects Raid Engine ore.ko"); -MODULE_LICENSE("GPL"); + layout->max_io_length /= stripe_length; + layout->max_io_length *= stripe_length; + } + return 0; +} +EXPORT_SYMBOL(ore_verify_layout); static u8 *_ios_cred(struct ore_io_state *ios, unsigned index) { - return ios->comps->comps[index & ios->comps->single_comp].cred; + return ios->oc->comps[index & ios->oc->single_comp].cred; } static struct osd_obj_id *_ios_obj(struct ore_io_state *ios, unsigned index) { - return &ios->comps->comps[index & ios->comps->single_comp].obj; + return &ios->oc->comps[index & ios->oc->single_comp].obj; } static struct osd_dev *_ios_od(struct ore_io_state *ios, unsigned index) { - return ios->comps->ods[index]; + ORE_DBGMSG2("oc->first_dev=%d oc->numdevs=%d i=%d oc->ods=%p\n", + ios->oc->first_dev, ios->oc->numdevs, index, + ios->oc->ods); + + return ore_comp_dev(ios->oc, index); } -int ore_get_rw_state(struct ore_layout *layout, struct ore_components *comps, +int _ore_get_io_state(struct ore_layout *layout, + struct ore_components *oc, unsigned numdevs, + unsigned sgs_per_dev, unsigned num_par_pages, + struct ore_io_state **pios) +{ + struct ore_io_state *ios; + struct page **pages; + struct osd_sg_entry *sgilist; + struct __alloc_all_io_state { + struct ore_io_state ios; + struct ore_per_dev_state per_dev[numdevs]; + union { + struct osd_sg_entry sglist[sgs_per_dev * numdevs]; + struct page *pages[num_par_pages]; + }; + } *_aios; + + if (likely(sizeof(*_aios) <= PAGE_SIZE)) { + _aios = kzalloc(sizeof(*_aios), GFP_KERNEL); + if (unlikely(!_aios)) { + ORE_DBGMSG("Failed kzalloc bytes=%zd\n", + sizeof(*_aios)); + *pios = NULL; + return -ENOMEM; + } + pages = num_par_pages ? _aios->pages : NULL; + sgilist = sgs_per_dev ? _aios->sglist : NULL; + ios = &_aios->ios; + } else { + struct __alloc_small_io_state { + struct ore_io_state ios; + struct ore_per_dev_state per_dev[numdevs]; + } *_aio_small; + union __extra_part { + struct osd_sg_entry sglist[sgs_per_dev * numdevs]; + struct page *pages[num_par_pages]; + } *extra_part; + + _aio_small = kzalloc(sizeof(*_aio_small), GFP_KERNEL); + if (unlikely(!_aio_small)) { + ORE_DBGMSG("Failed alloc first part bytes=%zd\n", + sizeof(*_aio_small)); + *pios = NULL; + return -ENOMEM; + } + extra_part = kzalloc(sizeof(*extra_part), GFP_KERNEL); + if (unlikely(!extra_part)) { + ORE_DBGMSG("Failed alloc second part bytes=%zd\n", + sizeof(*extra_part)); + kfree(_aio_small); + *pios = NULL; + return -ENOMEM; + } + + pages = num_par_pages ? extra_part->pages : NULL; + sgilist = sgs_per_dev ? extra_part->sglist : NULL; + /* In this case the per_dev[0].sgilist holds the pointer to + * be freed + */ + ios = &_aio_small->ios; + ios->extra_part_alloc = true; + } + + if (pages) { + ios->parity_pages = pages; + ios->max_par_pages = num_par_pages; + } + if (sgilist) { + unsigned d; + + for (d = 0; d < numdevs; ++d) { + ios->per_dev[d].sglist = sgilist; + sgilist += sgs_per_dev; + } + ios->sgs_per_dev = sgs_per_dev; + } + + ios->layout = layout; + ios->oc = oc; + *pios = ios; + return 0; +} + +/* Allocate an io_state for only a single group of devices + * + * If a user needs to call ore_read/write() this version must be used becase it + * allocates extra stuff for striping and raid. + * The ore might decide to only IO less then @length bytes do to alignmets + * and constrains as follows: + * - The IO cannot cross group boundary. + * - In raid5/6 The end of the IO must align at end of a stripe eg. + * (@offset + @length) % strip_size == 0. Or the complete range is within a + * single stripe. + * - Memory condition only permitted a shorter IO. (A user can use @length=~0 + * And check the returned ios->length for max_io_size.) + * + * The caller must check returned ios->length (and/or ios->nr_pages) and + * re-issue these pages that fall outside of ios->length + */ +int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc, bool is_reading, u64 offset, u64 length, struct ore_io_state **pios) { struct ore_io_state *ios; + unsigned numdevs = layout->group_width * layout->mirrors_p1; + unsigned sgs_per_dev = 0, max_par_pages = 0; + int ret; - /*TODO: Maybe use kmem_cach per sbi of size - * exofs_io_state_size(layout->s_numdevs) - */ - ios = kzalloc(ore_io_state_size(comps->numdevs), GFP_KERNEL); - if (unlikely(!ios)) { - ORE_DBGMSG("Failed kzalloc bytes=%d\n", - ore_io_state_size(comps->numdevs)); - *pios = NULL; - return -ENOMEM; + if (layout->parity && length) { + unsigned data_devs = layout->group_width - layout->parity; + unsigned stripe_size = layout->stripe_unit * data_devs; + unsigned pages_in_unit = layout->stripe_unit / PAGE_SIZE; + u32 remainder; + u64 num_stripes; + u64 num_raid_units; + + num_stripes = div_u64_rem(length, stripe_size, &remainder); + if (remainder) + ++num_stripes; + + num_raid_units = num_stripes * layout->parity; + + if (is_reading) { + /* For reads add per_dev sglist array */ + /* TODO: Raid 6 we need twice more. Actually: + * num_stripes / LCMdP(W,P); + * if (W%P != 0) num_stripes *= parity; + */ + + /* first/last seg is split */ + num_raid_units += layout->group_width; + sgs_per_dev = div_u64(num_raid_units, data_devs); + } else { + /* For Writes add parity pages array. */ + max_par_pages = num_raid_units * pages_in_unit * + sizeof(struct page *); + } } - ios->layout = layout; - ios->comps = comps; - ios->offset = offset; - ios->length = length; + ret = _ore_get_io_state(layout, oc, numdevs, sgs_per_dev, max_par_pages, + pios); + if (unlikely(ret)) + return ret; + + ios = *pios; ios->reading = is_reading; + ios->offset = offset; + + if (length) { + ore_calc_stripe_info(layout, offset, length, &ios->si); + ios->length = ios->si.length; + ios->nr_pages = (ios->length + PAGE_SIZE - 1) / PAGE_SIZE; + if (layout->parity) + _ore_post_alloc_raid_stuff(ios); + } - *pios = ios; return 0; } EXPORT_SYMBOL(ore_get_rw_state); -int ore_get_io_state(struct ore_layout *layout, struct ore_components *comps, - struct ore_io_state **ios) +/* Allocate an io_state for all the devices in the comps array + * + * This version of io_state allocation is used mostly by create/remove + * and trunc where we currently need all the devices. The only wastful + * bit is the read/write_attributes with no IO. Those sites should + * be converted to use ore_get_rw_state() with length=0 + */ +int ore_get_io_state(struct ore_layout *layout, struct ore_components *oc, + struct ore_io_state **pios) { - return ore_get_rw_state(layout, comps, true, 0, 0, ios); + return _ore_get_io_state(layout, oc, oc->numdevs, 0, 0, pios); } EXPORT_SYMBOL(ore_get_io_state); @@ -111,6 +322,7 @@ void ore_put_io_state(struct ore_io_state *ios) bio_put(per_dev->bio); } + _ore_free_raid_stuff(ios); kfree(ios); } } @@ -138,7 +350,7 @@ static void _done_io(struct osd_request *or, void *p) kref_put(&ios->kref, _last_io); } -static int ore_io_execute(struct ore_io_state *ios) +int ore_io_execute(struct ore_io_state *ios) { DECLARE_COMPLETION_ONSTACK(wait); bool sync = (ios->done == NULL); @@ -198,7 +410,7 @@ static void _clear_bio(struct bio *bio) } } -int ore_check_io(struct ore_io_state *ios, u64 *resid) +int ore_check_io(struct ore_io_state *ios, ore_on_dev_error on_dev_error) { enum osd_err_priority acumulated_osd_err = 0; int acumulated_lin_err = 0; @@ -206,7 +418,8 @@ int ore_check_io(struct ore_io_state *ios, u64 *resid) for (i = 0; i < ios->numdevs; i++) { struct osd_sense_info osi; - struct osd_request *or = ios->per_dev[i].or; + struct ore_per_dev_state *per_dev = &ios->per_dev[i]; + struct osd_request *or = per_dev->or; int ret; if (unlikely(!or)) @@ -218,29 +431,31 @@ int ore_check_io(struct ore_io_state *ios, u64 *resid) if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) { /* start read offset passed endof file */ - _clear_bio(ios->per_dev[i].bio); + _clear_bio(per_dev->bio); ORE_DBGMSG("start read offset passed end of file " "offset=0x%llx, length=0x%llx\n", - _LLU(ios->per_dev[i].offset), - _LLU(ios->per_dev[i].length)); + _LLU(per_dev->offset), + _LLU(per_dev->length)); continue; /* we recovered */ } + if (on_dev_error) { + u64 residual = ios->reading ? + or->in.residual : or->out.residual; + u64 offset = (ios->offset + ios->length) - residual; + struct ore_dev *od = ios->oc->ods[ + per_dev->dev - ios->oc->first_dev]; + + on_dev_error(ios, od, per_dev->dev, osi.osd_err_pri, + offset, residual); + } if (osi.osd_err_pri >= acumulated_osd_err) { acumulated_osd_err = osi.osd_err_pri; acumulated_lin_err = ret; } } - /* TODO: raid specific residual calculations */ - if (resid) { - if (likely(!acumulated_lin_err)) - *resid = 0; - else - *resid = ios->length; - } - return acumulated_lin_err; } EXPORT_SYMBOL(ore_check_io); @@ -248,61 +463,65 @@ EXPORT_SYMBOL(ore_check_io); /* * L - logical offset into the file * - * U - The number of bytes in a stripe within a group + * D - number of Data devices + * D = group_width - parity * - * U = stripe_unit * group_width + * U - The number of bytes in a stripe within a group + * U = stripe_unit * D * * T - The number of bytes striped within a group of component objects * (before advancing to the next group) - * - * T = stripe_unit * group_width * group_depth + * T = U * group_depth * * S - The number of bytes striped across all component objects * before the pattern repeats + * S = T * group_count * - * S = stripe_unit * group_width * group_depth * group_count - * - * M - The "major" (i.e., across all components) stripe number - * + * M - The "major" (i.e., across all components) cycle number * M = L / S * - * G - Counts the groups from the beginning of the major stripe - * + * G - Counts the groups from the beginning of the major cycle * G = (L - (M * S)) / T [or (L % S) / T] * * H - The byte offset within the group - * * H = (L - (M * S)) % T [or (L % S) % T] * * N - The "minor" (i.e., across the group) stripe number - * * N = H / U * * C - The component index coresponding to L * - * C = (H - (N * U)) / stripe_unit + G * group_width - * [or (L % U) / stripe_unit + G * group_width] + * C = (H - (N * U)) / stripe_unit + G * D + * [or (L % U) / stripe_unit + G * D] * * O - The component offset coresponding to L - * * O = L % stripe_unit + N * stripe_unit + M * group_depth * stripe_unit + * + * LCMdP – Parity cycle: Lowest Common Multiple of group_width, parity + * divide by parity + * LCMdP = lcm(group_width, parity) / parity + * + * R - The parity Rotation stripe + * (Note parity cycle always starts at a group's boundary) + * R = N % LCMdP + * + * I = the first parity device index + * I = (group_width + group_width - R*parity - parity) % group_width + * + * Craid - The component index Rotated + * Craid = (group_width + C - R*parity) % group_width + * (We add the group_width to avoid negative numbers modulo math) */ -struct _striping_info { - u64 obj_offset; - u64 group_length; - u64 M; /* for truncate */ - unsigned dev; - unsigned unit_off; -}; - -static void _calc_stripe_info(struct ore_layout *layout, u64 file_offset, - struct _striping_info *si) +void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset, + u64 length, struct ore_striping_info *si) { u32 stripe_unit = layout->stripe_unit; u32 group_width = layout->group_width; u64 group_depth = layout->group_depth; + u32 parity = layout->parity; - u32 U = stripe_unit * group_width; + u32 D = group_width - parity; + u32 U = D * stripe_unit; u64 T = U * group_depth; u64 S = T * layout->group_count; u64 M = div64_u64(file_offset, S); @@ -318,39 +537,65 @@ static void _calc_stripe_info(struct ore_layout *layout, u64 file_offset, u32 N = div_u64(H, U); /* "H - (N * U)" is just "H % U" so it's bound to u32 */ - si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width; - si->dev *= layout->mirrors_p1; + u32 C = (u32)(H - (N * U)) / stripe_unit + G * group_width; div_u64_rem(file_offset, stripe_unit, &si->unit_off); si->obj_offset = si->unit_off + (N * stripe_unit) + (M * group_depth * stripe_unit); - si->group_length = T - H; + if (parity) { + u32 LCMdP = lcm(group_width, parity) / parity; + /* R = N % LCMdP; */ + u32 RxP = (N % LCMdP) * parity; + u32 first_dev = C - C % group_width; + + si->par_dev = (group_width + group_width - parity - RxP) % + group_width + first_dev; + si->dev = (group_width + C - RxP) % group_width + first_dev; + si->bytes_in_stripe = U; + si->first_stripe_start = M * S + G * T + N * U; + } else { + /* Make the math correct see _prepare_one_group */ + si->par_dev = group_width; + si->dev = C; + } + + si->dev *= layout->mirrors_p1; + si->par_dev *= layout->mirrors_p1; + si->offset = file_offset; + si->length = T - H; + if (si->length > length) + si->length = length; si->M = M; } +EXPORT_SYMBOL(ore_calc_stripe_info); -static int _add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, - unsigned pgbase, struct ore_per_dev_state *per_dev, - int cur_len) +int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, + unsigned pgbase, struct page **pages, + struct ore_per_dev_state *per_dev, int cur_len) { unsigned pg = *cur_pg; struct request_queue *q = osd_request_queue(_ios_od(ios, per_dev->dev)); - - per_dev->length += cur_len; + unsigned len = cur_len; + int ret; if (per_dev->bio == NULL) { unsigned pages_in_stripe = ios->layout->group_width * (ios->layout->stripe_unit / PAGE_SIZE); - unsigned bio_size = (ios->nr_pages + pages_in_stripe) / - ios->layout->group_width; + unsigned nr_pages = ios->nr_pages * ios->layout->group_width / + (ios->layout->group_width - + ios->layout->parity); + unsigned bio_size = (nr_pages + pages_in_stripe) / + ios->layout->group_width; per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size); if (unlikely(!per_dev->bio)) { ORE_DBGMSG("Failed to allocate BIO size=%u\n", bio_size); - return -ENOMEM; + ret = -ENOMEM; + goto out; } } @@ -358,64 +603,90 @@ static int _add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len); unsigned added_len; - BUG_ON(ios->nr_pages <= pg); cur_len -= pglen; - added_len = bio_add_pc_page(q, per_dev->bio, ios->pages[pg], + added_len = bio_add_pc_page(q, per_dev->bio, pages[pg], pglen, pgbase); - if (unlikely(pglen != added_len)) - return -ENOMEM; + if (unlikely(pglen != added_len)) { + ORE_DBGMSG("Failed bio_add_pc_page bi_vcnt=%u\n", + per_dev->bio->bi_vcnt); + ret = -ENOMEM; + goto out; + } + _add_stripe_page(ios->sp2d, &ios->si, pages[pg]); + pgbase = 0; ++pg; } BUG_ON(cur_len); + per_dev->length += len; *cur_pg = pg; - return 0; + ret = 0; +out: /* we fail the complete unit on an error eg don't advance + * per_dev->length and cur_pg. This means that we might have a bigger + * bio than the CDB requested length (per_dev->length). That's fine + * only the oposite is fatal. + */ + return ret; } -static int _prepare_one_group(struct ore_io_state *ios, u64 length, - struct _striping_info *si) +static int _prepare_for_striping(struct ore_io_state *ios) { + struct ore_striping_info *si = &ios->si; unsigned stripe_unit = ios->layout->stripe_unit; unsigned mirrors_p1 = ios->layout->mirrors_p1; - unsigned devs_in_group = ios->layout->group_width * mirrors_p1; + unsigned group_width = ios->layout->group_width; + unsigned devs_in_group = group_width * mirrors_p1; unsigned dev = si->dev; unsigned first_dev = dev - (dev % devs_in_group); - unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0; + unsigned dev_order; unsigned cur_pg = ios->pages_consumed; + u64 length = ios->length; int ret = 0; + if (!ios->pages) { + ios->numdevs = ios->layout->mirrors_p1; + return 0; + } + + BUG_ON(length > si->length); + + dev_order = _dev_order(devs_in_group, mirrors_p1, si->par_dev, dev); + si->cur_comp = dev_order; + si->cur_pg = si->unit_off / PAGE_SIZE; + while (length) { - struct ore_per_dev_state *per_dev = &ios->per_dev[dev]; + unsigned comp = dev - first_dev; + struct ore_per_dev_state *per_dev = &ios->per_dev[comp]; unsigned cur_len, page_off = 0; if (!per_dev->length) { per_dev->dev = dev; - if (dev < si->dev) { - per_dev->offset = si->obj_offset + stripe_unit - - si->unit_off; - cur_len = stripe_unit; - } else if (dev == si->dev) { + if (dev == si->dev) { + WARN_ON(dev == si->par_dev); per_dev->offset = si->obj_offset; cur_len = stripe_unit - si->unit_off; page_off = si->unit_off & ~PAGE_MASK; BUG_ON(page_off && (page_off != ios->pgbase)); - } else { /* dev > si->dev */ - per_dev->offset = si->obj_offset - si->unit_off; + } else { + if (si->cur_comp > dev_order) + per_dev->offset = + si->obj_offset - si->unit_off; + else /* si->cur_comp < dev_order */ + per_dev->offset = + si->obj_offset + stripe_unit - + si->unit_off; cur_len = stripe_unit; } - - if (max_comp < dev) - max_comp = dev; } else { cur_len = stripe_unit; } if (cur_len >= length) cur_len = length; - ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev, - cur_len); + ret = _ore_add_stripe_unit(ios, &cur_pg, page_off, ios->pages, + per_dev, cur_len); if (unlikely(ret)) goto out; @@ -423,60 +694,60 @@ static int _prepare_one_group(struct ore_io_state *ios, u64 length, dev = (dev % devs_in_group) + first_dev; length -= cur_len; - } -out: - ios->numdevs = max_comp + mirrors_p1; - ios->pages_consumed = cur_pg; - return ret; -} - -static int _prepare_for_striping(struct ore_io_state *ios) -{ - u64 length = ios->length; - u64 offset = ios->offset; - struct _striping_info si; - int ret = 0; - if (!ios->pages) { - if (ios->kern_buff) { - struct ore_per_dev_state *per_dev = &ios->per_dev[0]; + si->cur_comp = (si->cur_comp + 1) % group_width; + if (unlikely((dev == si->par_dev) || (!length && ios->sp2d))) { + if (!length && ios->sp2d) { + /* If we are writing and this is the very last + * stripe. then operate on parity dev. + */ + dev = si->par_dev; + } + if (ios->sp2d) + /* In writes cur_len just means if it's the + * last one. See _ore_add_parity_unit. + */ + cur_len = length; + per_dev = &ios->per_dev[dev - first_dev]; + if (!per_dev->length) { + /* Only/always the parity unit of the first + * stripe will be empty. So this is a chance to + * initialize the per_dev info. + */ + per_dev->dev = dev; + per_dev->offset = si->obj_offset - si->unit_off; + } - _calc_stripe_info(ios->layout, ios->offset, &si); - per_dev->offset = si.obj_offset; - per_dev->dev = si.dev; + ret = _ore_add_parity_unit(ios, si, per_dev, cur_len); + if (unlikely(ret)) + goto out; - /* no cross device without page array */ - BUG_ON((ios->layout->group_width > 1) && - (si.unit_off + ios->length > - ios->layout->stripe_unit)); + /* Rotate next par_dev backwards with wraping */ + si->par_dev = (devs_in_group + si->par_dev - + ios->layout->parity * mirrors_p1) % + devs_in_group + first_dev; + /* Next stripe, start fresh */ + si->cur_comp = 0; + si->cur_pg = 0; } - ios->numdevs = ios->layout->mirrors_p1; - return 0; - } - - while (length) { - _calc_stripe_info(ios->layout, offset, &si); - - if (length < si.group_length) - si.group_length = length; - - ret = _prepare_one_group(ios, si.group_length, &si); - if (unlikely(ret)) - goto out; - - offset += si.group_length; - length -= si.group_length; } - out: - return ret; + ios->numdevs = devs_in_group; + ios->pages_consumed = cur_pg; + if (unlikely(ret)) { + if (length == ios->length) + return ret; + else + ios->length -= length; + } + return 0; } int ore_create(struct ore_io_state *ios) { int i, ret; - for (i = 0; i < ios->comps->numdevs; i++) { + for (i = 0; i < ios->oc->numdevs; i++) { struct osd_request *or; or = osd_start_request(_ios_od(ios, i), GFP_KERNEL); @@ -501,7 +772,7 @@ int ore_remove(struct ore_io_state *ios) { int i, ret; - for (i = 0; i < ios->comps->numdevs; i++) { + for (i = 0; i < ios->oc->numdevs; i++) { struct osd_request *or; or = osd_start_request(_ios_od(ios, i), GFP_KERNEL); @@ -543,7 +814,6 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp) goto out; } per_dev->or = or; - per_dev->offset = master_dev->offset; if (ios->pages) { struct bio *bio; @@ -562,6 +832,7 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp) __bio_clone(bio, master_dev->bio); bio->bi_bdev = NULL; bio->bi_next = NULL; + per_dev->offset = master_dev->offset; per_dev->length = master_dev->length; per_dev->bio = bio; per_dev->dev = dev; @@ -579,7 +850,15 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp) _LLU(per_dev->offset), _LLU(per_dev->length), dev); } else if (ios->kern_buff) { - ret = osd_req_write_kern(or, _ios_obj(ios, dev), + per_dev->offset = ios->si.obj_offset; + per_dev->dev = ios->si.dev + dev; + + /* no cross device without page array */ + BUG_ON((ios->layout->group_width > 1) && + (ios->si.unit_off + ios->length > + ios->layout->stripe_unit)); + + ret = osd_req_write_kern(or, _ios_obj(ios, per_dev->dev), per_dev->offset, ios->kern_buff, ios->length); if (unlikely(ret)) @@ -588,7 +867,7 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp) "length=0x%llx dev=%d\n", _LLU(_ios_obj(ios, dev)->id), _LLU(per_dev->offset), - _LLU(ios->length), dev); + _LLU(ios->length), per_dev->dev); } else { osd_req_set_attributes(or, _ios_obj(ios, dev)); ORE_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n", @@ -614,6 +893,14 @@ int ore_write(struct ore_io_state *ios) int i; int ret; + if (unlikely(ios->sp2d && !ios->r4w)) { + /* A library is attempting a RAID-write without providing + * a pages lock interface. + */ + WARN_ON_ONCE(1); + return -ENOTSUPP; + } + ret = _prepare_for_striping(ios); if (unlikely(ret)) return ret; @@ -629,7 +916,7 @@ int ore_write(struct ore_io_state *ios) } EXPORT_SYMBOL(ore_write); -static int _read_mirror(struct ore_io_state *ios, unsigned cur_comp) +int _ore_read_mirror(struct ore_io_state *ios, unsigned cur_comp) { struct osd_request *or; struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp]; @@ -648,22 +935,27 @@ static int _read_mirror(struct ore_io_state *ios, unsigned cur_comp) per_dev->or = or; if (ios->pages) { - osd_req_read(or, obj, per_dev->offset, - per_dev->bio, per_dev->length); + if (per_dev->cur_sg) { + /* finalize the last sg_entry */ + _ore_add_sg_seg(per_dev, 0, false); + if (unlikely(!per_dev->cur_sg)) + return 0; /* Skip parity only device */ + + osd_req_read_sg(or, obj, per_dev->bio, + per_dev->sglist, per_dev->cur_sg); + } else { + /* The no raid case */ + osd_req_read(or, obj, per_dev->offset, + per_dev->bio, per_dev->length); + } + ORE_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx" - " dev=%d\n", _LLU(obj->id), + " dev=%d sg_len=%d\n", _LLU(obj->id), _LLU(per_dev->offset), _LLU(per_dev->length), - first_dev); - } else if (ios->kern_buff) { - int ret = osd_req_read_kern(or, obj, per_dev->offset, - ios->kern_buff, ios->length); - ORE_DBGMSG2("read_kern(0x%llx) offset=0x%llx " - "length=0x%llx dev=%d ret=>%d\n", - _LLU(obj->id), _LLU(per_dev->offset), - _LLU(ios->length), first_dev, ret); - if (unlikely(ret)) - return ret; + first_dev, per_dev->cur_sg); } else { + BUG_ON(ios->kern_buff); + osd_req_get_attributes(or, obj); ORE_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n", _LLU(obj->id), @@ -688,7 +980,7 @@ int ore_read(struct ore_io_state *ios) return ret; for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { - ret = _read_mirror(ios, i); + ret = _ore_read_mirror(ios, i); if (unlikely(ret)) return ret; } @@ -744,31 +1036,29 @@ static int _truncate_mirrors(struct ore_io_state *ios, unsigned cur_comp, } struct _trunc_info { - struct _striping_info si; + struct ore_striping_info si; u64 prev_group_obj_off; u64 next_group_obj_off; unsigned first_group_dev; unsigned nex_group_dev; - unsigned max_devs; }; -void _calc_trunk_info(struct ore_layout *layout, u64 file_offset, - struct _trunc_info *ti) +static void _calc_trunk_info(struct ore_layout *layout, u64 file_offset, + struct _trunc_info *ti) { unsigned stripe_unit = layout->stripe_unit; - _calc_stripe_info(layout, file_offset, &ti->si); + ore_calc_stripe_info(layout, file_offset, 0, &ti->si); ti->prev_group_obj_off = ti->si.M * stripe_unit; ti->next_group_obj_off = ti->si.M ? (ti->si.M - 1) * stripe_unit : 0; ti->first_group_dev = ti->si.dev - (ti->si.dev % layout->group_width); ti->nex_group_dev = ti->first_group_dev + layout->group_width; - ti->max_devs = layout->group_width * layout->group_count; } -int ore_truncate(struct ore_layout *layout, struct ore_components *comps, +int ore_truncate(struct ore_layout *layout, struct ore_components *oc, u64 size) { struct ore_io_state *ios; @@ -779,22 +1069,22 @@ int ore_truncate(struct ore_layout *layout, struct ore_components *comps, struct _trunc_info ti; int i, ret; - ret = ore_get_io_state(layout, comps, &ios); + ret = ore_get_io_state(layout, oc, &ios); if (unlikely(ret)) return ret; _calc_trunk_info(ios->layout, size, &ti); - size_attrs = kcalloc(ti.max_devs, sizeof(*size_attrs), + size_attrs = kcalloc(ios->oc->numdevs, sizeof(*size_attrs), GFP_KERNEL); if (unlikely(!size_attrs)) { ret = -ENOMEM; goto out; } - ios->numdevs = ios->comps->numdevs; + ios->numdevs = ios->oc->numdevs; - for (i = 0; i < ti.max_devs; ++i) { + for (i = 0; i < ios->numdevs; ++i) { struct exofs_trunc_attr *size_attr = &size_attrs[i]; u64 obj_size; @@ -815,7 +1105,7 @@ int ore_truncate(struct ore_layout *layout, struct ore_components *comps, size_attr->attr.val_ptr = &size_attr->newsize; ORE_DBGMSG("trunc(0x%llx) obj_offset=0x%llx dev=%d\n", - _LLU(comps->comps->obj.id), _LLU(obj_size), i); + _LLU(oc->comps->obj.id), _LLU(obj_size), i); ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1, &size_attr->attr); if (unlikely(ret)) diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c new file mode 100644 index 00000000000..29c47e5c4a8 --- /dev/null +++ b/fs/exofs/ore_raid.c @@ -0,0 +1,660 @@ +/* + * Copyright (C) 2011 + * Boaz Harrosh <bharrosh@panasas.com> + * + * This file is part of the objects raid engine (ore). + * + * It is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * You should have received a copy of the GNU General Public License + * along with "ore". If not, write to the Free Software Foundation, Inc: + * "Free Software Foundation <info@fsf.org>" + */ + +#include <linux/gfp.h> +#include <linux/async_tx.h> + +#include "ore_raid.h" + +#undef ORE_DBGMSG2 +#define ORE_DBGMSG2 ORE_DBGMSG + +struct page *_raid_page_alloc(void) +{ + return alloc_page(GFP_KERNEL); +} + +void _raid_page_free(struct page *p) +{ + __free_page(p); +} + +/* This struct is forward declare in ore_io_state, but is private to here. + * It is put on ios->sp2d for RAID5/6 writes only. See _gen_xor_unit. + * + * __stripe_pages_2d is a 2d array of pages, and it is also a corner turn. + * Ascending page index access is sp2d(p-minor, c-major). But storage is + * sp2d[p-minor][c-major], so it can be properlly presented to the async-xor + * API. + */ +struct __stripe_pages_2d { + /* Cache some hot path repeated calculations */ + unsigned parity; + unsigned data_devs; + unsigned pages_in_unit; + + bool needed ; + + /* Array size is pages_in_unit (layout->stripe_unit / PAGE_SIZE) */ + struct __1_page_stripe { + bool alloc; + unsigned write_count; + struct async_submit_ctl submit; + struct dma_async_tx_descriptor *tx; + + /* The size of this array is data_devs + parity */ + struct page **pages; + struct page **scribble; + /* bool array, size of this array is data_devs */ + char *page_is_read; + } _1p_stripes[]; +}; + +/* This can get bigger then a page. So support multiple page allocations + * _sp2d_free should be called even if _sp2d_alloc fails (by returning + * none-zero). + */ +static int _sp2d_alloc(unsigned pages_in_unit, unsigned group_width, + unsigned parity, struct __stripe_pages_2d **psp2d) +{ + struct __stripe_pages_2d *sp2d; + unsigned data_devs = group_width - parity; + struct _alloc_all_bytes { + struct __alloc_stripe_pages_2d { + struct __stripe_pages_2d sp2d; + struct __1_page_stripe _1p_stripes[pages_in_unit]; + } __asp2d; + struct __alloc_1p_arrays { + struct page *pages[group_width]; + struct page *scribble[group_width]; + char page_is_read[data_devs]; + } __a1pa[pages_in_unit]; + } *_aab; + struct __alloc_1p_arrays *__a1pa; + struct __alloc_1p_arrays *__a1pa_end; + const unsigned sizeof__a1pa = sizeof(_aab->__a1pa[0]); + unsigned num_a1pa, alloc_size, i; + + /* FIXME: check these numbers in ore_verify_layout */ + BUG_ON(sizeof(_aab->__asp2d) > PAGE_SIZE); + BUG_ON(sizeof__a1pa > PAGE_SIZE); + + if (sizeof(*_aab) > PAGE_SIZE) { + num_a1pa = (PAGE_SIZE - sizeof(_aab->__asp2d)) / sizeof__a1pa; + alloc_size = sizeof(_aab->__asp2d) + sizeof__a1pa * num_a1pa; + } else { + num_a1pa = pages_in_unit; + alloc_size = sizeof(*_aab); + } + + _aab = kzalloc(alloc_size, GFP_KERNEL); + if (unlikely(!_aab)) { + ORE_DBGMSG("!! Failed to alloc sp2d size=%d\n", alloc_size); + return -ENOMEM; + } + + sp2d = &_aab->__asp2d.sp2d; + *psp2d = sp2d; /* From here Just call _sp2d_free */ + + __a1pa = _aab->__a1pa; + __a1pa_end = __a1pa + num_a1pa; + + for (i = 0; i < pages_in_unit; ++i) { + if (unlikely(__a1pa >= __a1pa_end)) { + num_a1pa = min_t(unsigned, PAGE_SIZE / sizeof__a1pa, + pages_in_unit - i); + + __a1pa = kzalloc(num_a1pa * sizeof__a1pa, GFP_KERNEL); + if (unlikely(!__a1pa)) { + ORE_DBGMSG("!! Failed to _alloc_1p_arrays=%d\n", + num_a1pa); + return -ENOMEM; + } + __a1pa_end = __a1pa + num_a1pa; + /* First *pages is marked for kfree of the buffer */ + sp2d->_1p_stripes[i].alloc = true; + } + + sp2d->_1p_stripes[i].pages = __a1pa->pages; + sp2d->_1p_stripes[i].scribble = __a1pa->scribble ; + sp2d->_1p_stripes[i].page_is_read = __a1pa->page_is_read; + ++__a1pa; + } + + sp2d->parity = parity; + sp2d->data_devs = data_devs; + sp2d->pages_in_unit = pages_in_unit; + return 0; +} + +static void _sp2d_reset(struct __stripe_pages_2d *sp2d, + const struct _ore_r4w_op *r4w, void *priv) +{ + unsigned data_devs = sp2d->data_devs; + unsigned group_width = data_devs + sp2d->parity; + unsigned p; + + if (!sp2d->needed) + return; + + for (p = 0; p < sp2d->pages_in_unit; p++) { + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; + + if (_1ps->write_count < group_width) { + unsigned c; + + for (c = 0; c < data_devs; c++) + if (_1ps->page_is_read[c]) { + struct page *page = _1ps->pages[c]; + + r4w->put_page(priv, page); + _1ps->page_is_read[c] = false; + } + } + + memset(_1ps->pages, 0, group_width * sizeof(*_1ps->pages)); + _1ps->write_count = 0; + _1ps->tx = NULL; + } + + sp2d->needed = false; +} + +static void _sp2d_free(struct __stripe_pages_2d *sp2d) +{ + unsigned i; + + if (!sp2d) + return; + + for (i = 0; i < sp2d->pages_in_unit; ++i) { + if (sp2d->_1p_stripes[i].alloc) + kfree(sp2d->_1p_stripes[i].pages); + } + + kfree(sp2d); +} + +static unsigned _sp2d_min_pg(struct __stripe_pages_2d *sp2d) +{ + unsigned p; + + for (p = 0; p < sp2d->pages_in_unit; p++) { + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; + + if (_1ps->write_count) + return p; + } + + return ~0; +} + +static unsigned _sp2d_max_pg(struct __stripe_pages_2d *sp2d) +{ + unsigned p; + + for (p = sp2d->pages_in_unit - 1; p >= 0; --p) { + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; + + if (_1ps->write_count) + return p; + } + + return ~0; +} + +static void _gen_xor_unit(struct __stripe_pages_2d *sp2d) +{ + unsigned p; + for (p = 0; p < sp2d->pages_in_unit; p++) { + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; + + if (!_1ps->write_count) + continue; + + init_async_submit(&_1ps->submit, + ASYNC_TX_XOR_ZERO_DST | ASYNC_TX_ACK, + NULL, + NULL, NULL, + (addr_conv_t *)_1ps->scribble); + + /* TODO: raid6 */ + _1ps->tx = async_xor(_1ps->pages[sp2d->data_devs], _1ps->pages, + 0, sp2d->data_devs, PAGE_SIZE, + &_1ps->submit); + } + + for (p = 0; p < sp2d->pages_in_unit; p++) { + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; + /* NOTE: We wait for HW synchronously (I don't have such HW + * to test with.) Is parallelism needed with today's multi + * cores? + */ + async_tx_issue_pending(_1ps->tx); + } +} + +void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d, + struct ore_striping_info *si, struct page *page) +{ + struct __1_page_stripe *_1ps; + + sp2d->needed = true; + + _1ps = &sp2d->_1p_stripes[si->cur_pg]; + _1ps->pages[si->cur_comp] = page; + ++_1ps->write_count; + + si->cur_pg = (si->cur_pg + 1) % sp2d->pages_in_unit; + /* si->cur_comp is advanced outside at main loop */ +} + +void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len, + bool not_last) +{ + struct osd_sg_entry *sge; + + ORE_DBGMSG("dev=%d cur_len=0x%x not_last=%d cur_sg=%d " + "offset=0x%llx length=0x%x last_sgs_total=0x%x\n", + per_dev->dev, cur_len, not_last, per_dev->cur_sg, + _LLU(per_dev->offset), per_dev->length, + per_dev->last_sgs_total); + + if (!per_dev->cur_sg) { + sge = per_dev->sglist; + + /* First time we prepare two entries */ + if (per_dev->length) { + ++per_dev->cur_sg; + sge->offset = per_dev->offset; + sge->len = per_dev->length; + } else { + /* Here the parity is the first unit of this object. + * This happens every time we reach a parity device on + * the same stripe as the per_dev->offset. We need to + * just skip this unit. + */ + per_dev->offset += cur_len; + return; + } + } else { + /* finalize the last one */ + sge = &per_dev->sglist[per_dev->cur_sg - 1]; + sge->len = per_dev->length - per_dev->last_sgs_total; + } + + if (not_last) { + /* Partly prepare the next one */ + struct osd_sg_entry *next_sge = sge + 1; + + ++per_dev->cur_sg; + next_sge->offset = sge->offset + sge->len + cur_len; + /* Save cur len so we know how mutch was added next time */ + per_dev->last_sgs_total = per_dev->length; + next_sge->len = 0; + } else if (!sge->len) { + /* Optimize for when the last unit is a parity */ + --per_dev->cur_sg; + } +} + +static int _alloc_read_4_write(struct ore_io_state *ios) +{ + struct ore_layout *layout = ios->layout; + int ret; + /* We want to only read those pages not in cache so worst case + * is a stripe populated with every other page + */ + unsigned sgs_per_dev = ios->sp2d->pages_in_unit + 2; + + ret = _ore_get_io_state(layout, ios->oc, + layout->group_width * layout->mirrors_p1, + sgs_per_dev, 0, &ios->ios_read_4_write); + return ret; +} + +/* @si contains info of the to-be-inserted page. Update of @si should be + * maintained by caller. Specificaly si->dev, si->obj_offset, ... + */ +static int _add_to_read_4_write(struct ore_io_state *ios, + struct ore_striping_info *si, struct page *page) +{ + struct request_queue *q; + struct ore_per_dev_state *per_dev; + struct ore_io_state *read_ios; + unsigned first_dev = si->dev - (si->dev % + (ios->layout->group_width * ios->layout->mirrors_p1)); + unsigned comp = si->dev - first_dev; + unsigned added_len; + + if (!ios->ios_read_4_write) { + int ret = _alloc_read_4_write(ios); + + if (unlikely(ret)) + return ret; + } + + read_ios = ios->ios_read_4_write; + read_ios->numdevs = ios->layout->group_width * ios->layout->mirrors_p1; + + per_dev = &read_ios->per_dev[comp]; + if (!per_dev->length) { + per_dev->bio = bio_kmalloc(GFP_KERNEL, + ios->sp2d->pages_in_unit); + if (unlikely(!per_dev->bio)) { + ORE_DBGMSG("Failed to allocate BIO size=%u\n", + ios->sp2d->pages_in_unit); + return -ENOMEM; + } + per_dev->offset = si->obj_offset; + per_dev->dev = si->dev; + } else if (si->obj_offset != (per_dev->offset + per_dev->length)) { + u64 gap = si->obj_offset - (per_dev->offset + per_dev->length); + + _ore_add_sg_seg(per_dev, gap, true); + } + q = osd_request_queue(ore_comp_dev(read_ios->oc, per_dev->dev)); + added_len = bio_add_pc_page(q, per_dev->bio, page, PAGE_SIZE, 0); + if (unlikely(added_len != PAGE_SIZE)) { + ORE_DBGMSG("Failed to bio_add_pc_page bi_vcnt=%d\n", + per_dev->bio->bi_vcnt); + return -ENOMEM; + } + + per_dev->length += PAGE_SIZE; + return 0; +} + +static void _mark_read4write_pages_uptodate(struct ore_io_state *ios, int ret) +{ + struct bio_vec *bv; + unsigned i, d; + + /* loop on all devices all pages */ + for (d = 0; d < ios->numdevs; d++) { + struct bio *bio = ios->per_dev[d].bio; + + if (!bio) + continue; + + __bio_for_each_segment(bv, bio, i, 0) { + struct page *page = bv->bv_page; + + SetPageUptodate(page); + if (PageError(page)) + ClearPageError(page); + } + } +} + +/* read_4_write is hacked to read the start of the first stripe and/or + * the end of the last stripe. If needed, with an sg-gap at each device/page. + * It is assumed to be called after the to_be_written pages of the first stripe + * are populating ios->sp2d[][] + * + * NOTE: We call ios->r4w->lock_fn for all pages needed for parity calculations + * These pages are held at sp2d[p].pages[c] but with + * sp2d[p].page_is_read[c] = true. At _sp2d_reset these pages are + * ios->r4w->lock_fn(). The ios->r4w->lock_fn might signal that the page is + * @uptodate=true, so we don't need to read it, only unlock, after IO. + * + * TODO: The read_4_write should calc a need_to_read_pages_count, if bigger then + * to-be-written count, we should consider the xor-in-place mode. + * need_to_read_pages_count is the actual number of pages not present in cache. + * maybe "devs_in_group - ios->sp2d[p].write_count" is a good enough + * approximation? In this mode the read pages are put in the empty places of + * ios->sp2d[p][*], xor is calculated the same way. These pages are + * allocated/freed and don't go through cache + */ +static int _read_4_write(struct ore_io_state *ios) +{ + struct ore_io_state *ios_read; + struct ore_striping_info read_si; + struct __stripe_pages_2d *sp2d = ios->sp2d; + u64 offset = ios->si.first_stripe_start; + u64 last_stripe_end; + unsigned bytes_in_stripe = ios->si.bytes_in_stripe; + unsigned i, c, p, min_p = sp2d->pages_in_unit, max_p = -1; + int ret; + + if (offset == ios->offset) /* Go to start collect $200 */ + goto read_last_stripe; + + min_p = _sp2d_min_pg(sp2d); + max_p = _sp2d_max_pg(sp2d); + + for (c = 0; ; c++) { + ore_calc_stripe_info(ios->layout, offset, 0, &read_si); + read_si.obj_offset += min_p * PAGE_SIZE; + offset += min_p * PAGE_SIZE; + for (p = min_p; p <= max_p; p++) { + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; + struct page **pp = &_1ps->pages[c]; + bool uptodate; + + if (*pp) + /* to-be-written pages start here */ + goto read_last_stripe; + + *pp = ios->r4w->get_page(ios->private, offset, + &uptodate); + if (unlikely(!*pp)) + return -ENOMEM; + + if (!uptodate) + _add_to_read_4_write(ios, &read_si, *pp); + + /* Mark read-pages to be cache_released */ + _1ps->page_is_read[c] = true; + read_si.obj_offset += PAGE_SIZE; + offset += PAGE_SIZE; + } + offset += (sp2d->pages_in_unit - p) * PAGE_SIZE; + } + +read_last_stripe: + offset = ios->offset + (ios->length + PAGE_SIZE - 1) / + PAGE_SIZE * PAGE_SIZE; + last_stripe_end = div_u64(offset + bytes_in_stripe - 1, bytes_in_stripe) + * bytes_in_stripe; + if (offset == last_stripe_end) /* Optimize for the aligned case */ + goto read_it; + + ore_calc_stripe_info(ios->layout, offset, 0, &read_si); + p = read_si.unit_off / PAGE_SIZE; + c = _dev_order(ios->layout->group_width * ios->layout->mirrors_p1, + ios->layout->mirrors_p1, read_si.par_dev, read_si.dev); + + BUG_ON(ios->si.first_stripe_start + bytes_in_stripe != last_stripe_end); + /* unaligned IO must be within a single stripe */ + + if (min_p == sp2d->pages_in_unit) { + /* Didn't do it yet */ + min_p = _sp2d_min_pg(sp2d); + max_p = _sp2d_max_pg(sp2d); + } + + while (offset < last_stripe_end) { + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; + + if ((min_p <= p) && (p <= max_p)) { + struct page *page; + bool uptodate; + + BUG_ON(_1ps->pages[c]); + page = ios->r4w->get_page(ios->private, offset, + &uptodate); + if (unlikely(!page)) + return -ENOMEM; + + _1ps->pages[c] = page; + /* Mark read-pages to be cache_released */ + _1ps->page_is_read[c] = true; + if (!uptodate) + _add_to_read_4_write(ios, &read_si, page); + } + + offset += PAGE_SIZE; + if (p == (sp2d->pages_in_unit - 1)) { + ++c; + p = 0; + ore_calc_stripe_info(ios->layout, offset, 0, &read_si); + } else { + read_si.obj_offset += PAGE_SIZE; + ++p; + } + } + +read_it: + ios_read = ios->ios_read_4_write; + if (!ios_read) + return 0; + + /* FIXME: Ugly to signal _sbi_read_mirror that we have bio(s). Change + * to check for per_dev->bio + */ + ios_read->pages = ios->pages; + + /* Now read these devices */ + for (i = 0; i < ios_read->numdevs; i += ios_read->layout->mirrors_p1) { + ret = _ore_read_mirror(ios_read, i); + if (unlikely(ret)) + return ret; + } + + ret = ore_io_execute(ios_read); /* Synchronus execution */ + if (unlikely(ret)) { + ORE_DBGMSG("!! ore_io_execute => %d\n", ret); + return ret; + } + + _mark_read4write_pages_uptodate(ios_read, ret); + return 0; +} + +/* In writes @cur_len means length left. .i.e cur_len==0 is the last parity U */ +int _ore_add_parity_unit(struct ore_io_state *ios, + struct ore_striping_info *si, + struct ore_per_dev_state *per_dev, + unsigned cur_len) +{ + if (ios->reading) { + BUG_ON(per_dev->cur_sg >= ios->sgs_per_dev); + _ore_add_sg_seg(per_dev, cur_len, true); + } else { + struct __stripe_pages_2d *sp2d = ios->sp2d; + struct page **pages = ios->parity_pages + ios->cur_par_page; + unsigned num_pages; + unsigned array_start = 0; + unsigned i; + int ret; + + si->cur_pg = _sp2d_min_pg(sp2d); + num_pages = _sp2d_max_pg(sp2d) + 1 - si->cur_pg; + + if (!cur_len) /* If last stripe operate on parity comp */ + si->cur_comp = sp2d->data_devs; + + if (!per_dev->length) { + per_dev->offset += si->cur_pg * PAGE_SIZE; + /* If first stripe, Read in all read4write pages + * (if needed) before we calculate the first parity. + */ + _read_4_write(ios); + } + + for (i = 0; i < num_pages; i++) { + pages[i] = _raid_page_alloc(); + if (unlikely(!pages[i])) + return -ENOMEM; + + ++(ios->cur_par_page); + } + + BUG_ON(si->cur_comp != sp2d->data_devs); + BUG_ON(si->cur_pg + num_pages > sp2d->pages_in_unit); + + ret = _ore_add_stripe_unit(ios, &array_start, 0, pages, + per_dev, num_pages * PAGE_SIZE); + if (unlikely(ret)) + return ret; + + /* TODO: raid6 if (last_parity_dev) */ + _gen_xor_unit(sp2d); + _sp2d_reset(sp2d, ios->r4w, ios->private); + } + return 0; +} + +int _ore_post_alloc_raid_stuff(struct ore_io_state *ios) +{ + struct ore_layout *layout = ios->layout; + + if (ios->parity_pages) { + unsigned pages_in_unit = layout->stripe_unit / PAGE_SIZE; + unsigned stripe_size = ios->si.bytes_in_stripe; + u64 last_stripe, first_stripe; + + if (_sp2d_alloc(pages_in_unit, layout->group_width, + layout->parity, &ios->sp2d)) { + return -ENOMEM; + } + + BUG_ON(ios->offset % PAGE_SIZE); + + /* Round io down to last full strip */ + first_stripe = div_u64(ios->offset, stripe_size); + last_stripe = div_u64(ios->offset + ios->length, stripe_size); + + /* If an IO spans more then a single stripe it must end at + * a stripe boundary. The reminder at the end is pushed into the + * next IO. + */ + if (last_stripe != first_stripe) { + ios->length = last_stripe * stripe_size - ios->offset; + + BUG_ON(!ios->length); + ios->nr_pages = (ios->length + PAGE_SIZE - 1) / + PAGE_SIZE; + ios->si.length = ios->length; /*make it consistent */ + } + } + return 0; +} + +void _ore_free_raid_stuff(struct ore_io_state *ios) +{ + if (ios->sp2d) { /* writing and raid */ + unsigned i; + + for (i = 0; i < ios->cur_par_page; i++) { + struct page *page = ios->parity_pages[i]; + + if (page) + _raid_page_free(page); + } + if (ios->extra_part_alloc) + kfree(ios->parity_pages); + /* If IO returned an error pages might need unlocking */ + _sp2d_reset(ios->sp2d, ios->r4w, ios->private); + _sp2d_free(ios->sp2d); + } else { + /* Will only be set if raid reading && sglist is big */ + if (ios->extra_part_alloc) + kfree(ios->per_dev[0].sglist); + } + if (ios->ios_read_4_write) + ore_put_io_state(ios->ios_read_4_write); +} diff --git a/fs/exofs/ore_raid.h b/fs/exofs/ore_raid.h new file mode 100644 index 00000000000..2ffd2c3c6e4 --- /dev/null +++ b/fs/exofs/ore_raid.h @@ -0,0 +1,79 @@ +/* + * Copyright (C) from 2011 + * Boaz Harrosh <bharrosh@panasas.com> + * + * This file is part of the objects raid engine (ore). + * + * It is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * You should have received a copy of the GNU General Public License + * along with "ore". If not, write to the Free Software Foundation, Inc: + * "Free Software Foundation <info@fsf.org>" + */ + +#include <scsi/osd_ore.h> + +#define ORE_ERR(fmt, a...) printk(KERN_ERR "ore: " fmt, ##a) + +#ifdef CONFIG_EXOFS_DEBUG +#define ORE_DBGMSG(fmt, a...) \ + printk(KERN_NOTICE "ore @%s:%d: " fmt, __func__, __LINE__, ##a) +#else +#define ORE_DBGMSG(fmt, a...) \ + do { if (0) printk(fmt, ##a); } while (0) +#endif + +/* u64 has problems with printk this will cast it to unsigned long long */ +#define _LLU(x) (unsigned long long)(x) + +#define ORE_DBGMSG2(M...) do {} while (0) +/* #define ORE_DBGMSG2 ORE_DBGMSG */ + +/* Calculate the component order in a stripe. eg the logical data unit + * address within the stripe of @dev given the @par_dev of this stripe. + */ +static inline unsigned _dev_order(unsigned devs_in_group, unsigned mirrors_p1, + unsigned par_dev, unsigned dev) +{ + unsigned first_dev = dev - dev % devs_in_group; + + dev -= first_dev; + par_dev -= first_dev; + + if (devs_in_group == par_dev) /* The raid 0 case */ + return dev / mirrors_p1; + /* raid4/5/6 case */ + return ((devs_in_group + dev - par_dev - mirrors_p1) % devs_in_group) / + mirrors_p1; +} + +/* ios_raid.c stuff needed by ios.c */ +int _ore_post_alloc_raid_stuff(struct ore_io_state *ios); +void _ore_free_raid_stuff(struct ore_io_state *ios); + +void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len, + bool not_last); +int _ore_add_parity_unit(struct ore_io_state *ios, struct ore_striping_info *si, + struct ore_per_dev_state *per_dev, unsigned cur_len); +void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d, + struct ore_striping_info *si, struct page *page); +static inline void _add_stripe_page(struct __stripe_pages_2d *sp2d, + struct ore_striping_info *si, struct page *page) +{ + if (!sp2d) /* Inline the fast path */ + return; /* Hay no raid stuff */ + _ore_add_stripe_page(sp2d, si, page); +} + +/* ios.c stuff needed by ios_raid.c */ +int _ore_get_io_state(struct ore_layout *layout, + struct ore_components *oc, unsigned numdevs, + unsigned sgs_per_dev, unsigned num_par_pages, + struct ore_io_state **pios); +int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, + unsigned pgbase, struct page **pages, + struct ore_per_dev_state *per_dev, int cur_len); +int _ore_read_mirror(struct ore_io_state *ios, unsigned cur_comp); +int ore_io_execute(struct ore_io_state *ios); diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 274894053b0..057b237b8b6 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -266,7 +266,7 @@ static int __sbi_read_stats(struct exofs_sb_info *sbi) struct ore_io_state *ios; int ret; - ret = ore_get_io_state(&sbi->layout, &sbi->comps, &ios); + ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios); if (unlikely(ret)) { EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); return ret; @@ -321,7 +321,7 @@ int exofs_sbi_write_stats(struct exofs_sb_info *sbi) struct ore_io_state *ios; int ret; - ret = ore_get_io_state(&sbi->layout, &sbi->comps, &ios); + ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios); if (unlikely(ret)) { EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); return ret; @@ -355,12 +355,12 @@ static const struct export_operations exofs_export_ops; /* * Write the superblock to the OSD */ -int exofs_sync_fs(struct super_block *sb, int wait) +static int exofs_sync_fs(struct super_block *sb, int wait) { struct exofs_sb_info *sbi; struct exofs_fscb *fscb; struct ore_comp one_comp; - struct ore_components comps; + struct ore_components oc; struct ore_io_state *ios; int ret = -ENOMEM; @@ -378,9 +378,9 @@ int exofs_sync_fs(struct super_block *sb, int wait) * the writeable info is set in exofs_sbi_write_stats() above. */ - exofs_init_comps(&comps, &one_comp, sbi, EXOFS_SUPER_ID); + exofs_init_comps(&oc, &one_comp, sbi, EXOFS_SUPER_ID); - ret = ore_get_io_state(&sbi->layout, &comps, &ios); + ret = ore_get_io_state(&sbi->layout, &oc, &ios); if (unlikely(ret)) goto out; @@ -429,19 +429,20 @@ static void _exofs_print_device(const char *msg, const char *dev_path, msg, dev_path ?: "", odi->osdname, _LLU(pid)); } -void exofs_free_sbi(struct exofs_sb_info *sbi) +static void exofs_free_sbi(struct exofs_sb_info *sbi) { - while (sbi->comps.numdevs) { - int i = --sbi->comps.numdevs; - struct osd_dev *od = sbi->comps.ods[i]; + unsigned numdevs = sbi->oc.numdevs; + + while (numdevs) { + unsigned i = --numdevs; + struct osd_dev *od = ore_comp_dev(&sbi->oc, i); if (od) { - sbi->comps.ods[i] = NULL; + ore_comp_set_dev(&sbi->oc, i, NULL); osduld_put_device(od); } } - if (sbi->comps.ods != sbi->_min_one_dev) - kfree(sbi->comps.ods); + kfree(sbi->oc.ods); kfree(sbi); } @@ -468,7 +469,7 @@ static void exofs_put_super(struct super_block *sb) msecs_to_jiffies(100)); } - _exofs_print_device("Unmounting", NULL, sbi->comps.ods[0], + _exofs_print_device("Unmounting", NULL, ore_comp_dev(&sbi->oc, 0), sbi->one_comp.obj.partition); bdi_destroy(&sbi->bdi); @@ -479,76 +480,20 @@ static void exofs_put_super(struct super_block *sb) static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs, struct exofs_device_table *dt) { - u64 stripe_length; + int ret; - sbi->data_map.odm_num_comps = - le32_to_cpu(dt->dt_data_map.cb_num_comps); - sbi->data_map.odm_stripe_unit = + sbi->layout.stripe_unit = le64_to_cpu(dt->dt_data_map.cb_stripe_unit); - sbi->data_map.odm_group_width = + sbi->layout.group_width = le32_to_cpu(dt->dt_data_map.cb_group_width); - sbi->data_map.odm_group_depth = + sbi->layout.group_depth = le32_to_cpu(dt->dt_data_map.cb_group_depth); - sbi->data_map.odm_mirror_cnt = - le32_to_cpu(dt->dt_data_map.cb_mirror_cnt); - sbi->data_map.odm_raid_algorithm = + sbi->layout.mirrors_p1 = + le32_to_cpu(dt->dt_data_map.cb_mirror_cnt) + 1; + sbi->layout.raid_algorithm = le32_to_cpu(dt->dt_data_map.cb_raid_algorithm); -/* FIXME: Only raid0 for now. if not so, do not mount */ - if (sbi->data_map.odm_num_comps != numdevs) { - EXOFS_ERR("odm_num_comps(%u) != numdevs(%u)\n", - sbi->data_map.odm_num_comps, numdevs); - return -EINVAL; - } - if (sbi->data_map.odm_raid_algorithm != PNFS_OSD_RAID_0) { - EXOFS_ERR("Only RAID_0 for now\n"); - return -EINVAL; - } - if (0 != (numdevs % (sbi->data_map.odm_mirror_cnt + 1))) { - EXOFS_ERR("Data Map wrong, numdevs=%d mirrors=%d\n", - numdevs, sbi->data_map.odm_mirror_cnt); - return -EINVAL; - } - - if (0 != (sbi->data_map.odm_stripe_unit & ~PAGE_MASK)) { - EXOFS_ERR("Stripe Unit(0x%llx)" - " must be Multples of PAGE_SIZE(0x%lx)\n", - _LLU(sbi->data_map.odm_stripe_unit), PAGE_SIZE); - return -EINVAL; - } - - sbi->layout.stripe_unit = sbi->data_map.odm_stripe_unit; - sbi->layout.mirrors_p1 = sbi->data_map.odm_mirror_cnt + 1; - - if (sbi->data_map.odm_group_width) { - sbi->layout.group_width = sbi->data_map.odm_group_width; - sbi->layout.group_depth = sbi->data_map.odm_group_depth; - if (!sbi->layout.group_depth) { - EXOFS_ERR("group_depth == 0 && group_width != 0\n"); - return -EINVAL; - } - sbi->layout.group_count = sbi->data_map.odm_num_comps / - sbi->layout.mirrors_p1 / - sbi->data_map.odm_group_width; - } else { - if (sbi->data_map.odm_group_depth) { - printk(KERN_NOTICE "Warning: group_depth ignored " - "group_width == 0 && group_depth == %d\n", - sbi->data_map.odm_group_depth); - sbi->data_map.odm_group_depth = 0; - } - sbi->layout.group_width = sbi->data_map.odm_num_comps / - sbi->layout.mirrors_p1; - sbi->layout.group_depth = -1; - sbi->layout.group_count = 1; - } - - stripe_length = (u64)sbi->layout.group_width * sbi->layout.stripe_unit; - if (stripe_length >= (1ULL << 32)) { - EXOFS_ERR("Total Stripe length(0x%llx)" - " >= 32bit is not supported\n", _LLU(stripe_length)); - return -EINVAL; - } + ret = ore_verify_layout(numdevs, &sbi->layout); EXOFS_DBGMSG("exofs: layout: " "num_comps=%u stripe_unit=0x%x group_width=%u " @@ -558,8 +503,8 @@ static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs, sbi->layout.group_width, _LLU(sbi->layout.group_depth), sbi->layout.mirrors_p1, - sbi->data_map.odm_raid_algorithm); - return 0; + sbi->layout.raid_algorithm); + return ret; } static unsigned __ra_pages(struct ore_layout *layout) @@ -605,12 +550,40 @@ static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev, return !(odi->systemid_len || odi->osdname_len); } +int __alloc_dev_table(struct exofs_sb_info *sbi, unsigned numdevs, + struct exofs_dev **peds) +{ + struct __alloc_ore_devs_and_exofs_devs { + /* Twice bigger table: See exofs_init_comps() and comment at + * exofs_read_lookup_dev_table() + */ + struct ore_dev *oreds[numdevs * 2 - 1]; + struct exofs_dev eds[numdevs]; + } *aoded; + struct exofs_dev *eds; + unsigned i; + + aoded = kzalloc(sizeof(*aoded), GFP_KERNEL); + if (unlikely(!aoded)) { + EXOFS_ERR("ERROR: faild allocating Device array[%d]\n", + numdevs); + return -ENOMEM; + } + + sbi->oc.ods = aoded->oreds; + *peds = eds = aoded->eds; + for (i = 0; i < numdevs; ++i) + aoded->oreds[i] = &eds[i].ored; + return 0; +} + static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, struct osd_dev *fscb_od, unsigned table_count) { struct ore_comp comp; struct exofs_device_table *dt; + struct exofs_dev *eds; unsigned table_bytes = table_count * sizeof(dt->dt_dev_table[0]) + sizeof(*dt); unsigned numdevs, i; @@ -623,7 +596,7 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, return -ENOMEM; } - sbi->comps.numdevs = 0; + sbi->oc.numdevs = 0; comp.obj.partition = sbi->one_comp.obj.partition; comp.obj.id = EXOFS_DEVTABLE_ID; @@ -647,20 +620,16 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, if (unlikely(ret)) goto out; - if (likely(numdevs > 1)) { - unsigned size = numdevs * sizeof(sbi->comps.ods[0]); - - /* Twice bigger table: See exofs_init_comps() and below - * comment - */ - sbi->comps.ods = kzalloc(size + size - 1, GFP_KERNEL); - if (unlikely(!sbi->comps.ods)) { - EXOFS_ERR("ERROR: faild allocating Device array[%d]\n", - numdevs); - ret = -ENOMEM; - goto out; - } - } + ret = __alloc_dev_table(sbi, numdevs, &eds); + if (unlikely(ret)) + goto out; + /* exofs round-robins the device table view according to inode + * number. We hold a: twice bigger table hence inodes can point + * to any device and have a sequential view of the table + * starting at this device. See exofs_init_comps() + */ + memcpy(&sbi->oc.ods[numdevs], &sbi->oc.ods[0], + (numdevs - 1) * sizeof(sbi->oc.ods[0])); for (i = 0; i < numdevs; i++) { struct exofs_fscb fscb; @@ -676,13 +645,16 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, printk(KERN_NOTICE "Add device[%d]: osd_name-%s\n", i, odi.osdname); + /* the exofs id is currently the table index */ + eds[i].did = i; + /* On all devices the device table is identical. The user can * specify any one of the participating devices on the command * line. We always keep them in device-table order. */ if (fscb_od && osduld_device_same(fscb_od, &odi)) { - sbi->comps.ods[i] = fscb_od; - ++sbi->comps.numdevs; + eds[i].ored.od = fscb_od; + ++sbi->oc.numdevs; fscb_od = NULL; continue; } @@ -695,8 +667,8 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, goto out; } - sbi->comps.ods[i] = od; - ++sbi->comps.numdevs; + eds[i].ored.od = od; + ++sbi->oc.numdevs; /* Read the fscb of the other devices to make sure the FS * partition is there. @@ -718,21 +690,10 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, out: kfree(dt); - if (likely(!ret)) { - unsigned numdevs = sbi->comps.numdevs; - - if (unlikely(fscb_od)) { + if (unlikely(fscb_od && !ret)) { EXOFS_ERR("ERROR: Bad device-table container device not present\n"); osduld_put_device(fscb_od); return -EINVAL; - } - /* exofs round-robins the device table view according to inode - * number. We hold a: twice bigger table hence inodes can point - * to any device and have a sequential view of the table - * starting at this device. See exofs_init_comps() - */ - for (i = 0; i < numdevs - 1; ++i) - sbi->comps.ods[i + numdevs] = sbi->comps.ods[i]; } return ret; } @@ -783,10 +744,9 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) sbi->one_comp.obj.partition = opts->pid; sbi->one_comp.obj.id = 0; exofs_make_credential(sbi->one_comp.cred, &sbi->one_comp.obj); - sbi->comps.numdevs = 1; - sbi->comps.single_comp = EC_SINGLE_COMP; - sbi->comps.comps = &sbi->one_comp; - sbi->comps.ods = sbi->_min_one_dev; + sbi->oc.numdevs = 1; + sbi->oc.single_comp = EC_SINGLE_COMP; + sbi->oc.comps = &sbi->one_comp; /* fill in some other data by hand */ memset(sb->s_id, 0, sizeof(sb->s_id)); @@ -835,7 +795,13 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) if (unlikely(ret)) goto free_sbi; } else { - sbi->comps.ods[0] = od; + struct exofs_dev *eds; + + ret = __alloc_dev_table(sbi, 1, &eds); + if (unlikely(ret)) + goto free_sbi; + + ore_comp_set_dev(&sbi->oc, 0, od); } __sbi_read_stats(sbi); @@ -875,7 +841,8 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) goto free_sbi; } - _exofs_print_device("Mounting", opts->dev_name, sbi->comps.ods[0], + _exofs_print_device("Mounting", opts->dev_name, + ore_comp_dev(&sbi->oc, 0), sbi->one_comp.obj.partition); return 0; @@ -924,7 +891,7 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf) uint64_t used = ULLONG_MAX; int ret; - ret = ore_get_io_state(&sbi->layout, &sbi->comps, &ios); + ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios); if (ret) { EXOFS_DBGMSG("ore_get_io_state failed.\n"); return ret; @@ -981,7 +948,7 @@ static const struct super_operations exofs_sops = { * EXPORT OPERATIONS *****************************************************************************/ -struct dentry *exofs_get_parent(struct dentry *child) +static struct dentry *exofs_get_parent(struct dentry *child) { unsigned long ino = exofs_parent_ino(child); diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c index 5d979b4347b..c922adc8ef4 100644 --- a/fs/ext2/xattr_security.c +++ b/fs/ext2/xattr_security.c @@ -46,28 +46,30 @@ ext2_xattr_security_set(struct dentry *dentry, const char *name, value, size, flags); } -int -ext2_init_security(struct inode *inode, struct inode *dir, - const struct qstr *qstr) +int ext2_initxattrs(struct inode *inode, const struct xattr *xattr_array, + void *fs_info) { - int err; - size_t len; - void *value; - char *name; + const struct xattr *xattr; + int err = 0; - err = security_inode_init_security(inode, dir, qstr, &name, &value, &len); - if (err) { - if (err == -EOPNOTSUPP) - return 0; - return err; + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + err = ext2_xattr_set(inode, EXT2_XATTR_INDEX_SECURITY, + xattr->name, xattr->value, + xattr->value_len, 0); + if (err < 0) + break; } - err = ext2_xattr_set(inode, EXT2_XATTR_INDEX_SECURITY, - name, value, len, 0); - kfree(name); - kfree(value); return err; } +int +ext2_init_security(struct inode *inode, struct inode *dir, + const struct qstr *qstr) +{ + return security_inode_init_security(inode, dir, qstr, + &ext2_initxattrs, NULL); +} + const struct xattr_handler ext2_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, .list = ext2_xattr_security_list, diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c index b8d9f83aa5c..3c218b8a51d 100644 --- a/fs/ext3/xattr_security.c +++ b/fs/ext3/xattr_security.c @@ -48,28 +48,32 @@ ext3_xattr_security_set(struct dentry *dentry, const char *name, name, value, size, flags); } -int -ext3_init_security(handle_t *handle, struct inode *inode, struct inode *dir, - const struct qstr *qstr) +int ext3_initxattrs(struct inode *inode, const struct xattr *xattr_array, + void *fs_info) { - int err; - size_t len; - void *value; - char *name; + const struct xattr *xattr; + handle_t *handle = fs_info; + int err = 0; - err = security_inode_init_security(inode, dir, qstr, &name, &value, &len); - if (err) { - if (err == -EOPNOTSUPP) - return 0; - return err; + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + err = ext3_xattr_set_handle(handle, inode, + EXT3_XATTR_INDEX_SECURITY, + xattr->name, xattr->value, + xattr->value_len, 0); + if (err < 0) + break; } - err = ext3_xattr_set_handle(handle, inode, EXT3_XATTR_INDEX_SECURITY, - name, value, len, 0); - kfree(name); - kfree(value); return err; } +int +ext3_init_security(handle_t *handle, struct inode *inode, struct inode *dir, + const struct qstr *qstr) +{ + return security_inode_init_security(inode, dir, qstr, + &ext3_initxattrs, handle); +} + const struct xattr_handler ext3_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, .list = ext3_xattr_security_list, diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c index 007c3bfbf09..34e4350dd4d 100644 --- a/fs/ext4/xattr_security.c +++ b/fs/ext4/xattr_security.c @@ -48,28 +48,32 @@ ext4_xattr_security_set(struct dentry *dentry, const char *name, name, value, size, flags); } -int -ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir, - const struct qstr *qstr) +int ext4_initxattrs(struct inode *inode, const struct xattr *xattr_array, + void *fs_info) { - int err; - size_t len; - void *value; - char *name; + const struct xattr *xattr; + handle_t *handle = fs_info; + int err = 0; - err = security_inode_init_security(inode, dir, qstr, &name, &value, &len); - if (err) { - if (err == -EOPNOTSUPP) - return 0; - return err; + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + err = ext4_xattr_set_handle(handle, inode, + EXT4_XATTR_INDEX_SECURITY, + xattr->name, xattr->value, + xattr->value_len, 0); + if (err < 0) + break; } - err = ext4_xattr_set_handle(handle, inode, EXT4_XATTR_INDEX_SECURITY, - name, value, len, 0); - kfree(name); - kfree(value); return err; } +int +ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir, + const struct qstr *qstr) +{ + return security_inode_init_security(inode, dir, qstr, + &ext4_initxattrs, handle); +} + const struct xattr_handler ext4_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, .list = ext4_xattr_security_list, diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 900cf986aad..6525b804d5e 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -624,31 +624,29 @@ fail: return error; } -static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip, - const struct qstr *qstr) +int gfs2_initxattrs(struct inode *inode, const struct xattr *xattr_array, + void *fs_info) { - int err; - size_t len; - void *value; - char *name; - - err = security_inode_init_security(&ip->i_inode, &dip->i_inode, qstr, - &name, &value, &len); - - if (err) { - if (err == -EOPNOTSUPP) - return 0; - return err; + const struct xattr *xattr; + int err = 0; + + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + err = __gfs2_xattr_set(inode, xattr->name, xattr->value, + xattr->value_len, 0, + GFS2_EATYPE_SECURITY); + if (err < 0) + break; } - - err = __gfs2_xattr_set(&ip->i_inode, name, value, len, 0, - GFS2_EATYPE_SECURITY); - kfree(value); - kfree(name); - return err; } +static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip, + const struct qstr *qstr) +{ + return security_inode_init_security(&ip->i_inode, &dip->i_inode, qstr, + &gfs2_initxattrs, NULL); +} + /** * gfs2_create_inode - Create a new inode * @dir: The parent directory diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c index cfeb7164b08..0f20208df60 100644 --- a/fs/jffs2/security.c +++ b/fs/jffs2/security.c @@ -22,26 +22,29 @@ #include <linux/security.h> #include "nodelist.h" -/* ---- Initial Security Label Attachment -------------- */ -int jffs2_init_security(struct inode *inode, struct inode *dir, - const struct qstr *qstr) +/* ---- Initial Security Label(s) Attachment callback --- */ +int jffs2_initxattrs(struct inode *inode, const struct xattr *xattr_array, + void *fs_info) { - int rc; - size_t len; - void *value; - char *name; + const struct xattr *xattr; + int err = 0; - rc = security_inode_init_security(inode, dir, qstr, &name, &value, &len); - if (rc) { - if (rc == -EOPNOTSUPP) - return 0; - return rc; + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + err = do_jffs2_setxattr(inode, JFFS2_XPREFIX_SECURITY, + xattr->name, xattr->value, + xattr->value_len, 0); + if (err < 0) + break; } - rc = do_jffs2_setxattr(inode, JFFS2_XPREFIX_SECURITY, name, value, len, 0); + return err; +} - kfree(name); - kfree(value); - return rc; +/* ---- Initial Security Label(s) Attachment ----------- */ +int jffs2_init_security(struct inode *inode, struct inode *dir, + const struct qstr *qstr) +{ + return security_inode_init_security(inode, dir, qstr, + &jffs2_initxattrs, NULL); } /* ---- XATTR Handler for "security.*" ----------------- */ diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c index e87fedef23d..26683e15b3a 100644 --- a/fs/jfs/xattr.c +++ b/fs/jfs/xattr.c @@ -1089,38 +1089,37 @@ int jfs_removexattr(struct dentry *dentry, const char *name) } #ifdef CONFIG_JFS_SECURITY -int jfs_init_security(tid_t tid, struct inode *inode, struct inode *dir, - const struct qstr *qstr) +int jfs_initxattrs(struct inode *inode, const struct xattr *xattr_array, + void *fs_info) { - int rc; - size_t len; - void *value; - char *suffix; + const struct xattr *xattr; + tid_t *tid = fs_info; char *name; - - rc = security_inode_init_security(inode, dir, qstr, &suffix, &value, - &len); - if (rc) { - if (rc == -EOPNOTSUPP) - return 0; - return rc; - } - name = kmalloc(XATTR_SECURITY_PREFIX_LEN + 1 + strlen(suffix), - GFP_NOFS); - if (!name) { - rc = -ENOMEM; - goto kmalloc_failed; + int err = 0; + + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + name = kmalloc(XATTR_SECURITY_PREFIX_LEN + + strlen(xattr->name) + 1, GFP_NOFS); + if (!name) { + err = -ENOMEM; + break; + } + strcpy(name, XATTR_SECURITY_PREFIX); + strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name); + + err = __jfs_setxattr(*tid, inode, name, + xattr->value, xattr->value_len, 0); + kfree(name); + if (err < 0) + break; } - strcpy(name, XATTR_SECURITY_PREFIX); - strcpy(name + XATTR_SECURITY_PREFIX_LEN, suffix); - - rc = __jfs_setxattr(tid, inode, name, value, len, 0); - - kfree(name); -kmalloc_failed: - kfree(suffix); - kfree(value); + return err; +} - return rc; +int jfs_init_security(tid_t tid, struct inode *inode, struct inode *dir, + const struct qstr *qstr) +{ + return security_inode_init_security(inode, dir, qstr, + &jfs_initxattrs, &tid); } #endif diff --git a/fs/lockd/host.c b/fs/lockd/host.c index b7c99bfb3da..6f29836ec0c 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -316,14 +316,8 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp, struct hlist_node *pos; struct nlm_host *host = NULL; struct nsm_handle *nsm = NULL; - struct sockaddr_in sin = { - .sin_family = AF_INET, - }; - struct sockaddr_in6 sin6 = { - .sin6_family = AF_INET6, - }; - struct sockaddr *src_sap; - size_t src_len = rqstp->rq_addrlen; + struct sockaddr *src_sap = svc_daddr(rqstp); + size_t src_len = rqstp->rq_daddrlen; struct nlm_lookup_host_info ni = { .server = 1, .sap = svc_addr(rqstp), @@ -340,21 +334,6 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp, mutex_lock(&nlm_host_mutex); - switch (ni.sap->sa_family) { - case AF_INET: - sin.sin_addr.s_addr = rqstp->rq_daddr.addr.s_addr; - src_sap = (struct sockaddr *)&sin; - break; - case AF_INET6: - ipv6_addr_copy(&sin6.sin6_addr, &rqstp->rq_daddr.addr6); - src_sap = (struct sockaddr *)&sin6; - break; - default: - dprintk("lockd: %s failed; unrecognized address family\n", - __func__); - goto out; - } - if (time_after_eq(jiffies, next_gc)) nlm_gc_hosts(); diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index abfff9d7979..c061b9aa7dd 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -282,7 +282,7 @@ int lockd_up(void) /* * Create the kernel thread and wait for it to start. */ - nlmsvc_rqst = svc_prepare_thread(serv, &serv->sv_pools[0]); + nlmsvc_rqst = svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE); if (IS_ERR(nlmsvc_rqst)) { error = PTR_ERR(nlmsvc_rqst); nlmsvc_rqst = NULL; diff --git a/fs/locks.c b/fs/locks.c index 703f545097d..3b0d05dcd7c 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -60,7 +60,7 @@ * * Initial implementation of mandatory locks. SunOS turned out to be * a rotten model, so I implemented the "obvious" semantics. - * See 'Documentation/mandatory.txt' for details. + * See 'Documentation/filesystems/mandatory-locking.txt' for details. * Andy Walker (andy@lysaker.kvaerner.no), April 06, 1996. * * Don't allow mandatory locks on mmap()'ed files. Added simple functions to @@ -133,6 +133,20 @@ #define IS_FLOCK(fl) (fl->fl_flags & FL_FLOCK) #define IS_LEASE(fl) (fl->fl_flags & FL_LEASE) +static bool lease_breaking(struct file_lock *fl) +{ + return fl->fl_flags & (FL_UNLOCK_PENDING | FL_DOWNGRADE_PENDING); +} + +static int target_leasetype(struct file_lock *fl) +{ + if (fl->fl_flags & FL_UNLOCK_PENDING) + return F_UNLCK; + if (fl->fl_flags & FL_DOWNGRADE_PENDING) + return F_RDLCK; + return fl->fl_type; +} + int leases_enable = 1; int lease_break_time = 45; @@ -1119,6 +1133,17 @@ int locks_mandatory_area(int read_write, struct inode *inode, EXPORT_SYMBOL(locks_mandatory_area); +static void lease_clear_pending(struct file_lock *fl, int arg) +{ + switch (arg) { + case F_UNLCK: + fl->fl_flags &= ~FL_UNLOCK_PENDING; + /* fall through: */ + case F_RDLCK: + fl->fl_flags &= ~FL_DOWNGRADE_PENDING; + } +} + /* We already had a lease on this file; just change its type */ int lease_modify(struct file_lock **before, int arg) { @@ -1127,6 +1152,7 @@ int lease_modify(struct file_lock **before, int arg) if (error) return error; + lease_clear_pending(fl, arg); locks_wake_up_blocks(fl); if (arg == F_UNLCK) locks_delete_lock(before); @@ -1135,19 +1161,25 @@ int lease_modify(struct file_lock **before, int arg) EXPORT_SYMBOL(lease_modify); +static bool past_time(unsigned long then) +{ + if (!then) + /* 0 is a special value meaning "this never expires": */ + return false; + return time_after(jiffies, then); +} + static void time_out_leases(struct inode *inode) { struct file_lock **before; struct file_lock *fl; before = &inode->i_flock; - while ((fl = *before) && IS_LEASE(fl) && (fl->fl_type & F_INPROGRESS)) { - if ((fl->fl_break_time == 0) - || time_before(jiffies, fl->fl_break_time)) { - before = &fl->fl_next; - continue; - } - lease_modify(before, fl->fl_type & ~F_INPROGRESS); + while ((fl = *before) && IS_LEASE(fl) && lease_breaking(fl)) { + if (past_time(fl->fl_downgrade_time)) + lease_modify(before, F_RDLCK); + if (past_time(fl->fl_break_time)) + lease_modify(before, F_UNLCK); if (fl == *before) /* lease_modify may have freed fl */ before = &fl->fl_next; } @@ -1165,7 +1197,7 @@ static void time_out_leases(struct inode *inode) */ int __break_lease(struct inode *inode, unsigned int mode) { - int error = 0, future; + int error = 0; struct file_lock *new_fl, *flock; struct file_lock *fl; unsigned long break_time; @@ -1182,24 +1214,13 @@ int __break_lease(struct inode *inode, unsigned int mode) if ((flock == NULL) || !IS_LEASE(flock)) goto out; + if (!locks_conflict(flock, new_fl)) + goto out; + for (fl = flock; fl && IS_LEASE(fl); fl = fl->fl_next) if (fl->fl_owner == current->files) i_have_this_lease = 1; - if (want_write) { - /* If we want write access, we have to revoke any lease. */ - future = F_UNLCK | F_INPROGRESS; - } else if (flock->fl_type & F_INPROGRESS) { - /* If the lease is already being broken, we just leave it */ - future = flock->fl_type; - } else if (flock->fl_type & F_WRLCK) { - /* Downgrade the exclusive lease to a read-only lease. */ - future = F_RDLCK | F_INPROGRESS; - } else { - /* the existing lease was read-only, so we can read too. */ - goto out; - } - if (IS_ERR(new_fl) && !i_have_this_lease && ((mode & O_NONBLOCK) == 0)) { error = PTR_ERR(new_fl); @@ -1214,12 +1235,18 @@ int __break_lease(struct inode *inode, unsigned int mode) } for (fl = flock; fl && IS_LEASE(fl); fl = fl->fl_next) { - if (fl->fl_type != future) { - fl->fl_type = future; + if (want_write) { + if (fl->fl_flags & FL_UNLOCK_PENDING) + continue; + fl->fl_flags |= FL_UNLOCK_PENDING; fl->fl_break_time = break_time; - /* lease must have lmops break callback */ - fl->fl_lmops->lm_break(fl); + } else { + if (lease_breaking(flock)) + continue; + fl->fl_flags |= FL_DOWNGRADE_PENDING; + fl->fl_downgrade_time = break_time; } + fl->fl_lmops->lm_break(fl); } if (i_have_this_lease || (mode & O_NONBLOCK)) { @@ -1243,10 +1270,13 @@ restart: if (error >= 0) { if (error == 0) time_out_leases(inode); - /* Wait for the next lease that has not been broken yet */ + /* + * Wait for the next conflicting lease that has not been + * broken yet + */ for (flock = inode->i_flock; flock && IS_LEASE(flock); flock = flock->fl_next) { - if (flock->fl_type & F_INPROGRESS) + if (locks_conflict(new_fl, flock)) goto restart; } error = 0; @@ -1314,7 +1344,7 @@ int fcntl_getlease(struct file *filp) for (fl = filp->f_path.dentry->d_inode->i_flock; fl && IS_LEASE(fl); fl = fl->fl_next) { if (fl->fl_file == filp) { - type = fl->fl_type & ~F_INPROGRESS; + type = target_leasetype(fl); break; } } @@ -1322,50 +1352,23 @@ int fcntl_getlease(struct file *filp) return type; } -/** - * generic_setlease - sets a lease on an open file - * @filp: file pointer - * @arg: type of lease to obtain - * @flp: input - file_lock to use, output - file_lock inserted - * - * The (input) flp->fl_lmops->lm_break function is required - * by break_lease(). - * - * Called with file_lock_lock held. - */ -int generic_setlease(struct file *filp, long arg, struct file_lock **flp) +int generic_add_lease(struct file *filp, long arg, struct file_lock **flp) { struct file_lock *fl, **before, **my_before = NULL, *lease; struct dentry *dentry = filp->f_path.dentry; struct inode *inode = dentry->d_inode; - int error, rdlease_count = 0, wrlease_count = 0; + int error; lease = *flp; - error = -EACCES; - if ((current_fsuid() != inode->i_uid) && !capable(CAP_LEASE)) - goto out; - error = -EINVAL; - if (!S_ISREG(inode->i_mode)) + error = -EAGAIN; + if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0)) goto out; - error = security_file_lock(filp, arg); - if (error) + if ((arg == F_WRLCK) + && ((dentry->d_count > 1) + || (atomic_read(&inode->i_count) > 1))) goto out; - time_out_leases(inode); - - BUG_ON(!(*flp)->fl_lmops->lm_break); - - if (arg != F_UNLCK) { - error = -EAGAIN; - if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0)) - goto out; - if ((arg == F_WRLCK) - && ((dentry->d_count > 1) - || (atomic_read(&inode->i_count) > 1))) - goto out; - } - /* * At this point, we know that if there is an exclusive * lease on this file, then we hold it on this filp @@ -1374,27 +1377,28 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp) * then the file is not open by anyone (including us) * except for this filp. */ + error = -EAGAIN; for (before = &inode->i_flock; ((fl = *before) != NULL) && IS_LEASE(fl); before = &fl->fl_next) { - if (fl->fl_file == filp) + if (fl->fl_file == filp) { my_before = before; - else if (fl->fl_type == (F_INPROGRESS | F_UNLCK)) - /* - * Someone is in the process of opening this - * file for writing so we may not take an - * exclusive lease on it. - */ - wrlease_count++; - else - rdlease_count++; + continue; + } + /* + * No exclusive leases if someone else has a lease on + * this file: + */ + if (arg == F_WRLCK) + goto out; + /* + * Modifying our existing lease is OK, but no getting a + * new lease if someone else is opening for write: + */ + if (fl->fl_flags & FL_UNLOCK_PENDING) + goto out; } - error = -EAGAIN; - if ((arg == F_RDLCK && (wrlease_count > 0)) || - (arg == F_WRLCK && ((rdlease_count + wrlease_count) > 0))) - goto out; - if (my_before != NULL) { error = lease->fl_lmops->lm_change(my_before, arg); if (!error) @@ -1402,9 +1406,6 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp) goto out; } - if (arg == F_UNLCK) - goto out; - error = -EINVAL; if (!leases_enable) goto out; @@ -1415,6 +1416,62 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp) out: return error; } + +int generic_delete_lease(struct file *filp, struct file_lock **flp) +{ + struct file_lock *fl, **before; + struct dentry *dentry = filp->f_path.dentry; + struct inode *inode = dentry->d_inode; + + for (before = &inode->i_flock; + ((fl = *before) != NULL) && IS_LEASE(fl); + before = &fl->fl_next) { + if (fl->fl_file != filp) + continue; + return (*flp)->fl_lmops->lm_change(before, F_UNLCK); + } + return -EAGAIN; +} + +/** + * generic_setlease - sets a lease on an open file + * @filp: file pointer + * @arg: type of lease to obtain + * @flp: input - file_lock to use, output - file_lock inserted + * + * The (input) flp->fl_lmops->lm_break function is required + * by break_lease(). + * + * Called with file_lock_lock held. + */ +int generic_setlease(struct file *filp, long arg, struct file_lock **flp) +{ + struct dentry *dentry = filp->f_path.dentry; + struct inode *inode = dentry->d_inode; + int error; + + if ((current_fsuid() != inode->i_uid) && !capable(CAP_LEASE)) + return -EACCES; + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + error = security_file_lock(filp, arg); + if (error) + return error; + + time_out_leases(inode); + + BUG_ON(!(*flp)->fl_lmops->lm_break); + + switch (arg) { + case F_UNLCK: + return generic_delete_lease(filp, flp); + case F_RDLCK: + case F_WRLCK: + return generic_add_lease(filp, arg, flp); + default: + BUG(); + } +} EXPORT_SYMBOL(generic_setlease); static int __vfs_setlease(struct file *filp, long arg, struct file_lock **lease) @@ -2126,7 +2183,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl, } } else if (IS_LEASE(fl)) { seq_printf(f, "LEASE "); - if (fl->fl_type & F_INPROGRESS) + if (lease_breaking(fl)) seq_printf(f, "BREAKING "); else if (fl->fl_file) seq_printf(f, "ACTIVE "); @@ -2142,7 +2199,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl, : (fl->fl_type & LOCK_WRITE) ? "WRITE" : "NONE "); } else { seq_printf(f, "%s ", - (fl->fl_type & F_INPROGRESS) + (lease_breaking(fl)) ? (fl->fl_type & F_UNLCK) ? "UNLCK" : "READ " : (fl->fl_type & F_WRLCK) ? "WRITE" : "READ "); } diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 9561c8fc8bd..281ae95932c 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -176,17 +176,6 @@ retry: return bio; } -static void bl_set_lo_fail(struct pnfs_layout_segment *lseg) -{ - if (lseg->pls_range.iomode == IOMODE_RW) { - dprintk("%s Setting layout IOMODE_RW fail bit\n", __func__); - set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags); - } else { - dprintk("%s Setting layout IOMODE_READ fail bit\n", __func__); - set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags); - } -} - /* This is basically copied from mpage_end_io_read */ static void bl_end_io_read(struct bio *bio, int err) { @@ -206,7 +195,7 @@ static void bl_end_io_read(struct bio *bio, int err) if (!uptodate) { if (!rdata->pnfs_error) rdata->pnfs_error = -EIO; - bl_set_lo_fail(rdata->lseg); + pnfs_set_lo_fail(rdata->lseg); } bio_put(bio); put_parallel(par); @@ -303,6 +292,7 @@ bl_read_pagelist(struct nfs_read_data *rdata) bl_end_io_read, par); if (IS_ERR(bio)) { rdata->pnfs_error = PTR_ERR(bio); + bio = NULL; goto out; } } @@ -370,7 +360,7 @@ static void bl_end_io_write_zero(struct bio *bio, int err) if (!uptodate) { if (!wdata->pnfs_error) wdata->pnfs_error = -EIO; - bl_set_lo_fail(wdata->lseg); + pnfs_set_lo_fail(wdata->lseg); } bio_put(bio); put_parallel(par); @@ -386,7 +376,7 @@ static void bl_end_io_write(struct bio *bio, int err) if (!uptodate) { if (!wdata->pnfs_error) wdata->pnfs_error = -EIO; - bl_set_lo_fail(wdata->lseg); + pnfs_set_lo_fail(wdata->lseg); } bio_put(bio); put_parallel(par); @@ -543,6 +533,11 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync) fill_invalid_ext: dprintk("%s need to zero %d pages\n", __func__, npg_zero); for (;npg_zero > 0; npg_zero--) { + if (bl_is_sector_init(be->be_inval, isect)) { + dprintk("isect %llu already init\n", + (unsigned long long)isect); + goto next_page; + } /* page ref released in bl_end_io_write_zero */ index = isect >> PAGE_CACHE_SECTOR_SHIFT; dprintk("%s zero %dth page: index %lu isect %llu\n", @@ -562,8 +557,7 @@ fill_invalid_ext: * PageUptodate: It was read before * sector_initialized: already written out */ - if (PageDirty(page) || PageWriteback(page) || - bl_is_sector_init(be->be_inval, isect)) { + if (PageDirty(page) || PageWriteback(page)) { print_page(page); unlock_page(page); page_cache_release(page); @@ -592,6 +586,7 @@ fill_invalid_ext: bl_end_io_write_zero, par); if (IS_ERR(bio)) { wdata->pnfs_error = PTR_ERR(bio); + bio = NULL; goto out; } /* FIXME: This should be done in bi_end_io */ @@ -640,6 +635,7 @@ next_page: bl_end_io_write, par); if (IS_ERR(bio)) { wdata->pnfs_error = PTR_ERR(bio); + bio = NULL; goto out; } isect += PAGE_CACHE_SECTORS; @@ -805,7 +801,7 @@ nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh, struct nfs4_deviceid *d_id) { struct pnfs_device *dev; - struct pnfs_block_dev *rv = NULL; + struct pnfs_block_dev *rv; u32 max_resp_sz; int max_pages; struct page **pages = NULL; @@ -823,18 +819,20 @@ nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh, dev = kmalloc(sizeof(*dev), GFP_NOFS); if (!dev) { dprintk("%s kmalloc failed\n", __func__); - return NULL; + return ERR_PTR(-ENOMEM); } pages = kzalloc(max_pages * sizeof(struct page *), GFP_NOFS); if (pages == NULL) { kfree(dev); - return NULL; + return ERR_PTR(-ENOMEM); } for (i = 0; i < max_pages; i++) { pages[i] = alloc_page(GFP_NOFS); - if (!pages[i]) + if (!pages[i]) { + rv = ERR_PTR(-ENOMEM); goto out_free; + } } memcpy(&dev->dev_id, d_id, sizeof(*d_id)); @@ -847,8 +845,10 @@ nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh, dprintk("%s: dev_id: %s\n", __func__, dev->dev_id.data); rc = nfs4_proc_getdeviceinfo(server, dev); dprintk("%s getdevice info returns %d\n", __func__, rc); - if (rc) + if (rc) { + rv = ERR_PTR(rc); goto out_free; + } rv = nfs4_blk_decode_device(server, dev); out_free: @@ -866,7 +866,7 @@ bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh) struct pnfs_devicelist *dlist = NULL; struct pnfs_block_dev *bdev; LIST_HEAD(block_disklist); - int status = 0, i; + int status, i; dprintk("%s enter\n", __func__); @@ -898,8 +898,8 @@ bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh) for (i = 0; i < dlist->num_devs; i++) { bdev = nfs4_blk_get_deviceinfo(server, fh, &dlist->dev_id[i]); - if (!bdev) { - status = -ENODEV; + if (IS_ERR(bdev)) { + status = PTR_ERR(bdev); goto out_error; } spin_lock(&b_mt_id->bm_lock); @@ -960,7 +960,7 @@ static struct pnfs_layoutdriver_type blocklayout_type = { }; static const struct rpc_pipe_ops bl_upcall_ops = { - .upcall = bl_pipe_upcall, + .upcall = rpc_pipe_generic_upcall, .downcall = bl_pipe_downcall, .destroy_msg = bl_pipe_destroy_msg, }; @@ -989,17 +989,20 @@ static int __init nfs4blocklayout_init(void) mnt, NFS_PIPE_DIRNAME, 0, &path); if (ret) - goto out_remove; + goto out_putrpc; bl_device_pipe = rpc_mkpipe(path.dentry, "blocklayout", NULL, &bl_upcall_ops, 0); + path_put(&path); if (IS_ERR(bl_device_pipe)) { ret = PTR_ERR(bl_device_pipe); - goto out_remove; + goto out_putrpc; } out: return ret; +out_putrpc: + rpc_put_mount(); out_remove: pnfs_unregister_layoutdriver(&blocklayout_type); return ret; @@ -1012,6 +1015,7 @@ static void __exit nfs4blocklayout_exit(void) pnfs_unregister_layoutdriver(&blocklayout_type); rpc_unlink(bl_device_pipe); + rpc_put_mount(); } MODULE_ALIAS("nfs-layouttype4-3"); diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h index f27d827960a..42acf7ef599 100644 --- a/fs/nfs/blocklayout/blocklayout.h +++ b/fs/nfs/blocklayout/blocklayout.h @@ -150,7 +150,7 @@ BLK_LSEG2EXT(struct pnfs_layout_segment *lseg) } struct bl_dev_msg { - int status; + int32_t status; uint32_t major, minor; }; @@ -169,8 +169,6 @@ extern wait_queue_head_t bl_wq; #define BL_DEVICE_REQUEST_ERR 0x2 /* User level process fails */ /* blocklayoutdev.c */ -ssize_t bl_pipe_upcall(struct file *, struct rpc_pipe_msg *, - char __user *, size_t); ssize_t bl_pipe_downcall(struct file *, const char __user *, size_t); void bl_pipe_destroy_msg(struct rpc_pipe_msg *); struct block_device *nfs4_blkdev_get(dev_t dev); diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c index a83b393fb01..d08ba9107fd 100644 --- a/fs/nfs/blocklayout/blocklayoutdev.c +++ b/fs/nfs/blocklayout/blocklayoutdev.c @@ -79,28 +79,6 @@ int nfs4_blkdev_put(struct block_device *bdev) return blkdev_put(bdev, FMODE_READ); } -/* - * Shouldn't there be a rpc_generic_upcall() to do this for us? - */ -ssize_t bl_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg, - char __user *dst, size_t buflen) -{ - char *data = (char *)msg->data + msg->copied; - size_t mlen = min(msg->len - msg->copied, buflen); - unsigned long left; - - left = copy_to_user(dst, data, mlen); - if (left == mlen) { - msg->errno = -EFAULT; - return -EFAULT; - } - - mlen -= left; - msg->copied += mlen; - msg->errno = 0; - return mlen; -} - static struct bl_dev_msg bl_mount_reply; ssize_t bl_pipe_downcall(struct file *filp, const char __user *src, @@ -131,7 +109,7 @@ struct pnfs_block_dev * nfs4_blk_decode_device(struct nfs_server *server, struct pnfs_device *dev) { - struct pnfs_block_dev *rv = NULL; + struct pnfs_block_dev *rv; struct block_device *bd = NULL; struct rpc_pipe_msg msg; struct bl_msg_hdr bl_msg = { @@ -141,7 +119,7 @@ nfs4_blk_decode_device(struct nfs_server *server, uint8_t *dataptr; DECLARE_WAITQUEUE(wq, current); struct bl_dev_msg *reply = &bl_mount_reply; - int offset, len, i; + int offset, len, i, rc; dprintk("%s CREATING PIPEFS MESSAGE\n", __func__); dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data, @@ -168,8 +146,10 @@ nfs4_blk_decode_device(struct nfs_server *server, dprintk("%s CALLING USERSPACE DAEMON\n", __func__); add_wait_queue(&bl_wq, &wq); - if (rpc_queue_upcall(bl_device_pipe->d_inode, &msg) < 0) { + rc = rpc_queue_upcall(bl_device_pipe->d_inode, &msg); + if (rc < 0) { remove_wait_queue(&bl_wq, &wq); + rv = ERR_PTR(rc); goto out; } @@ -187,8 +167,9 @@ nfs4_blk_decode_device(struct nfs_server *server, bd = nfs4_blkdev_get(MKDEV(reply->major, reply->minor)); if (IS_ERR(bd)) { - dprintk("%s failed to open device : %ld\n", - __func__, PTR_ERR(bd)); + rc = PTR_ERR(bd); + dprintk("%s failed to open device : %d\n", __func__, rc); + rv = ERR_PTR(rc); goto out; } diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index e3d29426905..516f3375e06 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -125,7 +125,7 @@ nfs4_callback_up(struct svc_serv *serv) else goto out_err; - return svc_prepare_thread(serv, &serv->sv_pools[0]); + return svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE); out_err: if (ret == 0) @@ -199,7 +199,7 @@ nfs41_callback_up(struct svc_serv *serv, struct rpc_xprt *xprt) INIT_LIST_HEAD(&serv->sv_cb_list); spin_lock_init(&serv->sv_cb_lock); init_waitqueue_head(&serv->sv_cb_waitq); - rqstp = svc_prepare_thread(serv, &serv->sv_pools[0]); + rqstp = svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE); if (IS_ERR(rqstp)) { svc_xprt_put(serv->sv_bc_xprt); serv->sv_bc_xprt = NULL; diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 5833fbbf59b..873bf00d51a 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -336,11 +336,12 @@ static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1, const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1; const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2; - if (ipv6_addr_scope(&sin1->sin6_addr) == IPV6_ADDR_SCOPE_LINKLOCAL && - sin1->sin6_scope_id != sin2->sin6_scope_id) + if (!ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr)) return 0; + else if (ipv6_addr_type(&sin1->sin6_addr) & IPV6_ADDR_LINKLOCAL) + return sin1->sin6_scope_id == sin2->sin6_scope_id; - return ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr); + return 1; } #else /* !defined(CONFIG_IPV6) && !defined(CONFIG_IPV6_MODULE) */ static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1, @@ -1867,6 +1868,10 @@ static int nfs_server_list_show(struct seq_file *m, void *v) /* display one transport per line on subsequent lines */ clp = list_entry(v, struct nfs_client, cl_share_link); + /* Check if the client is initialized */ + if (clp->cl_cons_state != NFS_CS_READY) + return 0; + seq_printf(m, "v%u %s %s %3d %s\n", clp->rpc_ops->version, rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR), diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 321a66bc384..7f265406980 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -240,7 +240,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct sizeof(delegation->stateid.data)); delegation->type = res->delegation_type; delegation->maxsize = res->maxsize; - delegation->change_attr = nfsi->change_attr; + delegation->change_attr = inode->i_version; delegation->cred = get_rpccred(cred); delegation->inode = inode; delegation->flags = 1<<NFS_DELEGATION_REFERENCED; diff --git a/fs/nfs/fscache-index.c b/fs/nfs/fscache-index.c index 5b1006480bc..7cf2c4699b0 100644 --- a/fs/nfs/fscache-index.c +++ b/fs/nfs/fscache-index.c @@ -212,7 +212,7 @@ static uint16_t nfs_fscache_inode_get_aux(const void *cookie_netfs_data, auxdata.ctime = nfsi->vfs_inode.i_ctime; if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4) - auxdata.change_attr = nfsi->change_attr; + auxdata.change_attr = nfsi->vfs_inode.i_version; if (bufmax > sizeof(auxdata)) bufmax = sizeof(auxdata); @@ -244,7 +244,7 @@ enum fscache_checkaux nfs_fscache_inode_check_aux(void *cookie_netfs_data, auxdata.ctime = nfsi->vfs_inode.i_ctime; if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4) - auxdata.change_attr = nfsi->change_attr; + auxdata.change_attr = nfsi->vfs_inode.i_version; if (memcmp(data, &auxdata, datalen) != 0) return FSCACHE_CHECKAUX_OBSOLETE; diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c index f20801ae0a1..47d1c6ff2d8 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c @@ -336,8 +336,6 @@ struct idmap { struct idmap_hashtable idmap_group_hash; }; -static ssize_t idmap_pipe_upcall(struct file *, struct rpc_pipe_msg *, - char __user *, size_t); static ssize_t idmap_pipe_downcall(struct file *, const char __user *, size_t); static void idmap_pipe_destroy_msg(struct rpc_pipe_msg *); @@ -345,7 +343,7 @@ static void idmap_pipe_destroy_msg(struct rpc_pipe_msg *); static unsigned int fnvhash32(const void *, size_t); static const struct rpc_pipe_ops idmap_upcall_ops = { - .upcall = idmap_pipe_upcall, + .upcall = rpc_pipe_generic_upcall, .downcall = idmap_pipe_downcall, .destroy_msg = idmap_pipe_destroy_msg, }; @@ -595,27 +593,6 @@ nfs_idmap_name(struct idmap *idmap, struct idmap_hashtable *h, return ret; } -/* RPC pipefs upcall/downcall routines */ -static ssize_t -idmap_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg, - char __user *dst, size_t buflen) -{ - char *data = (char *)msg->data + msg->copied; - size_t mlen = min(msg->len, buflen); - unsigned long left; - - left = copy_to_user(dst, data, mlen); - if (left == mlen) { - msg->errno = -EFAULT; - return -EFAULT; - } - - mlen -= left; - msg->copied += mlen; - msg->errno = 0; - return mlen; -} - static ssize_t idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) { diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index fe1203797b2..4dc6d078f10 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -318,7 +318,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) memset(&inode->i_atime, 0, sizeof(inode->i_atime)); memset(&inode->i_mtime, 0, sizeof(inode->i_mtime)); memset(&inode->i_ctime, 0, sizeof(inode->i_ctime)); - nfsi->change_attr = 0; + inode->i_version = 0; inode->i_size = 0; inode->i_nlink = 0; inode->i_uid = -2; @@ -344,7 +344,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; if (fattr->valid & NFS_ATTR_FATTR_CHANGE) - nfsi->change_attr = fattr->change_attr; + inode->i_version = fattr->change_attr; else if (nfs_server_capable(inode, NFS_CAP_CHANGE_ATTR)) nfsi->cache_validity |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_DATA; @@ -897,8 +897,8 @@ static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE) && (fattr->valid & NFS_ATTR_FATTR_CHANGE) - && nfsi->change_attr == fattr->pre_change_attr) { - nfsi->change_attr = fattr->change_attr; + && inode->i_version == fattr->pre_change_attr) { + inode->i_version = fattr->change_attr; if (S_ISDIR(inode->i_mode)) nfsi->cache_validity |= NFS_INO_INVALID_DATA; ret |= NFS_INO_INVALID_ATTR; @@ -952,7 +952,7 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat return -EIO; if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && - nfsi->change_attr != fattr->change_attr) + inode->i_version != fattr->change_attr) invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; /* Verify a few of the more important attributes */ @@ -1163,7 +1163,7 @@ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fa } if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && (fattr->valid & NFS_ATTR_FATTR_PRECHANGE) == 0) { - fattr->pre_change_attr = NFS_I(inode)->change_attr; + fattr->pre_change_attr = inode->i_version; fattr->valid |= NFS_ATTR_FATTR_PRECHANGE; } if ((fattr->valid & NFS_ATTR_FATTR_CTIME) != 0 && @@ -1244,13 +1244,13 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) /* More cache consistency checks */ if (fattr->valid & NFS_ATTR_FATTR_CHANGE) { - if (nfsi->change_attr != fattr->change_attr) { + if (inode->i_version != fattr->change_attr) { dprintk("NFS: change_attr change on server for file %s/%ld\n", inode->i_sb->s_id, inode->i_ino); invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; if (S_ISDIR(inode->i_mode)) nfs_force_lookup_revalidate(inode); - nfsi->change_attr = fattr->change_attr; + inode->i_version = fattr->change_attr; } } else if (server->caps & NFS_CAP_CHANGE_ATTR) invalid |= save_cache_validity; diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index ab12913dd47..c1a1bd8ddf1 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -457,13 +457,3 @@ unsigned int nfs_page_array_len(unsigned int base, size_t len) PAGE_SIZE - 1) >> PAGE_SHIFT; } -/* - * Helper for restarting RPC calls in the possible presence of NFSv4.1 - * sessions. - */ -static inline int nfs_restart_rpc(struct rpc_task *task, const struct nfs_client *clp) -{ - if (nfs4_has_session(clp)) - return rpc_restart_call_prepare(task); - return rpc_restart_call(task); -} diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 3e93e9a1bee..693ae22f873 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -13,30 +13,6 @@ struct idmap; -/* - * In a seqid-mutating op, this macro controls which error return - * values trigger incrementation of the seqid. - * - * from rfc 3010: - * The client MUST monotonically increment the sequence number for the - * CLOSE, LOCK, LOCKU, OPEN, OPEN_CONFIRM, and OPEN_DOWNGRADE - * operations. This is true even in the event that the previous - * operation that used the sequence number received an error. The only - * exception to this rule is if the previous operation received one of - * the following errors: NFSERR_STALE_CLIENTID, NFSERR_STALE_STATEID, - * NFSERR_BAD_STATEID, NFSERR_BAD_SEQID, NFSERR_BADXDR, - * NFSERR_RESOURCE, NFSERR_NOFILEHANDLE. - * - */ -#define seqid_mutating_err(err) \ -(((err) != NFSERR_STALE_CLIENTID) && \ - ((err) != NFSERR_STALE_STATEID) && \ - ((err) != NFSERR_BAD_STATEID) && \ - ((err) != NFSERR_BAD_SEQID) && \ - ((err) != NFSERR_BAD_XDR) && \ - ((err) != NFSERR_RESOURCE) && \ - ((err) != NFSERR_NOFILEHANDLE)) - enum nfs4_client_state { NFS4CLNT_MANAGER_RUNNING = 0, NFS4CLNT_CHECK_LEASE, diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index e8915d4840a..09119418402 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -77,19 +77,6 @@ filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset) BUG(); } -/* For data server errors we don't recover from */ -static void -filelayout_set_lo_fail(struct pnfs_layout_segment *lseg) -{ - if (lseg->pls_range.iomode == IOMODE_RW) { - dprintk("%s Setting layout IOMODE_RW fail bit\n", __func__); - set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags); - } else { - dprintk("%s Setting layout IOMODE_READ fail bit\n", __func__); - set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags); - } -} - static int filelayout_async_handle_error(struct rpc_task *task, struct nfs4_state *state, struct nfs_client *clp, @@ -135,7 +122,6 @@ static int filelayout_async_handle_error(struct rpc_task *task, static int filelayout_read_done_cb(struct rpc_task *task, struct nfs_read_data *data) { - struct nfs_client *clp = data->ds_clp; int reset = 0; dprintk("%s DS read\n", __func__); @@ -145,11 +131,10 @@ static int filelayout_read_done_cb(struct rpc_task *task, dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n", __func__, data->ds_clp, data->ds_clp->cl_session); if (reset) { - filelayout_set_lo_fail(data->lseg); + pnfs_set_lo_fail(data->lseg); nfs4_reset_read(task, data); - clp = NFS_SERVER(data->inode)->nfs_client; } - nfs_restart_rpc(task, clp); + rpc_restart_call_prepare(task); return -EAGAIN; } @@ -216,17 +201,13 @@ static int filelayout_write_done_cb(struct rpc_task *task, if (filelayout_async_handle_error(task, data->args.context->state, data->ds_clp, &reset) == -EAGAIN) { - struct nfs_client *clp; - dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n", __func__, data->ds_clp, data->ds_clp->cl_session); if (reset) { - filelayout_set_lo_fail(data->lseg); + pnfs_set_lo_fail(data->lseg); nfs4_reset_write(task, data); - clp = NFS_SERVER(data->inode)->nfs_client; - } else - clp = data->ds_clp; - nfs_restart_rpc(task, clp); + } + rpc_restart_call_prepare(task); return -EAGAIN; } @@ -256,9 +237,9 @@ static int filelayout_commit_done_cb(struct rpc_task *task, __func__, data->ds_clp, data->ds_clp->cl_session); if (reset) { prepare_to_resend_writes(data); - filelayout_set_lo_fail(data->lseg); + pnfs_set_lo_fail(data->lseg); } else - nfs_restart_rpc(task, data->ds_clp); + rpc_restart_call_prepare(task); return -EAGAIN; } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 4700fae1ada..d2ae413c986 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -73,9 +73,6 @@ static int _nfs4_proc_open(struct nfs4_opendata *data); static int _nfs4_recover_proc_open(struct nfs4_opendata *data); static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *); -static int _nfs4_proc_lookup(struct rpc_clnt *client, struct inode *dir, - const struct qstr *name, struct nfs_fh *fhandle, - struct nfs_fattr *fattr); static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr); static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, struct nfs_fattr *fattr, struct iattr *sattr, @@ -753,9 +750,9 @@ static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo) spin_lock(&dir->i_lock); nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_DATA; - if (!cinfo->atomic || cinfo->before != nfsi->change_attr) + if (!cinfo->atomic || cinfo->before != dir->i_version) nfs_force_lookup_revalidate(dir); - nfsi->change_attr = cinfo->after; + dir->i_version = cinfo->after; spin_unlock(&dir->i_lock); } @@ -1596,8 +1593,14 @@ static int _nfs4_proc_open(struct nfs4_opendata *data) int status; status = nfs4_run_open_task(data, 0); - if (status != 0 || !data->rpc_done) + if (!data->rpc_done) + return status; + if (status != 0) { + if (status == -NFS4ERR_BADNAME && + !(o_arg->open_flags & O_CREAT)) + return -ENOENT; return status; + } if (o_arg->open_flags & O_CREAT) { update_changeattr(dir, &o_res->cinfo); @@ -2408,14 +2411,15 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, return status; } -static int _nfs4_proc_lookupfh(struct rpc_clnt *clnt, struct nfs_server *server, - const struct nfs_fh *dirfh, const struct qstr *name, - struct nfs_fh *fhandle, struct nfs_fattr *fattr) +static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, + const struct qstr *name, struct nfs_fh *fhandle, + struct nfs_fattr *fattr) { + struct nfs_server *server = NFS_SERVER(dir); int status; struct nfs4_lookup_arg args = { .bitmask = server->attr_bitmask, - .dir_fh = dirfh, + .dir_fh = NFS_FH(dir), .name = name, }; struct nfs4_lookup_res res = { @@ -2431,40 +2435,8 @@ static int _nfs4_proc_lookupfh(struct rpc_clnt *clnt, struct nfs_server *server, nfs_fattr_init(fattr); - dprintk("NFS call lookupfh %s\n", name->name); - status = nfs4_call_sync(clnt, server, &msg, &args.seq_args, &res.seq_res, 0); - dprintk("NFS reply lookupfh: %d\n", status); - return status; -} - -static int nfs4_proc_lookupfh(struct nfs_server *server, struct nfs_fh *dirfh, - struct qstr *name, struct nfs_fh *fhandle, - struct nfs_fattr *fattr) -{ - struct nfs4_exception exception = { }; - int err; - do { - err = _nfs4_proc_lookupfh(server->client, server, dirfh, name, fhandle, fattr); - /* FIXME: !!!! */ - if (err == -NFS4ERR_MOVED) { - err = -EREMOTE; - break; - } - err = nfs4_handle_exception(server, err, &exception); - } while (exception.retry); - return err; -} - -static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, - const struct qstr *name, struct nfs_fh *fhandle, - struct nfs_fattr *fattr) -{ - int status; - dprintk("NFS call lookup %s\n", name->name); - status = _nfs4_proc_lookupfh(clnt, NFS_SERVER(dir), NFS_FH(dir), name, fhandle, fattr); - if (status == -NFS4ERR_MOVED) - status = nfs4_get_referral(dir, name, fattr, fhandle); + status = nfs4_call_sync(clnt, server, &msg, &args.seq_args, &res.seq_res, 0); dprintk("NFS reply lookup: %d\n", status); return status; } @@ -2485,11 +2457,20 @@ static int nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, struct qst struct nfs4_exception exception = { }; int err; do { - err = nfs4_handle_exception(NFS_SERVER(dir), - _nfs4_proc_lookup(clnt, dir, name, fhandle, fattr), - &exception); - if (err == -EPERM) + int status; + + status = _nfs4_proc_lookup(clnt, dir, name, fhandle, fattr); + switch (status) { + case -NFS4ERR_BADNAME: + return -ENOENT; + case -NFS4ERR_MOVED: + err = nfs4_get_referral(dir, name, fattr, fhandle); + break; + case -NFS4ERR_WRONGSEC: nfs_fixup_secinfo_attributes(fattr, fhandle); + } + err = nfs4_handle_exception(NFS_SERVER(dir), + status, &exception); } while (exception.retry); return err; } @@ -3210,7 +3191,7 @@ static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data) struct nfs_server *server = NFS_SERVER(data->inode); if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) { - nfs_restart_rpc(task, server->nfs_client); + rpc_restart_call_prepare(task); return -EAGAIN; } @@ -3260,7 +3241,7 @@ static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data struct inode *inode = data->inode; if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) { - nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client); + rpc_restart_call_prepare(task); return -EAGAIN; } if (task->tk_status >= 0) { @@ -3317,7 +3298,7 @@ static int nfs4_commit_done_cb(struct rpc_task *task, struct nfs_write_data *dat struct inode *inode = data->inode; if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) { - nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client); + rpc_restart_call_prepare(task); return -EAGAIN; } nfs_refresh_inode(inode, data->res.fattr); @@ -3857,7 +3838,7 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) default: if (nfs4_async_handle_error(task, data->res.server, NULL) == -EAGAIN) { - nfs_restart_rpc(task, data->res.server->nfs_client); + rpc_restart_call_prepare(task); return; } } @@ -4111,8 +4092,7 @@ static void nfs4_locku_done(struct rpc_task *task, void *data) break; default: if (nfs4_async_handle_error(task, calldata->server, NULL) == -EAGAIN) - nfs_restart_rpc(task, - calldata->server->nfs_client); + rpc_restart_call_prepare(task); } } @@ -4945,7 +4925,7 @@ static void nfs4_get_lease_time_done(struct rpc_task *task, void *calldata) task->tk_status = 0; /* fall through */ case -NFS4ERR_RETRY_UNCACHED_REP: - nfs_restart_rpc(task, data->clp); + rpc_restart_call_prepare(task); return; } dprintk("<-- %s\n", __func__); @@ -5786,7 +5766,7 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) server = NFS_SERVER(lrp->args.inode); if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) { - nfs_restart_rpc(task, lrp->clp); + rpc_restart_call_prepare(task); return; } spin_lock(&lo->plh_inode->i_lock); @@ -5957,7 +5937,7 @@ nfs4_layoutcommit_done(struct rpc_task *task, void *calldata) } if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) { - nfs_restart_rpc(task, server->nfs_client); + rpc_restart_call_prepare(task); return; } @@ -6270,7 +6250,6 @@ const struct nfs_rpc_ops nfs_v4_clientops = { .getroot = nfs4_proc_get_root, .getattr = nfs4_proc_getattr, .setattr = nfs4_proc_setattr, - .lookupfh = nfs4_proc_lookupfh, .lookup = nfs4_proc_lookup, .access = nfs4_proc_access, .readlink = nfs4_proc_readlink, diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index e550e8836c3..ee73d9a4f70 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1168,23 +1168,17 @@ EXPORT_SYMBOL_GPL(pnfs_generic_pg_test); /* * Called by non rpc-based layout drivers */ -int -pnfs_ld_write_done(struct nfs_write_data *data) +void pnfs_ld_write_done(struct nfs_write_data *data) { - int status; - - if (!data->pnfs_error) { + if (likely(!data->pnfs_error)) { pnfs_set_layoutcommit(data); data->mds_ops->rpc_call_done(&data->task, data); - data->mds_ops->rpc_release(data); - return 0; + } else { + put_lseg(data->lseg); + data->lseg = NULL; + dprintk("pnfs write error = %d\n", data->pnfs_error); } - - dprintk("%s: pnfs_error=%d, retry via MDS\n", __func__, - data->pnfs_error); - status = nfs_initiate_write(data, NFS_CLIENT(data->inode), - data->mds_ops, NFS_FILE_SYNC); - return status ? : -EAGAIN; + data->mds_ops->rpc_release(data); } EXPORT_SYMBOL_GPL(pnfs_ld_write_done); @@ -1268,23 +1262,17 @@ EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages); /* * Called by non rpc-based layout drivers */ -int -pnfs_ld_read_done(struct nfs_read_data *data) +void pnfs_ld_read_done(struct nfs_read_data *data) { - int status; - - if (!data->pnfs_error) { + if (likely(!data->pnfs_error)) { __nfs4_read_done_cb(data); data->mds_ops->rpc_call_done(&data->task, data); - data->mds_ops->rpc_release(data); - return 0; + } else { + put_lseg(data->lseg); + data->lseg = NULL; + dprintk("pnfs write error = %d\n", data->pnfs_error); } - - dprintk("%s: pnfs_error=%d, retry via MDS\n", __func__, - data->pnfs_error); - status = nfs_initiate_read(data, NFS_CLIENT(data->inode), - data->mds_ops); - return status ? : -EAGAIN; + data->mds_ops->rpc_release(data); } EXPORT_SYMBOL_GPL(pnfs_ld_read_done); @@ -1381,6 +1369,18 @@ static void pnfs_list_write_lseg(struct inode *inode, struct list_head *listp) } } +void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg) +{ + if (lseg->pls_range.iomode == IOMODE_RW) { + dprintk("%s Setting layout IOMODE_RW fail bit\n", __func__); + set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags); + } else { + dprintk("%s Setting layout IOMODE_READ fail bit\n", __func__); + set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags); + } +} +EXPORT_SYMBOL_GPL(pnfs_set_lo_fail); + void pnfs_set_layoutcommit(struct nfs_write_data *wdata) { diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 01cbfd54f3c..1509530cb11 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -178,6 +178,7 @@ int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc); void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *, struct nfs_page *); int pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc); bool pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req); +void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg); int pnfs_layout_process(struct nfs4_layoutget *lgp); void pnfs_free_lseg_list(struct list_head *tmp_list); void pnfs_destroy_layout(struct nfs_inode *); @@ -200,8 +201,8 @@ void pnfs_set_layoutcommit(struct nfs_write_data *wdata); void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data); int pnfs_layoutcommit_inode(struct inode *inode, bool sync); int _pnfs_return_layout(struct inode *); -int pnfs_ld_write_done(struct nfs_write_data *); -int pnfs_ld_read_done(struct nfs_read_data *); +void pnfs_ld_write_done(struct nfs_write_data *); +void pnfs_ld_read_done(struct nfs_read_data *); struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, loff_t pos, diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 2171c043ab0..8b48ec63f72 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -35,16 +35,13 @@ static const struct rpc_call_ops nfs_read_partial_ops; static const struct rpc_call_ops nfs_read_full_ops; static struct kmem_cache *nfs_rdata_cachep; -static mempool_t *nfs_rdata_mempool; - -#define MIN_POOL_READ (32) struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount) { - struct nfs_read_data *p = mempool_alloc(nfs_rdata_mempool, GFP_KERNEL); + struct nfs_read_data *p; + p = kmem_cache_zalloc(nfs_rdata_cachep, GFP_KERNEL); if (p) { - memset(p, 0, sizeof(*p)); INIT_LIST_HEAD(&p->pages); p->npages = pagecount; if (pagecount <= ARRAY_SIZE(p->page_array)) @@ -52,7 +49,7 @@ struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount) else { p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_KERNEL); if (!p->pagevec) { - mempool_free(p, nfs_rdata_mempool); + kmem_cache_free(nfs_rdata_cachep, p); p = NULL; } } @@ -64,7 +61,7 @@ void nfs_readdata_free(struct nfs_read_data *p) { if (p && (p->pagevec != &p->page_array[0])) kfree(p->pagevec); - mempool_free(p, nfs_rdata_mempool); + kmem_cache_free(nfs_rdata_cachep, p); } void nfs_readdata_release(struct nfs_read_data *rdata) @@ -276,7 +273,6 @@ nfs_async_read_error(struct list_head *head) while (!list_empty(head)) { req = nfs_list_entry(head->next); nfs_list_remove_request(req); - SetPageError(req->wb_page); nfs_readpage_release(req); } } @@ -322,7 +318,6 @@ static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc, struct list_head offset += len; } while(nbytes != 0); atomic_set(&req->wb_complete, requests); - ClearPageError(page); desc->pg_rpc_callops = &nfs_read_partial_ops; return ret; out_bad: @@ -331,7 +326,6 @@ out_bad: list_del(&data->list); nfs_readdata_free(data); } - SetPageError(page); nfs_readpage_release(req); return -ENOMEM; } @@ -357,7 +351,6 @@ static int nfs_pagein_one(struct nfs_pageio_descriptor *desc, struct list_head * req = nfs_list_entry(head->next); nfs_list_remove_request(req); nfs_list_add_request(req, &data->pages); - ClearPageError(req->wb_page); *pages++ = req->wb_page; } req = nfs_list_entry(data->pages.next); @@ -435,7 +428,7 @@ static void nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data argp->offset += resp->count; argp->pgbase += resp->count; argp->count -= resp->count; - nfs_restart_rpc(task, NFS_SERVER(data->inode)->nfs_client); + rpc_restart_call_prepare(task); } /* @@ -462,10 +455,10 @@ static void nfs_readpage_release_partial(void *calldata) int status = data->task.tk_status; if (status < 0) - SetPageError(page); + set_bit(PG_PARTIAL_READ_FAILED, &req->wb_flags); if (atomic_dec_and_test(&req->wb_complete)) { - if (!PageError(page)) + if (!test_bit(PG_PARTIAL_READ_FAILED, &req->wb_flags)) SetPageUptodate(page); nfs_readpage_release(req); } @@ -541,13 +534,23 @@ static void nfs_readpage_result_full(struct rpc_task *task, void *calldata) static void nfs_readpage_release_full(void *calldata) { struct nfs_read_data *data = calldata; + struct nfs_pageio_descriptor pgio; + if (data->pnfs_error) { + nfs_pageio_init_read_mds(&pgio, data->inode); + pgio.pg_recoalesce = 1; + } while (!list_empty(&data->pages)) { struct nfs_page *req = nfs_list_entry(data->pages.next); nfs_list_remove_request(req); - nfs_readpage_release(req); + if (!data->pnfs_error) + nfs_readpage_release(req); + else + nfs_pageio_add_request(&pgio, req); } + if (data->pnfs_error) + nfs_pageio_complete(&pgio); nfs_readdata_release(calldata); } @@ -648,7 +651,6 @@ readpage_async_filler(void *data, struct page *page) return 0; out_error: error = PTR_ERR(new); - SetPageError(page); out_unlock: unlock_page(page); return error; @@ -711,16 +713,10 @@ int __init nfs_init_readpagecache(void) if (nfs_rdata_cachep == NULL) return -ENOMEM; - nfs_rdata_mempool = mempool_create_slab_pool(MIN_POOL_READ, - nfs_rdata_cachep); - if (nfs_rdata_mempool == NULL) - return -ENOMEM; - return 0; } void nfs_destroy_readpagecache(void) { - mempool_destroy(nfs_rdata_mempool); kmem_cache_destroy(nfs_rdata_cachep); } diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 5b19b6aabe1..480b3b6bf71 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -733,18 +733,22 @@ static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt) return 0; } + +#ifdef CONFIG_NFS_V4 #ifdef CONFIG_NFS_V4_1 -void show_sessions(struct seq_file *m, struct nfs_server *server) +static void show_sessions(struct seq_file *m, struct nfs_server *server) { if (nfs4_has_session(server->nfs_client)) seq_printf(m, ",sessions"); } #else -void show_sessions(struct seq_file *m, struct nfs_server *server) {} +static void show_sessions(struct seq_file *m, struct nfs_server *server) {} +#endif #endif +#ifdef CONFIG_NFS_V4 #ifdef CONFIG_NFS_V4_1 -void show_pnfs(struct seq_file *m, struct nfs_server *server) +static void show_pnfs(struct seq_file *m, struct nfs_server *server) { seq_printf(m, ",pnfs="); if (server->pnfs_curr_ld) @@ -752,9 +756,10 @@ void show_pnfs(struct seq_file *m, struct nfs_server *server) else seq_printf(m, "not configured"); } -#else /* CONFIG_NFS_V4_1 */ -void show_pnfs(struct seq_file *m, struct nfs_server *server) {} -#endif /* CONFIG_NFS_V4_1 */ +#else +static void show_pnfs(struct seq_file *m, struct nfs_server *server) {} +#endif +#endif static int nfs_show_devname(struct seq_file *m, struct vfsmount *mnt) { diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index b2fbbde58e4..4f9319a2e56 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -87,7 +87,7 @@ static void nfs_async_unlink_done(struct rpc_task *task, void *calldata) struct inode *dir = data->dir; if (!NFS_PROTO(dir)->unlink_done(task, dir)) - nfs_restart_rpc(task, NFS_SERVER(dir)->nfs_client); + rpc_restart_call_prepare(task); } /** @@ -369,7 +369,7 @@ static void nfs_async_rename_done(struct rpc_task *task, void *calldata) struct dentry *new_dentry = data->new_dentry; if (!NFS_PROTO(old_dir)->rename_done(task, old_dir, new_dir)) { - nfs_restart_rpc(task, NFS_SERVER(old_dir)->nfs_client); + rpc_restart_call_prepare(task); return; } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index c9bd2a6b7d4..2219c88d96b 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -390,7 +390,7 @@ static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req) error = radix_tree_insert(&nfsi->nfs_page_tree, req->wb_index, req); BUG_ON(error); if (!nfsi->npages && nfs_have_delegation(inode, FMODE_WRITE)) - nfsi->change_attr++; + inode->i_version++; set_bit(PG_MAPPED, &req->wb_flags); SetPagePrivate(req->wb_page); set_page_private(req->wb_page, (unsigned long)req); @@ -428,7 +428,6 @@ static void nfs_mark_request_dirty(struct nfs_page *req) { __set_page_dirty_nobuffers(req->wb_page); - __mark_inode_dirty(req->wb_page->mapping->host, I_DIRTY_DATASYNC); } #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) @@ -762,6 +761,8 @@ int nfs_updatepage(struct file *file, struct page *page, status = nfs_writepage_setup(ctx, page, offset, count); if (status < 0) nfs_set_pageerror(page); + else + __set_page_dirty_nobuffers(page); dprintk("NFS: nfs_updatepage returns %d (isize %lld)\n", status, (long long)i_size_read(inode)); @@ -1010,7 +1011,6 @@ static int nfs_flush_one(struct nfs_pageio_descriptor *desc, struct list_head *r req = nfs_list_entry(head->next); nfs_list_remove_request(req); nfs_list_add_request(req, &data->pages); - ClearPageError(req->wb_page); *pages++ = req->wb_page; } req = nfs_list_entry(data->pages.next); @@ -1165,7 +1165,13 @@ static void nfs_writeback_done_full(struct rpc_task *task, void *calldata) static void nfs_writeback_release_full(void *calldata) { struct nfs_write_data *data = calldata; - int status = data->task.tk_status; + int ret, status = data->task.tk_status; + struct nfs_pageio_descriptor pgio; + + if (data->pnfs_error) { + nfs_pageio_init_write_mds(&pgio, data->inode, FLUSH_STABLE); + pgio.pg_recoalesce = 1; + } /* Update attributes as result of writeback. */ while (!list_empty(&data->pages)) { @@ -1181,6 +1187,11 @@ static void nfs_writeback_release_full(void *calldata) req->wb_bytes, (long long)req_offset(req)); + if (data->pnfs_error) { + dprintk(", pnfs error = %d\n", data->pnfs_error); + goto next; + } + if (status < 0) { nfs_set_pageerror(page); nfs_context_set_write_error(req->wb_context, status); @@ -1200,7 +1211,19 @@ remove_request: next: nfs_clear_page_tag_locked(req); nfs_end_page_writeback(page); + if (data->pnfs_error) { + lock_page(page); + nfs_pageio_cond_complete(&pgio, page->index); + ret = nfs_page_async_flush(&pgio, page, 0); + if (ret) { + nfs_set_pageerror(page); + dprintk("rewrite to MDS error = %d\n", ret); + } + unlock_page(page); + } } + if (data->pnfs_error) + nfs_pageio_complete(&pgio); nfs_writedata_release(calldata); } @@ -1281,7 +1304,7 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) */ argp->stable = NFS_FILE_SYNC; } - nfs_restart_rpc(task, server->nfs_client); + rpc_restart_call_prepare(task); return; } if (time_before(complain, jiffies)) { @@ -1553,6 +1576,10 @@ static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_contr int flags = FLUSH_SYNC; int ret = 0; + /* no commits means nothing needs to be done */ + if (!nfsi->ncommit) + return ret; + if (wbc->sync_mode == WB_SYNC_NONE) { /* Don't commit yet if this is a non-blocking flush and there * are a lot of outstanding writes for this mapping. @@ -1686,34 +1713,20 @@ out_error: int nfs_migrate_page(struct address_space *mapping, struct page *newpage, struct page *page) { - struct nfs_page *req; - int ret; + /* + * If PagePrivate is set, then the page is currently associated with + * an in-progress read or write request. Don't try to migrate it. + * + * FIXME: we could do this in principle, but we'll need a way to ensure + * that we can safely release the inode reference while holding + * the page lock. + */ + if (PagePrivate(page)) + return -EBUSY; nfs_fscache_release_page(page, GFP_KERNEL); - req = nfs_find_and_lock_request(page, false); - ret = PTR_ERR(req); - if (IS_ERR(req)) - goto out; - - ret = migrate_page(mapping, newpage, page); - if (!req) - goto out; - if (ret) - goto out_unlock; - page_cache_get(newpage); - spin_lock(&mapping->host->i_lock); - req->wb_page = newpage; - SetPagePrivate(newpage); - set_page_private(newpage, (unsigned long)req); - ClearPagePrivate(page); - set_page_private(page, 0); - spin_unlock(&mapping->host->i_lock); - page_cache_release(page); -out_unlock: - nfs_clear_page_tag_locked(req); -out: - return ret; + return migrate_page(mapping, newpage, page); } #endif diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index f4cc1e2bfc5..62f3b9074e8 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -16,7 +16,6 @@ #include <linux/module.h> #include <linux/exportfs.h> -#include <linux/nfsd/syscall.h> #include <net/ipv6.h> #include "nfsd.h" @@ -318,7 +317,6 @@ static void svc_export_put(struct kref *ref) struct svc_export *exp = container_of(ref, struct svc_export, h.ref); path_put(&exp->ex_path); auth_domain_put(exp->ex_client); - kfree(exp->ex_pathname); nfsd4_fslocs_free(&exp->ex_fslocs); kfree(exp); } @@ -528,11 +526,6 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) exp.ex_client = dom; - err = -ENOMEM; - exp.ex_pathname = kstrdup(buf, GFP_KERNEL); - if (!exp.ex_pathname) - goto out2; - /* expiry */ err = -EINVAL; exp.h.expiry_time = get_expiry(&mesg); @@ -613,8 +606,6 @@ out4: nfsd4_fslocs_free(&exp.ex_fslocs); kfree(exp.ex_uuid); out3: - kfree(exp.ex_pathname); -out2: path_put(&exp.ex_path); out1: auth_domain_put(dom); @@ -678,7 +669,6 @@ static void svc_export_init(struct cache_head *cnew, struct cache_head *citem) new->ex_client = item->ex_client; new->ex_path.dentry = dget(item->ex_path.dentry); new->ex_path.mnt = mntget(item->ex_path.mnt); - new->ex_pathname = NULL; new->ex_fslocs.locations = NULL; new->ex_fslocs.locations_count = 0; new->ex_fslocs.migrated = 0; @@ -696,8 +686,6 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem) new->ex_fsid = item->ex_fsid; new->ex_uuid = item->ex_uuid; item->ex_uuid = NULL; - new->ex_pathname = item->ex_pathname; - item->ex_pathname = NULL; new->ex_fslocs.locations = item->ex_fslocs.locations; item->ex_fslocs.locations = NULL; new->ex_fslocs.locations_count = item->ex_fslocs.locations_count; @@ -1010,7 +998,7 @@ rqst_exp_parent(struct svc_rqst *rqstp, struct path *path) return exp; } -static struct svc_export *find_fsidzero_export(struct svc_rqst *rqstp) +struct svc_export *rqst_find_fsidzero_export(struct svc_rqst *rqstp) { u32 fsidv[2]; @@ -1030,7 +1018,7 @@ exp_pseudoroot(struct svc_rqst *rqstp, struct svc_fh *fhp) struct svc_export *exp; __be32 rv; - exp = find_fsidzero_export(rqstp); + exp = rqst_find_fsidzero_export(rqstp); if (IS_ERR(exp)) return nfserrno(PTR_ERR(exp)); rv = fh_compose(fhp, exp, exp->ex_path.dentry, NULL); diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 02eb4edf0ec..7748d6a18d9 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -39,6 +39,8 @@ #define NFSDDBG_FACILITY NFSDDBG_PROC +static void nfsd4_mark_cb_fault(struct nfs4_client *, int reason); + #define NFSPROC4_CB_NULL 0 #define NFSPROC4_CB_COMPOUND 1 @@ -351,7 +353,7 @@ static void encode_cb_recall4args(struct xdr_stream *xdr, __be32 *p; encode_nfs_cb_opnum4(xdr, OP_CB_RECALL); - encode_stateid4(xdr, &dp->dl_stateid); + encode_stateid4(xdr, &dp->dl_stid.sc_stateid); p = xdr_reserve_space(xdr, 4); *p++ = xdr_zero; /* truncate */ @@ -460,6 +462,8 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr, */ status = 0; out: + if (status) + nfsd4_mark_cb_fault(cb->cb_clp, status); return status; out_overflow: print_overflow_msg(__func__, xdr); @@ -686,6 +690,12 @@ static void nfsd4_mark_cb_down(struct nfs4_client *clp, int reason) warn_no_callback_path(clp, reason); } +static void nfsd4_mark_cb_fault(struct nfs4_client *clp, int reason) +{ + clp->cl_cb_state = NFSD4_CB_FAULT; + warn_no_callback_path(clp, reason); +} + static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata) { struct nfs4_client *clp = container_of(calldata, struct nfs4_client, cl_cb_null); @@ -787,7 +797,7 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata) { struct nfsd4_callback *cb = calldata; struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall); - struct nfs4_client *clp = dp->dl_client; + struct nfs4_client *clp = dp->dl_stid.sc_client; u32 minorversion = clp->cl_minorversion; cb->cb_minorversion = minorversion; @@ -809,7 +819,7 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata) { struct nfsd4_callback *cb = calldata; struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall); - struct nfs4_client *clp = dp->dl_client; + struct nfs4_client *clp = dp->dl_stid.sc_client; dprintk("%s: minorversion=%d\n", __func__, clp->cl_minorversion); @@ -832,7 +842,7 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata) { struct nfsd4_callback *cb = calldata; struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall); - struct nfs4_client *clp = dp->dl_client; + struct nfs4_client *clp = dp->dl_stid.sc_client; struct rpc_clnt *current_rpc_client = clp->cl_cb_client; nfsd4_cb_done(task, calldata); @@ -1006,7 +1016,7 @@ void nfsd4_do_callback_rpc(struct work_struct *w) void nfsd4_cb_recall(struct nfs4_delegation *dp) { struct nfsd4_callback *cb = &dp->dl_recall; - struct nfs4_client *clp = dp->dl_client; + struct nfs4_client *clp = dp->dl_stid.sc_client; dp->dl_retries = 1; cb->cb_op = dp; diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index e8077766661..fa383361bc6 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -35,6 +35,7 @@ #include <linux/file.h> #include <linux/slab.h> +#include "idmap.h" #include "cache.h" #include "xdr4.h" #include "vfs.h" @@ -156,6 +157,8 @@ do_open_permission(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfs !(open->op_share_access & NFS4_SHARE_ACCESS_WRITE)) return nfserr_inval; + accmode |= NFSD_MAY_READ_IF_EXEC; + if (open->op_share_access & NFS4_SHARE_ACCESS_READ) accmode |= NFSD_MAY_READ; if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) @@ -168,12 +171,29 @@ do_open_permission(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfs return status; } +static __be32 nfsd_check_obj_isreg(struct svc_fh *fh) +{ + umode_t mode = fh->fh_dentry->d_inode->i_mode; + + if (S_ISREG(mode)) + return nfs_ok; + if (S_ISDIR(mode)) + return nfserr_isdir; + /* + * Using err_symlink as our catch-all case may look odd; but + * there's no other obvious error for this case in 4.0, and we + * happen to know that it will cause the linux v4 client to do + * the right thing on attempts to open something other than a + * regular file. + */ + return nfserr_symlink; +} + static __be32 do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open) { struct svc_fh resfh; __be32 status; - int created = 0; fh_init(&resfh, NFS4_FHSIZE); open->op_truncate = 0; @@ -202,7 +222,7 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o open->op_fname.len, &open->op_iattr, &resfh, open->op_createmode, (u32 *)open->op_verf.data, - &open->op_truncate, &created); + &open->op_truncate, &open->op_created); /* * Following rfc 3530 14.2.16, use the returned bitmask @@ -216,6 +236,9 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o status = nfsd_lookup(rqstp, current_fh, open->op_fname.data, open->op_fname.len, &resfh); fh_unlock(current_fh); + if (status) + goto out; + status = nfsd_check_obj_isreg(&resfh); } if (status) goto out; @@ -227,9 +250,9 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o fh_dup2(current_fh, &resfh); /* set reply cache */ - fh_copy_shallow(&open->op_stateowner->so_replay.rp_openfh, + fh_copy_shallow(&open->op_openowner->oo_owner.so_replay.rp_openfh, &resfh.fh_handle); - if (!created) + if (!open->op_created) status = do_open_permission(rqstp, current_fh, open, NFSD_MAY_NOP); @@ -254,7 +277,7 @@ do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_ memset(&open->op_cinfo, 0, sizeof(struct nfsd4_change_info)); /* set replay cache */ - fh_copy_shallow(&open->op_stateowner->so_replay.rp_openfh, + fh_copy_shallow(&open->op_openowner->oo_owner.so_replay.rp_openfh, ¤t_fh->fh_handle); open->op_truncate = (open->op_iattr.ia_valid & ATTR_SIZE) && @@ -283,14 +306,18 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, __be32 status; struct nfsd4_compoundres *resp; - dprintk("NFSD: nfsd4_open filename %.*s op_stateowner %p\n", + dprintk("NFSD: nfsd4_open filename %.*s op_openowner %p\n", (int)open->op_fname.len, open->op_fname.data, - open->op_stateowner); + open->op_openowner); /* This check required by spec. */ if (open->op_create && open->op_claim_type != NFS4_OPEN_CLAIM_NULL) return nfserr_inval; + /* We don't yet support WANT bits: */ + open->op_share_access &= NFS4_SHARE_ACCESS_MASK; + + open->op_created = 0; /* * RFC5661 18.51.3 * Before RECLAIM_COMPLETE done, server should deny new lock @@ -309,7 +336,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, resp = rqstp->rq_resp; status = nfsd4_process_open1(&resp->cstate, open); if (status == nfserr_replay_me) { - struct nfs4_replay *rp = &open->op_stateowner->so_replay; + struct nfs4_replay *rp = &open->op_openowner->oo_owner.so_replay; fh_put(&cstate->current_fh); fh_copy_shallow(&cstate->current_fh.fh_handle, &rp->rp_openfh); @@ -339,32 +366,23 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, switch (open->op_claim_type) { case NFS4_OPEN_CLAIM_DELEGATE_CUR: case NFS4_OPEN_CLAIM_NULL: - /* - * (1) set CURRENT_FH to the file being opened, - * creating it if necessary, (2) set open->op_cinfo, - * (3) set open->op_truncate if the file is to be - * truncated after opening, (4) do permission checking. - */ status = do_open_lookup(rqstp, &cstate->current_fh, open); if (status) goto out; break; case NFS4_OPEN_CLAIM_PREVIOUS: - open->op_stateowner->so_confirmed = 1; - /* - * The CURRENT_FH is already set to the file being - * opened. (1) set open->op_cinfo, (2) set - * open->op_truncate if the file is to be truncated - * after opening, (3) do permission checking. - */ + open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; + case NFS4_OPEN_CLAIM_FH: + case NFS4_OPEN_CLAIM_DELEG_CUR_FH: status = do_open_fhandle(rqstp, &cstate->current_fh, open); if (status) goto out; break; + case NFS4_OPEN_CLAIM_DELEG_PREV_FH: case NFS4_OPEN_CLAIM_DELEGATE_PREV: - open->op_stateowner->so_confirmed = 1; + open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; dprintk("NFSD: unsupported OPEN claim type %d\n", open->op_claim_type); status = nfserr_notsupp; @@ -381,12 +399,13 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * set, (2) sets open->op_stateid, (3) sets open->op_delegation. */ status = nfsd4_process_open2(rqstp, &cstate->current_fh, open); + WARN_ON(status && open->op_created); out: - if (open->op_stateowner) { - nfs4_get_stateowner(open->op_stateowner); - cstate->replay_owner = open->op_stateowner; - } - nfs4_unlock_state(); + nfsd4_cleanup_open_state(open, status); + if (open->op_openowner) + cstate->replay_owner = &open->op_openowner->oo_owner; + else + nfs4_unlock_state(); return status; } @@ -467,17 +486,12 @@ static __be32 nfsd4_commit(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_commit *commit) { - __be32 status; - u32 *p = (u32 *)commit->co_verf.data; *p++ = nfssvc_boot.tv_sec; *p++ = nfssvc_boot.tv_usec; - status = nfsd_commit(rqstp, &cstate->current_fh, commit->co_offset, + return nfsd_commit(rqstp, &cstate->current_fh, commit->co_offset, commit->co_count); - if (status == nfserr_symlink) - status = nfserr_inval; - return status; } static __be32 @@ -492,8 +506,6 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = fh_verify(rqstp, &cstate->current_fh, S_IFDIR, NFSD_MAY_CREATE); - if (status == nfserr_symlink) - status = nfserr_notdir; if (status) return status; @@ -691,7 +703,7 @@ nfsd4_readdir(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, readdir->rd_bmval[1] &= nfsd_suppattrs1(cstate->minorversion); readdir->rd_bmval[2] &= nfsd_suppattrs2(cstate->minorversion); - if ((cookie > ~(u32)0) || (cookie == 1) || (cookie == 2) || + if ((cookie == 1) || (cookie == 2) || (cookie == 0 && memcmp(readdir->rd_verf.data, zeroverf.data, NFS4_VERIFIER_SIZE))) return nfserr_bad_cookie; @@ -719,8 +731,6 @@ nfsd4_remove(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return nfserr_grace; status = nfsd_unlink(rqstp, &cstate->current_fh, 0, remove->rm_name, remove->rm_namelen); - if (status == nfserr_symlink) - return nfserr_notdir; if (!status) { fh_unlock(&cstate->current_fh); set_change_info(&remove->rm_cinfo, &cstate->current_fh); @@ -751,8 +761,6 @@ nfsd4_rename(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, (S_ISDIR(cstate->save_fh.fh_dentry->d_inode->i_mode) && S_ISDIR(cstate->current_fh.fh_dentry->d_inode->i_mode))) status = nfserr_exist; - else if (status == nfserr_symlink) - status = nfserr_notdir; if (!status) { set_change_info(&rename->rn_sinfo, &cstate->current_fh); @@ -892,8 +900,6 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, write->wr_bytes_written = cnt; - if (status == nfserr_symlink) - status = nfserr_inval; return status; } @@ -930,7 +936,7 @@ _nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, count = 4 + (verify->ve_attrlen >> 2); buf = kmalloc(count << 2, GFP_KERNEL); if (!buf) - return nfserr_resource; + return nfserr_jukebox; status = nfsd4_encode_fattr(&cstate->current_fh, cstate->current_fh.fh_export, @@ -994,6 +1000,8 @@ static inline void nfsd4_increment_op_stats(u32 opnum) typedef __be32(*nfsd4op_func)(struct svc_rqst *, struct nfsd4_compound_state *, void *); +typedef u32(*nfsd4op_rsize)(struct svc_rqst *, struct nfsd4_op *op); + enum nfsd4_op_flags { ALLOWED_WITHOUT_FH = 1 << 0, /* No current filehandle required */ ALLOWED_ON_ABSENT_FS = 1 << 1, /* ops processed on absent fs */ @@ -1001,13 +1009,15 @@ enum nfsd4_op_flags { /* For rfc 5661 section 2.6.3.1.1: */ OP_HANDLES_WRONGSEC = 1 << 3, OP_IS_PUTFH_LIKE = 1 << 4, -}; - -struct nfsd4_operation { - nfsd4op_func op_func; - u32 op_flags; - char *op_name; /* + * These are the ops whose result size we estimate before + * encoding, to avoid performing an op then not being able to + * respond or cache a response. This includes writes and setattrs + * as well as the operations usually called "nonidempotent": + */ + OP_MODIFIES_SOMETHING = 1 << 5, + /* + * Cache compounds containing these ops in the xid-based drc: * We use the DRC for compounds containing non-idempotent * operations, *except* those that are 4.1-specific (since * sessions provide their own EOS), and except for stateful @@ -1015,7 +1025,15 @@ struct nfsd4_operation { * (since sequence numbers provide EOS for open, lock, etc in * the v4.0 case). */ - bool op_cacheresult; + OP_CACHEME = 1 << 6, +}; + +struct nfsd4_operation { + nfsd4op_func op_func; + u32 op_flags; + char *op_name; + /* Try to get response size before operation */ + nfsd4op_rsize op_rsize_bop; }; static struct nfsd4_operation nfsd4_ops[]; @@ -1062,7 +1080,7 @@ static inline struct nfsd4_operation *OPDESC(struct nfsd4_op *op) bool nfsd4_cache_this_op(struct nfsd4_op *op) { - return OPDESC(op)->op_cacheresult; + return OPDESC(op)->op_flags & OP_CACHEME; } static bool need_wrongsec_check(struct svc_rqst *rqstp) @@ -1110,6 +1128,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, struct nfsd4_operation *opdesc; struct nfsd4_compound_state *cstate = &resp->cstate; int slack_bytes; + u32 plen = 0; __be32 status; resp->xbuf = &rqstp->rq_res; @@ -1188,6 +1207,15 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, goto encode_op; } + /* If op is non-idempotent */ + if (opdesc->op_flags & OP_MODIFIES_SOMETHING) { + plen = opdesc->op_rsize_bop(rqstp, op); + op->status = nfsd4_check_resp_size(resp, plen); + } + + if (op->status) + goto encode_op; + if (opdesc->op_func) op->status = opdesc->op_func(rqstp, cstate, &op->u); else @@ -1217,7 +1245,7 @@ encode_op: be32_to_cpu(status)); if (cstate->replay_owner) { - nfs4_put_stateowner(cstate->replay_owner); + nfs4_unlock_state(); cstate->replay_owner = NULL; } /* XXX Ugh, we need to get rid of this kind of special case: */ @@ -1238,6 +1266,144 @@ out: return status; } +#define op_encode_hdr_size (2) +#define op_encode_stateid_maxsz (XDR_QUADLEN(NFS4_STATEID_SIZE)) +#define op_encode_verifier_maxsz (XDR_QUADLEN(NFS4_VERIFIER_SIZE)) +#define op_encode_change_info_maxsz (5) +#define nfs4_fattr_bitmap_maxsz (4) + +#define op_encode_lockowner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ)) +#define op_encode_lock_denied_maxsz (8 + op_encode_lockowner_maxsz) + +#define nfs4_owner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ)) + +#define op_encode_ace_maxsz (3 + nfs4_owner_maxsz) +#define op_encode_delegation_maxsz (1 + op_encode_stateid_maxsz + 1 + \ + op_encode_ace_maxsz) + +#define op_encode_channel_attrs_maxsz (6 + 1 + 1) + +static inline u32 nfsd4_only_status_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size) * sizeof(__be32); +} + +static inline u32 nfsd4_status_stateid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + op_encode_stateid_maxsz)* sizeof(__be32); +} + +static inline u32 nfsd4_commit_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + op_encode_verifier_maxsz) * sizeof(__be32); +} + +static inline u32 nfsd4_create_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + op_encode_change_info_maxsz + + nfs4_fattr_bitmap_maxsz) * sizeof(__be32); +} + +static inline u32 nfsd4_link_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + op_encode_change_info_maxsz) + * sizeof(__be32); +} + +static inline u32 nfsd4_lock_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + op_encode_lock_denied_maxsz) + * sizeof(__be32); +} + +static inline u32 nfsd4_open_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + op_encode_stateid_maxsz + + op_encode_change_info_maxsz + 1 + + nfs4_fattr_bitmap_maxsz + + op_encode_delegation_maxsz) * sizeof(__be32); +} + +static inline u32 nfsd4_read_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + u32 maxcount = 0, rlen = 0; + + maxcount = svc_max_payload(rqstp); + rlen = op->u.read.rd_length; + + if (rlen > maxcount) + rlen = maxcount; + + return (op_encode_hdr_size + 2) * sizeof(__be32) + rlen; +} + +static inline u32 nfsd4_readdir_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + u32 rlen = op->u.readdir.rd_maxcount; + + if (rlen > PAGE_SIZE) + rlen = PAGE_SIZE; + + return (op_encode_hdr_size + op_encode_verifier_maxsz) + * sizeof(__be32) + rlen; +} + +static inline u32 nfsd4_remove_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + op_encode_change_info_maxsz) + * sizeof(__be32); +} + +static inline u32 nfsd4_rename_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + op_encode_change_info_maxsz + + op_encode_change_info_maxsz) * sizeof(__be32); +} + +static inline u32 nfsd4_setattr_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + nfs4_fattr_bitmap_maxsz) * sizeof(__be32); +} + +static inline u32 nfsd4_setclientid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + 2 + 1024) * sizeof(__be32); +} + +static inline u32 nfsd4_write_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + op_encode_verifier_maxsz) * sizeof(__be32); +} + +static inline u32 nfsd4_exchange_id_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + 2 + 1 + /* eir_clientid, eir_sequenceid */\ + 1 + 1 + 0 + /* eir_flags, spr_how, SP4_NONE (for now) */\ + 2 + /*eir_server_owner.so_minor_id */\ + /* eir_server_owner.so_major_id<> */\ + XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 +\ + /* eir_server_scope<> */\ + XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 +\ + 1 + /* eir_server_impl_id array length */\ + 0 /* ignored eir_server_impl_id contents */) * sizeof(__be32); +} + +static inline u32 nfsd4_bind_conn_to_session_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + \ + XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + /* bctsr_sessid */\ + 2 /* bctsr_dir, use_conn_in_rdma_mode */) * sizeof(__be32); +} + +static inline u32 nfsd4_create_session_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + \ + XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + /* sessionid */\ + 2 + /* csr_sequence, csr_flags */\ + op_encode_channel_attrs_maxsz + \ + op_encode_channel_attrs_maxsz) * sizeof(__be32); +} + static struct nfsd4_operation nfsd4_ops[] = { [OP_ACCESS] = { .op_func = (nfsd4op_func)nfsd4_access, @@ -1245,20 +1411,27 @@ static struct nfsd4_operation nfsd4_ops[] = { }, [OP_CLOSE] = { .op_func = (nfsd4op_func)nfsd4_close, + .op_flags = OP_MODIFIES_SOMETHING, .op_name = "OP_CLOSE", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_status_stateid_rsize, }, [OP_COMMIT] = { .op_func = (nfsd4op_func)nfsd4_commit, + .op_flags = OP_MODIFIES_SOMETHING, .op_name = "OP_COMMIT", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_commit_rsize, }, [OP_CREATE] = { .op_func = (nfsd4op_func)nfsd4_create, + .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, .op_name = "OP_CREATE", - .op_cacheresult = true, + .op_rsize_bop = (nfsd4op_rsize)nfsd4_create_rsize, }, [OP_DELEGRETURN] = { .op_func = (nfsd4op_func)nfsd4_delegreturn, + .op_flags = OP_MODIFIES_SOMETHING, .op_name = "OP_DELEGRETURN", + .op_rsize_bop = nfsd4_only_status_rsize, }, [OP_GETATTR] = { .op_func = (nfsd4op_func)nfsd4_getattr, @@ -1271,12 +1444,16 @@ static struct nfsd4_operation nfsd4_ops[] = { }, [OP_LINK] = { .op_func = (nfsd4op_func)nfsd4_link, + .op_flags = ALLOWED_ON_ABSENT_FS | OP_MODIFIES_SOMETHING + | OP_CACHEME, .op_name = "OP_LINK", - .op_cacheresult = true, + .op_rsize_bop = (nfsd4op_rsize)nfsd4_link_rsize, }, [OP_LOCK] = { .op_func = (nfsd4op_func)nfsd4_lock, + .op_flags = OP_MODIFIES_SOMETHING, .op_name = "OP_LOCK", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_lock_rsize, }, [OP_LOCKT] = { .op_func = (nfsd4op_func)nfsd4_lockt, @@ -1284,7 +1461,9 @@ static struct nfsd4_operation nfsd4_ops[] = { }, [OP_LOCKU] = { .op_func = (nfsd4op_func)nfsd4_locku, + .op_flags = OP_MODIFIES_SOMETHING, .op_name = "OP_LOCKU", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_status_stateid_rsize, }, [OP_LOOKUP] = { .op_func = (nfsd4op_func)nfsd4_lookup, @@ -1302,42 +1481,54 @@ static struct nfsd4_operation nfsd4_ops[] = { }, [OP_OPEN] = { .op_func = (nfsd4op_func)nfsd4_open, - .op_flags = OP_HANDLES_WRONGSEC, + .op_flags = OP_HANDLES_WRONGSEC | OP_MODIFIES_SOMETHING, .op_name = "OP_OPEN", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_open_rsize, }, [OP_OPEN_CONFIRM] = { .op_func = (nfsd4op_func)nfsd4_open_confirm, + .op_flags = OP_MODIFIES_SOMETHING, .op_name = "OP_OPEN_CONFIRM", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_status_stateid_rsize, }, [OP_OPEN_DOWNGRADE] = { .op_func = (nfsd4op_func)nfsd4_open_downgrade, + .op_flags = OP_MODIFIES_SOMETHING, .op_name = "OP_OPEN_DOWNGRADE", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_status_stateid_rsize, }, [OP_PUTFH] = { .op_func = (nfsd4op_func)nfsd4_putfh, .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS - | OP_IS_PUTFH_LIKE, + | OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING, .op_name = "OP_PUTFH", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, [OP_PUTPUBFH] = { .op_func = (nfsd4op_func)nfsd4_putrootfh, .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS - | OP_IS_PUTFH_LIKE, + | OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING, .op_name = "OP_PUTPUBFH", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, [OP_PUTROOTFH] = { .op_func = (nfsd4op_func)nfsd4_putrootfh, .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS - | OP_IS_PUTFH_LIKE, + | OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING, .op_name = "OP_PUTROOTFH", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, [OP_READ] = { .op_func = (nfsd4op_func)nfsd4_read, + .op_flags = OP_MODIFIES_SOMETHING, .op_name = "OP_READ", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_read_rsize, }, [OP_READDIR] = { .op_func = (nfsd4op_func)nfsd4_readdir, + .op_flags = OP_MODIFIES_SOMETHING, .op_name = "OP_READDIR", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_readdir_rsize, }, [OP_READLINK] = { .op_func = (nfsd4op_func)nfsd4_readlink, @@ -1345,29 +1536,36 @@ static struct nfsd4_operation nfsd4_ops[] = { }, [OP_REMOVE] = { .op_func = (nfsd4op_func)nfsd4_remove, + .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, .op_name = "OP_REMOVE", - .op_cacheresult = true, + .op_rsize_bop = (nfsd4op_rsize)nfsd4_remove_rsize, }, [OP_RENAME] = { - .op_name = "OP_RENAME", .op_func = (nfsd4op_func)nfsd4_rename, - .op_cacheresult = true, + .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, + .op_name = "OP_RENAME", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_rename_rsize, }, [OP_RENEW] = { .op_func = (nfsd4op_func)nfsd4_renew, - .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS + | OP_MODIFIES_SOMETHING, .op_name = "OP_RENEW", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, + }, [OP_RESTOREFH] = { .op_func = (nfsd4op_func)nfsd4_restorefh, .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS - | OP_IS_PUTFH_LIKE, + | OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING, .op_name = "OP_RESTOREFH", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, [OP_SAVEFH] = { .op_func = (nfsd4op_func)nfsd4_savefh, - .op_flags = OP_HANDLES_WRONGSEC, + .op_flags = OP_HANDLES_WRONGSEC | OP_MODIFIES_SOMETHING, .op_name = "OP_SAVEFH", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, [OP_SECINFO] = { .op_func = (nfsd4op_func)nfsd4_secinfo, @@ -1377,19 +1575,22 @@ static struct nfsd4_operation nfsd4_ops[] = { [OP_SETATTR] = { .op_func = (nfsd4op_func)nfsd4_setattr, .op_name = "OP_SETATTR", - .op_cacheresult = true, + .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, + .op_rsize_bop = (nfsd4op_rsize)nfsd4_setattr_rsize, }, [OP_SETCLIENTID] = { .op_func = (nfsd4op_func)nfsd4_setclientid, - .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS + | OP_MODIFIES_SOMETHING | OP_CACHEME, .op_name = "OP_SETCLIENTID", - .op_cacheresult = true, + .op_rsize_bop = (nfsd4op_rsize)nfsd4_setclientid_rsize, }, [OP_SETCLIENTID_CONFIRM] = { .op_func = (nfsd4op_func)nfsd4_setclientid_confirm, - .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS + | OP_MODIFIES_SOMETHING | OP_CACHEME, .op_name = "OP_SETCLIENTID_CONFIRM", - .op_cacheresult = true, + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, [OP_VERIFY] = { .op_func = (nfsd4op_func)nfsd4_verify, @@ -1397,35 +1598,46 @@ static struct nfsd4_operation nfsd4_ops[] = { }, [OP_WRITE] = { .op_func = (nfsd4op_func)nfsd4_write, + .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, .op_name = "OP_WRITE", - .op_cacheresult = true, + .op_rsize_bop = (nfsd4op_rsize)nfsd4_write_rsize, }, [OP_RELEASE_LOCKOWNER] = { .op_func = (nfsd4op_func)nfsd4_release_lockowner, - .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS + | OP_MODIFIES_SOMETHING, .op_name = "OP_RELEASE_LOCKOWNER", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, /* NFSv4.1 operations */ [OP_EXCHANGE_ID] = { .op_func = (nfsd4op_func)nfsd4_exchange_id, - .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP + | OP_MODIFIES_SOMETHING, .op_name = "OP_EXCHANGE_ID", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_exchange_id_rsize, }, [OP_BIND_CONN_TO_SESSION] = { .op_func = (nfsd4op_func)nfsd4_bind_conn_to_session, - .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP + | OP_MODIFIES_SOMETHING, .op_name = "OP_BIND_CONN_TO_SESSION", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_bind_conn_to_session_rsize, }, [OP_CREATE_SESSION] = { .op_func = (nfsd4op_func)nfsd4_create_session, - .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP + | OP_MODIFIES_SOMETHING, .op_name = "OP_CREATE_SESSION", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_create_session_rsize, }, [OP_DESTROY_SESSION] = { .op_func = (nfsd4op_func)nfsd4_destroy_session, - .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP + | OP_MODIFIES_SOMETHING, .op_name = "OP_DESTROY_SESSION", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, [OP_SEQUENCE] = { .op_func = (nfsd4op_func)nfsd4_sequence, @@ -1433,14 +1645,17 @@ static struct nfsd4_operation nfsd4_ops[] = { .op_name = "OP_SEQUENCE", }, [OP_DESTROY_CLIENTID] = { - .op_func = NULL, - .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, + .op_func = (nfsd4op_func)nfsd4_destroy_clientid, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP + | OP_MODIFIES_SOMETHING, .op_name = "OP_DESTROY_CLIENTID", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, [OP_RECLAIM_COMPLETE] = { .op_func = (nfsd4op_func)nfsd4_reclaim_complete, - .op_flags = ALLOWED_WITHOUT_FH, + .op_flags = ALLOWED_WITHOUT_FH | OP_MODIFIES_SOMETHING, .op_name = "OP_RECLAIM_COMPLETE", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, [OP_SECINFO_NO_NAME] = { .op_func = (nfsd4op_func)nfsd4_secinfo_no_name, @@ -1454,8 +1669,9 @@ static struct nfsd4_operation nfsd4_ops[] = { }, [OP_FREE_STATEID] = { .op_func = (nfsd4op_func)nfsd4_free_stateid, - .op_flags = ALLOWED_WITHOUT_FH, + .op_flags = ALLOWED_WITHOUT_FH | OP_MODIFIES_SOMETHING, .op_name = "OP_FREE_STATEID", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, }; diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 29d77f60585..ed083b9a731 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -45,6 +45,7 @@ /* Globals */ static struct file *rec_file; +static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery"; static int nfs4_save_creds(const struct cred **original_creds) @@ -88,7 +89,7 @@ nfs4_make_rec_clidname(char *dname, struct xdr_netobj *clname) struct xdr_netobj cksum; struct hash_desc desc; struct scatterlist sg; - __be32 status = nfserr_resource; + __be32 status = nfserr_jukebox; dprintk("NFSD: nfs4_make_rec_clidname for %.*s\n", clname->len, clname->data); @@ -129,6 +130,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) if (!rec_file || clp->cl_firststate) return 0; + clp->cl_firststate = 1; status = nfs4_save_creds(&original_cred); if (status < 0) return status; @@ -143,10 +145,8 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) goto out_unlock; } status = -EEXIST; - if (dentry->d_inode) { - dprintk("NFSD: nfsd4_create_clid_dir: DIRECTORY EXISTS\n"); + if (dentry->d_inode) goto out_put; - } status = mnt_want_write(rec_file->f_path.mnt); if (status) goto out_put; @@ -156,12 +156,14 @@ out_put: dput(dentry); out_unlock: mutex_unlock(&dir->d_inode->i_mutex); - if (status == 0) { - clp->cl_firststate = 1; + if (status == 0) vfs_fsync(rec_file, 0); - } + else + printk(KERN_ERR "NFSD: failed to write recovery record" + " (err %d); please check that %s exists" + " and is writeable", status, + user_recovery_dirname); nfs4_reset_creds(original_cred); - dprintk("NFSD: nfsd4_create_clid_dir returns %d\n", status); return status; } @@ -354,13 +356,13 @@ nfsd4_recdir_load(void) { */ void -nfsd4_init_recdir(char *rec_dirname) +nfsd4_init_recdir() { const struct cred *original_cred; int status; printk("NFSD: Using %s as the NFSv4 state recovery directory\n", - rec_dirname); + user_recovery_dirname); BUG_ON(rec_file); @@ -372,10 +374,10 @@ nfsd4_init_recdir(char *rec_dirname) return; } - rec_file = filp_open(rec_dirname, O_RDONLY | O_DIRECTORY, 0); + rec_file = filp_open(user_recovery_dirname, O_RDONLY | O_DIRECTORY, 0); if (IS_ERR(rec_file)) { printk("NFSD: unable to find recovery directory %s\n", - rec_dirname); + user_recovery_dirname); rec_file = NULL; } @@ -390,3 +392,30 @@ nfsd4_shutdown_recdir(void) fput(rec_file); rec_file = NULL; } + +/* + * Change the NFSv4 recovery directory to recdir. + */ +int +nfs4_reset_recoverydir(char *recdir) +{ + int status; + struct path path; + + status = kern_path(recdir, LOOKUP_FOLLOW, &path); + if (status) + return status; + status = -ENOTDIR; + if (S_ISDIR(path.dentry->d_inode->i_mode)) { + strcpy(user_recovery_dirname, recdir); + status = 0; + } + path_put(&path); + return status; +} + +char * +nfs4_recoverydir(void) +{ + return user_recovery_dirname; +} diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 3787ec11740..47e94e33a97 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -49,9 +49,6 @@ time_t nfsd4_lease = 90; /* default lease time */ time_t nfsd4_grace = 90; static time_t boot_time; -static u32 current_ownerid = 1; -static u32 current_fileid = 1; -static u32 current_delegid = 1; static stateid_t zerostateid; /* bits all 0 */ static stateid_t onestateid; /* bits all 1 */ static u64 current_sessionid = 1; @@ -60,13 +57,7 @@ static u64 current_sessionid = 1; #define ONE_STATEID(stateid) (!memcmp((stateid), &onestateid, sizeof(stateid_t))) /* forward declarations */ -static struct nfs4_stateid * find_stateid(stateid_t *stid, int flags); -static struct nfs4_stateid * search_for_stateid(stateid_t *stid); -static struct nfs4_delegation * search_for_delegation(stateid_t *stid); -static struct nfs4_delegation * find_delegation_stateid(struct inode *ino, stateid_t *stid); -static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery"; -static void nfs4_set_recdir(char *recdir); -static int check_for_locks(struct nfs4_file *filp, struct nfs4_stateowner *lowner); +static int check_for_locks(struct nfs4_file *filp, struct nfs4_lockowner *lowner); /* Locking: */ @@ -80,7 +71,8 @@ static DEFINE_MUTEX(client_mutex); */ static DEFINE_SPINLOCK(recall_lock); -static struct kmem_cache *stateowner_slab = NULL; +static struct kmem_cache *openowner_slab = NULL; +static struct kmem_cache *lockowner_slab = NULL; static struct kmem_cache *file_slab = NULL; static struct kmem_cache *stateid_slab = NULL; static struct kmem_cache *deleg_slab = NULL; @@ -112,6 +104,11 @@ opaque_hashval(const void *ptr, int nbytes) static struct list_head del_recall_lru; +static void nfsd4_free_file(struct nfs4_file *f) +{ + kmem_cache_free(file_slab, f); +} + static inline void put_nfs4_file(struct nfs4_file *fi) { @@ -119,7 +116,7 @@ put_nfs4_file(struct nfs4_file *fi) list_del(&fi->fi_hash); spin_unlock(&recall_lock); iput(fi->fi_inode); - kmem_cache_free(file_slab, fi); + nfsd4_free_file(fi); } } @@ -136,35 +133,33 @@ unsigned int max_delegations; * Open owner state (share locks) */ -/* hash tables for nfs4_stateowner */ -#define OWNER_HASH_BITS 8 -#define OWNER_HASH_SIZE (1 << OWNER_HASH_BITS) -#define OWNER_HASH_MASK (OWNER_HASH_SIZE - 1) +/* hash tables for open owners */ +#define OPEN_OWNER_HASH_BITS 8 +#define OPEN_OWNER_HASH_SIZE (1 << OPEN_OWNER_HASH_BITS) +#define OPEN_OWNER_HASH_MASK (OPEN_OWNER_HASH_SIZE - 1) -#define ownerid_hashval(id) \ - ((id) & OWNER_HASH_MASK) -#define ownerstr_hashval(clientid, ownername) \ - (((clientid) + opaque_hashval((ownername.data), (ownername.len))) & OWNER_HASH_MASK) +static unsigned int open_ownerstr_hashval(u32 clientid, struct xdr_netobj *ownername) +{ + unsigned int ret; -static struct list_head ownerid_hashtbl[OWNER_HASH_SIZE]; -static struct list_head ownerstr_hashtbl[OWNER_HASH_SIZE]; + ret = opaque_hashval(ownername->data, ownername->len); + ret += clientid; + return ret & OPEN_OWNER_HASH_MASK; +} + +static struct list_head open_ownerstr_hashtbl[OPEN_OWNER_HASH_SIZE]; /* hash table for nfs4_file */ #define FILE_HASH_BITS 8 #define FILE_HASH_SIZE (1 << FILE_HASH_BITS) -/* hash table for (open)nfs4_stateid */ -#define STATEID_HASH_BITS 10 -#define STATEID_HASH_SIZE (1 << STATEID_HASH_BITS) -#define STATEID_HASH_MASK (STATEID_HASH_SIZE - 1) - -#define file_hashval(x) \ - hash_ptr(x, FILE_HASH_BITS) -#define stateid_hashval(owner_id, file_id) \ - (((owner_id) + (file_id)) & STATEID_HASH_MASK) +static unsigned int file_hashval(struct inode *ino) +{ + /* XXX: why are we hashing on inode pointer, anyway? */ + return hash_ptr(ino, FILE_HASH_BITS); +} static struct list_head file_hashtbl[FILE_HASH_SIZE]; -static struct list_head stateid_hashtbl[STATEID_HASH_SIZE]; static void __nfs4_file_get_access(struct nfs4_file *fp, int oflag) { @@ -192,8 +187,15 @@ static void nfs4_file_put_fd(struct nfs4_file *fp, int oflag) static void __nfs4_file_put_access(struct nfs4_file *fp, int oflag) { if (atomic_dec_and_test(&fp->fi_access[oflag])) { - nfs4_file_put_fd(fp, O_RDWR); nfs4_file_put_fd(fp, oflag); + /* + * It's also safe to get rid of the RDWR open *if* + * we no longer have need of the other kind of access + * or if we already have the other kind of open: + */ + if (fp->fi_fds[1-oflag] + || atomic_read(&fp->fi_access[1 - oflag]) == 0) + nfs4_file_put_fd(fp, O_RDWR); } } @@ -206,8 +208,73 @@ static void nfs4_file_put_access(struct nfs4_file *fp, int oflag) __nfs4_file_put_access(fp, oflag); } +static inline int get_new_stid(struct nfs4_stid *stid) +{ + static int min_stateid = 0; + struct idr *stateids = &stid->sc_client->cl_stateids; + int new_stid; + int error; + + error = idr_get_new_above(stateids, stid, min_stateid, &new_stid); + /* + * Note: the necessary preallocation was done in + * nfs4_alloc_stateid(). The idr code caps the number of + * preallocations that can exist at a time, but the state lock + * prevents anyone from using ours before we get here: + */ + BUG_ON(error); + /* + * It shouldn't be a problem to reuse an opaque stateid value. + * I don't think it is for 4.1. But with 4.0 I worry that, for + * example, a stray write retransmission could be accepted by + * the server when it should have been rejected. Therefore, + * adopt a trick from the sctp code to attempt to maximize the + * amount of time until an id is reused, by ensuring they always + * "increase" (mod INT_MAX): + */ + + min_stateid = new_stid+1; + if (min_stateid == INT_MAX) + min_stateid = 0; + return new_stid; +} + +static void init_stid(struct nfs4_stid *stid, struct nfs4_client *cl, unsigned char type) +{ + stateid_t *s = &stid->sc_stateid; + int new_id; + + stid->sc_type = type; + stid->sc_client = cl; + s->si_opaque.so_clid = cl->cl_clientid; + new_id = get_new_stid(stid); + s->si_opaque.so_id = (u32)new_id; + /* Will be incremented before return to client: */ + s->si_generation = 0; +} + +static struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct kmem_cache *slab) +{ + struct idr *stateids = &cl->cl_stateids; + + if (!idr_pre_get(stateids, GFP_KERNEL)) + return NULL; + /* + * Note: if we fail here (or any time between now and the time + * we actually get the new idr), we won't need to undo the idr + * preallocation, since the idr code caps the number of + * preallocated entries. + */ + return kmem_cache_alloc(slab, GFP_KERNEL); +} + +static struct nfs4_ol_stateid * nfs4_alloc_stateid(struct nfs4_client *clp) +{ + return openlockstateid(nfs4_alloc_stid(clp, stateid_slab)); +} + static struct nfs4_delegation * -alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_fh *current_fh, u32 type) +alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct svc_fh *current_fh, u32 type) { struct nfs4_delegation *dp; struct nfs4_file *fp = stp->st_file; @@ -224,21 +291,23 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f return NULL; if (num_delegations > max_delegations) return NULL; - dp = kmem_cache_alloc(deleg_slab, GFP_KERNEL); + dp = delegstateid(nfs4_alloc_stid(clp, deleg_slab)); if (dp == NULL) return dp; + init_stid(&dp->dl_stid, clp, NFS4_DELEG_STID); + /* + * delegation seqid's are never incremented. The 4.1 special + * meaning of seqid 0 isn't meaningful, really, but let's avoid + * 0 anyway just for consistency and use 1: + */ + dp->dl_stid.sc_stateid.si_generation = 1; num_delegations++; INIT_LIST_HEAD(&dp->dl_perfile); INIT_LIST_HEAD(&dp->dl_perclnt); INIT_LIST_HEAD(&dp->dl_recall_lru); - dp->dl_client = clp; get_nfs4_file(fp); dp->dl_file = fp; dp->dl_type = type; - dp->dl_stateid.si_boot = boot_time; - dp->dl_stateid.si_stateownerid = current_delegid++; - dp->dl_stateid.si_fileid = 0; - dp->dl_stateid.si_generation = 0; fh_copy_shallow(&dp->dl_fh, ¤t_fh->fh_handle); dp->dl_time = 0; atomic_set(&dp->dl_count, 1); @@ -267,10 +336,18 @@ static void nfs4_put_deleg_lease(struct nfs4_file *fp) } } +static void unhash_stid(struct nfs4_stid *s) +{ + struct idr *stateids = &s->sc_client->cl_stateids; + + idr_remove(stateids, s->sc_stateid.si_opaque.so_id); +} + /* Called under the state lock. */ static void unhash_delegation(struct nfs4_delegation *dp) { + unhash_stid(&dp->dl_stid); list_del_init(&dp->dl_perclnt); spin_lock(&recall_lock); list_del_init(&dp->dl_perfile); @@ -292,10 +369,16 @@ static DEFINE_SPINLOCK(client_lock); #define CLIENT_HASH_SIZE (1 << CLIENT_HASH_BITS) #define CLIENT_HASH_MASK (CLIENT_HASH_SIZE - 1) -#define clientid_hashval(id) \ - ((id) & CLIENT_HASH_MASK) -#define clientstr_hashval(name) \ - (opaque_hashval((name), 8) & CLIENT_HASH_MASK) +static unsigned int clientid_hashval(u32 id) +{ + return id & CLIENT_HASH_MASK; +} + +static unsigned int clientstr_hashval(const char *name) +{ + return opaque_hashval(name, 8) & CLIENT_HASH_MASK; +} + /* * reclaim_str_hashtbl[] holds known client info from previous reset/reboot * used in reboot/reset lease grace period processing @@ -362,7 +445,7 @@ set_deny(unsigned int *deny, unsigned long bmap) { } static int -test_share(struct nfs4_stateid *stp, struct nfsd4_open *open) { +test_share(struct nfs4_ol_stateid *stp, struct nfsd4_open *open) { unsigned int access, deny; set_access(&access, stp->st_access_bmap); @@ -385,14 +468,13 @@ static int nfs4_access_to_omode(u32 access) BUG(); } -static void unhash_generic_stateid(struct nfs4_stateid *stp) +static void unhash_generic_stateid(struct nfs4_ol_stateid *stp) { - list_del(&stp->st_hash); list_del(&stp->st_perfile); list_del(&stp->st_perstateowner); } -static void free_generic_stateid(struct nfs4_stateid *stp) +static void close_generic_stateid(struct nfs4_ol_stateid *stp) { int i; @@ -401,84 +483,106 @@ static void free_generic_stateid(struct nfs4_stateid *stp) if (test_bit(i, &stp->st_access_bmap)) nfs4_file_put_access(stp->st_file, nfs4_access_to_omode(i)); + __clear_bit(i, &stp->st_access_bmap); } } put_nfs4_file(stp->st_file); + stp->st_file = NULL; +} + +static void free_generic_stateid(struct nfs4_ol_stateid *stp) +{ kmem_cache_free(stateid_slab, stp); } -static void release_lock_stateid(struct nfs4_stateid *stp) +static void release_lock_stateid(struct nfs4_ol_stateid *stp) { struct file *file; unhash_generic_stateid(stp); + unhash_stid(&stp->st_stid); file = find_any_file(stp->st_file); if (file) - locks_remove_posix(file, (fl_owner_t)stp->st_stateowner); + locks_remove_posix(file, (fl_owner_t)lockowner(stp->st_stateowner)); + close_generic_stateid(stp); free_generic_stateid(stp); } -static void unhash_lockowner(struct nfs4_stateowner *sop) +static void unhash_lockowner(struct nfs4_lockowner *lo) { - struct nfs4_stateid *stp; + struct nfs4_ol_stateid *stp; - list_del(&sop->so_idhash); - list_del(&sop->so_strhash); - list_del(&sop->so_perstateid); - while (!list_empty(&sop->so_stateids)) { - stp = list_first_entry(&sop->so_stateids, - struct nfs4_stateid, st_perstateowner); + list_del(&lo->lo_owner.so_strhash); + list_del(&lo->lo_perstateid); + while (!list_empty(&lo->lo_owner.so_stateids)) { + stp = list_first_entry(&lo->lo_owner.so_stateids, + struct nfs4_ol_stateid, st_perstateowner); release_lock_stateid(stp); } } -static void release_lockowner(struct nfs4_stateowner *sop) +static void release_lockowner(struct nfs4_lockowner *lo) { - unhash_lockowner(sop); - nfs4_put_stateowner(sop); + unhash_lockowner(lo); + nfs4_free_lockowner(lo); } static void -release_stateid_lockowners(struct nfs4_stateid *open_stp) +release_stateid_lockowners(struct nfs4_ol_stateid *open_stp) { - struct nfs4_stateowner *lock_sop; + struct nfs4_lockowner *lo; while (!list_empty(&open_stp->st_lockowners)) { - lock_sop = list_entry(open_stp->st_lockowners.next, - struct nfs4_stateowner, so_perstateid); - /* list_del(&open_stp->st_lockowners); */ - BUG_ON(lock_sop->so_is_open_owner); - release_lockowner(lock_sop); + lo = list_entry(open_stp->st_lockowners.next, + struct nfs4_lockowner, lo_perstateid); + release_lockowner(lo); } } -static void release_open_stateid(struct nfs4_stateid *stp) +static void unhash_open_stateid(struct nfs4_ol_stateid *stp) { unhash_generic_stateid(stp); release_stateid_lockowners(stp); + close_generic_stateid(stp); +} + +static void release_open_stateid(struct nfs4_ol_stateid *stp) +{ + unhash_open_stateid(stp); + unhash_stid(&stp->st_stid); free_generic_stateid(stp); } -static void unhash_openowner(struct nfs4_stateowner *sop) +static void unhash_openowner(struct nfs4_openowner *oo) { - struct nfs4_stateid *stp; + struct nfs4_ol_stateid *stp; - list_del(&sop->so_idhash); - list_del(&sop->so_strhash); - list_del(&sop->so_perclient); - list_del(&sop->so_perstateid); /* XXX: necessary? */ - while (!list_empty(&sop->so_stateids)) { - stp = list_first_entry(&sop->so_stateids, - struct nfs4_stateid, st_perstateowner); + list_del(&oo->oo_owner.so_strhash); + list_del(&oo->oo_perclient); + while (!list_empty(&oo->oo_owner.so_stateids)) { + stp = list_first_entry(&oo->oo_owner.so_stateids, + struct nfs4_ol_stateid, st_perstateowner); release_open_stateid(stp); } } -static void release_openowner(struct nfs4_stateowner *sop) +static void release_last_closed_stateid(struct nfs4_openowner *oo) { - unhash_openowner(sop); - list_del(&sop->so_close_lru); - nfs4_put_stateowner(sop); + struct nfs4_ol_stateid *s = oo->oo_last_closed_stid; + + if (s) { + unhash_stid(&s->st_stid); + free_generic_stateid(s); + oo->oo_last_closed_stid = NULL; + } +} + +static void release_openowner(struct nfs4_openowner *oo) +{ + unhash_openowner(oo); + list_del(&oo->oo_close_lru); + release_last_closed_stateid(oo); + nfs4_free_openowner(oo); } #define SESSION_HASH_SIZE 512 @@ -843,9 +947,6 @@ renew_client_locked(struct nfs4_client *clp) return; } - /* - * Move client to the end to the LRU list. - */ dprintk("renewing client (clientid %08x/%08x)\n", clp->cl_clientid.cl_boot, clp->cl_clientid.cl_id); @@ -943,7 +1044,7 @@ unhash_client_locked(struct nfs4_client *clp) static void expire_client(struct nfs4_client *clp) { - struct nfs4_stateowner *sop; + struct nfs4_openowner *oo; struct nfs4_delegation *dp; struct list_head reaplist; @@ -961,8 +1062,8 @@ expire_client(struct nfs4_client *clp) unhash_delegation(dp); } while (!list_empty(&clp->cl_openowners)) { - sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient); - release_openowner(sop); + oo = list_entry(clp->cl_openowners.next, struct nfs4_openowner, oo_perclient); + release_openowner(oo); } nfsd4_shutdown_callback(clp); if (clp->cl_cb_conn.cb_xprt) @@ -1038,6 +1139,23 @@ static void gen_confirm(struct nfs4_client *clp) *p++ = i++; } +static struct nfs4_stid *find_stateid(struct nfs4_client *cl, stateid_t *t) +{ + return idr_find(&cl->cl_stateids, t->si_opaque.so_id); +} + +static struct nfs4_stid *find_stateid_by_type(struct nfs4_client *cl, stateid_t *t, char typemask) +{ + struct nfs4_stid *s; + + s = find_stateid(cl, t); + if (!s) + return NULL; + if (typemask & s->sc_type) + return s; + return NULL; +} + static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir, struct svc_rqst *rqstp, nfs4_verifier *verf) { @@ -1060,6 +1178,7 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir, } } + idr_init(&clp->cl_stateids); memcpy(clp->cl_recdir, recdir, HEXDIR_LEN); atomic_set(&clp->cl_refcount, 0); clp->cl_cb_state = NFSD4_CB_UNKNOWN; @@ -1083,17 +1202,6 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir, return clp; } -static int check_name(struct xdr_netobj name) -{ - if (name.len == 0) - return 0; - if (name.len > NFS4_OPAQUE_LIMIT) { - dprintk("NFSD: check_name: name too long(%d)!\n", name.len); - return 0; - } - return 1; -} - static void add_to_unconfirmed(struct nfs4_client *clp, unsigned int strhashval) { @@ -1125,8 +1233,10 @@ find_confirmed_client(clientid_t *clid) unsigned int idhashval = clientid_hashval(clid->cl_id); list_for_each_entry(clp, &conf_id_hashtbl[idhashval], cl_idhash) { - if (same_clid(&clp->cl_clientid, clid)) + if (same_clid(&clp->cl_clientid, clid)) { + renew_client(clp); return clp; + } } return NULL; } @@ -1173,20 +1283,6 @@ find_unconfirmed_client_by_str(const char *dname, unsigned int hashval) return NULL; } -static void rpc_svcaddr2sockaddr(struct sockaddr *sa, unsigned short family, union svc_addr_u *svcaddr) -{ - switch (family) { - case AF_INET: - ((struct sockaddr_in *)sa)->sin_family = AF_INET; - ((struct sockaddr_in *)sa)->sin_addr = svcaddr->addr; - return; - case AF_INET6: - ((struct sockaddr_in6 *)sa)->sin6_family = AF_INET6; - ((struct sockaddr_in6 *)sa)->sin6_addr = svcaddr->addr6; - return; - } -} - static void gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, struct svc_rqst *rqstp) { @@ -1218,7 +1314,7 @@ gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, struct svc_r conn->cb_prog = se->se_callback_prog; conn->cb_ident = se->se_callback_ident; - rpc_svcaddr2sockaddr((struct sockaddr *)&conn->cb_saddr, expected_family, &rqstp->rq_daddr); + memcpy(&conn->cb_saddr, &rqstp->rq_daddr, rqstp->rq_daddrlen); return; out_err: conn->cb_addr.ss_family = AF_UNSPEC; @@ -1350,7 +1446,7 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, __func__, rqstp, exid, exid->clname.len, exid->clname.data, addr_str, exid->flags, exid->spa_how); - if (!check_name(exid->clname) || (exid->flags & ~EXCHGID4_FLAG_MASK_A)) + if (exid->flags & ~EXCHGID4_FLAG_MASK_A) return nfserr_inval; /* Currently only support SP4_NONE */ @@ -1849,8 +1945,16 @@ out: nfsd4_get_session(cstate->session); atomic_inc(&clp->cl_refcount); - if (clp->cl_cb_state == NFSD4_CB_DOWN) - seq->status_flags |= SEQ4_STATUS_CB_PATH_DOWN; + switch (clp->cl_cb_state) { + case NFSD4_CB_DOWN: + seq->status_flags = SEQ4_STATUS_CB_PATH_DOWN; + break; + case NFSD4_CB_FAULT: + seq->status_flags = SEQ4_STATUS_BACKCHANNEL_FAULT; + break; + default: + seq->status_flags = 0; + } } kfree(conn); spin_unlock(&client_lock); @@ -1858,6 +1962,50 @@ out: return status; } +static inline bool has_resources(struct nfs4_client *clp) +{ + return !list_empty(&clp->cl_openowners) + || !list_empty(&clp->cl_delegations) + || !list_empty(&clp->cl_sessions); +} + +__be32 +nfsd4_destroy_clientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_destroy_clientid *dc) +{ + struct nfs4_client *conf, *unconf, *clp; + int status = 0; + + nfs4_lock_state(); + unconf = find_unconfirmed_client(&dc->clientid); + conf = find_confirmed_client(&dc->clientid); + + if (conf) { + clp = conf; + + if (!is_client_expired(conf) && has_resources(conf)) { + status = nfserr_clientid_busy; + goto out; + } + + /* rfc5661 18.50.3 */ + if (cstate->session && conf == cstate->session->se_client) { + status = nfserr_clientid_busy; + goto out; + } + } else if (unconf) + clp = unconf; + else { + status = nfserr_stale_clientid; + goto out; + } + + expire_client(clp); +out: + nfs4_unlock_state(); + dprintk("%s return %d\n", __func__, ntohl(status)); + return status; +} + __be32 nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_reclaim_complete *rc) { @@ -1900,19 +2048,13 @@ __be32 nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_setclientid *setclid) { - struct xdr_netobj clname = { - .len = setclid->se_namelen, - .data = setclid->se_name, - }; + struct xdr_netobj clname = setclid->se_name; nfs4_verifier clverifier = setclid->se_verf; unsigned int strhashval; struct nfs4_client *conf, *unconf, *new; __be32 status; char dname[HEXDIR_LEN]; - if (!check_name(clname)) - return nfserr_inval; - status = nfs4_make_rec_clidname(dname, &clname); if (status) return status; @@ -1946,7 +2088,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * of 5 bullet points, labeled as CASE0 - CASE4 below. */ unconf = find_unconfirmed_client_by_str(dname, strhashval); - status = nfserr_resource; + status = nfserr_jukebox; if (!conf) { /* * RFC 3530 14.2.33 CASE 4: @@ -2116,31 +2258,28 @@ out: return status; } +static struct nfs4_file *nfsd4_alloc_file(void) +{ + return kmem_cache_alloc(file_slab, GFP_KERNEL); +} + /* OPEN Share state helper functions */ -static inline struct nfs4_file * -alloc_init_file(struct inode *ino) +static void nfsd4_init_file(struct nfs4_file *fp, struct inode *ino) { - struct nfs4_file *fp; unsigned int hashval = file_hashval(ino); - fp = kmem_cache_alloc(file_slab, GFP_KERNEL); - if (fp) { - atomic_set(&fp->fi_ref, 1); - INIT_LIST_HEAD(&fp->fi_hash); - INIT_LIST_HEAD(&fp->fi_stateids); - INIT_LIST_HEAD(&fp->fi_delegations); - fp->fi_inode = igrab(ino); - fp->fi_id = current_fileid++; - fp->fi_had_conflict = false; - fp->fi_lease = NULL; - memset(fp->fi_fds, 0, sizeof(fp->fi_fds)); - memset(fp->fi_access, 0, sizeof(fp->fi_access)); - spin_lock(&recall_lock); - list_add(&fp->fi_hash, &file_hashtbl[hashval]); - spin_unlock(&recall_lock); - return fp; - } - return NULL; + atomic_set(&fp->fi_ref, 1); + INIT_LIST_HEAD(&fp->fi_hash); + INIT_LIST_HEAD(&fp->fi_stateids); + INIT_LIST_HEAD(&fp->fi_delegations); + fp->fi_inode = igrab(ino); + fp->fi_had_conflict = false; + fp->fi_lease = NULL; + memset(fp->fi_fds, 0, sizeof(fp->fi_fds)); + memset(fp->fi_access, 0, sizeof(fp->fi_access)); + spin_lock(&recall_lock); + list_add(&fp->fi_hash, &file_hashtbl[hashval]); + spin_unlock(&recall_lock); } static void @@ -2155,7 +2294,8 @@ nfsd4_free_slab(struct kmem_cache **slab) void nfsd4_free_slabs(void) { - nfsd4_free_slab(&stateowner_slab); + nfsd4_free_slab(&openowner_slab); + nfsd4_free_slab(&lockowner_slab); nfsd4_free_slab(&file_slab); nfsd4_free_slab(&stateid_slab); nfsd4_free_slab(&deleg_slab); @@ -2164,16 +2304,20 @@ nfsd4_free_slabs(void) static int nfsd4_init_slabs(void) { - stateowner_slab = kmem_cache_create("nfsd4_stateowners", - sizeof(struct nfs4_stateowner), 0, 0, NULL); - if (stateowner_slab == NULL) + openowner_slab = kmem_cache_create("nfsd4_openowners", + sizeof(struct nfs4_openowner), 0, 0, NULL); + if (openowner_slab == NULL) + goto out_nomem; + lockowner_slab = kmem_cache_create("nfsd4_lockowners", + sizeof(struct nfs4_openowner), 0, 0, NULL); + if (lockowner_slab == NULL) goto out_nomem; file_slab = kmem_cache_create("nfsd4_files", sizeof(struct nfs4_file), 0, 0, NULL); if (file_slab == NULL) goto out_nomem; stateid_slab = kmem_cache_create("nfsd4_stateids", - sizeof(struct nfs4_stateid), 0, 0, NULL); + sizeof(struct nfs4_ol_stateid), 0, 0, NULL); if (stateid_slab == NULL) goto out_nomem; deleg_slab = kmem_cache_create("nfsd4_delegations", @@ -2187,97 +2331,94 @@ out_nomem: return -ENOMEM; } -void -nfs4_free_stateowner(struct kref *kref) +void nfs4_free_openowner(struct nfs4_openowner *oo) { - struct nfs4_stateowner *sop = - container_of(kref, struct nfs4_stateowner, so_ref); - kfree(sop->so_owner.data); - kmem_cache_free(stateowner_slab, sop); + kfree(oo->oo_owner.so_owner.data); + kmem_cache_free(openowner_slab, oo); } -static inline struct nfs4_stateowner * -alloc_stateowner(struct xdr_netobj *owner) +void nfs4_free_lockowner(struct nfs4_lockowner *lo) { - struct nfs4_stateowner *sop; + kfree(lo->lo_owner.so_owner.data); + kmem_cache_free(lockowner_slab, lo); +} - if ((sop = kmem_cache_alloc(stateowner_slab, GFP_KERNEL))) { - if ((sop->so_owner.data = kmalloc(owner->len, GFP_KERNEL))) { - memcpy(sop->so_owner.data, owner->data, owner->len); - sop->so_owner.len = owner->len; - kref_init(&sop->so_ref); - return sop; - } - kmem_cache_free(stateowner_slab, sop); - } - return NULL; +static void init_nfs4_replay(struct nfs4_replay *rp) +{ + rp->rp_status = nfserr_serverfault; + rp->rp_buflen = 0; + rp->rp_buf = rp->rp_ibuf; } -static struct nfs4_stateowner * -alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, struct nfsd4_open *open) { +static inline void *alloc_stateowner(struct kmem_cache *slab, struct xdr_netobj *owner, struct nfs4_client *clp) +{ struct nfs4_stateowner *sop; - struct nfs4_replay *rp; - unsigned int idhashval; - if (!(sop = alloc_stateowner(&open->op_owner))) + sop = kmem_cache_alloc(slab, GFP_KERNEL); + if (!sop) + return NULL; + + sop->so_owner.data = kmemdup(owner->data, owner->len, GFP_KERNEL); + if (!sop->so_owner.data) { + kmem_cache_free(slab, sop); return NULL; - idhashval = ownerid_hashval(current_ownerid); - INIT_LIST_HEAD(&sop->so_idhash); - INIT_LIST_HEAD(&sop->so_strhash); - INIT_LIST_HEAD(&sop->so_perclient); + } + sop->so_owner.len = owner->len; + INIT_LIST_HEAD(&sop->so_stateids); - INIT_LIST_HEAD(&sop->so_perstateid); /* not used */ - INIT_LIST_HEAD(&sop->so_close_lru); - sop->so_time = 0; - list_add(&sop->so_idhash, &ownerid_hashtbl[idhashval]); - list_add(&sop->so_strhash, &ownerstr_hashtbl[strhashval]); - list_add(&sop->so_perclient, &clp->cl_openowners); - sop->so_is_open_owner = 1; - sop->so_id = current_ownerid++; sop->so_client = clp; - sop->so_seqid = open->op_seqid; - sop->so_confirmed = 0; - rp = &sop->so_replay; - rp->rp_status = nfserr_serverfault; - rp->rp_buflen = 0; - rp->rp_buf = rp->rp_ibuf; + init_nfs4_replay(&sop->so_replay); return sop; } -static inline void -init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) { - struct nfs4_stateowner *sop = open->op_stateowner; - unsigned int hashval = stateid_hashval(sop->so_id, fp->fi_id); +static void hash_openowner(struct nfs4_openowner *oo, struct nfs4_client *clp, unsigned int strhashval) +{ + list_add(&oo->oo_owner.so_strhash, &open_ownerstr_hashtbl[strhashval]); + list_add(&oo->oo_perclient, &clp->cl_openowners); +} - INIT_LIST_HEAD(&stp->st_hash); - INIT_LIST_HEAD(&stp->st_perstateowner); +static struct nfs4_openowner * +alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, struct nfsd4_open *open) { + struct nfs4_openowner *oo; + + oo = alloc_stateowner(openowner_slab, &open->op_owner, clp); + if (!oo) + return NULL; + oo->oo_owner.so_is_open_owner = 1; + oo->oo_owner.so_seqid = open->op_seqid; + oo->oo_flags = NFS4_OO_NEW; + oo->oo_time = 0; + oo->oo_last_closed_stid = NULL; + INIT_LIST_HEAD(&oo->oo_close_lru); + hash_openowner(oo, clp, strhashval); + return oo; +} + +static void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) { + struct nfs4_openowner *oo = open->op_openowner; + struct nfs4_client *clp = oo->oo_owner.so_client; + + init_stid(&stp->st_stid, clp, NFS4_OPEN_STID); INIT_LIST_HEAD(&stp->st_lockowners); - INIT_LIST_HEAD(&stp->st_perfile); - list_add(&stp->st_hash, &stateid_hashtbl[hashval]); - list_add(&stp->st_perstateowner, &sop->so_stateids); + list_add(&stp->st_perstateowner, &oo->oo_owner.so_stateids); list_add(&stp->st_perfile, &fp->fi_stateids); - stp->st_stateowner = sop; + stp->st_stateowner = &oo->oo_owner; get_nfs4_file(fp); stp->st_file = fp; - stp->st_stateid.si_boot = boot_time; - stp->st_stateid.si_stateownerid = sop->so_id; - stp->st_stateid.si_fileid = fp->fi_id; - stp->st_stateid.si_generation = 0; stp->st_access_bmap = 0; stp->st_deny_bmap = 0; - __set_bit(open->op_share_access & ~NFS4_SHARE_WANT_MASK, - &stp->st_access_bmap); + __set_bit(open->op_share_access, &stp->st_access_bmap); __set_bit(open->op_share_deny, &stp->st_deny_bmap); stp->st_openstp = NULL; } static void -move_to_close_lru(struct nfs4_stateowner *sop) +move_to_close_lru(struct nfs4_openowner *oo) { - dprintk("NFSD: move_to_close_lru nfs4_stateowner %p\n", sop); + dprintk("NFSD: move_to_close_lru nfs4_openowner %p\n", oo); - list_move_tail(&sop->so_close_lru, &close_lru); - sop->so_time = get_seconds(); + list_move_tail(&oo->oo_close_lru, &close_lru); + oo->oo_time = get_seconds(); } static int @@ -2289,14 +2430,18 @@ same_owner_str(struct nfs4_stateowner *sop, struct xdr_netobj *owner, (sop->so_client->cl_clientid.cl_id == clid->cl_id); } -static struct nfs4_stateowner * +static struct nfs4_openowner * find_openstateowner_str(unsigned int hashval, struct nfsd4_open *open) { - struct nfs4_stateowner *so = NULL; + struct nfs4_stateowner *so; + struct nfs4_openowner *oo; - list_for_each_entry(so, &ownerstr_hashtbl[hashval], so_strhash) { - if (same_owner_str(so, &open->op_owner, &open->op_clientid)) - return so; + list_for_each_entry(so, &open_ownerstr_hashtbl[hashval], so_strhash) { + if (same_owner_str(so, &open->op_owner, &open->op_clientid)) { + oo = openowner(so); + renew_client(oo->oo_owner.so_client); + return oo; + } } return NULL; } @@ -2320,31 +2465,6 @@ find_file(struct inode *ino) return NULL; } -static inline int access_valid(u32 x, u32 minorversion) -{ - if ((x & NFS4_SHARE_ACCESS_MASK) < NFS4_SHARE_ACCESS_READ) - return 0; - if ((x & NFS4_SHARE_ACCESS_MASK) > NFS4_SHARE_ACCESS_BOTH) - return 0; - x &= ~NFS4_SHARE_ACCESS_MASK; - if (minorversion && x) { - if ((x & NFS4_SHARE_WANT_MASK) > NFS4_SHARE_WANT_CANCEL) - return 0; - if ((x & NFS4_SHARE_WHEN_MASK) > NFS4_SHARE_PUSH_DELEG_WHEN_UNCONTENDED) - return 0; - x &= ~(NFS4_SHARE_WANT_MASK | NFS4_SHARE_WHEN_MASK); - } - if (x) - return 0; - return 1; -} - -static inline int deny_valid(u32 x) -{ - /* Note: unlike access bits, deny bits may be zero. */ - return x <= NFS4_SHARE_DENY_BOTH; -} - /* * Called to check deny when READ with all zero stateid or * WRITE with all zero or all one stateid @@ -2354,7 +2474,7 @@ nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type) { struct inode *ino = current_fh->fh_dentry->d_inode; struct nfs4_file *fp; - struct nfs4_stateid *stp; + struct nfs4_ol_stateid *stp; __be32 ret; dprintk("NFSD: nfs4_share_conflict\n"); @@ -2429,6 +2549,16 @@ static const struct lock_manager_operations nfsd_lease_mng_ops = { .lm_change = nfsd_change_deleg_cb, }; +static __be32 nfsd4_check_seqid(struct nfsd4_compound_state *cstate, struct nfs4_stateowner *so, u32 seqid) +{ + if (nfsd4_has_session(cstate)) + return nfs_ok; + if (seqid == so->so_seqid - 1) + return nfserr_replay_me; + if (seqid == so->so_seqid) + return nfs_ok; + return nfserr_bad_seqid; +} __be32 nfsd4_process_open1(struct nfsd4_compound_state *cstate, @@ -2437,57 +2567,49 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate, clientid_t *clientid = &open->op_clientid; struct nfs4_client *clp = NULL; unsigned int strhashval; - struct nfs4_stateowner *sop = NULL; - - if (!check_name(open->op_owner)) - return nfserr_inval; + struct nfs4_openowner *oo = NULL; + __be32 status; if (STALE_CLIENTID(&open->op_clientid)) return nfserr_stale_clientid; + /* + * In case we need it later, after we've already created the + * file and don't want to risk a further failure: + */ + open->op_file = nfsd4_alloc_file(); + if (open->op_file == NULL) + return nfserr_jukebox; - strhashval = ownerstr_hashval(clientid->cl_id, open->op_owner); - sop = find_openstateowner_str(strhashval, open); - open->op_stateowner = sop; - if (!sop) { - /* Make sure the client's lease hasn't expired. */ + strhashval = open_ownerstr_hashval(clientid->cl_id, &open->op_owner); + oo = find_openstateowner_str(strhashval, open); + open->op_openowner = oo; + if (!oo) { clp = find_confirmed_client(clientid); if (clp == NULL) return nfserr_expired; - goto renew; + goto new_owner; } - /* When sessions are used, skip open sequenceid processing */ - if (nfsd4_has_session(cstate)) - goto renew; - if (!sop->so_confirmed) { + if (!(oo->oo_flags & NFS4_OO_CONFIRMED)) { /* Replace unconfirmed owners without checking for replay. */ - clp = sop->so_client; - release_openowner(sop); - open->op_stateowner = NULL; - goto renew; - } - if (open->op_seqid == sop->so_seqid - 1) { - if (sop->so_replay.rp_buflen) - return nfserr_replay_me; - /* The original OPEN failed so spectacularly - * that we don't even have replay data saved! - * Therefore, we have no choice but to continue - * processing this OPEN; presumably, we'll - * fail again for the same reason. - */ - dprintk("nfsd4_process_open1: replay with no replay cache\n"); - goto renew; - } - if (open->op_seqid != sop->so_seqid) - return nfserr_bad_seqid; -renew: - if (open->op_stateowner == NULL) { - sop = alloc_init_open_stateowner(strhashval, clp, open); - if (sop == NULL) - return nfserr_resource; - open->op_stateowner = sop; + clp = oo->oo_owner.so_client; + release_openowner(oo); + open->op_openowner = NULL; + goto new_owner; } - list_del_init(&sop->so_close_lru); - renew_client(sop->so_client); + status = nfsd4_check_seqid(cstate, &oo->oo_owner, open->op_seqid); + if (status) + return status; + clp = oo->oo_owner.so_client; + goto alloc_stateid; +new_owner: + oo = alloc_init_open_stateowner(strhashval, clp, open); + if (oo == NULL) + return nfserr_jukebox; + open->op_openowner = oo; +alloc_stateid: + open->op_stp = nfs4_alloc_stateid(clp); + if (!open->op_stp) + return nfserr_jukebox; return nfs_ok; } @@ -2500,36 +2622,37 @@ nfs4_check_delegmode(struct nfs4_delegation *dp, int flags) return nfs_ok; } -static struct nfs4_delegation * -find_delegation_file(struct nfs4_file *fp, stateid_t *stid) +static int share_access_to_flags(u32 share_access) { - struct nfs4_delegation *dp; + share_access &= ~NFS4_SHARE_WANT_MASK; - spin_lock(&recall_lock); - list_for_each_entry(dp, &fp->fi_delegations, dl_perfile) - if (dp->dl_stateid.si_stateownerid == stid->si_stateownerid) { - spin_unlock(&recall_lock); - return dp; - } - spin_unlock(&recall_lock); - return NULL; + return share_access == NFS4_SHARE_ACCESS_READ ? RD_STATE : WR_STATE; } -static int share_access_to_flags(u32 share_access) +static struct nfs4_delegation *find_deleg_stateid(struct nfs4_client *cl, stateid_t *s) { - share_access &= ~NFS4_SHARE_WANT_MASK; + struct nfs4_stid *ret; - return share_access == NFS4_SHARE_ACCESS_READ ? RD_STATE : WR_STATE; + ret = find_stateid_by_type(cl, s, NFS4_DELEG_STID); + if (!ret) + return NULL; + return delegstateid(ret); +} + +static bool nfsd4_is_deleg_cur(struct nfsd4_open *open) +{ + return open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR || + open->op_claim_type == NFS4_OPEN_CLAIM_DELEG_CUR_FH; } static __be32 -nfs4_check_deleg(struct nfs4_file *fp, struct nfsd4_open *open, +nfs4_check_deleg(struct nfs4_client *cl, struct nfs4_file *fp, struct nfsd4_open *open, struct nfs4_delegation **dp) { int flags; __be32 status = nfserr_bad_stateid; - *dp = find_delegation_file(fp, &open->op_delegate_stateid); + *dp = find_deleg_stateid(cl, &open->op_delegate_stateid); if (*dp == NULL) goto out; flags = share_access_to_flags(open->op_share_access); @@ -2537,41 +2660,37 @@ nfs4_check_deleg(struct nfs4_file *fp, struct nfsd4_open *open, if (status) *dp = NULL; out: - if (open->op_claim_type != NFS4_OPEN_CLAIM_DELEGATE_CUR) + if (!nfsd4_is_deleg_cur(open)) return nfs_ok; if (status) return status; - open->op_stateowner->so_confirmed = 1; + open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; return nfs_ok; } static __be32 -nfs4_check_open(struct nfs4_file *fp, struct nfsd4_open *open, struct nfs4_stateid **stpp) +nfs4_check_open(struct nfs4_file *fp, struct nfsd4_open *open, struct nfs4_ol_stateid **stpp) { - struct nfs4_stateid *local; - __be32 status = nfserr_share_denied; - struct nfs4_stateowner *sop = open->op_stateowner; + struct nfs4_ol_stateid *local; + struct nfs4_openowner *oo = open->op_openowner; list_for_each_entry(local, &fp->fi_stateids, st_perfile) { /* ignore lock owners */ if (local->st_stateowner->so_is_open_owner == 0) continue; /* remember if we have seen this open owner */ - if (local->st_stateowner == sop) + if (local->st_stateowner == &oo->oo_owner) *stpp = local; /* check for conflicting share reservations */ if (!test_share(local, open)) - goto out; + return nfserr_share_denied; } - status = 0; -out: - return status; + return nfs_ok; } -static inline struct nfs4_stateid * -nfs4_alloc_stateid(void) +static void nfs4_free_stateid(struct nfs4_ol_stateid *s) { - return kmem_cache_alloc(stateid_slab, GFP_KERNEL); + kmem_cache_free(stateid_slab, s); } static inline int nfs4_access_to_access(u32 nfs4_access) @@ -2592,12 +2711,6 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp, int oflag = nfs4_access_to_omode(open->op_share_access); int access = nfs4_access_to_access(open->op_share_access); - /* CLAIM_DELEGATE_CUR is used in response to a broken lease; - * allowing it to break the lease and return EAGAIN leaves the - * client unable to make progress in returning the delegation */ - if (open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR) - access |= NFSD_MAY_NOT_BREAK_LEASE; - if (!fp->fi_fds[oflag]) { status = nfsd_open(rqstp, cur_fh, S_IFREG, access, &fp->fi_fds[oflag]); @@ -2609,27 +2722,6 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp, return nfs_ok; } -static __be32 -nfs4_new_open(struct svc_rqst *rqstp, struct nfs4_stateid **stpp, - struct nfs4_file *fp, struct svc_fh *cur_fh, - struct nfsd4_open *open) -{ - struct nfs4_stateid *stp; - __be32 status; - - stp = nfs4_alloc_stateid(); - if (stp == NULL) - return nfserr_resource; - - status = nfs4_get_vfs_file(rqstp, fp, cur_fh, open); - if (status) { - kmem_cache_free(stateid_slab, stp); - return status; - } - *stpp = stp; - return 0; -} - static inline __be32 nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh, struct nfsd4_open *open) @@ -2646,9 +2738,9 @@ nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh, } static __be32 -nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_stateid *stp, struct nfsd4_open *open) +nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp, struct nfsd4_open *open) { - u32 op_share_access = open->op_share_access & ~NFS4_SHARE_WANT_MASK; + u32 op_share_access = open->op_share_access; bool new_access; __be32 status; @@ -2677,8 +2769,8 @@ nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *c static void nfs4_set_claim_prev(struct nfsd4_open *open) { - open->op_stateowner->so_confirmed = 1; - open->op_stateowner->so_client->cl_firststate = 1; + open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; + open->op_openowner->oo_owner.so_client->cl_firststate = 1; } /* Should we give out recallable state?: */ @@ -2721,7 +2813,7 @@ static int nfs4_setlease(struct nfs4_delegation *dp, int flag) if (!fl) return -ENOMEM; fl->fl_file = find_readable_file(fp); - list_add(&dp->dl_perclnt, &dp->dl_client->cl_delegations); + list_add(&dp->dl_perclnt, &dp->dl_stid.sc_client->cl_delegations); status = vfs_setlease(fl->fl_file, fl->fl_type, &fl); if (status) { list_del_init(&dp->dl_perclnt); @@ -2750,7 +2842,7 @@ static int nfs4_set_delegation(struct nfs4_delegation *dp, int flag) atomic_inc(&fp->fi_delegees); list_add(&dp->dl_perfile, &fp->fi_delegations); spin_unlock(&recall_lock); - list_add(&dp->dl_perclnt, &dp->dl_client->cl_delegations); + list_add(&dp->dl_perclnt, &dp->dl_stid.sc_client->cl_delegations); return 0; } @@ -2758,14 +2850,14 @@ static int nfs4_set_delegation(struct nfs4_delegation *dp, int flag) * Attempt to hand out a delegation. */ static void -nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_stateid *stp) +nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_ol_stateid *stp) { struct nfs4_delegation *dp; - struct nfs4_stateowner *sop = stp->st_stateowner; + struct nfs4_openowner *oo = container_of(stp->st_stateowner, struct nfs4_openowner, oo_owner); int cb_up; int status, flag = 0; - cb_up = nfsd4_cb_channel_good(sop->so_client); + cb_up = nfsd4_cb_channel_good(oo->oo_owner.so_client); flag = NFS4_OPEN_DELEGATE_NONE; open->op_recall = 0; switch (open->op_claim_type) { @@ -2781,7 +2873,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta * had the chance to reclaim theirs.... */ if (locks_in_grace()) goto out; - if (!cb_up || !sop->so_confirmed) + if (!cb_up || !(oo->oo_flags & NFS4_OO_CONFIRMED)) goto out; if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) flag = NFS4_OPEN_DELEGATE_WRITE; @@ -2792,17 +2884,17 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta goto out; } - dp = alloc_init_deleg(sop->so_client, stp, fh, flag); + dp = alloc_init_deleg(oo->oo_owner.so_client, stp, fh, flag); if (dp == NULL) goto out_no_deleg; status = nfs4_set_delegation(dp, flag); if (status) goto out_free; - memcpy(&open->op_delegate_stateid, &dp->dl_stateid, sizeof(dp->dl_stateid)); + memcpy(&open->op_delegate_stateid, &dp->dl_stid.sc_stateid, sizeof(dp->dl_stid.sc_stateid)); dprintk("NFSD: delegation stateid=" STATEID_FMT "\n", - STATEID_VAL(&dp->dl_stateid)); + STATEID_VAL(&dp->dl_stid.sc_stateid)); out: if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS && flag == NFS4_OPEN_DELEGATE_NONE @@ -2824,16 +2916,13 @@ __be32 nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open) { struct nfsd4_compoundres *resp = rqstp->rq_resp; + struct nfs4_client *cl = open->op_openowner->oo_owner.so_client; struct nfs4_file *fp = NULL; struct inode *ino = current_fh->fh_dentry->d_inode; - struct nfs4_stateid *stp = NULL; + struct nfs4_ol_stateid *stp = NULL; struct nfs4_delegation *dp = NULL; __be32 status; - status = nfserr_inval; - if (!access_valid(open->op_share_access, resp->cstate.minorversion) - || !deny_valid(open->op_share_deny)) - goto out; /* * Lookup file; if found, lookup stateid and check open request, * and check for delegations in the process of being recalled. @@ -2843,17 +2932,17 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf if (fp) { if ((status = nfs4_check_open(fp, open, &stp))) goto out; - status = nfs4_check_deleg(fp, open, &dp); + status = nfs4_check_deleg(cl, fp, open, &dp); if (status) goto out; } else { status = nfserr_bad_stateid; - if (open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR) - goto out; - status = nfserr_resource; - fp = alloc_init_file(ino); - if (fp == NULL) + if (nfsd4_is_deleg_cur(open)) goto out; + status = nfserr_jukebox; + fp = open->op_file; + open->op_file = NULL; + nfsd4_init_file(fp, ino); } /* @@ -2865,24 +2954,24 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf status = nfs4_upgrade_open(rqstp, fp, current_fh, stp, open); if (status) goto out; - update_stateid(&stp->st_stateid); } else { - status = nfs4_new_open(rqstp, &stp, fp, current_fh, open); + status = nfs4_get_vfs_file(rqstp, fp, current_fh, open); if (status) goto out; - init_stateid(stp, fp, open); + stp = open->op_stp; + open->op_stp = NULL; + init_open_stateid(stp, fp, open); status = nfsd4_truncate(rqstp, current_fh, open); if (status) { release_open_stateid(stp); goto out; } - if (nfsd4_has_session(&resp->cstate)) - update_stateid(&stp->st_stateid); } - memcpy(&open->op_stateid, &stp->st_stateid, sizeof(stateid_t)); + update_stateid(&stp->st_stid.sc_stateid); + memcpy(&open->op_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); if (nfsd4_has_session(&resp->cstate)) - open->op_stateowner->so_confirmed = 1; + open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; /* * Attempt to hand out a delegation. No error return, because the @@ -2893,7 +2982,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf status = nfs_ok; dprintk("%s: stateid=" STATEID_FMT "\n", __func__, - STATEID_VAL(&stp->st_stateid)); + STATEID_VAL(&stp->st_stid.sc_stateid)); out: if (fp) put_nfs4_file(fp); @@ -2903,13 +2992,34 @@ out: * To finish the open response, we just need to set the rflags. */ open->op_rflags = NFS4_OPEN_RESULT_LOCKTYPE_POSIX; - if (!open->op_stateowner->so_confirmed && + if (!(open->op_openowner->oo_flags & NFS4_OO_CONFIRMED) && !nfsd4_has_session(&resp->cstate)) open->op_rflags |= NFS4_OPEN_RESULT_CONFIRM; return status; } +void nfsd4_cleanup_open_state(struct nfsd4_open *open, __be32 status) +{ + if (open->op_openowner) { + struct nfs4_openowner *oo = open->op_openowner; + + if (!list_empty(&oo->oo_owner.so_stateids)) + list_del_init(&oo->oo_close_lru); + if (oo->oo_flags & NFS4_OO_NEW) { + if (status) { + release_openowner(oo); + open->op_openowner = NULL; + } else + oo->oo_flags &= ~NFS4_OO_NEW; + } + } + if (open->op_file) + nfsd4_free_file(open->op_file); + if (open->op_stp) + nfs4_free_stateid(open->op_stp); +} + __be32 nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, clientid_t *clid) @@ -2930,7 +3040,6 @@ nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, dprintk("nfsd4_renew: clientid not found!\n"); goto out; } - renew_client(clp); status = nfserr_cb_path_down; if (!list_empty(&clp->cl_delegations) && clp->cl_cb_state != NFSD4_CB_UP) @@ -2962,7 +3071,7 @@ static time_t nfs4_laundromat(void) { struct nfs4_client *clp; - struct nfs4_stateowner *sop; + struct nfs4_openowner *oo; struct nfs4_delegation *dp; struct list_head *pos, *next, reaplist; time_t cutoff = get_seconds() - nfsd4_lease; @@ -3019,16 +3128,14 @@ nfs4_laundromat(void) } test_val = nfsd4_lease; list_for_each_safe(pos, next, &close_lru) { - sop = list_entry(pos, struct nfs4_stateowner, so_close_lru); - if (time_after((unsigned long)sop->so_time, (unsigned long)cutoff)) { - u = sop->so_time - cutoff; + oo = container_of(pos, struct nfs4_openowner, oo_close_lru); + if (time_after((unsigned long)oo->oo_time, (unsigned long)cutoff)) { + u = oo->oo_time - cutoff; if (test_val > u) test_val = u; break; } - dprintk("NFSD: purging unused open stateowner (so_id %d)\n", - sop->so_id); - release_openowner(sop); + release_openowner(oo); } if (clientid_val < NFSD_LAUNDROMAT_MINTIMEOUT) clientid_val = NFSD_LAUNDROMAT_MINTIMEOUT; @@ -3050,30 +3157,17 @@ laundromat_main(struct work_struct *not_used) queue_delayed_work(laundry_wq, &laundromat_work, t*HZ); } -static struct nfs4_stateowner * -search_close_lru(u32 st_id, int flags) +static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_ol_stateid *stp) { - struct nfs4_stateowner *local = NULL; - - if (flags & CLOSE_STATE) { - list_for_each_entry(local, &close_lru, so_close_lru) { - if (local->so_id == st_id) - return local; - } - } - return NULL; -} - -static inline int -nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stateid *stp) -{ - return fhp->fh_dentry->d_inode != stp->st_file->fi_inode; + if (fhp->fh_dentry->d_inode != stp->st_file->fi_inode) + return nfserr_bad_stateid; + return nfs_ok; } static int STALE_STATEID(stateid_t *stateid) { - if (stateid->si_boot == boot_time) + if (stateid->si_opaque.so_clid.cl_boot == boot_time) return 0; dprintk("NFSD: stale stateid " STATEID_FMT "!\n", STATEID_VAL(stateid)); @@ -3096,7 +3190,7 @@ access_permit_write(unsigned long access_bmap) } static -__be32 nfs4_check_openmode(struct nfs4_stateid *stp, int flags) +__be32 nfs4_check_openmode(struct nfs4_ol_stateid *stp, int flags) { __be32 status = nfserr_openmode; @@ -3139,68 +3233,80 @@ grace_disallows_io(struct inode *inode) return locks_in_grace() && mandatory_lock(inode); } -static int check_stateid_generation(stateid_t *in, stateid_t *ref, int flags) +/* Returns true iff a is later than b: */ +static bool stateid_generation_after(stateid_t *a, stateid_t *b) +{ + return (s32)a->si_generation - (s32)b->si_generation > 0; +} + +static int check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_session) { /* * When sessions are used the stateid generation number is ignored * when it is zero. */ - if ((flags & HAS_SESSION) && in->si_generation == 0) - goto out; + if (has_session && in->si_generation == 0) + return nfs_ok; + + if (in->si_generation == ref->si_generation) + return nfs_ok; /* If the client sends us a stateid from the future, it's buggy: */ - if (in->si_generation > ref->si_generation) + if (stateid_generation_after(in, ref)) return nfserr_bad_stateid; /* - * The following, however, can happen. For example, if the - * client sends an open and some IO at the same time, the open - * may bump si_generation while the IO is still in flight. - * Thanks to hard links and renames, the client never knows what - * file an open will affect. So it could avoid that situation - * only by serializing all opens and IO from the same open - * owner. To recover from the old_stateid error, the client - * will just have to retry the IO: + * However, we could see a stateid from the past, even from a + * non-buggy client. For example, if the client sends a lock + * while some IO is outstanding, the lock may bump si_generation + * while the IO is still in flight. The client could avoid that + * situation by waiting for responses on all the IO requests, + * but better performance may result in retrying IO that + * receives an old_stateid error if requests are rarely + * reordered in flight: */ - if (in->si_generation < ref->si_generation) - return nfserr_old_stateid; -out: - return nfs_ok; + return nfserr_old_stateid; } -static int is_delegation_stateid(stateid_t *stateid) +__be32 nfs4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid) { - return stateid->si_fileid == 0; -} + struct nfs4_stid *s; + struct nfs4_ol_stateid *ols; + __be32 status; -static int is_open_stateid(struct nfs4_stateid *stateid) -{ - return stateid->st_openstp == NULL; + if (STALE_STATEID(stateid)) + return nfserr_stale_stateid; + + s = find_stateid(cl, stateid); + if (!s) + return nfserr_stale_stateid; + status = check_stateid_generation(stateid, &s->sc_stateid, 1); + if (status) + return status; + if (!(s->sc_type & (NFS4_OPEN_STID | NFS4_LOCK_STID))) + return nfs_ok; + ols = openlockstateid(s); + if (ols->st_stateowner->so_is_open_owner + && !(openowner(ols->st_stateowner)->oo_flags & NFS4_OO_CONFIRMED)) + return nfserr_bad_stateid; + return nfs_ok; } -__be32 nfs4_validate_stateid(stateid_t *stateid, int flags) +static __be32 nfsd4_lookup_stateid(stateid_t *stateid, unsigned char typemask, struct nfs4_stid **s) { - struct nfs4_stateid *stp = NULL; - __be32 status = nfserr_stale_stateid; + struct nfs4_client *cl; + if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) + return nfserr_bad_stateid; if (STALE_STATEID(stateid)) - goto out; - - status = nfserr_expired; - stp = search_for_stateid(stateid); - if (!stp) - goto out; - status = nfserr_bad_stateid; - - if (!stp->st_stateowner->so_confirmed) - goto out; - - status = check_stateid_generation(stateid, &stp->st_stateid, flags); - if (status) - goto out; + return nfserr_stale_stateid; + cl = find_confirmed_client(&stateid->si_opaque.so_clid); + if (!cl) + return nfserr_expired; + *s = find_stateid_by_type(cl, stateid, typemask); + if (!*s) + return nfserr_bad_stateid; + return nfs_ok; - status = nfs_ok; -out: - return status; } /* @@ -3210,7 +3316,8 @@ __be32 nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, stateid_t *stateid, int flags, struct file **filpp) { - struct nfs4_stateid *stp = NULL; + struct nfs4_stid *s; + struct nfs4_ol_stateid *stp = NULL; struct nfs4_delegation *dp = NULL; struct svc_fh *current_fh = &cstate->current_fh; struct inode *ino = current_fh->fh_dentry->d_inode; @@ -3222,60 +3329,47 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, if (grace_disallows_io(ino)) return nfserr_grace; - if (nfsd4_has_session(cstate)) - flags |= HAS_SESSION; - if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) return check_special_stateids(current_fh, stateid, flags); - status = nfserr_stale_stateid; - if (STALE_STATEID(stateid)) + status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID, &s); + if (status) + return status; + status = check_stateid_generation(stateid, &s->sc_stateid, nfsd4_has_session(cstate)); + if (status) goto out; - - /* - * We assume that any stateid that has the current boot time, - * but that we can't find, is expired: - */ - status = nfserr_expired; - if (is_delegation_stateid(stateid)) { - dp = find_delegation_stateid(ino, stateid); - if (!dp) - goto out; - status = check_stateid_generation(stateid, &dp->dl_stateid, - flags); - if (status) - goto out; + switch (s->sc_type) { + case NFS4_DELEG_STID: + dp = delegstateid(s); status = nfs4_check_delegmode(dp, flags); if (status) goto out; - renew_client(dp->dl_client); if (filpp) { *filpp = dp->dl_file->fi_deleg_file; BUG_ON(!*filpp); } - } else { /* open or lock stateid */ - stp = find_stateid(stateid, flags); - if (!stp) - goto out; - status = nfserr_bad_stateid; - if (nfs4_check_fh(current_fh, stp)) - goto out; - if (!stp->st_stateowner->so_confirmed) - goto out; - status = check_stateid_generation(stateid, &stp->st_stateid, - flags); + break; + case NFS4_OPEN_STID: + case NFS4_LOCK_STID: + stp = openlockstateid(s); + status = nfs4_check_fh(current_fh, stp); if (status) goto out; + if (stp->st_stateowner->so_is_open_owner + && !(openowner(stp->st_stateowner)->oo_flags & NFS4_OO_CONFIRMED)) + goto out; status = nfs4_check_openmode(stp, flags); if (status) goto out; - renew_client(stp->st_stateowner->so_client); if (filpp) { if (flags & RD_STATE) *filpp = find_readable_file(stp->st_file); else *filpp = find_writeable_file(stp->st_file); } + break; + default: + return nfserr_bad_stateid; } status = nfs_ok; out: @@ -3283,18 +3377,9 @@ out: } static __be32 -nfsd4_free_delegation_stateid(stateid_t *stateid) +nfsd4_free_lock_stateid(struct nfs4_ol_stateid *stp) { - struct nfs4_delegation *dp = search_for_delegation(stateid); - if (dp) - return nfserr_locks_held; - return nfserr_bad_stateid; -} - -static __be32 -nfsd4_free_lock_stateid(struct nfs4_stateid *stp) -{ - if (check_for_locks(stp->st_file, stp->st_stateowner)) + if (check_for_locks(stp->st_file, lockowner(stp->st_stateowner))) return nfserr_locks_held; release_lock_stateid(stp); return nfs_ok; @@ -3307,51 +3392,40 @@ __be32 nfsd4_test_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_test_stateid *test_stateid) { - test_stateid->ts_has_session = nfsd4_has_session(cstate); + /* real work is done during encoding */ return nfs_ok; } -/* - * Free a state id - */ __be32 nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_free_stateid *free_stateid) { stateid_t *stateid = &free_stateid->fr_stateid; - struct nfs4_stateid *stp; - __be32 ret; + struct nfs4_stid *s; + struct nfs4_client *cl = cstate->session->se_client; + __be32 ret = nfserr_bad_stateid; nfs4_lock_state(); - if (is_delegation_stateid(stateid)) { - ret = nfsd4_free_delegation_stateid(stateid); - goto out; - } - - stp = search_for_stateid(stateid); - if (!stp) { - ret = nfserr_bad_stateid; + s = find_stateid(cl, stateid); + if (!s) goto out; - } - if (stateid->si_generation != 0) { - if (stateid->si_generation < stp->st_stateid.si_generation) { - ret = nfserr_old_stateid; - goto out; - } - if (stateid->si_generation > stp->st_stateid.si_generation) { - ret = nfserr_bad_stateid; - goto out; - } - } - - if (is_open_stateid(stp)) { + switch (s->sc_type) { + case NFS4_DELEG_STID: ret = nfserr_locks_held; goto out; - } else { - ret = nfsd4_free_lock_stateid(stp); - goto out; + case NFS4_OPEN_STID: + case NFS4_LOCK_STID: + ret = check_stateid_generation(stateid, &s->sc_stateid, 1); + if (ret) + goto out; + if (s->sc_type == NFS4_LOCK_STID) + ret = nfsd4_free_lock_stateid(openlockstateid(s)); + else + ret = nfserr_locks_held; + break; + default: + ret = nfserr_bad_stateid; } - out: nfs4_unlock_state(); return ret; @@ -3364,124 +3438,64 @@ setlkflg (int type) RD_STATE : WR_STATE; } +static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_t *stateid, u32 seqid, struct nfs4_ol_stateid *stp) +{ + struct svc_fh *current_fh = &cstate->current_fh; + struct nfs4_stateowner *sop = stp->st_stateowner; + __be32 status; + + status = nfsd4_check_seqid(cstate, sop, seqid); + if (status) + return status; + if (stp->st_stid.sc_type == NFS4_CLOSED_STID) + /* + * "Closed" stateid's exist *only* to return + * nfserr_replay_me from the previous step. + */ + return nfserr_bad_stateid; + status = check_stateid_generation(stateid, &stp->st_stid.sc_stateid, nfsd4_has_session(cstate)); + if (status) + return status; + return nfs4_check_fh(current_fh, stp); +} + /* * Checks for sequence id mutating operations. */ static __be32 nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, - stateid_t *stateid, int flags, - struct nfs4_stateowner **sopp, - struct nfs4_stateid **stpp, struct nfsd4_lock *lock) + stateid_t *stateid, char typemask, + struct nfs4_ol_stateid **stpp) { - struct nfs4_stateid *stp; - struct nfs4_stateowner *sop; - struct svc_fh *current_fh = &cstate->current_fh; __be32 status; + struct nfs4_stid *s; dprintk("NFSD: %s: seqid=%d stateid = " STATEID_FMT "\n", __func__, seqid, STATEID_VAL(stateid)); *stpp = NULL; - *sopp = NULL; - - if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) { - dprintk("NFSD: preprocess_seqid_op: magic stateid!\n"); - return nfserr_bad_stateid; - } - - if (STALE_STATEID(stateid)) - return nfserr_stale_stateid; - - if (nfsd4_has_session(cstate)) - flags |= HAS_SESSION; - - /* - * We return BAD_STATEID if filehandle doesn't match stateid, - * the confirmed flag is incorrecly set, or the generation - * number is incorrect. - */ - stp = find_stateid(stateid, flags); - if (stp == NULL) { - /* - * Also, we should make sure this isn't just the result of - * a replayed close: - */ - sop = search_close_lru(stateid->si_stateownerid, flags); - /* It's not stale; let's assume it's expired: */ - if (sop == NULL) - return nfserr_expired; - *sopp = sop; - goto check_replay; - } - - *stpp = stp; - *sopp = sop = stp->st_stateowner; - - if (lock) { - clientid_t *lockclid = &lock->v.new.clientid; - struct nfs4_client *clp = sop->so_client; - int lkflg = 0; - __be32 status; - - lkflg = setlkflg(lock->lk_type); - - if (lock->lk_is_new) { - if (!sop->so_is_open_owner) - return nfserr_bad_stateid; - if (!(flags & HAS_SESSION) && - !same_clid(&clp->cl_clientid, lockclid)) - return nfserr_bad_stateid; - /* stp is the open stateid */ - status = nfs4_check_openmode(stp, lkflg); - if (status) - return status; - } else { - /* stp is the lock stateid */ - status = nfs4_check_openmode(stp->st_openstp, lkflg); - if (status) - return status; - } - } + status = nfsd4_lookup_stateid(stateid, typemask, &s); + if (status) + return status; + *stpp = openlockstateid(s); + cstate->replay_owner = (*stpp)->st_stateowner; - if (nfs4_check_fh(current_fh, stp)) { - dprintk("NFSD: preprocess_seqid_op: fh-stateid mismatch!\n"); - return nfserr_bad_stateid; - } + return nfs4_seqid_op_checks(cstate, stateid, seqid, *stpp); +} - /* - * We now validate the seqid and stateid generation numbers. - * For the moment, we ignore the possibility of - * generation number wraparound. - */ - if (!(flags & HAS_SESSION) && seqid != sop->so_seqid) - goto check_replay; +static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, stateid_t *stateid, struct nfs4_ol_stateid **stpp) +{ + __be32 status; + struct nfs4_openowner *oo; - if (sop->so_confirmed && flags & CONFIRM) { - dprintk("NFSD: preprocess_seqid_op: expected" - " unconfirmed stateowner!\n"); - return nfserr_bad_stateid; - } - if (!sop->so_confirmed && !(flags & CONFIRM)) { - dprintk("NFSD: preprocess_seqid_op: stateowner not" - " confirmed yet!\n"); - return nfserr_bad_stateid; - } - status = check_stateid_generation(stateid, &stp->st_stateid, flags); + status = nfs4_preprocess_seqid_op(cstate, seqid, stateid, + NFS4_OPEN_STID, stpp); if (status) return status; - renew_client(sop->so_client); + oo = openowner((*stpp)->st_stateowner); + if (!(oo->oo_flags & NFS4_OO_CONFIRMED)) + return nfserr_bad_stateid; return nfs_ok; - -check_replay: - if (seqid == sop->so_seqid - 1) { - dprintk("NFSD: preprocess_seqid_op: retransmission?\n"); - /* indicate replay to calling function */ - return nfserr_replay_me; - } - dprintk("NFSD: preprocess_seqid_op: bad seqid (expected %d, got %d)\n", - sop->so_seqid, seqid); - *sopp = NULL; - return nfserr_bad_seqid; } __be32 @@ -3489,8 +3503,8 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_open_confirm *oc) { __be32 status; - struct nfs4_stateowner *sop; - struct nfs4_stateid *stp; + struct nfs4_openowner *oo; + struct nfs4_ol_stateid *stp; dprintk("NFSD: nfsd4_open_confirm on file %.*s\n", (int)cstate->current_fh.fh_dentry->d_name.len, @@ -3502,38 +3516,52 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfs4_lock_state(); - if ((status = nfs4_preprocess_seqid_op(cstate, + status = nfs4_preprocess_seqid_op(cstate, oc->oc_seqid, &oc->oc_req_stateid, - CONFIRM | OPEN_STATE, - &oc->oc_stateowner, &stp, NULL))) - goto out; - - sop = oc->oc_stateowner; - sop->so_confirmed = 1; - update_stateid(&stp->st_stateid); - memcpy(&oc->oc_resp_stateid, &stp->st_stateid, sizeof(stateid_t)); + NFS4_OPEN_STID, &stp); + if (status) + goto out; + oo = openowner(stp->st_stateowner); + status = nfserr_bad_stateid; + if (oo->oo_flags & NFS4_OO_CONFIRMED) + goto out; + oo->oo_flags |= NFS4_OO_CONFIRMED; + update_stateid(&stp->st_stid.sc_stateid); + memcpy(&oc->oc_resp_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); dprintk("NFSD: %s: success, seqid=%d stateid=" STATEID_FMT "\n", - __func__, oc->oc_seqid, STATEID_VAL(&stp->st_stateid)); + __func__, oc->oc_seqid, STATEID_VAL(&stp->st_stid.sc_stateid)); - nfsd4_create_clid_dir(sop->so_client); + nfsd4_create_clid_dir(oo->oo_owner.so_client); + status = nfs_ok; out: - if (oc->oc_stateowner) { - nfs4_get_stateowner(oc->oc_stateowner); - cstate->replay_owner = oc->oc_stateowner; - } - nfs4_unlock_state(); + if (!cstate->replay_owner) + nfs4_unlock_state(); return status; } -static inline void nfs4_file_downgrade(struct nfs4_stateid *stp, unsigned int to_access) +static inline void nfs4_stateid_downgrade_bit(struct nfs4_ol_stateid *stp, u32 access) { - int i; + if (!test_bit(access, &stp->st_access_bmap)) + return; + nfs4_file_put_access(stp->st_file, nfs4_access_to_omode(access)); + __clear_bit(access, &stp->st_access_bmap); +} - for (i = 1; i < 4; i++) { - if (test_bit(i, &stp->st_access_bmap) && !(i & to_access)) { - nfs4_file_put_access(stp->st_file, i); - __clear_bit(i, &stp->st_access_bmap); - } +static inline void nfs4_stateid_downgrade(struct nfs4_ol_stateid *stp, u32 to_access) +{ + switch (to_access) { + case NFS4_SHARE_ACCESS_READ: + nfs4_stateid_downgrade_bit(stp, NFS4_SHARE_ACCESS_WRITE); + nfs4_stateid_downgrade_bit(stp, NFS4_SHARE_ACCESS_BOTH); + break; + case NFS4_SHARE_ACCESS_WRITE: + nfs4_stateid_downgrade_bit(stp, NFS4_SHARE_ACCESS_READ); + nfs4_stateid_downgrade_bit(stp, NFS4_SHARE_ACCESS_BOTH); + break; + case NFS4_SHARE_ACCESS_BOTH: + break; + default: + BUG(); } } @@ -3553,24 +3581,20 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp, struct nfsd4_open_downgrade *od) { __be32 status; - struct nfs4_stateid *stp; + struct nfs4_ol_stateid *stp; dprintk("NFSD: nfsd4_open_downgrade on file %.*s\n", (int)cstate->current_fh.fh_dentry->d_name.len, cstate->current_fh.fh_dentry->d_name.name); - if (!access_valid(od->od_share_access, cstate->minorversion) - || !deny_valid(od->od_share_deny)) - return nfserr_inval; + /* We don't yet support WANT bits: */ + od->od_share_access &= NFS4_SHARE_ACCESS_MASK; nfs4_lock_state(); - if ((status = nfs4_preprocess_seqid_op(cstate, - od->od_seqid, - &od->od_stateid, - OPEN_STATE, - &od->od_stateowner, &stp, NULL))) + status = nfs4_preprocess_confirmed_seqid_op(cstate, od->od_seqid, + &od->od_stateid, &stp); + if (status) goto out; - status = nfserr_inval; if (!test_bit(od->od_share_access, &stp->st_access_bmap)) { dprintk("NFSD:access not a subset current bitmap: 0x%lx, input access=%08x\n", @@ -3582,22 +3606,45 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp, stp->st_deny_bmap, od->od_share_deny); goto out; } - nfs4_file_downgrade(stp, od->od_share_access); + nfs4_stateid_downgrade(stp, od->od_share_access); reset_union_bmap_deny(od->od_share_deny, &stp->st_deny_bmap); - update_stateid(&stp->st_stateid); - memcpy(&od->od_stateid, &stp->st_stateid, sizeof(stateid_t)); + update_stateid(&stp->st_stid.sc_stateid); + memcpy(&od->od_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); status = nfs_ok; out: - if (od->od_stateowner) { - nfs4_get_stateowner(od->od_stateowner); - cstate->replay_owner = od->od_stateowner; - } - nfs4_unlock_state(); + if (!cstate->replay_owner) + nfs4_unlock_state(); return status; } +void nfsd4_purge_closed_stateid(struct nfs4_stateowner *so) +{ + struct nfs4_openowner *oo; + struct nfs4_ol_stateid *s; + + if (!so->so_is_open_owner) + return; + oo = openowner(so); + s = oo->oo_last_closed_stid; + if (!s) + return; + if (!(oo->oo_flags & NFS4_OO_PURGE_CLOSE)) { + /* Release the last_closed_stid on the next seqid bump: */ + oo->oo_flags |= NFS4_OO_PURGE_CLOSE; + return; + } + oo->oo_flags &= ~NFS4_OO_PURGE_CLOSE; + release_last_closed_stateid(oo); +} + +static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s) +{ + unhash_open_stateid(s); + s->st_stid.sc_type = NFS4_CLOSED_STID; +} + /* * nfs4_unlock_state() called after encode */ @@ -3606,39 +3653,37 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_close *close) { __be32 status; - struct nfs4_stateid *stp; + struct nfs4_openowner *oo; + struct nfs4_ol_stateid *stp; dprintk("NFSD: nfsd4_close on file %.*s\n", (int)cstate->current_fh.fh_dentry->d_name.len, cstate->current_fh.fh_dentry->d_name.name); nfs4_lock_state(); - /* check close_lru for replay */ - if ((status = nfs4_preprocess_seqid_op(cstate, - close->cl_seqid, - &close->cl_stateid, - OPEN_STATE | CLOSE_STATE, - &close->cl_stateowner, &stp, NULL))) + status = nfs4_preprocess_seqid_op(cstate, close->cl_seqid, + &close->cl_stateid, + NFS4_OPEN_STID|NFS4_CLOSED_STID, + &stp); + if (status) goto out; + oo = openowner(stp->st_stateowner); status = nfs_ok; - update_stateid(&stp->st_stateid); - memcpy(&close->cl_stateid, &stp->st_stateid, sizeof(stateid_t)); + update_stateid(&stp->st_stid.sc_stateid); + memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); - /* release_stateid() calls nfsd_close() if needed */ - release_open_stateid(stp); + nfsd4_close_open_stateid(stp); + oo->oo_last_closed_stid = stp; /* place unused nfs4_stateowners on so_close_lru list to be * released by the laundromat service after the lease period * to enable us to handle CLOSE replay */ - if (list_empty(&close->cl_stateowner->so_stateids)) - move_to_close_lru(close->cl_stateowner); + if (list_empty(&oo->oo_owner.so_stateids)) + move_to_close_lru(oo); out: - if (close->cl_stateowner) { - nfs4_get_stateowner(close->cl_stateowner); - cstate->replay_owner = close->cl_stateowner; - } - nfs4_unlock_state(); + if (!cstate->replay_owner) + nfs4_unlock_state(); return status; } @@ -3648,34 +3693,22 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, { struct nfs4_delegation *dp; stateid_t *stateid = &dr->dr_stateid; + struct nfs4_stid *s; struct inode *inode; __be32 status; - int flags = 0; if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) return status; inode = cstate->current_fh.fh_dentry->d_inode; - if (nfsd4_has_session(cstate)) - flags |= HAS_SESSION; nfs4_lock_state(); - status = nfserr_bad_stateid; - if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) - goto out; - status = nfserr_stale_stateid; - if (STALE_STATEID(stateid)) - goto out; - status = nfserr_bad_stateid; - if (!is_delegation_stateid(stateid)) - goto out; - status = nfserr_expired; - dp = find_delegation_stateid(inode, stateid); - if (!dp) + status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID, &s); + if (status) goto out; - status = check_stateid_generation(stateid, &dp->dl_stateid, flags); + dp = delegstateid(s); + status = check_stateid_generation(stateid, &dp->dl_stid.sc_stateid, nfsd4_has_session(cstate)); if (status) goto out; - renew_client(dp->dl_client); unhash_delegation(dp); out: @@ -3713,9 +3746,6 @@ last_byte_offset(u64 start, u64 len) return end > start ? end - 1: NFS4_MAX_UINT64; } -#define lockownerid_hashval(id) \ - ((id) & LOCK_HASH_MASK) - static inline unsigned int lock_ownerstr_hashval(struct inode *inode, u32 cl_id, struct xdr_netobj *ownername) @@ -3725,101 +3755,7 @@ lock_ownerstr_hashval(struct inode *inode, u32 cl_id, & LOCK_HASH_MASK; } -static struct list_head lock_ownerid_hashtbl[LOCK_HASH_SIZE]; static struct list_head lock_ownerstr_hashtbl[LOCK_HASH_SIZE]; -static struct list_head lockstateid_hashtbl[STATEID_HASH_SIZE]; - -static int -same_stateid(stateid_t *id_one, stateid_t *id_two) -{ - if (id_one->si_stateownerid != id_two->si_stateownerid) - return 0; - return id_one->si_fileid == id_two->si_fileid; -} - -static struct nfs4_stateid * -find_stateid(stateid_t *stid, int flags) -{ - struct nfs4_stateid *local; - u32 st_id = stid->si_stateownerid; - u32 f_id = stid->si_fileid; - unsigned int hashval; - - dprintk("NFSD: find_stateid flags 0x%x\n",flags); - if (flags & (LOCK_STATE | RD_STATE | WR_STATE)) { - hashval = stateid_hashval(st_id, f_id); - list_for_each_entry(local, &lockstateid_hashtbl[hashval], st_hash) { - if ((local->st_stateid.si_stateownerid == st_id) && - (local->st_stateid.si_fileid == f_id)) - return local; - } - } - - if (flags & (OPEN_STATE | RD_STATE | WR_STATE)) { - hashval = stateid_hashval(st_id, f_id); - list_for_each_entry(local, &stateid_hashtbl[hashval], st_hash) { - if ((local->st_stateid.si_stateownerid == st_id) && - (local->st_stateid.si_fileid == f_id)) - return local; - } - } - return NULL; -} - -static struct nfs4_stateid * -search_for_stateid(stateid_t *stid) -{ - struct nfs4_stateid *local; - unsigned int hashval = stateid_hashval(stid->si_stateownerid, stid->si_fileid); - - list_for_each_entry(local, &lockstateid_hashtbl[hashval], st_hash) { - if (same_stateid(&local->st_stateid, stid)) - return local; - } - - list_for_each_entry(local, &stateid_hashtbl[hashval], st_hash) { - if (same_stateid(&local->st_stateid, stid)) - return local; - } - return NULL; -} - -static struct nfs4_delegation * -search_for_delegation(stateid_t *stid) -{ - struct nfs4_file *fp; - struct nfs4_delegation *dp; - struct list_head *pos; - int i; - - for (i = 0; i < FILE_HASH_SIZE; i++) { - list_for_each_entry(fp, &file_hashtbl[i], fi_hash) { - list_for_each(pos, &fp->fi_delegations) { - dp = list_entry(pos, struct nfs4_delegation, dl_perfile); - if (same_stateid(&dp->dl_stateid, stid)) - return dp; - } - } - } - return NULL; -} - -static struct nfs4_delegation * -find_delegation_stateid(struct inode *ino, stateid_t *stid) -{ - struct nfs4_file *fp; - struct nfs4_delegation *dl; - - dprintk("NFSD: %s: stateid=" STATEID_FMT "\n", __func__, - STATEID_VAL(stid)); - - fp = find_file(ino); - if (!fp) - return NULL; - dl = find_delegation_file(fp, stid); - put_nfs4_file(fp); - return dl; -} /* * TODO: Linux file offsets are _signed_ 64-bit quantities, which means that @@ -3846,15 +3782,21 @@ static const struct lock_manager_operations nfsd_posix_mng_ops = { static inline void nfs4_set_lock_denied(struct file_lock *fl, struct nfsd4_lock_denied *deny) { - struct nfs4_stateowner *sop; + struct nfs4_lockowner *lo; if (fl->fl_lmops == &nfsd_posix_mng_ops) { - sop = (struct nfs4_stateowner *) fl->fl_owner; - kref_get(&sop->so_ref); - deny->ld_sop = sop; - deny->ld_clientid = sop->so_client->cl_clientid; + lo = (struct nfs4_lockowner *) fl->fl_owner; + deny->ld_owner.data = kmemdup(lo->lo_owner.so_owner.data, + lo->lo_owner.so_owner.len, GFP_KERNEL); + if (!deny->ld_owner.data) + /* We just don't care that much */ + goto nevermind; + deny->ld_owner.len = lo->lo_owner.so_owner.len; + deny->ld_clientid = lo->lo_owner.so_client->cl_clientid; } else { - deny->ld_sop = NULL; +nevermind: + deny->ld_owner.len = 0; + deny->ld_owner.data = NULL; deny->ld_clientid.cl_boot = 0; deny->ld_clientid.cl_id = 0; } @@ -3867,8 +3809,8 @@ nfs4_set_lock_denied(struct file_lock *fl, struct nfsd4_lock_denied *deny) deny->ld_type = NFS4_WRITE_LT; } -static struct nfs4_stateowner * -find_lockstateowner_str(struct inode *inode, clientid_t *clid, +static struct nfs4_lockowner * +find_lockowner_str(struct inode *inode, clientid_t *clid, struct xdr_netobj *owner) { unsigned int hashval = lock_ownerstr_hashval(inode, clid->cl_id, owner); @@ -3876,11 +3818,17 @@ find_lockstateowner_str(struct inode *inode, clientid_t *clid, list_for_each_entry(op, &lock_ownerstr_hashtbl[hashval], so_strhash) { if (same_owner_str(op, owner, clid)) - return op; + return lockowner(op); } return NULL; } +static void hash_lockowner(struct nfs4_lockowner *lo, unsigned int strhashval, struct nfs4_client *clp, struct nfs4_ol_stateid *open_stp) +{ + list_add(&lo->lo_owner.so_strhash, &lock_ownerstr_hashtbl[strhashval]); + list_add(&lo->lo_perstateid, &open_stp->st_lockowners); +} + /* * Alloc a lock owner structure. * Called in nfsd4_lock - therefore, OPEN and OPEN_CONFIRM (if needed) has @@ -3889,67 +3837,40 @@ find_lockstateowner_str(struct inode *inode, clientid_t *clid, * strhashval = lock_ownerstr_hashval */ -static struct nfs4_stateowner * -alloc_init_lock_stateowner(unsigned int strhashval, struct nfs4_client *clp, struct nfs4_stateid *open_stp, struct nfsd4_lock *lock) { - struct nfs4_stateowner *sop; - struct nfs4_replay *rp; - unsigned int idhashval; +static struct nfs4_lockowner * +alloc_init_lock_stateowner(unsigned int strhashval, struct nfs4_client *clp, struct nfs4_ol_stateid *open_stp, struct nfsd4_lock *lock) { + struct nfs4_lockowner *lo; - if (!(sop = alloc_stateowner(&lock->lk_new_owner))) + lo = alloc_stateowner(lockowner_slab, &lock->lk_new_owner, clp); + if (!lo) return NULL; - idhashval = lockownerid_hashval(current_ownerid); - INIT_LIST_HEAD(&sop->so_idhash); - INIT_LIST_HEAD(&sop->so_strhash); - INIT_LIST_HEAD(&sop->so_perclient); - INIT_LIST_HEAD(&sop->so_stateids); - INIT_LIST_HEAD(&sop->so_perstateid); - INIT_LIST_HEAD(&sop->so_close_lru); /* not used */ - sop->so_time = 0; - list_add(&sop->so_idhash, &lock_ownerid_hashtbl[idhashval]); - list_add(&sop->so_strhash, &lock_ownerstr_hashtbl[strhashval]); - list_add(&sop->so_perstateid, &open_stp->st_lockowners); - sop->so_is_open_owner = 0; - sop->so_id = current_ownerid++; - sop->so_client = clp; + INIT_LIST_HEAD(&lo->lo_owner.so_stateids); + lo->lo_owner.so_is_open_owner = 0; /* It is the openowner seqid that will be incremented in encode in the * case of new lockowners; so increment the lock seqid manually: */ - sop->so_seqid = lock->lk_new_lock_seqid + 1; - sop->so_confirmed = 1; - rp = &sop->so_replay; - rp->rp_status = nfserr_serverfault; - rp->rp_buflen = 0; - rp->rp_buf = rp->rp_ibuf; - return sop; + lo->lo_owner.so_seqid = lock->lk_new_lock_seqid + 1; + hash_lockowner(lo, strhashval, clp, open_stp); + return lo; } -static struct nfs4_stateid * -alloc_init_lock_stateid(struct nfs4_stateowner *sop, struct nfs4_file *fp, struct nfs4_stateid *open_stp) +static struct nfs4_ol_stateid * +alloc_init_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fp, struct nfs4_ol_stateid *open_stp) { - struct nfs4_stateid *stp; - unsigned int hashval = stateid_hashval(sop->so_id, fp->fi_id); + struct nfs4_ol_stateid *stp; + struct nfs4_client *clp = lo->lo_owner.so_client; - stp = nfs4_alloc_stateid(); + stp = nfs4_alloc_stateid(clp); if (stp == NULL) - goto out; - INIT_LIST_HEAD(&stp->st_hash); - INIT_LIST_HEAD(&stp->st_perfile); - INIT_LIST_HEAD(&stp->st_perstateowner); - INIT_LIST_HEAD(&stp->st_lockowners); /* not used */ - list_add(&stp->st_hash, &lockstateid_hashtbl[hashval]); + return NULL; + init_stid(&stp->st_stid, clp, NFS4_LOCK_STID); list_add(&stp->st_perfile, &fp->fi_stateids); - list_add(&stp->st_perstateowner, &sop->so_stateids); - stp->st_stateowner = sop; + list_add(&stp->st_perstateowner, &lo->lo_owner.so_stateids); + stp->st_stateowner = &lo->lo_owner; get_nfs4_file(fp); stp->st_file = fp; - stp->st_stateid.si_boot = boot_time; - stp->st_stateid.si_stateownerid = sop->so_id; - stp->st_stateid.si_fileid = fp->fi_id; - stp->st_stateid.si_generation = 0; stp->st_access_bmap = 0; stp->st_deny_bmap = open_stp->st_deny_bmap; stp->st_openstp = open_stp; - -out: return stp; } @@ -3960,7 +3881,7 @@ check_lock_length(u64 offset, u64 length) LOFF_OVERFLOW(offset, length))); } -static void get_lock_access(struct nfs4_stateid *lock_stp, u32 access) +static void get_lock_access(struct nfs4_ol_stateid *lock_stp, u32 access) { struct nfs4_file *fp = lock_stp->st_file; int oflag = nfs4_access_to_omode(access); @@ -3978,15 +3899,16 @@ __be32 nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_lock *lock) { - struct nfs4_stateowner *open_sop = NULL; - struct nfs4_stateowner *lock_sop = NULL; - struct nfs4_stateid *lock_stp; + struct nfs4_openowner *open_sop = NULL; + struct nfs4_lockowner *lock_sop = NULL; + struct nfs4_ol_stateid *lock_stp; struct nfs4_file *fp; struct file *filp = NULL; struct file_lock file_lock; struct file_lock conflock; __be32 status = 0; unsigned int strhashval; + int lkflg; int err; dprintk("NFSD: nfsd4_lock: start=%Ld length=%Ld\n", @@ -4010,7 +3932,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * Use open owner and open stateid to create lock owner and * lock stateid. */ - struct nfs4_stateid *open_stp = NULL; + struct nfs4_ol_stateid *open_stp = NULL; status = nfserr_stale_clientid; if (!nfsd4_has_session(cstate) && @@ -4018,26 +3940,29 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; /* validate and update open stateid and open seqid */ - status = nfs4_preprocess_seqid_op(cstate, + status = nfs4_preprocess_confirmed_seqid_op(cstate, lock->lk_new_open_seqid, &lock->lk_new_open_stateid, - OPEN_STATE, - &lock->lk_replay_owner, &open_stp, - lock); + &open_stp); if (status) goto out; - open_sop = lock->lk_replay_owner; + open_sop = openowner(open_stp->st_stateowner); + status = nfserr_bad_stateid; + if (!nfsd4_has_session(cstate) && + !same_clid(&open_sop->oo_owner.so_client->cl_clientid, + &lock->v.new.clientid)) + goto out; /* create lockowner and lock stateid */ fp = open_stp->st_file; - strhashval = lock_ownerstr_hashval(fp->fi_inode, - open_sop->so_client->cl_clientid.cl_id, + strhashval = lock_ownerstr_hashval(fp->fi_inode, + open_sop->oo_owner.so_client->cl_clientid.cl_id, &lock->v.new.owner); /* XXX: Do we need to check for duplicate stateowners on * the same file, or should they just be allowed (and * create new stateids)? */ - status = nfserr_resource; + status = nfserr_jukebox; lock_sop = alloc_init_lock_stateowner(strhashval, - open_sop->so_client, open_stp, lock); + open_sop->oo_owner.so_client, open_stp, lock); if (lock_sop == NULL) goto out; lock_stp = alloc_init_lock_stateid(lock_sop, fp, open_stp); @@ -4046,16 +3971,20 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, } else { /* lock (lock owner + lock stateid) already exists */ status = nfs4_preprocess_seqid_op(cstate, - lock->lk_old_lock_seqid, - &lock->lk_old_lock_stateid, - LOCK_STATE, - &lock->lk_replay_owner, &lock_stp, lock); + lock->lk_old_lock_seqid, + &lock->lk_old_lock_stateid, + NFS4_LOCK_STID, &lock_stp); if (status) goto out; - lock_sop = lock->lk_replay_owner; + lock_sop = lockowner(lock_stp->st_stateowner); fp = lock_stp->st_file; } - /* lock->lk_replay_owner and lock_stp have been created or found */ + /* lock_sop and lock_stp have been created or found */ + + lkflg = setlkflg(lock->lk_type); + status = nfs4_check_openmode(lock_stp, lkflg); + if (status) + goto out; status = nfserr_grace; if (locks_in_grace() && !lock->lk_reclaim) @@ -4106,8 +4035,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, err = vfs_lock_file(filp, F_SETLK, &file_lock, &conflock); switch (-err) { case 0: /* success! */ - update_stateid(&lock_stp->st_stateid); - memcpy(&lock->lk_resp_stateid, &lock_stp->st_stateid, + update_stateid(&lock_stp->st_stid.sc_stateid); + memcpy(&lock->lk_resp_stateid, &lock_stp->st_stid.sc_stateid, sizeof(stateid_t)); status = 0; break; @@ -4119,19 +4048,16 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, case (EDEADLK): status = nfserr_deadlock; break; - default: + default: dprintk("NFSD: nfsd4_lock: vfs_lock_file() failed! status %d\n",err); - status = nfserr_resource; + status = nfserrno(err); break; } out: if (status && lock->lk_is_new && lock_sop) release_lockowner(lock_sop); - if (lock->lk_replay_owner) { - nfs4_get_stateowner(lock->lk_replay_owner); - cstate->replay_owner = lock->lk_replay_owner; - } - nfs4_unlock_state(); + if (!cstate->replay_owner) + nfs4_unlock_state(); return status; } @@ -4163,6 +4089,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, { struct inode *inode; struct file_lock file_lock; + struct nfs4_lockowner *lo; int error; __be32 status; @@ -4172,19 +4099,14 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (check_lock_length(lockt->lt_offset, lockt->lt_length)) return nfserr_inval; - lockt->lt_stateowner = NULL; nfs4_lock_state(); status = nfserr_stale_clientid; if (!nfsd4_has_session(cstate) && STALE_CLIENTID(&lockt->lt_clientid)) goto out; - if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) { - dprintk("NFSD: nfsd4_lockt: fh_verify() failed!\n"); - if (status == nfserr_symlink) - status = nfserr_inval; + if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) goto out; - } inode = cstate->current_fh.fh_dentry->d_inode; locks_init_lock(&file_lock); @@ -4203,10 +4125,9 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; } - lockt->lt_stateowner = find_lockstateowner_str(inode, - &lockt->lt_clientid, &lockt->lt_owner); - if (lockt->lt_stateowner) - file_lock.fl_owner = (fl_owner_t)lockt->lt_stateowner; + lo = find_lockowner_str(inode, &lockt->lt_clientid, &lockt->lt_owner); + if (lo) + file_lock.fl_owner = (fl_owner_t)lo; file_lock.fl_pid = current->tgid; file_lock.fl_flags = FL_POSIX; @@ -4234,7 +4155,7 @@ __be32 nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_locku *locku) { - struct nfs4_stateid *stp; + struct nfs4_ol_stateid *stp; struct file *filp = NULL; struct file_lock file_lock; __be32 status; @@ -4249,13 +4170,10 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfs4_lock_state(); - if ((status = nfs4_preprocess_seqid_op(cstate, - locku->lu_seqid, - &locku->lu_stateid, - LOCK_STATE, - &locku->lu_stateowner, &stp, NULL))) + status = nfs4_preprocess_seqid_op(cstate, locku->lu_seqid, + &locku->lu_stateid, NFS4_LOCK_STID, &stp); + if (status) goto out; - filp = find_any_file(stp->st_file); if (!filp) { status = nfserr_lock_range; @@ -4264,7 +4182,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, BUG_ON(!filp); locks_init_lock(&file_lock); file_lock.fl_type = F_UNLCK; - file_lock.fl_owner = (fl_owner_t) locku->lu_stateowner; + file_lock.fl_owner = (fl_owner_t)lockowner(stp->st_stateowner); file_lock.fl_pid = current->tgid; file_lock.fl_file = filp; file_lock.fl_flags = FL_POSIX; @@ -4285,15 +4203,12 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, /* * OK, unlock succeeded; the only thing left to do is update the stateid. */ - update_stateid(&stp->st_stateid); - memcpy(&locku->lu_stateid, &stp->st_stateid, sizeof(stateid_t)); + update_stateid(&stp->st_stid.sc_stateid); + memcpy(&locku->lu_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); out: - if (locku->lu_stateowner) { - nfs4_get_stateowner(locku->lu_stateowner); - cstate->replay_owner = locku->lu_stateowner; - } - nfs4_unlock_state(); + if (!cstate->replay_owner) + nfs4_unlock_state(); return status; out_nfserr: @@ -4307,7 +4222,7 @@ out_nfserr: * 0: no locks held by lockowner */ static int -check_for_locks(struct nfs4_file *filp, struct nfs4_stateowner *lowner) +check_for_locks(struct nfs4_file *filp, struct nfs4_lockowner *lowner) { struct file_lock **flpp; struct inode *inode = filp->fi_inode; @@ -4332,7 +4247,8 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, { clientid_t *clid = &rlockowner->rl_clientid; struct nfs4_stateowner *sop; - struct nfs4_stateid *stp; + struct nfs4_lockowner *lo; + struct nfs4_ol_stateid *stp; struct xdr_netobj *owner = &rlockowner->rl_owner; struct list_head matches; int i; @@ -4356,16 +4272,15 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, * data structures. */ INIT_LIST_HEAD(&matches); for (i = 0; i < LOCK_HASH_SIZE; i++) { - list_for_each_entry(sop, &lock_ownerid_hashtbl[i], so_idhash) { + list_for_each_entry(sop, &lock_ownerstr_hashtbl[i], so_strhash) { if (!same_owner_str(sop, owner, clid)) continue; list_for_each_entry(stp, &sop->so_stateids, st_perstateowner) { - if (check_for_locks(stp->st_file, sop)) + lo = lockowner(sop); + if (check_for_locks(stp->st_file, lo)) goto out; - /* Note: so_perclient unused for lockowners, - * so it's OK to fool with here. */ - list_add(&sop->so_perclient, &matches); + list_add(&lo->lo_list, &matches); } } } @@ -4374,12 +4289,12 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, * have been checked. */ status = nfs_ok; while (!list_empty(&matches)) { - sop = list_entry(matches.next, struct nfs4_stateowner, - so_perclient); + lo = list_entry(matches.next, struct nfs4_lockowner, + lo_list); /* unhash_stateowner deletes so_perclient only * for openowners. */ - list_del(&sop->so_perclient); - release_lockowner(sop); + list_del(&lo->lo_list); + release_lockowner(lo); } out: nfs4_unlock_state(); @@ -4501,16 +4416,10 @@ nfs4_state_init(void) for (i = 0; i < FILE_HASH_SIZE; i++) { INIT_LIST_HEAD(&file_hashtbl[i]); } - for (i = 0; i < OWNER_HASH_SIZE; i++) { - INIT_LIST_HEAD(&ownerstr_hashtbl[i]); - INIT_LIST_HEAD(&ownerid_hashtbl[i]); - } - for (i = 0; i < STATEID_HASH_SIZE; i++) { - INIT_LIST_HEAD(&stateid_hashtbl[i]); - INIT_LIST_HEAD(&lockstateid_hashtbl[i]); + for (i = 0; i < OPEN_OWNER_HASH_SIZE; i++) { + INIT_LIST_HEAD(&open_ownerstr_hashtbl[i]); } for (i = 0; i < LOCK_HASH_SIZE; i++) { - INIT_LIST_HEAD(&lock_ownerid_hashtbl[i]); INIT_LIST_HEAD(&lock_ownerstr_hashtbl[i]); } memset(&onestateid, ~0, sizeof(stateid_t)); @@ -4527,7 +4436,7 @@ nfsd4_load_reboot_recovery_data(void) int status; nfs4_lock_state(); - nfsd4_init_recdir(user_recovery_dirname); + nfsd4_init_recdir(); status = nfsd4_recdir_load(); nfs4_unlock_state(); if (status) @@ -4636,40 +4545,3 @@ nfs4_state_shutdown(void) nfs4_unlock_state(); nfsd4_destroy_callback_queue(); } - -/* - * user_recovery_dirname is protected by the nfsd_mutex since it's only - * accessed when nfsd is starting. - */ -static void -nfs4_set_recdir(char *recdir) -{ - strcpy(user_recovery_dirname, recdir); -} - -/* - * Change the NFSv4 recovery directory to recdir. - */ -int -nfs4_reset_recoverydir(char *recdir) -{ - int status; - struct path path; - - status = kern_path(recdir, LOOKUP_FOLLOW, &path); - if (status) - return status; - status = -ENOTDIR; - if (S_ISDIR(path.dentry->d_inode->i_mode)) { - nfs4_set_recdir(recdir); - status = 0; - } - path_put(&path); - return status; -} - -char * -nfs4_recoverydir(void) -{ - return user_recovery_dirname; -} diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index c8bf405d19d..66d095d7955 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -456,7 +456,6 @@ nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close) { DECODE_HEAD; - close->cl_stateowner = NULL; READ_BUF(4); READ32(close->cl_seqid); return nfsd4_decode_stateid(argp, &close->cl_stateid); @@ -551,7 +550,6 @@ nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock) { DECODE_HEAD; - lock->lk_replay_owner = NULL; /* * type, reclaim(boolean), offset, length, new_lock_owner(boolean) */ @@ -611,7 +609,6 @@ nfsd4_decode_locku(struct nfsd4_compoundargs *argp, struct nfsd4_locku *locku) { DECODE_HEAD; - locku->lu_stateowner = NULL; READ_BUF(8); READ32(locku->lu_type); if ((locku->lu_type < NFS4_READ_LT) || (locku->lu_type > NFS4_WRITEW_LT)) @@ -642,6 +639,83 @@ nfsd4_decode_lookup(struct nfsd4_compoundargs *argp, struct nfsd4_lookup *lookup DECODE_TAIL; } +static __be32 nfsd4_decode_share_access(struct nfsd4_compoundargs *argp, u32 *x) +{ + __be32 *p; + u32 w; + + READ_BUF(4); + READ32(w); + *x = w; + switch (w & NFS4_SHARE_ACCESS_MASK) { + case NFS4_SHARE_ACCESS_READ: + case NFS4_SHARE_ACCESS_WRITE: + case NFS4_SHARE_ACCESS_BOTH: + break; + default: + return nfserr_bad_xdr; + } + w &= !NFS4_SHARE_ACCESS_MASK; + if (!w) + return nfs_ok; + if (!argp->minorversion) + return nfserr_bad_xdr; + switch (w & NFS4_SHARE_WANT_MASK) { + case NFS4_SHARE_WANT_NO_PREFERENCE: + case NFS4_SHARE_WANT_READ_DELEG: + case NFS4_SHARE_WANT_WRITE_DELEG: + case NFS4_SHARE_WANT_ANY_DELEG: + case NFS4_SHARE_WANT_NO_DELEG: + case NFS4_SHARE_WANT_CANCEL: + break; + default: + return nfserr_bad_xdr; + } + w &= ~NFS4_SHARE_WANT_MASK; + if (!w) + return nfs_ok; + switch (w) { + case NFS4_SHARE_SIGNAL_DELEG_WHEN_RESRC_AVAIL: + case NFS4_SHARE_PUSH_DELEG_WHEN_UNCONTENDED: + case (NFS4_SHARE_SIGNAL_DELEG_WHEN_RESRC_AVAIL | + NFS4_SHARE_PUSH_DELEG_WHEN_UNCONTENDED): + return nfs_ok; + } +xdr_error: + return nfserr_bad_xdr; +} + +static __be32 nfsd4_decode_share_deny(struct nfsd4_compoundargs *argp, u32 *x) +{ + __be32 *p; + + READ_BUF(4); + READ32(*x); + /* Note: unlinke access bits, deny bits may be zero. */ + if (*x & ~NFS4_SHARE_DENY_BOTH) + return nfserr_bad_xdr; + return nfs_ok; +xdr_error: + return nfserr_bad_xdr; +} + +static __be32 nfsd4_decode_opaque(struct nfsd4_compoundargs *argp, struct xdr_netobj *o) +{ + __be32 *p; + + READ_BUF(4); + READ32(o->len); + + if (o->len == 0 || o->len > NFS4_OPAQUE_LIMIT) + return nfserr_bad_xdr; + + READ_BUF(o->len); + SAVEMEM(o->data, o->len); + return nfs_ok; +xdr_error: + return nfserr_bad_xdr; +} + static __be32 nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) { @@ -649,19 +723,23 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) memset(open->op_bmval, 0, sizeof(open->op_bmval)); open->op_iattr.ia_valid = 0; - open->op_stateowner = NULL; + open->op_openowner = NULL; /* seqid, share_access, share_deny, clientid, ownerlen */ - READ_BUF(16 + sizeof(clientid_t)); + READ_BUF(4); READ32(open->op_seqid); - READ32(open->op_share_access); - READ32(open->op_share_deny); + status = nfsd4_decode_share_access(argp, &open->op_share_access); + if (status) + goto xdr_error; + status = nfsd4_decode_share_deny(argp, &open->op_share_deny); + if (status) + goto xdr_error; + READ_BUF(sizeof(clientid_t)); COPYMEM(&open->op_clientid, sizeof(clientid_t)); - READ32(open->op_owner.len); - - /* owner, open_flag */ - READ_BUF(open->op_owner.len + 4); - SAVEMEM(open->op_owner.data, open->op_owner.len); + status = nfsd4_decode_opaque(argp, &open->op_owner); + if (status) + goto xdr_error; + READ_BUF(4); READ32(open->op_create); switch (open->op_create) { case NFS4_OPEN_NOCREATE: @@ -727,6 +805,19 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) if ((status = check_filename(open->op_fname.data, open->op_fname.len, nfserr_inval))) return status; break; + case NFS4_OPEN_CLAIM_FH: + case NFS4_OPEN_CLAIM_DELEG_PREV_FH: + if (argp->minorversion < 1) + goto xdr_error; + /* void */ + break; + case NFS4_OPEN_CLAIM_DELEG_CUR_FH: + if (argp->minorversion < 1) + goto xdr_error; + status = nfsd4_decode_stateid(argp, &open->op_delegate_stateid); + if (status) + return status; + break; default: goto xdr_error; } @@ -739,7 +830,6 @@ nfsd4_decode_open_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_open_con { DECODE_HEAD; - open_conf->oc_stateowner = NULL; status = nfsd4_decode_stateid(argp, &open_conf->oc_req_stateid); if (status) return status; @@ -754,15 +844,17 @@ nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp, struct nfsd4_open_d { DECODE_HEAD; - open_down->od_stateowner = NULL; status = nfsd4_decode_stateid(argp, &open_down->od_stateid); if (status) return status; - READ_BUF(12); + READ_BUF(4); READ32(open_down->od_seqid); - READ32(open_down->od_share_access); - READ32(open_down->od_share_deny); - + status = nfsd4_decode_share_access(argp, &open_down->od_share_access); + if (status) + return status; + status = nfsd4_decode_share_deny(argp, &open_down->od_share_deny); + if (status) + return status; DECODE_TAIL; } @@ -903,12 +995,13 @@ nfsd4_decode_setclientid(struct nfsd4_compoundargs *argp, struct nfsd4_setclient { DECODE_HEAD; - READ_BUF(12); + READ_BUF(8); COPYMEM(setclientid->se_verf.data, 8); - READ32(setclientid->se_namelen); - READ_BUF(setclientid->se_namelen + 8); - SAVEMEM(setclientid->se_name, setclientid->se_namelen); + status = nfsd4_decode_opaque(argp, &setclientid->se_name); + if (status) + return nfserr_bad_xdr; + READ_BUF(8); READ32(setclientid->se_callback_prog); READ32(setclientid->se_callback_netid_len); @@ -1051,11 +1144,9 @@ nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp, READ_BUF(NFS4_VERIFIER_SIZE); COPYMEM(exid->verifier.data, NFS4_VERIFIER_SIZE); - READ_BUF(4); - READ32(exid->clname.len); - - READ_BUF(exid->clname.len); - SAVEMEM(exid->clname.data, exid->clname.len); + status = nfsd4_decode_opaque(argp, &exid->clname); + if (status) + return nfserr_bad_xdr; READ_BUF(4); READ32(exid->flags); @@ -1326,6 +1417,16 @@ xdr_error: goto out; } +static __be32 nfsd4_decode_destroy_clientid(struct nfsd4_compoundargs *argp, struct nfsd4_destroy_clientid *dc) +{ + DECODE_HEAD; + + READ_BUF(8); + COPYMEM(&dc->clientid, 8); + + DECODE_TAIL; +} + static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, struct nfsd4_reclaim_complete *rc) { DECODE_HEAD; @@ -1447,7 +1548,7 @@ static nfsd4_dec nfsd41_dec_ops[] = { [OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_TEST_STATEID] = (nfsd4_dec)nfsd4_decode_test_stateid, [OP_WANT_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_DESTROY_CLIENTID] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_DESTROY_CLIENTID] = (nfsd4_dec)nfsd4_decode_destroy_clientid, [OP_RECLAIM_COMPLETE] = (nfsd4_dec)nfsd4_decode_reclaim_complete, }; @@ -1630,15 +1731,20 @@ static void write_cinfo(__be32 **p, struct nfsd4_change_info *c) * we know whether the error to be returned is a sequence id mutating error. */ -#define ENCODE_SEQID_OP_TAIL(stateowner) do { \ - if (seqid_mutating_err(nfserr) && stateowner) { \ - stateowner->so_seqid++; \ - stateowner->so_replay.rp_status = nfserr; \ - stateowner->so_replay.rp_buflen = \ - (((char *)(resp)->p - (char *)save)); \ - memcpy(stateowner->so_replay.rp_buf, save, \ - stateowner->so_replay.rp_buflen); \ - } } while (0); +static void encode_seqid_op_tail(struct nfsd4_compoundres *resp, __be32 *save, __be32 nfserr) +{ + struct nfs4_stateowner *stateowner = resp->cstate.replay_owner; + + if (seqid_mutating_err(ntohl(nfserr)) && stateowner) { + stateowner->so_seqid++; + stateowner->so_replay.rp_status = nfserr; + stateowner->so_replay.rp_buflen = + (char *)resp->p - (char *)save; + memcpy(stateowner->so_replay.rp_buf, save, + stateowner->so_replay.rp_buflen); + nfsd4_purge_closed_stateid(stateowner); + } +} /* Encode as an array of strings the string given with components * separated @sep. @@ -1697,36 +1803,89 @@ static __be32 nfsd4_encode_fs_location4(struct nfsd4_fs_location *location, } /* - * Return the path to an export point in the pseudo filesystem namespace - * Returned string is safe to use as long as the caller holds a reference - * to @exp. + * Encode a path in RFC3530 'pathname4' format */ -static char *nfsd4_path(struct svc_rqst *rqstp, struct svc_export *exp, __be32 *stat) +static __be32 nfsd4_encode_path(const struct path *root, + const struct path *path, __be32 **pp, int *buflen) { - struct svc_fh tmp_fh; - char *path = NULL, *rootpath; - size_t rootlen; + struct path cur = { + .mnt = path->mnt, + .dentry = path->dentry, + }; + __be32 *p = *pp; + struct dentry **components = NULL; + unsigned int ncomponents = 0; + __be32 err = nfserr_jukebox; - fh_init(&tmp_fh, NFS4_FHSIZE); - *stat = exp_pseudoroot(rqstp, &tmp_fh); - if (*stat) - return NULL; - rootpath = tmp_fh.fh_export->ex_pathname; + dprintk("nfsd4_encode_components("); - path = exp->ex_pathname; + path_get(&cur); + /* First walk the path up to the nfsd root, and store the + * dentries/path components in an array. + */ + for (;;) { + if (cur.dentry == root->dentry && cur.mnt == root->mnt) + break; + if (cur.dentry == cur.mnt->mnt_root) { + if (follow_up(&cur)) + continue; + goto out_free; + } + if ((ncomponents & 15) == 0) { + struct dentry **new; + new = krealloc(components, + sizeof(*new) * (ncomponents + 16), + GFP_KERNEL); + if (!new) + goto out_free; + components = new; + } + components[ncomponents++] = cur.dentry; + cur.dentry = dget_parent(cur.dentry); + } - rootlen = strlen(rootpath); - if (strncmp(path, rootpath, rootlen)) { - dprintk("nfsd: fs_locations failed;" - "%s is not contained in %s\n", path, rootpath); - *stat = nfserr_notsupp; - path = NULL; - goto out; + *buflen -= 4; + if (*buflen < 0) + goto out_free; + WRITE32(ncomponents); + + while (ncomponents) { + struct dentry *dentry = components[ncomponents - 1]; + unsigned int len = dentry->d_name.len; + + *buflen -= 4 + (XDR_QUADLEN(len) << 2); + if (*buflen < 0) + goto out_free; + WRITE32(len); + WRITEMEM(dentry->d_name.name, len); + dprintk("/%s", dentry->d_name.name); + dput(dentry); + ncomponents--; } - path += rootlen; -out: - fh_put(&tmp_fh); - return path; + + *pp = p; + err = 0; +out_free: + dprintk(")\n"); + while (ncomponents) + dput(components[--ncomponents]); + kfree(components); + path_put(&cur); + return err; +} + +static __be32 nfsd4_encode_fsloc_fsroot(struct svc_rqst *rqstp, + const struct path *path, __be32 **pp, int *buflen) +{ + struct svc_export *exp_ps; + __be32 res; + + exp_ps = rqst_find_fsidzero_export(rqstp); + if (IS_ERR(exp_ps)) + return nfserrno(PTR_ERR(exp_ps)); + res = nfsd4_encode_path(&exp_ps->ex_path, path, pp, buflen); + exp_put(exp_ps); + return res; } /* @@ -1740,11 +1899,8 @@ static __be32 nfsd4_encode_fs_locations(struct svc_rqst *rqstp, int i; __be32 *p = *pp; struct nfsd4_fs_locations *fslocs = &exp->ex_fslocs; - char *root = nfsd4_path(rqstp, exp, &status); - if (status) - return status; - status = nfsd4_encode_components('/', root, &p, buflen); + status = nfsd4_encode_fsloc_fsroot(rqstp, &exp->ex_path, &p, buflen); if (status) return status; if ((*buflen -= 4) < 0) @@ -1760,12 +1916,19 @@ static __be32 nfsd4_encode_fs_locations(struct svc_rqst *rqstp, return 0; } -static u32 nfs4_ftypes[16] = { - NF4BAD, NF4FIFO, NF4CHR, NF4BAD, - NF4DIR, NF4BAD, NF4BLK, NF4BAD, - NF4REG, NF4BAD, NF4LNK, NF4BAD, - NF4SOCK, NF4BAD, NF4LNK, NF4BAD, -}; +static u32 nfs4_file_type(umode_t mode) +{ + switch (mode & S_IFMT) { + case S_IFIFO: return NF4FIFO; + case S_IFCHR: return NF4CHR; + case S_IFDIR: return NF4DIR; + case S_IFBLK: return NF4BLK; + case S_IFLNK: return NF4LNK; + case S_IFREG: return NF4REG; + case S_IFSOCK: return NF4SOCK; + default: return NF4BAD; + }; +} static __be32 nfsd4_encode_name(struct svc_rqst *rqstp, int whotype, uid_t id, int group, @@ -1954,7 +2117,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, if (bmval0 & FATTR4_WORD0_TYPE) { if ((buflen -= 4) < 0) goto out_resource; - dummy = nfs4_ftypes[(stat.mode & S_IFMT) >> 12]; + dummy = nfs4_file_type(stat.mode); if (dummy == NF4BAD) goto out_serverfault; WRITE32(dummy); @@ -2488,7 +2651,7 @@ nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_c if (!nfserr) nfsd4_encode_stateid(resp, &close->cl_stateid); - ENCODE_SEQID_OP_TAIL(close->cl_stateowner); + encode_seqid_op_tail(resp, save, nfserr); return nfserr; } @@ -2564,17 +2727,18 @@ nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh static void nfsd4_encode_lock_denied(struct nfsd4_compoundres *resp, struct nfsd4_lock_denied *ld) { + struct xdr_netobj *conf = &ld->ld_owner; __be32 *p; - RESERVE_SPACE(32 + XDR_LEN(ld->ld_sop ? ld->ld_sop->so_owner.len : 0)); + RESERVE_SPACE(32 + XDR_LEN(conf->len)); WRITE64(ld->ld_start); WRITE64(ld->ld_length); WRITE32(ld->ld_type); - if (ld->ld_sop) { + if (conf->len) { WRITEMEM(&ld->ld_clientid, 8); - WRITE32(ld->ld_sop->so_owner.len); - WRITEMEM(ld->ld_sop->so_owner.data, ld->ld_sop->so_owner.len); - kref_put(&ld->ld_sop->so_ref, nfs4_free_stateowner); + WRITE32(conf->len); + WRITEMEM(conf->data, conf->len); + kfree(conf->data); } else { /* non - nfsv4 lock in conflict, no clientid nor owner */ WRITE64((u64)0); /* clientid */ WRITE32(0); /* length of owner name */ @@ -2592,7 +2756,7 @@ nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lo else if (nfserr == nfserr_denied) nfsd4_encode_lock_denied(resp, &lock->lk_denied); - ENCODE_SEQID_OP_TAIL(lock->lk_replay_owner); + encode_seqid_op_tail(resp, save, nfserr); return nfserr; } @@ -2612,7 +2776,7 @@ nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_l if (!nfserr) nfsd4_encode_stateid(resp, &locku->lu_stateid); - ENCODE_SEQID_OP_TAIL(locku->lu_stateowner); + encode_seqid_op_tail(resp, save, nfserr); return nfserr; } @@ -2693,7 +2857,7 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op } /* XXX save filehandle here */ out: - ENCODE_SEQID_OP_TAIL(open->op_stateowner); + encode_seqid_op_tail(resp, save, nfserr); return nfserr; } @@ -2705,7 +2869,7 @@ nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct if (!nfserr) nfsd4_encode_stateid(resp, &oc->oc_resp_stateid); - ENCODE_SEQID_OP_TAIL(oc->oc_stateowner); + encode_seqid_op_tail(resp, save, nfserr); return nfserr; } @@ -2717,7 +2881,7 @@ nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, struc if (!nfserr) nfsd4_encode_stateid(resp, &od->od_stateid); - ENCODE_SEQID_OP_TAIL(od->od_stateowner); + encode_seqid_op_tail(resp, save, nfserr); return nfserr; } @@ -2759,8 +2923,6 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr, read->rd_offset, resp->rqstp->rq_vec, read->rd_vlen, &maxcount); - if (nfserr == nfserr_symlink) - nfserr = nfserr_inval; if (nfserr) return nfserr; eof = (read->rd_offset + maxcount >= @@ -2886,8 +3048,6 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4 readdir->common.err == nfserr_toosmall && readdir->buffer == page) nfserr = nfserr_toosmall; - if (nfserr == nfserr_symlink) - nfserr = nfserr_notdir; if (nfserr) goto err_no_verf; @@ -3218,9 +3378,9 @@ nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr, WRITEMEM(seq->sessionid.data, NFS4_MAX_SESSIONID_LEN); WRITE32(seq->seqid); WRITE32(seq->slotid); - WRITE32(seq->maxslots); - /* For now: target_maxslots = maxslots */ - WRITE32(seq->maxslots); + /* Note slotid's are numbered from zero: */ + WRITE32(seq->maxslots - 1); /* sr_highest_slotid */ + WRITE32(seq->maxslots - 1); /* sr_target_highest_slotid */ WRITE32(seq->status_flags); ADJUST_ARGS(); @@ -3233,6 +3393,7 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_test_stateid *test_stateid) { struct nfsd4_compoundargs *argp; + struct nfs4_client *cl = resp->cstate.session->se_client; stateid_t si; __be32 *p; int i; @@ -3248,7 +3409,7 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, int nfserr, nfs4_lock_state(); for (i = 0; i < test_stateid->ts_num_ids; i++) { nfsd4_decode_stateid(argp, &si); - valid = nfs4_validate_stateid(&si, test_stateid->ts_has_session); + valid = nfs4_validate_stateid(cl, &si); RESERVE_SPACE(4); *p++ = htonl(valid); resp->p = p; @@ -3334,34 +3495,29 @@ static nfsd4_enc nfsd4_enc_ops[] = { /* * Calculate the total amount of memory that the compound response has taken - * after encoding the current operation. + * after encoding the current operation with pad. * - * pad: add on 8 bytes for the next operation's op_code and status so that - * there is room to cache a failure on the next operation. + * pad: if operation is non-idempotent, pad was calculate by op_rsize_bop() + * which was specified at nfsd4_operation, else pad is zero. * - * Compare this length to the session se_fmaxresp_cached. + * Compare this length to the session se_fmaxresp_sz and se_fmaxresp_cached. * * Our se_fmaxresp_cached will always be a multiple of PAGE_SIZE, and so * will be at least a page and will therefore hold the xdr_buf head. */ -static int nfsd4_check_drc_limit(struct nfsd4_compoundres *resp) +int nfsd4_check_resp_size(struct nfsd4_compoundres *resp, u32 pad) { - int status = 0; struct xdr_buf *xb = &resp->rqstp->rq_res; - struct nfsd4_compoundargs *args = resp->rqstp->rq_argp; struct nfsd4_session *session = NULL; struct nfsd4_slot *slot = resp->cstate.slot; - u32 length, tlen = 0, pad = 8; + u32 length, tlen = 0; if (!nfsd4_has_session(&resp->cstate)) - return status; + return 0; session = resp->cstate.session; - if (session == NULL || slot->sl_cachethis == 0) - return status; - - if (resp->opcnt >= args->opcnt) - pad = 0; /* this is the last operation */ + if (session == NULL) + return 0; if (xb->page_len == 0) { length = (char *)resp->p - (char *)xb->head[0].iov_base + pad; @@ -3374,10 +3530,14 @@ static int nfsd4_check_drc_limit(struct nfsd4_compoundres *resp) dprintk("%s length %u, xb->page_len %u tlen %u pad %u\n", __func__, length, xb->page_len, tlen, pad); - if (length <= session->se_fchannel.maxresp_cached) - return status; - else + if (length > session->se_fchannel.maxresp_sz) + return nfserr_rep_too_big; + + if (slot->sl_cachethis == 1 && + length > session->se_fchannel.maxresp_cached) return nfserr_rep_too_big_to_cache; + + return 0; } void @@ -3397,8 +3557,8 @@ nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op) !nfsd4_enc_ops[op->opnum]); op->status = nfsd4_enc_ops[op->opnum](resp, op->status, &op->u); /* nfsd4_check_drc_limit guarantees enough room for error status */ - if (!op->status && nfsd4_check_drc_limit(resp)) - op->status = nfserr_rep_too_big_to_cache; + if (!op->status) + op->status = nfsd4_check_resp_size(resp, 0); status: /* * Note: We write the status directly, instead of using WRITE32(), diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index c7716143cbd..db34a585e11 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -9,7 +9,6 @@ #include <linux/ctype.h> #include <linux/sunrpc/svcsock.h> -#include <linux/nfsd/syscall.h> #include <linux/lockd/lockd.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/gss_api.h> diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 7ecfa242030..58134a23fdf 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -11,13 +11,39 @@ #include <linux/types.h> #include <linux/mount.h> +#include <linux/nfs.h> +#include <linux/nfs2.h> +#include <linux/nfs3.h> +#include <linux/nfs4.h> +#include <linux/sunrpc/msg_prot.h> + #include <linux/nfsd/debug.h> #include <linux/nfsd/export.h> #include <linux/nfsd/stats.h> + /* * nfsd version */ #define NFSD_SUPPORTED_MINOR_VERSION 1 +/* + * Maximum blocksizes supported by daemon under various circumstances. + */ +#define NFSSVC_MAXBLKSIZE RPCSVC_MAXPAYLOAD +/* NFSv2 is limited by the protocol specification, see RFC 1094 */ +#define NFSSVC_MAXBLKSIZE_V2 (8*1024) + + +/* + * Largest number of bytes we need to allocate for an NFS + * call or reply. Used to control buffer sizes. We use + * the length of v3 WRITE, READDIR and READDIR replies + * which are an RPC header, up to 26 XDR units of reply + * data, and some page data. + * + * Note that accuracy here doesn't matter too much as the + * size is rounded up to a page size when allocating space. + */ +#define NFSD_BUFSIZE ((RPC_MAX_HEADER_WITH_AUTH+26)*XDR_UNIT + NFSSVC_MAXBLKSIZE) struct readdir_cd { __be32 err; /* 0, nfserr, or nfserr_eof */ @@ -335,6 +361,13 @@ static inline u32 nfsd_suppattrs2(u32 minorversion) #define NFSD_SUPPATTR_EXCLCREAT_WORD2 \ NFSD_WRITEABLE_ATTRS_WORD2 +extern int nfsd4_is_junction(struct dentry *dentry); +#else +static inline int nfsd4_is_junction(struct dentry *dentry) +{ + return 0; +} + #endif /* CONFIG_NFSD_V4 */ #endif /* LINUX_NFSD_NFSD_H */ diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 90c6aa6d5e0..c763de5c115 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -59,28 +59,25 @@ static int nfsd_acceptable(void *expv, struct dentry *dentry) * the write call). */ static inline __be32 -nfsd_mode_check(struct svc_rqst *rqstp, umode_t mode, int type) +nfsd_mode_check(struct svc_rqst *rqstp, umode_t mode, int requested) { - /* Type can be negative when creating hardlinks - not to a dir */ - if (type > 0 && (mode & S_IFMT) != type) { - if (rqstp->rq_vers == 4 && (mode & S_IFMT) == S_IFLNK) - return nfserr_symlink; - else if (type == S_IFDIR) - return nfserr_notdir; - else if ((mode & S_IFMT) == S_IFDIR) - return nfserr_isdir; - else - return nfserr_inval; - } - if (type < 0 && (mode & S_IFMT) == -type) { - if (rqstp->rq_vers == 4 && (mode & S_IFMT) == S_IFLNK) - return nfserr_symlink; - else if (type == -S_IFDIR) - return nfserr_isdir; - else - return nfserr_notdir; - } - return 0; + mode &= S_IFMT; + + if (requested == 0) /* the caller doesn't care */ + return nfs_ok; + if (mode == requested) + return nfs_ok; + /* + * v4 has an error more specific than err_notdir which we should + * return in preference to err_notdir: + */ + if (rqstp->rq_vers == 4 && mode == S_IFLNK) + return nfserr_symlink; + if (requested == S_IFDIR) + return nfserr_notdir; + if (mode == S_IFDIR) + return nfserr_isdir; + return nfserr_inval; } static __be32 nfsd_setuser_and_check_port(struct svc_rqst *rqstp, diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 4eefaf1b42e..a3cf38476a1 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -35,6 +35,7 @@ #ifndef _NFSD4_STATE_H #define _NFSD4_STATE_H +#include <linux/idr.h> #include <linux/sunrpc/svc_xprt.h> #include <linux/nfsd/nfsfh.h> #include "nfsfh.h" @@ -45,24 +46,20 @@ typedef struct { } clientid_t; typedef struct { - u32 so_boot; - u32 so_stateownerid; - u32 so_fileid; + clientid_t so_clid; + u32 so_id; } stateid_opaque_t; typedef struct { u32 si_generation; stateid_opaque_t si_opaque; } stateid_t; -#define si_boot si_opaque.so_boot -#define si_stateownerid si_opaque.so_stateownerid -#define si_fileid si_opaque.so_fileid #define STATEID_FMT "(%08x/%08x/%08x/%08x)" #define STATEID_VAL(s) \ - (s)->si_boot, \ - (s)->si_stateownerid, \ - (s)->si_fileid, \ + (s)->si_opaque.so_clid.cl_boot, \ + (s)->si_opaque.so_clid.cl_id, \ + (s)->si_opaque.so_id, \ (s)->si_generation struct nfsd4_callback { @@ -76,17 +73,27 @@ struct nfsd4_callback { bool cb_done; }; +struct nfs4_stid { +#define NFS4_OPEN_STID 1 +#define NFS4_LOCK_STID 2 +#define NFS4_DELEG_STID 4 +/* For an open stateid kept around *only* to process close replays: */ +#define NFS4_CLOSED_STID 8 + unsigned char sc_type; + stateid_t sc_stateid; + struct nfs4_client *sc_client; +}; + struct nfs4_delegation { + struct nfs4_stid dl_stid; /* must be first field */ struct list_head dl_perfile; struct list_head dl_perclnt; struct list_head dl_recall_lru; /* delegation recalled */ atomic_t dl_count; /* ref count */ - struct nfs4_client *dl_client; struct nfs4_file *dl_file; u32 dl_type; time_t dl_time; /* For recall: */ - stateid_t dl_stateid; struct knfsd_fh dl_fh; int dl_retries; struct nfsd4_callback dl_recall; @@ -104,6 +111,11 @@ struct nfs4_cb_conn { struct svc_xprt *cb_xprt; /* minorversion 1 only */ }; +static inline struct nfs4_delegation *delegstateid(struct nfs4_stid *s) +{ + return container_of(s, struct nfs4_delegation, dl_stid); +} + /* Maximum number of slots per session. 160 is useful for long haul TCP */ #define NFSD_MAX_SLOTS_PER_SESSION 160 /* Maximum number of operations per session compound */ @@ -220,6 +232,7 @@ struct nfs4_client { struct list_head cl_idhash; /* hash by cl_clientid.id */ struct list_head cl_strhash; /* hash by cl_name */ struct list_head cl_openowners; + struct idr cl_stateids; /* stateid lookup */ struct list_head cl_delegations; struct list_head cl_lru; /* tail queue */ struct xdr_netobj cl_name; /* id generated by client */ @@ -245,6 +258,7 @@ struct nfs4_client { #define NFSD4_CB_UP 0 #define NFSD4_CB_UNKNOWN 1 #define NFSD4_CB_DOWN 2 +#define NFSD4_CB_FAULT 3 int cl_cb_state; struct nfsd4_callback cl_cb_null; struct nfsd4_session *cl_cb_session; @@ -293,6 +307,9 @@ static inline void update_stateid(stateid_t *stateid) { stateid->si_generation++; + /* Wraparound recommendation from 3530bis-13 9.1.3.2: */ + if (stateid->si_generation == 0) + stateid->si_generation = 1; } /* A reasonable value for REPLAY_ISIZE was estimated as follows: @@ -312,49 +329,57 @@ struct nfs4_replay { __be32 rp_status; unsigned int rp_buflen; char *rp_buf; - unsigned intrp_allocated; struct knfsd_fh rp_openfh; char rp_ibuf[NFSD4_REPLAY_ISIZE]; }; -/* -* nfs4_stateowner can either be an open_owner, or a lock_owner -* -* so_idhash: stateid_hashtbl[] for open owner, lockstateid_hashtbl[] -* for lock_owner -* so_strhash: ownerstr_hashtbl[] for open_owner, lock_ownerstr_hashtbl[] -* for lock_owner -* so_perclient: nfs4_client->cl_perclient entry - used when nfs4_client -* struct is reaped. -* so_perfilestate: heads the list of nfs4_stateid (either open or lock) -* and is used to ensure no dangling nfs4_stateid references when we -* release a stateowner. -* so_perlockowner: (open) nfs4_stateid->st_perlockowner entry - used when -* close is called to reap associated byte-range locks -* so_close_lru: (open) stateowner is placed on this list instead of being -* reaped (when so_perfilestate is empty) to hold the last close replay. -* reaped by laundramat thread after lease period. -*/ struct nfs4_stateowner { - struct kref so_ref; - struct list_head so_idhash; /* hash by so_id */ struct list_head so_strhash; /* hash by op_name */ - struct list_head so_perclient; struct list_head so_stateids; - struct list_head so_perstateid; /* for lockowners only */ - struct list_head so_close_lru; /* tail queue */ - time_t so_time; /* time of placement on so_close_lru */ - int so_is_open_owner; /* 1=openowner,0=lockowner */ - u32 so_id; struct nfs4_client * so_client; /* after increment in ENCODE_SEQID_OP_TAIL, represents the next * sequence id expected from the client: */ u32 so_seqid; struct xdr_netobj so_owner; /* open owner name */ - int so_confirmed; /* successful OPEN_CONFIRM? */ struct nfs4_replay so_replay; + bool so_is_open_owner; }; +struct nfs4_openowner { + struct nfs4_stateowner oo_owner; /* must be first field */ + struct list_head oo_perclient; + /* + * We keep around openowners a little while after last close, + * which saves clients from having to confirm, and allows us to + * handle close replays if they come soon enough. The close_lru + * is a list of such openowners, to be reaped by the laundromat + * thread eventually if they remain unused: + */ + struct list_head oo_close_lru; + struct nfs4_ol_stateid *oo_last_closed_stid; + time_t oo_time; /* time of placement on so_close_lru */ +#define NFS4_OO_CONFIRMED 1 +#define NFS4_OO_PURGE_CLOSE 2 +#define NFS4_OO_NEW 4 + unsigned char oo_flags; +}; + +struct nfs4_lockowner { + struct nfs4_stateowner lo_owner; /* must be first element */ + struct list_head lo_perstateid; /* for lockowners only */ + struct list_head lo_list; /* for temporary uses */ +}; + +static inline struct nfs4_openowner * openowner(struct nfs4_stateowner *so) +{ + return container_of(so, struct nfs4_openowner, oo_owner); +} + +static inline struct nfs4_lockowner * lockowner(struct nfs4_stateowner *so) +{ + return container_of(so, struct nfs4_lockowner, lo_owner); +} + /* * nfs4_file: a file opened by some number of (open) nfs4_stateowners. * o fi_perfile list is used to search for conflicting @@ -368,17 +393,17 @@ struct nfs4_file { /* One each for O_RDONLY, O_WRONLY, O_RDWR: */ struct file * fi_fds[3]; /* - * Each open or lock stateid contributes 1 to either - * fi_access[O_RDONLY], fi_access[O_WRONLY], or both, depending - * on open or lock mode: + * Each open or lock stateid contributes 0-4 to the counts + * below depending on which bits are set in st_access_bitmap: + * 1 to fi_access[O_RDONLY] if NFS4_SHARE_ACCES_READ is set + * + 1 to fi_access[O_WRONLY] if NFS4_SHARE_ACCESS_WRITE is set + * + 1 to both of the above if NFS4_SHARE_ACCESS_BOTH is set. */ atomic_t fi_access[2]; struct file *fi_deleg_file; struct file_lock *fi_lease; atomic_t fi_delegees; struct inode *fi_inode; - u32 fi_id; /* used with stateowner->so_id - * for stateid_hashtbl hash */ bool fi_had_conflict; }; @@ -408,50 +433,27 @@ static inline struct file *find_any_file(struct nfs4_file *f) return f->fi_fds[O_RDONLY]; } -/* -* nfs4_stateid can either be an open stateid or (eventually) a lock stateid -* -* (open)nfs4_stateid: one per (open)nfs4_stateowner, nfs4_file -* -* st_hash: stateid_hashtbl[] entry or lockstateid_hashtbl entry -* st_perfile: file_hashtbl[] entry. -* st_perfile_state: nfs4_stateowner->so_perfilestate -* st_perlockowner: (open stateid) list of lock nfs4_stateowners -* st_access_bmap: used only for open stateid -* st_deny_bmap: used only for open stateid -* st_openstp: open stateid lock stateid was derived from -* -* XXX: open stateids and lock stateids have diverged sufficiently that -* we should consider defining separate structs for the two cases. -*/ - -struct nfs4_stateid { - struct list_head st_hash; +/* "ol" stands for "Open or Lock". Better suggestions welcome. */ +struct nfs4_ol_stateid { + struct nfs4_stid st_stid; /* must be first field */ struct list_head st_perfile; struct list_head st_perstateowner; struct list_head st_lockowners; struct nfs4_stateowner * st_stateowner; struct nfs4_file * st_file; - stateid_t st_stateid; unsigned long st_access_bmap; unsigned long st_deny_bmap; - struct nfs4_stateid * st_openstp; + struct nfs4_ol_stateid * st_openstp; }; +static inline struct nfs4_ol_stateid *openlockstateid(struct nfs4_stid *s) +{ + return container_of(s, struct nfs4_ol_stateid, st_stid); +} + /* flags for preprocess_seqid_op() */ -#define HAS_SESSION 0x00000001 -#define CONFIRM 0x00000002 -#define OPEN_STATE 0x00000004 -#define LOCK_STATE 0x00000008 #define RD_STATE 0x00000010 #define WR_STATE 0x00000020 -#define CLOSE_STATE 0x00000040 - -#define seqid_mutating_err(err) \ - (((err) != nfserr_stale_clientid) && \ - ((err) != nfserr_bad_seqid) && \ - ((err) != nfserr_stale_stateid) && \ - ((err) != nfserr_bad_stateid)) struct nfsd4_compound_state; @@ -461,7 +463,8 @@ extern void nfs4_lock_state(void); extern void nfs4_unlock_state(void); extern int nfs4_in_grace(void); extern __be32 nfs4_check_open_reclaim(clientid_t *clid); -extern void nfs4_free_stateowner(struct kref *kref); +extern void nfs4_free_openowner(struct nfs4_openowner *); +extern void nfs4_free_lockowner(struct nfs4_lockowner *); extern int set_callback_cred(void); extern void nfsd4_probe_callback(struct nfs4_client *clp); extern void nfsd4_probe_callback_sync(struct nfs4_client *clp); @@ -473,7 +476,7 @@ extern void nfsd4_destroy_callback_queue(void); extern void nfsd4_shutdown_callback(struct nfs4_client *); extern void nfs4_put_delegation(struct nfs4_delegation *dp); extern __be32 nfs4_make_rec_clidname(char *clidname, struct xdr_netobj *clname); -extern void nfsd4_init_recdir(char *recdir_name); +extern void nfsd4_init_recdir(void); extern int nfsd4_recdir_load(void); extern void nfsd4_shutdown_recdir(void); extern int nfs4_client_to_reclaim(const char *name); @@ -482,18 +485,7 @@ extern void nfsd4_recdir_purge_old(void); extern int nfsd4_create_clid_dir(struct nfs4_client *clp); extern void nfsd4_remove_clid_dir(struct nfs4_client *clp); extern void release_session_client(struct nfsd4_session *); -extern __be32 nfs4_validate_stateid(stateid_t *, int); - -static inline void -nfs4_put_stateowner(struct nfs4_stateowner *so) -{ - kref_put(&so->so_ref, nfs4_free_stateowner); -} - -static inline void -nfs4_get_stateowner(struct nfs4_stateowner *so) -{ - kref_get(&so->so_ref); -} +extern __be32 nfs4_validate_stateid(struct nfs4_client *, stateid_t *); +extern void nfsd4_purge_closed_stateid(struct nfs4_stateowner *); #endif /* NFSD4_STATE_H */ diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index fd0acca5370..7a2e442623c 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -168,6 +168,8 @@ int nfsd_mountpoint(struct dentry *dentry, struct svc_export *exp) { if (d_mountpoint(dentry)) return 1; + if (nfsd4_is_junction(dentry)) + return 1; if (!(exp->ex_flags & NFSEXP_V4ROOT)) return 0; return dentry->d_inode != NULL; @@ -502,7 +504,7 @@ nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp, unsigned int flags = 0; /* Get inode */ - error = fh_verify(rqstp, fhp, 0 /* S_IFREG */, NFSD_MAY_SATTR); + error = fh_verify(rqstp, fhp, 0, NFSD_MAY_SATTR); if (error) return error; @@ -592,6 +594,22 @@ nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, struct nfs4_ac return error; } +#define NFSD_XATTR_JUNCTION_PREFIX XATTR_TRUSTED_PREFIX "junction." +#define NFSD_XATTR_JUNCTION_TYPE NFSD_XATTR_JUNCTION_PREFIX "type" +int nfsd4_is_junction(struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + + if (inode == NULL) + return 0; + if (inode->i_mode & S_IXUGO) + return 0; + if (!(inode->i_mode & S_ISVTX)) + return 0; + if (vfs_getxattr(dentry, NFSD_XATTR_JUNCTION_TYPE, NULL, 0) <= 0) + return 0; + return 1; +} #endif /* defined(CONFIG_NFSD_V4) */ #ifdef CONFIG_NFSD_V3 @@ -1352,7 +1370,7 @@ __be32 do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, char *fname, int flen, struct iattr *iap, struct svc_fh *resfhp, int createmode, u32 *verifier, - int *truncp, int *created) + bool *truncp, bool *created) { struct dentry *dentry, *dchild = NULL; struct inode *dirp; @@ -1632,10 +1650,12 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_CREATE); if (err) goto out; - err = fh_verify(rqstp, tfhp, -S_IFDIR, NFSD_MAY_NOP); + err = fh_verify(rqstp, tfhp, 0, NFSD_MAY_NOP); if (err) goto out; - + err = nfserr_isdir; + if (S_ISDIR(tfhp->fh_dentry->d_inode->i_mode)) + goto out; err = nfserr_perm; if (!len) goto out; @@ -2114,7 +2134,8 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp, /* Allow read access to binaries even when mode 111 */ if (err == -EACCES && S_ISREG(inode->i_mode) && - acc == (NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE)) + (acc == (NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE) || + acc == (NFSD_MAY_READ | NFSD_MAY_READ_IF_EXEC))) err = inode_permission(inode, MAY_EXEC); return err? nfserrno(err) : 0; diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index e0bbac04d1d..3f54ad03bb2 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -10,21 +10,22 @@ /* * Flags for nfsd_permission */ -#define NFSD_MAY_NOP 0 -#define NFSD_MAY_EXEC 1 /* == MAY_EXEC */ -#define NFSD_MAY_WRITE 2 /* == MAY_WRITE */ -#define NFSD_MAY_READ 4 /* == MAY_READ */ -#define NFSD_MAY_SATTR 8 -#define NFSD_MAY_TRUNC 16 -#define NFSD_MAY_LOCK 32 -#define NFSD_MAY_MASK 63 +#define NFSD_MAY_NOP 0 +#define NFSD_MAY_EXEC 0x001 /* == MAY_EXEC */ +#define NFSD_MAY_WRITE 0x002 /* == MAY_WRITE */ +#define NFSD_MAY_READ 0x004 /* == MAY_READ */ +#define NFSD_MAY_SATTR 0x008 +#define NFSD_MAY_TRUNC 0x010 +#define NFSD_MAY_LOCK 0x020 +#define NFSD_MAY_MASK 0x03f /* extra hints to permission and open routines: */ -#define NFSD_MAY_OWNER_OVERRIDE 64 -#define NFSD_MAY_LOCAL_ACCESS 128 /* IRIX doing local access check on device special file*/ -#define NFSD_MAY_BYPASS_GSS_ON_ROOT 256 -#define NFSD_MAY_NOT_BREAK_LEASE 512 -#define NFSD_MAY_BYPASS_GSS 1024 +#define NFSD_MAY_OWNER_OVERRIDE 0x040 +#define NFSD_MAY_LOCAL_ACCESS 0x080 /* for device special files */ +#define NFSD_MAY_BYPASS_GSS_ON_ROOT 0x100 +#define NFSD_MAY_NOT_BREAK_LEASE 0x200 +#define NFSD_MAY_BYPASS_GSS 0x400 +#define NFSD_MAY_READ_IF_EXEC 0x800 #define NFSD_MAY_CREATE (NFSD_MAY_EXEC|NFSD_MAY_WRITE) #define NFSD_MAY_REMOVE (NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC) @@ -61,7 +62,7 @@ __be32 nfsd_access(struct svc_rqst *, struct svc_fh *, u32 *, u32 *); __be32 do_nfsd_create(struct svc_rqst *, struct svc_fh *, char *name, int len, struct iattr *attrs, struct svc_fh *res, int createmode, - u32 *verifier, int *truncp, int *created); + u32 *verifier, bool *truncp, bool *created); __be32 nfsd_commit(struct svc_rqst *, struct svc_fh *, loff_t, unsigned long); #endif /* CONFIG_NFSD_V3 */ diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index d2a8d04428c..2364747ee97 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -81,7 +81,6 @@ struct nfsd4_access { struct nfsd4_close { u32 cl_seqid; /* request */ stateid_t cl_stateid; /* request+response */ - struct nfs4_stateowner * cl_stateowner; /* response */ }; struct nfsd4_commit { @@ -131,7 +130,7 @@ struct nfsd4_link { struct nfsd4_lock_denied { clientid_t ld_clientid; - struct nfs4_stateowner *ld_sop; + struct xdr_netobj ld_owner; u64 ld_start; u64 ld_length; u32 ld_type; @@ -165,9 +164,6 @@ struct nfsd4_lock { } ok; struct nfsd4_lock_denied denied; } u; - /* The lk_replay_owner is the open owner in the open_to_lock_owner - * case and the lock owner otherwise: */ - struct nfs4_stateowner *lk_replay_owner; }; #define lk_new_open_seqid v.new.open_seqid #define lk_new_open_stateid v.new.open_stateid @@ -188,7 +184,6 @@ struct nfsd4_lockt { struct xdr_netobj lt_owner; u64 lt_offset; u64 lt_length; - struct nfs4_stateowner * lt_stateowner; struct nfsd4_lock_denied lt_denied; }; @@ -199,7 +194,6 @@ struct nfsd4_locku { stateid_t lu_stateid; u64 lu_offset; u64 lu_length; - struct nfs4_stateowner *lu_stateowner; }; @@ -232,8 +226,11 @@ struct nfsd4_open { u32 op_recall; /* recall */ struct nfsd4_change_info op_cinfo; /* response */ u32 op_rflags; /* response */ - int op_truncate; /* used during processing */ - struct nfs4_stateowner *op_stateowner; /* used during processing */ + bool op_truncate; /* used during processing */ + bool op_created; /* used during processing */ + struct nfs4_openowner *op_openowner; /* used during processing */ + struct nfs4_file *op_file; /* used during processing */ + struct nfs4_ol_stateid *op_stp; /* used during processing */ struct nfs4_acl *op_acl; }; #define op_iattr iattr @@ -243,7 +240,6 @@ struct nfsd4_open_confirm { stateid_t oc_req_stateid /* request */; u32 oc_seqid /* request */; stateid_t oc_resp_stateid /* response */; - struct nfs4_stateowner * oc_stateowner; /* response */ }; struct nfsd4_open_downgrade { @@ -251,7 +247,6 @@ struct nfsd4_open_downgrade { u32 od_seqid; u32 od_share_access; u32 od_share_deny; - struct nfs4_stateowner *od_stateowner; }; @@ -325,8 +320,7 @@ struct nfsd4_setattr { struct nfsd4_setclientid { nfs4_verifier se_verf; /* request */ - u32 se_namelen; /* request */ - char * se_name; /* request */ + struct xdr_netobj se_name; u32 se_callback_prog; /* request */ u32 se_callback_netid_len; /* request */ char * se_callback_netid_val; /* request */ @@ -351,7 +345,6 @@ struct nfsd4_saved_compoundargs { struct nfsd4_test_stateid { __be32 ts_num_ids; - __be32 ts_has_session; struct nfsd4_compoundargs *ts_saved_args; struct nfsd4_saved_compoundargs ts_savedp; }; @@ -405,6 +398,10 @@ struct nfsd4_destroy_session { struct nfs4_sessionid sessionid; }; +struct nfsd4_destroy_clientid { + clientid_t clientid; +}; + struct nfsd4_reclaim_complete { u32 rca_one_fs; }; @@ -532,6 +529,7 @@ int nfs4svc_decode_compoundargs(struct svc_rqst *, __be32 *, struct nfsd4_compoundargs *); int nfs4svc_encode_compoundres(struct svc_rqst *, __be32 *, struct nfsd4_compoundres *); +int nfsd4_check_resp_size(struct nfsd4_compoundres *, u32); void nfsd4_encode_operation(struct nfsd4_compoundres *, struct nfsd4_op *); void nfsd4_encode_replay(struct nfsd4_compoundres *resp, struct nfsd4_op *op); __be32 nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, @@ -558,11 +556,13 @@ extern __be32 nfsd4_sequence(struct svc_rqst *, extern __be32 nfsd4_destroy_session(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_destroy_session *); +extern __be32 nfsd4_destroy_clientid(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_destroy_clientid *); __be32 nfsd4_reclaim_complete(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_reclaim_complete *); extern __be32 nfsd4_process_open1(struct nfsd4_compound_state *, struct nfsd4_open *open); extern __be32 nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open); +extern void nfsd4_cleanup_open_state(struct nfsd4_open *open, __be32 status); extern __be32 nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *, struct nfsd4_open_confirm *oc); extern __be32 nfsd4_close(struct svc_rqst *rqstp, diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 81ecf9c0bf0..194fb22ef79 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -7185,20 +7185,9 @@ int ocfs2_init_security_and_acl(struct inode *dir, { int ret = 0; struct buffer_head *dir_bh = NULL; - struct ocfs2_security_xattr_info si = { - .enable = 1, - }; - ret = ocfs2_init_security_get(inode, dir, qstr, &si); + ret = ocfs2_init_security_get(inode, dir, qstr, NULL); if (!ret) { - ret = ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY, - si.name, si.value, si.value_len, - XATTR_CREATE); - if (ret) { - mlog_errno(ret); - goto leave; - } - } else if (ret != -EOPNOTSUPP) { mlog_errno(ret); goto leave; } @@ -7255,6 +7244,22 @@ static int ocfs2_xattr_security_set(struct dentry *dentry, const char *name, name, value, size, flags); } +int ocfs2_initxattrs(struct inode *inode, const struct xattr *xattr_array, + void *fs_info) +{ + const struct xattr *xattr; + int err = 0; + + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + err = ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY, + xattr->name, xattr->value, + xattr->value_len, XATTR_CREATE); + if (err) + break; + } + return err; +} + int ocfs2_init_security_get(struct inode *inode, struct inode *dir, const struct qstr *qstr, @@ -7263,8 +7268,13 @@ int ocfs2_init_security_get(struct inode *inode, /* check whether ocfs2 support feature xattr */ if (!ocfs2_supports_xattr(OCFS2_SB(dir->i_sb))) return -EOPNOTSUPP; - return security_inode_init_security(inode, dir, qstr, &si->name, - &si->value, &si->value_len); + if (si) + return security_old_inode_init_security(inode, dir, qstr, + &si->name, &si->value, + &si->value_len); + + return security_inode_init_security(inode, dir, qstr, + &ocfs2_initxattrs, NULL); } int ocfs2_init_security_set(handle_t *handle, diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 9758b654a1b..42b274da92c 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -10,6 +10,7 @@ #include <linux/time.h> #include <linux/irqnr.h> #include <asm/cputime.h> +#include <linux/tick.h> #ifndef arch_irq_stat_cpu #define arch_irq_stat_cpu(cpu) 0 @@ -21,6 +22,35 @@ #define arch_idle_time(cpu) 0 #endif +static cputime64_t get_idle_time(int cpu) +{ + u64 idle_time = get_cpu_idle_time_us(cpu, NULL); + cputime64_t idle; + + if (idle_time == -1ULL) { + /* !NO_HZ so we can rely on cpustat.idle */ + idle = kstat_cpu(cpu).cpustat.idle; + idle = cputime64_add(idle, arch_idle_time(cpu)); + } else + idle = usecs_to_cputime(idle_time); + + return idle; +} + +static cputime64_t get_iowait_time(int cpu) +{ + u64 iowait_time = get_cpu_iowait_time_us(cpu, NULL); + cputime64_t iowait; + + if (iowait_time == -1ULL) + /* !NO_HZ so we can rely on cpustat.iowait */ + iowait = kstat_cpu(cpu).cpustat.iowait; + else + iowait = usecs_to_cputime(iowait_time); + + return iowait; +} + static int show_stat(struct seq_file *p, void *v) { int i, j; @@ -42,9 +72,8 @@ static int show_stat(struct seq_file *p, void *v) user = cputime64_add(user, kstat_cpu(i).cpustat.user); nice = cputime64_add(nice, kstat_cpu(i).cpustat.nice); system = cputime64_add(system, kstat_cpu(i).cpustat.system); - idle = cputime64_add(idle, kstat_cpu(i).cpustat.idle); - idle = cputime64_add(idle, arch_idle_time(i)); - iowait = cputime64_add(iowait, kstat_cpu(i).cpustat.iowait); + idle = cputime64_add(idle, get_idle_time(i)); + iowait = cputime64_add(iowait, get_iowait_time(i)); irq = cputime64_add(irq, kstat_cpu(i).cpustat.irq); softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq); steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal); @@ -76,14 +105,12 @@ static int show_stat(struct seq_file *p, void *v) (unsigned long long)cputime64_to_clock_t(guest), (unsigned long long)cputime64_to_clock_t(guest_nice)); for_each_online_cpu(i) { - /* Copy values here to work around gcc-2.95.3, gcc-2.96 */ user = kstat_cpu(i).cpustat.user; nice = kstat_cpu(i).cpustat.nice; system = kstat_cpu(i).cpustat.system; - idle = kstat_cpu(i).cpustat.idle; - idle = cputime64_add(idle, arch_idle_time(i)); - iowait = kstat_cpu(i).cpustat.iowait; + idle = get_idle_time(i); + iowait = get_iowait_time(i); irq = kstat_cpu(i).cpustat.irq; softirq = kstat_cpu(i).cpustat.softirq; steal = kstat_cpu(i).cpustat.steal; diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index a159ba5a35e..eb711060a6f 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -291,14 +291,13 @@ int reiserfs_allocate_list_bitmaps(struct super_block *sb, for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) { jb = jb_array + i; jb->journal_list = NULL; - jb->bitmaps = vmalloc(mem); + jb->bitmaps = vzalloc(mem); if (!jb->bitmaps) { reiserfs_warning(sb, "clm-2000", "unable to " "allocate bitmaps for journal lists"); failed = 1; break; } - memset(jb->bitmaps, 0, mem); } if (failed) { free_list_bitmaps(sb, jb_array); @@ -353,11 +352,10 @@ static struct reiserfs_journal_cnode *allocate_cnodes(int num_cnodes) if (num_cnodes <= 0) { return NULL; } - head = vmalloc(num_cnodes * sizeof(struct reiserfs_journal_cnode)); + head = vzalloc(num_cnodes * sizeof(struct reiserfs_journal_cnode)); if (!head) { return NULL; } - memset(head, 0, num_cnodes * sizeof(struct reiserfs_journal_cnode)); head[0].prev = NULL; head[0].next = head + 1; for (i = 1; i < num_cnodes; i++) { @@ -2685,14 +2683,13 @@ int journal_init(struct super_block *sb, const char *j_dev_name, * dependency inversion warnings. */ reiserfs_write_unlock(sb); - journal = SB_JOURNAL(sb) = vmalloc(sizeof(struct reiserfs_journal)); + journal = SB_JOURNAL(sb) = vzalloc(sizeof(struct reiserfs_journal)); if (!journal) { reiserfs_warning(sb, "journal-1256", "unable to get memory for journal structure"); reiserfs_write_lock(sb); return 1; } - memset(journal, 0, sizeof(struct reiserfs_journal)); INIT_LIST_HEAD(&journal->j_bitmap_nodes); INIT_LIST_HEAD(&journal->j_prealloc_list); INIT_LIST_HEAD(&journal->j_working_list); diff --git a/fs/reiserfs/resize.c b/fs/reiserfs/resize.c index b6b9b1fe33b..7483279b482 100644 --- a/fs/reiserfs/resize.c +++ b/fs/reiserfs/resize.c @@ -111,15 +111,13 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new) /* allocate additional bitmap blocks, reallocate array of bitmap * block pointers */ bitmap = - vmalloc(sizeof(struct reiserfs_bitmap_info) * bmap_nr_new); + vzalloc(sizeof(struct reiserfs_bitmap_info) * bmap_nr_new); if (!bitmap) { /* Journal bitmaps are still supersized, but the memory isn't * leaked, so I guess it's ok */ printk("reiserfs_resize: unable to allocate memory.\n"); return -ENOMEM; } - memset(bitmap, 0, - sizeof(struct reiserfs_bitmap_info) * bmap_nr_new); for (i = 0; i < bmap_nr; i++) bitmap[i] = old_bitmap[i]; diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c index ef66c18a933..534668fa41b 100644 --- a/fs/reiserfs/xattr_security.c +++ b/fs/reiserfs/xattr_security.c @@ -66,8 +66,8 @@ int reiserfs_security_init(struct inode *dir, struct inode *inode, if (IS_PRIVATE(dir)) return 0; - error = security_inode_init_security(inode, dir, qstr, &sec->name, - &sec->value, &sec->length); + error = security_old_inode_init_security(inode, dir, qstr, &sec->name, + &sec->value, &sec->length); if (error) { if (error == -EOPNOTSUPP) error = 0; diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig index 1360d4f88f4..048b59d5b2f 100644 --- a/fs/squashfs/Kconfig +++ b/fs/squashfs/Kconfig @@ -19,9 +19,9 @@ config SQUASHFS If you want to compile this as a module ( = code which can be inserted in and removed from the running kernel whenever you want), - say M here and read <file:Documentation/modules.txt>. The module - will be called squashfs. Note that the root file system (the one - containing the directory /) cannot be compiled as a module. + say M here. The module will be called squashfs. Note that the root + file system (the one containing the directory /) cannot be compiled + as a module. If unsure, say N. diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index ea9120a830d..48ffbdf0d01 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -43,20 +43,48 @@ static DEFINE_IDA(sysfs_ino_ida); static void sysfs_link_sibling(struct sysfs_dirent *sd) { struct sysfs_dirent *parent_sd = sd->s_parent; - struct sysfs_dirent **pos; - BUG_ON(sd->s_sibling); - - /* Store directory entries in order by ino. This allows - * readdir to properly restart without having to add a - * cursor into the s_dir.children list. - */ - for (pos = &parent_sd->s_dir.children; *pos; pos = &(*pos)->s_sibling) { - if (sd->s_ino < (*pos)->s_ino) - break; + struct rb_node **p; + struct rb_node *parent; + + if (sysfs_type(sd) == SYSFS_DIR) + parent_sd->s_dir.subdirs++; + + p = &parent_sd->s_dir.inode_tree.rb_node; + parent = NULL; + while (*p) { + parent = *p; +#define node rb_entry(parent, struct sysfs_dirent, inode_node) + if (sd->s_ino < node->s_ino) { + p = &node->inode_node.rb_left; + } else if (sd->s_ino > node->s_ino) { + p = &node->inode_node.rb_right; + } else { + printk(KERN_CRIT "sysfs: inserting duplicate inode '%lx'\n", + (unsigned long) sd->s_ino); + BUG(); + } +#undef node } - sd->s_sibling = *pos; - *pos = sd; + rb_link_node(&sd->inode_node, parent, p); + rb_insert_color(&sd->inode_node, &parent_sd->s_dir.inode_tree); + + p = &parent_sd->s_dir.name_tree.rb_node; + parent = NULL; + while (*p) { + int c; + parent = *p; +#define node rb_entry(parent, struct sysfs_dirent, name_node) + c = strcmp(sd->s_name, node->s_name); + if (c < 0) { + p = &node->name_node.rb_left; + } else { + p = &node->name_node.rb_right; + } +#undef node + } + rb_link_node(&sd->name_node, parent, p); + rb_insert_color(&sd->name_node, &parent_sd->s_dir.name_tree); } /** @@ -71,16 +99,11 @@ static void sysfs_link_sibling(struct sysfs_dirent *sd) */ static void sysfs_unlink_sibling(struct sysfs_dirent *sd) { - struct sysfs_dirent **pos; + if (sysfs_type(sd) == SYSFS_DIR) + sd->s_parent->s_dir.subdirs--; - for (pos = &sd->s_parent->s_dir.children; *pos; - pos = &(*pos)->s_sibling) { - if (*pos == sd) { - *pos = sd->s_sibling; - sd->s_sibling = NULL; - break; - } - } + rb_erase(&sd->inode_node, &sd->s_parent->s_dir.inode_tree); + rb_erase(&sd->name_node, &sd->s_parent->s_dir.name_tree); } /** @@ -126,7 +149,6 @@ struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd) */ void sysfs_put_active(struct sysfs_dirent *sd) { - struct completion *cmpl; int v; if (unlikely(!sd)) @@ -138,10 +160,9 @@ void sysfs_put_active(struct sysfs_dirent *sd) return; /* atomic_dec_return() is a mb(), we'll always see the updated - * sd->s_sibling. + * sd->u.completion. */ - cmpl = (void *)sd->s_sibling; - complete(cmpl); + complete(sd->u.completion); } /** @@ -155,16 +176,16 @@ static void sysfs_deactivate(struct sysfs_dirent *sd) DECLARE_COMPLETION_ONSTACK(wait); int v; - BUG_ON(sd->s_sibling || !(sd->s_flags & SYSFS_FLAG_REMOVED)); + BUG_ON(!(sd->s_flags & SYSFS_FLAG_REMOVED)); if (!(sysfs_type(sd) & SYSFS_ACTIVE_REF)) return; - sd->s_sibling = (void *)&wait; + sd->u.completion = (void *)&wait; rwsem_acquire(&sd->dep_map, 0, 0, _RET_IP_); /* atomic_add_return() is a mb(), put_active() will always see - * the updated sd->s_sibling. + * the updated sd->u.completion. */ v = atomic_add_return(SD_DEACTIVATED_BIAS, &sd->s_active); @@ -173,8 +194,6 @@ static void sysfs_deactivate(struct sysfs_dirent *sd) wait_for_completion(&wait); } - sd->s_sibling = NULL; - lock_acquired(&sd->dep_map, _RET_IP_); rwsem_release(&sd->dep_map, 1, _RET_IP_); } @@ -384,6 +403,13 @@ int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) { struct sysfs_inode_attrs *ps_iattr; + if (!!sysfs_ns_type(acxt->parent_sd) != !!sd->s_ns) { + WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n", + sysfs_ns_type(acxt->parent_sd)? "required": "invalid", + acxt->parent_sd->s_name, sd->s_name); + return -EINVAL; + } + if (sysfs_find_dirent(acxt->parent_sd, sd->s_ns, sd->s_name)) return -EEXIST; @@ -490,7 +516,7 @@ void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) } sd->s_flags |= SYSFS_FLAG_REMOVED; - sd->s_sibling = acxt->removed; + sd->u.removed_list = acxt->removed; acxt->removed = sd; } @@ -514,8 +540,7 @@ void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt) while (acxt->removed) { struct sysfs_dirent *sd = acxt->removed; - acxt->removed = sd->s_sibling; - sd->s_sibling = NULL; + acxt->removed = sd->u.removed_list; sysfs_deactivate(sd); unmap_bin_file(sd); @@ -540,15 +565,43 @@ struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd, const void *ns, const unsigned char *name) { - struct sysfs_dirent *sd; + struct rb_node *p = parent_sd->s_dir.name_tree.rb_node; + struct sysfs_dirent *found = NULL; - for (sd = parent_sd->s_dir.children; sd; sd = sd->s_sibling) { - if (ns && sd->s_ns && (sd->s_ns != ns)) - continue; - if (!strcmp(sd->s_name, name)) - return sd; + if (!!sysfs_ns_type(parent_sd) != !!ns) { + WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n", + sysfs_ns_type(parent_sd)? "required": "invalid", + parent_sd->s_name, name); + return NULL; } - return NULL; + + while (p) { + int c; +#define node rb_entry(p, struct sysfs_dirent, name_node) + c = strcmp(name, node->s_name); + if (c < 0) { + p = node->name_node.rb_left; + } else if (c > 0) { + p = node->name_node.rb_right; + } else { + found = node; + p = node->name_node.rb_left; + } +#undef node + } + + if (found) { + while (found->s_ns != ns) { + p = rb_next(&found->name_node); + if (!p) + return NULL; + found = rb_entry(p, struct sysfs_dirent, name_node); + if (strcmp(name, found->s_name)) + return NULL; + } + } + + return found; } /** @@ -744,21 +797,19 @@ void sysfs_remove_subdir(struct sysfs_dirent *sd) static void __sysfs_remove_dir(struct sysfs_dirent *dir_sd) { struct sysfs_addrm_cxt acxt; - struct sysfs_dirent **pos; + struct rb_node *pos; if (!dir_sd) return; pr_debug("sysfs %s: removing dir\n", dir_sd->s_name); sysfs_addrm_start(&acxt, dir_sd); - pos = &dir_sd->s_dir.children; - while (*pos) { - struct sysfs_dirent *sd = *pos; - + pos = rb_first(&dir_sd->s_dir.inode_tree); + while (pos) { + struct sysfs_dirent *sd = rb_entry(pos, struct sysfs_dirent, inode_node); + pos = rb_next(pos); if (sysfs_type(sd) != SYSFS_DIR) sysfs_remove_one(&acxt, sd); - else - pos = &(*pos)->s_sibling; } sysfs_addrm_finish(&acxt); @@ -881,12 +932,28 @@ static struct sysfs_dirent *sysfs_dir_pos(const void *ns, pos = NULL; } if (!pos && (ino > 1) && (ino < INT_MAX)) { - pos = parent_sd->s_dir.children; - while (pos && (ino > pos->s_ino)) - pos = pos->s_sibling; + struct rb_node *p = parent_sd->s_dir.inode_tree.rb_node; + while (p) { +#define node rb_entry(p, struct sysfs_dirent, inode_node) + if (ino < node->s_ino) { + pos = node; + p = node->inode_node.rb_left; + } else if (ino > node->s_ino) { + p = node->inode_node.rb_right; + } else { + pos = node; + break; + } +#undef node + } + } + while (pos && pos->s_ns != ns) { + struct rb_node *p = rb_next(&pos->inode_node); + if (!p) + pos = NULL; + else + pos = rb_entry(p, struct sysfs_dirent, inode_node); } - while (pos && pos->s_ns && pos->s_ns != ns) - pos = pos->s_sibling; return pos; } @@ -894,10 +961,13 @@ static struct sysfs_dirent *sysfs_dir_next_pos(const void *ns, struct sysfs_dirent *parent_sd, ino_t ino, struct sysfs_dirent *pos) { pos = sysfs_dir_pos(ns, parent_sd, ino, pos); - if (pos) - pos = pos->s_sibling; - while (pos && pos->s_ns && pos->s_ns != ns) - pos = pos->s_sibling; + if (pos) do { + struct rb_node *p = rb_next(&pos->inode_node); + if (!p) + pos = NULL; + else + pos = rb_entry(p, struct sysfs_dirent, inode_node); + } while (pos && pos->s_ns != ns); return pos; } diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 1ad8c93c1b8..d4e6080b4b2 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -466,9 +466,6 @@ void sysfs_notify(struct kobject *k, const char *dir, const char *attr) mutex_lock(&sysfs_mutex); if (sd && dir) - /* Only directories are tagged, so no need to pass - * a tag explicitly. - */ sd = sysfs_find_dirent(sd, NULL, dir); if (sd && attr) sd = sysfs_find_dirent(sd, NULL, attr); @@ -488,17 +485,56 @@ const struct file_operations sysfs_file_operations = { .poll = sysfs_poll, }; +int sysfs_attr_ns(struct kobject *kobj, const struct attribute *attr, + const void **pns) +{ + struct sysfs_dirent *dir_sd = kobj->sd; + const struct sysfs_ops *ops; + const void *ns = NULL; + int err; + + err = 0; + if (!sysfs_ns_type(dir_sd)) + goto out; + + err = -EINVAL; + if (!kobj->ktype) + goto out; + ops = kobj->ktype->sysfs_ops; + if (!ops) + goto out; + if (!ops->namespace) + goto out; + + err = 0; + ns = ops->namespace(kobj, attr); +out: + if (err) { + WARN(1, KERN_ERR "missing sysfs namespace attribute operation for " + "kobject: %s\n", kobject_name(kobj)); + } + *pns = ns; + return err; +} + int sysfs_add_file_mode(struct sysfs_dirent *dir_sd, const struct attribute *attr, int type, mode_t amode) { umode_t mode = (amode & S_IALLUGO) | S_IFREG; struct sysfs_addrm_cxt acxt; struct sysfs_dirent *sd; + const void *ns; int rc; + rc = sysfs_attr_ns(dir_sd->s_dir.kobj, attr, &ns); + if (rc) + return rc; + sd = sysfs_new_dirent(attr->name, mode, type); if (!sd) return -ENOMEM; + + sd->s_ns = ns; sd->s_attr.attr = (void *)attr; sysfs_dirent_init_lockdep(sd); @@ -586,12 +622,17 @@ int sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr, { struct sysfs_dirent *sd; struct iattr newattrs; + const void *ns; int rc; + rc = sysfs_attr_ns(kobj, attr, &ns); + if (rc) + return rc; + mutex_lock(&sysfs_mutex); rc = -ENOENT; - sd = sysfs_find_dirent(kobj->sd, NULL, attr->name); + sd = sysfs_find_dirent(kobj->sd, ns, attr->name); if (!sd) goto out; @@ -616,7 +657,12 @@ EXPORT_SYMBOL_GPL(sysfs_chmod_file); void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr) { - sysfs_hash_and_remove(kobj->sd, NULL, attr->name); + const void *ns; + + if (sysfs_attr_ns(kobj, attr, &ns)) + return; + + sysfs_hash_and_remove(kobj->sd, ns, attr->name); } void sysfs_remove_files(struct kobject * kobj, const struct attribute **ptr) diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index e3f091a81c7..e23f28894a3 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -202,18 +202,6 @@ static inline void set_inode_attr(struct inode * inode, struct iattr * iattr) inode->i_ctime = iattr->ia_ctime; } -static int sysfs_count_nlink(struct sysfs_dirent *sd) -{ - struct sysfs_dirent *child; - int nr = 0; - - for (child = sd->s_dir.children; child; child = child->s_sibling) - if (sysfs_type(child) == SYSFS_DIR) - nr++; - - return nr + 2; -} - static void sysfs_refresh_inode(struct sysfs_dirent *sd, struct inode *inode) { struct sysfs_inode_attrs *iattrs = sd->s_iattr; @@ -230,7 +218,7 @@ static void sysfs_refresh_inode(struct sysfs_dirent *sd, struct inode *inode) } if (sysfs_type(sd) == SYSFS_DIR) - inode->i_nlink = sysfs_count_nlink(sd); + inode->i_nlink = sd->s_dir.subdirs + 2; } int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) @@ -336,8 +324,6 @@ int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, const cha sysfs_addrm_start(&acxt, dir_sd); sd = sysfs_find_dirent(dir_sd, ns, name); - if (sd && (sd->s_ns != ns)) - sd = NULL; if (sd) sysfs_remove_one(&acxt, sd); diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 845ab3ad229..ce29e28b766 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -11,14 +11,18 @@ #include <linux/lockdep.h> #include <linux/kobject_ns.h> #include <linux/fs.h> +#include <linux/rbtree.h> struct sysfs_open_dirent; /* type-specific structures for sysfs_dirent->s_* union members */ struct sysfs_elem_dir { struct kobject *kobj; - /* children list starts here and goes through sd->s_sibling */ - struct sysfs_dirent *children; + + unsigned long subdirs; + + struct rb_root inode_tree; + struct rb_root name_tree; }; struct sysfs_elem_symlink { @@ -56,9 +60,16 @@ struct sysfs_dirent { struct lockdep_map dep_map; #endif struct sysfs_dirent *s_parent; - struct sysfs_dirent *s_sibling; const char *s_name; + struct rb_node inode_node; + struct rb_node name_node; + + union { + struct completion *completion; + struct sysfs_dirent *removed_list; + } u; + const void *s_ns; /* namespace tag */ union { struct sysfs_elem_dir s_dir; diff --git a/fs/xattr.c b/fs/xattr.c index f060663ab70..67583de8218 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -14,6 +14,7 @@ #include <linux/mount.h> #include <linux/namei.h> #include <linux/security.h> +#include <linux/evm.h> #include <linux/syscalls.h> #include <linux/module.h> #include <linux/fsnotify.h> @@ -166,6 +167,64 @@ out_noalloc: } EXPORT_SYMBOL_GPL(xattr_getsecurity); +/* + * vfs_getxattr_alloc - allocate memory, if necessary, before calling getxattr + * + * Allocate memory, if not already allocated, or re-allocate correct size, + * before retrieving the extended attribute. + * + * Returns the result of alloc, if failed, or the getxattr operation. + */ +ssize_t +vfs_getxattr_alloc(struct dentry *dentry, const char *name, char **xattr_value, + size_t xattr_size, gfp_t flags) +{ + struct inode *inode = dentry->d_inode; + char *value = *xattr_value; + int error; + + error = xattr_permission(inode, name, MAY_READ); + if (error) + return error; + + if (!inode->i_op->getxattr) + return -EOPNOTSUPP; + + error = inode->i_op->getxattr(dentry, name, NULL, 0); + if (error < 0) + return error; + + if (!value || (error > xattr_size)) { + value = krealloc(*xattr_value, error + 1, flags); + if (!value) + return -ENOMEM; + memset(value, 0, error + 1); + } + + error = inode->i_op->getxattr(dentry, name, value, error); + *xattr_value = value; + return error; +} + +/* Compare an extended attribute value with the given value */ +int vfs_xattr_cmp(struct dentry *dentry, const char *xattr_name, + const char *value, size_t size, gfp_t flags) +{ + char *xattr_value = NULL; + int rc; + + rc = vfs_getxattr_alloc(dentry, xattr_name, &xattr_value, 0, flags); + if (rc < 0) + return rc; + + if ((rc != size) || (memcmp(xattr_value, value, rc) != 0)) + rc = -EINVAL; + else + rc = 0; + kfree(xattr_value); + return rc; +} + ssize_t vfs_getxattr(struct dentry *dentry, const char *name, void *value, size_t size) { @@ -243,8 +302,10 @@ vfs_removexattr(struct dentry *dentry, const char *name) error = inode->i_op->removexattr(dentry, name); mutex_unlock(&inode->i_mutex); - if (!error) + if (!error) { fsnotify_xattr(dentry); + evm_inode_post_removexattr(dentry, name); + } return error; } EXPORT_SYMBOL_GPL(vfs_removexattr); diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h index f7c8f7a9ea6..292eff19803 100644 --- a/fs/xfs/kmem.h +++ b/fs/xfs/kmem.h @@ -61,12 +61,7 @@ extern void kmem_free(const void *); static inline void *kmem_zalloc_large(size_t size) { - void *ptr; - - ptr = vmalloc(size); - if (ptr) - memset(ptr, 0, size); - return ptr; + return vzalloc(size); } static inline void kmem_free_large(void *ptr) { diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index bdd9cb54d63..ce84ffd0264 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -452,7 +452,7 @@ xfs_alloc_read_agfl( if (error) return error; ASSERT(!xfs_buf_geterror(bp)); - XFS_BUF_SET_VTYPE_REF(bp, B_FS_AGFL, XFS_AGFL_REF); + xfs_buf_set_ref(bp, XFS_AGFL_REF); *bpp = bp; return 0; } @@ -2139,7 +2139,7 @@ xfs_read_agf( xfs_trans_brelse(tp, *bpp); return XFS_ERROR(EFSCORRUPTED); } - XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_AGF, XFS_AGF_REF); + xfs_buf_set_ref(*bpp, XFS_AGF_REF); return 0; } diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 8c37dde4c52..11b2aad982d 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -38,40 +38,6 @@ #include <linux/pagevec.h> #include <linux/writeback.h> - -/* - * Prime number of hash buckets since address is used as the key. - */ -#define NVSYNC 37 -#define to_ioend_wq(v) (&xfs_ioend_wq[((unsigned long)v) % NVSYNC]) -static wait_queue_head_t xfs_ioend_wq[NVSYNC]; - -void __init -xfs_ioend_init(void) -{ - int i; - - for (i = 0; i < NVSYNC; i++) - init_waitqueue_head(&xfs_ioend_wq[i]); -} - -void -xfs_ioend_wait( - xfs_inode_t *ip) -{ - wait_queue_head_t *wq = to_ioend_wq(ip); - - wait_event(*wq, (atomic_read(&ip->i_iocount) == 0)); -} - -STATIC void -xfs_ioend_wake( - xfs_inode_t *ip) -{ - if (atomic_dec_and_test(&ip->i_iocount)) - wake_up(to_ioend_wq(ip)); -} - void xfs_count_page_state( struct page *page, @@ -115,25 +81,20 @@ xfs_destroy_ioend( xfs_ioend_t *ioend) { struct buffer_head *bh, *next; - struct xfs_inode *ip = XFS_I(ioend->io_inode); for (bh = ioend->io_buffer_head; bh; bh = next) { next = bh->b_private; bh->b_end_io(bh, !ioend->io_error); } - /* - * Volume managers supporting multiple paths can send back ENODEV - * when the final path disappears. In this case continuing to fill - * the page cache with dirty data which cannot be written out is - * evil, so prevent that. - */ - if (unlikely(ioend->io_error == -ENODEV)) { - xfs_do_force_shutdown(ip->i_mount, SHUTDOWN_DEVICE_REQ, - __FILE__, __LINE__); + if (ioend->io_iocb) { + if (ioend->io_isasync) { + aio_complete(ioend->io_iocb, ioend->io_error ? + ioend->io_error : ioend->io_result, 0); + } + inode_dio_done(ioend->io_inode); } - xfs_ioend_wake(ip); mempool_free(ioend, xfs_ioend_pool); } @@ -156,6 +117,15 @@ xfs_ioend_new_eof( } /* + * Fast and loose check if this write could update the on-disk inode size. + */ +static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend) +{ + return ioend->io_offset + ioend->io_size > + XFS_I(ioend->io_inode)->i_d.di_size; +} + +/* * Update on-disk file size now that data has been written to disk. The * current in-memory file size is i_size. If a write is beyond eof i_new_size * will be the intended file size until i_size is updated. If this write does @@ -173,9 +143,6 @@ xfs_setfilesize( xfs_inode_t *ip = XFS_I(ioend->io_inode); xfs_fsize_t isize; - if (unlikely(ioend->io_error)) - return 0; - if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) return EAGAIN; @@ -192,6 +159,9 @@ xfs_setfilesize( /* * Schedule IO completion handling on the final put of an ioend. + * + * If there is no work to do we might as well call it a day and free the + * ioend right now. */ STATIC void xfs_finish_ioend( @@ -200,8 +170,10 @@ xfs_finish_ioend( if (atomic_dec_and_test(&ioend->io_remaining)) { if (ioend->io_type == IO_UNWRITTEN) queue_work(xfsconvertd_workqueue, &ioend->io_work); - else + else if (xfs_ioend_is_append(ioend)) queue_work(xfsdatad_workqueue, &ioend->io_work); + else + xfs_destroy_ioend(ioend); } } @@ -216,17 +188,24 @@ xfs_end_io( struct xfs_inode *ip = XFS_I(ioend->io_inode); int error = 0; + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { + error = -EIO; + goto done; + } + if (ioend->io_error) + goto done; + /* * For unwritten extents we need to issue transactions to convert a * range to normal written extens after the data I/O has finished. */ - if (ioend->io_type == IO_UNWRITTEN && - likely(!ioend->io_error && !XFS_FORCED_SHUTDOWN(ip->i_mount))) { - + if (ioend->io_type == IO_UNWRITTEN) { error = xfs_iomap_write_unwritten(ip, ioend->io_offset, ioend->io_size); - if (error) - ioend->io_error = error; + if (error) { + ioend->io_error = -error; + goto done; + } } /* @@ -236,6 +215,7 @@ xfs_end_io( error = xfs_setfilesize(ioend); ASSERT(!error || error == EAGAIN); +done: /* * If we didn't complete processing of the ioend, requeue it to the * tail of the workqueue for another attempt later. Otherwise destroy @@ -247,8 +227,6 @@ xfs_end_io( /* ensure we don't spin on blocked ioends */ delay(1); } else { - if (ioend->io_iocb) - aio_complete(ioend->io_iocb, ioend->io_result, 0); xfs_destroy_ioend(ioend); } } @@ -285,13 +263,13 @@ xfs_alloc_ioend( * all the I/O from calling the completion routine too early. */ atomic_set(&ioend->io_remaining, 1); + ioend->io_isasync = 0; ioend->io_error = 0; ioend->io_list = NULL; ioend->io_type = type; ioend->io_inode = inode; ioend->io_buffer_head = NULL; ioend->io_buffer_tail = NULL; - atomic_inc(&XFS_I(ioend->io_inode)->i_iocount); ioend->io_offset = 0; ioend->io_size = 0; ioend->io_iocb = NULL; @@ -337,8 +315,8 @@ xfs_map_blocks( count = mp->m_maxioffset - offset; end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); offset_fsb = XFS_B_TO_FSBT(mp, offset); - error = xfs_bmapi(NULL, ip, offset_fsb, end_fsb - offset_fsb, - bmapi_flags, NULL, 0, imap, &nimaps, NULL); + error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, + imap, &nimaps, bmapi_flags); xfs_iunlock(ip, XFS_ILOCK_SHARED); if (error) @@ -551,7 +529,6 @@ xfs_cancel_ioend( unlock_buffer(bh); } while ((bh = next_bh) != NULL); - xfs_ioend_wake(XFS_I(ioend->io_inode)); mempool_free(ioend, xfs_ioend_pool); } while ((ioend = next) != NULL); } @@ -1161,8 +1138,8 @@ __xfs_get_blocks( end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size); offset_fsb = XFS_B_TO_FSBT(mp, offset); - error = xfs_bmapi(NULL, ip, offset_fsb, end_fsb - offset_fsb, - XFS_BMAPI_ENTIRE, NULL, 0, &imap, &nimaps, NULL); + error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, + &imap, &nimaps, XFS_BMAPI_ENTIRE); if (error) goto out_unlock; @@ -1300,7 +1277,6 @@ xfs_end_io_direct_write( bool is_async) { struct xfs_ioend *ioend = iocb->private; - struct inode *inode = ioend->io_inode; /* * blockdev_direct_IO can return an error even after the I/O @@ -1311,28 +1287,17 @@ xfs_end_io_direct_write( ioend->io_offset = offset; ioend->io_size = size; + ioend->io_iocb = iocb; + ioend->io_result = ret; if (private && size > 0) ioend->io_type = IO_UNWRITTEN; if (is_async) { - /* - * If we are converting an unwritten extent we need to delay - * the AIO completion until after the unwrittent extent - * conversion has completed, otherwise do it ASAP. - */ - if (ioend->io_type == IO_UNWRITTEN) { - ioend->io_iocb = iocb; - ioend->io_result = ret; - } else { - aio_complete(iocb, ret, 0); - } + ioend->io_isasync = 1; xfs_finish_ioend(ioend); } else { xfs_finish_ioend_sync(ioend); } - - /* XXX: probably should move into the real I/O completion handler */ - inode_dio_done(inode); } STATIC ssize_t diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index 71f721e1a71..116dd5c3703 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -47,6 +47,7 @@ typedef struct xfs_ioend { unsigned int io_type; /* delalloc / unwritten */ int io_error; /* I/O error code */ atomic_t io_remaining; /* hold count */ + unsigned int io_isasync : 1; /* needs aio_complete */ struct inode *io_inode; /* file being written to */ struct buffer_head *io_buffer_head;/* buffer linked list head */ struct buffer_head *io_buffer_tail;/* buffer linked list tail */ @@ -60,9 +61,6 @@ typedef struct xfs_ioend { extern const struct address_space_operations xfs_address_space_operations; extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int); -extern void xfs_ioend_init(void); -extern void xfs_ioend_wait(struct xfs_inode *); - extern void xfs_count_page_state(struct page *, int *, int *); #endif /* __XFS_AOPS_H__ */ diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index 160bcdc34a6..1e5d97f86ea 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -319,7 +319,7 @@ xfs_attr_set_int( return (error); } - xfs_trans_ijoin(args.trans, dp); + xfs_trans_ijoin(args.trans, dp, 0); /* * If the attribute list is non-existent or a shortform list, @@ -389,7 +389,7 @@ xfs_attr_set_int( * a new one. We need the inode to be in all transactions. */ if (committed) - xfs_trans_ijoin(args.trans, dp); + xfs_trans_ijoin(args.trans, dp, 0); /* * Commit the leaf transformation. We'll need another (linked) @@ -537,7 +537,7 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags) * No need to make quota reservations here. We expect to release some * blocks not allocate in the common case. */ - xfs_trans_ijoin(args.trans, dp); + xfs_trans_ijoin(args.trans, dp, 0); /* * Decide on what work routines to call based on the inode size. @@ -809,7 +809,7 @@ xfs_attr_inactive(xfs_inode_t *dp) * No need to make quota reservations here. We expect to release some * blocks, not allocate, in the common case. */ - xfs_trans_ijoin(trans, dp); + xfs_trans_ijoin(trans, dp, 0); /* * Decide on what work routines to call based on the inode size. @@ -823,18 +823,6 @@ xfs_attr_inactive(xfs_inode_t *dp) if (error) goto out; - /* - * Signal synchronous inactive transactions unless this is a - * synchronous mount filesystem in which case we know that we're here - * because we've been called out of xfs_inactive which means that the - * last reference is gone and the unlink transaction has already hit - * the disk so async inactive transactions are safe. - */ - if (!(mp->m_flags & XFS_MOUNT_WSYNC)) { - if (dp->i_d.di_anextents > 0) - xfs_trans_set_sync(trans); - } - error = xfs_itruncate_extents(&trans, dp, XFS_ATTR_FORK, 0); if (error) goto out; @@ -973,7 +961,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) * a new one. We need the inode to be in all transactions. */ if (committed) - xfs_trans_ijoin(args->trans, dp); + xfs_trans_ijoin(args->trans, dp, 0); /* * Commit the current trans (including the inode) and start @@ -1075,7 +1063,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) * in all transactions. */ if (committed) - xfs_trans_ijoin(args->trans, dp); + xfs_trans_ijoin(args->trans, dp, 0); } else xfs_da_buf_done(bp); @@ -1149,7 +1137,7 @@ xfs_attr_leaf_removename(xfs_da_args_t *args) * a new one. We need the inode to be in all transactions. */ if (committed) - xfs_trans_ijoin(args->trans, dp); + xfs_trans_ijoin(args->trans, dp, 0); } else xfs_da_buf_done(bp); return(0); @@ -1303,7 +1291,7 @@ restart: * in all transactions. */ if (committed) - xfs_trans_ijoin(args->trans, dp); + xfs_trans_ijoin(args->trans, dp, 0); /* * Commit the node conversion and start the next @@ -1340,7 +1328,7 @@ restart: * a new one. We need the inode to be in all transactions. */ if (committed) - xfs_trans_ijoin(args->trans, dp); + xfs_trans_ijoin(args->trans, dp, 0); } else { /* * Addition succeeded, update Btree hashvals. @@ -1452,7 +1440,7 @@ restart: * in all transactions. */ if (committed) - xfs_trans_ijoin(args->trans, dp); + xfs_trans_ijoin(args->trans, dp, 0); } /* @@ -1584,7 +1572,7 @@ xfs_attr_node_removename(xfs_da_args_t *args) * a new one. We need the inode to be in all transactions. */ if (committed) - xfs_trans_ijoin(args->trans, dp); + xfs_trans_ijoin(args->trans, dp, 0); /* * Commit the Btree join operation and start a new trans. @@ -1635,7 +1623,7 @@ xfs_attr_node_removename(xfs_da_args_t *args) * in all transactions. */ if (committed) - xfs_trans_ijoin(args->trans, dp); + xfs_trans_ijoin(args->trans, dp, 0); } else xfs_da_brelse(args->trans, bp); } @@ -1975,10 +1963,9 @@ xfs_attr_rmtval_get(xfs_da_args_t *args) lblkno = args->rmtblkno; while (valuelen > 0) { nmap = ATTR_RMTVALUE_MAPSIZE; - error = xfs_bmapi(args->trans, args->dp, (xfs_fileoff_t)lblkno, - args->rmtblkcnt, - XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, - NULL, 0, map, &nmap, NULL); + error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno, + args->rmtblkcnt, map, &nmap, + XFS_BMAPI_ATTRFORK); if (error) return(error); ASSERT(nmap >= 1); @@ -2052,10 +2039,9 @@ xfs_attr_rmtval_set(xfs_da_args_t *args) */ xfs_bmap_init(args->flist, args->firstblock); nmap = 1; - error = xfs_bmapi(args->trans, dp, (xfs_fileoff_t)lblkno, + error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno, blkcnt, - XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA | - XFS_BMAPI_WRITE, + XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, args->firstblock, args->total, &map, &nmap, args->flist); if (!error) { @@ -2074,7 +2060,7 @@ xfs_attr_rmtval_set(xfs_da_args_t *args) * a new one. We need the inode to be in all transactions. */ if (committed) - xfs_trans_ijoin(args->trans, dp); + xfs_trans_ijoin(args->trans, dp, 0); ASSERT(nmap == 1); ASSERT((map.br_startblock != DELAYSTARTBLOCK) && @@ -2104,14 +2090,11 @@ xfs_attr_rmtval_set(xfs_da_args_t *args) */ xfs_bmap_init(args->flist, args->firstblock); nmap = 1; - error = xfs_bmapi(NULL, dp, (xfs_fileoff_t)lblkno, - args->rmtblkcnt, - XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, - args->firstblock, 0, &map, &nmap, - NULL); - if (error) { + error = xfs_bmapi_read(dp, (xfs_fileoff_t)lblkno, + args->rmtblkcnt, &map, &nmap, + XFS_BMAPI_ATTRFORK); + if (error) return(error); - } ASSERT(nmap == 1); ASSERT((map.br_startblock != DELAYSTARTBLOCK) && (map.br_startblock != HOLESTARTBLOCK)); @@ -2121,16 +2104,17 @@ xfs_attr_rmtval_set(xfs_da_args_t *args) bp = xfs_buf_get(mp->m_ddev_targp, dblkno, blkcnt, XBF_LOCK | XBF_DONT_BLOCK); - ASSERT(!xfs_buf_geterror(bp)); - + if (!bp) + return ENOMEM; tmp = (valuelen < XFS_BUF_SIZE(bp)) ? valuelen : XFS_BUF_SIZE(bp); xfs_buf_iomove(bp, 0, tmp, src, XBRW_WRITE); if (tmp < XFS_BUF_SIZE(bp)) xfs_buf_zero(bp, tmp, XFS_BUF_SIZE(bp) - tmp); - if ((error = xfs_bwrite(mp, bp))) {/* GROT: NOTE: synchronous write */ - return (error); - } + error = xfs_bwrite(bp); /* GROT: NOTE: synchronous write */ + xfs_buf_relse(bp); + if (error) + return error; src += tmp; valuelen -= tmp; @@ -2166,16 +2150,12 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args) /* * Try to remember where we decided to put the value. */ - xfs_bmap_init(args->flist, args->firstblock); nmap = 1; - error = xfs_bmapi(NULL, args->dp, (xfs_fileoff_t)lblkno, - args->rmtblkcnt, - XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, - args->firstblock, 0, &map, &nmap, - args->flist); - if (error) { + error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno, + args->rmtblkcnt, &map, &nmap, + XFS_BMAPI_ATTRFORK); + if (error) return(error); - } ASSERT(nmap == 1); ASSERT((map.br_startblock != DELAYSTARTBLOCK) && (map.br_startblock != HOLESTARTBLOCK)); @@ -2188,8 +2168,7 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args) */ bp = xfs_incore(mp->m_ddev_targp, dblkno, blkcnt, XBF_TRYLOCK); if (bp) { - XFS_BUF_STALE(bp); - XFS_BUF_UNDELAYWRITE(bp); + xfs_buf_stale(bp); xfs_buf_relse(bp); bp = NULL; } @@ -2227,7 +2206,7 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args) * a new one. We need the inode to be in all transactions. */ if (committed) - xfs_trans_ijoin(args->trans, args->dp); + xfs_trans_ijoin(args->trans, args->dp, 0); /* * Close out trans and start the next one in the chain. diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index 8fad9602542..d4906e7c978 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -2926,9 +2926,8 @@ xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp, * Try to remember where we decided to put the value. */ nmap = 1; - error = xfs_bmapi(*trans, dp, (xfs_fileoff_t)tblkno, tblkcnt, - XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, - NULL, 0, &map, &nmap, NULL); + error = xfs_bmapi_read(dp, (xfs_fileoff_t)tblkno, tblkcnt, + &map, &nmap, XFS_BMAPI_ATTRFORK); if (error) { return(error); } @@ -2948,6 +2947,8 @@ xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp, bp = xfs_trans_get_buf(*trans, dp->i_mount->m_ddev_targp, dblkno, dblkcnt, XBF_LOCK); + if (!bp) + return ENOMEM; xfs_trans_binval(*trans, bp); /* * Roll to next transaction. diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 452a291383a..c68baeb0974 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -50,17 +50,22 @@ #include "xfs_trace.h" -#ifdef DEBUG -STATIC void -xfs_bmap_check_leaf_extents(xfs_btree_cur_t *cur, xfs_inode_t *ip, int whichfork); -#endif - kmem_zone_t *xfs_bmap_free_item_zone; /* * Prototypes for internal bmap routines. */ +#ifdef DEBUG +STATIC void +xfs_bmap_check_leaf_extents( + struct xfs_btree_cur *cur, + struct xfs_inode *ip, + int whichfork); +#else +#define xfs_bmap_check_leaf_extents(cur, ip, whichfork) do { } while (0) +#endif + /* * Called from xfs_bmap_add_attrfork to handle extents format files. @@ -85,58 +90,6 @@ xfs_bmap_add_attrfork_local( int *flags); /* inode logging flags */ /* - * Called by xfs_bmap_add_extent to handle cases converting a delayed - * allocation to a real allocation. - */ -STATIC int /* error */ -xfs_bmap_add_extent_delay_real( - struct xfs_trans *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t *idx, /* extent number to update/insert */ - xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ - xfs_bmbt_irec_t *new, /* new data to add to file extents */ - xfs_filblks_t *dnew, /* new delayed-alloc indirect blocks */ - xfs_fsblock_t *first, /* pointer to firstblock variable */ - xfs_bmap_free_t *flist, /* list of extents to be freed */ - int *logflagsp); /* inode logging flags */ - -/* - * Called by xfs_bmap_add_extent to handle cases converting a hole - * to a delayed allocation. - */ -STATIC int /* error */ -xfs_bmap_add_extent_hole_delay( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t *idx, /* extent number to update/insert */ - xfs_bmbt_irec_t *new, /* new data to add to file extents */ - int *logflagsp); /* inode logging flags */ - -/* - * Called by xfs_bmap_add_extent to handle cases converting a hole - * to a real allocation. - */ -STATIC int /* error */ -xfs_bmap_add_extent_hole_real( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t *idx, /* extent number to update/insert */ - xfs_btree_cur_t *cur, /* if null, not a btree */ - xfs_bmbt_irec_t *new, /* new data to add to file extents */ - int *logflagsp, /* inode logging flags */ - int whichfork); /* data or attr fork */ - -/* - * Called by xfs_bmap_add_extent to handle cases converting an unwritten - * allocation to a real allocation or vice versa. - */ -STATIC int /* error */ -xfs_bmap_add_extent_unwritten_real( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t *idx, /* extent number to update/insert */ - xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ - xfs_bmbt_irec_t *new, /* new data to add to file extents */ - int *logflagsp); /* inode logging flags */ - -/* * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file. * It figures out where to ask the underlying allocator to put the new extent. */ @@ -215,19 +168,6 @@ xfs_bmap_search_extents( xfs_bmbt_irec_t *prevp); /* out: previous extent entry found */ /* - * Check the last inode extent to determine whether this allocation will result - * in blocks being allocated at the end of the file. When we allocate new data - * blocks at the end of the file which do not start at the previous data block, - * we will try to align the new blocks at stripe unit boundaries. - */ -STATIC int /* error */ -xfs_bmap_isaeof( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_fileoff_t off, /* file offset in fsblocks */ - int whichfork, /* data or attribute fork */ - char *aeof); /* return value */ - -/* * Compute the worst-case number of indirect blocks that will be used * for ip's delayed extent of length "len". */ @@ -431,188 +371,13 @@ xfs_bmap_add_attrfork_local( } /* - * Called by xfs_bmapi to update file extent records and the btree - * after allocating space (or doing a delayed allocation). - */ -STATIC int /* error */ -xfs_bmap_add_extent( - struct xfs_trans *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t *idx, /* extent number to update/insert */ - xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ - xfs_bmbt_irec_t *new, /* new data to add to file extents */ - xfs_fsblock_t *first, /* pointer to firstblock variable */ - xfs_bmap_free_t *flist, /* list of extents to be freed */ - int *logflagsp, /* inode logging flags */ - int whichfork) /* data or attr fork */ -{ - xfs_btree_cur_t *cur; /* btree cursor or null */ - xfs_filblks_t da_new; /* new count del alloc blocks used */ - xfs_filblks_t da_old; /* old count del alloc blocks used */ - int error; /* error return value */ - xfs_ifork_t *ifp; /* inode fork ptr */ - int logflags; /* returned value */ - xfs_extnum_t nextents; /* number of extents in file now */ - - XFS_STATS_INC(xs_add_exlist); - - cur = *curp; - ifp = XFS_IFORK_PTR(ip, whichfork); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - da_old = da_new = 0; - error = 0; - - ASSERT(*idx >= 0); - ASSERT(*idx <= nextents); - - /* - * This is the first extent added to a new/empty file. - * Special case this one, so other routines get to assume there are - * already extents in the list. - */ - if (nextents == 0) { - xfs_iext_insert(ip, *idx, 1, new, - whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0); - - ASSERT(cur == NULL); - - if (!isnullstartblock(new->br_startblock)) { - XFS_IFORK_NEXT_SET(ip, whichfork, 1); - logflags = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); - } else - logflags = 0; - } - /* - * Any kind of new delayed allocation goes here. - */ - else if (isnullstartblock(new->br_startblock)) { - if (cur) - ASSERT((cur->bc_private.b.flags & - XFS_BTCUR_BPRV_WASDEL) == 0); - error = xfs_bmap_add_extent_hole_delay(ip, idx, new, - &logflags); - } - /* - * Real allocation off the end of the file. - */ - else if (*idx == nextents) { - if (cur) - ASSERT((cur->bc_private.b.flags & - XFS_BTCUR_BPRV_WASDEL) == 0); - error = xfs_bmap_add_extent_hole_real(ip, idx, cur, new, - &logflags, whichfork); - } else { - xfs_bmbt_irec_t prev; /* old extent at offset idx */ - - /* - * Get the record referred to by idx. - */ - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &prev); - /* - * If it's a real allocation record, and the new allocation ends - * after the start of the referred to record, then we're filling - * in a delayed or unwritten allocation with a real one, or - * converting real back to unwritten. - */ - if (!isnullstartblock(new->br_startblock) && - new->br_startoff + new->br_blockcount > prev.br_startoff) { - if (prev.br_state != XFS_EXT_UNWRITTEN && - isnullstartblock(prev.br_startblock)) { - da_old = startblockval(prev.br_startblock); - if (cur) - ASSERT(cur->bc_private.b.flags & - XFS_BTCUR_BPRV_WASDEL); - error = xfs_bmap_add_extent_delay_real(tp, ip, - idx, &cur, new, &da_new, - first, flist, &logflags); - } else { - ASSERT(new->br_state == XFS_EXT_NORM || - new->br_state == XFS_EXT_UNWRITTEN); - - error = xfs_bmap_add_extent_unwritten_real(ip, - idx, &cur, new, &logflags); - if (error) - goto done; - } - } - /* - * Otherwise we're filling in a hole with an allocation. - */ - else { - if (cur) - ASSERT((cur->bc_private.b.flags & - XFS_BTCUR_BPRV_WASDEL) == 0); - error = xfs_bmap_add_extent_hole_real(ip, idx, cur, - new, &logflags, whichfork); - } - } - - if (error) - goto done; - ASSERT(*curp == cur || *curp == NULL); - - /* - * Convert to a btree if necessary. - */ - if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max) { - int tmp_logflags; /* partial log flag return val */ - - ASSERT(cur == NULL); - error = xfs_bmap_extents_to_btree(tp, ip, first, - flist, &cur, da_old > 0, &tmp_logflags, whichfork); - logflags |= tmp_logflags; - if (error) - goto done; - } - /* - * Adjust for changes in reserved delayed indirect blocks. - * Nothing to do for disk quotas here. - */ - if (da_old || da_new) { - xfs_filblks_t nblks; - - nblks = da_new; - if (cur) - nblks += cur->bc_private.b.allocated; - ASSERT(nblks <= da_old); - if (nblks < da_old) - xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS, - (int64_t)(da_old - nblks), 0); - } - /* - * Clear out the allocated field, done with it now in any case. - */ - if (cur) { - cur->bc_private.b.allocated = 0; - *curp = cur; - } -done: -#ifdef DEBUG - if (!error) - xfs_bmap_check_leaf_extents(*curp, ip, whichfork); -#endif - *logflagsp = logflags; - return error; -} - -/* - * Called by xfs_bmap_add_extent to handle cases converting a delayed - * allocation to a real allocation. + * Convert a delayed allocation to a real allocation. */ STATIC int /* error */ xfs_bmap_add_extent_delay_real( - struct xfs_trans *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t *idx, /* extent number to update/insert */ - xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ - xfs_bmbt_irec_t *new, /* new data to add to file extents */ - xfs_filblks_t *dnew, /* new delayed-alloc indirect blocks */ - xfs_fsblock_t *first, /* pointer to firstblock variable */ - xfs_bmap_free_t *flist, /* list of extents to be freed */ - int *logflagsp) /* inode logging flags */ + struct xfs_bmalloca *bma) { - xfs_btree_cur_t *cur; /* btree cursor */ + struct xfs_bmbt_irec *new = &bma->got; int diff; /* temp value */ xfs_bmbt_rec_host_t *ep; /* extent entry for idx */ int error; /* error return value */ @@ -623,10 +388,22 @@ xfs_bmap_add_extent_delay_real( /* left is 0, right is 1, prev is 2 */ int rval=0; /* return value (logging flags) */ int state = 0;/* state bits, accessed thru macros */ - xfs_filblks_t temp=0; /* value for dnew calculations */ - xfs_filblks_t temp2=0;/* value for dnew calculations */ + xfs_filblks_t da_new; /* new count del alloc blocks used */ + xfs_filblks_t da_old; /* old count del alloc blocks used */ + xfs_filblks_t temp=0; /* value for da_new calculations */ + xfs_filblks_t temp2=0;/* value for da_new calculations */ int tmp_rval; /* partial logging flags */ + ifp = XFS_IFORK_PTR(bma->ip, XFS_DATA_FORK); + + ASSERT(bma->idx >= 0); + ASSERT(bma->idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); + ASSERT(!isnullstartblock(new->br_startblock)); + ASSERT(!bma->cur || + (bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL)); + + XFS_STATS_INC(xs_add_exlist); + #define LEFT r[0] #define RIGHT r[1] #define PREV r[2] @@ -634,14 +411,15 @@ xfs_bmap_add_extent_delay_real( /* * Set up a bunch of variables to make the tests simpler. */ - cur = *curp; - ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); - ep = xfs_iext_get_ext(ifp, *idx); + ep = xfs_iext_get_ext(ifp, bma->idx); xfs_bmbt_get_all(ep, &PREV); new_endoff = new->br_startoff + new->br_blockcount; ASSERT(PREV.br_startoff <= new->br_startoff); ASSERT(PREV.br_startoff + PREV.br_blockcount >= new_endoff); + da_old = startblockval(PREV.br_startblock); + da_new = 0; + /* * Set flags determining what part of the previous delayed allocation * extent is being replaced by a real allocation. @@ -655,9 +433,9 @@ xfs_bmap_add_extent_delay_real( * Check and set flags if this segment has a left neighbor. * Don't set contiguous if the combined extent would be too large. */ - if (*idx > 0) { + if (bma->idx > 0) { state |= BMAP_LEFT_VALID; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &LEFT); + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1), &LEFT); if (isnullstartblock(LEFT.br_startblock)) state |= BMAP_LEFT_DELAY; @@ -675,9 +453,9 @@ xfs_bmap_add_extent_delay_real( * Don't set contiguous if the combined extent would be too large. * Also check for all-three-contiguous being too large. */ - if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) { + if (bma->idx < bma->ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) { state |= BMAP_RIGHT_VALID; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx + 1), &RIGHT); + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx + 1), &RIGHT); if (isnullstartblock(RIGHT.br_startblock)) state |= BMAP_RIGHT_DELAY; @@ -708,38 +486,41 @@ xfs_bmap_add_extent_delay_real( * Filling in all of a previously delayed allocation extent. * The left and right neighbors are both contiguous with new. */ - --*idx; - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), + bma->idx--; + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx), LEFT.br_blockcount + PREV.br_blockcount + RIGHT.br_blockcount); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); - xfs_iext_remove(ip, *idx + 1, 2, state); - ip->i_d.di_nextents--; - if (cur == NULL) + xfs_iext_remove(bma->ip, bma->idx + 1, 2, state); + bma->ip->i_d.di_nextents--; + if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { rval = XFS_ILOG_CORE; - if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff, + error = xfs_bmbt_lookup_eq(bma->cur, RIGHT.br_startoff, RIGHT.br_startblock, - RIGHT.br_blockcount, &i))) + RIGHT.br_blockcount, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); - if ((error = xfs_btree_delete(cur, &i))) + error = xfs_btree_delete(bma->cur, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); - if ((error = xfs_btree_decrement(cur, 0, &i))) + error = xfs_btree_decrement(bma->cur, 0, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); - if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, + error = xfs_bmbt_update(bma->cur, LEFT.br_startoff, LEFT.br_startblock, LEFT.br_blockcount + PREV.br_blockcount + - RIGHT.br_blockcount, LEFT.br_state))) + RIGHT.br_blockcount, LEFT.br_state); + if (error) goto done; } - *dnew = 0; break; case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG: @@ -747,30 +528,31 @@ xfs_bmap_add_extent_delay_real( * Filling in all of a previously delayed allocation extent. * The left neighbor is contiguous, the right is not. */ - --*idx; + bma->idx--; - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx), LEFT.br_blockcount + PREV.br_blockcount); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); - xfs_iext_remove(ip, *idx + 1, 1, state); - if (cur == NULL) + xfs_iext_remove(bma->ip, bma->idx + 1, 1, state); + if (bma->cur == NULL) rval = XFS_ILOG_DEXT; else { rval = 0; - if ((error = xfs_bmbt_lookup_eq(cur, LEFT.br_startoff, + error = xfs_bmbt_lookup_eq(bma->cur, LEFT.br_startoff, LEFT.br_startblock, LEFT.br_blockcount, - &i))) + &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); - if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, + error = xfs_bmbt_update(bma->cur, LEFT.br_startoff, LEFT.br_startblock, LEFT.br_blockcount + - PREV.br_blockcount, LEFT.br_state))) + PREV.br_blockcount, LEFT.br_state); + if (error) goto done; } - *dnew = 0; break; case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: @@ -778,30 +560,30 @@ xfs_bmap_add_extent_delay_real( * Filling in all of a previously delayed allocation extent. * The right neighbor is contiguous, the left is not. */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); xfs_bmbt_set_startblock(ep, new->br_startblock); xfs_bmbt_set_blockcount(ep, PREV.br_blockcount + RIGHT.br_blockcount); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); - xfs_iext_remove(ip, *idx + 1, 1, state); - if (cur == NULL) + xfs_iext_remove(bma->ip, bma->idx + 1, 1, state); + if (bma->cur == NULL) rval = XFS_ILOG_DEXT; else { rval = 0; - if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff, + error = xfs_bmbt_lookup_eq(bma->cur, RIGHT.br_startoff, RIGHT.br_startblock, - RIGHT.br_blockcount, &i))) + RIGHT.br_blockcount, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); - if ((error = xfs_bmbt_update(cur, PREV.br_startoff, + error = xfs_bmbt_update(bma->cur, PREV.br_startoff, new->br_startblock, PREV.br_blockcount + - RIGHT.br_blockcount, PREV.br_state))) + RIGHT.br_blockcount, PREV.br_state); + if (error) goto done; } - - *dnew = 0; break; case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING: @@ -810,27 +592,27 @@ xfs_bmap_add_extent_delay_real( * Neither the left nor right neighbors are contiguous with * the new one. */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); xfs_bmbt_set_startblock(ep, new->br_startblock); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); - ip->i_d.di_nextents++; - if (cur == NULL) + bma->ip->i_d.di_nextents++; + if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { rval = XFS_ILOG_CORE; - if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, + error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff, new->br_startblock, new->br_blockcount, - &i))) + &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 0, done); - cur->bc_rec.b.br_state = XFS_EXT_NORM; - if ((error = xfs_btree_insert(cur, &i))) + bma->cur->bc_rec.b.br_state = XFS_EXT_NORM; + error = xfs_btree_insert(bma->cur, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); } - - *dnew = 0; break; case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG: @@ -838,39 +620,40 @@ xfs_bmap_add_extent_delay_real( * Filling in the first part of a previous delayed allocation. * The left neighbor is contiguous. */ - trace_xfs_bmap_pre_update(ip, *idx - 1, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx - 1), + trace_xfs_bmap_pre_update(bma->ip, bma->idx - 1, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx - 1), LEFT.br_blockcount + new->br_blockcount); xfs_bmbt_set_startoff(ep, PREV.br_startoff + new->br_blockcount); - trace_xfs_bmap_post_update(ip, *idx - 1, state, _THIS_IP_); + trace_xfs_bmap_post_update(bma->ip, bma->idx - 1, state, _THIS_IP_); temp = PREV.br_blockcount - new->br_blockcount; - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); xfs_bmbt_set_blockcount(ep, temp); - if (cur == NULL) + if (bma->cur == NULL) rval = XFS_ILOG_DEXT; else { rval = 0; - if ((error = xfs_bmbt_lookup_eq(cur, LEFT.br_startoff, + error = xfs_bmbt_lookup_eq(bma->cur, LEFT.br_startoff, LEFT.br_startblock, LEFT.br_blockcount, - &i))) + &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); - if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, + error = xfs_bmbt_update(bma->cur, LEFT.br_startoff, LEFT.br_startblock, LEFT.br_blockcount + new->br_blockcount, - LEFT.br_state))) + LEFT.br_state); + if (error) goto done; } - temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp), startblockval(PREV.br_startblock)); - xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_startblock(ep, nullstartblock(da_new)); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); - --*idx; - *dnew = temp; + bma->idx--; break; case BMAP_LEFT_FILLING: @@ -878,43 +661,43 @@ xfs_bmap_add_extent_delay_real( * Filling in the first part of a previous delayed allocation. * The left neighbor is not contiguous. */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); xfs_bmbt_set_startoff(ep, new_endoff); temp = PREV.br_blockcount - new->br_blockcount; xfs_bmbt_set_blockcount(ep, temp); - xfs_iext_insert(ip, *idx, 1, new, state); - ip->i_d.di_nextents++; - if (cur == NULL) + xfs_iext_insert(bma->ip, bma->idx, 1, new, state); + bma->ip->i_d.di_nextents++; + if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { rval = XFS_ILOG_CORE; - if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, + error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff, new->br_startblock, new->br_blockcount, - &i))) + &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 0, done); - cur->bc_rec.b.br_state = XFS_EXT_NORM; - if ((error = xfs_btree_insert(cur, &i))) + bma->cur->bc_rec.b.br_state = XFS_EXT_NORM; + error = xfs_btree_insert(bma->cur, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); } - if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && - ip->i_d.di_nextents > ip->i_df.if_ext_max) { - error = xfs_bmap_extents_to_btree(tp, ip, - first, flist, &cur, 1, &tmp_rval, - XFS_DATA_FORK); + if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && + bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) { + error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, + bma->firstblock, bma->flist, + &bma->cur, 1, &tmp_rval, XFS_DATA_FORK); rval |= tmp_rval; if (error) goto done; } - temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp), startblockval(PREV.br_startblock) - - (cur ? cur->bc_private.b.allocated : 0)); - ep = xfs_iext_get_ext(ifp, *idx + 1); - xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); - trace_xfs_bmap_post_update(ip, *idx + 1, state, _THIS_IP_); - - *dnew = temp; + (bma->cur ? bma->cur->bc_private.b.allocated : 0)); + ep = xfs_iext_get_ext(ifp, bma->idx + 1); + xfs_bmbt_set_startblock(ep, nullstartblock(da_new)); + trace_xfs_bmap_post_update(bma->ip, bma->idx + 1, state, _THIS_IP_); break; case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: @@ -923,38 +706,39 @@ xfs_bmap_add_extent_delay_real( * The right neighbor is contiguous with the new allocation. */ temp = PREV.br_blockcount - new->br_blockcount; - trace_xfs_bmap_pre_update(ip, *idx + 1, state, _THIS_IP_); + trace_xfs_bmap_pre_update(bma->ip, bma->idx + 1, state, _THIS_IP_); xfs_bmbt_set_blockcount(ep, temp); - xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx + 1), + xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, bma->idx + 1), new->br_startoff, new->br_startblock, new->br_blockcount + RIGHT.br_blockcount, RIGHT.br_state); - trace_xfs_bmap_post_update(ip, *idx + 1, state, _THIS_IP_); - if (cur == NULL) + trace_xfs_bmap_post_update(bma->ip, bma->idx + 1, state, _THIS_IP_); + if (bma->cur == NULL) rval = XFS_ILOG_DEXT; else { rval = 0; - if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff, + error = xfs_bmbt_lookup_eq(bma->cur, RIGHT.br_startoff, RIGHT.br_startblock, - RIGHT.br_blockcount, &i))) + RIGHT.br_blockcount, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); - if ((error = xfs_bmbt_update(cur, new->br_startoff, + error = xfs_bmbt_update(bma->cur, new->br_startoff, new->br_startblock, new->br_blockcount + RIGHT.br_blockcount, - RIGHT.br_state))) + RIGHT.br_state); + if (error) goto done; } - temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp), startblockval(PREV.br_startblock)); - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); + xfs_bmbt_set_startblock(ep, nullstartblock(da_new)); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); - ++*idx; - *dnew = temp; + bma->idx++; break; case BMAP_RIGHT_FILLING: @@ -963,42 +747,43 @@ xfs_bmap_add_extent_delay_real( * The right neighbor is not contiguous. */ temp = PREV.br_blockcount - new->br_blockcount; - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); xfs_bmbt_set_blockcount(ep, temp); - xfs_iext_insert(ip, *idx + 1, 1, new, state); - ip->i_d.di_nextents++; - if (cur == NULL) + xfs_iext_insert(bma->ip, bma->idx + 1, 1, new, state); + bma->ip->i_d.di_nextents++; + if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { rval = XFS_ILOG_CORE; - if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, + error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff, new->br_startblock, new->br_blockcount, - &i))) + &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 0, done); - cur->bc_rec.b.br_state = XFS_EXT_NORM; - if ((error = xfs_btree_insert(cur, &i))) + bma->cur->bc_rec.b.br_state = XFS_EXT_NORM; + error = xfs_btree_insert(bma->cur, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); } - if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && - ip->i_d.di_nextents > ip->i_df.if_ext_max) { - error = xfs_bmap_extents_to_btree(tp, ip, - first, flist, &cur, 1, &tmp_rval, - XFS_DATA_FORK); + if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && + bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) { + error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, + bma->firstblock, bma->flist, &bma->cur, 1, + &tmp_rval, XFS_DATA_FORK); rval |= tmp_rval; if (error) goto done; } - temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp), startblockval(PREV.br_startblock) - - (cur ? cur->bc_private.b.allocated : 0)); - ep = xfs_iext_get_ext(ifp, *idx); - xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + (bma->cur ? bma->cur->bc_private.b.allocated : 0)); + ep = xfs_iext_get_ext(ifp, bma->idx); + xfs_bmbt_set_startblock(ep, nullstartblock(da_new)); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); - ++*idx; - *dnew = temp; + bma->idx++; break; case 0: @@ -1024,82 +809,65 @@ xfs_bmap_add_extent_delay_real( */ temp = new->br_startoff - PREV.br_startoff; temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff; - trace_xfs_bmap_pre_update(ip, *idx, 0, _THIS_IP_); + trace_xfs_bmap_pre_update(bma->ip, bma->idx, 0, _THIS_IP_); xfs_bmbt_set_blockcount(ep, temp); /* truncate PREV */ LEFT = *new; RIGHT.br_state = PREV.br_state; RIGHT.br_startblock = nullstartblock( - (int)xfs_bmap_worst_indlen(ip, temp2)); + (int)xfs_bmap_worst_indlen(bma->ip, temp2)); RIGHT.br_startoff = new_endoff; RIGHT.br_blockcount = temp2; /* insert LEFT (r[0]) and RIGHT (r[1]) at the same time */ - xfs_iext_insert(ip, *idx + 1, 2, &LEFT, state); - ip->i_d.di_nextents++; - if (cur == NULL) + xfs_iext_insert(bma->ip, bma->idx + 1, 2, &LEFT, state); + bma->ip->i_d.di_nextents++; + if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { rval = XFS_ILOG_CORE; - if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, + error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff, new->br_startblock, new->br_blockcount, - &i))) + &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 0, done); - cur->bc_rec.b.br_state = XFS_EXT_NORM; - if ((error = xfs_btree_insert(cur, &i))) + bma->cur->bc_rec.b.br_state = XFS_EXT_NORM; + error = xfs_btree_insert(bma->cur, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); } - if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && - ip->i_d.di_nextents > ip->i_df.if_ext_max) { - error = xfs_bmap_extents_to_btree(tp, ip, - first, flist, &cur, 1, &tmp_rval, - XFS_DATA_FORK); + if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && + bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) { + error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, + bma->firstblock, bma->flist, &bma->cur, + 1, &tmp_rval, XFS_DATA_FORK); rval |= tmp_rval; if (error) goto done; } - temp = xfs_bmap_worst_indlen(ip, temp); - temp2 = xfs_bmap_worst_indlen(ip, temp2); + temp = xfs_bmap_worst_indlen(bma->ip, temp); + temp2 = xfs_bmap_worst_indlen(bma->ip, temp2); diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) - - (cur ? cur->bc_private.b.allocated : 0)); - if (diff > 0 && - xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS, - -((int64_t)diff), 0)) { - /* - * Ick gross gag me with a spoon. - */ - ASSERT(0); /* want to see if this ever happens! */ - while (diff > 0) { - if (temp) { - temp--; - diff--; - if (!diff || - !xfs_icsb_modify_counters(ip->i_mount, - XFS_SBS_FDBLOCKS, - -((int64_t)diff), 0)) - break; - } - if (temp2) { - temp2--; - diff--; - if (!diff || - !xfs_icsb_modify_counters(ip->i_mount, - XFS_SBS_FDBLOCKS, - -((int64_t)diff), 0)) - break; - } - } + (bma->cur ? bma->cur->bc_private.b.allocated : 0)); + if (diff > 0) { + error = xfs_icsb_modify_counters(bma->ip->i_mount, + XFS_SBS_FDBLOCKS, + -((int64_t)diff), 0); + ASSERT(!error); + if (error) + goto done; } - ep = xfs_iext_get_ext(ifp, *idx); + + ep = xfs_iext_get_ext(ifp, bma->idx); xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - trace_xfs_bmap_pre_update(ip, *idx + 2, state, _THIS_IP_); - xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx + 2), + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(bma->ip, bma->idx + 2, state, _THIS_IP_); + xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, bma->idx + 2), nullstartblock((int)temp2)); - trace_xfs_bmap_post_update(ip, *idx + 2, state, _THIS_IP_); + trace_xfs_bmap_post_update(bma->ip, bma->idx + 2, state, _THIS_IP_); - ++*idx; - *dnew = temp + temp2; + bma->idx++; + da_new = temp + temp2; break; case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: @@ -1114,9 +882,40 @@ xfs_bmap_add_extent_delay_real( */ ASSERT(0); } - *curp = cur; + + /* convert to a btree if necessary */ + if (XFS_IFORK_FORMAT(bma->ip, XFS_DATA_FORK) == XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_NEXTENTS(bma->ip, XFS_DATA_FORK) > ifp->if_ext_max) { + int tmp_logflags; /* partial log flag return val */ + + ASSERT(bma->cur == NULL); + error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, + bma->firstblock, bma->flist, &bma->cur, + da_old > 0, &tmp_logflags, XFS_DATA_FORK); + bma->logflags |= tmp_logflags; + if (error) + goto done; + } + + /* adjust for changes in reserved delayed indirect blocks */ + if (da_old || da_new) { + temp = da_new; + if (bma->cur) + temp += bma->cur->bc_private.b.allocated; + ASSERT(temp <= da_old); + if (temp < da_old) + xfs_icsb_modify_counters(bma->ip->i_mount, + XFS_SBS_FDBLOCKS, + (int64_t)(da_old - temp), 0); + } + + /* clear out the allocated field, done with it now in any case. */ + if (bma->cur) + bma->cur->bc_private.b.allocated = 0; + + xfs_bmap_check_leaf_extents(bma->cur, bma->ip, XFS_DATA_FORK); done: - *logflagsp = rval; + bma->logflags |= rval; return error; #undef LEFT #undef RIGHT @@ -1124,15 +923,17 @@ done: } /* - * Called by xfs_bmap_add_extent to handle cases converting an unwritten - * allocation to a real allocation or vice versa. + * Convert an unwritten allocation to a real allocation or vice versa. */ STATIC int /* error */ xfs_bmap_add_extent_unwritten_real( + struct xfs_trans *tp, xfs_inode_t *ip, /* incore inode pointer */ xfs_extnum_t *idx, /* extent number to update/insert */ xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ xfs_bmbt_irec_t *new, /* new data to add to file extents */ + xfs_fsblock_t *first, /* pointer to firstblock variable */ + xfs_bmap_free_t *flist, /* list of extents to be freed */ int *logflagsp) /* inode logging flags */ { xfs_btree_cur_t *cur; /* btree cursor */ @@ -1148,15 +949,25 @@ xfs_bmap_add_extent_unwritten_real( int rval=0; /* return value (logging flags) */ int state = 0;/* state bits, accessed thru macros */ + *logflagsp = 0; + + cur = *curp; + ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + + ASSERT(*idx >= 0); + ASSERT(*idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); + ASSERT(!isnullstartblock(new->br_startblock)); + + XFS_STATS_INC(xs_add_exlist); + #define LEFT r[0] #define RIGHT r[1] #define PREV r[2] + /* * Set up a bunch of variables to make the tests simpler. */ error = 0; - cur = *curp; - ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); ep = xfs_iext_get_ext(ifp, *idx); xfs_bmbt_get_all(ep, &PREV); newext = new->br_state; @@ -1406,10 +1217,11 @@ xfs_bmap_add_extent_unwritten_real( goto done; if ((error = xfs_btree_decrement(cur, 0, &i))) goto done; - if (xfs_bmbt_update(cur, LEFT.br_startoff, + error = xfs_bmbt_update(cur, LEFT.br_startoff, LEFT.br_startblock, LEFT.br_blockcount + new->br_blockcount, - LEFT.br_state)) + LEFT.br_state); + if (error) goto done; } break; @@ -1607,9 +1419,29 @@ xfs_bmap_add_extent_unwritten_real( */ ASSERT(0); } - *curp = cur; + + /* convert to a btree if necessary */ + if (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) == XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > ifp->if_ext_max) { + int tmp_logflags; /* partial log flag return val */ + + ASSERT(cur == NULL); + error = xfs_bmap_extents_to_btree(tp, ip, first, flist, &cur, + 0, &tmp_logflags, XFS_DATA_FORK); + *logflagsp |= tmp_logflags; + if (error) + goto done; + } + + /* clear out the allocated field, done with it now in any case. */ + if (cur) { + cur->bc_private.b.allocated = 0; + *curp = cur; + } + + xfs_bmap_check_leaf_extents(*curp, ip, XFS_DATA_FORK); done: - *logflagsp = rval; + *logflagsp |= rval; return error; #undef LEFT #undef RIGHT @@ -1617,16 +1449,13 @@ done: } /* - * Called by xfs_bmap_add_extent to handle cases converting a hole - * to a delayed allocation. + * Convert a hole to a delayed allocation. */ -/*ARGSUSED*/ -STATIC int /* error */ +STATIC void xfs_bmap_add_extent_hole_delay( xfs_inode_t *ip, /* incore inode pointer */ xfs_extnum_t *idx, /* extent number to update/insert */ - xfs_bmbt_irec_t *new, /* new data to add to file extents */ - int *logflagsp) /* inode logging flags */ + xfs_bmbt_irec_t *new) /* new data to add to file extents */ { xfs_ifork_t *ifp; /* inode fork pointer */ xfs_bmbt_irec_t left; /* left neighbor extent entry */ @@ -1761,23 +1590,17 @@ xfs_bmap_add_extent_hole_delay( * Nothing to do for disk quota accounting here. */ } - *logflagsp = 0; - return 0; } /* - * Called by xfs_bmap_add_extent to handle cases converting a hole - * to a real allocation. + * Convert a hole to a real allocation. */ STATIC int /* error */ xfs_bmap_add_extent_hole_real( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t *idx, /* extent number to update/insert */ - xfs_btree_cur_t *cur, /* if null, not a btree */ - xfs_bmbt_irec_t *new, /* new data to add to file extents */ - int *logflagsp, /* inode logging flags */ - int whichfork) /* data or attr fork */ + struct xfs_bmalloca *bma, + int whichfork) { + struct xfs_bmbt_irec *new = &bma->got; int error; /* error return value */ int i; /* temp state */ xfs_ifork_t *ifp; /* inode fork pointer */ @@ -1786,19 +1609,26 @@ xfs_bmap_add_extent_hole_real( int rval=0; /* return value (logging flags) */ int state; /* state bits, accessed thru macros */ - ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT(*idx <= ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)); - state = 0; + ifp = XFS_IFORK_PTR(bma->ip, whichfork); + + ASSERT(bma->idx >= 0); + ASSERT(bma->idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); + ASSERT(!isnullstartblock(new->br_startblock)); + ASSERT(!bma->cur || + !(bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL)); + + XFS_STATS_INC(xs_add_exlist); + state = 0; if (whichfork == XFS_ATTR_FORK) state |= BMAP_ATTRFORK; /* * Check and set flags if this segment has a left neighbor. */ - if (*idx > 0) { + if (bma->idx > 0) { state |= BMAP_LEFT_VALID; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &left); + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1), &left); if (isnullstartblock(left.br_startblock)) state |= BMAP_LEFT_DELAY; } @@ -1807,9 +1637,9 @@ xfs_bmap_add_extent_hole_real( * Check and set flags if this segment has a current value. * Not true if we're inserting into the "hole" at eof. */ - if (*idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) { + if (bma->idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) { state |= BMAP_RIGHT_VALID; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right); + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &right); if (isnullstartblock(right.br_startblock)) state |= BMAP_RIGHT_DELAY; } @@ -1846,39 +1676,42 @@ xfs_bmap_add_extent_hole_real( * left and on the right. * Merge all three into a single extent record. */ - --*idx; - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), + --bma->idx; + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx), left.br_blockcount + new->br_blockcount + right.br_blockcount); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); - xfs_iext_remove(ip, *idx + 1, 1, state); + xfs_iext_remove(bma->ip, bma->idx + 1, 1, state); - XFS_IFORK_NEXT_SET(ip, whichfork, - XFS_IFORK_NEXTENTS(ip, whichfork) - 1); - if (cur == NULL) { + XFS_IFORK_NEXT_SET(bma->ip, whichfork, + XFS_IFORK_NEXTENTS(bma->ip, whichfork) - 1); + if (bma->cur == NULL) { rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); } else { rval = XFS_ILOG_CORE; - if ((error = xfs_bmbt_lookup_eq(cur, - right.br_startoff, - right.br_startblock, - right.br_blockcount, &i))) + error = xfs_bmbt_lookup_eq(bma->cur, right.br_startoff, + right.br_startblock, right.br_blockcount, + &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); - if ((error = xfs_btree_delete(cur, &i))) + error = xfs_btree_delete(bma->cur, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); - if ((error = xfs_btree_decrement(cur, 0, &i))) + error = xfs_btree_decrement(bma->cur, 0, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); - if ((error = xfs_bmbt_update(cur, left.br_startoff, + error = xfs_bmbt_update(bma->cur, left.br_startoff, left.br_startblock, left.br_blockcount + new->br_blockcount + right.br_blockcount, - left.br_state))) + left.br_state); + if (error) goto done; } break; @@ -1889,27 +1722,28 @@ xfs_bmap_add_extent_hole_real( * on the left. * Merge the new allocation with the left neighbor. */ - --*idx; - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), + --bma->idx; + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx), left.br_blockcount + new->br_blockcount); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); - if (cur == NULL) { + if (bma->cur == NULL) { rval = xfs_ilog_fext(whichfork); } else { rval = 0; - if ((error = xfs_bmbt_lookup_eq(cur, - left.br_startoff, - left.br_startblock, - left.br_blockcount, &i))) + error = xfs_bmbt_lookup_eq(bma->cur, left.br_startoff, + left.br_startblock, left.br_blockcount, + &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); - if ((error = xfs_bmbt_update(cur, left.br_startoff, + error = xfs_bmbt_update(bma->cur, left.br_startoff, left.br_startblock, left.br_blockcount + new->br_blockcount, - left.br_state))) + left.br_state); + if (error) goto done; } break; @@ -1920,28 +1754,30 @@ xfs_bmap_add_extent_hole_real( * on the right. * Merge the new allocation with the right neighbor. */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx), + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); + xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, bma->idx), new->br_startoff, new->br_startblock, new->br_blockcount + right.br_blockcount, right.br_state); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); - if (cur == NULL) { + if (bma->cur == NULL) { rval = xfs_ilog_fext(whichfork); } else { rval = 0; - if ((error = xfs_bmbt_lookup_eq(cur, + error = xfs_bmbt_lookup_eq(bma->cur, right.br_startoff, right.br_startblock, - right.br_blockcount, &i))) + right.br_blockcount, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); - if ((error = xfs_bmbt_update(cur, new->br_startoff, + error = xfs_bmbt_update(bma->cur, new->br_startoff, new->br_startblock, new->br_blockcount + right.br_blockcount, - right.br_state))) + right.br_state); + if (error) goto done; } break; @@ -1952,28 +1788,50 @@ xfs_bmap_add_extent_hole_real( * real allocation. * Insert a new entry. */ - xfs_iext_insert(ip, *idx, 1, new, state); - XFS_IFORK_NEXT_SET(ip, whichfork, - XFS_IFORK_NEXTENTS(ip, whichfork) + 1); - if (cur == NULL) { + xfs_iext_insert(bma->ip, bma->idx, 1, new, state); + XFS_IFORK_NEXT_SET(bma->ip, whichfork, + XFS_IFORK_NEXTENTS(bma->ip, whichfork) + 1); + if (bma->cur == NULL) { rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); } else { rval = XFS_ILOG_CORE; - if ((error = xfs_bmbt_lookup_eq(cur, + error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff, new->br_startblock, - new->br_blockcount, &i))) + new->br_blockcount, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 0, done); - cur->bc_rec.b.br_state = new->br_state; - if ((error = xfs_btree_insert(cur, &i))) + bma->cur->bc_rec.b.br_state = new->br_state; + error = xfs_btree_insert(bma->cur, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); } break; } + + /* convert to a btree if necessary */ + if (XFS_IFORK_FORMAT(bma->ip, whichfork) == XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_NEXTENTS(bma->ip, whichfork) > ifp->if_ext_max) { + int tmp_logflags; /* partial log flag return val */ + + ASSERT(bma->cur == NULL); + error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, + bma->firstblock, bma->flist, &bma->cur, + 0, &tmp_logflags, whichfork); + bma->logflags |= tmp_logflags; + if (error) + goto done; + } + + /* clear out the allocated field, done with it now in any case. */ + if (bma->cur) + bma->cur->bc_private.b.allocated = 0; + + xfs_bmap_check_leaf_extents(bma->cur, bma->ip, whichfork); done: - *logflagsp = rval; + bma->logflags |= rval; return error; } @@ -2160,26 +2018,26 @@ xfs_bmap_adjacent( XFS_FSB_TO_AGBNO(mp, x) < mp->m_sb.sb_agblocks) mp = ap->ip->i_mount; - nullfb = ap->firstblock == NULLFSBLOCK; + nullfb = *ap->firstblock == NULLFSBLOCK; rt = XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata; - fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, ap->firstblock); + fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock); /* * If allocating at eof, and there's a previous real block, * try to use its last block as our starting point. */ - if (ap->eof && ap->prevp->br_startoff != NULLFILEOFF && - !isnullstartblock(ap->prevp->br_startblock) && - ISVALID(ap->prevp->br_startblock + ap->prevp->br_blockcount, - ap->prevp->br_startblock)) { - ap->rval = ap->prevp->br_startblock + ap->prevp->br_blockcount; + if (ap->eof && ap->prev.br_startoff != NULLFILEOFF && + !isnullstartblock(ap->prev.br_startblock) && + ISVALID(ap->prev.br_startblock + ap->prev.br_blockcount, + ap->prev.br_startblock)) { + ap->blkno = ap->prev.br_startblock + ap->prev.br_blockcount; /* * Adjust for the gap between prevp and us. */ - adjust = ap->off - - (ap->prevp->br_startoff + ap->prevp->br_blockcount); + adjust = ap->offset - + (ap->prev.br_startoff + ap->prev.br_blockcount); if (adjust && - ISVALID(ap->rval + adjust, ap->prevp->br_startblock)) - ap->rval += adjust; + ISVALID(ap->blkno + adjust, ap->prev.br_startblock)) + ap->blkno += adjust; } /* * If not at eof, then compare the two neighbor blocks. @@ -2196,17 +2054,17 @@ xfs_bmap_adjacent( * If there's a previous (left) block, select a requested * start block based on it. */ - if (ap->prevp->br_startoff != NULLFILEOFF && - !isnullstartblock(ap->prevp->br_startblock) && - (prevbno = ap->prevp->br_startblock + - ap->prevp->br_blockcount) && - ISVALID(prevbno, ap->prevp->br_startblock)) { + if (ap->prev.br_startoff != NULLFILEOFF && + !isnullstartblock(ap->prev.br_startblock) && + (prevbno = ap->prev.br_startblock + + ap->prev.br_blockcount) && + ISVALID(prevbno, ap->prev.br_startblock)) { /* * Calculate gap to end of previous block. */ - adjust = prevdiff = ap->off - - (ap->prevp->br_startoff + - ap->prevp->br_blockcount); + adjust = prevdiff = ap->offset - + (ap->prev.br_startoff + + ap->prev.br_blockcount); /* * Figure the startblock based on the previous block's * end and the gap size. @@ -2215,9 +2073,9 @@ xfs_bmap_adjacent( * allocating, or using it gives us an invalid block * number, then just use the end of the previous block. */ - if (prevdiff <= XFS_ALLOC_GAP_UNITS * ap->alen && + if (prevdiff <= XFS_ALLOC_GAP_UNITS * ap->length && ISVALID(prevbno + prevdiff, - ap->prevp->br_startblock)) + ap->prev.br_startblock)) prevbno += adjust; else prevdiff += adjust; @@ -2238,16 +2096,16 @@ xfs_bmap_adjacent( * If there's a following (right) block, select a requested * start block based on it. */ - if (!isnullstartblock(ap->gotp->br_startblock)) { + if (!isnullstartblock(ap->got.br_startblock)) { /* * Calculate gap to start of next block. */ - adjust = gotdiff = ap->gotp->br_startoff - ap->off; + adjust = gotdiff = ap->got.br_startoff - ap->offset; /* * Figure the startblock based on the next block's * start and the gap size. */ - gotbno = ap->gotp->br_startblock; + gotbno = ap->got.br_startblock; /* * Heuristic! * If the gap is large relative to the piece we're @@ -2255,12 +2113,12 @@ xfs_bmap_adjacent( * number, then just use the start of the next block * offset by our length. */ - if (gotdiff <= XFS_ALLOC_GAP_UNITS * ap->alen && + if (gotdiff <= XFS_ALLOC_GAP_UNITS * ap->length && ISVALID(gotbno - gotdiff, gotbno)) gotbno -= adjust; - else if (ISVALID(gotbno - ap->alen, gotbno)) { - gotbno -= ap->alen; - gotdiff += adjust - ap->alen; + else if (ISVALID(gotbno - ap->length, gotbno)) { + gotbno -= ap->length; + gotdiff += adjust - ap->length; } else gotdiff += adjust; /* @@ -2278,14 +2136,14 @@ xfs_bmap_adjacent( gotbno = NULLFSBLOCK; /* * If both valid, pick the better one, else the only good - * one, else ap->rval is already set (to 0 or the inode block). + * one, else ap->blkno is already set (to 0 or the inode block). */ if (prevbno != NULLFSBLOCK && gotbno != NULLFSBLOCK) - ap->rval = prevdiff <= gotdiff ? prevbno : gotbno; + ap->blkno = prevdiff <= gotdiff ? prevbno : gotbno; else if (prevbno != NULLFSBLOCK) - ap->rval = prevbno; + ap->blkno = prevbno; else if (gotbno != NULLFSBLOCK) - ap->rval = gotbno; + ap->blkno = gotbno; } #undef ISVALID } @@ -2305,24 +2163,24 @@ xfs_bmap_rtalloc( mp = ap->ip->i_mount; align = xfs_get_extsz_hint(ap->ip); prod = align / mp->m_sb.sb_rextsize; - error = xfs_bmap_extsize_align(mp, ap->gotp, ap->prevp, + error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, align, 1, ap->eof, 0, - ap->conv, &ap->off, &ap->alen); + ap->conv, &ap->offset, &ap->length); if (error) return error; - ASSERT(ap->alen); - ASSERT(ap->alen % mp->m_sb.sb_rextsize == 0); + ASSERT(ap->length); + ASSERT(ap->length % mp->m_sb.sb_rextsize == 0); /* * If the offset & length are not perfectly aligned * then kill prod, it will just get us in trouble. */ - if (do_mod(ap->off, align) || ap->alen % align) + if (do_mod(ap->offset, align) || ap->length % align) prod = 1; /* * Set ralen to be the actual requested length in rtextents. */ - ralen = ap->alen / mp->m_sb.sb_rextsize; + ralen = ap->length / mp->m_sb.sb_rextsize; /* * If the old value was close enough to MAXEXTLEN that * we rounded up to it, cut it back so it's valid again. @@ -2337,21 +2195,21 @@ xfs_bmap_rtalloc( * Lock out other modifications to the RT bitmap inode. */ xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL); - xfs_trans_ijoin_ref(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL); /* * If it's an allocation to an empty file at offset 0, * pick an extent that will space things out in the rt area. */ - if (ap->eof && ap->off == 0) { + if (ap->eof && ap->offset == 0) { xfs_rtblock_t uninitialized_var(rtx); /* realtime extent no */ error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx); if (error) return error; - ap->rval = rtx * mp->m_sb.sb_rextsize; + ap->blkno = rtx * mp->m_sb.sb_rextsize; } else { - ap->rval = 0; + ap->blkno = 0; } xfs_bmap_adjacent(ap); @@ -2359,23 +2217,23 @@ xfs_bmap_rtalloc( /* * Realtime allocation, done through xfs_rtallocate_extent. */ - atype = ap->rval == 0 ? XFS_ALLOCTYPE_ANY_AG : XFS_ALLOCTYPE_NEAR_BNO; - do_div(ap->rval, mp->m_sb.sb_rextsize); - rtb = ap->rval; - ap->alen = ralen; - if ((error = xfs_rtallocate_extent(ap->tp, ap->rval, 1, ap->alen, + atype = ap->blkno == 0 ? XFS_ALLOCTYPE_ANY_AG : XFS_ALLOCTYPE_NEAR_BNO; + do_div(ap->blkno, mp->m_sb.sb_rextsize); + rtb = ap->blkno; + ap->length = ralen; + if ((error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1, ap->length, &ralen, atype, ap->wasdel, prod, &rtb))) return error; if (rtb == NULLFSBLOCK && prod > 1 && - (error = xfs_rtallocate_extent(ap->tp, ap->rval, 1, - ap->alen, &ralen, atype, + (error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1, + ap->length, &ralen, atype, ap->wasdel, 1, &rtb))) return error; - ap->rval = rtb; - if (ap->rval != NULLFSBLOCK) { - ap->rval *= mp->m_sb.sb_rextsize; + ap->blkno = rtb; + if (ap->blkno != NULLFSBLOCK) { + ap->blkno *= mp->m_sb.sb_rextsize; ralen *= mp->m_sb.sb_rextsize; - ap->alen = ralen; + ap->length = ralen; ap->ip->i_d.di_nblocks += ralen; xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); if (ap->wasdel) @@ -2388,7 +2246,7 @@ xfs_bmap_rtalloc( ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT : XFS_TRANS_DQ_RTBCOUNT, (long) ralen); } else { - ap->alen = 0; + ap->length = 0; } return 0; } @@ -2503,7 +2361,7 @@ xfs_bmap_btalloc_nullfb( * AG as the stream may have moved. */ if (xfs_inode_is_filestream(ap->ip)) - ap->rval = args->fsbno = XFS_AGB_TO_FSB(mp, ag, 0); + ap->blkno = args->fsbno = XFS_AGB_TO_FSB(mp, ag, 0); return 0; } @@ -2528,52 +2386,52 @@ xfs_bmap_btalloc( mp = ap->ip->i_mount; align = ap->userdata ? xfs_get_extsz_hint(ap->ip) : 0; if (unlikely(align)) { - error = xfs_bmap_extsize_align(mp, ap->gotp, ap->prevp, + error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, align, 0, ap->eof, 0, ap->conv, - &ap->off, &ap->alen); + &ap->offset, &ap->length); ASSERT(!error); - ASSERT(ap->alen); + ASSERT(ap->length); } - nullfb = ap->firstblock == NULLFSBLOCK; - fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, ap->firstblock); + nullfb = *ap->firstblock == NULLFSBLOCK; + fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock); if (nullfb) { if (ap->userdata && xfs_inode_is_filestream(ap->ip)) { ag = xfs_filestream_lookup_ag(ap->ip); ag = (ag != NULLAGNUMBER) ? ag : 0; - ap->rval = XFS_AGB_TO_FSB(mp, ag, 0); + ap->blkno = XFS_AGB_TO_FSB(mp, ag, 0); } else { - ap->rval = XFS_INO_TO_FSB(mp, ap->ip->i_ino); + ap->blkno = XFS_INO_TO_FSB(mp, ap->ip->i_ino); } } else - ap->rval = ap->firstblock; + ap->blkno = *ap->firstblock; xfs_bmap_adjacent(ap); /* - * If allowed, use ap->rval; otherwise must use firstblock since + * If allowed, use ap->blkno; otherwise must use firstblock since * it's in the right allocation group. */ - if (nullfb || XFS_FSB_TO_AGNO(mp, ap->rval) == fb_agno) + if (nullfb || XFS_FSB_TO_AGNO(mp, ap->blkno) == fb_agno) ; else - ap->rval = ap->firstblock; + ap->blkno = *ap->firstblock; /* * Normal allocation, done through xfs_alloc_vextent. */ tryagain = isaligned = 0; args.tp = ap->tp; args.mp = mp; - args.fsbno = ap->rval; + args.fsbno = ap->blkno; /* Trim the allocation back to the maximum an AG can fit. */ - args.maxlen = MIN(ap->alen, XFS_ALLOC_AG_MAX_USABLE(mp)); - args.firstblock = ap->firstblock; + args.maxlen = MIN(ap->length, XFS_ALLOC_AG_MAX_USABLE(mp)); + args.firstblock = *ap->firstblock; blen = 0; if (nullfb) { error = xfs_bmap_btalloc_nullfb(ap, &args, &blen); if (error) return error; - } else if (ap->low) { + } else if (ap->flist->xbf_low) { if (xfs_inode_is_filestream(ap->ip)) args.type = XFS_ALLOCTYPE_FIRST_AG; else @@ -2587,14 +2445,14 @@ xfs_bmap_btalloc( /* apply extent size hints if obtained earlier */ if (unlikely(align)) { args.prod = align; - if ((args.mod = (xfs_extlen_t)do_mod(ap->off, args.prod))) + if ((args.mod = (xfs_extlen_t)do_mod(ap->offset, args.prod))) args.mod = (xfs_extlen_t)(args.prod - args.mod); } else if (mp->m_sb.sb_blocksize >= PAGE_CACHE_SIZE) { args.prod = 1; args.mod = 0; } else { args.prod = PAGE_CACHE_SIZE >> mp->m_sb.sb_blocklog; - if ((args.mod = (xfs_extlen_t)(do_mod(ap->off, args.prod)))) + if ((args.mod = (xfs_extlen_t)(do_mod(ap->offset, args.prod)))) args.mod = (xfs_extlen_t)(args.prod - args.mod); } /* @@ -2606,8 +2464,8 @@ xfs_bmap_btalloc( * is >= the stripe unit and the allocation offset is * at the end of file. */ - if (!ap->low && ap->aeof) { - if (!ap->off) { + if (!ap->flist->xbf_low && ap->aeof) { + if (!ap->offset) { args.alignment = mp->m_dalign; atype = args.type; isaligned = 1; @@ -2660,7 +2518,7 @@ xfs_bmap_btalloc( * turned on. */ args.type = atype; - args.fsbno = ap->rval; + args.fsbno = ap->blkno; args.alignment = mp->m_dalign; args.minlen = nextminlen; args.minalignslop = 0; @@ -2674,7 +2532,7 @@ xfs_bmap_btalloc( * try again. */ args.type = atype; - args.fsbno = ap->rval; + args.fsbno = ap->blkno; args.alignment = 0; if ((error = xfs_alloc_vextent(&args))) return error; @@ -2683,7 +2541,7 @@ xfs_bmap_btalloc( args.minlen > ap->minlen) { args.minlen = ap->minlen; args.type = XFS_ALLOCTYPE_START_BNO; - args.fsbno = ap->rval; + args.fsbno = ap->blkno; if ((error = xfs_alloc_vextent(&args))) return error; } @@ -2694,13 +2552,26 @@ xfs_bmap_btalloc( args.minleft = 0; if ((error = xfs_alloc_vextent(&args))) return error; - ap->low = 1; + ap->flist->xbf_low = 1; } if (args.fsbno != NULLFSBLOCK) { - ap->firstblock = ap->rval = args.fsbno; + /* + * check the allocation happened at the same or higher AG than + * the first block that was allocated. + */ + ASSERT(*ap->firstblock == NULLFSBLOCK || + XFS_FSB_TO_AGNO(mp, *ap->firstblock) == + XFS_FSB_TO_AGNO(mp, args.fsbno) || + (ap->flist->xbf_low && + XFS_FSB_TO_AGNO(mp, *ap->firstblock) < + XFS_FSB_TO_AGNO(mp, args.fsbno))); + + ap->blkno = args.fsbno; + if (*ap->firstblock == NULLFSBLOCK) + *ap->firstblock = args.fsbno; ASSERT(nullfb || fb_agno == args.agno || - (ap->low && fb_agno < args.agno)); - ap->alen = args.len; + (ap->flist->xbf_low && fb_agno < args.agno)); + ap->length = args.len; ap->ip->i_d.di_nblocks += args.len; xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); if (ap->wasdel) @@ -2714,8 +2585,8 @@ xfs_bmap_btalloc( XFS_TRANS_DQ_BCOUNT, (long) args.len); } else { - ap->rval = NULLFSBLOCK; - ap->alen = 0; + ap->blkno = NULLFSBLOCK; + ap->length = 0; } return 0; } @@ -3589,7 +3460,7 @@ xfs_bmap_add_attrfork( } ASSERT(ip->i_d.di_anextents == 0); - xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); switch (ip->i_d.di_format) { @@ -3782,19 +3653,11 @@ xfs_bmap_compute_maxlevels( * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi * caller. Frees all the extents that need freeing, which must be done * last due to locking considerations. We never free any extents in - * the first transaction. This is to allow the caller to make the first - * transaction a synchronous one so that the pointers to the data being - * broken in this transaction will be permanent before the data is actually - * freed. This is necessary to prevent blocks from being reallocated - * and written to before the free and reallocation are actually permanent. - * We do not just make the first transaction synchronous here, because - * there are more efficient ways to gain the same protection in some cases - * (see the file truncation code). + * the first transaction. * * Return 1 if the given transaction was committed and a new one * started, and 0 otherwise in the committed parameter. */ -/*ARGSUSED*/ int /* error */ xfs_bmap_finish( xfs_trans_t **tp, /* transaction pointer addr */ @@ -3994,42 +3857,122 @@ xfs_bmap_last_before( return 0; } +STATIC int +xfs_bmap_last_extent( + struct xfs_trans *tp, + struct xfs_inode *ip, + int whichfork, + struct xfs_bmbt_irec *rec, + int *is_empty) +{ + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + int error; + int nextents; + + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(tp, ip, whichfork); + if (error) + return error; + } + + nextents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); + if (nextents == 0) { + *is_empty = 1; + return 0; + } + + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, nextents - 1), rec); + *is_empty = 0; + return 0; +} + +/* + * Check the last inode extent to determine whether this allocation will result + * in blocks being allocated at the end of the file. When we allocate new data + * blocks at the end of the file which do not start at the previous data block, + * we will try to align the new blocks at stripe unit boundaries. + * + * Returns 0 in bma->aeof if the file (fork) is empty as any new write will be + * at, or past the EOF. + */ +STATIC int +xfs_bmap_isaeof( + struct xfs_bmalloca *bma, + int whichfork) +{ + struct xfs_bmbt_irec rec; + int is_empty; + int error; + + bma->aeof = 0; + error = xfs_bmap_last_extent(NULL, bma->ip, whichfork, &rec, + &is_empty); + if (error || is_empty) + return error; + + /* + * Check if we are allocation or past the last extent, or at least into + * the last delayed allocated extent. + */ + bma->aeof = bma->offset >= rec.br_startoff + rec.br_blockcount || + (bma->offset >= rec.br_startoff && + isnullstartblock(rec.br_startblock)); + return 0; +} + +/* + * Check if the endoff is outside the last extent. If so the caller will grow + * the allocation to a stripe unit boundary. All offsets are considered outside + * the end of file for an empty fork, so 1 is returned in *eof in that case. + */ +int +xfs_bmap_eof( + struct xfs_inode *ip, + xfs_fileoff_t endoff, + int whichfork, + int *eof) +{ + struct xfs_bmbt_irec rec; + int error; + + error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, eof); + if (error || *eof) + return error; + + *eof = endoff >= rec.br_startoff + rec.br_blockcount; + return 0; +} + /* * Returns the file-relative block number of the first block past eof in * the file. This is not based on i_size, it is based on the extent records. * Returns 0 for local files, as they do not have extent records. */ -int /* error */ +int xfs_bmap_last_offset( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode */ - xfs_fileoff_t *last_block, /* last block */ - int whichfork) /* data or attr fork */ + struct xfs_trans *tp, + struct xfs_inode *ip, + xfs_fileoff_t *last_block, + int whichfork) { - xfs_bmbt_rec_host_t *ep; /* pointer to last extent */ - int error; /* error return value */ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_extnum_t nextents; /* number of extent entries */ + struct xfs_bmbt_irec rec; + int is_empty; + int error; + + *last_block = 0; + + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) + return 0; if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL) + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) return XFS_ERROR(EIO); - if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { - *last_block = 0; - return 0; - } - ifp = XFS_IFORK_PTR(ip, whichfork); - if (!(ifp->if_flags & XFS_IFEXTENTS) && - (error = xfs_iread_extents(tp, ip, whichfork))) + + error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, &is_empty); + if (error || is_empty) return error; - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - if (!nextents) { - *last_block = 0; - return 0; - } - ep = xfs_iext_get_ext(ifp, nextents - 1); - *last_block = xfs_bmbt_get_startoff(ep) + xfs_bmbt_get_blockcount(ep); + + *last_block = rec.br_startoff + rec.br_blockcount; return 0; } @@ -4159,7 +4102,6 @@ xfs_bmap_read_extents( xfs_extnum_t num_recs; xfs_extnum_t start; - num_recs = xfs_btree_get_numrecs(block); if (unlikely(i + num_recs > room)) { ASSERT(i + num_recs <= room); @@ -4282,9 +4224,8 @@ xfs_bmap_validate_ret( ASSERT(i == 0 || mval[i - 1].br_startoff + mval[i - 1].br_blockcount == mval[i].br_startoff); - if ((flags & XFS_BMAPI_WRITE) && !(flags & XFS_BMAPI_DELAY)) - ASSERT(mval[i].br_startblock != DELAYSTARTBLOCK && - mval[i].br_startblock != HOLESTARTBLOCK); + ASSERT(mval[i].br_startblock != DELAYSTARTBLOCK && + mval[i].br_startblock != HOLESTARTBLOCK); ASSERT(mval[i].br_state == XFS_EXT_NORM || mval[i].br_state == XFS_EXT_UNWRITTEN); } @@ -4293,66 +4234,609 @@ xfs_bmap_validate_ret( /* - * Map file blocks to filesystem blocks. - * File range is given by the bno/len pair. - * Adds blocks to file if a write ("flags & XFS_BMAPI_WRITE" set) - * into a hole or past eof. - * Only allocates blocks from a single allocation group, - * to avoid locking problems. + * Trim the returned map to the required bounds + */ +STATIC void +xfs_bmapi_trim_map( + struct xfs_bmbt_irec *mval, + struct xfs_bmbt_irec *got, + xfs_fileoff_t *bno, + xfs_filblks_t len, + xfs_fileoff_t obno, + xfs_fileoff_t end, + int n, + int flags) +{ + if ((flags & XFS_BMAPI_ENTIRE) || + got->br_startoff + got->br_blockcount <= obno) { + *mval = *got; + if (isnullstartblock(got->br_startblock)) + mval->br_startblock = DELAYSTARTBLOCK; + return; + } + + if (obno > *bno) + *bno = obno; + ASSERT((*bno >= obno) || (n == 0)); + ASSERT(*bno < end); + mval->br_startoff = *bno; + if (isnullstartblock(got->br_startblock)) + mval->br_startblock = DELAYSTARTBLOCK; + else + mval->br_startblock = got->br_startblock + + (*bno - got->br_startoff); + /* + * Return the minimum of what we got and what we asked for for + * the length. We can use the len variable here because it is + * modified below and we could have been there before coming + * here if the first part of the allocation didn't overlap what + * was asked for. + */ + mval->br_blockcount = XFS_FILBLKS_MIN(end - *bno, + got->br_blockcount - (*bno - got->br_startoff)); + mval->br_state = got->br_state; + ASSERT(mval->br_blockcount <= len); + return; +} + +/* + * Update and validate the extent map to return + */ +STATIC void +xfs_bmapi_update_map( + struct xfs_bmbt_irec **map, + xfs_fileoff_t *bno, + xfs_filblks_t *len, + xfs_fileoff_t obno, + xfs_fileoff_t end, + int *n, + int flags) +{ + xfs_bmbt_irec_t *mval = *map; + + ASSERT((flags & XFS_BMAPI_ENTIRE) || + ((mval->br_startoff + mval->br_blockcount) <= end)); + ASSERT((flags & XFS_BMAPI_ENTIRE) || (mval->br_blockcount <= *len) || + (mval->br_startoff < obno)); + + *bno = mval->br_startoff + mval->br_blockcount; + *len = end - *bno; + if (*n > 0 && mval->br_startoff == mval[-1].br_startoff) { + /* update previous map with new information */ + ASSERT(mval->br_startblock == mval[-1].br_startblock); + ASSERT(mval->br_blockcount > mval[-1].br_blockcount); + ASSERT(mval->br_state == mval[-1].br_state); + mval[-1].br_blockcount = mval->br_blockcount; + mval[-1].br_state = mval->br_state; + } else if (*n > 0 && mval->br_startblock != DELAYSTARTBLOCK && + mval[-1].br_startblock != DELAYSTARTBLOCK && + mval[-1].br_startblock != HOLESTARTBLOCK && + mval->br_startblock == mval[-1].br_startblock + + mval[-1].br_blockcount && + ((flags & XFS_BMAPI_IGSTATE) || + mval[-1].br_state == mval->br_state)) { + ASSERT(mval->br_startoff == + mval[-1].br_startoff + mval[-1].br_blockcount); + mval[-1].br_blockcount += mval->br_blockcount; + } else if (*n > 0 && + mval->br_startblock == DELAYSTARTBLOCK && + mval[-1].br_startblock == DELAYSTARTBLOCK && + mval->br_startoff == + mval[-1].br_startoff + mval[-1].br_blockcount) { + mval[-1].br_blockcount += mval->br_blockcount; + mval[-1].br_state = mval->br_state; + } else if (!((*n == 0) && + ((mval->br_startoff + mval->br_blockcount) <= + obno))) { + mval++; + (*n)++; + } + *map = mval; +} + +/* + * Map file blocks to filesystem blocks without allocation. + */ +int +xfs_bmapi_read( + struct xfs_inode *ip, + xfs_fileoff_t bno, + xfs_filblks_t len, + struct xfs_bmbt_irec *mval, + int *nmap, + int flags) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp; + struct xfs_bmbt_irec got; + struct xfs_bmbt_irec prev; + xfs_fileoff_t obno; + xfs_fileoff_t end; + xfs_extnum_t lastx; + int error; + int eof; + int n = 0; + int whichfork = (flags & XFS_BMAPI_ATTRFORK) ? + XFS_ATTR_FORK : XFS_DATA_FORK; + + ASSERT(*nmap >= 1); + ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK|XFS_BMAPI_ENTIRE| + XFS_BMAPI_IGSTATE))); + + if (unlikely(XFS_TEST_ERROR( + (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE), + mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { + XFS_ERROR_REPORT("xfs_bmapi_read", XFS_ERRLEVEL_LOW, mp); + return XFS_ERROR(EFSCORRUPTED); + } + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + XFS_STATS_INC(xs_blk_mapr); + + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(ifp->if_ext_max == + XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); + + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(NULL, ip, whichfork); + if (error) + return error; + } + + xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, &prev); + end = bno + len; + obno = bno; + + while (bno < end && n < *nmap) { + /* Reading past eof, act as though there's a hole up to end. */ + if (eof) + got.br_startoff = end; + if (got.br_startoff > bno) { + /* Reading in a hole. */ + mval->br_startoff = bno; + mval->br_startblock = HOLESTARTBLOCK; + mval->br_blockcount = + XFS_FILBLKS_MIN(len, got.br_startoff - bno); + mval->br_state = XFS_EXT_NORM; + bno += mval->br_blockcount; + len -= mval->br_blockcount; + mval++; + n++; + continue; + } + + /* set up the extent map to return. */ + xfs_bmapi_trim_map(mval, &got, &bno, len, obno, end, n, flags); + xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags); + + /* If we're done, stop now. */ + if (bno >= end || n >= *nmap) + break; + + /* Else go on to the next record. */ + if (++lastx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx), &got); + else + eof = 1; + } + *nmap = n; + return 0; +} + +STATIC int +xfs_bmapi_reserve_delalloc( + struct xfs_inode *ip, + xfs_fileoff_t aoff, + xfs_filblks_t len, + struct xfs_bmbt_irec *got, + struct xfs_bmbt_irec *prev, + xfs_extnum_t *lastx, + int eof) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + xfs_extlen_t alen; + xfs_extlen_t indlen; + char rt = XFS_IS_REALTIME_INODE(ip); + xfs_extlen_t extsz; + int error; + + alen = XFS_FILBLKS_MIN(len, MAXEXTLEN); + if (!eof) + alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff); + + /* Figure out the extent size, adjust alen */ + extsz = xfs_get_extsz_hint(ip); + if (extsz) { + /* + * Make sure we don't exceed a single extent length when we + * align the extent by reducing length we are going to + * allocate by the maximum amount extent size aligment may + * require. + */ + alen = XFS_FILBLKS_MIN(len, MAXEXTLEN - (2 * extsz - 1)); + error = xfs_bmap_extsize_align(mp, got, prev, extsz, rt, eof, + 1, 0, &aoff, &alen); + ASSERT(!error); + } + + if (rt) + extsz = alen / mp->m_sb.sb_rextsize; + + /* + * Make a transaction-less quota reservation for delayed allocation + * blocks. This number gets adjusted later. We return if we haven't + * allocated blocks already inside this loop. + */ + error = xfs_trans_reserve_quota_nblks(NULL, ip, (long)alen, 0, + rt ? XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS); + if (error) + return error; + + /* + * Split changing sb for alen and indlen since they could be coming + * from different places. + */ + indlen = (xfs_extlen_t)xfs_bmap_worst_indlen(ip, alen); + ASSERT(indlen > 0); + + if (rt) { + error = xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, + -((int64_t)extsz), 0); + } else { + error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, + -((int64_t)alen), 0); + } + + if (error) + goto out_unreserve_quota; + + error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, + -((int64_t)indlen), 0); + if (error) + goto out_unreserve_blocks; + + + ip->i_delayed_blks += alen; + + got->br_startoff = aoff; + got->br_startblock = nullstartblock(indlen); + got->br_blockcount = alen; + got->br_state = XFS_EXT_NORM; + xfs_bmap_add_extent_hole_delay(ip, lastx, got); + + /* + * Update our extent pointer, given that xfs_bmap_add_extent_hole_delay + * might have merged it into one of the neighbouring ones. + */ + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *lastx), got); + + ASSERT(got->br_startoff <= aoff); + ASSERT(got->br_startoff + got->br_blockcount >= aoff + alen); + ASSERT(isnullstartblock(got->br_startblock)); + ASSERT(got->br_state == XFS_EXT_NORM); + return 0; + +out_unreserve_blocks: + if (rt) + xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, extsz, 0); + else + xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, alen, 0); +out_unreserve_quota: + if (XFS_IS_QUOTA_ON(mp)) + xfs_trans_unreserve_quota_nblks(NULL, ip, alen, 0, rt ? + XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS); + return error; +} + +/* + * Map file blocks to filesystem blocks, adding delayed allocations as needed. + */ +int +xfs_bmapi_delay( + struct xfs_inode *ip, /* incore inode */ + xfs_fileoff_t bno, /* starting file offs. mapped */ + xfs_filblks_t len, /* length to map in file */ + struct xfs_bmbt_irec *mval, /* output: map values */ + int *nmap, /* i/o: mval size/count */ + int flags) /* XFS_BMAPI_... */ +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + struct xfs_bmbt_irec got; /* current file extent record */ + struct xfs_bmbt_irec prev; /* previous file extent record */ + xfs_fileoff_t obno; /* old block number (offset) */ + xfs_fileoff_t end; /* end of mapped file region */ + xfs_extnum_t lastx; /* last useful extent number */ + int eof; /* we've hit the end of extents */ + int n = 0; /* current extent index */ + int error = 0; + + ASSERT(*nmap >= 1); + ASSERT(*nmap <= XFS_BMAP_MAX_NMAP); + ASSERT(!(flags & ~XFS_BMAPI_ENTIRE)); + + if (unlikely(XFS_TEST_ERROR( + (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE), + mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { + XFS_ERROR_REPORT("xfs_bmapi_delay", XFS_ERRLEVEL_LOW, mp); + return XFS_ERROR(EFSCORRUPTED); + } + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + XFS_STATS_INC(xs_blk_mapw); + + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); + if (error) + return error; + } + + xfs_bmap_search_extents(ip, bno, XFS_DATA_FORK, &eof, &lastx, &got, &prev); + end = bno + len; + obno = bno; + + while (bno < end && n < *nmap) { + if (eof || got.br_startoff > bno) { + error = xfs_bmapi_reserve_delalloc(ip, bno, len, &got, + &prev, &lastx, eof); + if (error) { + if (n == 0) { + *nmap = 0; + return error; + } + break; + } + } + + /* set up the extent map to return. */ + xfs_bmapi_trim_map(mval, &got, &bno, len, obno, end, n, flags); + xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags); + + /* If we're done, stop now. */ + if (bno >= end || n >= *nmap) + break; + + /* Else go on to the next record. */ + prev = got; + if (++lastx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx), &got); + else + eof = 1; + } + + *nmap = n; + return 0; +} + + +STATIC int +xfs_bmapi_allocate( + struct xfs_bmalloca *bma, + int flags) +{ + struct xfs_mount *mp = bma->ip->i_mount; + int whichfork = (flags & XFS_BMAPI_ATTRFORK) ? + XFS_ATTR_FORK : XFS_DATA_FORK; + struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork); + int tmp_logflags = 0; + int error; + int rt; + + rt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(bma->ip); + + /* + * For the wasdelay case, we could also just allocate the stuff asked + * for in this bmap call but that wouldn't be as good. + */ + if (bma->wasdel) { + bma->length = (xfs_extlen_t)bma->got.br_blockcount; + bma->offset = bma->got.br_startoff; + if (bma->idx != NULLEXTNUM && bma->idx) { + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1), + &bma->prev); + } + } else { + bma->length = XFS_FILBLKS_MIN(bma->length, MAXEXTLEN); + if (!bma->eof) + bma->length = XFS_FILBLKS_MIN(bma->length, + bma->got.br_startoff - bma->offset); + } + + /* + * Indicate if this is the first user data in the file, or just any + * user data. + */ + if (!(flags & XFS_BMAPI_METADATA)) { + bma->userdata = (bma->offset == 0) ? + XFS_ALLOC_INITIAL_USER_DATA : XFS_ALLOC_USERDATA; + } + + bma->minlen = (flags & XFS_BMAPI_CONTIG) ? bma->length : 1; + + /* + * Only want to do the alignment at the eof if it is userdata and + * allocation length is larger than a stripe unit. + */ + if (mp->m_dalign && bma->length >= mp->m_dalign && + !(flags & XFS_BMAPI_METADATA) && whichfork == XFS_DATA_FORK) { + error = xfs_bmap_isaeof(bma, whichfork); + if (error) + return error; + } + + error = xfs_bmap_alloc(bma); + if (error) + return error; + + if (bma->flist->xbf_low) + bma->minleft = 0; + if (bma->cur) + bma->cur->bc_private.b.firstblock = *bma->firstblock; + if (bma->blkno == NULLFSBLOCK) + return 0; + if ((ifp->if_flags & XFS_IFBROOT) && !bma->cur) { + bma->cur = xfs_bmbt_init_cursor(mp, bma->tp, bma->ip, whichfork); + bma->cur->bc_private.b.firstblock = *bma->firstblock; + bma->cur->bc_private.b.flist = bma->flist; + } + /* + * Bump the number of extents we've allocated + * in this call. + */ + bma->nallocs++; + + if (bma->cur) + bma->cur->bc_private.b.flags = + bma->wasdel ? XFS_BTCUR_BPRV_WASDEL : 0; + + bma->got.br_startoff = bma->offset; + bma->got.br_startblock = bma->blkno; + bma->got.br_blockcount = bma->length; + bma->got.br_state = XFS_EXT_NORM; + + /* + * A wasdelay extent has been initialized, so shouldn't be flagged + * as unwritten. + */ + if (!bma->wasdel && (flags & XFS_BMAPI_PREALLOC) && + xfs_sb_version_hasextflgbit(&mp->m_sb)) + bma->got.br_state = XFS_EXT_UNWRITTEN; + + if (bma->wasdel) + error = xfs_bmap_add_extent_delay_real(bma); + else + error = xfs_bmap_add_extent_hole_real(bma, whichfork); + + bma->logflags |= tmp_logflags; + if (error) + return error; + + /* + * Update our extent pointer, given that xfs_bmap_add_extent_delay_real + * or xfs_bmap_add_extent_hole_real might have merged it into one of + * the neighbouring ones. + */ + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &bma->got); + + ASSERT(bma->got.br_startoff <= bma->offset); + ASSERT(bma->got.br_startoff + bma->got.br_blockcount >= + bma->offset + bma->length); + ASSERT(bma->got.br_state == XFS_EXT_NORM || + bma->got.br_state == XFS_EXT_UNWRITTEN); + return 0; +} + +STATIC int +xfs_bmapi_convert_unwritten( + struct xfs_bmalloca *bma, + struct xfs_bmbt_irec *mval, + xfs_filblks_t len, + int flags) +{ + int whichfork = (flags & XFS_BMAPI_ATTRFORK) ? + XFS_ATTR_FORK : XFS_DATA_FORK; + struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork); + int tmp_logflags = 0; + int error; + + /* check if we need to do unwritten->real conversion */ + if (mval->br_state == XFS_EXT_UNWRITTEN && + (flags & XFS_BMAPI_PREALLOC)) + return 0; + + /* check if we need to do real->unwritten conversion */ + if (mval->br_state == XFS_EXT_NORM && + (flags & (XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT)) != + (XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT)) + return 0; + + /* + * Modify (by adding) the state flag, if writing. + */ + ASSERT(mval->br_blockcount <= len); + if ((ifp->if_flags & XFS_IFBROOT) && !bma->cur) { + bma->cur = xfs_bmbt_init_cursor(bma->ip->i_mount, bma->tp, + bma->ip, whichfork); + bma->cur->bc_private.b.firstblock = *bma->firstblock; + bma->cur->bc_private.b.flist = bma->flist; + } + mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN) + ? XFS_EXT_NORM : XFS_EXT_UNWRITTEN; + + error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, &bma->idx, + &bma->cur, mval, bma->firstblock, bma->flist, + &tmp_logflags); + bma->logflags |= tmp_logflags; + if (error) + return error; + + /* + * Update our extent pointer, given that + * xfs_bmap_add_extent_unwritten_real might have merged it into one + * of the neighbouring ones. + */ + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &bma->got); + + /* + * We may have combined previously unwritten space with written space, + * so generate another request. + */ + if (mval->br_blockcount < len) + return EAGAIN; + return 0; +} + +/* + * Map file blocks to filesystem blocks, and allocate blocks or convert the + * extent state if necessary. Details behaviour is controlled by the flags + * parameter. Only allocates blocks from a single allocation group, to avoid + * locking problems. + * * The returned value in "firstblock" from the first call in a transaction * must be remembered and presented to subsequent calls in "firstblock". * An upper bound for the number of blocks to be allocated is supplied to * the first call in "total"; if no allocation group has that many free * blocks then the call will fail (return NULLFSBLOCK in "firstblock"). */ -int /* error */ -xfs_bmapi( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode */ - xfs_fileoff_t bno, /* starting file offs. mapped */ - xfs_filblks_t len, /* length to map in file */ - int flags, /* XFS_BMAPI_... */ - xfs_fsblock_t *firstblock, /* first allocated block - controls a.g. for allocs */ - xfs_extlen_t total, /* total blocks needed */ - xfs_bmbt_irec_t *mval, /* output: map values */ - int *nmap, /* i/o: mval size/count */ - xfs_bmap_free_t *flist) /* i/o: list extents to free */ +int +xfs_bmapi_write( + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_inode *ip, /* incore inode */ + xfs_fileoff_t bno, /* starting file offs. mapped */ + xfs_filblks_t len, /* length to map in file */ + int flags, /* XFS_BMAPI_... */ + xfs_fsblock_t *firstblock, /* first allocated block + controls a.g. for allocs */ + xfs_extlen_t total, /* total blocks needed */ + struct xfs_bmbt_irec *mval, /* output: map values */ + int *nmap, /* i/o: mval size/count */ + struct xfs_bmap_free *flist) /* i/o: list extents to free */ { - xfs_fsblock_t abno; /* allocated block number */ - xfs_extlen_t alen; /* allocated extent length */ - xfs_fileoff_t aoff; /* allocated file offset */ - xfs_bmalloca_t bma = { 0 }; /* args for xfs_bmap_alloc */ - xfs_btree_cur_t *cur; /* bmap btree cursor */ - xfs_fileoff_t end; /* end of mapped file region */ - int eof; /* we've hit the end of extents */ - xfs_bmbt_rec_host_t *ep; /* extent record pointer */ - int error; /* error return */ - xfs_bmbt_irec_t got; /* current file extent record */ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_extlen_t indlen; /* indirect blocks length */ - xfs_extnum_t lastx; /* last useful extent number */ - int logflags; /* flags for transaction logging */ - xfs_extlen_t minleft; /* min blocks left after allocation */ - xfs_extlen_t minlen; /* min allocation size */ - xfs_mount_t *mp; /* xfs mount structure */ - int n; /* current extent index */ - int nallocs; /* number of extents alloc'd */ - xfs_extnum_t nextents; /* number of extents in file */ - xfs_fileoff_t obno; /* old block number (offset) */ - xfs_bmbt_irec_t prev; /* previous file extent record */ - int tmp_logflags; /* temp flags holder */ - int whichfork; /* data or attr fork */ - char inhole; /* current location is hole in file */ - char wasdelay; /* old extent was delayed */ - char wr; /* this is a write request */ - char rt; /* this is a realtime file */ + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp; + struct xfs_bmalloca bma = { 0 }; /* args for xfs_bmap_alloc */ + xfs_fileoff_t end; /* end of mapped file region */ + int eof; /* after the end of extents */ + int error; /* error return */ + int n; /* current extent index */ + xfs_fileoff_t obno; /* old block number (offset) */ + int whichfork; /* data or attr fork */ + char inhole; /* current location is hole in file */ + char wasdelay; /* old extent was delayed */ + #ifdef DEBUG - xfs_fileoff_t orig_bno; /* original block number value */ - int orig_flags; /* original flags arg value */ - xfs_filblks_t orig_len; /* original value of len arg */ - xfs_bmbt_irec_t *orig_mval; /* original value of mval */ - int orig_nmap; /* original value of *nmap */ + xfs_fileoff_t orig_bno; /* original block number value */ + int orig_flags; /* original flags arg value */ + xfs_filblks_t orig_len; /* original value of len arg */ + struct xfs_bmbt_irec *orig_mval; /* original value of mval */ + int orig_nmap; /* original value of *nmap */ orig_bno = bno; orig_len = len; @@ -4360,488 +4844,133 @@ xfs_bmapi( orig_mval = mval; orig_nmap = *nmap; #endif + ASSERT(*nmap >= 1); - ASSERT(*nmap <= XFS_BMAP_MAX_NMAP || !(flags & XFS_BMAPI_WRITE)); + ASSERT(*nmap <= XFS_BMAP_MAX_NMAP); + ASSERT(!(flags & XFS_BMAPI_IGSTATE)); + ASSERT(tp != NULL); + whichfork = (flags & XFS_BMAPI_ATTRFORK) ? XFS_ATTR_FORK : XFS_DATA_FORK; - mp = ip->i_mount; + if (unlikely(XFS_TEST_ERROR( (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL), mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { - XFS_ERROR_REPORT("xfs_bmapi", XFS_ERRLEVEL_LOW, mp); + XFS_ERROR_REPORT("xfs_bmapi_write", XFS_ERRLEVEL_LOW, mp); return XFS_ERROR(EFSCORRUPTED); } + if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); - rt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip); + ifp = XFS_IFORK_PTR(ip, whichfork); ASSERT(ifp->if_ext_max == XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); - if ((wr = (flags & XFS_BMAPI_WRITE)) != 0) - XFS_STATS_INC(xs_blk_mapw); - else - XFS_STATS_INC(xs_blk_mapr); - /* - * IGSTATE flag is used to combine extents which - * differ only due to the state of the extents. - * This technique is used from xfs_getbmap() - * when the caller does not wish to see the - * separation (which is the default). - * - * This technique is also used when writing a - * buffer which has been partially written, - * (usually by being flushed during a chunkread), - * to ensure one write takes place. This also - * prevents a change in the xfs inode extents at - * this time, intentionally. This change occurs - * on completion of the write operation, in - * xfs_strat_comp(), where the xfs_bmapi() call - * is transactioned, and the extents combined. - */ - if ((flags & XFS_BMAPI_IGSTATE) && wr) /* if writing unwritten space */ - wr = 0; /* no allocations are allowed */ - ASSERT(wr || !(flags & XFS_BMAPI_DELAY)); - logflags = 0; - nallocs = 0; - cur = NULL; + + XFS_STATS_INC(xs_blk_mapw); + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { - ASSERT(wr && tp); - if ((error = xfs_bmap_local_to_extents(tp, ip, - firstblock, total, &logflags, whichfork))) + error = xfs_bmap_local_to_extents(tp, ip, firstblock, total, + &bma.logflags, whichfork); + if (error) goto error0; } - if (wr && *firstblock == NULLFSBLOCK) { + + if (*firstblock == NULLFSBLOCK) { if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE) - minleft = be16_to_cpu(ifp->if_broot->bb_level) + 1; + bma.minleft = be16_to_cpu(ifp->if_broot->bb_level) + 1; else - minleft = 1; - } else - minleft = 0; - if (!(ifp->if_flags & XFS_IFEXTENTS) && - (error = xfs_iread_extents(tp, ip, whichfork))) - goto error0; - ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, - &prev); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + bma.minleft = 1; + } else { + bma.minleft = 0; + } + + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(tp, ip, whichfork); + if (error) + goto error0; + } + + xfs_bmap_search_extents(ip, bno, whichfork, &eof, &bma.idx, &bma.got, + &bma.prev); n = 0; end = bno + len; obno = bno; - bma.ip = NULL; + + bma.tp = tp; + bma.ip = ip; + bma.total = total; + bma.userdata = 0; + bma.flist = flist; + bma.firstblock = firstblock; while (bno < end && n < *nmap) { - /* - * Reading past eof, act as though there's a hole - * up to end. - */ - if (eof && !wr) - got.br_startoff = end; - inhole = eof || got.br_startoff > bno; - wasdelay = wr && !inhole && !(flags & XFS_BMAPI_DELAY) && - isnullstartblock(got.br_startblock); + inhole = eof || bma.got.br_startoff > bno; + wasdelay = !inhole && isnullstartblock(bma.got.br_startblock); + /* * First, deal with the hole before the allocated space * that we found, if any. */ - if (wr && (inhole || wasdelay)) { - /* - * For the wasdelay case, we could also just - * allocate the stuff asked for in this bmap call - * but that wouldn't be as good. - */ - if (wasdelay) { - alen = (xfs_extlen_t)got.br_blockcount; - aoff = got.br_startoff; - if (lastx != NULLEXTNUM && lastx) { - ep = xfs_iext_get_ext(ifp, lastx - 1); - xfs_bmbt_get_all(ep, &prev); - } - } else { - alen = (xfs_extlen_t) - XFS_FILBLKS_MIN(len, MAXEXTLEN); - if (!eof) - alen = (xfs_extlen_t) - XFS_FILBLKS_MIN(alen, - got.br_startoff - bno); - aoff = bno; - } - minlen = (flags & XFS_BMAPI_CONTIG) ? alen : 1; - if (flags & XFS_BMAPI_DELAY) { - xfs_extlen_t extsz; - - /* Figure out the extent size, adjust alen */ - extsz = xfs_get_extsz_hint(ip); - if (extsz) { - /* - * make sure we don't exceed a single - * extent length when we align the - * extent by reducing length we are - * going to allocate by the maximum - * amount extent size aligment may - * require. - */ - alen = XFS_FILBLKS_MIN(len, - MAXEXTLEN - (2 * extsz - 1)); - error = xfs_bmap_extsize_align(mp, - &got, &prev, extsz, - rt, eof, - flags&XFS_BMAPI_DELAY, - flags&XFS_BMAPI_CONVERT, - &aoff, &alen); - ASSERT(!error); - } - - if (rt) - extsz = alen / mp->m_sb.sb_rextsize; - - /* - * Make a transaction-less quota reservation for - * delayed allocation blocks. This number gets - * adjusted later. We return if we haven't - * allocated blocks already inside this loop. - */ - error = xfs_trans_reserve_quota_nblks( - NULL, ip, (long)alen, 0, - rt ? XFS_QMOPT_RES_RTBLKS : - XFS_QMOPT_RES_REGBLKS); - if (error) { - if (n == 0) { - *nmap = 0; - ASSERT(cur == NULL); - return error; - } - break; - } - - /* - * Split changing sb for alen and indlen since - * they could be coming from different places. - */ - indlen = (xfs_extlen_t) - xfs_bmap_worst_indlen(ip, alen); - ASSERT(indlen > 0); - - if (rt) { - error = xfs_mod_incore_sb(mp, - XFS_SBS_FREXTENTS, - -((int64_t)extsz), 0); - } else { - error = xfs_icsb_modify_counters(mp, - XFS_SBS_FDBLOCKS, - -((int64_t)alen), 0); - } - if (!error) { - error = xfs_icsb_modify_counters(mp, - XFS_SBS_FDBLOCKS, - -((int64_t)indlen), 0); - if (error && rt) - xfs_mod_incore_sb(mp, - XFS_SBS_FREXTENTS, - (int64_t)extsz, 0); - else if (error) - xfs_icsb_modify_counters(mp, - XFS_SBS_FDBLOCKS, - (int64_t)alen, 0); - } - - if (error) { - if (XFS_IS_QUOTA_ON(mp)) - /* unreserve the blocks now */ - (void) - xfs_trans_unreserve_quota_nblks( - NULL, ip, - (long)alen, 0, rt ? - XFS_QMOPT_RES_RTBLKS : - XFS_QMOPT_RES_REGBLKS); - break; - } - - ip->i_delayed_blks += alen; - abno = nullstartblock(indlen); - } else { - /* - * If first time, allocate and fill in - * once-only bma fields. - */ - if (bma.ip == NULL) { - bma.tp = tp; - bma.ip = ip; - bma.prevp = &prev; - bma.gotp = &got; - bma.total = total; - bma.userdata = 0; - } - /* Indicate if this is the first user data - * in the file, or just any user data. - */ - if (!(flags & XFS_BMAPI_METADATA)) { - bma.userdata = (aoff == 0) ? - XFS_ALLOC_INITIAL_USER_DATA : - XFS_ALLOC_USERDATA; - } - /* - * Fill in changeable bma fields. - */ - bma.eof = eof; - bma.firstblock = *firstblock; - bma.alen = alen; - bma.off = aoff; - bma.conv = !!(flags & XFS_BMAPI_CONVERT); - bma.wasdel = wasdelay; - bma.minlen = minlen; - bma.low = flist->xbf_low; - bma.minleft = minleft; - /* - * Only want to do the alignment at the - * eof if it is userdata and allocation length - * is larger than a stripe unit. - */ - if (mp->m_dalign && alen >= mp->m_dalign && - (!(flags & XFS_BMAPI_METADATA)) && - (whichfork == XFS_DATA_FORK)) { - if ((error = xfs_bmap_isaeof(ip, aoff, - whichfork, &bma.aeof))) - goto error0; - } else - bma.aeof = 0; - /* - * Call allocator. - */ - if ((error = xfs_bmap_alloc(&bma))) - goto error0; - /* - * Copy out result fields. - */ - abno = bma.rval; - if ((flist->xbf_low = bma.low)) - minleft = 0; - alen = bma.alen; - aoff = bma.off; - ASSERT(*firstblock == NULLFSBLOCK || - XFS_FSB_TO_AGNO(mp, *firstblock) == - XFS_FSB_TO_AGNO(mp, bma.firstblock) || - (flist->xbf_low && - XFS_FSB_TO_AGNO(mp, *firstblock) < - XFS_FSB_TO_AGNO(mp, bma.firstblock))); - *firstblock = bma.firstblock; - if (cur) - cur->bc_private.b.firstblock = - *firstblock; - if (abno == NULLFSBLOCK) - break; - if ((ifp->if_flags & XFS_IFBROOT) && !cur) { - cur = xfs_bmbt_init_cursor(mp, tp, - ip, whichfork); - cur->bc_private.b.firstblock = - *firstblock; - cur->bc_private.b.flist = flist; - } - /* - * Bump the number of extents we've allocated - * in this call. - */ - nallocs++; - } - if (cur) - cur->bc_private.b.flags = - wasdelay ? XFS_BTCUR_BPRV_WASDEL : 0; - got.br_startoff = aoff; - got.br_startblock = abno; - got.br_blockcount = alen; - got.br_state = XFS_EXT_NORM; /* assume normal */ - /* - * Determine state of extent, and the filesystem. - * A wasdelay extent has been initialized, so - * shouldn't be flagged as unwritten. - */ - if (wr && xfs_sb_version_hasextflgbit(&mp->m_sb)) { - if (!wasdelay && (flags & XFS_BMAPI_PREALLOC)) - got.br_state = XFS_EXT_UNWRITTEN; - } - error = xfs_bmap_add_extent(tp, ip, &lastx, &cur, &got, - firstblock, flist, &tmp_logflags, - whichfork); - logflags |= tmp_logflags; + if (inhole || wasdelay) { + bma.eof = eof; + bma.conv = !!(flags & XFS_BMAPI_CONVERT); + bma.wasdel = wasdelay; + bma.length = len; + bma.offset = bno; + + error = xfs_bmapi_allocate(&bma, flags); if (error) goto error0; - ep = xfs_iext_get_ext(ifp, lastx); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - xfs_bmbt_get_all(ep, &got); - ASSERT(got.br_startoff <= aoff); - ASSERT(got.br_startoff + got.br_blockcount >= - aoff + alen); -#ifdef DEBUG - if (flags & XFS_BMAPI_DELAY) { - ASSERT(isnullstartblock(got.br_startblock)); - ASSERT(startblockval(got.br_startblock) > 0); - } - ASSERT(got.br_state == XFS_EXT_NORM || - got.br_state == XFS_EXT_UNWRITTEN); -#endif - /* - * Fall down into the found allocated space case. - */ - } else if (inhole) { - /* - * Reading in a hole. - */ - mval->br_startoff = bno; - mval->br_startblock = HOLESTARTBLOCK; - mval->br_blockcount = - XFS_FILBLKS_MIN(len, got.br_startoff - bno); - mval->br_state = XFS_EXT_NORM; - bno += mval->br_blockcount; - len -= mval->br_blockcount; - mval++; - n++; - continue; - } - /* - * Then deal with the allocated space we found. - */ - ASSERT(ep != NULL); - if (!(flags & XFS_BMAPI_ENTIRE) && - (got.br_startoff + got.br_blockcount > obno)) { - if (obno > bno) - bno = obno; - ASSERT((bno >= obno) || (n == 0)); - ASSERT(bno < end); - mval->br_startoff = bno; - if (isnullstartblock(got.br_startblock)) { - ASSERT(!wr || (flags & XFS_BMAPI_DELAY)); - mval->br_startblock = DELAYSTARTBLOCK; - } else - mval->br_startblock = - got.br_startblock + - (bno - got.br_startoff); - /* - * Return the minimum of what we got and what we - * asked for for the length. We can use the len - * variable here because it is modified below - * and we could have been there before coming - * here if the first part of the allocation - * didn't overlap what was asked for. - */ - mval->br_blockcount = - XFS_FILBLKS_MIN(end - bno, got.br_blockcount - - (bno - got.br_startoff)); - mval->br_state = got.br_state; - ASSERT(mval->br_blockcount <= len); - } else { - *mval = got; - if (isnullstartblock(mval->br_startblock)) { - ASSERT(!wr || (flags & XFS_BMAPI_DELAY)); - mval->br_startblock = DELAYSTARTBLOCK; - } + if (bma.blkno == NULLFSBLOCK) + break; } - /* - * Check if writing previously allocated but - * unwritten extents. - */ - if (wr && - ((mval->br_state == XFS_EXT_UNWRITTEN && - ((flags & (XFS_BMAPI_PREALLOC|XFS_BMAPI_DELAY)) == 0)) || - (mval->br_state == XFS_EXT_NORM && - ((flags & (XFS_BMAPI_PREALLOC|XFS_BMAPI_CONVERT)) == - (XFS_BMAPI_PREALLOC|XFS_BMAPI_CONVERT))))) { - /* - * Modify (by adding) the state flag, if writing. - */ - ASSERT(mval->br_blockcount <= len); - if ((ifp->if_flags & XFS_IFBROOT) && !cur) { - cur = xfs_bmbt_init_cursor(mp, - tp, ip, whichfork); - cur->bc_private.b.firstblock = - *firstblock; - cur->bc_private.b.flist = flist; - } - mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN) - ? XFS_EXT_NORM - : XFS_EXT_UNWRITTEN; - error = xfs_bmap_add_extent(tp, ip, &lastx, &cur, mval, - firstblock, flist, &tmp_logflags, - whichfork); - logflags |= tmp_logflags; - if (error) - goto error0; - ep = xfs_iext_get_ext(ifp, lastx); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - xfs_bmbt_get_all(ep, &got); - /* - * We may have combined previously unwritten - * space with written space, so generate - * another request. - */ - if (mval->br_blockcount < len) - continue; - } + /* Deal with the allocated space we found. */ + xfs_bmapi_trim_map(mval, &bma.got, &bno, len, obno, + end, n, flags); + + /* Execute unwritten extent conversion if necessary */ + error = xfs_bmapi_convert_unwritten(&bma, mval, len, flags); + if (error == EAGAIN) + continue; + if (error) + goto error0; + + /* update the extent map to return */ + xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags); - ASSERT((flags & XFS_BMAPI_ENTIRE) || - ((mval->br_startoff + mval->br_blockcount) <= end)); - ASSERT((flags & XFS_BMAPI_ENTIRE) || - (mval->br_blockcount <= len) || - (mval->br_startoff < obno)); - bno = mval->br_startoff + mval->br_blockcount; - len = end - bno; - if (n > 0 && mval->br_startoff == mval[-1].br_startoff) { - ASSERT(mval->br_startblock == mval[-1].br_startblock); - ASSERT(mval->br_blockcount > mval[-1].br_blockcount); - ASSERT(mval->br_state == mval[-1].br_state); - mval[-1].br_blockcount = mval->br_blockcount; - mval[-1].br_state = mval->br_state; - } else if (n > 0 && mval->br_startblock != DELAYSTARTBLOCK && - mval[-1].br_startblock != DELAYSTARTBLOCK && - mval[-1].br_startblock != HOLESTARTBLOCK && - mval->br_startblock == - mval[-1].br_startblock + mval[-1].br_blockcount && - ((flags & XFS_BMAPI_IGSTATE) || - mval[-1].br_state == mval->br_state)) { - ASSERT(mval->br_startoff == - mval[-1].br_startoff + mval[-1].br_blockcount); - mval[-1].br_blockcount += mval->br_blockcount; - } else if (n > 0 && - mval->br_startblock == DELAYSTARTBLOCK && - mval[-1].br_startblock == DELAYSTARTBLOCK && - mval->br_startoff == - mval[-1].br_startoff + mval[-1].br_blockcount) { - mval[-1].br_blockcount += mval->br_blockcount; - mval[-1].br_state = mval->br_state; - } else if (!((n == 0) && - ((mval->br_startoff + mval->br_blockcount) <= - obno))) { - mval++; - n++; - } /* * If we're done, stop now. Stop when we've allocated * XFS_BMAP_MAX_NMAP extents no matter what. Otherwise * the transaction may get too big. */ - if (bno >= end || n >= *nmap || nallocs >= *nmap) + if (bno >= end || n >= *nmap || bma.nallocs >= *nmap) break; - /* - * Else go on to the next record. - */ - prev = got; - if (++lastx < nextents) { - ep = xfs_iext_get_ext(ifp, lastx); - xfs_bmbt_get_all(ep, &got); - } else { + + /* Else go on to the next record. */ + bma.prev = bma.got; + if (++bma.idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) { + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma.idx), + &bma.got); + } else eof = 1; - } } *nmap = n; + /* * Transform from btree to extents, give it cur. */ - if (tp && XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE && + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE && XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max) { - ASSERT(wr && cur); - error = xfs_bmap_btree_to_extents(tp, ip, cur, + int tmp_logflags = 0; + + ASSERT(bma.cur); + error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &tmp_logflags, whichfork); - logflags |= tmp_logflags; + bma.logflags |= tmp_logflags; if (error) goto error0; } @@ -4855,34 +4984,33 @@ error0: * Log everything. Do this after conversion, there's no point in * logging the extent records if we've converted to btree format. */ - if ((logflags & xfs_ilog_fext(whichfork)) && + if ((bma.logflags & xfs_ilog_fext(whichfork)) && XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) - logflags &= ~xfs_ilog_fext(whichfork); - else if ((logflags & xfs_ilog_fbroot(whichfork)) && + bma.logflags &= ~xfs_ilog_fext(whichfork); + else if ((bma.logflags & xfs_ilog_fbroot(whichfork)) && XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) - logflags &= ~xfs_ilog_fbroot(whichfork); + bma.logflags &= ~xfs_ilog_fbroot(whichfork); /* * Log whatever the flags say, even if error. Otherwise we might miss * detecting a case where the data is changed, there's an error, * and it's not logged so we don't shutdown when we should. */ - if (logflags) { - ASSERT(tp && wr); - xfs_trans_log_inode(tp, ip, logflags); - } - if (cur) { + if (bma.logflags) + xfs_trans_log_inode(tp, ip, bma.logflags); + + if (bma.cur) { if (!error) { ASSERT(*firstblock == NULLFSBLOCK || XFS_FSB_TO_AGNO(mp, *firstblock) == XFS_FSB_TO_AGNO(mp, - cur->bc_private.b.firstblock) || + bma.cur->bc_private.b.firstblock) || (flist->xbf_low && XFS_FSB_TO_AGNO(mp, *firstblock) < XFS_FSB_TO_AGNO(mp, - cur->bc_private.b.firstblock))); - *firstblock = cur->bc_private.b.firstblock; + bma.cur->bc_private.b.firstblock))); + *firstblock = bma.cur->bc_private.b.firstblock; } - xfs_btree_del_cursor(cur, + xfs_btree_del_cursor(bma.cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); } if (!error) @@ -4892,58 +5020,6 @@ error0: } /* - * Map file blocks to filesystem blocks, simple version. - * One block (extent) only, read-only. - * For flags, only the XFS_BMAPI_ATTRFORK flag is examined. - * For the other flag values, the effect is as if XFS_BMAPI_METADATA - * was set and all the others were clear. - */ -int /* error */ -xfs_bmapi_single( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode */ - int whichfork, /* data or attr fork */ - xfs_fsblock_t *fsb, /* output: mapped block */ - xfs_fileoff_t bno) /* starting file offs. mapped */ -{ - int eof; /* we've hit the end of extents */ - int error; /* error return */ - xfs_bmbt_irec_t got; /* current file extent record */ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_extnum_t lastx; /* last useful extent number */ - xfs_bmbt_irec_t prev; /* previous file extent record */ - - ifp = XFS_IFORK_PTR(ip, whichfork); - if (unlikely( - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)) { - XFS_ERROR_REPORT("xfs_bmapi_single", XFS_ERRLEVEL_LOW, - ip->i_mount); - return XFS_ERROR(EFSCORRUPTED); - } - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) - return XFS_ERROR(EIO); - XFS_STATS_INC(xs_blk_mapr); - if (!(ifp->if_flags & XFS_IFEXTENTS) && - (error = xfs_iread_extents(tp, ip, whichfork))) - return error; - (void)xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, - &prev); - /* - * Reading past eof, act as though there's a hole - * up to end. - */ - if (eof || got.br_startoff > bno) { - *fsb = NULLFSBLOCK; - return 0; - } - ASSERT(!isnullstartblock(got.br_startblock)); - ASSERT(bno < got.br_startoff + got.br_blockcount); - *fsb = got.br_startblock + (bno - got.br_startoff); - return 0; -} - -/* * Unmap (remove) blocks from a file. * If nexts is nonzero then the number of extents to remove is limited to * that value. If not all extents in the block range can be removed then @@ -5114,9 +5190,9 @@ xfs_bunmapi( del.br_blockcount = mod; } del.br_state = XFS_EXT_UNWRITTEN; - error = xfs_bmap_add_extent(tp, ip, &lastx, &cur, &del, - firstblock, flist, &logflags, - XFS_DATA_FORK); + error = xfs_bmap_add_extent_unwritten_real(tp, ip, + &lastx, &cur, &del, firstblock, flist, + &logflags); if (error) goto error0; goto nodelete; @@ -5172,18 +5248,18 @@ xfs_bunmapi( } prev.br_state = XFS_EXT_UNWRITTEN; lastx--; - error = xfs_bmap_add_extent(tp, ip, &lastx, - &cur, &prev, firstblock, flist, - &logflags, XFS_DATA_FORK); + error = xfs_bmap_add_extent_unwritten_real(tp, + ip, &lastx, &cur, &prev, + firstblock, flist, &logflags); if (error) goto error0; goto nodelete; } else { ASSERT(del.br_state == XFS_EXT_NORM); del.br_state = XFS_EXT_UNWRITTEN; - error = xfs_bmap_add_extent(tp, ip, &lastx, - &cur, &del, firstblock, flist, - &logflags, XFS_DATA_FORK); + error = xfs_bmap_add_extent_unwritten_real(tp, + ip, &lastx, &cur, &del, + firstblock, flist, &logflags); if (error) goto error0; goto nodelete; @@ -5505,10 +5581,9 @@ xfs_getbmap( do { nmap = (nexleft > subnex) ? subnex : nexleft; - error = xfs_bmapi(NULL, ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset), - XFS_BB_TO_FSB(mp, bmv->bmv_length), - bmapi_flags, NULL, 0, map, &nmap, - NULL); + error = xfs_bmapi_read(ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset), + XFS_BB_TO_FSB(mp, bmv->bmv_length), + map, &nmap, bmapi_flags); if (error) goto out_free_map; ASSERT(nmap <= subnex); @@ -5582,89 +5657,6 @@ xfs_getbmap( return error; } -/* - * Check the last inode extent to determine whether this allocation will result - * in blocks being allocated at the end of the file. When we allocate new data - * blocks at the end of the file which do not start at the previous data block, - * we will try to align the new blocks at stripe unit boundaries. - */ -STATIC int /* error */ -xfs_bmap_isaeof( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_fileoff_t off, /* file offset in fsblocks */ - int whichfork, /* data or attribute fork */ - char *aeof) /* return value */ -{ - int error; /* error return value */ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_bmbt_rec_host_t *lastrec; /* extent record pointer */ - xfs_extnum_t nextents; /* number of file extents */ - xfs_bmbt_irec_t s; /* expanded extent record */ - - ASSERT(whichfork == XFS_DATA_FORK); - ifp = XFS_IFORK_PTR(ip, whichfork); - if (!(ifp->if_flags & XFS_IFEXTENTS) && - (error = xfs_iread_extents(NULL, ip, whichfork))) - return error; - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - if (nextents == 0) { - *aeof = 1; - return 0; - } - /* - * Go to the last extent - */ - lastrec = xfs_iext_get_ext(ifp, nextents - 1); - xfs_bmbt_get_all(lastrec, &s); - /* - * Check we are allocating in the last extent (for delayed allocations) - * or past the last extent for non-delayed allocations. - */ - *aeof = (off >= s.br_startoff && - off < s.br_startoff + s.br_blockcount && - isnullstartblock(s.br_startblock)) || - off >= s.br_startoff + s.br_blockcount; - return 0; -} - -/* - * Check if the endoff is outside the last extent. If so the caller will grow - * the allocation to a stripe unit boundary. - */ -int /* error */ -xfs_bmap_eof( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_fileoff_t endoff, /* file offset in fsblocks */ - int whichfork, /* data or attribute fork */ - int *eof) /* result value */ -{ - xfs_fsblock_t blockcount; /* extent block count */ - int error; /* error return value */ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_bmbt_rec_host_t *lastrec; /* extent record pointer */ - xfs_extnum_t nextents; /* number of file extents */ - xfs_fileoff_t startoff; /* extent starting file offset */ - - ASSERT(whichfork == XFS_DATA_FORK); - ifp = XFS_IFORK_PTR(ip, whichfork); - if (!(ifp->if_flags & XFS_IFEXTENTS) && - (error = xfs_iread_extents(NULL, ip, whichfork))) - return error; - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - if (nextents == 0) { - *eof = 1; - return 0; - } - /* - * Go to the last extent - */ - lastrec = xfs_iext_get_ext(ifp, nextents - 1); - startoff = xfs_bmbt_get_startoff(lastrec); - blockcount = xfs_bmbt_get_blockcount(lastrec); - *eof = endoff >= startoff + blockcount; - return 0; -} - #ifdef DEBUG STATIC struct xfs_buf * xfs_bmap_get_bp( @@ -6099,9 +6091,8 @@ xfs_bmap_punch_delalloc_range( * trying to remove a real extent (which requires a * transaction) or a hole, which is probably a bad idea... */ - error = xfs_bmapi(NULL, ip, start_fsb, 1, - XFS_BMAPI_ENTIRE, NULL, 0, &imap, - &nimaps, NULL); + error = xfs_bmapi_read(ip, start_fsb, 1, &imap, &nimaps, + XFS_BMAPI_ENTIRE); if (error) { /* something screwed, just bail */ diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index c62234bde05..89ee672d378 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -62,27 +62,23 @@ typedef struct xfs_bmap_free #define XFS_BMAP_MAX_NMAP 4 /* - * Flags for xfs_bmapi + * Flags for xfs_bmapi_* */ -#define XFS_BMAPI_WRITE 0x001 /* write operation: allocate space */ -#define XFS_BMAPI_DELAY 0x002 /* delayed write operation */ -#define XFS_BMAPI_ENTIRE 0x004 /* return entire extent, not trimmed */ -#define XFS_BMAPI_METADATA 0x008 /* mapping metadata not user data */ -#define XFS_BMAPI_ATTRFORK 0x010 /* use attribute fork not data */ -#define XFS_BMAPI_PREALLOC 0x040 /* preallocation op: unwritten space */ -#define XFS_BMAPI_IGSTATE 0x080 /* Ignore state - */ +#define XFS_BMAPI_ENTIRE 0x001 /* return entire extent, not trimmed */ +#define XFS_BMAPI_METADATA 0x002 /* mapping metadata not user data */ +#define XFS_BMAPI_ATTRFORK 0x004 /* use attribute fork not data */ +#define XFS_BMAPI_PREALLOC 0x008 /* preallocation op: unwritten space */ +#define XFS_BMAPI_IGSTATE 0x010 /* Ignore state - */ /* combine contig. space */ -#define XFS_BMAPI_CONTIG 0x100 /* must allocate only one extent */ +#define XFS_BMAPI_CONTIG 0x020 /* must allocate only one extent */ /* * unwritten extent conversion - this needs write cache flushing and no additional * allocation alignments. When specified with XFS_BMAPI_PREALLOC it converts * from written to unwritten, otherwise convert from unwritten to written. */ -#define XFS_BMAPI_CONVERT 0x200 +#define XFS_BMAPI_CONVERT 0x040 #define XFS_BMAPI_FLAGS \ - { XFS_BMAPI_WRITE, "WRITE" }, \ - { XFS_BMAPI_DELAY, "DELAY" }, \ { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ { XFS_BMAPI_METADATA, "METADATA" }, \ { XFS_BMAPI_ATTRFORK, "ATTRFORK" }, \ @@ -113,21 +109,28 @@ static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp) * Argument structure for xfs_bmap_alloc. */ typedef struct xfs_bmalloca { - xfs_fsblock_t firstblock; /* i/o first block allocated */ - xfs_fsblock_t rval; /* starting block of new extent */ - xfs_fileoff_t off; /* offset in file filling in */ + xfs_fsblock_t *firstblock; /* i/o first block allocated */ + struct xfs_bmap_free *flist; /* bmap freelist */ struct xfs_trans *tp; /* transaction pointer */ struct xfs_inode *ip; /* incore inode pointer */ - struct xfs_bmbt_irec *prevp; /* extent before the new one */ - struct xfs_bmbt_irec *gotp; /* extent after, or delayed */ - xfs_extlen_t alen; /* i/o length asked/allocated */ + struct xfs_bmbt_irec prev; /* extent before the new one */ + struct xfs_bmbt_irec got; /* extent after, or delayed */ + + xfs_fileoff_t offset; /* offset in file filling in */ + xfs_extlen_t length; /* i/o length asked/allocated */ + xfs_fsblock_t blkno; /* starting block of new extent */ + + struct xfs_btree_cur *cur; /* btree cursor */ + xfs_extnum_t idx; /* current extent index */ + int nallocs;/* number of extents alloc'd */ + int logflags;/* flags for transaction logging */ + xfs_extlen_t total; /* total blocks needed for xaction */ xfs_extlen_t minlen; /* minimum allocation size (blocks) */ xfs_extlen_t minleft; /* amount must be left after alloc */ char eof; /* set if allocating past last extent */ char wasdel; /* replacing a delayed allocation */ char userdata;/* set if is user data */ - char low; /* low on space, using seq'l ags */ char aeof; /* allocated space at eof */ char conv; /* overwriting unwritten extents */ } xfs_bmalloca_t; @@ -152,251 +155,62 @@ typedef struct xfs_bmalloca { { BMAP_RIGHT_FILLING, "RF" }, \ { BMAP_ATTRFORK, "ATTR" } -/* - * Add bmap trace insert entries for all the contents of the extent list. - * - * Quite excessive tracing. Only do this for debug builds. - */ #if defined(__KERNEL) && defined(DEBUG) -void -xfs_bmap_trace_exlist( - struct xfs_inode *ip, /* incore inode pointer */ - xfs_extnum_t cnt, /* count of entries in list */ - int whichfork, - unsigned long caller_ip); /* data or attr fork */ +void xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt, + int whichfork, unsigned long caller_ip); #define XFS_BMAP_TRACE_EXLIST(ip,c,w) \ xfs_bmap_trace_exlist(ip,c,w, _THIS_IP_) #else #define XFS_BMAP_TRACE_EXLIST(ip,c,w) #endif -/* - * Convert inode from non-attributed to attributed. - * Must not be in a transaction, ip must not be locked. - */ -int /* error code */ -xfs_bmap_add_attrfork( - struct xfs_inode *ip, /* incore inode pointer */ - int size, /* space needed for new attribute */ - int rsvd); /* flag for reserved block allocation */ - -/* - * Add the extent to the list of extents to be free at transaction end. - * The list is maintained sorted (by block number). - */ -void -xfs_bmap_add_free( - xfs_fsblock_t bno, /* fs block number of extent */ - xfs_filblks_t len, /* length of extent */ - xfs_bmap_free_t *flist, /* list of extents */ - struct xfs_mount *mp); /* mount point structure */ - -/* - * Routine to clean up the free list data structure when - * an error occurs during a transaction. - */ -void -xfs_bmap_cancel( - xfs_bmap_free_t *flist); /* free list to clean up */ - -/* - * Compute and fill in the value of the maximum depth of a bmap btree - * in this filesystem. Done once, during mount. - */ -void -xfs_bmap_compute_maxlevels( - struct xfs_mount *mp, /* file system mount structure */ - int whichfork); /* data or attr fork */ - -/* - * Returns the file-relative block number of the first unused block in the file. - * This is the lowest-address hole if the file has holes, else the first block - * past the end of file. - */ -int /* error */ -xfs_bmap_first_unused( - struct xfs_trans *tp, /* transaction pointer */ - struct xfs_inode *ip, /* incore inode */ - xfs_extlen_t len, /* size of hole to find */ - xfs_fileoff_t *unused, /* unused block num */ - int whichfork); /* data or attr fork */ - -/* - * Returns the file-relative block number of the last block + 1 before - * last_block (input value) in the file. - * This is not based on i_size, it is based on the extent list. - * Returns 0 for local files, as they do not have an extent list. - */ -int /* error */ -xfs_bmap_last_before( - struct xfs_trans *tp, /* transaction pointer */ - struct xfs_inode *ip, /* incore inode */ - xfs_fileoff_t *last_block, /* last block */ - int whichfork); /* data or attr fork */ - -/* - * Returns the file-relative block number of the first block past eof in - * the file. This is not based on i_size, it is based on the extent list. - * Returns 0 for local files, as they do not have an extent list. - */ -int /* error */ -xfs_bmap_last_offset( - struct xfs_trans *tp, /* transaction pointer */ - struct xfs_inode *ip, /* incore inode */ - xfs_fileoff_t *unused, /* last block num */ - int whichfork); /* data or attr fork */ - -/* - * Returns whether the selected fork of the inode has exactly one - * block or not. For the data fork we check this matches di_size, - * implying the file's range is 0..bsize-1. - */ -int -xfs_bmap_one_block( - struct xfs_inode *ip, /* incore inode */ - int whichfork); /* data or attr fork */ - -/* - * Read in the extents to iu_extents. - * All inode fields are set up by caller, we just traverse the btree - * and copy the records in. - */ -int /* error */ -xfs_bmap_read_extents( - struct xfs_trans *tp, /* transaction pointer */ - struct xfs_inode *ip, /* incore inode */ - int whichfork); /* data or attr fork */ - -/* - * Map file blocks to filesystem blocks. - * File range is given by the bno/len pair. - * Adds blocks to file if a write ("flags & XFS_BMAPI_WRITE" set) - * into a hole or past eof. - * Only allocates blocks from a single allocation group, - * to avoid locking problems. - * The returned value in "firstblock" from the first call in a transaction - * must be remembered and presented to subsequent calls in "firstblock". - * An upper bound for the number of blocks to be allocated is supplied to - * the first call in "total"; if no allocation group has that many free - * blocks then the call will fail (return NULLFSBLOCK in "firstblock"). - */ -int /* error */ -xfs_bmapi( - struct xfs_trans *tp, /* transaction pointer */ - struct xfs_inode *ip, /* incore inode */ - xfs_fileoff_t bno, /* starting file offs. mapped */ - xfs_filblks_t len, /* length to map in file */ - int flags, /* XFS_BMAPI_... */ - xfs_fsblock_t *firstblock, /* first allocated block - controls a.g. for allocs */ - xfs_extlen_t total, /* total blocks needed */ - struct xfs_bmbt_irec *mval, /* output: map values */ - int *nmap, /* i/o: mval size/count */ - xfs_bmap_free_t *flist); /* i/o: list extents to free */ - -/* - * Map file blocks to filesystem blocks, simple version. - * One block only, read-only. - * For flags, only the XFS_BMAPI_ATTRFORK flag is examined. - * For the other flag values, the effect is as if XFS_BMAPI_METADATA - * was set and all the others were clear. - */ -int /* error */ -xfs_bmapi_single( - struct xfs_trans *tp, /* transaction pointer */ - struct xfs_inode *ip, /* incore inode */ - int whichfork, /* data or attr fork */ - xfs_fsblock_t *fsb, /* output: mapped block */ - xfs_fileoff_t bno); /* starting file offs. mapped */ - -/* - * Unmap (remove) blocks from a file. - * If nexts is nonzero then the number of extents to remove is limited to - * that value. If not all extents in the block range can be removed then - * *done is set. - */ -int /* error */ -xfs_bunmapi( - struct xfs_trans *tp, /* transaction pointer */ - struct xfs_inode *ip, /* incore inode */ - xfs_fileoff_t bno, /* starting offset to unmap */ - xfs_filblks_t len, /* length to unmap in file */ - int flags, /* XFS_BMAPI_... */ - xfs_extnum_t nexts, /* number of extents max */ - xfs_fsblock_t *firstblock, /* first allocated block - controls a.g. for allocs */ - xfs_bmap_free_t *flist, /* i/o: list extents to free */ - int *done); /* set if not done yet */ - -/* - * Check an extent list, which has just been read, for - * any bit in the extent flag field. - */ -int -xfs_check_nostate_extents( - struct xfs_ifork *ifp, - xfs_extnum_t idx, - xfs_extnum_t num); - -uint -xfs_default_attroffset( - struct xfs_inode *ip); +int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd); +void xfs_bmap_add_free(xfs_fsblock_t bno, xfs_filblks_t len, + struct xfs_bmap_free *flist, struct xfs_mount *mp); +void xfs_bmap_cancel(struct xfs_bmap_free *flist); +void xfs_bmap_compute_maxlevels(struct xfs_mount *mp, int whichfork); +int xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip, + xfs_extlen_t len, xfs_fileoff_t *unused, int whichfork); +int xfs_bmap_last_before(struct xfs_trans *tp, struct xfs_inode *ip, + xfs_fileoff_t *last_block, int whichfork); +int xfs_bmap_last_offset(struct xfs_trans *tp, struct xfs_inode *ip, + xfs_fileoff_t *unused, int whichfork); +int xfs_bmap_one_block(struct xfs_inode *ip, int whichfork); +int xfs_bmap_read_extents(struct xfs_trans *tp, struct xfs_inode *ip, + int whichfork); +int xfs_bmapi_read(struct xfs_inode *ip, xfs_fileoff_t bno, + xfs_filblks_t len, struct xfs_bmbt_irec *mval, + int *nmap, int flags); +int xfs_bmapi_delay(struct xfs_inode *ip, xfs_fileoff_t bno, + xfs_filblks_t len, struct xfs_bmbt_irec *mval, + int *nmap, int flags); +int xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip, + xfs_fileoff_t bno, xfs_filblks_t len, int flags, + xfs_fsblock_t *firstblock, xfs_extlen_t total, + struct xfs_bmbt_irec *mval, int *nmap, + struct xfs_bmap_free *flist); +int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip, + xfs_fileoff_t bno, xfs_filblks_t len, int flags, + xfs_extnum_t nexts, xfs_fsblock_t *firstblock, + struct xfs_bmap_free *flist, int *done); +int xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx, + xfs_extnum_t num); +uint xfs_default_attroffset(struct xfs_inode *ip); #ifdef __KERNEL__ - -/* - * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi - * caller. Frees all the extents that need freeing, which must be done - * last due to locking considerations. - * - * Return 1 if the given transaction was committed and a new one allocated, - * and 0 otherwise. - */ -int /* error */ -xfs_bmap_finish( - struct xfs_trans **tp, /* transaction pointer addr */ - xfs_bmap_free_t *flist, /* i/o: list extents to free */ - int *committed); /* xact committed or not */ - /* bmap to userspace formatter - copy to user & advance pointer */ typedef int (*xfs_bmap_format_t)(void **, struct getbmapx *, int *); -/* - * Get inode's extents as described in bmv, and format for output. - */ -int /* error code */ -xfs_getbmap( - xfs_inode_t *ip, - struct getbmapx *bmv, /* user bmap structure */ - xfs_bmap_format_t formatter, /* format to user */ - void *arg); /* formatter arg */ - -/* - * Check if the endoff is outside the last extent. If so the caller will grow - * the allocation to a stripe unit boundary - */ -int -xfs_bmap_eof( - struct xfs_inode *ip, - xfs_fileoff_t endoff, - int whichfork, - int *eof); - -/* - * Count fsblocks of the given fork. - */ -int -xfs_bmap_count_blocks( - xfs_trans_t *tp, - struct xfs_inode *ip, - int whichfork, - int *count); - -int -xfs_bmap_punch_delalloc_range( - struct xfs_inode *ip, - xfs_fileoff_t start_fsb, - xfs_fileoff_t length); +int xfs_bmap_finish(struct xfs_trans **tp, struct xfs_bmap_free *flist, + int *committed); +int xfs_getbmap(struct xfs_inode *ip, struct getbmapx *bmv, + xfs_bmap_format_t formatter, void *arg); +int xfs_bmap_eof(struct xfs_inode *ip, xfs_fileoff_t endoff, + int whichfork, int *eof); +int xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip, + int whichfork, int *count); +int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, + xfs_fileoff_t start_fsb, xfs_fileoff_t length); #endif /* __KERNEL__ */ #endif /* __XFS_BMAP_H__ */ diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index 2b9fd385e27..1f19f03af9d 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c @@ -631,7 +631,7 @@ xfs_btree_read_bufl( } ASSERT(!xfs_buf_geterror(bp)); if (bp) - XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval); + xfs_buf_set_ref(bp, refval); *bpp = bp; return 0; } @@ -939,13 +939,13 @@ xfs_btree_set_refs( switch (cur->bc_btnum) { case XFS_BTNUM_BNO: case XFS_BTNUM_CNT: - XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, XFS_ALLOC_BTREE_REF); + xfs_buf_set_ref(bp, XFS_ALLOC_BTREE_REF); break; case XFS_BTNUM_INO: - XFS_BUF_SET_VTYPE_REF(bp, B_FS_INOMAP, XFS_INO_BTREE_REF); + xfs_buf_set_ref(bp, XFS_INO_BTREE_REF); break; case XFS_BTNUM_BMAP: - XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, XFS_BMAP_BTREE_REF); + xfs_buf_set_ref(bp, XFS_BMAP_BTREE_REF); break; default: ASSERT(0); @@ -970,7 +970,8 @@ xfs_btree_get_buf_block( *bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d, mp->m_bsize, flags); - ASSERT(!xfs_buf_geterror(*bpp)); + if (!*bpp) + return ENOMEM; *block = XFS_BUF_TO_BLOCK(*bpp); return 0; diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index c57836dc778..cf0ac056815 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -43,7 +43,6 @@ static kmem_zone_t *xfs_buf_zone; STATIC int xfsbufd(void *); -STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int); static struct workqueue_struct *xfslogd_workqueue; struct workqueue_struct *xfsdatad_workqueue; @@ -66,10 +65,6 @@ struct workqueue_struct *xfsconvertd_workqueue; #define xb_to_km(flags) \ (((flags) & XBF_DONT_BLOCK) ? KM_NOFS : KM_SLEEP) -#define xfs_buf_allocate(flags) \ - kmem_zone_alloc(xfs_buf_zone, xb_to_km(flags)) -#define xfs_buf_deallocate(bp) \ - kmem_zone_free(xfs_buf_zone, (bp)); static inline int xfs_buf_is_vmapped( @@ -152,6 +147,7 @@ xfs_buf_stale( struct xfs_buf *bp) { bp->b_flags |= XBF_STALE; + xfs_buf_delwri_dequeue(bp); atomic_set(&(bp)->b_lru_ref, 0); if (!list_empty(&bp->b_lru)) { struct xfs_buftarg *btp = bp->b_target; @@ -167,14 +163,19 @@ xfs_buf_stale( ASSERT(atomic_read(&bp->b_hold) >= 1); } -STATIC void -_xfs_buf_initialize( - xfs_buf_t *bp, - xfs_buftarg_t *target, +struct xfs_buf * +xfs_buf_alloc( + struct xfs_buftarg *target, xfs_off_t range_base, size_t range_length, xfs_buf_flags_t flags) { + struct xfs_buf *bp; + + bp = kmem_zone_alloc(xfs_buf_zone, xb_to_km(flags)); + if (unlikely(!bp)) + return NULL; + /* * We don't want certain flags to appear in b_flags. */ @@ -203,8 +204,9 @@ _xfs_buf_initialize( init_waitqueue_head(&bp->b_waiters); XFS_STATS_INC(xb_create); - trace_xfs_buf_init(bp, _RET_IP_); + + return bp; } /* @@ -277,7 +279,7 @@ xfs_buf_free( } else if (bp->b_flags & _XBF_KMEM) kmem_free(bp->b_addr); _xfs_buf_free_pages(bp); - xfs_buf_deallocate(bp); + kmem_zone_free(xfs_buf_zone, bp); } /* @@ -416,10 +418,7 @@ _xfs_buf_map_pages( /* * Look up, and creates if absent, a lockable buffer for * a given range of an inode. The buffer is returned - * locked. If other overlapping buffers exist, they are - * released before the new buffer is created and locked, - * which may imply that this call will block until those buffers - * are unlocked. No I/O is implied by this call. + * locked. No I/O is implied by this call. */ xfs_buf_t * _xfs_buf_find( @@ -481,8 +480,6 @@ _xfs_buf_find( /* No match found */ if (new_bp) { - _xfs_buf_initialize(new_bp, btp, range_base, - range_length, flags); rb_link_node(&new_bp->b_rbnode, parent, rbp); rb_insert_color(&new_bp->b_rbnode, &pag->pag_buf_tree); /* the buffer keeps the perag reference until it is freed */ @@ -525,35 +522,51 @@ found: } /* - * Assembles a buffer covering the specified range. - * Storage in memory for all portions of the buffer will be allocated, - * although backing storage may not be. + * Assembles a buffer covering the specified range. The code is optimised for + * cache hits, as metadata intensive workloads will see 3 orders of magnitude + * more hits than misses. */ -xfs_buf_t * +struct xfs_buf * xfs_buf_get( xfs_buftarg_t *target,/* target for buffer */ xfs_off_t ioff, /* starting offset of range */ size_t isize, /* length of range */ xfs_buf_flags_t flags) { - xfs_buf_t *bp, *new_bp; + struct xfs_buf *bp; + struct xfs_buf *new_bp; int error = 0; - new_bp = xfs_buf_allocate(flags); + bp = _xfs_buf_find(target, ioff, isize, flags, NULL); + if (likely(bp)) + goto found; + + new_bp = xfs_buf_alloc(target, ioff << BBSHIFT, isize << BBSHIFT, + flags); if (unlikely(!new_bp)) return NULL; bp = _xfs_buf_find(target, ioff, isize, flags, new_bp); + if (!bp) { + kmem_zone_free(xfs_buf_zone, new_bp); + return NULL; + } + if (bp == new_bp) { error = xfs_buf_allocate_memory(bp, flags); if (error) goto no_buffer; - } else { - xfs_buf_deallocate(new_bp); - if (unlikely(bp == NULL)) - return NULL; - } + } else + kmem_zone_free(xfs_buf_zone, new_bp); + /* + * Now we have a workable buffer, fill in the block number so + * that we can do IO on it. + */ + bp->b_bn = ioff; + bp->b_count_desired = bp->b_buffer_length; + +found: if (!(bp->b_flags & XBF_MAPPED)) { error = _xfs_buf_map_pages(bp, flags); if (unlikely(error)) { @@ -564,18 +577,10 @@ xfs_buf_get( } XFS_STATS_INC(xb_get); - - /* - * Always fill in the block number now, the mapped cases can do - * their own overlay of this later. - */ - bp->b_bn = ioff; - bp->b_count_desired = bp->b_buffer_length; - trace_xfs_buf_get(bp, flags, _RET_IP_); return bp; - no_buffer: +no_buffer: if (flags & (XBF_LOCK | XBF_TRYLOCK)) xfs_buf_unlock(bp); xfs_buf_rele(bp); @@ -689,19 +694,6 @@ xfs_buf_read_uncached( return bp; } -xfs_buf_t * -xfs_buf_get_empty( - size_t len, - xfs_buftarg_t *target) -{ - xfs_buf_t *bp; - - bp = xfs_buf_allocate(0); - if (bp) - _xfs_buf_initialize(bp, target, 0, len, 0); - return bp; -} - /* * Return a buffer allocated as an empty buffer and associated to external * memory via xfs_buf_associate_memory() back to it's empty state. @@ -787,10 +779,9 @@ xfs_buf_get_uncached( int error, i; xfs_buf_t *bp; - bp = xfs_buf_allocate(0); + bp = xfs_buf_alloc(target, 0, len, 0); if (unlikely(bp == NULL)) goto fail; - _xfs_buf_initialize(bp, target, 0, len, 0); error = _xfs_buf_get_pages(bp, page_count, 0); if (error) @@ -818,7 +809,7 @@ xfs_buf_get_uncached( __free_page(bp->b_pages[i]); _xfs_buf_free_pages(bp); fail_free_buf: - xfs_buf_deallocate(bp); + kmem_zone_free(xfs_buf_zone, bp); fail: return NULL; } @@ -937,12 +928,6 @@ void xfs_buf_unlock( struct xfs_buf *bp) { - if ((bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)) == XBF_DELWRI) { - atomic_inc(&bp->b_hold); - bp->b_flags |= XBF_ASYNC; - xfs_buf_delwri_queue(bp, 0); - } - XB_CLEAR_OWNER(bp); up(&bp->b_sema); @@ -1019,9 +1004,19 @@ xfs_buf_ioerror( trace_xfs_buf_ioerror(bp, error, _RET_IP_); } +void +xfs_buf_ioerror_alert( + struct xfs_buf *bp, + const char *func) +{ + xfs_alert(bp->b_target->bt_mount, +"metadata I/O error: block 0x%llx (\"%s\") error %d buf count %zd", + (__uint64_t)XFS_BUF_ADDR(bp), func, + bp->b_error, XFS_BUF_COUNT(bp)); +} + int xfs_bwrite( - struct xfs_mount *mp, struct xfs_buf *bp) { int error; @@ -1033,25 +1028,13 @@ xfs_bwrite( xfs_bdstrat_cb(bp); error = xfs_buf_iowait(bp); - if (error) - xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); - xfs_buf_relse(bp); + if (error) { + xfs_force_shutdown(bp->b_target->bt_mount, + SHUTDOWN_META_IO_ERROR); + } return error; } -void -xfs_bdwrite( - void *mp, - struct xfs_buf *bp) -{ - trace_xfs_buf_bdwrite(bp, _RET_IP_); - - bp->b_flags &= ~XBF_READ; - bp->b_flags |= (XBF_DELWRI | XBF_ASYNC); - - xfs_buf_delwri_queue(bp, 1); -} - /* * Called when we want to stop a buffer from getting written or read. * We attach the EIO error, muck with its flags, and call xfs_buf_ioend @@ -1074,9 +1057,8 @@ xfs_bioerror( * We're calling xfs_buf_ioend, so delete XBF_DONE flag. */ XFS_BUF_UNREAD(bp); - XFS_BUF_UNDELAYWRITE(bp); XFS_BUF_UNDONE(bp); - XFS_BUF_STALE(bp); + xfs_buf_stale(bp); xfs_buf_ioend(bp, 0); @@ -1103,9 +1085,8 @@ xfs_bioerror_relse( * change that interface. */ XFS_BUF_UNREAD(bp); - XFS_BUF_UNDELAYWRITE(bp); XFS_BUF_DONE(bp); - XFS_BUF_STALE(bp); + xfs_buf_stale(bp); bp->b_iodone = NULL; if (!(fl & XBF_ASYNC)) { /* @@ -1115,7 +1096,7 @@ xfs_bioerror_relse( * ASYNC buffers. */ xfs_buf_ioerror(bp, EIO); - XFS_BUF_FINISH_IOWAIT(bp); + complete(&bp->b_iowait); } else { xfs_buf_relse(bp); } @@ -1275,15 +1256,10 @@ xfs_buf_iorequest( { trace_xfs_buf_iorequest(bp, _RET_IP_); - if (bp->b_flags & XBF_DELWRI) { - xfs_buf_delwri_queue(bp, 1); - return 0; - } + ASSERT(!(bp->b_flags & XBF_DELWRI)); - if (bp->b_flags & XBF_WRITE) { + if (bp->b_flags & XBF_WRITE) xfs_buf_wait_unpin(bp); - } - xfs_buf_hold(bp); /* Set the count to 1 initially, this will stop an I/O @@ -1481,9 +1457,13 @@ xfs_setsize_buftarg_flags( btp->bt_smask = sectorsize - 1; if (set_blocksize(btp->bt_bdev, sectorsize)) { + char name[BDEVNAME_SIZE]; + + bdevname(btp->bt_bdev, name); + xfs_warn(btp->bt_mount, "Cannot set_blocksize to %u on device %s\n", - sectorsize, xfs_buf_target_name(btp)); + sectorsize, name); return EINVAL; } @@ -1514,12 +1494,12 @@ xfs_setsize_buftarg( } STATIC int -xfs_alloc_delwrite_queue( +xfs_alloc_delwri_queue( xfs_buftarg_t *btp, const char *fsname) { - INIT_LIST_HEAD(&btp->bt_delwrite_queue); - spin_lock_init(&btp->bt_delwrite_lock); + INIT_LIST_HEAD(&btp->bt_delwri_queue); + spin_lock_init(&btp->bt_delwri_lock); btp->bt_flags = 0; btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname); if (IS_ERR(btp->bt_task)) @@ -1549,7 +1529,7 @@ xfs_alloc_buftarg( spin_lock_init(&btp->bt_lru_lock); if (xfs_setsize_buftarg_early(btp, bdev)) goto error; - if (xfs_alloc_delwrite_queue(btp, fsname)) + if (xfs_alloc_delwri_queue(btp, fsname)) goto error; btp->bt_shrinker.shrink = xfs_buftarg_shrink; btp->bt_shrinker.seeks = DEFAULT_SEEKS; @@ -1565,56 +1545,48 @@ error: /* * Delayed write buffer handling */ -STATIC void +void xfs_buf_delwri_queue( - xfs_buf_t *bp, - int unlock) + xfs_buf_t *bp) { - struct list_head *dwq = &bp->b_target->bt_delwrite_queue; - spinlock_t *dwlk = &bp->b_target->bt_delwrite_lock; + struct xfs_buftarg *btp = bp->b_target; trace_xfs_buf_delwri_queue(bp, _RET_IP_); - ASSERT((bp->b_flags&(XBF_DELWRI|XBF_ASYNC)) == (XBF_DELWRI|XBF_ASYNC)); + ASSERT(!(bp->b_flags & XBF_READ)); - spin_lock(dwlk); - /* If already in the queue, dequeue and place at tail */ + spin_lock(&btp->bt_delwri_lock); if (!list_empty(&bp->b_list)) { + /* if already in the queue, move it to the tail */ ASSERT(bp->b_flags & _XBF_DELWRI_Q); - if (unlock) - atomic_dec(&bp->b_hold); - list_del(&bp->b_list); - } - - if (list_empty(dwq)) { + list_move_tail(&bp->b_list, &btp->bt_delwri_queue); + } else { /* start xfsbufd as it is about to have something to do */ - wake_up_process(bp->b_target->bt_task); - } + if (list_empty(&btp->bt_delwri_queue)) + wake_up_process(bp->b_target->bt_task); - bp->b_flags |= _XBF_DELWRI_Q; - list_add_tail(&bp->b_list, dwq); + atomic_inc(&bp->b_hold); + bp->b_flags |= XBF_DELWRI | _XBF_DELWRI_Q | XBF_ASYNC; + list_add_tail(&bp->b_list, &btp->bt_delwri_queue); + } bp->b_queuetime = jiffies; - spin_unlock(dwlk); - - if (unlock) - xfs_buf_unlock(bp); + spin_unlock(&btp->bt_delwri_lock); } void xfs_buf_delwri_dequeue( xfs_buf_t *bp) { - spinlock_t *dwlk = &bp->b_target->bt_delwrite_lock; int dequeued = 0; - spin_lock(dwlk); + spin_lock(&bp->b_target->bt_delwri_lock); if ((bp->b_flags & XBF_DELWRI) && !list_empty(&bp->b_list)) { ASSERT(bp->b_flags & _XBF_DELWRI_Q); list_del_init(&bp->b_list); dequeued = 1; } bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q); - spin_unlock(dwlk); + spin_unlock(&bp->b_target->bt_delwri_lock); if (dequeued) xfs_buf_rele(bp); @@ -1646,16 +1618,9 @@ xfs_buf_delwri_promote( if (bp->b_queuetime < jiffies - age) return; bp->b_queuetime = jiffies - age; - spin_lock(&btp->bt_delwrite_lock); - list_move(&bp->b_list, &btp->bt_delwrite_queue); - spin_unlock(&btp->bt_delwrite_lock); -} - -STATIC void -xfs_buf_runall_queues( - struct workqueue_struct *queue) -{ - flush_workqueue(queue); + spin_lock(&btp->bt_delwri_lock); + list_move(&bp->b_list, &btp->bt_delwri_queue); + spin_unlock(&btp->bt_delwri_lock); } /* @@ -1669,15 +1634,13 @@ xfs_buf_delwri_split( unsigned long age) { xfs_buf_t *bp, *n; - struct list_head *dwq = &target->bt_delwrite_queue; - spinlock_t *dwlk = &target->bt_delwrite_lock; int skipped = 0; int force; force = test_and_clear_bit(XBT_FORCE_FLUSH, &target->bt_flags); INIT_LIST_HEAD(list); - spin_lock(dwlk); - list_for_each_entry_safe(bp, n, dwq, b_list) { + spin_lock(&target->bt_delwri_lock); + list_for_each_entry_safe(bp, n, &target->bt_delwri_queue, b_list) { ASSERT(bp->b_flags & XBF_DELWRI); if (!xfs_buf_ispinned(bp) && xfs_buf_trylock(bp)) { @@ -1694,10 +1657,9 @@ xfs_buf_delwri_split( } else skipped++; } - spin_unlock(dwlk); + spin_unlock(&target->bt_delwri_lock); return skipped; - } /* @@ -1747,7 +1709,7 @@ xfsbufd( } /* sleep for a long time if there is nothing to do. */ - if (list_empty(&target->bt_delwrite_queue)) + if (list_empty(&target->bt_delwri_queue)) tout = MAX_SCHEDULE_TIMEOUT; schedule_timeout_interruptible(tout); @@ -1783,9 +1745,7 @@ xfs_flush_buftarg( LIST_HEAD(wait_list); struct blk_plug plug; - xfs_buf_runall_queues(xfsconvertd_workqueue); - xfs_buf_runall_queues(xfsdatad_workqueue); - xfs_buf_runall_queues(xfslogd_workqueue); + flush_workqueue(xfslogd_workqueue); set_bit(XBT_FORCE_FLUSH, &target->bt_flags); pincount = xfs_buf_delwri_split(target, &tmp_list, 0); @@ -1866,11 +1826,3 @@ xfs_buf_terminate(void) destroy_workqueue(xfslogd_workqueue); kmem_zone_destroy(xfs_buf_zone); } - -#ifdef CONFIG_KDB_MODULES -struct list_head * -xfs_get_buftarg_list(void) -{ - return &xfs_buftarg_list; -} -#endif diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 620972b8094..5bab046e859 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -105,8 +105,8 @@ typedef struct xfs_buftarg { /* per device delwri queue */ struct task_struct *bt_task; - struct list_head bt_delwrite_queue; - spinlock_t bt_delwrite_lock; + struct list_head bt_delwri_queue; + spinlock_t bt_delwri_lock; unsigned long bt_flags; /* LRU control structures */ @@ -175,7 +175,8 @@ extern xfs_buf_t *xfs_buf_get(xfs_buftarg_t *, xfs_off_t, size_t, extern xfs_buf_t *xfs_buf_read(xfs_buftarg_t *, xfs_off_t, size_t, xfs_buf_flags_t); -extern xfs_buf_t *xfs_buf_get_empty(size_t, xfs_buftarg_t *); +struct xfs_buf *xfs_buf_alloc(struct xfs_buftarg *, xfs_off_t, size_t, + xfs_buf_flags_t); extern void xfs_buf_set_empty(struct xfs_buf *bp, size_t len); extern xfs_buf_t *xfs_buf_get_uncached(struct xfs_buftarg *, size_t, int); extern int xfs_buf_associate_memory(xfs_buf_t *, void *, size_t); @@ -197,14 +198,14 @@ extern void xfs_buf_unlock(xfs_buf_t *); ((bp)->b_sema.count <= 0) /* Buffer Read and Write Routines */ -extern int xfs_bwrite(struct xfs_mount *mp, struct xfs_buf *bp); -extern void xfs_bdwrite(void *mp, xfs_buf_t *bp); +extern int xfs_bwrite(struct xfs_buf *bp); extern void xfsbdstrat(struct xfs_mount *, struct xfs_buf *); extern int xfs_bdstrat_cb(struct xfs_buf *); extern void xfs_buf_ioend(xfs_buf_t *, int); extern void xfs_buf_ioerror(xfs_buf_t *, int); +extern void xfs_buf_ioerror_alert(struct xfs_buf *, const char *func); extern int xfs_buf_iorequest(xfs_buf_t *); extern int xfs_buf_iowait(xfs_buf_t *); extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *, @@ -221,38 +222,22 @@ static inline int xfs_buf_geterror(xfs_buf_t *bp) extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t); /* Delayed Write Buffer Routines */ -extern void xfs_buf_delwri_dequeue(xfs_buf_t *); -extern void xfs_buf_delwri_promote(xfs_buf_t *); +extern void xfs_buf_delwri_queue(struct xfs_buf *); +extern void xfs_buf_delwri_dequeue(struct xfs_buf *); +extern void xfs_buf_delwri_promote(struct xfs_buf *); /* Buffer Daemon Setup Routines */ extern int xfs_buf_init(void); extern void xfs_buf_terminate(void); -static inline const char * -xfs_buf_target_name(struct xfs_buftarg *target) -{ - static char __b[BDEVNAME_SIZE]; - - return bdevname(target->bt_bdev, __b); -} - - #define XFS_BUF_ZEROFLAGS(bp) \ ((bp)->b_flags &= ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI| \ XBF_SYNCIO|XBF_FUA|XBF_FLUSH)) void xfs_buf_stale(struct xfs_buf *bp); -#define XFS_BUF_STALE(bp) xfs_buf_stale(bp); #define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE) #define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE) -#define XFS_BUF_SUPER_STALE(bp) do { \ - XFS_BUF_STALE(bp); \ - xfs_buf_delwri_dequeue(bp); \ - XFS_BUF_DONE(bp); \ - } while (0) - -#define XFS_BUF_DELAYWRITE(bp) ((bp)->b_flags |= XBF_DELWRI) -#define XFS_BUF_UNDELAYWRITE(bp) xfs_buf_delwri_dequeue(bp) + #define XFS_BUF_ISDELAYWRITE(bp) ((bp)->b_flags & XBF_DELWRI) #define XFS_BUF_DONE(bp) ((bp)->b_flags |= XBF_DONE) @@ -280,23 +265,16 @@ void xfs_buf_stale(struct xfs_buf *bp); #define XFS_BUF_SIZE(bp) ((bp)->b_buffer_length) #define XFS_BUF_SET_SIZE(bp, cnt) ((bp)->b_buffer_length = (cnt)) -static inline void -xfs_buf_set_ref( - struct xfs_buf *bp, - int lru_ref) +static inline void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref) { atomic_set(&bp->b_lru_ref, lru_ref); } -#define XFS_BUF_SET_VTYPE_REF(bp, type, ref) xfs_buf_set_ref(bp, ref) -#define XFS_BUF_SET_VTYPE(bp, type) do { } while (0) static inline int xfs_buf_ispinned(struct xfs_buf *bp) { return atomic_read(&bp->b_pin_count); } -#define XFS_BUF_FINISH_IOWAIT(bp) complete(&bp->b_iowait); - static inline void xfs_buf_relse(xfs_buf_t *bp) { xfs_buf_unlock(bp); @@ -313,14 +291,7 @@ extern void xfs_wait_buftarg(xfs_buftarg_t *); extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int); extern int xfs_flush_buftarg(xfs_buftarg_t *, int); -#ifdef CONFIG_KDB_MODULES -extern struct list_head *xfs_get_buftarg_list(void); -#endif - #define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev) #define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev) -#define xfs_binval(buftarg) xfs_flush_buftarg(buftarg, 1) -#define XFS_bflush(buftarg) xfs_flush_buftarg(buftarg, 1) - #endif /* __XFS_BUF_H__ */ diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index cac2ecfa674..1a3513881bc 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -629,7 +629,7 @@ xfs_buf_item_push( * the xfsbufd to get this buffer written. We have to unlock the buffer * to allow the xfsbufd to write it, too. */ -STATIC void +STATIC bool xfs_buf_item_pushbuf( struct xfs_log_item *lip) { @@ -643,6 +643,7 @@ xfs_buf_item_pushbuf( xfs_buf_delwri_promote(bp); xfs_buf_relse(bp); + return true; } STATIC void @@ -966,7 +967,8 @@ xfs_buf_iodone_callbacks( * I/O errors, there's no point in giving this a retry. */ if (XFS_FORCED_SHUTDOWN(mp)) { - XFS_BUF_SUPER_STALE(bp); + xfs_buf_stale(bp); + XFS_BUF_DONE(bp); trace_xfs_buf_item_iodone(bp, _RET_IP_); goto do_callbacks; } @@ -974,9 +976,7 @@ xfs_buf_iodone_callbacks( if (bp->b_target != lasttarg || time_after(jiffies, (lasttime + 5*HZ))) { lasttime = jiffies; - xfs_alert(mp, "Device %s: metadata write error block 0x%llx", - xfs_buf_target_name(bp->b_target), - (__uint64_t)XFS_BUF_ADDR(bp)); + xfs_buf_ioerror_alert(bp, __func__); } lasttarg = bp->b_target; @@ -992,7 +992,7 @@ xfs_buf_iodone_callbacks( xfs_buf_ioerror(bp, 0); /* errno of 0 unsets the flag */ if (!XFS_BUF_ISSTALE(bp)) { - XFS_BUF_DELAYWRITE(bp); + xfs_buf_delwri_queue(bp); XFS_BUF_DONE(bp); } ASSERT(bp->b_iodone != NULL); @@ -1005,9 +1005,8 @@ xfs_buf_iodone_callbacks( * If the write of the buffer was synchronous, we want to make * sure to return the error to the caller of xfs_bwrite(). */ - XFS_BUF_STALE(bp); + xfs_buf_stale(bp); XFS_BUF_DONE(bp); - XFS_BUF_UNDELAYWRITE(bp); trace_xfs_buf_error_relse(bp, _RET_IP_); diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index ee9d5427fcd..77c74257c2a 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -1578,9 +1578,8 @@ xfs_da_grow_inode_int( */ nmap = 1; ASSERT(args->firstblock != NULL); - error = xfs_bmapi(tp, dp, *bno, count, - xfs_bmapi_aflag(w)|XFS_BMAPI_WRITE|XFS_BMAPI_METADATA| - XFS_BMAPI_CONTIG, + error = xfs_bmapi_write(tp, dp, *bno, count, + xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA|XFS_BMAPI_CONTIG, args->firstblock, args->total, &map, &nmap, args->flist); if (error) @@ -1602,9 +1601,8 @@ xfs_da_grow_inode_int( for (b = *bno, mapi = 0; b < *bno + count; ) { nmap = MIN(XFS_BMAP_MAX_NMAP, count); c = (int)(*bno + count - b); - error = xfs_bmapi(tp, dp, b, c, - xfs_bmapi_aflag(w)|XFS_BMAPI_WRITE| - XFS_BMAPI_METADATA, + error = xfs_bmapi_write(tp, dp, b, c, + xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA, args->firstblock, args->total, &mapp[mapi], &nmap, args->flist); if (error) @@ -1975,33 +1973,16 @@ xfs_da_do_buf( /* * Optimize the one-block case. */ - if (nfsb == 1) { - xfs_fsblock_t fsb; - - if ((error = - xfs_bmapi_single(trans, dp, whichfork, &fsb, - (xfs_fileoff_t)bno))) { - return error; - } + if (nfsb == 1) mapp = ↦ - if (fsb == NULLFSBLOCK) { - nmap = 0; - } else { - map.br_startblock = fsb; - map.br_startoff = (xfs_fileoff_t)bno; - map.br_blockcount = 1; - nmap = 1; - } - } else { + else mapp = kmem_alloc(sizeof(*mapp) * nfsb, KM_SLEEP); - nmap = nfsb; - if ((error = xfs_bmapi(trans, dp, (xfs_fileoff_t)bno, - nfsb, - XFS_BMAPI_METADATA | - xfs_bmapi_aflag(whichfork), - NULL, 0, mapp, &nmap, NULL))) - goto exit0; - } + + nmap = nfsb; + error = xfs_bmapi_read(dp, (xfs_fileoff_t)bno, nfsb, mapp, + &nmap, xfs_bmapi_aflag(whichfork)); + if (error) + goto exit0; } else { map.br_startblock = XFS_DADDR_TO_FSB(mp, mappedbno); map.br_startoff = (xfs_fileoff_t)bno; @@ -2072,13 +2053,10 @@ xfs_da_do_buf( if (!bp) continue; if (caller == 1) { - if (whichfork == XFS_ATTR_FORK) { - XFS_BUF_SET_VTYPE_REF(bp, B_FS_ATTR_BTREE, - XFS_ATTR_BTREE_REF); - } else { - XFS_BUF_SET_VTYPE_REF(bp, B_FS_DIR_BTREE, - XFS_DIR_BTREE_REF); - } + if (whichfork == XFS_ATTR_FORK) + xfs_buf_set_ref(bp, XFS_ATTR_BTREE_REF); + else + xfs_buf_set_ref(bp, XFS_DIR_BTREE_REF); } if (bplist) { bplist[nbplist++] = bp; diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index 9a84a85c03b..654dc6f05ba 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -425,8 +425,8 @@ xfs_swap_extents( } - xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - xfs_trans_ijoin_ref(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + xfs_trans_ijoin(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); xfs_trans_log_inode(tp, ip, ilf_fields); xfs_trans_log_inode(tp, tip, tilf_fields); @@ -438,7 +438,7 @@ xfs_swap_extents( if (mp->m_flags & XFS_MOUNT_WSYNC) xfs_trans_set_sync(tp); - error = xfs_trans_commit(tp, XFS_TRANS_SWAPEXT); + error = xfs_trans_commit(tp, 0); trace_xfs_swap_extent_after(ip, 0); trace_xfs_swap_extent_after(tip, 1); diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index ca2386d82cd..66e108f561a 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -888,12 +888,10 @@ xfs_dir2_leaf_getdents( * we already have in the table. */ nmap = map_size - map_valid; - error = xfs_bmapi(NULL, dp, - map_off, + error = xfs_bmapi_read(dp, map_off, xfs_dir2_byte_to_da(mp, XFS_DIR2_LEAF_OFFSET) - map_off, - XFS_BMAPI_METADATA, NULL, 0, - &map[map_valid], &nmap, NULL); + &map[map_valid], &nmap, 0); /* * Don't know if we should ignore this or * try to return an error. diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index 244e797dae3..8a24f0c6c86 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -38,7 +38,7 @@ xfs_trim_extents( struct xfs_mount *mp, xfs_agnumber_t agno, xfs_fsblock_t start, - xfs_fsblock_t len, + xfs_fsblock_t end, xfs_fsblock_t minlen, __uint64_t *blocks_trimmed) { @@ -100,7 +100,7 @@ xfs_trim_extents( * down partially overlapping ranges for now. */ if (XFS_AGB_TO_FSB(mp, agno, fbno) + flen < start || - XFS_AGB_TO_FSB(mp, agno, fbno) >= start + len) { + XFS_AGB_TO_FSB(mp, agno, fbno) > end) { trace_xfs_discard_exclude(mp, agno, fbno, flen); goto next_extent; } @@ -145,7 +145,7 @@ xfs_ioc_trim( struct request_queue *q = mp->m_ddev_targp->bt_bdev->bd_disk->queue; unsigned int granularity = q->limits.discard_granularity; struct fstrim_range range; - xfs_fsblock_t start, len, minlen; + xfs_fsblock_t start, end, minlen; xfs_agnumber_t start_agno, end_agno, agno; __uint64_t blocks_trimmed = 0; int error, last_error = 0; @@ -165,19 +165,19 @@ xfs_ioc_trim( * matter as trimming blocks is an advisory interface. */ start = XFS_B_TO_FSBT(mp, range.start); - len = XFS_B_TO_FSBT(mp, range.len); + end = start + XFS_B_TO_FSBT(mp, range.len) - 1; minlen = XFS_B_TO_FSB(mp, max_t(u64, granularity, range.minlen)); - start_agno = XFS_FSB_TO_AGNO(mp, start); - if (start_agno >= mp->m_sb.sb_agcount) + if (start >= mp->m_sb.sb_dblocks) return -XFS_ERROR(EINVAL); + if (end > mp->m_sb.sb_dblocks - 1) + end = mp->m_sb.sb_dblocks - 1; - end_agno = XFS_FSB_TO_AGNO(mp, start + len); - if (end_agno >= mp->m_sb.sb_agcount) - end_agno = mp->m_sb.sb_agcount - 1; + start_agno = XFS_FSB_TO_AGNO(mp, start); + end_agno = XFS_FSB_TO_AGNO(mp, end); for (agno = start_agno; agno <= end_agno; agno++) { - error = -xfs_trim_extents(mp, agno, start, len, minlen, + error = -xfs_trim_extents(mp, agno, start, end, minlen, &blocks_trimmed); if (error) last_error = error; diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index db62959bed1..25d7280e9f6 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -377,16 +377,14 @@ xfs_qm_dqalloc( return (ESRCH); } - xfs_trans_ijoin_ref(tp, quotip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, quotip, XFS_ILOCK_EXCL); nmaps = 1; - if ((error = xfs_bmapi(tp, quotip, - offset_fsb, XFS_DQUOT_CLUSTER_SIZE_FSB, - XFS_BMAPI_METADATA | XFS_BMAPI_WRITE, - &firstblock, - XFS_QM_DQALLOC_SPACE_RES(mp), - &map, &nmaps, &flist))) { + error = xfs_bmapi_write(tp, quotip, offset_fsb, + XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA, + &firstblock, XFS_QM_DQALLOC_SPACE_RES(mp), + &map, &nmaps, &flist); + if (error) goto error0; - } ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB); ASSERT(nmaps == 1); ASSERT((map.br_startblock != DELAYSTARTBLOCK) && @@ -402,8 +400,11 @@ xfs_qm_dqalloc( dqp->q_blkno, mp->m_quotainfo->qi_dqchunklen, 0); - if (!bp || (error = xfs_buf_geterror(bp))) + + error = xfs_buf_geterror(bp); + if (error) goto error1; + /* * Make a chunk of dquots out of this buffer and log * the entire thing. @@ -485,9 +486,8 @@ xfs_qm_dqtobp( /* * Find the block map; no allocations yet */ - error = xfs_bmapi(NULL, quotip, dqp->q_fileoffset, - XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA, - NULL, 0, &map, &nmaps, NULL); + error = xfs_bmapi_read(quotip, dqp->q_fileoffset, + XFS_DQUOT_CLUSTER_SIZE_FSB, &map, &nmaps, 0); xfs_iunlock(quotip, XFS_ILOCK_SHARED); if (error) @@ -605,7 +605,7 @@ xfs_qm_dqread( dqp->q_res_rtbcount = be64_to_cpu(ddqp->d_rtbcount); /* Mark the buf so that this will stay incore a little longer */ - XFS_BUF_SET_VTYPE_REF(bp, B_FS_DQUOT, XFS_DQUOT_REF); + xfs_buf_set_ref(bp, XFS_DQUOT_REF); /* * We got the buffer with a xfs_trans_read_buf() (in dqtobp()) @@ -1242,9 +1242,11 @@ xfs_qm_dqflush( } if (flags & SYNC_WAIT) - error = xfs_bwrite(mp, bp); + error = xfs_bwrite(bp); else - xfs_bdwrite(mp, bp); + xfs_buf_delwri_queue(bp); + + xfs_buf_relse(bp); trace_xfs_dqflush_done(dqp); diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index 9e0e2fa3f2c..bb3f71d236d 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -183,13 +183,14 @@ xfs_qm_dqunpin_wait( * search the buffer cache can be a time consuming thing, and AIL lock is a * spinlock. */ -STATIC void +STATIC bool xfs_qm_dquot_logitem_pushbuf( struct xfs_log_item *lip) { struct xfs_dq_logitem *qlip = DQUOT_ITEM(lip); struct xfs_dquot *dqp = qlip->qli_dquot; struct xfs_buf *bp; + bool ret = true; ASSERT(XFS_DQ_IS_LOCKED(dqp)); @@ -201,17 +202,20 @@ xfs_qm_dquot_logitem_pushbuf( if (completion_done(&dqp->q_flush) || !(lip->li_flags & XFS_LI_IN_AIL)) { xfs_dqunlock(dqp); - return; + return true; } bp = xfs_incore(dqp->q_mount->m_ddev_targp, qlip->qli_format.qlf_blkno, dqp->q_mount->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK); xfs_dqunlock(dqp); if (!bp) - return; + return true; if (XFS_BUF_ISDELAYWRITE(bp)) xfs_buf_delwri_promote(bp); + if (xfs_buf_ispinned(bp)) + ret = false; xfs_buf_relse(bp); + return ret; } /* diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c index 75e5d322e48..da108977b21 100644 --- a/fs/xfs/xfs_export.c +++ b/fs/xfs/xfs_export.c @@ -229,16 +229,16 @@ xfs_fs_nfs_commit_metadata( { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; - int error = 0; + xfs_lsn_t lsn = 0; xfs_ilock(ip, XFS_ILOCK_SHARED); - if (xfs_ipincount(ip)) { - error = _xfs_log_force_lsn(mp, ip->i_itemp->ili_last_lsn, - XFS_LOG_SYNC, NULL); - } + if (xfs_ipincount(ip)) + lsn = ip->i_itemp->ili_last_lsn; xfs_iunlock(ip, XFS_ILOCK_SHARED); - return error; + if (!lsn) + return 0; + return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); } const struct export_operations xfs_export_operations = { diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 7f7b42469ea..753ed9b5c70 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -124,6 +124,35 @@ xfs_iozero( return (-status); } +/* + * Fsync operations on directories are much simpler than on regular files, + * as there is no file data to flush, and thus also no need for explicit + * cache flush operations, and there are no non-transaction metadata updates + * on directories either. + */ +STATIC int +xfs_dir_fsync( + struct file *file, + loff_t start, + loff_t end, + int datasync) +{ + struct xfs_inode *ip = XFS_I(file->f_mapping->host); + struct xfs_mount *mp = ip->i_mount; + xfs_lsn_t lsn = 0; + + trace_xfs_dir_fsync(ip); + + xfs_ilock(ip, XFS_ILOCK_SHARED); + if (xfs_ipincount(ip)) + lsn = ip->i_itemp->ili_last_lsn; + xfs_iunlock(ip, XFS_ILOCK_SHARED); + + if (!lsn) + return 0; + return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); +} + STATIC int xfs_file_fsync( struct file *file, @@ -137,6 +166,7 @@ xfs_file_fsync( struct xfs_trans *tp; int error = 0; int log_flushed = 0; + xfs_lsn_t lsn = 0; trace_xfs_file_fsync(ip); @@ -149,10 +179,6 @@ xfs_file_fsync( xfs_iflags_clear(ip, XFS_ITRUNCATED); - xfs_ilock(ip, XFS_IOLOCK_SHARED); - xfs_ioend_wait(ip); - xfs_iunlock(ip, XFS_IOLOCK_SHARED); - if (mp->m_flags & XFS_MOUNT_BARRIER) { /* * If we have an RT and/or log subvolume we need to make sure @@ -216,11 +242,11 @@ xfs_file_fsync( * transaction. So we play it safe and fire off the * transaction anyway. */ - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - xfs_trans_set_sync(tp); - error = _xfs_trans_commit(tp, 0, &log_flushed); + error = xfs_trans_commit(tp, 0); + lsn = ip->i_itemp->ili_last_lsn; xfs_iunlock(ip, XFS_ILOCK_EXCL); } else { /* @@ -231,14 +257,14 @@ xfs_file_fsync( * disk yet, the inode will be still be pinned. If it is, * force the log. */ - if (xfs_ipincount(ip)) { - error = _xfs_log_force_lsn(mp, - ip->i_itemp->ili_last_lsn, - XFS_LOG_SYNC, &log_flushed); - } + if (xfs_ipincount(ip)) + lsn = ip->i_itemp->ili_last_lsn; xfs_iunlock(ip, XFS_ILOCK_SHARED); } + if (!error && lsn) + error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed); + /* * If we only have a single device, and the log force about was * a no-op we might have to flush the data device cache here. @@ -317,7 +343,19 @@ xfs_file_aio_read( if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - if (unlikely(ioflags & IO_ISDIRECT)) { + /* + * Locking is a bit tricky here. If we take an exclusive lock + * for direct IO, we effectively serialise all new concurrent + * read IO to this file and block it behind IO that is currently in + * progress because IO in progress holds the IO lock shared. We only + * need to hold the lock exclusive to blow away the page cache, so + * only take lock exclusively if the page cache needs invalidation. + * This allows the normal direct IO case of no page cache pages to + * proceeed concurrently without serialisation. + */ + xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); + if ((ioflags & IO_ISDIRECT) && inode->i_mapping->nrpages) { + xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); xfs_rw_ilock(ip, XFS_IOLOCK_EXCL); if (inode->i_mapping->nrpages) { @@ -330,8 +368,7 @@ xfs_file_aio_read( } } xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); - } else - xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); + } trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags); @@ -407,11 +444,13 @@ xfs_aio_write_isize_update( */ STATIC void xfs_aio_write_newsize_update( - struct xfs_inode *ip) + struct xfs_inode *ip, + xfs_fsize_t new_size) { - if (ip->i_new_size) { + if (new_size == ip->i_new_size) { xfs_rw_ilock(ip, XFS_ILOCK_EXCL); - ip->i_new_size = 0; + if (new_size == ip->i_new_size) + ip->i_new_size = 0; if (ip->i_d.di_size > ip->i_size) ip->i_d.di_size = ip->i_size; xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); @@ -462,7 +501,7 @@ xfs_file_splice_write( ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags); xfs_aio_write_isize_update(inode, ppos, ret); - xfs_aio_write_newsize_update(ip); + xfs_aio_write_newsize_update(ip, new_size); xfs_iunlock(ip, XFS_IOLOCK_EXCL); return ret; } @@ -500,11 +539,9 @@ xfs_zero_last_block( last_fsb = XFS_B_TO_FSBT(mp, isize); nimaps = 1; - error = xfs_bmapi(NULL, ip, last_fsb, 1, 0, NULL, 0, &imap, - &nimaps, NULL); - if (error) { + error = xfs_bmapi_read(ip, last_fsb, 1, &imap, &nimaps, 0); + if (error) return error; - } ASSERT(nimaps > 0); /* * If the block underlying isize is just a hole, then there @@ -595,8 +632,8 @@ xfs_zero_eof( while (start_zero_fsb <= end_zero_fsb) { nimaps = 1; zero_count_fsb = end_zero_fsb - start_zero_fsb + 1; - error = xfs_bmapi(NULL, ip, start_zero_fsb, zero_count_fsb, - 0, NULL, 0, &imap, &nimaps, NULL); + error = xfs_bmapi_read(ip, start_zero_fsb, zero_count_fsb, + &imap, &nimaps, 0); if (error) { ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); return error; @@ -659,6 +696,7 @@ xfs_file_aio_write_checks( struct file *file, loff_t *pos, size_t *count, + xfs_fsize_t *new_sizep, int *iolock) { struct inode *inode = file->f_mapping->host; @@ -666,6 +704,9 @@ xfs_file_aio_write_checks( xfs_fsize_t new_size; int error = 0; + xfs_rw_ilock(ip, XFS_ILOCK_EXCL); + *new_sizep = 0; +restart: error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode)); if (error) { xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock); @@ -673,20 +714,41 @@ xfs_file_aio_write_checks( return error; } - new_size = *pos + *count; - if (new_size > ip->i_size) - ip->i_new_size = new_size; - if (likely(!(file->f_mode & FMODE_NOCMTIME))) file_update_time(file); /* * If the offset is beyond the size of the file, we need to zero any * blocks that fall between the existing EOF and the start of this - * write. + * write. There is no need to issue zeroing if another in-flght IO ends + * at or before this one If zeronig is needed and we are currently + * holding the iolock shared, we need to update it to exclusive which + * involves dropping all locks and relocking to maintain correct locking + * order. If we do this, restart the function to ensure all checks and + * values are still valid. */ - if (*pos > ip->i_size) + if ((ip->i_new_size && *pos > ip->i_new_size) || + (!ip->i_new_size && *pos > ip->i_size)) { + if (*iolock == XFS_IOLOCK_SHARED) { + xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock); + *iolock = XFS_IOLOCK_EXCL; + xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock); + goto restart; + } error = -xfs_zero_eof(ip, *pos, ip->i_size); + } + + /* + * If this IO extends beyond EOF, we may need to update ip->i_new_size. + * We have already zeroed space beyond EOF (if necessary). Only update + * ip->i_new_size if this IO ends beyond any other in-flight writes. + */ + new_size = *pos + *count; + if (new_size > ip->i_size) { + if (new_size > ip->i_new_size) + ip->i_new_size = new_size; + *new_sizep = new_size; + } xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); if (error) @@ -721,7 +783,7 @@ xfs_file_aio_write_checks( * the dio layer. To avoid the problem with aio, we also need to wait for * outstanding IOs to complete so that unwritten extent conversion is completed * before we try to map the overlapping block. This is currently implemented by - * hitting it with a big hammer (i.e. xfs_ioend_wait()). + * hitting it with a big hammer (i.e. inode_dio_wait()). * * Returns with locks held indicated by @iolock and errors indicated by * negative return values. @@ -733,6 +795,7 @@ xfs_file_dio_aio_write( unsigned long nr_segs, loff_t pos, size_t ocount, + xfs_fsize_t *new_size, int *iolock) { struct file *file = iocb->ki_filp; @@ -753,18 +816,35 @@ xfs_file_dio_aio_write( if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask)) unaligned_io = 1; - if (unaligned_io || mapping->nrpages || pos > ip->i_size) + /* + * We don't need to take an exclusive lock unless there page cache needs + * to be invalidated or unaligned IO is being executed. We don't need to + * consider the EOF extension case here because + * xfs_file_aio_write_checks() will relock the inode as necessary for + * EOF zeroing cases and fill out the new inode size as appropriate. + */ + if (unaligned_io || mapping->nrpages) *iolock = XFS_IOLOCK_EXCL; else *iolock = XFS_IOLOCK_SHARED; - xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock); + xfs_rw_ilock(ip, *iolock); - ret = xfs_file_aio_write_checks(file, &pos, &count, iolock); + /* + * Recheck if there are cached pages that need invalidate after we got + * the iolock to protect against other threads adding new pages while + * we were waiting for the iolock. + */ + if (mapping->nrpages && *iolock == XFS_IOLOCK_SHARED) { + xfs_rw_iunlock(ip, *iolock); + *iolock = XFS_IOLOCK_EXCL; + xfs_rw_ilock(ip, *iolock); + } + + ret = xfs_file_aio_write_checks(file, &pos, &count, new_size, iolock); if (ret) return ret; if (mapping->nrpages) { - WARN_ON(*iolock != XFS_IOLOCK_EXCL); ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1, FI_REMAPF_LOCKED); if (ret) @@ -776,7 +856,7 @@ xfs_file_dio_aio_write( * otherwise demote the lock if we had to flush cached pages */ if (unaligned_io) - xfs_ioend_wait(ip); + inode_dio_wait(inode); else if (*iolock == XFS_IOLOCK_EXCL) { xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); *iolock = XFS_IOLOCK_SHARED; @@ -798,6 +878,7 @@ xfs_file_buffered_aio_write( unsigned long nr_segs, loff_t pos, size_t ocount, + xfs_fsize_t *new_size, int *iolock) { struct file *file = iocb->ki_filp; @@ -809,9 +890,9 @@ xfs_file_buffered_aio_write( size_t count = ocount; *iolock = XFS_IOLOCK_EXCL; - xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock); + xfs_rw_ilock(ip, *iolock); - ret = xfs_file_aio_write_checks(file, &pos, &count, iolock); + ret = xfs_file_aio_write_checks(file, &pos, &count, new_size, iolock); if (ret) return ret; @@ -851,6 +932,7 @@ xfs_file_aio_write( ssize_t ret; int iolock; size_t ocount = 0; + xfs_fsize_t new_size = 0; XFS_STATS_INC(xs_write_calls); @@ -870,10 +952,10 @@ xfs_file_aio_write( if (unlikely(file->f_flags & O_DIRECT)) ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, - ocount, &iolock); + ocount, &new_size, &iolock); else ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos, - ocount, &iolock); + ocount, &new_size, &iolock); xfs_aio_write_isize_update(inode, &iocb->ki_pos, ret); @@ -894,7 +976,7 @@ xfs_file_aio_write( } out_unlock: - xfs_aio_write_newsize_update(ip); + xfs_aio_write_newsize_update(ip, new_size); xfs_rw_iunlock(ip, iolock); return ret; } @@ -1087,7 +1169,7 @@ const struct file_operations xfs_dir_file_operations = { #ifdef CONFIG_COMPAT .compat_ioctl = xfs_file_compat_ioctl, #endif - .fsync = xfs_file_fsync, + .fsync = xfs_dir_fsync, }; static const struct vm_operations_struct xfs_file_vm_ops = { diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index 3ff3d9e23de..5170306a100 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -682,7 +682,7 @@ xfs_filestream_new_ag( ip = ap->ip; mp = ip->i_mount; cache = mp->m_filestream; - minlen = ap->alen; + minlen = ap->length; *agp = NULLAGNUMBER; /* @@ -761,7 +761,7 @@ xfs_filestream_new_ag( */ ag = (ag == NULLAGNUMBER) ? 0 : (ag + 1) % mp->m_sb.sb_agcount; flags = (ap->userdata ? XFS_PICK_USERDATA : 0) | - (ap->low ? XFS_PICK_LOWSPACE : 0); + (ap->flist->xbf_low ? XFS_PICK_LOWSPACE : 0); err = _xfs_filestream_pick_ag(mp, ag, agp, flags, minlen); if (err || *agp == NULLAGNUMBER) diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 9153d2c77ca..1c6fdeb702f 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -194,6 +194,10 @@ xfs_growfs_data_private( bp = xfs_buf_get(mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), XFS_FSS_TO_BB(mp, 1), XBF_LOCK | XBF_MAPPED); + if (!bp) { + error = ENOMEM; + goto error0; + } agf = XFS_BUF_TO_AGF(bp); memset(agf, 0, mp->m_sb.sb_sectsize); agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC); @@ -216,16 +220,21 @@ xfs_growfs_data_private( tmpsize = agsize - XFS_PREALLOC_BLOCKS(mp); agf->agf_freeblks = cpu_to_be32(tmpsize); agf->agf_longest = cpu_to_be32(tmpsize); - error = xfs_bwrite(mp, bp); - if (error) { + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + if (error) goto error0; - } + /* * AG inode header block */ bp = xfs_buf_get(mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), XFS_FSS_TO_BB(mp, 1), XBF_LOCK | XBF_MAPPED); + if (!bp) { + error = ENOMEM; + goto error0; + } agi = XFS_BUF_TO_AGI(bp); memset(agi, 0, mp->m_sb.sb_sectsize); agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC); @@ -240,10 +249,11 @@ xfs_growfs_data_private( agi->agi_dirino = cpu_to_be32(NULLAGINO); for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO); - error = xfs_bwrite(mp, bp); - if (error) { + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + if (error) goto error0; - } + /* * BNO btree root block */ @@ -251,6 +261,10 @@ xfs_growfs_data_private( XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)), BTOBB(mp->m_sb.sb_blocksize), XBF_LOCK | XBF_MAPPED); + if (!bp) { + error = ENOMEM; + goto error0; + } block = XFS_BUF_TO_BLOCK(bp); memset(block, 0, mp->m_sb.sb_blocksize); block->bb_magic = cpu_to_be32(XFS_ABTB_MAGIC); @@ -262,10 +276,11 @@ xfs_growfs_data_private( arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp)); arec->ar_blockcount = cpu_to_be32( agsize - be32_to_cpu(arec->ar_startblock)); - error = xfs_bwrite(mp, bp); - if (error) { + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + if (error) goto error0; - } + /* * CNT btree root block */ @@ -273,6 +288,10 @@ xfs_growfs_data_private( XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)), BTOBB(mp->m_sb.sb_blocksize), XBF_LOCK | XBF_MAPPED); + if (!bp) { + error = ENOMEM; + goto error0; + } block = XFS_BUF_TO_BLOCK(bp); memset(block, 0, mp->m_sb.sb_blocksize); block->bb_magic = cpu_to_be32(XFS_ABTC_MAGIC); @@ -285,10 +304,11 @@ xfs_growfs_data_private( arec->ar_blockcount = cpu_to_be32( agsize - be32_to_cpu(arec->ar_startblock)); nfree += be32_to_cpu(arec->ar_blockcount); - error = xfs_bwrite(mp, bp); - if (error) { + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + if (error) goto error0; - } + /* * INO btree root block */ @@ -296,6 +316,10 @@ xfs_growfs_data_private( XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)), BTOBB(mp->m_sb.sb_blocksize), XBF_LOCK | XBF_MAPPED); + if (!bp) { + error = ENOMEM; + goto error0; + } block = XFS_BUF_TO_BLOCK(bp); memset(block, 0, mp->m_sb.sb_blocksize); block->bb_magic = cpu_to_be32(XFS_IBT_MAGIC); @@ -303,10 +327,10 @@ xfs_growfs_data_private( block->bb_numrecs = 0; block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK); block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK); - error = xfs_bwrite(mp, bp); - if (error) { + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + if (error) goto error0; - } } xfs_trans_agblocks_delta(tp, nfree); /* @@ -396,9 +420,9 @@ xfs_growfs_data_private( * just issue a warning and continue. The real work is * already done and committed. */ - if (!(error = xfs_bwrite(mp, bp))) { - continue; - } else { + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + if (error) { xfs_warn(mp, "write error %d updating secondary superblock for ag %d", error, agno); diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 9f24ec28283..169380e6605 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -150,7 +150,7 @@ xfs_check_agi_freecount( /* * Initialise a new set of inodes. */ -STATIC void +STATIC int xfs_ialloc_inode_init( struct xfs_mount *mp, struct xfs_trans *tp, @@ -202,8 +202,8 @@ xfs_ialloc_inode_init( fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize * blks_per_cluster, XBF_LOCK); - ASSERT(!xfs_buf_geterror(fbuf)); - + if (!fbuf) + return ENOMEM; /* * Initialize all inodes in this buffer and then log them. * @@ -225,6 +225,7 @@ xfs_ialloc_inode_init( } xfs_trans_inode_alloc_buf(tp, fbuf); } + return 0; } /* @@ -369,9 +370,11 @@ xfs_ialloc_ag_alloc( * rather than a linear progression to prevent the next generation * number from being easily guessable. */ - xfs_ialloc_inode_init(args.mp, tp, agno, args.agbno, args.len, - random32()); + error = xfs_ialloc_inode_init(args.mp, tp, agno, args.agbno, + args.len, random32()); + if (error) + return error; /* * Convert the results. */ @@ -1502,7 +1505,7 @@ xfs_read_agi( return XFS_ERROR(EFSCORRUPTED); } - XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_AGI, XFS_AGI_REF); + xfs_buf_set_ref(*bpp, XFS_AGI_REF); xfs_check_agi_unlinked(agi); return 0; diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index 7759812c1bb..0fa98b1c70e 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -75,7 +75,6 @@ xfs_inode_alloc( return NULL; } - ASSERT(atomic_read(&ip->i_iocount) == 0); ASSERT(atomic_read(&ip->i_pincount) == 0); ASSERT(!spin_is_locked(&ip->i_flags_lock)); ASSERT(completion_done(&ip->i_flush)); @@ -150,7 +149,6 @@ xfs_inode_free( } /* asserts to verify all state is correct here */ - ASSERT(atomic_read(&ip->i_iocount) == 0); ASSERT(atomic_read(&ip->i_pincount) == 0); ASSERT(!spin_is_locked(&ip->i_flags_lock)); ASSERT(completion_done(&ip->i_flush)); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 0239a7c7c88..c0237c602f1 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -190,12 +190,6 @@ xfs_imap_to_bp( } xfs_inobp_check(mp, bp); - - /* - * Mark the buffer as an inode buffer now that it looks good - */ - XFS_BUF_SET_VTYPE(bp, B_FS_INO); - *bpp = bp; return 0; } @@ -1152,7 +1146,7 @@ xfs_ialloc( /* * Log the new values stuffed into the inode. */ - xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_trans_log_inode(tp, ip, flags); /* now that we have an i_mode we can setup inode ops and unlock */ @@ -1187,6 +1181,7 @@ xfs_isize_check( xfs_fileoff_t map_first; int nimaps; xfs_bmbt_irec_t imaps[2]; + int error; if (!S_ISREG(ip->i_d.di_mode)) return; @@ -1203,13 +1198,12 @@ xfs_isize_check( * The filesystem could be shutting down, so bmapi may return * an error. */ - if (xfs_bmapi(NULL, ip, map_first, + error = xfs_bmapi_read(ip, map_first, (XFS_B_TO_FSB(mp, - (xfs_ufsize_t)XFS_MAXIOFFSET(mp)) - - map_first), - XFS_BMAPI_ENTIRE, NULL, 0, imaps, &nimaps, - NULL)) - return; + (xfs_ufsize_t)XFS_MAXIOFFSET(mp)) - map_first), + imaps, &nimaps, XFS_BMAPI_ENTIRE); + if (error) + return; ASSERT(nimaps == 1); ASSERT(imaps[0].br_startblock == HOLESTARTBLOCK); } @@ -1297,7 +1291,7 @@ xfs_itruncate_extents( */ error = xfs_bmap_finish(&tp, &free_list, &committed); if (committed) - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); if (error) goto out_bmap_cancel; @@ -1313,7 +1307,7 @@ xfs_itruncate_extents( error = xfs_trans_commit(tp, 0); tp = ntp; - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); if (error) goto out; @@ -1644,7 +1638,7 @@ xfs_iunlink_remove( * inodes that are in memory - they all must be marked stale and attached to * the cluster buffer. */ -STATIC void +STATIC int xfs_ifree_cluster( xfs_inode_t *free_ip, xfs_trans_t *tp, @@ -1690,6 +1684,8 @@ xfs_ifree_cluster( mp->m_bsize * blks_per_cluster, XBF_LOCK); + if (!bp) + return ENOMEM; /* * Walk the inodes already attached to the buffer and mark them * stale. These will all have the flush locks held, so an @@ -1799,6 +1795,7 @@ retry: } xfs_perag_put(pag); + return 0; } /* @@ -1878,10 +1875,10 @@ xfs_ifree( dip->di_mode = 0; if (delete) { - xfs_ifree_cluster(ip, tp, first_ino); + error = xfs_ifree_cluster(ip, tp, first_ino); } - return 0; + return error; } /* @@ -2472,11 +2469,11 @@ cluster_corrupt_out: */ if (bp->b_iodone) { XFS_BUF_UNDONE(bp); - XFS_BUF_STALE(bp); + xfs_buf_stale(bp); xfs_buf_ioerror(bp, EIO); xfs_buf_ioend(bp, 0); } else { - XFS_BUF_STALE(bp); + xfs_buf_stale(bp); xfs_buf_relse(bp); } } @@ -2597,9 +2594,11 @@ xfs_iflush( goto cluster_corrupt_out; if (flags & SYNC_WAIT) - error = xfs_bwrite(mp, bp); + error = xfs_bwrite(bp); else - xfs_bdwrite(mp, bp); + xfs_buf_delwri_queue(bp); + + xfs_buf_relse(bp); return error; corrupt_out: diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 2380a4bcbec..760140d1dd6 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -257,7 +257,6 @@ typedef struct xfs_inode { xfs_fsize_t i_size; /* in-memory size */ xfs_fsize_t i_new_size; /* size when write completes */ - atomic_t i_iocount; /* outstanding I/O count */ /* VFS inode */ struct inode i_vnode; /* embedded VFS inode */ diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 588406dc6a3..b7cf21ba240 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -658,10 +658,8 @@ xfs_inode_item_unlock( lock_flags = iip->ili_lock_flags; iip->ili_lock_flags = 0; - if (lock_flags) { + if (lock_flags) xfs_iunlock(ip, lock_flags); - IRELE(ip); - } } /* @@ -708,13 +706,14 @@ xfs_inode_item_committed( * marked delayed write. If that's the case, we'll promote it and that will * allow the caller to write the buffer by triggering the xfsbufd to run. */ -STATIC void +STATIC bool xfs_inode_item_pushbuf( struct xfs_log_item *lip) { struct xfs_inode_log_item *iip = INODE_ITEM(lip); struct xfs_inode *ip = iip->ili_inode; struct xfs_buf *bp; + bool ret = true; ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED)); @@ -725,7 +724,7 @@ xfs_inode_item_pushbuf( if (completion_done(&ip->i_flush) || !(lip->li_flags & XFS_LI_IN_AIL)) { xfs_iunlock(ip, XFS_ILOCK_SHARED); - return; + return true; } bp = xfs_incore(ip->i_mount->m_ddev_targp, iip->ili_format.ilf_blkno, @@ -733,10 +732,13 @@ xfs_inode_item_pushbuf( xfs_iunlock(ip, XFS_ILOCK_SHARED); if (!bp) - return; + return true; if (XFS_BUF_ISDELAYWRITE(bp)) xfs_buf_delwri_promote(bp); + if (xfs_buf_ispinned(bp)) + ret = false; xfs_buf_relse(bp); + return ret; } /* diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index f7ce7debe14..d99a9051890 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1069,7 +1069,7 @@ xfs_ioctl_setattr( } } - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); /* * Change file ownership. Must be the owner or privileged. diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 091d82b94c4..9afa282aa93 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -208,22 +208,20 @@ xfs_iomap_write_direct( if (error) goto error1; - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); - bmapi_flag = XFS_BMAPI_WRITE; + bmapi_flag = 0; if (offset < ip->i_size || extsz) bmapi_flag |= XFS_BMAPI_PREALLOC; /* - * Issue the xfs_bmapi() call to allocate the blocks. - * * From this point onwards we overwrite the imap pointer that the * caller gave to us. */ xfs_bmap_init(&free_list, &firstfsb); nimaps = 1; - error = xfs_bmapi(tp, ip, offset_fsb, count_fsb, bmapi_flag, - &firstfsb, 0, imap, &nimaps, &free_list); + error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, bmapi_flag, + &firstfsb, 0, imap, &nimaps, &free_list); if (error) goto error0; @@ -300,8 +298,8 @@ xfs_iomap_eof_want_preallocate( while (count_fsb > 0) { imaps = nimaps; firstblock = NULLFSBLOCK; - error = xfs_bmapi(NULL, ip, start_fsb, count_fsb, 0, - &firstblock, 0, imap, &imaps, NULL); + error = xfs_bmapi_read(ip, start_fsb, count_fsb, imap, &imaps, + 0); if (error) return error; for (n = 0; n < imaps; n++) { @@ -381,7 +379,6 @@ xfs_iomap_write_delay( xfs_fileoff_t last_fsb; xfs_off_t aligned_offset; xfs_fileoff_t ioalign; - xfs_fsblock_t firstblock; xfs_extlen_t extsz; int nimaps; xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS]; @@ -425,12 +422,8 @@ retry: } nimaps = XFS_WRITE_IMAPS; - firstblock = NULLFSBLOCK; - error = xfs_bmapi(NULL, ip, offset_fsb, - (xfs_filblks_t)(last_fsb - offset_fsb), - XFS_BMAPI_DELAY | XFS_BMAPI_WRITE | - XFS_BMAPI_ENTIRE, &firstblock, 1, imap, - &nimaps, NULL); + error = xfs_bmapi_delay(ip, offset_fsb, last_fsb - offset_fsb, + imap, &nimaps, XFS_BMAPI_ENTIRE); switch (error) { case 0: case ENOSPC: @@ -535,7 +528,7 @@ xfs_iomap_write_allocate( return XFS_ERROR(error); } xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); xfs_bmap_init(&free_list, &first_block); @@ -587,14 +580,12 @@ xfs_iomap_write_allocate( } /* - * Go get the actual blocks. - * * From this point onwards we overwrite the imap * pointer that the caller gave to us. */ - error = xfs_bmapi(tp, ip, map_start_fsb, count_fsb, - XFS_BMAPI_WRITE, &first_block, 1, - imap, &nimaps, &free_list); + error = xfs_bmapi_write(tp, ip, map_start_fsb, + count_fsb, 0, &first_block, 1, + imap, &nimaps, &free_list); if (error) goto trans_cancel; @@ -701,15 +692,15 @@ xfs_iomap_write_unwritten( } xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); /* * Modify the unwritten extent state of the buffer. */ xfs_bmap_init(&free_list, &firstfsb); nimaps = 1; - error = xfs_bmapi(tp, ip, offset_fsb, count_fsb, - XFS_BMAPI_WRITE|XFS_BMAPI_CONVERT, &firstfsb, + error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, + XFS_BMAPI_CONVERT, &firstfsb, 1, &imap, &nimaps, &free_list); if (error) goto error_on_bmapi_transaction; diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 673704fab74..9ba2a07b734 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -102,37 +102,38 @@ xfs_mark_inode_dirty( } + +int xfs_initxattrs(struct inode *inode, const struct xattr *xattr_array, + void *fs_info) +{ + const struct xattr *xattr; + struct xfs_inode *ip = XFS_I(inode); + int error = 0; + + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + error = xfs_attr_set(ip, xattr->name, xattr->value, + xattr->value_len, ATTR_SECURE); + if (error < 0) + break; + } + return error; +} + /* * Hook in SELinux. This is not quite correct yet, what we really need * here (as we do for default ACLs) is a mechanism by which creation of * these attrs can be journalled at inode creation time (along with the * inode, of course, such that log replay can't cause these to be lost). */ + STATIC int xfs_init_security( struct inode *inode, struct inode *dir, const struct qstr *qstr) { - struct xfs_inode *ip = XFS_I(inode); - size_t length; - void *value; - unsigned char *name; - int error; - - error = security_inode_init_security(inode, dir, qstr, (char **)&name, - &value, &length); - if (error) { - if (error == -EOPNOTSUPP) - return 0; - return -error; - } - - error = xfs_attr_set(ip, name, value, length, ATTR_SECURE); - - kfree(name); - kfree(value); - return error; + return security_inode_init_security(inode, dir, qstr, + &xfs_initxattrs, NULL); } static void @@ -465,7 +466,7 @@ xfs_vn_getattr( trace_xfs_getattr(ip); if (XFS_FORCED_SHUTDOWN(mp)) - return XFS_ERROR(EIO); + return -XFS_ERROR(EIO); stat->size = XFS_ISIZE(ip); stat->dev = inode->i_sb->s_dev; @@ -611,7 +612,7 @@ xfs_setattr_nonsize( } } - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); /* * Change file ownership. Must be the owner or privileged. @@ -833,16 +834,16 @@ xfs_setattr_size( * care about here. */ if (ip->i_size != ip->i_d.di_size && iattr->ia_size > ip->i_d.di_size) { - error = xfs_flush_pages(ip, ip->i_d.di_size, iattr->ia_size, - XBF_ASYNC, FI_NONE); + error = xfs_flush_pages(ip, ip->i_d.di_size, iattr->ia_size, 0, + FI_NONE); if (error) goto out_unlock; } /* - * Wait for all I/O to complete. + * Wait for all direct I/O to complete. */ - xfs_ioend_wait(ip); + inode_dio_wait(inode); error = -block_truncate_page(inode->i_mapping, iattr->ia_size, xfs_get_blocks); @@ -863,7 +864,7 @@ xfs_setattr_size( xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); /* * Only change the c/mtime if we are changing the size or we are diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 1e8a45e74c3..828662f70d6 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -68,6 +68,8 @@ #include <linux/ctype.h> #include <linux/writeback.h> #include <linux/capability.h> +#include <linux/kthread.h> +#include <linux/freezer.h> #include <linux/list_sort.h> #include <asm/page.h> diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 3a8d4f66d70..2758a6277c5 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -880,8 +880,8 @@ xlog_iodone(xfs_buf_t *bp) */ if (XFS_TEST_ERROR((xfs_buf_geterror(bp)), l->l_mp, XFS_ERRTAG_IODONE_IOERR, XFS_RANDOM_IODONE_IOERR)) { - xfs_ioerror_alert("xlog_iodone", l->l_mp, bp, XFS_BUF_ADDR(bp)); - XFS_BUF_STALE(bp); + xfs_buf_ioerror_alert(bp, __func__); + xfs_buf_stale(bp); xfs_force_shutdown(l->l_mp, SHUTDOWN_LOG_IO_ERROR); /* * This flag will be propagated to the trans-committed @@ -1047,7 +1047,7 @@ xlog_alloc_log(xfs_mount_t *mp, xlog_get_iclog_buffer_size(mp, log); error = ENOMEM; - bp = xfs_buf_get_empty(log->l_iclog_size, mp->m_logdev_targp); + bp = xfs_buf_alloc(mp->m_logdev_targp, 0, log->l_iclog_size, 0); if (!bp) goto out_free_log; bp->b_iodone = xlog_iodone; @@ -1247,7 +1247,7 @@ xlog_bdstrat( if (iclog->ic_state & XLOG_STATE_IOERROR) { xfs_buf_ioerror(bp, EIO); - XFS_BUF_STALE(bp); + xfs_buf_stale(bp); xfs_buf_ioend(bp, 0); /* * It would seem logical to return EIO here, but we rely on @@ -1387,9 +1387,9 @@ xlog_sync(xlog_t *log, */ XFS_BUF_WRITE(bp); - if ((error = xlog_bdstrat(bp))) { - xfs_ioerror_alert("xlog_sync", log->l_mp, bp, - XFS_BUF_ADDR(bp)); + error = xlog_bdstrat(bp); + if (error) { + xfs_buf_ioerror_alert(bp, "xlog_sync"); return error; } if (split) { @@ -1423,9 +1423,9 @@ xlog_sync(xlog_t *log, /* account for internal log which doesn't start at block #0 */ XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart); XFS_BUF_WRITE(bp); - if ((error = xlog_bdstrat(bp))) { - xfs_ioerror_alert("xlog_sync (split)", log->l_mp, - bp, XFS_BUF_ADDR(bp)); + error = xlog_bdstrat(bp); + if (error) { + xfs_buf_ioerror_alert(bp, "xlog_sync (split)"); return error; } } diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index a199dbcee7d..541a508adea 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -183,8 +183,7 @@ xlog_bread_noalign( xfsbdstrat(log->l_mp, bp); error = xfs_buf_iowait(bp); if (error) - xfs_ioerror_alert("xlog_bread", log->l_mp, - bp, XFS_BUF_ADDR(bp)); + xfs_buf_ioerror_alert(bp, __func__); return error; } @@ -268,9 +267,10 @@ xlog_bwrite( xfs_buf_lock(bp); XFS_BUF_SET_COUNT(bp, BBTOB(nbblks)); - if ((error = xfs_bwrite(log->l_mp, bp))) - xfs_ioerror_alert("xlog_bwrite", log->l_mp, - bp, XFS_BUF_ADDR(bp)); + error = xfs_bwrite(bp); + if (error) + xfs_buf_ioerror_alert(bp, __func__); + xfs_buf_relse(bp); return error; } @@ -361,9 +361,7 @@ xlog_recover_iodone( * We're not going to bother about retrying * this during recovery. One strike! */ - xfs_ioerror_alert("xlog_recover_iodone", - bp->b_target->bt_mount, bp, - XFS_BUF_ADDR(bp)); + xfs_buf_ioerror_alert(bp, __func__); xfs_force_shutdown(bp->b_target->bt_mount, SHUTDOWN_META_IO_ERROR); } @@ -2135,8 +2133,7 @@ xlog_recover_buffer_pass2( return XFS_ERROR(ENOMEM); error = bp->b_error; if (error) { - xfs_ioerror_alert("xlog_recover_do..(read#1)", mp, - bp, buf_f->blf_blkno); + xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#1)"); xfs_buf_relse(bp); return error; } @@ -2171,15 +2168,16 @@ xlog_recover_buffer_pass2( be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) && (XFS_BUF_COUNT(bp) != MAX(log->l_mp->m_sb.sb_blocksize, (__uint32_t)XFS_INODE_CLUSTER_SIZE(log->l_mp)))) { - XFS_BUF_STALE(bp); - error = xfs_bwrite(mp, bp); + xfs_buf_stale(bp); + error = xfs_bwrite(bp); } else { ASSERT(bp->b_target->bt_mount == mp); bp->b_iodone = xlog_recover_iodone; - xfs_bdwrite(mp, bp); + xfs_buf_delwri_queue(bp); } - return (error); + xfs_buf_relse(bp); + return error; } STATIC int @@ -2230,8 +2228,7 @@ xlog_recover_inode_pass2( } error = bp->b_error; if (error) { - xfs_ioerror_alert("xlog_recover_do..(read#2)", mp, - bp, in_f->ilf_blkno); + xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#2)"); xfs_buf_relse(bp); goto error; } @@ -2439,7 +2436,8 @@ xlog_recover_inode_pass2( write_inode_buffer: ASSERT(bp->b_target->bt_mount == mp); bp->b_iodone = xlog_recover_iodone; - xfs_bdwrite(mp, bp); + xfs_buf_delwri_queue(bp); + xfs_buf_relse(bp); error: if (need_free) kmem_free(in_f); @@ -2537,8 +2535,7 @@ xlog_recover_dquot_pass2( XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp); if (error) { - xfs_ioerror_alert("xlog_recover_do..(read#3)", mp, - bp, dq_f->qlf_blkno); + xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#3)"); return error; } ASSERT(bp); @@ -2561,7 +2558,8 @@ xlog_recover_dquot_pass2( ASSERT(dq_f->qlf_size == 2); ASSERT(bp->b_target->bt_mount == mp); bp->b_iodone = xlog_recover_iodone; - xfs_bdwrite(mp, bp); + xfs_buf_delwri_queue(bp); + xfs_buf_relse(bp); return (0); } @@ -3656,7 +3654,7 @@ xlog_do_recover( return error; } - XFS_bflush(log->l_mp->m_ddev_targp); + xfs_flush_buftarg(log->l_mp->m_ddev_targp, 1); /* * If IO errors happened during recovery, bail out. @@ -3689,8 +3687,7 @@ xlog_do_recover( xfsbdstrat(log->l_mp, bp); error = xfs_buf_iowait(bp); if (error) { - xfs_ioerror_alert("xlog_do_recover", - log->l_mp, bp, XFS_BUF_ADDR(bp)); + xfs_buf_ioerror_alert(bp, __func__); ASSERT(0); xfs_buf_relse(bp); return error; diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 0081657ad98..d06afbc3540 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -44,9 +44,6 @@ #include "xfs_trace.h" -STATIC void xfs_unmountfs_wait(xfs_mount_t *); - - #ifdef HAVE_PERCPU_SB STATIC void xfs_icsb_balance_counter(xfs_mount_t *, xfs_sb_field_t, int); @@ -1484,7 +1481,7 @@ xfs_unmountfs( * state as much as possible. */ xfs_reclaim_inodes(mp, 0); - XFS_bflush(mp->m_ddev_targp); + xfs_flush_buftarg(mp->m_ddev_targp, 1); xfs_reclaim_inodes(mp, SYNC_WAIT); xfs_qm_unmount(mp); @@ -1496,11 +1493,6 @@ xfs_unmountfs( */ xfs_log_force(mp, XFS_LOG_SYNC); - xfs_binval(mp->m_ddev_targp); - if (mp->m_rtdev_targp) { - xfs_binval(mp->m_rtdev_targp); - } - /* * Unreserve any blocks we have so that when we unmount we don't account * the reserved free space as used. This is really only necessary for @@ -1526,7 +1518,16 @@ xfs_unmountfs( xfs_warn(mp, "Unable to update superblock counters. " "Freespace may not be correct on next mount."); xfs_unmountfs_writesb(mp); - xfs_unmountfs_wait(mp); /* wait for async bufs */ + + /* + * Make sure all buffers have been flushed and completed before + * unmounting the log. + */ + error = xfs_flush_buftarg(mp->m_ddev_targp, 1); + if (error) + xfs_warn(mp, "%d busy buffers during unmount.", error); + xfs_wait_buftarg(mp->m_ddev_targp); + xfs_log_unmount_write(mp); xfs_log_unmount(mp); xfs_uuid_unmount(mp); @@ -1537,16 +1538,6 @@ xfs_unmountfs( xfs_free_perag(mp); } -STATIC void -xfs_unmountfs_wait(xfs_mount_t *mp) -{ - if (mp->m_logdev_targp != mp->m_ddev_targp) - xfs_wait_buftarg(mp->m_logdev_targp); - if (mp->m_rtdev_targp) - xfs_wait_buftarg(mp->m_rtdev_targp); - xfs_wait_buftarg(mp->m_ddev_targp); -} - int xfs_fs_writable(xfs_mount_t *mp) { @@ -1612,15 +1603,14 @@ xfs_unmountfs_writesb(xfs_mount_t *mp) XFS_BUF_UNDONE(sbp); XFS_BUF_UNREAD(sbp); - XFS_BUF_UNDELAYWRITE(sbp); + xfs_buf_delwri_dequeue(sbp); XFS_BUF_WRITE(sbp); XFS_BUF_UNASYNC(sbp); ASSERT(sbp->b_target == mp->m_ddev_targp); xfsbdstrat(mp, sbp); error = xfs_buf_iowait(sbp); if (error) - xfs_ioerror_alert("xfs_unmountfs_writesb", - mp, sbp, XFS_BUF_ADDR(sbp)); + xfs_buf_ioerror_alert(sbp, __func__); xfs_buf_relse(sbp); } return error; diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 9a0aa76facd..5cff443f6cd 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -1296,7 +1296,8 @@ xfs_qm_dqiter_bufs( break; xfs_qm_reset_dqcounts(mp, bp, firstid, type); - xfs_bdwrite(mp, bp); + xfs_buf_delwri_queue(bp); + xfs_buf_relse(bp); /* * goto the next block. */ @@ -1346,11 +1347,8 @@ xfs_qm_dqiterate( * the inode is never added to the transaction. */ xfs_ilock(qip, XFS_ILOCK_SHARED); - error = xfs_bmapi(NULL, qip, lblkno, - maxlblkcnt - lblkno, - XFS_BMAPI_METADATA, - NULL, - 0, map, &nmaps, NULL); + error = xfs_bmapi_read(qip, lblkno, maxlblkcnt - lblkno, + map, &nmaps, 0); xfs_iunlock(qip, XFS_ILOCK_SHARED); if (error) break; @@ -1683,7 +1681,7 @@ xfs_qm_quotacheck( * quotacheck'd stamp on the superblock. So, here we do a synchronous * flush. */ - XFS_bflush(mp->m_ddev_targp); + xfs_flush_buftarg(mp->m_ddev_targp, 1); /* * If one type of quotas is off, then it will lose its diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 609246f42e6..5cc3dde1bc9 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -261,7 +261,7 @@ xfs_qm_scall_trunc_qfile( } xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); error = xfs_itruncate_data(&tp, ip, 0); if (error) { diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c index df78c297d1a..866de277079 100644 --- a/fs/xfs/xfs_rename.c +++ b/fs/xfs/xfs_rename.c @@ -170,12 +170,12 @@ xfs_rename( * we can rely on either trans_commit or trans_cancel to unlock * them. */ - xfs_trans_ijoin_ref(tp, src_dp, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL); if (new_parent) - xfs_trans_ijoin_ref(tp, target_dp, XFS_ILOCK_EXCL); - xfs_trans_ijoin_ref(tp, src_ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL); if (target_ip) - xfs_trans_ijoin_ref(tp, target_ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL); /* * If we are using project inheritance, we only allow renames diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 35561a511b5..87323f1ded6 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -112,7 +112,7 @@ xfs_growfs_rt_alloc( * Lock the inode. */ xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_bmap_init(&flist, &firstblock); /* @@ -120,9 +120,9 @@ xfs_growfs_rt_alloc( */ nmap = 1; cancelflags |= XFS_TRANS_ABORT; - error = xfs_bmapi(tp, ip, oblocks, nblocks - oblocks, - XFS_BMAPI_WRITE | XFS_BMAPI_METADATA, &firstblock, - resblks, &map, &nmap, &flist); + error = xfs_bmapi_write(tp, ip, oblocks, nblocks - oblocks, + XFS_BMAPI_METADATA, &firstblock, + resblks, &map, &nmap, &flist); if (!error && nmap < 1) error = XFS_ERROR(ENOSPC); if (error) @@ -155,7 +155,7 @@ xfs_growfs_rt_alloc( * Lock the bitmap inode. */ xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); /* * Get a buffer for the block. */ @@ -856,33 +856,23 @@ xfs_rtbuf_get( xfs_buf_t **bpp) /* output: buffer for the block */ { xfs_buf_t *bp; /* block buffer, result */ - xfs_daddr_t d; /* disk addr of block */ - int error; /* error value */ - xfs_fsblock_t fsb; /* fs block number for block */ xfs_inode_t *ip; /* bitmap or summary inode */ + xfs_bmbt_irec_t map; + int nmap; + int error; /* error value */ ip = issum ? mp->m_rsumip : mp->m_rbmip; - /* - * Map from the file offset (block) and inode number to the - * file system block. - */ - error = xfs_bmapi_single(tp, ip, XFS_DATA_FORK, &fsb, block); - if (error) { + + error = xfs_bmapi_read(ip, block, 1, &map, &nmap, XFS_DATA_FORK); + if (error) return error; - } - ASSERT(fsb != NULLFSBLOCK); - /* - * Convert to disk address for buffer cache. - */ - d = XFS_FSB_TO_DADDR(mp, fsb); - /* - * Read the buffer. - */ - error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d, + + ASSERT(map.br_startblock != NULLFSBLOCK); + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, map.br_startblock), mp->m_bsize, 0, &bp); - if (error) { + if (error) return error; - } ASSERT(!xfs_buf_geterror(bp)); *bpp = bp; return 0; @@ -1970,7 +1960,7 @@ xfs_growfs_rt( * Lock out other callers by grabbing the bitmap inode lock. */ xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL); - xfs_trans_ijoin_ref(tp, mp->m_rbmip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL); /* * Update the bitmap inode's size. */ @@ -1982,7 +1972,7 @@ xfs_growfs_rt( * Get the summary inode into the transaction. */ xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL); - xfs_trans_ijoin_ref(tp, mp->m_rsumip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, mp->m_rsumip, XFS_ILOCK_EXCL); /* * Update the summary inode's size. */ @@ -2153,7 +2143,7 @@ xfs_rtfree_extent( * Synchronize by locking the bitmap inode. */ xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL); - xfs_trans_ijoin_ref(tp, mp->m_rbmip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL); #if defined(__KERNEL__) && defined(DEBUG) /* diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c index c96a8a05ac0..597d044a09a 100644 --- a/fs/xfs/xfs_rw.c +++ b/fs/xfs/xfs_rw.c @@ -92,24 +92,6 @@ xfs_do_force_shutdown( } /* - * Prints out an ALERT message about I/O error. - */ -void -xfs_ioerror_alert( - char *func, - struct xfs_mount *mp, - xfs_buf_t *bp, - xfs_daddr_t blkno) -{ - xfs_alert(mp, - "I/O error occurred: meta-data dev %s block 0x%llx" - " (\"%s\") error %d buf count %zd", - xfs_buf_target_name(bp->b_target), - (__uint64_t)blkno, func, - bp->b_error, XFS_BUF_COUNT(bp)); -} - -/* * This isn't an absolute requirement, but it is * just a good idea to call xfs_read_buf instead of * directly doing a read_buf call. For one, we shouldn't @@ -143,14 +125,13 @@ xfs_read_buf( } else { *bpp = NULL; if (error) { - xfs_ioerror_alert("xfs_read_buf", mp, bp, XFS_BUF_ADDR(bp)); + xfs_buf_ioerror_alert(bp, __func__); } else { error = XFS_ERROR(EIO); } if (bp) { XFS_BUF_UNDONE(bp); - XFS_BUF_UNDELAYWRITE(bp); - XFS_BUF_STALE(bp); + xfs_buf_stale(bp); /* * brelse clears B_ERROR and b_error */ diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h index 11c41ec6ed7..bbdb9ad6a4b 100644 --- a/fs/xfs/xfs_rw.h +++ b/fs/xfs/xfs_rw.h @@ -42,8 +42,6 @@ xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb) extern int xfs_read_buf(struct xfs_mount *mp, xfs_buftarg_t *btp, xfs_daddr_t blkno, int len, uint flags, struct xfs_buf **bpp); -extern void xfs_ioerror_alert(char *func, struct xfs_mount *mp, - xfs_buf_t *bp, xfs_daddr_t blkno); extern xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); #endif /* __XFS_RW_H__ */ diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 2366c54cc4f..3eca58f51ae 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -796,8 +796,6 @@ xfs_fs_destroy_inode( if (is_bad_inode(inode)) goto out_reclaim; - xfs_ioend_wait(ip); - ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0); /* @@ -837,7 +835,6 @@ xfs_fs_inode_init_once( inode_init_once(VFS_I(ip)); /* xfs inode */ - atomic_set(&ip->i_iocount, 0); atomic_set(&ip->i_pincount, 0); spin_lock_init(&ip->i_flags_lock); init_waitqueue_head(&ip->i_ipin_wait); @@ -887,7 +884,7 @@ xfs_log_inode( } xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); return xfs_trans_commit(tp, 0); } @@ -914,9 +911,8 @@ xfs_fs_write_inode( * of forcing it all the way to stable storage using a * synchronous transaction we let the log force inside the * ->sync_fs call do that for thus, which reduces the number - * of synchronous log foces dramatically. + * of synchronous log forces dramatically. */ - xfs_ioend_wait(ip); error = xfs_log_inode(ip); if (error) goto out; @@ -1019,7 +1015,7 @@ xfs_fs_put_super( */ xfs_filestream_unmount(mp); - XFS_bflush(mp->m_ddev_targp); + xfs_flush_buftarg(mp->m_ddev_targp, 1); xfs_unmountfs(mp); xfs_freesb(mp); @@ -1443,7 +1439,7 @@ xfs_fs_fill_super( */ xfs_filestream_unmount(mp); - XFS_bflush(mp->m_ddev_targp); + xfs_flush_buftarg(mp->m_ddev_targp, 1); xfs_unmountfs(mp); goto out_free_sb; @@ -1652,24 +1648,13 @@ xfs_init_workqueues(void) */ xfs_syncd_wq = alloc_workqueue("xfssyncd", WQ_CPU_INTENSIVE, 8); if (!xfs_syncd_wq) - goto out; - - xfs_ail_wq = alloc_workqueue("xfsail", WQ_CPU_INTENSIVE, 8); - if (!xfs_ail_wq) - goto out_destroy_syncd; - + return -ENOMEM; return 0; - -out_destroy_syncd: - destroy_workqueue(xfs_syncd_wq); -out: - return -ENOMEM; } STATIC void xfs_destroy_workqueues(void) { - destroy_workqueue(xfs_ail_wq); destroy_workqueue(xfs_syncd_wq); } @@ -1681,7 +1666,6 @@ init_xfs_fs(void) printk(KERN_INFO XFS_VERSION_STRING " with " XFS_BUILD_OPTIONS " enabled\n"); - xfs_ioend_init(); xfs_dir_startup(); error = xfs_init_zones(); diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c index 4604f90f86a..aa3dc1a4d53 100644 --- a/fs/xfs/xfs_sync.c +++ b/fs/xfs/xfs_sync.c @@ -227,21 +227,17 @@ xfs_sync_inode_data( int error = 0; if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) - goto out_wait; + return 0; if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) { if (flags & SYNC_TRYLOCK) - goto out_wait; + return 0; xfs_ilock(ip, XFS_IOLOCK_SHARED); } error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ? 0 : XBF_ASYNC, FI_NONE); xfs_iunlock(ip, XFS_IOLOCK_SHARED); - - out_wait: - if (flags & SYNC_WAIT) - xfs_ioend_wait(ip); return error; } @@ -322,6 +318,7 @@ xfs_sync_fsdata( struct xfs_mount *mp) { struct xfs_buf *bp; + int error; /* * If the buffer is pinned then push on the log so we won't get stuck @@ -334,8 +331,9 @@ xfs_sync_fsdata( bp = xfs_getsb(mp, 0); if (xfs_buf_ispinned(bp)) xfs_log_force(mp, 0); - - return xfs_bwrite(mp, bp); + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + return error; } /* @@ -379,7 +377,7 @@ xfs_quiesce_data( /* flush data-only devices */ if (mp->m_rtdev_targp) - XFS_bflush(mp->m_rtdev_targp); + xfs_flush_buftarg(mp->m_rtdev_targp, 1); return error ? error : error2; } diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 690fc7a7bd7..f1d2802b2f0 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -30,6 +30,7 @@ struct xfs_buf_log_item; struct xfs_da_args; struct xfs_da_node_entry; struct xfs_dquot; +struct xfs_log_item; struct xlog_ticket; struct log; struct xlog_recover; @@ -320,7 +321,6 @@ DEFINE_BUF_EVENT(xfs_buf_rele); DEFINE_BUF_EVENT(xfs_buf_iodone); DEFINE_BUF_EVENT(xfs_buf_iorequest); DEFINE_BUF_EVENT(xfs_buf_bawrite); -DEFINE_BUF_EVENT(xfs_buf_bdwrite); DEFINE_BUF_EVENT(xfs_buf_lock); DEFINE_BUF_EVENT(xfs_buf_lock_done); DEFINE_BUF_EVENT(xfs_buf_trylock); @@ -577,6 +577,7 @@ DEFINE_INODE_EVENT(xfs_vm_bmap); DEFINE_INODE_EVENT(xfs_file_ioctl); DEFINE_INODE_EVENT(xfs_file_compat_ioctl); DEFINE_INODE_EVENT(xfs_ioctl_setattr); +DEFINE_INODE_EVENT(xfs_dir_fsync); DEFINE_INODE_EVENT(xfs_file_fsync); DEFINE_INODE_EVENT(xfs_destroy_inode); DEFINE_INODE_EVENT(xfs_write_inode); @@ -853,6 +854,42 @@ DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_enter); DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_exit); DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_sub); +DECLARE_EVENT_CLASS(xfs_log_item_class, + TP_PROTO(struct xfs_log_item *lip), + TP_ARGS(lip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(void *, lip) + __field(uint, type) + __field(uint, flags) + __field(xfs_lsn_t, lsn) + ), + TP_fast_assign( + __entry->dev = lip->li_mountp->m_super->s_dev; + __entry->lip = lip; + __entry->type = lip->li_type; + __entry->flags = lip->li_flags; + __entry->lsn = lip->li_lsn; + ), + TP_printk("dev %d:%d lip 0x%p lsn %d/%d type %s flags %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->lip, + CYCLE_LSN(__entry->lsn), BLOCK_LSN(__entry->lsn), + __print_symbolic(__entry->type, XFS_LI_TYPE_DESC), + __print_flags(__entry->flags, "|", XFS_LI_FLAGS)) +) + +#define DEFINE_LOG_ITEM_EVENT(name) \ +DEFINE_EVENT(xfs_log_item_class, name, \ + TP_PROTO(struct xfs_log_item *lip), \ + TP_ARGS(lip)) +DEFINE_LOG_ITEM_EVENT(xfs_ail_push); +DEFINE_LOG_ITEM_EVENT(xfs_ail_pushbuf); +DEFINE_LOG_ITEM_EVENT(xfs_ail_pushbuf_pinned); +DEFINE_LOG_ITEM_EVENT(xfs_ail_pinned); +DEFINE_LOG_ITEM_EVENT(xfs_ail_locked); + + DECLARE_EVENT_CLASS(xfs_file_class, TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), TP_ARGS(ip, count, offset, flags), diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index efc147f0e9b..1f35b2feca9 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -1790,9 +1790,7 @@ xfs_trans_commit_cil( } /* - * xfs_trans_commit - * - * Commit the given transaction to the log a/synchronously. + * Commit the given transaction to the log. * * XFS disk error handling mechanism is not based on a typical * transaction abort mechanism. Logically after the filesystem @@ -1804,10 +1802,9 @@ xfs_trans_commit_cil( * Do not reference the transaction structure after this call. */ int -_xfs_trans_commit( +xfs_trans_commit( struct xfs_trans *tp, - uint flags, - int *log_flushed) + uint flags) { struct xfs_mount *mp = tp->t_mountp; xfs_lsn_t commit_lsn = -1; @@ -1866,7 +1863,7 @@ _xfs_trans_commit( if (sync) { if (!error) { error = _xfs_log_force_lsn(mp, commit_lsn, - XFS_LOG_SYNC, log_flushed); + XFS_LOG_SYNC, NULL); } XFS_STATS_INC(xs_trans_sync); } else { @@ -2021,6 +2018,6 @@ xfs_trans_roll( if (error) return error; - xfs_trans_ijoin(trans, dp); + xfs_trans_ijoin(trans, dp, 0); return 0; } diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 06a9759b635..603f3eb5204 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -350,7 +350,7 @@ typedef struct xfs_item_ops { void (*iop_unlock)(xfs_log_item_t *); xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t); void (*iop_push)(xfs_log_item_t *); - void (*iop_pushbuf)(xfs_log_item_t *); + bool (*iop_pushbuf)(xfs_log_item_t *); void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t); } xfs_item_ops_t; @@ -470,8 +470,7 @@ void xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *); void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint); void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *); void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int); -void xfs_trans_ijoin_ref(struct xfs_trans *, struct xfs_inode *, uint); -void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *); +void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *, uint); void xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint); void xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint); struct xfs_efi_log_item *xfs_trans_get_efi(xfs_trans_t *, uint); @@ -487,10 +486,7 @@ void xfs_trans_log_efd_extent(xfs_trans_t *, struct xfs_efd_log_item *, xfs_fsblock_t, xfs_extlen_t); -int _xfs_trans_commit(xfs_trans_t *, - uint flags, - int *); -#define xfs_trans_commit(tp, flags) _xfs_trans_commit(tp, flags, NULL) +int xfs_trans_commit(xfs_trans_t *, uint flags); void xfs_trans_cancel(xfs_trans_t *, int); int xfs_trans_ail_init(struct xfs_mount *); void xfs_trans_ail_destroy(struct xfs_mount *); diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index c15aa29fa16..ed9252bcdac 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -26,10 +26,9 @@ #include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_trans_priv.h" +#include "xfs_trace.h" #include "xfs_error.h" -struct workqueue_struct *xfs_ail_wq; /* AIL workqueue */ - #ifdef DEBUG /* * Check that the list is sorted as it should be. @@ -356,28 +355,34 @@ xfs_ail_delete( xfs_trans_ail_cursor_clear(ailp, lip); } -/* - * xfs_ail_worker does the work of pushing on the AIL. It will requeue itself - * to run at a later time if there is more work to do to complete the push. - */ -STATIC void -xfs_ail_worker( - struct work_struct *work) +static long +xfsaild_push( + struct xfs_ail *ailp) { - struct xfs_ail *ailp = container_of(to_delayed_work(work), - struct xfs_ail, xa_work); xfs_mount_t *mp = ailp->xa_mount; struct xfs_ail_cursor cur; xfs_log_item_t *lip; xfs_lsn_t lsn; xfs_lsn_t target; long tout = 10; - int flush_log = 0; int stuck = 0; int count = 0; int push_xfsbufd = 0; + /* + * If last time we ran we encountered pinned items, force the log first + * and wait for it before pushing again. + */ spin_lock(&ailp->xa_lock); + if (ailp->xa_last_pushed_lsn == 0 && ailp->xa_log_flush && + !list_empty(&ailp->xa_ail)) { + ailp->xa_log_flush = 0; + spin_unlock(&ailp->xa_lock); + XFS_STATS_INC(xs_push_ail_flush); + xfs_log_force(mp, XFS_LOG_SYNC); + spin_lock(&ailp->xa_lock); + } + target = ailp->xa_target; lip = xfs_trans_ail_cursor_first(ailp, &cur, ailp->xa_last_pushed_lsn); if (!lip || XFS_FORCED_SHUTDOWN(mp)) { @@ -421,26 +426,37 @@ xfs_ail_worker( switch (lock_result) { case XFS_ITEM_SUCCESS: XFS_STATS_INC(xs_push_ail_success); + trace_xfs_ail_push(lip); + IOP_PUSH(lip); ailp->xa_last_pushed_lsn = lsn; break; case XFS_ITEM_PUSHBUF: XFS_STATS_INC(xs_push_ail_pushbuf); - IOP_PUSHBUF(lip); - ailp->xa_last_pushed_lsn = lsn; + trace_xfs_ail_pushbuf(lip); + + if (!IOP_PUSHBUF(lip)) { + trace_xfs_ail_pushbuf_pinned(lip); + stuck++; + ailp->xa_log_flush++; + } else { + ailp->xa_last_pushed_lsn = lsn; + } push_xfsbufd = 1; break; case XFS_ITEM_PINNED: XFS_STATS_INC(xs_push_ail_pinned); + trace_xfs_ail_pinned(lip); + stuck++; - flush_log = 1; + ailp->xa_log_flush++; break; case XFS_ITEM_LOCKED: XFS_STATS_INC(xs_push_ail_locked); - ailp->xa_last_pushed_lsn = lsn; + trace_xfs_ail_locked(lip); stuck++; break; @@ -480,16 +496,6 @@ xfs_ail_worker( xfs_trans_ail_cursor_done(ailp, &cur); spin_unlock(&ailp->xa_lock); - if (flush_log) { - /* - * If something we need to push out was pinned, then - * push out the log so it will become unpinned and - * move forward in the AIL. - */ - XFS_STATS_INC(xs_push_ail_flush); - xfs_log_force(mp, 0); - } - if (push_xfsbufd) { /* we've got delayed write buffers to flush */ wake_up_process(mp->m_ddev_targp->bt_task); @@ -500,20 +506,7 @@ out_done: if (!count) { /* We're past our target or empty, so idle */ ailp->xa_last_pushed_lsn = 0; - - /* - * We clear the XFS_AIL_PUSHING_BIT first before checking - * whether the target has changed. If the target has changed, - * this pushes the requeue race directly onto the result of the - * atomic test/set bit, so we are guaranteed that either the - * the pusher that changed the target or ourselves will requeue - * the work (but not both). - */ - clear_bit(XFS_AIL_PUSHING_BIT, &ailp->xa_flags); - smp_rmb(); - if (XFS_LSN_CMP(ailp->xa_target, target) == 0 || - test_and_set_bit(XFS_AIL_PUSHING_BIT, &ailp->xa_flags)) - return; + ailp->xa_log_flush = 0; tout = 50; } else if (XFS_LSN_CMP(lsn, target) >= 0) { @@ -532,14 +525,39 @@ out_done: * were stuck. * * Backoff a bit more to allow some I/O to complete before - * continuing from where we were. + * restarting from the start of the AIL. This prevents us + * from spinning on the same items, and if they are pinned will + * all the restart to issue a log force to unpin the stuck + * items. */ tout = 20; + ailp->xa_last_pushed_lsn = 0; + } + + return tout; +} + +static int +xfsaild( + void *data) +{ + struct xfs_ail *ailp = data; + long tout = 0; /* milliseconds */ + + while (!kthread_should_stop()) { + if (tout && tout <= 20) + __set_current_state(TASK_KILLABLE); + else + __set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(tout ? + msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT); + + try_to_freeze(); + + tout = xfsaild_push(ailp); } - /* There is more to do, requeue us. */ - queue_delayed_work(xfs_syncd_wq, &ailp->xa_work, - msecs_to_jiffies(tout)); + return 0; } /* @@ -574,8 +592,9 @@ xfs_ail_push( */ smp_wmb(); xfs_trans_ail_copy_lsn(ailp, &ailp->xa_target, &threshold_lsn); - if (!test_and_set_bit(XFS_AIL_PUSHING_BIT, &ailp->xa_flags)) - queue_delayed_work(xfs_syncd_wq, &ailp->xa_work, 0); + smp_wmb(); + + wake_up_process(ailp->xa_task); } /* @@ -813,9 +832,18 @@ xfs_trans_ail_init( INIT_LIST_HEAD(&ailp->xa_ail); INIT_LIST_HEAD(&ailp->xa_cursors); spin_lock_init(&ailp->xa_lock); - INIT_DELAYED_WORK(&ailp->xa_work, xfs_ail_worker); + + ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild/%s", + ailp->xa_mount->m_fsname); + if (IS_ERR(ailp->xa_task)) + goto out_free_ailp; + mp->m_ail = ailp; return 0; + +out_free_ailp: + kmem_free(ailp); + return ENOMEM; } void @@ -824,6 +852,6 @@ xfs_trans_ail_destroy( { struct xfs_ail *ailp = mp->m_ail; - cancel_delayed_work_sync(&ailp->xa_work); + kthread_stop(ailp->xa_task); kmem_free(ailp); } diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 137e2b9e294..475a4ded4f4 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -160,8 +160,10 @@ xfs_trans_get_buf(xfs_trans_t *tp, bp = xfs_trans_buf_item_match(tp, target_dev, blkno, len); if (bp != NULL) { ASSERT(xfs_buf_islocked(bp)); - if (XFS_FORCED_SHUTDOWN(tp->t_mountp)) - XFS_BUF_SUPER_STALE(bp); + if (XFS_FORCED_SHUTDOWN(tp->t_mountp)) { + xfs_buf_stale(bp); + XFS_BUF_DONE(bp); + } /* * If the buffer is stale then it was binval'ed @@ -294,8 +296,7 @@ xfs_trans_read_buf( if (bp->b_error) { error = bp->b_error; - xfs_ioerror_alert("xfs_trans_read_buf", mp, - bp, blkno); + xfs_buf_ioerror_alert(bp, __func__); xfs_buf_relse(bp); return error; } @@ -337,8 +338,7 @@ xfs_trans_read_buf( xfsbdstrat(tp->t_mountp, bp); error = xfs_buf_iowait(bp); if (error) { - xfs_ioerror_alert("xfs_trans_read_buf", mp, - bp, blkno); + xfs_buf_ioerror_alert(bp, __func__); xfs_buf_relse(bp); /* * We can gracefully recover from most read @@ -387,9 +387,9 @@ xfs_trans_read_buf( } if (bp->b_error) { error = bp->b_error; - XFS_BUF_SUPER_STALE(bp); - xfs_ioerror_alert("xfs_trans_read_buf", mp, - bp, blkno); + xfs_buf_stale(bp); + XFS_BUF_DONE(bp); + xfs_buf_ioerror_alert(bp, __func__); if (tp->t_flags & XFS_TRANS_DIRTY) xfs_force_shutdown(tp->t_mountp, SHUTDOWN_META_IO_ERROR); xfs_buf_relse(bp); @@ -643,13 +643,14 @@ xfs_trans_log_buf(xfs_trans_t *tp, * inside the b_bdstrat callback so that this won't get written to * disk. */ - XFS_BUF_DELAYWRITE(bp); XFS_BUF_DONE(bp); ASSERT(atomic_read(&bip->bli_refcount) > 0); bp->b_iodone = xfs_buf_iodone_callbacks; bip->bli_item.li_cb = xfs_buf_iodone; + xfs_buf_delwri_queue(bp); + trace_xfs_trans_log_buf(bip); /* @@ -738,8 +739,7 @@ xfs_trans_binval( * We set the stale bit in the buffer as well since we're getting * rid of it. */ - XFS_BUF_UNDELAYWRITE(bp); - XFS_BUF_STALE(bp); + xfs_buf_stale(bp); bip->bli_flags |= XFS_BLI_STALE; bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY); bip->bli_format.blf_flags &= ~XFS_BLF_INODE_BUF; diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c index c8dea2fd7e6..32f0288ae10 100644 --- a/fs/xfs/xfs_trans_inode.c +++ b/fs/xfs/xfs_trans_inode.c @@ -47,11 +47,13 @@ xfs_trans_inode_broot_debug( * Add a locked inode to the transaction. * * The inode must be locked, and it cannot be associated with any transaction. + * If lock_flags is non-zero the inode will be unlocked on transaction commit. */ void xfs_trans_ijoin( struct xfs_trans *tp, - struct xfs_inode *ip) + struct xfs_inode *ip, + uint lock_flags) { xfs_inode_log_item_t *iip; @@ -59,7 +61,9 @@ xfs_trans_ijoin( if (ip->i_itemp == NULL) xfs_inode_item_init(ip, ip->i_mount); iip = ip->i_itemp; + ASSERT(iip->ili_lock_flags == 0); + iip->ili_lock_flags = lock_flags; /* * Get a log_item_desc to point at the new item. @@ -70,25 +74,6 @@ xfs_trans_ijoin( } /* - * Add a locked inode to the transaction. - * - * - * Grabs a reference to the inode which will be dropped when the transaction - * is committed. The inode will also be unlocked at that point. The inode - * must be locked, and it cannot be associated with any transaction. - */ -void -xfs_trans_ijoin_ref( - struct xfs_trans *tp, - struct xfs_inode *ip, - uint lock_flags) -{ - xfs_trans_ijoin(tp, ip); - IHOLD(ip); - ip->i_itemp->ili_lock_flags = lock_flags; -} - -/* * Transactional inode timestamp update. Requires the inode to be locked and * joined to the transaction supplied. Relies on the transaction subsystem to * track dirty state and update/writeback the inode accordingly. diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index 212946b9723..44820b9fcb4 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -64,23 +64,18 @@ struct xfs_ail_cursor { */ struct xfs_ail { struct xfs_mount *xa_mount; + struct task_struct *xa_task; struct list_head xa_ail; xfs_lsn_t xa_target; struct list_head xa_cursors; spinlock_t xa_lock; - struct delayed_work xa_work; xfs_lsn_t xa_last_pushed_lsn; - unsigned long xa_flags; + int xa_log_flush; }; -#define XFS_AIL_PUSHING_BIT 0 - /* * From xfs_trans_ail.c */ - -extern struct workqueue_struct *xfs_ail_wq; /* AIL workqueue */ - void xfs_trans_ail_update_bulk(struct xfs_ail *ailp, struct xfs_ail_cursor *cur, struct xfs_log_item **log_items, int nr_items, diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 51fc429527b..4ecf2a54906 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -72,8 +72,8 @@ xfs_readlink_bmap( xfs_buf_t *bp; int error = 0; - error = xfs_bmapi(NULL, ip, 0, XFS_B_TO_FSB(mp, pathlen), 0, NULL, 0, - mval, &nmaps, NULL); + error = xfs_bmapi_read(ip, 0, XFS_B_TO_FSB(mp, pathlen), mval, &nmaps, + 0); if (error) goto out; @@ -87,8 +87,7 @@ xfs_readlink_bmap( return XFS_ERROR(ENOMEM); error = bp->b_error; if (error) { - xfs_ioerror_alert("xfs_readlink", - ip->i_mount, bp, XFS_BUF_ADDR(bp)); + xfs_buf_ioerror_alert(bp, __func__); xfs_buf_relse(bp); goto out; } @@ -178,8 +177,7 @@ xfs_free_eofblocks( nimaps = 1; xfs_ilock(ip, XFS_ILOCK_SHARED); - error = xfs_bmapi(NULL, ip, end_fsb, map_len, 0, - NULL, 0, &imap, &nimaps, NULL); + error = xfs_bmapi_read(ip, end_fsb, map_len, &imap, &nimaps, 0); xfs_iunlock(ip, XFS_ILOCK_SHARED); if (!error && (nimaps != 0) && @@ -220,7 +218,7 @@ xfs_free_eofblocks( } xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); error = xfs_itruncate_data(&tp, ip, ip->i_size); if (error) { @@ -289,7 +287,7 @@ xfs_inactive_symlink_rmt( xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); size = (int)ip->i_d.di_size; ip->i_d.di_size = 0; - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); /* * Find the block(s) so we can inval and unmap them. @@ -297,9 +295,9 @@ xfs_inactive_symlink_rmt( done = 0; xfs_bmap_init(&free_list, &first_block); nmaps = ARRAY_SIZE(mval); - if ((error = xfs_bmapi(tp, ip, 0, XFS_B_TO_FSB(mp, size), - XFS_BMAPI_METADATA, &first_block, 0, mval, &nmaps, - &free_list))) + error = xfs_bmapi_read(ip, 0, XFS_B_TO_FSB(mp, size), + mval, &nmaps, 0); + if (error) goto error0; /* * Invalidate the block(s). @@ -308,6 +306,10 @@ xfs_inactive_symlink_rmt( bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, mval[i].br_startblock), XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0); + if (!bp) { + error = ENOMEM; + goto error1; + } xfs_trans_binval(tp, bp); } /* @@ -333,7 +335,7 @@ xfs_inactive_symlink_rmt( * Mark it dirty so it will be logged and moved forward in the log as * part of every commit. */ - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); /* * Get a new, empty transaction to return to our caller. @@ -466,7 +468,7 @@ xfs_inactive_attrs( goto error_cancel; xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); xfs_idestroy_fork(ip, XFS_ATTR_FORK); ASSERT(ip->i_d.di_anextents == 0); @@ -647,8 +649,6 @@ xfs_inactive( if (truncate) { xfs_ilock(ip, XFS_IOLOCK_EXCL); - xfs_ioend_wait(ip); - error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, XFS_TRANS_PERM_LOG_RES, @@ -662,7 +662,7 @@ xfs_inactive( } xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); error = xfs_itruncate_data(&tp, ip, 0); if (error) { @@ -686,7 +686,7 @@ xfs_inactive( return VN_INACTIVE_CACHE; } - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); } else { error = xfs_trans_reserve(tp, 0, XFS_IFREE_LOG_RES(mp), @@ -699,7 +699,7 @@ xfs_inactive( } xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); } /* @@ -939,7 +939,7 @@ xfs_create( * the transaction cancel unlocking dp so don't do it explicitly in the * error path. */ - xfs_trans_ijoin_ref(tp, dp, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); unlock_dp_on_error = B_FALSE; error = xfs_dir_createname(tp, dp, name, ip->i_ino, @@ -1260,8 +1260,8 @@ xfs_remove( xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin_ref(tp, dp, XFS_ILOCK_EXCL); - xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); /* * If we're removing a directory perform some additional validation. @@ -1406,8 +1406,8 @@ xfs_link( xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL); - xfs_trans_ijoin_ref(tp, sip, XFS_ILOCK_EXCL); - xfs_trans_ijoin_ref(tp, tdp, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL); /* * If the source has too many links, we can't make any more to it. @@ -1601,7 +1601,7 @@ xfs_symlink( * transaction cancel unlocking dp so don't do it explicitly in the * error path. */ - xfs_trans_ijoin_ref(tp, dp, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); unlock_dp_on_error = B_FALSE; /* @@ -1632,10 +1632,9 @@ xfs_symlink( first_fsb = 0; nmaps = SYMLINK_MAPS; - error = xfs_bmapi(tp, ip, first_fsb, fs_blocks, - XFS_BMAPI_WRITE | XFS_BMAPI_METADATA, - &first_block, resblks, mval, &nmaps, - &free_list); + error = xfs_bmapi_write(tp, ip, first_fsb, fs_blocks, + XFS_BMAPI_METADATA, &first_block, resblks, + mval, &nmaps, &free_list); if (error) goto error2; @@ -1650,7 +1649,10 @@ xfs_symlink( byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, BTOBB(byte_cnt), 0); - ASSERT(!xfs_buf_geterror(bp)); + if (!bp) { + error = ENOMEM; + goto error2; + } if (pathlen < byte_cnt) { byte_cnt = pathlen; } @@ -1732,7 +1734,7 @@ xfs_set_dmattrs( return error; } xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); ip->i_d.di_dmevmask = evmask; ip->i_d.di_dmstate = state; @@ -1778,7 +1780,6 @@ xfs_alloc_file_space( xfs_fileoff_t startoffset_fsb; xfs_fsblock_t firstfsb; int nimaps; - int bmapi_flag; int quota_flag; int rt; xfs_trans_t *tp; @@ -1806,7 +1807,6 @@ xfs_alloc_file_space( count = len; imapp = &imaps[0]; nimaps = 1; - bmapi_flag = XFS_BMAPI_WRITE | alloc_type; startoffset_fsb = XFS_B_TO_FSBT(mp, offset); allocatesize_fsb = XFS_B_TO_FSB(mp, count); @@ -1877,16 +1877,12 @@ xfs_alloc_file_space( if (error) goto error1; - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); - /* - * Issue the xfs_bmapi() call to allocate the blocks - */ xfs_bmap_init(&free_list, &firstfsb); - error = xfs_bmapi(tp, ip, startoffset_fsb, - allocatesize_fsb, bmapi_flag, - &firstfsb, 0, imapp, &nimaps, - &free_list); + error = xfs_bmapi_write(tp, ip, startoffset_fsb, + allocatesize_fsb, alloc_type, &firstfsb, + 0, imapp, &nimaps, &free_list); if (error) { goto error0; } @@ -1976,8 +1972,7 @@ xfs_zero_remaining_bytes( for (offset = startoff; offset <= endoff; offset = lastoffset + 1) { offset_fsb = XFS_B_TO_FSBT(mp, offset); nimap = 1; - error = xfs_bmapi(NULL, ip, offset_fsb, 1, 0, - NULL, 0, &imap, &nimap, NULL); + error = xfs_bmapi_read(ip, offset_fsb, 1, &imap, &nimap, 0); if (error || nimap < 1) break; ASSERT(imap.br_blockcount >= 1); @@ -1997,8 +1992,8 @@ xfs_zero_remaining_bytes( xfsbdstrat(mp, bp); error = xfs_buf_iowait(bp); if (error) { - xfs_ioerror_alert("xfs_zero_remaining_bytes(read)", - mp, bp, XFS_BUF_ADDR(bp)); + xfs_buf_ioerror_alert(bp, + "xfs_zero_remaining_bytes(read)"); break; } memset(bp->b_addr + @@ -2010,8 +2005,8 @@ xfs_zero_remaining_bytes( xfsbdstrat(mp, bp); error = xfs_buf_iowait(bp); if (error) { - xfs_ioerror_alert("xfs_zero_remaining_bytes(write)", - mp, bp, XFS_BUF_ADDR(bp)); + xfs_buf_ioerror_alert(bp, + "xfs_zero_remaining_bytes(write)"); break; } } @@ -2076,7 +2071,7 @@ xfs_free_file_space( if (need_iolock) { xfs_ilock(ip, XFS_IOLOCK_EXCL); /* wait for the completion of any pending DIOs */ - xfs_ioend_wait(ip); + inode_dio_wait(VFS_I(ip)); } rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); @@ -2096,8 +2091,8 @@ xfs_free_file_space( */ if (rt && !xfs_sb_version_hasextflgbit(&mp->m_sb)) { nimap = 1; - error = xfs_bmapi(NULL, ip, startoffset_fsb, - 1, 0, NULL, 0, &imap, &nimap, NULL); + error = xfs_bmapi_read(ip, startoffset_fsb, 1, + &imap, &nimap, 0); if (error) goto out_unlock_iolock; ASSERT(nimap == 0 || nimap == 1); @@ -2111,8 +2106,8 @@ xfs_free_file_space( startoffset_fsb += mp->m_sb.sb_rextsize - mod; } nimap = 1; - error = xfs_bmapi(NULL, ip, endoffset_fsb - 1, - 1, 0, NULL, 0, &imap, &nimap, NULL); + error = xfs_bmapi_read(ip, endoffset_fsb - 1, 1, + &imap, &nimap, 0); if (error) goto out_unlock_iolock; ASSERT(nimap == 0 || nimap == 1); @@ -2180,7 +2175,7 @@ xfs_free_file_space( if (error) goto error1; - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); /* * issue the bunmapi() call to free the blocks @@ -2353,8 +2348,7 @@ xfs_change_file_space( } xfs_ilock(ip, XFS_ILOCK_EXCL); - - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); if ((attr_flags & XFS_ATTR_DMI) == 0) { ip->i_d.di_mode &= ~S_ISUID; @@ -2379,10 +2373,5 @@ xfs_change_file_space( xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); if (attr_flags & XFS_ATTR_SYNC) xfs_trans_set_sync(tp); - - error = xfs_trans_commit(tp, 0); - - xfs_iunlock(ip, XFS_ILOCK_EXCL); - - return error; + return xfs_trans_commit(tp, 0); } |